Actual source code: sbaijfact2.c
petsc-3.4.5 2014-06-29
2: /*
3: Factorization code for SBAIJ format.
4: */
6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
7: #include <../src/mat/impls/baij/seq/baij.h>
8: #include <petsc-private/kernels/blockinvert.h>
12: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
13: {
14: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
15: IS isrow=a->row;
16: PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
18: const PetscInt *r;
19: PetscInt nz,*vj,k,idx,k1;
20: PetscInt bs =A->rmap->bs,bs2 = a->bs2;
21: MatScalar *aa=a->a,*v,*diag;
22: PetscScalar *x,*xk,*xj,*b,*xk_tmp,*t;
25: VecGetArray(bb,&b);
26: VecGetArray(xx,&x);
27: t = a->solve_work;
28: ISGetIndices(isrow,&r);
29: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
31: /* solve U^T * D * y = b by forward substitution */
32: xk = t;
33: for (k=0; k<mbs; k++) { /* t <- perm(b) */
34: idx = bs*r[k];
35: for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
36: }
37: for (k=0; k<mbs; k++) {
38: v = aa + bs2*ai[k];
39: xk = t + k*bs; /* Dk*xk = k-th block of x */
40: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
41: nz = ai[k+1] - ai[k];
42: vj = aj + ai[k];
43: xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
44: while (nz--) {
45: /* x(:) += U(k,:)^T*(Dk*xk) */
46: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
47: vj++; xj = t + (*vj)*bs;
48: v += bs2;
49: }
50: /* xk = inv(Dk)*(Dk*xk) */
51: diag = aa+k*bs2; /* ptr to inv(Dk) */
52: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
53: }
55: /* solve U*x = y by back substitution */
56: for (k=mbs-1; k>=0; k--) {
57: v = aa + bs2*ai[k];
58: xk = t + k*bs; /* xk */
59: nz = ai[k+1] - ai[k];
60: vj = aj + ai[k];
61: xj = t + (*vj)*bs;
62: while (nz--) {
63: /* xk += U(k,:)*x(:) */
64: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
65: vj++;
66: v += bs2; xj = t + (*vj)*bs;
67: }
68: idx = bs*r[k];
69: for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
70: }
72: PetscFree(xk_tmp);
73: ISRestoreIndices(isrow,&r);
74: VecRestoreArray(bb,&b);
75: VecRestoreArray(xx,&x);
76: PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
77: return(0);
78: }
82: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
83: {
85: SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
86: return(0);
87: }
91: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
92: {
94: SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
95: return(0);
96: }
100: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
101: {
103: PetscInt nz,*vj,k;
104: PetscInt bs2 = bs*bs;
105: MatScalar *v,*diag;
106: PetscScalar *xk,*xj,*xk_tmp;
109: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
110: for (k=0; k<mbs; k++) {
111: v = aa + bs2*ai[k];
112: xk = x + k*bs; /* Dk*xk = k-th block of x */
113: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
114: nz = ai[k+1] - ai[k];
115: vj = aj + ai[k];
116: xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
117: while (nz--) {
118: /* x(:) += U(k,:)^T*(Dk*xk) */
119: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
120: vj++; xj = x + (*vj)*bs;
121: v += bs2;
122: }
123: /* xk = inv(Dk)*(Dk*xk) */
124: diag = aa+k*bs2; /* ptr to inv(Dk) */
125: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
126: }
127: PetscFree(xk_tmp);
128: return(0);
129: }
133: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
134: {
135: PetscInt nz,*vj,k;
136: PetscInt bs2 = bs*bs;
137: MatScalar *v;
138: PetscScalar *xk,*xj;
141: for (k=mbs-1; k>=0; k--) {
142: v = aa + bs2*ai[k];
143: xk = x + k*bs; /* xk */
144: nz = ai[k+1] - ai[k];
145: vj = aj + ai[k];
146: xj = x + (*vj)*bs;
147: while (nz--) {
148: /* xk += U(k,:)*x(:) */
149: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
150: vj++;
151: v += bs2; xj = x + (*vj)*bs;
152: }
153: }
154: return(0);
155: }
159: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
160: {
161: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
163: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
164: PetscInt bs =A->rmap->bs;
165: MatScalar *aa=a->a;
166: PetscScalar *x,*b;
169: VecGetArray(bb,&b);
170: VecGetArray(xx,&x);
172: /* solve U^T * D * y = b by forward substitution */
173: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
174: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
176: /* solve U*x = y by back substitution */
177: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
179: VecRestoreArray(bb,&b);
180: VecRestoreArray(xx,&x);
181: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
182: return(0);
183: }
187: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
188: {
189: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
191: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
192: PetscInt bs =A->rmap->bs;
193: MatScalar *aa=a->a;
194: PetscScalar *x,*b;
197: VecGetArray(bb,&b);
198: VecGetArray(xx,&x);
199: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
200: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
201: VecRestoreArray(bb,&b);
202: VecRestoreArray(xx,&x);
203: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
204: return(0);
205: }
209: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
210: {
211: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
213: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
214: PetscInt bs =A->rmap->bs;
215: MatScalar *aa=a->a;
216: PetscScalar *x,*b;
219: VecGetArray(bb,&b);
220: VecGetArray(xx,&x);
221: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));
222: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
223: VecRestoreArray(bb,&b);
224: VecRestoreArray(xx,&x);
225: PetscLogFlops(2.0*bs2*(a->nz-mbs));
226: return(0);
227: }
231: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
232: {
233: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
234: IS isrow=a->row;
235: PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs;
237: const PetscInt *r;
238: PetscInt nz,*vj,k,idx;
239: MatScalar *aa=a->a,*v,*d;
240: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
243: VecGetArray(bb,&b);
244: VecGetArray(xx,&x);
245: t = a->solve_work;
246: ISGetIndices(isrow,&r);
248: /* solve U^T * D * y = b by forward substitution */
249: tp = t;
250: for (k=0; k<mbs; k++) { /* t <- perm(b) */
251: idx = 7*r[k];
252: tp[0] = b[idx];
253: tp[1] = b[idx+1];
254: tp[2] = b[idx+2];
255: tp[3] = b[idx+3];
256: tp[4] = b[idx+4];
257: tp[5] = b[idx+5];
258: tp[6] = b[idx+6];
259: tp += 7;
260: }
262: for (k=0; k<mbs; k++) {
263: v = aa + 49*ai[k];
264: vj = aj + ai[k];
265: tp = t + k*7;
266: x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
267: nz = ai[k+1] - ai[k];
268: tp = t + (*vj)*7;
269: while (nz--) {
270: tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
271: tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
272: tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
273: tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
274: tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
275: tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
276: tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
277: vj++;
278: tp = t + (*vj)*7;
279: v += 49;
280: }
282: /* xk = inv(Dk)*(Dk*xk) */
283: d = aa+k*49; /* ptr to inv(Dk) */
284: tp = t + k*7;
285: tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
286: tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
287: tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
288: tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
289: tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
290: tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
291: tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
292: }
294: /* solve U*x = y by back substitution */
295: for (k=mbs-1; k>=0; k--) {
296: v = aa + 49*ai[k];
297: vj = aj + ai[k];
298: tp = t + k*7;
299: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */
300: nz = ai[k+1] - ai[k];
302: tp = t + (*vj)*7;
303: while (nz--) {
304: /* xk += U(k,:)*x(:) */
305: x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
306: x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
307: x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
308: x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
309: x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
310: x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
311: x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
312: vj++;
313: tp = t + (*vj)*7;
314: v += 49;
315: }
316: tp = t + k*7;
317: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
318: idx = 7*r[k];
319: x[idx] = x0;
320: x[idx+1] = x1;
321: x[idx+2] = x2;
322: x[idx+3] = x3;
323: x[idx+4] = x4;
324: x[idx+5] = x5;
325: x[idx+6] = x6;
326: }
328: ISRestoreIndices(isrow,&r);
329: VecRestoreArray(bb,&b);
330: VecRestoreArray(xx,&x);
331: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
332: return(0);
333: }
337: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
338: {
339: MatScalar *v,*d;
340: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
341: PetscInt nz,*vj,k;
344: for (k=0; k<mbs; k++) {
345: v = aa + 49*ai[k];
346: xp = x + k*7;
347: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
348: nz = ai[k+1] - ai[k];
349: vj = aj + ai[k];
350: xp = x + (*vj)*7;
351: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
352: PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
353: while (nz--) {
354: /* x(:) += U(k,:)^T*(Dk*xk) */
355: xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
356: xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
357: xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
358: xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
359: xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
360: xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
361: xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
362: vj++;
363: xp = x + (*vj)*7;
364: v += 49;
365: }
366: /* xk = inv(Dk)*(Dk*xk) */
367: d = aa+k*49; /* ptr to inv(Dk) */
368: xp = x + k*7;
369: xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
370: xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
371: xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
372: xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
373: xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
374: xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
375: xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
376: }
377: return(0);
378: }
382: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
383: {
384: MatScalar *v;
385: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
386: PetscInt nz,*vj,k;
389: for (k=mbs-1; k>=0; k--) {
390: v = aa + 49*ai[k];
391: xp = x + k*7;
392: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
393: nz = ai[k+1] - ai[k];
394: vj = aj + ai[k];
395: xp = x + (*vj)*7;
396: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
397: PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
398: while (nz--) {
399: /* xk += U(k,:)*x(:) */
400: x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
401: x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
402: x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
403: x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
404: x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
405: x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
406: x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
407: vj++;
408: v += 49; xp = x + (*vj)*7;
409: }
410: xp = x + k*7;
411: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
412: }
413: return(0);
414: }
418: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
419: {
420: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
422: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
423: MatScalar *aa=a->a;
424: PetscScalar *x,*b;
427: VecGetArray(bb,&b);
428: VecGetArray(xx,&x);
430: /* solve U^T * D * y = b by forward substitution */
431: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
432: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
434: /* solve U*x = y by back substitution */
435: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
437: VecRestoreArray(bb,&b);
438: VecRestoreArray(xx,&x);
439: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
440: return(0);
441: }
445: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
446: {
447: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
449: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
450: MatScalar *aa=a->a;
451: PetscScalar *x,*b;
454: VecGetArray(bb,&b);
455: VecGetArray(xx,&x);
456: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
457: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
458: VecRestoreArray(bb,&b);
459: VecRestoreArray(xx,&x);
460: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
461: return(0);
462: }
466: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
467: {
468: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
470: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
471: MatScalar *aa=a->a;
472: PetscScalar *x,*b;
475: VecGetArray(bb,&b);
476: VecGetArray(xx,&x);
477: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
478: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
479: VecRestoreArray(bb,&b);
480: VecRestoreArray(xx,&x);
481: PetscLogFlops(2.0*bs2*(a->nz-mbs));
482: return(0);
483: }
487: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
488: {
489: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
490: IS isrow=a->row;
491: PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
493: const PetscInt *r;
494: PetscInt nz,*vj,k,idx;
495: MatScalar *aa=a->a,*v,*d;
496: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;
499: VecGetArray(bb,&b);
500: VecGetArray(xx,&x);
501: t = a->solve_work;
502: ISGetIndices(isrow,&r);
504: /* solve U^T * D * y = b by forward substitution */
505: tp = t;
506: for (k=0; k<mbs; k++) { /* t <- perm(b) */
507: idx = 6*r[k];
508: tp[0] = b[idx];
509: tp[1] = b[idx+1];
510: tp[2] = b[idx+2];
511: tp[3] = b[idx+3];
512: tp[4] = b[idx+4];
513: tp[5] = b[idx+5];
514: tp += 6;
515: }
517: for (k=0; k<mbs; k++) {
518: v = aa + 36*ai[k];
519: vj = aj + ai[k];
520: tp = t + k*6;
521: x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
522: nz = ai[k+1] - ai[k];
523: tp = t + (*vj)*6;
524: while (nz--) {
525: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
526: tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
527: tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
528: tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
529: tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
530: tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
531: vj++;
532: tp = t + (*vj)*6;
533: v += 36;
534: }
536: /* xk = inv(Dk)*(Dk*xk) */
537: d = aa+k*36; /* ptr to inv(Dk) */
538: tp = t + k*6;
539: tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
540: tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
541: tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
542: tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
543: tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
544: tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
545: }
547: /* solve U*x = y by back substitution */
548: for (k=mbs-1; k>=0; k--) {
549: v = aa + 36*ai[k];
550: vj = aj + ai[k];
551: tp = t + k*6;
552: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
553: nz = ai[k+1] - ai[k];
555: tp = t + (*vj)*6;
556: while (nz--) {
557: /* xk += U(k,:)*x(:) */
558: x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
559: x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
560: x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
561: x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
562: x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
563: x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
564: vj++;
565: tp = t + (*vj)*6;
566: v += 36;
567: }
568: tp = t + k*6;
569: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
570: idx = 6*r[k];
571: x[idx] = x0;
572: x[idx+1] = x1;
573: x[idx+2] = x2;
574: x[idx+3] = x3;
575: x[idx+4] = x4;
576: x[idx+5] = x5;
577: }
579: ISRestoreIndices(isrow,&r);
580: VecRestoreArray(bb,&b);
581: VecRestoreArray(xx,&x);
582: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
583: return(0);
584: }
588: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
589: {
590: MatScalar *v,*d;
591: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
592: PetscInt nz,*vj,k;
595: for (k=0; k<mbs; k++) {
596: v = aa + 36*ai[k];
597: xp = x + k*6;
598: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
599: nz = ai[k+1] - ai[k];
600: vj = aj + ai[k];
601: xp = x + (*vj)*6;
602: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
603: PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
604: while (nz--) {
605: /* x(:) += U(k,:)^T*(Dk*xk) */
606: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
607: xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
608: xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
609: xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
610: xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
611: xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
612: vj++;
613: xp = x + (*vj)*6;
614: v += 36;
615: }
616: /* xk = inv(Dk)*(Dk*xk) */
617: d = aa+k*36; /* ptr to inv(Dk) */
618: xp = x + k*6;
619: xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
620: xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
621: xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
622: xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
623: xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
624: xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
625: }
626: return(0);
627: }
630: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
631: {
632: MatScalar *v;
633: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
634: PetscInt nz,*vj,k;
637: for (k=mbs-1; k>=0; k--) {
638: v = aa + 36*ai[k];
639: xp = x + k*6;
640: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
641: nz = ai[k+1] - ai[k];
642: vj = aj + ai[k];
643: xp = x + (*vj)*6;
644: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
645: PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
646: while (nz--) {
647: /* xk += U(k,:)*x(:) */
648: x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
649: x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
650: x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
651: x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
652: x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
653: x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
654: vj++;
655: v += 36; xp = x + (*vj)*6;
656: }
657: xp = x + k*6;
658: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
659: }
660: return(0);
661: }
666: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
667: {
668: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
669: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
670: MatScalar *aa=a->a;
671: PetscScalar *x,*b;
675: VecGetArray(bb,&b);
676: VecGetArray(xx,&x);
678: /* solve U^T * D * y = b by forward substitution */
679: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
680: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
682: /* solve U*x = y by back substitution */
683: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
685: VecRestoreArray(bb,&b);
686: VecRestoreArray(xx,&x);
687: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
688: return(0);
689: }
693: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
694: {
695: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
696: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
697: MatScalar *aa=a->a;
698: PetscScalar *x,*b;
702: VecGetArray(bb,&b);
703: VecGetArray(xx,&x);
704: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
705: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
706: VecRestoreArray(bb,&b);
707: VecRestoreArray(xx,&x);
708: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
709: return(0);
710: }
714: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
715: {
716: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
717: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
718: MatScalar *aa=a->a;
719: PetscScalar *x,*b;
723: VecGetArray(bb,&b);
724: VecGetArray(xx,&x);
725: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
726: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
727: VecRestoreArray(bb,&b);
728: VecRestoreArray(xx,&x);
729: PetscLogFlops(2.0*bs2*(a->nz - mbs));
730: return(0);
731: }
735: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
736: {
737: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
738: IS isrow=a->row;
739: PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
741: const PetscInt *r;
742: PetscInt nz,*vj,k,idx;
743: MatScalar *aa=a->a,*v,*diag;
744: PetscScalar *x,*b,x0,x1,x2,x3,x4,*t,*tp;
747: VecGetArray(bb,&b);
748: VecGetArray(xx,&x);
749: t = a->solve_work;
750: ISGetIndices(isrow,&r);
752: /* solve U^T * D * y = b by forward substitution */
753: tp = t;
754: for (k=0; k<mbs; k++) { /* t <- perm(b) */
755: idx = 5*r[k];
756: tp[0] = b[idx];
757: tp[1] = b[idx+1];
758: tp[2] = b[idx+2];
759: tp[3] = b[idx+3];
760: tp[4] = b[idx+4];
761: tp += 5;
762: }
764: for (k=0; k<mbs; k++) {
765: v = aa + 25*ai[k];
766: vj = aj + ai[k];
767: tp = t + k*5;
768: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
769: nz = ai[k+1] - ai[k];
771: tp = t + (*vj)*5;
772: while (nz--) {
773: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
774: tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
775: tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
776: tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
777: tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
778: vj++;
779: tp = t + (*vj)*5;
780: v += 25;
781: }
783: /* xk = inv(Dk)*(Dk*xk) */
784: diag = aa+k*25; /* ptr to inv(Dk) */
785: tp = t + k*5;
786: tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
787: tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
788: tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
789: tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
790: tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
791: }
793: /* solve U*x = y by back substitution */
794: for (k=mbs-1; k>=0; k--) {
795: v = aa + 25*ai[k];
796: vj = aj + ai[k];
797: tp = t + k*5;
798: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
799: nz = ai[k+1] - ai[k];
801: tp = t + (*vj)*5;
802: while (nz--) {
803: /* xk += U(k,:)*x(:) */
804: x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
805: x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
806: x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
807: x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
808: x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
809: vj++;
810: tp = t + (*vj)*5;
811: v += 25;
812: }
813: tp = t + k*5;
814: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
815: idx = 5*r[k];
816: x[idx] = x0;
817: x[idx+1] = x1;
818: x[idx+2] = x2;
819: x[idx+3] = x3;
820: x[idx+4] = x4;
821: }
823: ISRestoreIndices(isrow,&r);
824: VecRestoreArray(bb,&b);
825: VecRestoreArray(xx,&x);
826: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
827: return(0);
828: }
832: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
833: {
834: MatScalar *v,*diag;
835: PetscScalar *xp,x0,x1,x2,x3,x4;
836: PetscInt nz,*vj,k;
839: for (k=0; k<mbs; k++) {
840: v = aa + 25*ai[k];
841: xp = x + k*5;
842: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
843: nz = ai[k+1] - ai[k];
844: vj = aj + ai[k];
845: xp = x + (*vj)*5;
846: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
847: PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
848: while (nz--) {
849: /* x(:) += U(k,:)^T*(Dk*xk) */
850: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
851: xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
852: xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
853: xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
854: xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
855: vj++;
856: xp = x + (*vj)*5;
857: v += 25;
858: }
859: /* xk = inv(Dk)*(Dk*xk) */
860: diag = aa+k*25; /* ptr to inv(Dk) */
861: xp = x + k*5;
862: xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
863: xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
864: xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
865: xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
866: xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
867: }
868: return(0);
869: }
873: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
874: {
875: MatScalar *v;
876: PetscScalar *xp,x0,x1,x2,x3,x4;
877: PetscInt nz,*vj,k;
880: for (k=mbs-1; k>=0; k--) {
881: v = aa + 25*ai[k];
882: xp = x + k*5;
883: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
884: nz = ai[k+1] - ai[k];
885: vj = aj + ai[k];
886: xp = x + (*vj)*5;
887: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
888: PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
889: while (nz--) {
890: /* xk += U(k,:)*x(:) */
891: x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
892: x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
893: x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
894: x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
895: x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
896: vj++;
897: v += 25; xp = x + (*vj)*5;
898: }
899: xp = x + k*5;
900: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
901: }
902: return(0);
903: }
907: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
908: {
909: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
910: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
911: MatScalar *aa=a->a;
912: PetscScalar *x,*b;
916: VecGetArray(bb,&b);
917: VecGetArray(xx,&x);
919: /* solve U^T * D * y = b by forward substitution */
920: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
921: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
923: /* solve U*x = y by back substitution */
924: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
926: VecRestoreArray(bb,&b);
927: VecRestoreArray(xx,&x);
928: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
929: return(0);
930: }
934: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
935: {
936: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
937: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
938: MatScalar *aa=a->a;
939: PetscScalar *x,*b;
943: VecGetArray(bb,&b);
944: VecGetArray(xx,&x);
945: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
946: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
947: VecRestoreArray(bb,&b);
948: VecRestoreArray(xx,&x);
949: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
950: return(0);
951: }
955: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
956: {
957: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
958: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
959: MatScalar *aa=a->a;
960: PetscScalar *x,*b;
964: VecGetArray(bb,&b);
965: VecGetArray(xx,&x);
966: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));
967: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
968: VecRestoreArray(bb,&b);
969: VecRestoreArray(xx,&x);
970: PetscLogFlops(2.0*bs2*(a->nz-mbs));
971: return(0);
972: }
976: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
977: {
978: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
979: IS isrow=a->row;
980: PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
982: const PetscInt *r;
983: PetscInt nz,*vj,k,idx;
984: MatScalar *aa=a->a,*v,*diag;
985: PetscScalar *x,*b,x0,x1,x2,x3,*t,*tp;
988: VecGetArray(bb,&b);
989: VecGetArray(xx,&x);
990: t = a->solve_work;
991: ISGetIndices(isrow,&r);
993: /* solve U^T * D * y = b by forward substitution */
994: tp = t;
995: for (k=0; k<mbs; k++) { /* t <- perm(b) */
996: idx = 4*r[k];
997: tp[0] = b[idx];
998: tp[1] = b[idx+1];
999: tp[2] = b[idx+2];
1000: tp[3] = b[idx+3];
1001: tp += 4;
1002: }
1004: for (k=0; k<mbs; k++) {
1005: v = aa + 16*ai[k];
1006: vj = aj + ai[k];
1007: tp = t + k*4;
1008: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
1009: nz = ai[k+1] - ai[k];
1011: tp = t + (*vj)*4;
1012: while (nz--) {
1013: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1014: tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1015: tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1016: tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1017: vj++;
1018: tp = t + (*vj)*4;
1019: v += 16;
1020: }
1022: /* xk = inv(Dk)*(Dk*xk) */
1023: diag = aa+k*16; /* ptr to inv(Dk) */
1024: tp = t + k*4;
1025: tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1026: tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1027: tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1028: tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1029: }
1031: /* solve U*x = y by back substitution */
1032: for (k=mbs-1; k>=0; k--) {
1033: v = aa + 16*ai[k];
1034: vj = aj + ai[k];
1035: tp = t + k*4;
1036: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1037: nz = ai[k+1] - ai[k];
1039: tp = t + (*vj)*4;
1040: while (nz--) {
1041: /* xk += U(k,:)*x(:) */
1042: x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1043: x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1044: x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1045: x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1046: vj++;
1047: tp = t + (*vj)*4;
1048: v += 16;
1049: }
1050: tp = t + k*4;
1051: tp[0] =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1052: idx = 4*r[k];
1053: x[idx] = x0;
1054: x[idx+1] = x1;
1055: x[idx+2] = x2;
1056: x[idx+3] = x3;
1057: }
1059: ISRestoreIndices(isrow,&r);
1060: VecRestoreArray(bb,&b);
1061: VecRestoreArray(xx,&x);
1062: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1063: return(0);
1064: }
1068: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1069: {
1070: MatScalar *v,*diag;
1071: PetscScalar *xp,x0,x1,x2,x3;
1072: PetscInt nz,*vj,k;
1075: for (k=0; k<mbs; k++) {
1076: v = aa + 16*ai[k];
1077: xp = x + k*4;
1078: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1079: nz = ai[k+1] - ai[k];
1080: vj = aj + ai[k];
1081: xp = x + (*vj)*4;
1082: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1083: PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1084: while (nz--) {
1085: /* x(:) += U(k,:)^T*(Dk*xk) */
1086: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1087: xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1088: xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1089: xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1090: vj++;
1091: xp = x + (*vj)*4;
1092: v += 16;
1093: }
1094: /* xk = inv(Dk)*(Dk*xk) */
1095: diag = aa+k*16; /* ptr to inv(Dk) */
1096: xp = x + k*4;
1097: xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1098: xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1099: xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1100: xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1101: }
1102: return(0);
1103: }
1107: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1108: {
1109: MatScalar *v;
1110: PetscScalar *xp,x0,x1,x2,x3;
1111: PetscInt nz,*vj,k;
1114: for (k=mbs-1; k>=0; k--) {
1115: v = aa + 16*ai[k];
1116: xp = x + k*4;
1117: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1118: nz = ai[k+1] - ai[k];
1119: vj = aj + ai[k];
1120: xp = x + (*vj)*4;
1121: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1122: PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1123: while (nz--) {
1124: /* xk += U(k,:)*x(:) */
1125: x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1126: x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1127: x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1128: x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1129: vj++;
1130: v += 16; xp = x + (*vj)*4;
1131: }
1132: xp = x + k*4;
1133: xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1134: }
1135: return(0);
1136: }
1140: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1141: {
1142: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1143: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1144: MatScalar *aa=a->a;
1145: PetscScalar *x,*b;
1149: VecGetArray(bb,&b);
1150: VecGetArray(xx,&x);
1152: /* solve U^T * D * y = b by forward substitution */
1153: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1154: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1156: /* solve U*x = y by back substitution */
1157: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1158: VecRestoreArray(bb,&b);
1159: VecRestoreArray(xx,&x);
1160: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1161: return(0);
1162: }
1166: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1167: {
1168: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1169: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1170: MatScalar *aa=a->a;
1171: PetscScalar *x,*b;
1175: VecGetArray(bb,&b);
1176: VecGetArray(xx,&x);
1177: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1178: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1179: VecRestoreArray(bb,&b);
1180: VecRestoreArray(xx,&x);
1181: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1182: return(0);
1183: }
1187: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1188: {
1189: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1190: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1191: MatScalar *aa=a->a;
1192: PetscScalar *x,*b;
1196: VecGetArray(bb,&b);
1197: VecGetArray(xx,&x);
1198: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));
1199: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1200: VecRestoreArray(bb,&b);
1201: VecRestoreArray(xx,&x);
1202: PetscLogFlops(2.0*bs2*(a->nz-mbs));
1203: return(0);
1204: }
1208: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1209: {
1210: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1211: IS isrow=a->row;
1212: PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1214: const PetscInt *r;
1215: PetscInt nz,*vj,k,idx;
1216: MatScalar *aa=a->a,*v,*diag;
1217: PetscScalar *x,*b,x0,x1,x2,*t,*tp;
1220: VecGetArray(bb,&b);
1221: VecGetArray(xx,&x);
1222: t = a->solve_work;
1223: ISGetIndices(isrow,&r);
1225: /* solve U^T * D * y = b by forward substitution */
1226: tp = t;
1227: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1228: idx = 3*r[k];
1229: tp[0] = b[idx];
1230: tp[1] = b[idx+1];
1231: tp[2] = b[idx+2];
1232: tp += 3;
1233: }
1235: for (k=0; k<mbs; k++) {
1236: v = aa + 9*ai[k];
1237: vj = aj + ai[k];
1238: tp = t + k*3;
1239: x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1240: nz = ai[k+1] - ai[k];
1242: tp = t + (*vj)*3;
1243: while (nz--) {
1244: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1245: tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1246: tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1247: vj++;
1248: tp = t + (*vj)*3;
1249: v += 9;
1250: }
1252: /* xk = inv(Dk)*(Dk*xk) */
1253: diag = aa+k*9; /* ptr to inv(Dk) */
1254: tp = t + k*3;
1255: tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1256: tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1257: tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1258: }
1260: /* solve U*x = y by back substitution */
1261: for (k=mbs-1; k>=0; k--) {
1262: v = aa + 9*ai[k];
1263: vj = aj + ai[k];
1264: tp = t + k*3;
1265: x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */
1266: nz = ai[k+1] - ai[k];
1268: tp = t + (*vj)*3;
1269: while (nz--) {
1270: /* xk += U(k,:)*x(:) */
1271: x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1272: x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1273: x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1274: vj++;
1275: tp = t + (*vj)*3;
1276: v += 9;
1277: }
1278: tp = t + k*3;
1279: tp[0] = x0; tp[1] = x1; tp[2] = x2;
1280: idx = 3*r[k];
1281: x[idx] = x0;
1282: x[idx+1] = x1;
1283: x[idx+2] = x2;
1284: }
1286: ISRestoreIndices(isrow,&r);
1287: VecRestoreArray(bb,&b);
1288: VecRestoreArray(xx,&x);
1289: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1290: return(0);
1291: }
1295: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1296: {
1297: MatScalar *v,*diag;
1298: PetscScalar *xp,x0,x1,x2;
1299: PetscInt nz,*vj,k;
1302: for (k=0; k<mbs; k++) {
1303: v = aa + 9*ai[k];
1304: xp = x + k*3;
1305: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1306: nz = ai[k+1] - ai[k];
1307: vj = aj + ai[k];
1308: xp = x + (*vj)*3;
1309: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1310: PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1311: while (nz--) {
1312: /* x(:) += U(k,:)^T*(Dk*xk) */
1313: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1314: xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1315: xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1316: vj++;
1317: xp = x + (*vj)*3;
1318: v += 9;
1319: }
1320: /* xk = inv(Dk)*(Dk*xk) */
1321: diag = aa+k*9; /* ptr to inv(Dk) */
1322: xp = x + k*3;
1323: xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1324: xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1325: xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1326: }
1327: return(0);
1328: }
1332: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1333: {
1334: MatScalar *v;
1335: PetscScalar *xp,x0,x1,x2;
1336: PetscInt nz,*vj,k;
1339: for (k=mbs-1; k>=0; k--) {
1340: v = aa + 9*ai[k];
1341: xp = x + k*3;
1342: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */
1343: nz = ai[k+1] - ai[k];
1344: vj = aj + ai[k];
1345: xp = x + (*vj)*3;
1346: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1347: PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1348: while (nz--) {
1349: /* xk += U(k,:)*x(:) */
1350: x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1351: x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1352: x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1353: vj++;
1354: v += 9; xp = x + (*vj)*3;
1355: }
1356: xp = x + k*3;
1357: xp[0] = x0; xp[1] = x1; xp[2] = x2;
1358: }
1359: return(0);
1360: }
1364: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1365: {
1366: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1367: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1368: MatScalar *aa=a->a;
1369: PetscScalar *x,*b;
1373: VecGetArray(bb,&b);
1374: VecGetArray(xx,&x);
1376: /* solve U^T * D * y = b by forward substitution */
1377: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1378: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1380: /* solve U*x = y by back substitution */
1381: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1383: VecRestoreArray(bb,&b);
1384: VecRestoreArray(xx,&x);
1385: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1386: return(0);
1387: }
1391: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1392: {
1393: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1394: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1395: MatScalar *aa=a->a;
1396: PetscScalar *x,*b;
1400: VecGetArray(bb,&b);
1401: VecGetArray(xx,&x);
1402: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1403: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1404: VecRestoreArray(bb,&b);
1405: VecRestoreArray(xx,&x);
1406: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1407: return(0);
1408: }
1412: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1413: {
1414: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1415: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1416: MatScalar *aa=a->a;
1417: PetscScalar *x,*b;
1421: VecGetArray(bb,&b);
1422: VecGetArray(xx,&x);
1423: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1424: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1425: VecRestoreArray(bb,&b);
1426: VecRestoreArray(xx,&x);
1427: PetscLogFlops(2.0*bs2*(a->nz-mbs));
1428: return(0);
1429: }
1433: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1434: {
1435: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1436: IS isrow=a->row;
1437: PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1439: const PetscInt *r;
1440: PetscInt nz,*vj,k,k2,idx;
1441: MatScalar *aa=a->a,*v,*diag;
1442: PetscScalar *x,*b,x0,x1,*t;
1445: VecGetArray(bb,&b);
1446: VecGetArray(xx,&x);
1447: t = a->solve_work;
1448: ISGetIndices(isrow,&r);
1450: /* solve U^T * D * y = perm(b) by forward substitution */
1451: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1452: idx = 2*r[k];
1453: t[k*2] = b[idx];
1454: t[k*2+1] = b[idx+1];
1455: }
1456: for (k=0; k<mbs; k++) {
1457: v = aa + 4*ai[k];
1458: vj = aj + ai[k];
1459: k2 = k*2;
1460: x0 = t[k2]; x1 = t[k2+1];
1461: nz = ai[k+1] - ai[k];
1462: while (nz--) {
1463: t[(*vj)*2] += v[0]*x0 + v[1]*x1;
1464: t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1465: vj++; v += 4;
1466: }
1467: diag = aa+k*4; /* ptr to inv(Dk) */
1468: t[k2] = diag[0]*x0 + diag[2]*x1;
1469: t[k2+1] = diag[1]*x0 + diag[3]*x1;
1470: }
1472: /* solve U*x = y by back substitution */
1473: for (k=mbs-1; k>=0; k--) {
1474: v = aa + 4*ai[k];
1475: vj = aj + ai[k];
1476: k2 = k*2;
1477: x0 = t[k2]; x1 = t[k2+1];
1478: nz = ai[k+1] - ai[k];
1479: while (nz--) {
1480: x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1481: x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1482: vj++;
1483: v += 4;
1484: }
1485: t[k2] = x0;
1486: t[k2+1] = x1;
1487: idx = 2*r[k];
1488: x[idx] = x0;
1489: x[idx+1] = x1;
1490: }
1492: ISRestoreIndices(isrow,&r);
1493: VecRestoreArray(bb,&b);
1494: VecRestoreArray(xx,&x);
1495: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1496: return(0);
1497: }
1501: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1502: {
1503: MatScalar *v,*diag;
1504: PetscScalar x0,x1;
1505: PetscInt nz,*vj,k,k2;
1508: for (k=0; k<mbs; k++) {
1509: v = aa + 4*ai[k];
1510: vj = aj + ai[k];
1511: k2 = k*2;
1512: x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */
1513: nz = ai[k+1] - ai[k];
1514: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1515: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1516: while (nz--) {
1517: /* x(:) += U(k,:)^T*(Dk*xk) */
1518: x[(*vj)*2] += v[0]*x0 + v[1]*x1;
1519: x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1520: vj++; v += 4;
1521: }
1522: /* xk = inv(Dk)*(Dk*xk) */
1523: diag = aa+k*4; /* ptr to inv(Dk) */
1524: x[k2] = diag[0]*x0 + diag[2]*x1;
1525: x[k2+1] = diag[1]*x0 + diag[3]*x1;
1526: }
1527: return(0);
1528: }
1532: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1533: {
1534: MatScalar *v;
1535: PetscScalar x0,x1;
1536: PetscInt nz,*vj,k,k2;
1539: for (k=mbs-1; k>=0; k--) {
1540: v = aa + 4*ai[k];
1541: vj = aj + ai[k];
1542: k2 = k*2;
1543: x0 = x[k2]; x1 = x[k2+1]; /* xk */
1544: nz = ai[k+1] - ai[k];
1545: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1546: PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1547: while (nz--) {
1548: /* xk += U(k,:)*x(:) */
1549: x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1550: x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1551: vj++;
1552: v += 4;
1553: }
1554: x[k2] = x0;
1555: x[k2+1] = x1;
1556: }
1557: return(0);
1558: }
1562: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1563: {
1564: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1565: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1566: MatScalar *aa=a->a;
1567: PetscScalar *x,*b;
1571: VecGetArray(bb,&b);
1572: VecGetArray(xx,&x);
1574: /* solve U^T * D * y = b by forward substitution */
1575: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1576: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1578: /* solve U*x = y by back substitution */
1579: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1581: VecRestoreArray(bb,&b);
1582: VecRestoreArray(xx,&x);
1583: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1584: return(0);
1585: }
1589: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1590: {
1591: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1592: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1593: MatScalar *aa=a->a;
1594: PetscScalar *x,*b;
1598: VecGetArray(bb,&b);
1599: VecGetArray(xx,&x);
1600: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1601: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1602: VecRestoreArray(bb,&b);
1603: VecRestoreArray(xx,&x);
1604: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1605: return(0);
1606: }
1610: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1611: {
1612: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1613: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1614: MatScalar *aa=a->a;
1615: PetscScalar *x,*b;
1619: VecGetArray(bb,&b);
1620: VecGetArray(xx,&x);
1621: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1622: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1623: VecRestoreArray(bb,&b);
1624: VecRestoreArray(xx,&x);
1625: PetscLogFlops(2.0*bs2*(a->nz - mbs));
1626: return(0);
1627: }
1631: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1632: {
1633: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1634: IS isrow=a->row;
1635: PetscErrorCode ierr;
1636: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1637: const MatScalar *aa=a->a,*v;
1638: const PetscScalar *b;
1639: PetscScalar *x,xk,*t;
1640: PetscInt nz,k,j;
1643: VecGetArrayRead(bb,&b);
1644: VecGetArray(xx,&x);
1645: t = a->solve_work;
1646: ISGetIndices(isrow,&rp);
1648: /* solve U^T*D*y = perm(b) by forward substitution */
1649: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1650: for (k=0; k<mbs; k++) {
1651: v = aa + ai[k];
1652: vj = aj + ai[k];
1653: xk = t[k];
1654: nz = ai[k+1] - ai[k] - 1;
1655: for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1656: t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */
1657: }
1659: /* solve U*perm(x) = y by back substitution */
1660: for (k=mbs-1; k>=0; k--) {
1661: v = aa + adiag[k] - 1;
1662: vj = aj + adiag[k] - 1;
1663: nz = ai[k+1] - ai[k] - 1;
1664: for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1665: x[rp[k]] = t[k];
1666: }
1668: ISRestoreIndices(isrow,&rp);
1669: VecRestoreArrayRead(bb,&b);
1670: VecRestoreArray(xx,&x);
1671: PetscLogFlops(4.0*a->nz - 3.0*mbs);
1672: return(0);
1673: }
1677: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1678: {
1679: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1680: IS isrow=a->row;
1681: PetscErrorCode ierr;
1682: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1683: const MatScalar *aa=a->a,*v;
1684: PetscScalar *x,*b,xk,*t;
1685: PetscInt nz,k;
1688: VecGetArray(bb,&b);
1689: VecGetArray(xx,&x);
1690: t = a->solve_work;
1691: ISGetIndices(isrow,&rp);
1693: /* solve U^T*D*y = perm(b) by forward substitution */
1694: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1695: for (k=0; k<mbs; k++) {
1696: v = aa + ai[k] + 1;
1697: vj = aj + ai[k] + 1;
1698: xk = t[k];
1699: nz = ai[k+1] - ai[k] - 1;
1700: while (nz--) t[*vj++] += (*v++) * xk;
1701: t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */
1702: }
1704: /* solve U*perm(x) = y by back substitution */
1705: for (k=mbs-1; k>=0; k--) {
1706: v = aa + ai[k] + 1;
1707: vj = aj + ai[k] + 1;
1708: nz = ai[k+1] - ai[k] - 1;
1709: while (nz--) t[k] += (*v++) * t[*vj++];
1710: x[rp[k]] = t[k];
1711: }
1713: ISRestoreIndices(isrow,&rp);
1714: VecRestoreArray(bb,&b);
1715: VecRestoreArray(xx,&x);
1716: PetscLogFlops(4.0*a->nz - 3*mbs);
1717: return(0);
1718: }
1722: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1723: {
1724: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1725: IS isrow=a->row;
1726: PetscErrorCode ierr;
1727: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1728: const MatScalar *aa=a->a,*v;
1729: PetscReal diagk;
1730: PetscScalar *x,*b,xk;
1731: PetscInt nz,k;
1734: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1735: VecGetArray(bb,&b);
1736: VecGetArray(xx,&x);
1737: ISGetIndices(isrow,&rp);
1739: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1740: for (k=0; k<mbs; k++) {
1741: v = aa + ai[k];
1742: vj = aj + ai[k];
1743: xk = x[k];
1744: nz = ai[k+1] - ai[k] - 1;
1745: while (nz--) x[*vj++] += (*v++) * xk;
1747: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1748: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1749: x[k] = xk*PetscSqrtReal(diagk);
1750: }
1751: ISRestoreIndices(isrow,&rp);
1752: VecRestoreArray(bb,&b);
1753: VecRestoreArray(xx,&x);
1754: PetscLogFlops(2.0*a->nz - mbs);
1755: return(0);
1756: }
1760: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1761: {
1762: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1763: IS isrow=a->row;
1764: PetscErrorCode ierr;
1765: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1766: const MatScalar *aa=a->a,*v;
1767: PetscReal diagk;
1768: PetscScalar *x,*b,xk;
1769: PetscInt nz,k;
1772: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1773: VecGetArray(bb,&b);
1774: VecGetArray(xx,&x);
1775: ISGetIndices(isrow,&rp);
1777: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1778: for (k=0; k<mbs; k++) {
1779: v = aa + ai[k] + 1;
1780: vj = aj + ai[k] + 1;
1781: xk = x[k];
1782: nz = ai[k+1] - ai[k] - 1;
1783: while (nz--) x[*vj++] += (*v++) * xk;
1785: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1786: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1787: x[k] = xk*PetscSqrtReal(diagk);
1788: }
1789: ISRestoreIndices(isrow,&rp);
1790: VecRestoreArray(bb,&b);
1791: VecRestoreArray(xx,&x);
1792: PetscLogFlops(2.0*a->nz - mbs);
1793: return(0);
1794: }
1798: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1799: {
1800: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1801: IS isrow=a->row;
1802: PetscErrorCode ierr;
1803: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1804: const MatScalar *aa=a->a,*v;
1805: PetscReal diagk;
1806: PetscScalar *x,*b,*t;
1807: PetscInt nz,k;
1810: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1811: VecGetArray(bb,&b);
1812: VecGetArray(xx,&x);
1813: t = a->solve_work;
1814: ISGetIndices(isrow,&rp);
1816: for (k=mbs-1; k>=0; k--) {
1817: v = aa + ai[k];
1818: vj = aj + ai[k];
1819: diagk = PetscRealPart(aa[adiag[k]]);
1820: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1821: t[k] = b[k] * PetscSqrtReal(diagk);
1822: nz = ai[k+1] - ai[k] - 1;
1823: while (nz--) t[k] += (*v++) * t[*vj++];
1824: x[rp[k]] = t[k];
1825: }
1826: ISRestoreIndices(isrow,&rp);
1827: VecRestoreArray(bb,&b);
1828: VecRestoreArray(xx,&x);
1829: PetscLogFlops(2.0*a->nz - mbs);
1830: return(0);
1831: }
1835: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1836: {
1837: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1838: IS isrow=a->row;
1839: PetscErrorCode ierr;
1840: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1841: const MatScalar *aa=a->a,*v;
1842: PetscReal diagk;
1843: PetscScalar *x,*b,*t;
1844: PetscInt nz,k;
1847: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1848: VecGetArray(bb,&b);
1849: VecGetArray(xx,&x);
1850: t = a->solve_work;
1851: ISGetIndices(isrow,&rp);
1853: for (k=mbs-1; k>=0; k--) {
1854: v = aa + ai[k] + 1;
1855: vj = aj + ai[k] + 1;
1856: diagk = PetscRealPart(aa[ai[k]]);
1857: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1858: t[k] = b[k] * PetscSqrtReal(diagk);
1859: nz = ai[k+1] - ai[k] - 1;
1860: while (nz--) t[k] += (*v++) * t[*vj++];
1861: x[rp[k]] = t[k];
1862: }
1863: ISRestoreIndices(isrow,&rp);
1864: VecRestoreArray(bb,&b);
1865: VecRestoreArray(xx,&x);
1866: PetscLogFlops(2.0*a->nz - mbs);
1867: return(0);
1868: }
1872: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1873: {
1874: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1878: if (A->rmap->bs == 1) {
1879: MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1880: } else {
1881: IS isrow=a->row;
1882: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1883: const MatScalar *aa=a->a,*v;
1884: PetscScalar *x,*b,*t;
1885: PetscInt nz,k,n,i,j;
1886: if (bb->n > a->solves_work_n) {
1887: PetscFree(a->solves_work);
1888: PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);
1890: a->solves_work_n = bb->n;
1891: }
1892: n = bb->n;
1893: VecGetArray(bb->v,&b);
1894: VecGetArray(xx->v,&x);
1895: t = a->solves_work;
1897: ISGetIndices(isrow,&rp);
1899: /* solve U^T*D*y = perm(b) by forward substitution */
1900: for (k=0; k<mbs; k++) {
1901: for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1902: }
1903: for (k=0; k<mbs; k++) {
1904: v = aa + ai[k];
1905: vj = aj + ai[k];
1906: nz = ai[k+1] - ai[k] - 1;
1907: for (j=0; j<nz; j++) {
1908: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1909: v++;vj++;
1910: }
1911: for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1912: }
1914: /* solve U*perm(x) = y by back substitution */
1915: for (k=mbs-1; k>=0; k--) {
1916: v = aa + ai[k] - 1;
1917: vj = aj + ai[k] - 1;
1918: nz = ai[k+1] - ai[k] - 1;
1919: for (j=0; j<nz; j++) {
1920: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1921: v++;vj++;
1922: }
1923: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1924: }
1926: ISRestoreIndices(isrow,&rp);
1927: VecRestoreArray(bb->v,&b);
1928: VecRestoreArray(xx->v,&x);
1929: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1930: }
1931: return(0);
1932: }
1936: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1937: {
1938: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1942: if (A->rmap->bs == 1) {
1943: MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1944: } else {
1945: IS isrow=a->row;
1946: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1947: const MatScalar *aa=a->a,*v;
1948: PetscScalar *x,*b,*t;
1949: PetscInt nz,k,n,i;
1950: if (bb->n > a->solves_work_n) {
1951: PetscFree(a->solves_work);
1952: PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);
1954: a->solves_work_n = bb->n;
1955: }
1956: n = bb->n;
1957: VecGetArray(bb->v,&b);
1958: VecGetArray(xx->v,&x);
1959: t = a->solves_work;
1961: ISGetIndices(isrow,&rp);
1963: /* solve U^T*D*y = perm(b) by forward substitution */
1964: for (k=0; k<mbs; k++) {
1965: for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1966: }
1967: for (k=0; k<mbs; k++) {
1968: v = aa + ai[k];
1969: vj = aj + ai[k];
1970: nz = ai[k+1] - ai[k];
1971: while (nz--) {
1972: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1973: v++;vj++;
1974: }
1975: for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1976: }
1978: /* solve U*perm(x) = y by back substitution */
1979: for (k=mbs-1; k>=0; k--) {
1980: v = aa + ai[k];
1981: vj = aj + ai[k];
1982: nz = ai[k+1] - ai[k];
1983: while (nz--) {
1984: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1985: v++;vj++;
1986: }
1987: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1988: }
1990: ISRestoreIndices(isrow,&rp);
1991: VecRestoreArray(bb->v,&b);
1992: VecRestoreArray(xx->v,&x);
1993: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1994: }
1995: return(0);
1996: }
2000: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2001: {
2002: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2003: PetscErrorCode ierr;
2004: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
2005: const MatScalar *aa=a->a,*v;
2006: const PetscScalar *b;
2007: PetscScalar *x,xi;
2008: PetscInt nz,i,j;
2011: VecGetArrayRead(bb,&b);
2012: VecGetArray(xx,&x);
2014: /* solve U^T*D*y = b by forward substitution */
2015: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2016: for (i=0; i<mbs; i++) {
2017: v = aa + ai[i];
2018: vj = aj + ai[i];
2019: xi = x[i];
2020: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2021: for (j=0; j<nz; j++) x[vj[j]] += v[j]* xi;
2022: x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2023: }
2025: /* solve U*x = y by backward substitution */
2026: for (i=mbs-2; i>=0; i--) {
2027: xi = x[i];
2028: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
2029: vj = aj + adiag[i] - 1;
2030: nz = ai[i+1] - ai[i] - 1;
2031: for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2032: x[i] = xi;
2033: }
2035: VecRestoreArrayRead(bb,&b);
2036: VecRestoreArray(xx,&x);
2037: PetscLogFlops(4.0*a->nz - 3*mbs);
2038: return(0);
2039: }
2043: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2044: {
2045: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2047: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
2048: MatScalar *aa=a->a,*v;
2049: PetscScalar *x,*b,xk;
2050: PetscInt nz,*vj,k;
2053: VecGetArray(bb,&b);
2054: VecGetArray(xx,&x);
2056: /* solve U^T*D*y = b by forward substitution */
2057: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2058: for (k=0; k<mbs; k++) {
2059: v = aa + ai[k] + 1;
2060: vj = aj + ai[k] + 1;
2061: xk = x[k];
2062: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2063: while (nz--) x[*vj++] += (*v++) * xk;
2064: x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2065: }
2067: /* solve U*x = y by back substitution */
2068: for (k=mbs-2; k>=0; k--) {
2069: v = aa + ai[k] + 1;
2070: vj = aj + ai[k] + 1;
2071: xk = x[k];
2072: nz = ai[k+1] - ai[k] - 1;
2073: while (nz--) {
2074: xk += (*v++) * x[*vj++];
2075: }
2076: x[k] = xk;
2077: }
2079: VecRestoreArray(bb,&b);
2080: VecRestoreArray(xx,&x);
2081: PetscLogFlops(4.0*a->nz - 3*mbs);
2082: return(0);
2083: }
2087: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2088: {
2089: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2090: PetscErrorCode ierr;
2091: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2092: const MatScalar *aa=a->a,*v;
2093: PetscReal diagk;
2094: PetscScalar *x,*b;
2095: PetscInt nz,*vj,k;
2098: /* solve U^T*D^(1/2)*x = b by forward substitution */
2099: VecGetArray(bb,&b);
2100: VecGetArray(xx,&x);
2101: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2102: for (k=0; k<mbs; k++) {
2103: v = aa + ai[k];
2104: vj = aj + ai[k];
2105: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2106: while (nz--) x[*vj++] += (*v++) * x[k];
2107: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2108: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2109: x[k] *= PetscSqrtReal(diagk);
2110: }
2111: VecRestoreArray(bb,&b);
2112: VecRestoreArray(xx,&x);
2113: PetscLogFlops(2.0*a->nz - mbs);
2114: return(0);
2115: }
2119: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2120: {
2121: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2123: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
2124: MatScalar *aa=a->a,*v;
2125: PetscReal diagk;
2126: PetscScalar *x,*b;
2127: PetscInt nz,*vj,k;
2130: /* solve U^T*D^(1/2)*x = b by forward substitution */
2131: VecGetArray(bb,&b);
2132: VecGetArray(xx,&x);
2133: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2134: for (k=0; k<mbs; k++) {
2135: v = aa + ai[k] + 1;
2136: vj = aj + ai[k] + 1;
2137: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2138: while (nz--) x[*vj++] += (*v++) * x[k];
2139: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2140: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2141: x[k] *= PetscSqrtReal(diagk);
2142: }
2143: VecRestoreArray(bb,&b);
2144: VecRestoreArray(xx,&x);
2145: PetscLogFlops(2.0*a->nz - mbs);
2146: return(0);
2147: }
2151: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2152: {
2153: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2155: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2156: MatScalar *aa=a->a,*v;
2157: PetscReal diagk;
2158: PetscScalar *x,*b;
2159: PetscInt nz,*vj,k;
2162: /* solve D^(1/2)*U*x = b by back substitution */
2163: VecGetArray(bb,&b);
2164: VecGetArray(xx,&x);
2166: for (k=mbs-1; k>=0; k--) {
2167: v = aa + ai[k];
2168: vj = aj + ai[k];
2169: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2170: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2171: x[k] = PetscSqrtReal(diagk)*b[k];
2172: nz = ai[k+1] - ai[k] - 1;
2173: while (nz--) x[k] += (*v++) * x[*vj++];
2174: }
2175: VecRestoreArray(bb,&b);
2176: VecRestoreArray(xx,&x);
2177: PetscLogFlops(2.0*a->nz - mbs);
2178: return(0);
2179: }
2183: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2184: {
2185: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2187: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
2188: MatScalar *aa=a->a,*v;
2189: PetscReal diagk;
2190: PetscScalar *x,*b;
2191: PetscInt nz,*vj,k;
2194: /* solve D^(1/2)*U*x = b by back substitution */
2195: VecGetArray(bb,&b);
2196: VecGetArray(xx,&x);
2198: for (k=mbs-1; k>=0; k--) {
2199: v = aa + ai[k] + 1;
2200: vj = aj + ai[k] + 1;
2201: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2202: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2203: x[k] = PetscSqrtReal(diagk)*b[k];
2204: nz = ai[k+1] - ai[k] - 1;
2205: while (nz--) x[k] += (*v++) * x[*vj++];
2206: }
2207: VecRestoreArray(bb,&b);
2208: VecRestoreArray(xx,&x);
2209: PetscLogFlops(2.0*a->nz - mbs);
2210: return(0);
2211: }
2213: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2216: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2217: {
2218: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2220: const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2221: PetscInt *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
2222: PetscInt m,reallocs = 0,*levtmp;
2223: PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2224: PetscInt incrlev,*lev,shift,prow,nz;
2225: PetscReal f = info->fill,levels = info->levels;
2226: PetscBool perm_identity;
2229: /* check whether perm is the identity mapping */
2230: ISIdentity(perm,&perm_identity);
2232: if (perm_identity) {
2233: a->permute = PETSC_FALSE;
2234: ai = a->i; aj = a->j;
2235: } else { /* non-trivial permutation */
2236: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2237: a->permute = PETSC_TRUE;
2238: MatReorderingSeqSBAIJ(A, perm);
2239: ai = a->inew; aj = a->jnew;
2240: }
2242: /* initialization */
2243: ISGetIndices(perm,&rip);
2244: umax = (PetscInt)(f*ai[mbs] + 1);
2245: PetscMalloc(umax*sizeof(PetscInt),&lev);
2246: umax += mbs + 1;
2247: shift = mbs + 1;
2248: PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
2249: PetscMalloc(umax*sizeof(PetscInt),&ju);
2250: iu[0] = mbs + 1;
2251: juidx = mbs + 1;
2252: /* prowl: linked list for pivot row */
2253: PetscMalloc3(mbs,PetscInt,&prowl,mbs,PetscInt,&q,mbs,PetscInt,&levtmp);
2254: /* q: linked list for col index */
2256: for (i=0; i<mbs; i++) {
2257: prowl[i] = mbs;
2258: q[i] = 0;
2259: }
2261: /* for each row k */
2262: for (k=0; k<mbs; k++) {
2263: nzk = 0;
2264: q[k] = mbs;
2265: /* copy current row into linked list */
2266: nz = ai[rip[k]+1] - ai[rip[k]];
2267: j = ai[rip[k]];
2268: while (nz--) {
2269: vj = rip[aj[j++]];
2270: if (vj > k) {
2271: qm = k;
2272: do {
2273: m = qm; qm = q[m];
2274: } while (qm < vj);
2275: if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2276: nzk++;
2277: q[m] = vj;
2278: q[vj] = qm;
2279: levtmp[vj] = 0; /* initialize lev for nonzero element */
2280: }
2281: }
2283: /* modify nonzero structure of k-th row by computing fill-in
2284: for each row prow to be merged in */
2285: prow = k;
2286: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2288: while (prow < k) {
2289: /* merge row prow into k-th row */
2290: jmin = iu[prow] + 1;
2291: jmax = iu[prow+1];
2292: qm = k;
2293: for (j=jmin; j<jmax; j++) {
2294: incrlev = lev[j-shift] + 1;
2295: if (incrlev > levels) continue;
2296: vj = ju[j];
2297: do {
2298: m = qm; qm = q[m];
2299: } while (qm < vj);
2300: if (qm != vj) { /* a fill */
2301: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2302: levtmp[vj] = incrlev;
2303: } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2304: }
2305: prow = prowl[prow]; /* next pivot row */
2306: }
2308: /* add k to row list for first nonzero element in k-th row */
2309: if (nzk > 1) {
2310: i = q[k]; /* col value of first nonzero element in k_th row of U */
2311: prowl[k] = prowl[i]; prowl[i] = k;
2312: }
2313: iu[k+1] = iu[k] + nzk;
2315: /* allocate more space to ju and lev if needed */
2316: if (iu[k+1] > umax) {
2317: /* estimate how much additional space we will need */
2318: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2319: /* just double the memory each time */
2320: maxadd = umax;
2321: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2322: umax += maxadd;
2324: /* allocate a longer ju */
2325: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2326: PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
2327: PetscFree(ju);
2328: ju = jutmp;
2330: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2331: PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
2332: PetscFree(lev);
2333: lev = jutmp;
2334: reallocs += 2; /* count how many times we realloc */
2335: }
2337: /* save nonzero structure of k-th row in ju */
2338: i=k;
2339: while (nzk--) {
2340: i = q[i];
2341: ju[juidx] = i;
2342: lev[juidx-shift] = levtmp[i];
2343: juidx++;
2344: }
2345: }
2347: #if defined(PETSC_USE_INFO)
2348: if (ai[mbs] != 0) {
2349: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2350: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
2351: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2352: PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
2353: PetscInfo(A,"for best performance.\n");
2354: } else {
2355: PetscInfo(A,"Empty matrix.\n");
2356: }
2357: #endif
2359: ISRestoreIndices(perm,&rip);
2360: PetscFree3(prowl,q,levtmp);
2361: PetscFree(lev);
2363: /* put together the new matrix */
2364: MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,NULL);
2366: /* PetscLogObjectParent(B,iperm); */
2367: b = (Mat_SeqSBAIJ*)(B)->data;
2368: PetscFree2(b->imax,b->ilen);
2370: b->singlemalloc = PETSC_FALSE;
2371: b->free_a = PETSC_TRUE;
2372: b->free_ij = PETSC_TRUE;
2373: /* the next line frees the default space generated by the Create() */
2374: PetscFree3(b->a,b->j,b->i);
2375: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
2376: b->j = ju;
2377: b->i = iu;
2378: b->diag = 0;
2379: b->ilen = 0;
2380: b->imax = 0;
2382: ISDestroy(&b->row);
2383: ISDestroy(&b->icol);
2384: b->row = perm;
2385: b->icol = perm;
2386: PetscObjectReference((PetscObject)perm);
2387: PetscObjectReference((PetscObject)perm);
2388: PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
2389: /* In b structure: Free imax, ilen, old a, old j.
2390: Allocate idnew, solve_work, new a, new j */
2391: PetscLogObjectMemory(B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2392: b->maxnz = b->nz = iu[mbs];
2394: (B)->info.factor_mallocs = reallocs;
2395: (B)->info.fill_ratio_given = f;
2396: if (ai[mbs] != 0) {
2397: (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2398: } else {
2399: (B)->info.fill_ratio_needed = 0.0;
2400: }
2401: MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2402: return(0);
2403: }
2405: /*
2406: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2407: */
2408: #include <petscbt.h>
2409: #include <../src/mat/utils/freespace.h>
2412: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2413: {
2414: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2415: PetscErrorCode ierr;
2416: PetscBool perm_identity,free_ij = PETSC_TRUE,missing;
2417: PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2418: const PetscInt *rip;
2419: PetscInt reallocs=0,i,*ui,*udiag,*cols;
2420: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2421: PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2422: PetscReal fill =info->fill,levels=info->levels;
2423: PetscFreeSpaceList free_space =NULL,current_space=NULL;
2424: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2425: PetscBT lnkbt;
2428: if (bs > 1) {
2429: MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2430: return(0);
2431: }
2432: if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2433: MatMissingDiagonal(A,&missing,&d);
2434: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2436: /* check whether perm is the identity mapping */
2437: ISIdentity(perm,&perm_identity);
2438: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2439: a->permute = PETSC_FALSE;
2441: PetscMalloc((am+1)*sizeof(PetscInt),&ui);
2442: PetscMalloc((am+1)*sizeof(PetscInt),&udiag);
2443: ui[0] = 0;
2445: /* ICC(0) without matrix ordering: simply rearrange column indices */
2446: if (!levels) {
2447: /* reuse the column pointers and row offsets for memory savings */
2448: for (i=0; i<am; i++) {
2449: ncols = ai[i+1] - ai[i];
2450: ui[i+1] = ui[i] + ncols;
2451: udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2452: }
2453: PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2454: cols = uj;
2455: for (i=0; i<am; i++) {
2456: aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2457: ncols = ai[i+1] - ai[i] -1;
2458: for (j=0; j<ncols; j++) *cols++ = aj[j];
2459: *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */
2460: }
2461: } else { /* case: levels>0 */
2462: ISGetIndices(perm,&rip);
2464: /* initialization */
2465: /* jl: linked list for storing indices of the pivot rows
2466: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2467: PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);
2468: for (i=0; i<am; i++) {
2469: jl[i] = am; il[i] = 0;
2470: }
2472: /* create and initialize a linked list for storing column indices of the active row k */
2473: nlnk = am + 1;
2474: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2476: /* initial FreeSpace size is fill*(ai[am]+1) */
2477: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);
2479: current_space = free_space;
2481: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);
2483: current_space_lvl = free_space_lvl;
2485: for (k=0; k<am; k++) { /* for each active row k */
2486: /* initialize lnk by the column indices of row k */
2487: nzk = 0;
2488: ncols = ai[k+1] - ai[k];
2489: if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2490: cols = aj+ai[k];
2491: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2492: nzk += nlnk;
2494: /* update lnk by computing fill-in for each pivot row to be merged in */
2495: prow = jl[k]; /* 1st pivot row */
2497: while (prow < k) {
2498: nextprow = jl[prow];
2500: /* merge prow into k-th row */
2501: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2502: jmax = ui[prow+1];
2503: ncols = jmax-jmin;
2504: i = jmin - ui[prow];
2506: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2507: uj = uj_lvl_ptr[prow] + i; /* levels of cols */
2508: j = *(uj - 1);
2509: PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2510: nzk += nlnk;
2512: /* update il and jl for prow */
2513: if (jmin < jmax) {
2514: il[prow] = jmin;
2516: j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2517: }
2518: prow = nextprow;
2519: }
2521: /* if free space is not available, make more free space */
2522: if (current_space->local_remaining<nzk) {
2523: i = am - k + 1; /* num of unfactored rows */
2524: i *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */
2525: PetscFreeSpaceGet(i,¤t_space);
2526: PetscFreeSpaceGet(i,¤t_space_lvl);
2527: reallocs++;
2528: }
2530: /* copy data into free_space and free_space_lvl, then initialize lnk */
2531: if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2532: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2534: /* add the k-th row into il and jl */
2535: if (nzk > 1) {
2536: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2537: jl[k] = jl[i]; jl[i] = k;
2538: il[k] = ui[k] + 1;
2539: }
2540: uj_ptr[k] = current_space->array;
2541: uj_lvl_ptr[k] = current_space_lvl->array;
2543: current_space->array += nzk;
2544: current_space->local_used += nzk;
2545: current_space->local_remaining -= nzk;
2546: current_space_lvl->array += nzk;
2547: current_space_lvl->local_used += nzk;
2548: current_space_lvl->local_remaining -= nzk;
2550: ui[k+1] = ui[k] + nzk;
2551: }
2553: ISRestoreIndices(perm,&rip);
2554: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2556: /* destroy list of free space and other temporary array(s) */
2557: PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2558: PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor */
2559: PetscIncompleteLLDestroy(lnk,lnkbt);
2560: PetscFreeSpaceDestroy(free_space_lvl);
2562: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2564: /* put together the new matrix in MATSEQSBAIJ format */
2565: MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);
2567: b = (Mat_SeqSBAIJ*)(fact)->data;
2568: PetscFree2(b->imax,b->ilen);
2570: b->singlemalloc = PETSC_FALSE;
2571: b->free_a = PETSC_TRUE;
2572: b->free_ij = free_ij;
2574: PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);
2576: b->j = uj;
2577: b->i = ui;
2578: b->diag = udiag;
2579: b->free_diag = PETSC_TRUE;
2580: b->ilen = 0;
2581: b->imax = 0;
2582: b->row = perm;
2583: b->col = perm;
2585: PetscObjectReference((PetscObject)perm);
2586: PetscObjectReference((PetscObject)perm);
2588: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2590: PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
2591: PetscLogObjectMemory(fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));
2593: b->maxnz = b->nz = ui[am];
2595: fact->info.factor_mallocs = reallocs;
2596: fact->info.fill_ratio_given = fill;
2597: if (ai[am] != 0) {
2598: fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2599: } else {
2600: fact->info.fill_ratio_needed = 0.0;
2601: }
2602: #if defined(PETSC_USE_INFO)
2603: if (ai[am] != 0) {
2604: PetscReal af = fact->info.fill_ratio_needed;
2605: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
2606: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2607: PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
2608: } else {
2609: PetscInfo(A,"Empty matrix.\n");
2610: }
2611: #endif
2612: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2613: return(0);
2614: }
2618: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2619: {
2620: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2621: Mat_SeqSBAIJ *b;
2622: PetscErrorCode ierr;
2623: PetscBool perm_identity,free_ij = PETSC_TRUE,missing;
2624: PetscInt bs=A->rmap->bs,am=a->mbs,d;
2625: const PetscInt *cols,*rip,*ai=a->i,*aj=a->j;
2626: PetscInt reallocs=0,i,*ui;
2627: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2628: PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2629: PetscReal fill =info->fill,levels=info->levels,ratio_needed;
2630: PetscFreeSpaceList free_space =NULL,current_space=NULL;
2631: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2632: PetscBT lnkbt;
2635: MatMissingDiagonal(A,&missing,&d);
2636: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2638: /*
2639: This code originally uses Modified Sparse Row (MSR) storage
2640: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2641: Then it is rewritten so the factor B takes seqsbaij format. However the associated
2642: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2643: thus the original code in MSR format is still used for these cases.
2644: The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2645: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2646: */
2647: if (bs > 1) {
2648: MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2649: return(0);
2650: }
2652: /* check whether perm is the identity mapping */
2653: ISIdentity(perm,&perm_identity);
2654: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2655: a->permute = PETSC_FALSE;
2657: /* special case that simply copies fill pattern */
2658: if (!levels) {
2659: /* reuse the column pointers and row offsets for memory savings */
2660: ui = a->i;
2661: uj = a->j;
2662: free_ij = PETSC_FALSE;
2663: ratio_needed = 1.0;
2664: } else { /* case: levels>0 */
2665: ISGetIndices(perm,&rip);
2667: /* initialization */
2668: PetscMalloc((am+1)*sizeof(PetscInt),&ui);
2669: ui[0] = 0;
2671: /* jl: linked list for storing indices of the pivot rows
2672: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2673: PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);
2674: for (i=0; i<am; i++) {
2675: jl[i] = am; il[i] = 0;
2676: }
2678: /* create and initialize a linked list for storing column indices of the active row k */
2679: nlnk = am + 1;
2680: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2682: /* initial FreeSpace size is fill*(ai[am]+1) */
2683: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);
2685: current_space = free_space;
2687: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);
2689: current_space_lvl = free_space_lvl;
2691: for (k=0; k<am; k++) { /* for each active row k */
2692: /* initialize lnk by the column indices of row rip[k] */
2693: nzk = 0;
2694: ncols = ai[rip[k]+1] - ai[rip[k]];
2695: cols = aj+ai[rip[k]];
2696: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2697: nzk += nlnk;
2699: /* update lnk by computing fill-in for each pivot row to be merged in */
2700: prow = jl[k]; /* 1st pivot row */
2702: while (prow < k) {
2703: nextprow = jl[prow];
2705: /* merge prow into k-th row */
2706: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2707: jmax = ui[prow+1];
2708: ncols = jmax-jmin;
2709: i = jmin - ui[prow];
2710: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2711: j = *(uj_lvl_ptr[prow] + i - 1);
2712: cols_lvl = uj_lvl_ptr[prow]+i;
2713: PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2714: nzk += nlnk;
2716: /* update il and jl for prow */
2717: if (jmin < jmax) {
2718: il[prow] = jmin;
2719: j = *cols;
2720: jl[prow] = jl[j];
2721: jl[j] = prow;
2722: }
2723: prow = nextprow;
2724: }
2726: /* if free space is not available, make more free space */
2727: if (current_space->local_remaining<nzk) {
2728: i = am - k + 1; /* num of unfactored rows */
2729: i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2730: PetscFreeSpaceGet(i,¤t_space);
2731: PetscFreeSpaceGet(i,¤t_space_lvl);
2732: reallocs++;
2733: }
2735: /* copy data into free_space and free_space_lvl, then initialize lnk */
2736: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2738: /* add the k-th row into il and jl */
2739: if (nzk-1 > 0) {
2740: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2741: jl[k] = jl[i]; jl[i] = k;
2742: il[k] = ui[k] + 1;
2743: }
2744: uj_ptr[k] = current_space->array;
2745: uj_lvl_ptr[k] = current_space_lvl->array;
2747: current_space->array += nzk;
2748: current_space->local_used += nzk;
2749: current_space->local_remaining -= nzk;
2750: current_space_lvl->array += nzk;
2751: current_space_lvl->local_used += nzk;
2752: current_space_lvl->local_remaining -= nzk;
2754: ui[k+1] = ui[k] + nzk;
2755: }
2757: ISRestoreIndices(perm,&rip);
2758: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2760: /* destroy list of free space and other temporary array(s) */
2761: PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2762: PetscFreeSpaceContiguous(&free_space,uj);
2763: PetscIncompleteLLDestroy(lnk,lnkbt);
2764: PetscFreeSpaceDestroy(free_space_lvl);
2765: if (ai[am] != 0) {
2766: ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2767: } else {
2768: ratio_needed = 0.0;
2769: }
2770: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2772: /* put together the new matrix in MATSEQSBAIJ format */
2773: MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);
2775: b = (Mat_SeqSBAIJ*)(fact)->data;
2777: PetscFree2(b->imax,b->ilen);
2779: b->singlemalloc = PETSC_FALSE;
2780: b->free_a = PETSC_TRUE;
2781: b->free_ij = free_ij;
2783: PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);
2785: b->j = uj;
2786: b->i = ui;
2787: b->diag = 0;
2788: b->ilen = 0;
2789: b->imax = 0;
2790: b->row = perm;
2791: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2793: PetscObjectReference((PetscObject)perm);
2794: b->icol = perm;
2795: PetscObjectReference((PetscObject)perm);
2796: PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
2798: b->maxnz = b->nz = ui[am];
2800: fact->info.factor_mallocs = reallocs;
2801: fact->info.fill_ratio_given = fill;
2802: fact->info.fill_ratio_needed = ratio_needed;
2803: #if defined(PETSC_USE_INFO)
2804: if (ai[am] != 0) {
2805: PetscReal af = fact->info.fill_ratio_needed;
2806: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
2807: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2808: PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
2809: } else {
2810: PetscInfo(A,"Empty matrix.\n");
2811: }
2812: #endif
2813: if (perm_identity) {
2814: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2815: } else {
2816: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2817: }
2818: return(0);
2819: }