Actual source code: baijsolvnat.c
petsc-3.9.4 2018-09-11
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */
6: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
7: {
8: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
9: PetscErrorCode ierr;
10: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
11: PetscInt i,nz,idx,idt,m;
12: const MatScalar *aa=a->a,*v;
13: PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
14: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
15: PetscScalar *x;
16: const PetscScalar *b;
19: VecGetArrayRead(bb,&b);
20: VecGetArray(xx,&x);
22: /* forward solve the lower triangular */
23: idx = 0;
24: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx];
25: x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] = b[8+idx]; x[9] = b[9+idx];
26: x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
28: for (i=1; i<n; i++) {
29: v = aa + bs2*ai[i];
30: vi = aj + ai[i];
31: nz = ai[i+1] - ai[i];
32: idt = bs*i;
33: s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt];
34: s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt];
35: s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
36: for (m=0; m<nz; m++) {
37: idx = bs*vi[m];
38: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
39: x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx];
40: x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
43: s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
44: s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
45: s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
46: s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
47: s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
48: s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
49: s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
50: s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
51: s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
52: s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
53: s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
54: s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
55: s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
56: s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
57: s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
59: v += bs2;
60: }
61: x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5;
62: x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10;
63: x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
65: }
66: /* backward solve the upper triangular */
67: for (i=n-1; i>=0; i--) {
68: v = aa + bs2*(adiag[i+1]+1);
69: vi = aj + adiag[i+1]+1;
70: nz = adiag[i] - adiag[i+1] - 1;
71: idt = bs*i;
72: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
73: s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt];
74: s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
76: for (m=0; m<nz; m++) {
77: idx = bs*vi[m];
78: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
79: x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx];
80: x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
82: s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
83: s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
84: s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
85: s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
86: s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
87: s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
88: s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
89: s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
90: s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
91: s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
92: s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
93: s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
94: s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
95: s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
96: s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
98: v += bs2;
99: }
101: x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
102: x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
103: x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
104: x[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
105: x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
106: x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
107: x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
108: x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
109: x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
110: x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
111: x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
112: x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
113: x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
114: x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
115: x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
117: }
119: VecRestoreArrayRead(bb,&b);
120: VecRestoreArray(xx,&x);
121: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
122: return(0);
123: }
125: /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at at time */
126: /* Default MatSolve for block size 15 */
128: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
129: {
130: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
131: PetscErrorCode ierr;
132: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
133: PetscInt i,k,nz,idx,idt,m;
134: const MatScalar *aa=a->a,*v;
135: PetscScalar s[15];
136: PetscScalar *x,xv;
137: const PetscScalar *b;
140: VecGetArrayRead(bb,&b);
141: VecGetArray(xx,&x);
143: /* forward solve the lower triangular */
144: for (i=0; i<n; i++) {
145: v = aa + bs2*ai[i];
146: vi = aj + ai[i];
147: nz = ai[i+1] - ai[i];
148: idt = bs*i;
149: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
150: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
151: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
152: for (m=0; m<nz; m++) {
153: idx = bs*vi[m];
154: for (k=0; k<15; k++) {
155: xv = x[k + idx];
156: x[idt] -= v[0]*xv;
157: x[1+idt] -= v[1]*xv;
158: x[2+idt] -= v[2]*xv;
159: x[3+idt] -= v[3]*xv;
160: x[4+idt] -= v[4]*xv;
161: x[5+idt] -= v[5]*xv;
162: x[6+idt] -= v[6]*xv;
163: x[7+idt] -= v[7]*xv;
164: x[8+idt] -= v[8]*xv;
165: x[9+idt] -= v[9]*xv;
166: x[10+idt] -= v[10]*xv;
167: x[11+idt] -= v[11]*xv;
168: x[12+idt] -= v[12]*xv;
169: x[13+idt] -= v[13]*xv;
170: x[14+idt] -= v[14]*xv;
171: v += 15;
172: }
173: }
174: }
175: /* backward solve the upper triangular */
176: for (i=n-1; i>=0; i--) {
177: v = aa + bs2*(adiag[i+1]+1);
178: vi = aj + adiag[i+1]+1;
179: nz = adiag[i] - adiag[i+1] - 1;
180: idt = bs*i;
181: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
182: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
183: s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
185: for (m=0; m<nz; m++) {
186: idx = bs*vi[m];
187: for (k=0; k<15; k++) {
188: xv = x[k + idx];
189: s[0] -= v[0]*xv;
190: s[1] -= v[1]*xv;
191: s[2] -= v[2]*xv;
192: s[3] -= v[3]*xv;
193: s[4] -= v[4]*xv;
194: s[5] -= v[5]*xv;
195: s[6] -= v[6]*xv;
196: s[7] -= v[7]*xv;
197: s[8] -= v[8]*xv;
198: s[9] -= v[9]*xv;
199: s[10] -= v[10]*xv;
200: s[11] -= v[11]*xv;
201: s[12] -= v[12]*xv;
202: s[13] -= v[13]*xv;
203: s[14] -= v[14]*xv;
204: v += 15;
205: }
206: }
207: PetscMemzero(x+idt,bs*sizeof(MatScalar));
208: for (k=0; k<15; k++) {
209: x[idt] += v[0]*s[k];
210: x[1+idt] += v[1]*s[k];
211: x[2+idt] += v[2]*s[k];
212: x[3+idt] += v[3]*s[k];
213: x[4+idt] += v[4]*s[k];
214: x[5+idt] += v[5]*s[k];
215: x[6+idt] += v[6]*s[k];
216: x[7+idt] += v[7]*s[k];
217: x[8+idt] += v[8]*s[k];
218: x[9+idt] += v[9]*s[k];
219: x[10+idt] += v[10]*s[k];
220: x[11+idt] += v[11]*s[k];
221: x[12+idt] += v[12]*s[k];
222: x[13+idt] += v[13]*s[k];
223: x[14+idt] += v[14]*s[k];
224: v += 15;
225: }
226: }
227: VecRestoreArrayRead(bb,&b);
228: VecRestoreArray(xx,&x);
229: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
230: return(0);
231: }
233: /* Block operations are done by accessing one column at at time */
234: /* Default MatSolve for block size 14 */
236: PetscErrorCode MatSolve_SeqBAIJ_14_NaturalOrdering(Mat A,Vec bb,Vec xx)
237: {
238: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
239: PetscErrorCode ierr;
240: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
241: PetscInt i,k,nz,idx,idt,m;
242: const MatScalar *aa=a->a,*v;
243: PetscScalar s[14];
244: PetscScalar *x,xv;
245: const PetscScalar *b;
248: VecGetArrayRead(bb,&b);
249: VecGetArray(xx,&x);
251: /* forward solve the lower triangular */
252: for (i=0; i<n; i++) {
253: v = aa + bs2*ai[i];
254: vi = aj + ai[i];
255: nz = ai[i+1] - ai[i];
256: idt = bs*i;
257: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
258: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
259: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt];
260: for (m=0; m<nz; m++) {
261: idx = bs*vi[m];
262: for (k=0; k<bs; k++) {
263: xv = x[k + idx];
264: x[idt] -= v[0]*xv;
265: x[1+idt] -= v[1]*xv;
266: x[2+idt] -= v[2]*xv;
267: x[3+idt] -= v[3]*xv;
268: x[4+idt] -= v[4]*xv;
269: x[5+idt] -= v[5]*xv;
270: x[6+idt] -= v[6]*xv;
271: x[7+idt] -= v[7]*xv;
272: x[8+idt] -= v[8]*xv;
273: x[9+idt] -= v[9]*xv;
274: x[10+idt] -= v[10]*xv;
275: x[11+idt] -= v[11]*xv;
276: x[12+idt] -= v[12]*xv;
277: x[13+idt] -= v[13]*xv;
278: v += 14;
279: }
280: }
281: }
282: /* backward solve the upper triangular */
283: for (i=n-1; i>=0; i--) {
284: v = aa + bs2*(adiag[i+1]+1);
285: vi = aj + adiag[i+1]+1;
286: nz = adiag[i] - adiag[i+1] - 1;
287: idt = bs*i;
288: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
289: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
290: s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt];
292: for (m=0; m<nz; m++) {
293: idx = bs*vi[m];
294: for (k=0; k<bs; k++) {
295: xv = x[k + idx];
296: s[0] -= v[0]*xv;
297: s[1] -= v[1]*xv;
298: s[2] -= v[2]*xv;
299: s[3] -= v[3]*xv;
300: s[4] -= v[4]*xv;
301: s[5] -= v[5]*xv;
302: s[6] -= v[6]*xv;
303: s[7] -= v[7]*xv;
304: s[8] -= v[8]*xv;
305: s[9] -= v[9]*xv;
306: s[10] -= v[10]*xv;
307: s[11] -= v[11]*xv;
308: s[12] -= v[12]*xv;
309: s[13] -= v[13]*xv;
310: v += 14;
311: }
312: }
313: PetscMemzero(x+idt,bs*sizeof(MatScalar));
314: for (k=0; k<bs; k++) {
315: x[idt] += v[0]*s[k];
316: x[1+idt] += v[1]*s[k];
317: x[2+idt] += v[2]*s[k];
318: x[3+idt] += v[3]*s[k];
319: x[4+idt] += v[4]*s[k];
320: x[5+idt] += v[5]*s[k];
321: x[6+idt] += v[6]*s[k];
322: x[7+idt] += v[7]*s[k];
323: x[8+idt] += v[8]*s[k];
324: x[9+idt] += v[9]*s[k];
325: x[10+idt] += v[10]*s[k];
326: x[11+idt] += v[11]*s[k];
327: x[12+idt] += v[12]*s[k];
328: x[13+idt] += v[13]*s[k];
329: v += 14;
330: }
331: }
332: VecRestoreArrayRead(bb,&b);
333: VecRestoreArray(xx,&x);
334: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
335: return(0);
336: }
338: /* Block operations are done by accessing one column at at time */
339: /* Default MatSolve for block size 13 */
341: PetscErrorCode MatSolve_SeqBAIJ_13_NaturalOrdering(Mat A,Vec bb,Vec xx)
342: {
343: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
344: PetscErrorCode ierr;
345: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
346: PetscInt i,k,nz,idx,idt,m;
347: const MatScalar *aa=a->a,*v;
348: PetscScalar s[13];
349: PetscScalar *x,xv;
350: const PetscScalar *b;
353: VecGetArrayRead(bb,&b);
354: VecGetArray(xx,&x);
356: /* forward solve the lower triangular */
357: for (i=0; i<n; i++) {
358: v = aa + bs2*ai[i];
359: vi = aj + ai[i];
360: nz = ai[i+1] - ai[i];
361: idt = bs*i;
362: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
363: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
364: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt];
365: for (m=0; m<nz; m++) {
366: idx = bs*vi[m];
367: for (k=0; k<bs; k++) {
368: xv = x[k + idx];
369: x[idt] -= v[0]*xv;
370: x[1+idt] -= v[1]*xv;
371: x[2+idt] -= v[2]*xv;
372: x[3+idt] -= v[3]*xv;
373: x[4+idt] -= v[4]*xv;
374: x[5+idt] -= v[5]*xv;
375: x[6+idt] -= v[6]*xv;
376: x[7+idt] -= v[7]*xv;
377: x[8+idt] -= v[8]*xv;
378: x[9+idt] -= v[9]*xv;
379: x[10+idt] -= v[10]*xv;
380: x[11+idt] -= v[11]*xv;
381: x[12+idt] -= v[12]*xv;
382: v += 13;
383: }
384: }
385: }
386: /* backward solve the upper triangular */
387: for (i=n-1; i>=0; i--) {
388: v = aa + bs2*(adiag[i+1]+1);
389: vi = aj + adiag[i+1]+1;
390: nz = adiag[i] - adiag[i+1] - 1;
391: idt = bs*i;
392: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
393: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
394: s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt];
396: for (m=0; m<nz; m++) {
397: idx = bs*vi[m];
398: for (k=0; k<bs; k++) {
399: xv = x[k + idx];
400: s[0] -= v[0]*xv;
401: s[1] -= v[1]*xv;
402: s[2] -= v[2]*xv;
403: s[3] -= v[3]*xv;
404: s[4] -= v[4]*xv;
405: s[5] -= v[5]*xv;
406: s[6] -= v[6]*xv;
407: s[7] -= v[7]*xv;
408: s[8] -= v[8]*xv;
409: s[9] -= v[9]*xv;
410: s[10] -= v[10]*xv;
411: s[11] -= v[11]*xv;
412: s[12] -= v[12]*xv;
413: v += 13;
414: }
415: }
416: PetscMemzero(x+idt,bs*sizeof(MatScalar));
417: for (k=0; k<bs; k++) {
418: x[idt] += v[0]*s[k];
419: x[1+idt] += v[1]*s[k];
420: x[2+idt] += v[2]*s[k];
421: x[3+idt] += v[3]*s[k];
422: x[4+idt] += v[4]*s[k];
423: x[5+idt] += v[5]*s[k];
424: x[6+idt] += v[6]*s[k];
425: x[7+idt] += v[7]*s[k];
426: x[8+idt] += v[8]*s[k];
427: x[9+idt] += v[9]*s[k];
428: x[10+idt] += v[10]*s[k];
429: x[11+idt] += v[11]*s[k];
430: x[12+idt] += v[12]*s[k];
431: v += 13;
432: }
433: }
434: VecRestoreArrayRead(bb,&b);
435: VecRestoreArray(xx,&x);
436: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
437: return(0);
438: }
440: /* Block operations are done by accessing one column at at time */
441: /* Default MatSolve for block size 12 */
443: PetscErrorCode MatSolve_SeqBAIJ_12_NaturalOrdering(Mat A,Vec bb,Vec xx)
444: {
445: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
446: PetscErrorCode ierr;
447: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
448: PetscInt i,k,nz,idx,idt,m;
449: const MatScalar *aa=a->a,*v;
450: PetscScalar s[12];
451: PetscScalar *x,xv;
452: const PetscScalar *b;
455: VecGetArrayRead(bb,&b);
456: VecGetArray(xx,&x);
458: /* forward solve the lower triangular */
459: for (i=0; i<n; i++) {
460: v = aa + bs2*ai[i];
461: vi = aj + ai[i];
462: nz = ai[i+1] - ai[i];
463: idt = bs*i;
464: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
465: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
466: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt];
467: for (m=0; m<nz; m++) {
468: idx = bs*vi[m];
469: for (k=0; k<bs; k++) {
470: xv = x[k + idx];
471: x[idt] -= v[0]*xv;
472: x[1+idt] -= v[1]*xv;
473: x[2+idt] -= v[2]*xv;
474: x[3+idt] -= v[3]*xv;
475: x[4+idt] -= v[4]*xv;
476: x[5+idt] -= v[5]*xv;
477: x[6+idt] -= v[6]*xv;
478: x[7+idt] -= v[7]*xv;
479: x[8+idt] -= v[8]*xv;
480: x[9+idt] -= v[9]*xv;
481: x[10+idt] -= v[10]*xv;
482: x[11+idt] -= v[11]*xv;
483: v += 12;
484: }
485: }
486: }
487: /* backward solve the upper triangular */
488: for (i=n-1; i>=0; i--) {
489: v = aa + bs2*(adiag[i+1]+1);
490: vi = aj + adiag[i+1]+1;
491: nz = adiag[i] - adiag[i+1] - 1;
492: idt = bs*i;
493: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
494: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
495: s[10] = x[10+idt]; s[11] = x[11+idt];
497: for (m=0; m<nz; m++) {
498: idx = bs*vi[m];
499: for (k=0; k<bs; k++) {
500: xv = x[k + idx];
501: s[0] -= v[0]*xv;
502: s[1] -= v[1]*xv;
503: s[2] -= v[2]*xv;
504: s[3] -= v[3]*xv;
505: s[4] -= v[4]*xv;
506: s[5] -= v[5]*xv;
507: s[6] -= v[6]*xv;
508: s[7] -= v[7]*xv;
509: s[8] -= v[8]*xv;
510: s[9] -= v[9]*xv;
511: s[10] -= v[10]*xv;
512: s[11] -= v[11]*xv;
513: v += 12;
514: }
515: }
516: PetscMemzero(x+idt,bs*sizeof(MatScalar));
517: for (k=0; k<bs; k++) {
518: x[idt] += v[0]*s[k];
519: x[1+idt] += v[1]*s[k];
520: x[2+idt] += v[2]*s[k];
521: x[3+idt] += v[3]*s[k];
522: x[4+idt] += v[4]*s[k];
523: x[5+idt] += v[5]*s[k];
524: x[6+idt] += v[6]*s[k];
525: x[7+idt] += v[7]*s[k];
526: x[8+idt] += v[8]*s[k];
527: x[9+idt] += v[9]*s[k];
528: x[10+idt] += v[10]*s[k];
529: x[11+idt] += v[11]*s[k];
530: v += 12;
531: }
532: }
533: VecRestoreArrayRead(bb,&b);
534: VecRestoreArray(xx,&x);
535: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
536: return(0);
537: }
539: /* Block operations are done by accessing one column at at time */
540: /* Default MatSolve for block size 11 */
542: PetscErrorCode MatSolve_SeqBAIJ_11_NaturalOrdering(Mat A,Vec bb,Vec xx)
543: {
544: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
545: PetscErrorCode ierr;
546: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
547: PetscInt i,k,nz,idx,idt,m;
548: const MatScalar *aa=a->a,*v;
549: PetscScalar s[11];
550: PetscScalar *x,xv;
551: const PetscScalar *b;
554: VecGetArrayRead(bb,&b);
555: VecGetArray(xx,&x);
557: /* forward solve the lower triangular */
558: for (i=0; i<n; i++) {
559: v = aa + bs2*ai[i];
560: vi = aj + ai[i];
561: nz = ai[i+1] - ai[i];
562: idt = bs*i;
563: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
564: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
565: x[10+idt] = b[10+idt];
566: for (m=0; m<nz; m++) {
567: idx = bs*vi[m];
568: for (k=0; k<11; k++) {
569: xv = x[k + idx];
570: x[idt] -= v[0]*xv;
571: x[1+idt] -= v[1]*xv;
572: x[2+idt] -= v[2]*xv;
573: x[3+idt] -= v[3]*xv;
574: x[4+idt] -= v[4]*xv;
575: x[5+idt] -= v[5]*xv;
576: x[6+idt] -= v[6]*xv;
577: x[7+idt] -= v[7]*xv;
578: x[8+idt] -= v[8]*xv;
579: x[9+idt] -= v[9]*xv;
580: x[10+idt] -= v[10]*xv;
581: v += 11;
582: }
583: }
584: }
585: /* backward solve the upper triangular */
586: for (i=n-1; i>=0; i--) {
587: v = aa + bs2*(adiag[i+1]+1);
588: vi = aj + adiag[i+1]+1;
589: nz = adiag[i] - adiag[i+1] - 1;
590: idt = bs*i;
591: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
592: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
593: s[10] = x[10+idt];
595: for (m=0; m<nz; m++) {
596: idx = bs*vi[m];
597: for (k=0; k<11; k++) {
598: xv = x[k + idx];
599: s[0] -= v[0]*xv;
600: s[1] -= v[1]*xv;
601: s[2] -= v[2]*xv;
602: s[3] -= v[3]*xv;
603: s[4] -= v[4]*xv;
604: s[5] -= v[5]*xv;
605: s[6] -= v[6]*xv;
606: s[7] -= v[7]*xv;
607: s[8] -= v[8]*xv;
608: s[9] -= v[9]*xv;
609: s[10] -= v[10]*xv;
610: v += 11;
611: }
612: }
613: PetscMemzero(x+idt,bs*sizeof(MatScalar));
614: for (k=0; k<11; k++) {
615: x[idt] += v[0]*s[k];
616: x[1+idt] += v[1]*s[k];
617: x[2+idt] += v[2]*s[k];
618: x[3+idt] += v[3]*s[k];
619: x[4+idt] += v[4]*s[k];
620: x[5+idt] += v[5]*s[k];
621: x[6+idt] += v[6]*s[k];
622: x[7+idt] += v[7]*s[k];
623: x[8+idt] += v[8]*s[k];
624: x[9+idt] += v[9]*s[k];
625: x[10+idt] += v[10]*s[k];
626: v += 11;
627: }
628: }
629: VecRestoreArrayRead(bb,&b);
630: VecRestoreArray(xx,&x);
631: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
632: return(0);
633: }
635: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
636: {
637: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
638: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
639: PetscErrorCode ierr;
640: PetscInt i,nz,idx,idt,jdx;
641: const MatScalar *aa=a->a,*v;
642: PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
643: const PetscScalar *b;
646: VecGetArrayRead(bb,&b);
647: VecGetArray(xx,&x);
648: /* forward solve the lower triangular */
649: idx = 0;
650: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
651: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
652: x[6] = b[6+idx];
653: for (i=1; i<n; i++) {
654: v = aa + 49*ai[i];
655: vi = aj + ai[i];
656: nz = diag[i] - ai[i];
657: idx = 7*i;
658: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
659: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
660: s7 = b[6+idx];
661: while (nz--) {
662: jdx = 7*(*vi++);
663: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
664: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
665: x7 = x[6+jdx];
666: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
667: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
668: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
669: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
670: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
671: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
672: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
673: v += 49;
674: }
675: x[idx] = s1;
676: x[1+idx] = s2;
677: x[2+idx] = s3;
678: x[3+idx] = s4;
679: x[4+idx] = s5;
680: x[5+idx] = s6;
681: x[6+idx] = s7;
682: }
683: /* backward solve the upper triangular */
684: for (i=n-1; i>=0; i--) {
685: v = aa + 49*diag[i] + 49;
686: vi = aj + diag[i] + 1;
687: nz = ai[i+1] - diag[i] - 1;
688: idt = 7*i;
689: s1 = x[idt]; s2 = x[1+idt];
690: s3 = x[2+idt]; s4 = x[3+idt];
691: s5 = x[4+idt]; s6 = x[5+idt];
692: s7 = x[6+idt];
693: while (nz--) {
694: idx = 7*(*vi++);
695: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
696: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
697: x7 = x[6+idx];
698: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
699: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
700: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
701: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
702: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
703: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
704: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
705: v += 49;
706: }
707: v = aa + 49*diag[i];
708: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
709: + v[28]*s5 + v[35]*s6 + v[42]*s7;
710: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
711: + v[29]*s5 + v[36]*s6 + v[43]*s7;
712: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
713: + v[30]*s5 + v[37]*s6 + v[44]*s7;
714: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
715: + v[31]*s5 + v[38]*s6 + v[45]*s7;
716: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
717: + v[32]*s5 + v[39]*s6 + v[46]*s7;
718: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
719: + v[33]*s5 + v[40]*s6 + v[47]*s7;
720: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
721: + v[34]*s5 + v[41]*s6 + v[48]*s7;
722: }
724: VecRestoreArrayRead(bb,&b);
725: VecRestoreArray(xx,&x);
726: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
727: return(0);
728: }
730: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
731: {
732: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
733: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
734: PetscErrorCode ierr;
735: PetscInt i,k,nz,idx,jdx,idt;
736: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
737: const MatScalar *aa=a->a,*v;
738: PetscScalar *x;
739: const PetscScalar *b;
740: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
743: VecGetArrayRead(bb,&b);
744: VecGetArray(xx,&x);
745: /* forward solve the lower triangular */
746: idx = 0;
747: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
748: x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
749: for (i=1; i<n; i++) {
750: v = aa + bs2*ai[i];
751: vi = aj + ai[i];
752: nz = ai[i+1] - ai[i];
753: idx = bs*i;
754: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
755: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
756: for (k=0; k<nz; k++) {
757: jdx = bs*vi[k];
758: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
759: x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
760: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
761: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
762: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
763: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
764: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
765: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
766: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
767: v += bs2;
768: }
770: x[idx] = s1;
771: x[1+idx] = s2;
772: x[2+idx] = s3;
773: x[3+idx] = s4;
774: x[4+idx] = s5;
775: x[5+idx] = s6;
776: x[6+idx] = s7;
777: }
779: /* backward solve the upper triangular */
780: for (i=n-1; i>=0; i--) {
781: v = aa + bs2*(adiag[i+1]+1);
782: vi = aj + adiag[i+1]+1;
783: nz = adiag[i] - adiag[i+1]-1;
784: idt = bs*i;
785: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
786: s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
787: for (k=0; k<nz; k++) {
788: idx = bs*vi[k];
789: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
790: x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
791: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
792: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
793: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
794: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
795: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
796: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
797: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
798: v += bs2;
799: }
800: /* x = inv_diagonal*x */
801: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7;
802: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7;
803: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7;
804: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7;
805: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7;
806: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7;
807: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7;
808: }
810: VecRestoreArrayRead(bb,&b);
811: VecRestoreArray(xx,&x);
812: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
813: return(0);
814: }
816: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
817: {
818: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
819: PetscInt i,nz,idx,idt,jdx;
820: PetscErrorCode ierr;
821: const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
822: const MatScalar *aa =a->a,*v;
823: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
824: const PetscScalar *b;
827: VecGetArrayRead(bb,&b);
828: VecGetArray(xx,&x);
829: /* forward solve the lower triangular */
830: idx = 0;
831: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
832: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
833: for (i=1; i<n; i++) {
834: v = aa + 36*ai[i];
835: vi = aj + ai[i];
836: nz = diag[i] - ai[i];
837: idx = 6*i;
838: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
839: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
840: while (nz--) {
841: jdx = 6*(*vi++);
842: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
843: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
844: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
845: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
846: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
847: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
848: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
849: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
850: v += 36;
851: }
852: x[idx] = s1;
853: x[1+idx] = s2;
854: x[2+idx] = s3;
855: x[3+idx] = s4;
856: x[4+idx] = s5;
857: x[5+idx] = s6;
858: }
859: /* backward solve the upper triangular */
860: for (i=n-1; i>=0; i--) {
861: v = aa + 36*diag[i] + 36;
862: vi = aj + diag[i] + 1;
863: nz = ai[i+1] - diag[i] - 1;
864: idt = 6*i;
865: s1 = x[idt]; s2 = x[1+idt];
866: s3 = x[2+idt]; s4 = x[3+idt];
867: s5 = x[4+idt]; s6 = x[5+idt];
868: while (nz--) {
869: idx = 6*(*vi++);
870: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
871: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
872: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
873: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
874: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
875: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
876: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
877: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
878: v += 36;
879: }
880: v = aa + 36*diag[i];
881: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
882: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
883: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
884: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
885: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
886: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
887: }
889: VecRestoreArrayRead(bb,&b);
890: VecRestoreArray(xx,&x);
891: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
892: return(0);
893: }
895: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
896: {
897: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
898: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
899: PetscErrorCode ierr;
900: PetscInt i,k,nz,idx,jdx,idt;
901: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
902: const MatScalar *aa=a->a,*v;
903: PetscScalar *x;
904: const PetscScalar *b;
905: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
908: VecGetArrayRead(bb,&b);
909: VecGetArray(xx,&x);
910: /* forward solve the lower triangular */
911: idx = 0;
912: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
913: x[4] = b[4+idx];x[5] = b[5+idx];
914: for (i=1; i<n; i++) {
915: v = aa + bs2*ai[i];
916: vi = aj + ai[i];
917: nz = ai[i+1] - ai[i];
918: idx = bs*i;
919: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
920: s5 = b[4+idx];s6 = b[5+idx];
921: for (k=0; k<nz; k++) {
922: jdx = bs*vi[k];
923: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
924: x5 = x[4+jdx]; x6 = x[5+jdx];
925: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
926: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;;
927: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
928: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
929: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
930: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
931: v += bs2;
932: }
934: x[idx] = s1;
935: x[1+idx] = s2;
936: x[2+idx] = s3;
937: x[3+idx] = s4;
938: x[4+idx] = s5;
939: x[5+idx] = s6;
940: }
942: /* backward solve the upper triangular */
943: for (i=n-1; i>=0; i--) {
944: v = aa + bs2*(adiag[i+1]+1);
945: vi = aj + adiag[i+1]+1;
946: nz = adiag[i] - adiag[i+1]-1;
947: idt = bs*i;
948: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
949: s5 = x[4+idt];s6 = x[5+idt];
950: for (k=0; k<nz; k++) {
951: idx = bs*vi[k];
952: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
953: x5 = x[4+idx];x6 = x[5+idx];
954: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
955: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;;
956: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
957: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
958: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
959: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
960: v += bs2;
961: }
962: /* x = inv_diagonal*x */
963: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
964: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
965: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
966: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
967: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
968: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
969: }
971: VecRestoreArrayRead(bb,&b);
972: VecRestoreArray(xx,&x);
973: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
974: return(0);
975: }
977: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
978: {
979: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
980: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
981: PetscInt i,nz,idx,idt,jdx;
982: PetscErrorCode ierr;
983: const MatScalar *aa=a->a,*v;
984: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
985: const PetscScalar *b;
988: VecGetArrayRead(bb,&b);
989: VecGetArray(xx,&x);
990: /* forward solve the lower triangular */
991: idx = 0;
992: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
993: for (i=1; i<n; i++) {
994: v = aa + 25*ai[i];
995: vi = aj + ai[i];
996: nz = diag[i] - ai[i];
997: idx = 5*i;
998: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
999: while (nz--) {
1000: jdx = 5*(*vi++);
1001: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1002: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1003: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1004: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1005: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1006: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1007: v += 25;
1008: }
1009: x[idx] = s1;
1010: x[1+idx] = s2;
1011: x[2+idx] = s3;
1012: x[3+idx] = s4;
1013: x[4+idx] = s5;
1014: }
1015: /* backward solve the upper triangular */
1016: for (i=n-1; i>=0; i--) {
1017: v = aa + 25*diag[i] + 25;
1018: vi = aj + diag[i] + 1;
1019: nz = ai[i+1] - diag[i] - 1;
1020: idt = 5*i;
1021: s1 = x[idt]; s2 = x[1+idt];
1022: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1023: while (nz--) {
1024: idx = 5*(*vi++);
1025: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1026: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1027: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1028: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1029: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1030: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1031: v += 25;
1032: }
1033: v = aa + 25*diag[i];
1034: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
1035: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
1036: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
1037: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
1038: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
1039: }
1041: VecRestoreArrayRead(bb,&b);
1042: VecRestoreArray(xx,&x);
1043: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
1044: return(0);
1045: }
1047: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1048: {
1049: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1050: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1051: PetscInt i,k,nz,idx,idt,jdx;
1052: PetscErrorCode ierr;
1053: const MatScalar *aa=a->a,*v;
1054: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1055: const PetscScalar *b;
1058: VecGetArrayRead(bb,&b);
1059: VecGetArray(xx,&x);
1060: /* forward solve the lower triangular */
1061: idx = 0;
1062: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1063: for (i=1; i<n; i++) {
1064: v = aa + 25*ai[i];
1065: vi = aj + ai[i];
1066: nz = ai[i+1] - ai[i];
1067: idx = 5*i;
1068: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1069: for (k=0; k<nz; k++) {
1070: jdx = 5*vi[k];
1071: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1072: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1073: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1074: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1075: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1076: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1077: v += 25;
1078: }
1079: x[idx] = s1;
1080: x[1+idx] = s2;
1081: x[2+idx] = s3;
1082: x[3+idx] = s4;
1083: x[4+idx] = s5;
1084: }
1086: /* backward solve the upper triangular */
1087: for (i=n-1; i>=0; i--) {
1088: v = aa + 25*(adiag[i+1]+1);
1089: vi = aj + adiag[i+1]+1;
1090: nz = adiag[i] - adiag[i+1]-1;
1091: idt = 5*i;
1092: s1 = x[idt]; s2 = x[1+idt];
1093: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1094: for (k=0; k<nz; k++) {
1095: idx = 5*vi[k];
1096: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1097: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1098: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1099: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1100: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1101: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1102: v += 25;
1103: }
1104: /* x = inv_diagonal*x */
1105: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
1106: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
1107: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
1108: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
1109: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
1110: }
1112: VecRestoreArrayRead(bb,&b);
1113: VecRestoreArray(xx,&x);
1114: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
1115: return(0);
1116: }
1118: /*
1119: Special case where the matrix was ILU(0) factored in the natural
1120: ordering. This eliminates the need for the column and row permutation.
1121: */
1122: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1123: {
1124: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1125: PetscInt n =a->mbs;
1126: const PetscInt *ai=a->i,*aj=a->j;
1127: PetscErrorCode ierr;
1128: const PetscInt *diag = a->diag;
1129: const MatScalar *aa =a->a;
1130: PetscScalar *x;
1131: const PetscScalar *b;
1134: VecGetArrayRead(bb,&b);
1135: VecGetArray(xx,&x);
1137: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
1138: {
1139: static PetscScalar w[2000]; /* very BAD need to fix */
1140: fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
1141: }
1142: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
1143: {
1144: static PetscScalar w[2000]; /* very BAD need to fix */
1145: fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
1146: }
1147: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
1148: fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
1149: #else
1150: {
1151: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
1152: const MatScalar *v;
1153: PetscInt jdx,idt,idx,nz,i,ai16;
1154: const PetscInt *vi;
1156: /* forward solve the lower triangular */
1157: idx = 0;
1158: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
1159: for (i=1; i<n; i++) {
1160: v = aa + 16*ai[i];
1161: vi = aj + ai[i];
1162: nz = diag[i] - ai[i];
1163: idx += 4;
1164: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1165: while (nz--) {
1166: jdx = 4*(*vi++);
1167: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
1168: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1169: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1170: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1171: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1172: v += 16;
1173: }
1174: x[idx] = s1;
1175: x[1+idx] = s2;
1176: x[2+idx] = s3;
1177: x[3+idx] = s4;
1178: }
1179: /* backward solve the upper triangular */
1180: idt = 4*(n-1);
1181: for (i=n-1; i>=0; i--) {
1182: ai16 = 16*diag[i];
1183: v = aa + ai16 + 16;
1184: vi = aj + diag[i] + 1;
1185: nz = ai[i+1] - diag[i] - 1;
1186: s1 = x[idt]; s2 = x[1+idt];
1187: s3 = x[2+idt];s4 = x[3+idt];
1188: while (nz--) {
1189: idx = 4*(*vi++);
1190: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx];
1191: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1192: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1193: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1194: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1195: v += 16;
1196: }
1197: v = aa + ai16;
1198: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
1199: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
1200: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1201: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
1202: idt -= 4;
1203: }
1204: }
1205: #endif
1207: VecRestoreArrayRead(bb,&b);
1208: VecRestoreArray(xx,&x);
1209: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1210: return(0);
1211: }
1213: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1214: {
1215: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1216: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1217: PetscInt i,k,nz,idx,jdx,idt;
1218: PetscErrorCode ierr;
1219: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
1220: const MatScalar *aa=a->a,*v;
1221: PetscScalar *x;
1222: const PetscScalar *b;
1223: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
1226: VecGetArrayRead(bb,&b);
1227: VecGetArray(xx,&x);
1228: /* forward solve the lower triangular */
1229: idx = 0;
1230: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1231: for (i=1; i<n; i++) {
1232: v = aa + bs2*ai[i];
1233: vi = aj + ai[i];
1234: nz = ai[i+1] - ai[i];
1235: idx = bs*i;
1236: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1237: for (k=0; k<nz; k++) {
1238: jdx = bs*vi[k];
1239: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1240: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1241: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1242: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1243: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1245: v += bs2;
1246: }
1248: x[idx] = s1;
1249: x[1+idx] = s2;
1250: x[2+idx] = s3;
1251: x[3+idx] = s4;
1252: }
1254: /* backward solve the upper triangular */
1255: for (i=n-1; i>=0; i--) {
1256: v = aa + bs2*(adiag[i+1]+1);
1257: vi = aj + adiag[i+1]+1;
1258: nz = adiag[i] - adiag[i+1]-1;
1259: idt = bs*i;
1260: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1262: for (k=0; k<nz; k++) {
1263: idx = bs*vi[k];
1264: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1265: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1266: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1267: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1268: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1270: v += bs2;
1271: }
1272: /* x = inv_diagonal*x */
1273: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
1274: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
1275: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1276: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
1278: }
1280: VecRestoreArrayRead(bb,&b);
1281: VecRestoreArray(xx,&x);
1282: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1283: return(0);
1284: }
1286: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
1287: {
1288: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1289: const PetscInt n =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
1290: PetscErrorCode ierr;
1291: const MatScalar *aa=a->a;
1292: const PetscScalar *b;
1293: PetscScalar *x;
1296: VecGetArrayRead(bb,&b);
1297: VecGetArray(xx,&x);
1299: {
1300: MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
1301: const MatScalar *v;
1302: MatScalar *t=(MatScalar*)x;
1303: PetscInt jdx,idt,idx,nz,i,ai16;
1304: const PetscInt *vi;
1306: /* forward solve the lower triangular */
1307: idx = 0;
1308: t[0] = (MatScalar)b[0];
1309: t[1] = (MatScalar)b[1];
1310: t[2] = (MatScalar)b[2];
1311: t[3] = (MatScalar)b[3];
1312: for (i=1; i<n; i++) {
1313: v = aa + 16*ai[i];
1314: vi = aj + ai[i];
1315: nz = diag[i] - ai[i];
1316: idx += 4;
1317: s1 = (MatScalar)b[idx];
1318: s2 = (MatScalar)b[1+idx];
1319: s3 = (MatScalar)b[2+idx];
1320: s4 = (MatScalar)b[3+idx];
1321: while (nz--) {
1322: jdx = 4*(*vi++);
1323: x1 = t[jdx];
1324: x2 = t[1+jdx];
1325: x3 = t[2+jdx];
1326: x4 = t[3+jdx];
1327: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1328: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1329: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1330: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1331: v += 16;
1332: }
1333: t[idx] = s1;
1334: t[1+idx] = s2;
1335: t[2+idx] = s3;
1336: t[3+idx] = s4;
1337: }
1338: /* backward solve the upper triangular */
1339: idt = 4*(n-1);
1340: for (i=n-1; i>=0; i--) {
1341: ai16 = 16*diag[i];
1342: v = aa + ai16 + 16;
1343: vi = aj + diag[i] + 1;
1344: nz = ai[i+1] - diag[i] - 1;
1345: s1 = t[idt];
1346: s2 = t[1+idt];
1347: s3 = t[2+idt];
1348: s4 = t[3+idt];
1349: while (nz--) {
1350: idx = 4*(*vi++);
1351: x1 = (MatScalar)x[idx];
1352: x2 = (MatScalar)x[1+idx];
1353: x3 = (MatScalar)x[2+idx];
1354: x4 = (MatScalar)x[3+idx];
1355: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1356: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1357: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1358: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1359: v += 16;
1360: }
1361: v = aa + ai16;
1362: x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4);
1363: x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4);
1364: x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
1365: x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
1366: idt -= 4;
1367: }
1368: }
1370: VecRestoreArrayRead(bb,&b);
1371: VecRestoreArray(xx,&x);
1372: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1373: return(0);
1374: }
1376: #if defined(PETSC_HAVE_SSE)
1378: #include PETSC_HAVE_SSE
1379: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
1380: {
1381: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1382: unsigned short *aj=(unsigned short*)a->j;
1384: int *ai=a->i,n=a->mbs,*diag = a->diag;
1385: MatScalar *aa=a->a;
1386: PetscScalar *x,*b;
1389: SSE_SCOPE_BEGIN;
1390: /*
1391: Note: This code currently uses demotion of double
1392: to float when performing the mixed-mode computation.
1393: This may not be numerically reasonable for all applications.
1394: */
1395: PREFETCH_NTA(aa+16*ai[1]);
1397: VecGetArray(bb,&b);
1398: VecGetArray(xx,&x);
1399: {
1400: /* x will first be computed in single precision then promoted inplace to double */
1401: MatScalar *v,*t=(MatScalar*)x;
1402: int nz,i,idt,ai16;
1403: unsigned int jdx,idx;
1404: unsigned short *vi;
1405: /* Forward solve the lower triangular factor. */
1407: /* First block is the identity. */
1408: idx = 0;
1409: CONVERT_DOUBLE4_FLOAT4(t,b);
1410: v = aa + 16*((unsigned int)ai[1]);
1412: for (i=1; i<n; ) {
1413: PREFETCH_NTA(&v[8]);
1414: vi = aj + ai[i];
1415: nz = diag[i] - ai[i];
1416: idx += 4;
1418: /* Demote RHS from double to float. */
1419: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
1420: LOAD_PS(&t[idx],XMM7);
1422: while (nz--) {
1423: PREFETCH_NTA(&v[16]);
1424: jdx = 4*((unsigned int)(*vi++));
1426: /* 4x4 Matrix-Vector product with negative accumulation: */
1427: SSE_INLINE_BEGIN_2(&t[jdx],v)
1428: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1430: /* First Column */
1431: SSE_COPY_PS(XMM0,XMM6)
1432: SSE_SHUFFLE(XMM0,XMM0,0x00)
1433: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1434: SSE_SUB_PS(XMM7,XMM0)
1436: /* Second Column */
1437: SSE_COPY_PS(XMM1,XMM6)
1438: SSE_SHUFFLE(XMM1,XMM1,0x55)
1439: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1440: SSE_SUB_PS(XMM7,XMM1)
1442: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1444: /* Third Column */
1445: SSE_COPY_PS(XMM2,XMM6)
1446: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1447: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1448: SSE_SUB_PS(XMM7,XMM2)
1450: /* Fourth Column */
1451: SSE_COPY_PS(XMM3,XMM6)
1452: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1453: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1454: SSE_SUB_PS(XMM7,XMM3)
1455: SSE_INLINE_END_2
1457: v += 16;
1458: }
1459: v = aa + 16*ai[++i];
1460: PREFETCH_NTA(v);
1461: STORE_PS(&t[idx],XMM7);
1462: }
1464: /* Backward solve the upper triangular factor.*/
1466: idt = 4*(n-1);
1467: ai16 = 16*diag[n-1];
1468: v = aa + ai16 + 16;
1469: for (i=n-1; i>=0; ) {
1470: PREFETCH_NTA(&v[8]);
1471: vi = aj + diag[i] + 1;
1472: nz = ai[i+1] - diag[i] - 1;
1474: LOAD_PS(&t[idt],XMM7);
1476: while (nz--) {
1477: PREFETCH_NTA(&v[16]);
1478: idx = 4*((unsigned int)(*vi++));
1480: /* 4x4 Matrix-Vector Product with negative accumulation: */
1481: SSE_INLINE_BEGIN_2(&t[idx],v)
1482: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1484: /* First Column */
1485: SSE_COPY_PS(XMM0,XMM6)
1486: SSE_SHUFFLE(XMM0,XMM0,0x00)
1487: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1488: SSE_SUB_PS(XMM7,XMM0)
1490: /* Second Column */
1491: SSE_COPY_PS(XMM1,XMM6)
1492: SSE_SHUFFLE(XMM1,XMM1,0x55)
1493: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1494: SSE_SUB_PS(XMM7,XMM1)
1496: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1498: /* Third Column */
1499: SSE_COPY_PS(XMM2,XMM6)
1500: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1501: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1502: SSE_SUB_PS(XMM7,XMM2)
1504: /* Fourth Column */
1505: SSE_COPY_PS(XMM3,XMM6)
1506: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1507: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1508: SSE_SUB_PS(XMM7,XMM3)
1509: SSE_INLINE_END_2
1510: v += 16;
1511: }
1512: v = aa + ai16;
1513: ai16 = 16*diag[--i];
1514: PREFETCH_NTA(aa+ai16+16);
1515: /*
1516: Scale the result by the diagonal 4x4 block,
1517: which was inverted as part of the factorization
1518: */
1519: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
1520: /* First Column */
1521: SSE_COPY_PS(XMM0,XMM7)
1522: SSE_SHUFFLE(XMM0,XMM0,0x00)
1523: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1525: /* Second Column */
1526: SSE_COPY_PS(XMM1,XMM7)
1527: SSE_SHUFFLE(XMM1,XMM1,0x55)
1528: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1529: SSE_ADD_PS(XMM0,XMM1)
1531: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1533: /* Third Column */
1534: SSE_COPY_PS(XMM2,XMM7)
1535: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1536: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1537: SSE_ADD_PS(XMM0,XMM2)
1539: /* Fourth Column */
1540: SSE_COPY_PS(XMM3,XMM7)
1541: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1542: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1543: SSE_ADD_PS(XMM0,XMM3)
1545: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1546: SSE_INLINE_END_3
1548: v = aa + ai16 + 16;
1549: idt -= 4;
1550: }
1552: /* Convert t from single precision back to double precision (inplace)*/
1553: idt = 4*(n-1);
1554: for (i=n-1; i>=0; i--) {
1555: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
1556: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
1557: PetscScalar *xtemp=&x[idt];
1558: MatScalar *ttemp=&t[idt];
1559: xtemp[3] = (PetscScalar)ttemp[3];
1560: xtemp[2] = (PetscScalar)ttemp[2];
1561: xtemp[1] = (PetscScalar)ttemp[1];
1562: xtemp[0] = (PetscScalar)ttemp[0];
1563: idt -= 4;
1564: }
1566: } /* End of artificial scope. */
1567: VecRestoreArray(bb,&b);
1568: VecRestoreArray(xx,&x);
1569: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1570: SSE_SCOPE_END;
1571: return(0);
1572: }
1574: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
1575: {
1576: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1577: int *aj=a->j;
1579: int *ai=a->i,n=a->mbs,*diag = a->diag;
1580: MatScalar *aa=a->a;
1581: PetscScalar *x,*b;
1584: SSE_SCOPE_BEGIN;
1585: /*
1586: Note: This code currently uses demotion of double
1587: to float when performing the mixed-mode computation.
1588: This may not be numerically reasonable for all applications.
1589: */
1590: PREFETCH_NTA(aa+16*ai[1]);
1592: VecGetArray(bb,&b);
1593: VecGetArray(xx,&x);
1594: {
1595: /* x will first be computed in single precision then promoted inplace to double */
1596: MatScalar *v,*t=(MatScalar*)x;
1597: int nz,i,idt,ai16;
1598: int jdx,idx;
1599: int *vi;
1600: /* Forward solve the lower triangular factor. */
1602: /* First block is the identity. */
1603: idx = 0;
1604: CONVERT_DOUBLE4_FLOAT4(t,b);
1605: v = aa + 16*ai[1];
1607: for (i=1; i<n; ) {
1608: PREFETCH_NTA(&v[8]);
1609: vi = aj + ai[i];
1610: nz = diag[i] - ai[i];
1611: idx += 4;
1613: /* Demote RHS from double to float. */
1614: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
1615: LOAD_PS(&t[idx],XMM7);
1617: while (nz--) {
1618: PREFETCH_NTA(&v[16]);
1619: jdx = 4*(*vi++);
1620: /* jdx = *vi++; */
1622: /* 4x4 Matrix-Vector product with negative accumulation: */
1623: SSE_INLINE_BEGIN_2(&t[jdx],v)
1624: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1626: /* First Column */
1627: SSE_COPY_PS(XMM0,XMM6)
1628: SSE_SHUFFLE(XMM0,XMM0,0x00)
1629: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1630: SSE_SUB_PS(XMM7,XMM0)
1632: /* Second Column */
1633: SSE_COPY_PS(XMM1,XMM6)
1634: SSE_SHUFFLE(XMM1,XMM1,0x55)
1635: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1636: SSE_SUB_PS(XMM7,XMM1)
1638: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1640: /* Third Column */
1641: SSE_COPY_PS(XMM2,XMM6)
1642: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1643: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1644: SSE_SUB_PS(XMM7,XMM2)
1646: /* Fourth Column */
1647: SSE_COPY_PS(XMM3,XMM6)
1648: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1649: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1650: SSE_SUB_PS(XMM7,XMM3)
1651: SSE_INLINE_END_2
1653: v += 16;
1654: }
1655: v = aa + 16*ai[++i];
1656: PREFETCH_NTA(v);
1657: STORE_PS(&t[idx],XMM7);
1658: }
1660: /* Backward solve the upper triangular factor.*/
1662: idt = 4*(n-1);
1663: ai16 = 16*diag[n-1];
1664: v = aa + ai16 + 16;
1665: for (i=n-1; i>=0; ) {
1666: PREFETCH_NTA(&v[8]);
1667: vi = aj + diag[i] + 1;
1668: nz = ai[i+1] - diag[i] - 1;
1670: LOAD_PS(&t[idt],XMM7);
1672: while (nz--) {
1673: PREFETCH_NTA(&v[16]);
1674: idx = 4*(*vi++);
1675: /* idx = *vi++; */
1677: /* 4x4 Matrix-Vector Product with negative accumulation: */
1678: SSE_INLINE_BEGIN_2(&t[idx],v)
1679: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1681: /* First Column */
1682: SSE_COPY_PS(XMM0,XMM6)
1683: SSE_SHUFFLE(XMM0,XMM0,0x00)
1684: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1685: SSE_SUB_PS(XMM7,XMM0)
1687: /* Second Column */
1688: SSE_COPY_PS(XMM1,XMM6)
1689: SSE_SHUFFLE(XMM1,XMM1,0x55)
1690: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1691: SSE_SUB_PS(XMM7,XMM1)
1693: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1695: /* Third Column */
1696: SSE_COPY_PS(XMM2,XMM6)
1697: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1698: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1699: SSE_SUB_PS(XMM7,XMM2)
1701: /* Fourth Column */
1702: SSE_COPY_PS(XMM3,XMM6)
1703: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1704: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1705: SSE_SUB_PS(XMM7,XMM3)
1706: SSE_INLINE_END_2
1707: v += 16;
1708: }
1709: v = aa + ai16;
1710: ai16 = 16*diag[--i];
1711: PREFETCH_NTA(aa+ai16+16);
1712: /*
1713: Scale the result by the diagonal 4x4 block,
1714: which was inverted as part of the factorization
1715: */
1716: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
1717: /* First Column */
1718: SSE_COPY_PS(XMM0,XMM7)
1719: SSE_SHUFFLE(XMM0,XMM0,0x00)
1720: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1722: /* Second Column */
1723: SSE_COPY_PS(XMM1,XMM7)
1724: SSE_SHUFFLE(XMM1,XMM1,0x55)
1725: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1726: SSE_ADD_PS(XMM0,XMM1)
1728: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1730: /* Third Column */
1731: SSE_COPY_PS(XMM2,XMM7)
1732: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1733: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1734: SSE_ADD_PS(XMM0,XMM2)
1736: /* Fourth Column */
1737: SSE_COPY_PS(XMM3,XMM7)
1738: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1739: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1740: SSE_ADD_PS(XMM0,XMM3)
1742: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1743: SSE_INLINE_END_3
1745: v = aa + ai16 + 16;
1746: idt -= 4;
1747: }
1749: /* Convert t from single precision back to double precision (inplace)*/
1750: idt = 4*(n-1);
1751: for (i=n-1; i>=0; i--) {
1752: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
1753: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
1754: PetscScalar *xtemp=&x[idt];
1755: MatScalar *ttemp=&t[idt];
1756: xtemp[3] = (PetscScalar)ttemp[3];
1757: xtemp[2] = (PetscScalar)ttemp[2];
1758: xtemp[1] = (PetscScalar)ttemp[1];
1759: xtemp[0] = (PetscScalar)ttemp[0];
1760: idt -= 4;
1761: }
1763: } /* End of artificial scope. */
1764: VecRestoreArray(bb,&b);
1765: VecRestoreArray(xx,&x);
1766: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1767: SSE_SCOPE_END;
1768: return(0);
1769: }
1771: #endif
1773: /*
1774: Special case where the matrix was ILU(0) factored in the natural
1775: ordering. This eliminates the need for the column and row permutation.
1776: */
1777: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1778: {
1779: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1780: const PetscInt n =a->mbs,*ai=a->i,*aj=a->j;
1781: PetscErrorCode ierr;
1782: const PetscInt *diag = a->diag,*vi;
1783: const MatScalar *aa =a->a,*v;
1784: PetscScalar *x,s1,s2,s3,x1,x2,x3;
1785: const PetscScalar *b;
1786: PetscInt jdx,idt,idx,nz,i;
1789: VecGetArrayRead(bb,&b);
1790: VecGetArray(xx,&x);
1792: /* forward solve the lower triangular */
1793: idx = 0;
1794: x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
1795: for (i=1; i<n; i++) {
1796: v = aa + 9*ai[i];
1797: vi = aj + ai[i];
1798: nz = diag[i] - ai[i];
1799: idx += 3;
1800: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1801: while (nz--) {
1802: jdx = 3*(*vi++);
1803: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
1804: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1805: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1806: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1807: v += 9;
1808: }
1809: x[idx] = s1;
1810: x[1+idx] = s2;
1811: x[2+idx] = s3;
1812: }
1813: /* backward solve the upper triangular */
1814: for (i=n-1; i>=0; i--) {
1815: v = aa + 9*diag[i] + 9;
1816: vi = aj + diag[i] + 1;
1817: nz = ai[i+1] - diag[i] - 1;
1818: idt = 3*i;
1819: s1 = x[idt]; s2 = x[1+idt];
1820: s3 = x[2+idt];
1821: while (nz--) {
1822: idx = 3*(*vi++);
1823: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx];
1824: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1825: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1826: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1827: v += 9;
1828: }
1829: v = aa + 9*diag[i];
1830: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1831: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1832: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1833: }
1835: VecRestoreArrayRead(bb,&b);
1836: VecRestoreArray(xx,&x);
1837: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1838: return(0);
1839: }
1841: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1842: {
1843: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1844: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1845: PetscErrorCode ierr;
1846: PetscInt i,k,nz,idx,jdx,idt;
1847: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
1848: const MatScalar *aa=a->a,*v;
1849: PetscScalar *x;
1850: const PetscScalar *b;
1851: PetscScalar s1,s2,s3,x1,x2,x3;
1854: VecGetArrayRead(bb,&b);
1855: VecGetArray(xx,&x);
1856: /* forward solve the lower triangular */
1857: idx = 0;
1858: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
1859: for (i=1; i<n; i++) {
1860: v = aa + bs2*ai[i];
1861: vi = aj + ai[i];
1862: nz = ai[i+1] - ai[i];
1863: idx = bs*i;
1864: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1865: for (k=0; k<nz; k++) {
1866: jdx = bs*vi[k];
1867: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
1868: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1869: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1870: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1872: v += bs2;
1873: }
1875: x[idx] = s1;
1876: x[1+idx] = s2;
1877: x[2+idx] = s3;
1878: }
1880: /* backward solve the upper triangular */
1881: for (i=n-1; i>=0; i--) {
1882: v = aa + bs2*(adiag[i+1]+1);
1883: vi = aj + adiag[i+1]+1;
1884: nz = adiag[i] - adiag[i+1]-1;
1885: idt = bs*i;
1886: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];
1888: for (k=0; k<nz; k++) {
1889: idx = bs*vi[k];
1890: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1891: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1892: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1893: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1895: v += bs2;
1896: }
1897: /* x = inv_diagonal*x */
1898: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1899: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1900: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1902: }
1904: VecRestoreArrayRead(bb,&b);
1905: VecRestoreArray(xx,&x);
1906: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1907: return(0);
1908: }
1910: PetscErrorCode MatForwardSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1911: {
1912: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1913: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j;
1914: PetscErrorCode ierr;
1915: PetscInt i,k,nz,idx,jdx;
1916: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
1917: const MatScalar *aa=a->a,*v;
1918: PetscScalar *x;
1919: const PetscScalar *b;
1920: PetscScalar s1,s2,s3,x1,x2,x3;
1923: VecGetArrayRead(bb,&b);
1924: VecGetArray(xx,&x);
1925: /* forward solve the lower triangular */
1926: idx = 0;
1927: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
1928: for (i=1; i<n; i++) {
1929: v = aa + bs2*ai[i];
1930: vi = aj + ai[i];
1931: nz = ai[i+1] - ai[i];
1932: idx = bs*i;
1933: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1934: for (k=0; k<nz; k++) {
1935: jdx = bs*vi[k];
1936: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
1937: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1938: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1939: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1941: v += bs2;
1942: }
1944: x[idx] = s1;
1945: x[1+idx] = s2;
1946: x[2+idx] = s3;
1947: }
1949: VecRestoreArrayRead(bb,&b);
1950: VecRestoreArray(xx,&x);
1951: PetscLogFlops(1.0*bs2*(a->nz) - bs*A->cmap->n);
1952: return(0);
1953: }
1956: PetscErrorCode MatBackwardSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1957: {
1958: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1959: const PetscInt n =a->mbs,*vi,*aj=a->j,*adiag=a->diag;
1960: PetscErrorCode ierr;
1961: PetscInt i,k,nz,idx,idt;
1962: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
1963: const MatScalar *aa=a->a,*v;
1964: PetscScalar *x;
1965: const PetscScalar *b;
1966: PetscScalar s1,s2,s3,x1,x2,x3;
1969: VecGetArrayRead(bb,&b);
1970: VecGetArray(xx,&x);
1972: /* backward solve the upper triangular */
1973: for (i=n-1; i>=0; i--) {
1974: v = aa + bs2*(adiag[i+1]+1);
1975: vi = aj + adiag[i+1]+1;
1976: nz = adiag[i] - adiag[i+1]-1;
1977: idt = bs*i;
1978: s1 = b[idt]; s2 = b[1+idt];s3 = b[2+idt];
1980: for (k=0; k<nz; k++) {
1981: idx = bs*vi[k];
1982: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1983: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1984: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1985: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1987: v += bs2;
1988: }
1989: /* x = inv_diagonal*x */
1990: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1991: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1992: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1994: }
1996: VecRestoreArrayRead(bb,&b);
1997: VecRestoreArray(xx,&x);
1998: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1999: return(0);
2000: }
2002: /*
2003: Special case where the matrix was ILU(0) factored in the natural
2004: ordering. This eliminates the need for the column and row permutation.
2005: */
2006: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2007: {
2008: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2009: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2010: PetscErrorCode ierr;
2011: const MatScalar *aa=a->a,*v;
2012: PetscScalar *x,s1,s2,x1,x2;
2013: const PetscScalar *b;
2014: PetscInt jdx,idt,idx,nz,i;
2017: VecGetArrayRead(bb,&b);
2018: VecGetArray(xx,&x);
2020: /* forward solve the lower triangular */
2021: idx = 0;
2022: x[0] = b[0]; x[1] = b[1];
2023: for (i=1; i<n; i++) {
2024: v = aa + 4*ai[i];
2025: vi = aj + ai[i];
2026: nz = diag[i] - ai[i];
2027: idx += 2;
2028: s1 = b[idx];s2 = b[1+idx];
2029: while (nz--) {
2030: jdx = 2*(*vi++);
2031: x1 = x[jdx];x2 = x[1+jdx];
2032: s1 -= v[0]*x1 + v[2]*x2;
2033: s2 -= v[1]*x1 + v[3]*x2;
2034: v += 4;
2035: }
2036: x[idx] = s1;
2037: x[1+idx] = s2;
2038: }
2039: /* backward solve the upper triangular */
2040: for (i=n-1; i>=0; i--) {
2041: v = aa + 4*diag[i] + 4;
2042: vi = aj + diag[i] + 1;
2043: nz = ai[i+1] - diag[i] - 1;
2044: idt = 2*i;
2045: s1 = x[idt]; s2 = x[1+idt];
2046: while (nz--) {
2047: idx = 2*(*vi++);
2048: x1 = x[idx]; x2 = x[1+idx];
2049: s1 -= v[0]*x1 + v[2]*x2;
2050: s2 -= v[1]*x1 + v[3]*x2;
2051: v += 4;
2052: }
2053: v = aa + 4*diag[i];
2054: x[idt] = v[0]*s1 + v[2]*s2;
2055: x[1+idt] = v[1]*s1 + v[3]*s2;
2056: }
2058: VecRestoreArrayRead(bb,&b);
2059: VecRestoreArray(xx,&x);
2060: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
2061: return(0);
2062: }
2064: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2065: {
2066: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2067: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2068: PetscInt i,k,nz,idx,idt,jdx;
2069: PetscErrorCode ierr;
2070: const MatScalar *aa=a->a,*v;
2071: PetscScalar *x,s1,s2,x1,x2;
2072: const PetscScalar *b;
2075: VecGetArrayRead(bb,&b);
2076: VecGetArray(xx,&x);
2077: /* forward solve the lower triangular */
2078: idx = 0;
2079: x[0] = b[idx]; x[1] = b[1+idx];
2080: for (i=1; i<n; i++) {
2081: v = aa + 4*ai[i];
2082: vi = aj + ai[i];
2083: nz = ai[i+1] - ai[i];
2084: idx = 2*i;
2085: s1 = b[idx];s2 = b[1+idx];
2086: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2087: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2088: for (k=0; k<nz; k++) {
2089: jdx = 2*vi[k];
2090: x1 = x[jdx];x2 = x[1+jdx];
2091: s1 -= v[0]*x1 + v[2]*x2;
2092: s2 -= v[1]*x1 + v[3]*x2;
2093: v += 4;
2094: }
2095: x[idx] = s1;
2096: x[1+idx] = s2;
2097: }
2099: /* backward solve the upper triangular */
2100: for (i=n-1; i>=0; i--) {
2101: v = aa + 4*(adiag[i+1]+1);
2102: vi = aj + adiag[i+1]+1;
2103: nz = adiag[i] - adiag[i+1]-1;
2104: idt = 2*i;
2105: s1 = x[idt]; s2 = x[1+idt];
2106: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2107: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2108: for (k=0; k<nz; k++) {
2109: idx = 2*vi[k];
2110: x1 = x[idx]; x2 = x[1+idx];
2111: s1 -= v[0]*x1 + v[2]*x2;
2112: s2 -= v[1]*x1 + v[3]*x2;
2113: v += 4;
2114: }
2115: /* x = inv_diagonal*x */
2116: x[idt] = v[0]*s1 + v[2]*s2;
2117: x[1+idt] = v[1]*s1 + v[3]*s2;
2118: }
2120: VecRestoreArrayRead(bb,&b);
2121: VecRestoreArray(xx,&x);
2122: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
2123: return(0);
2124: }
2126: PetscErrorCode MatForwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2127: {
2128: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2129: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j;
2130: PetscInt i,k,nz,idx,jdx;
2131: PetscErrorCode ierr;
2132: const MatScalar *aa=a->a,*v;
2133: PetscScalar *x,s1,s2,x1,x2;
2134: const PetscScalar *b;
2137: VecGetArrayRead(bb,&b);
2138: VecGetArray(xx,&x);
2139: /* forward solve the lower triangular */
2140: idx = 0;
2141: x[0] = b[idx]; x[1] = b[1+idx];
2142: for (i=1; i<n; i++) {
2143: v = aa + 4*ai[i];
2144: vi = aj + ai[i];
2145: nz = ai[i+1] - ai[i];
2146: idx = 2*i;
2147: s1 = b[idx];s2 = b[1+idx];
2148: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2149: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2150: for (k=0; k<nz; k++) {
2151: jdx = 2*vi[k];
2152: x1 = x[jdx];x2 = x[1+jdx];
2153: s1 -= v[0]*x1 + v[2]*x2;
2154: s2 -= v[1]*x1 + v[3]*x2;
2155: v += 4;
2156: }
2157: x[idx] = s1;
2158: x[1+idx] = s2;
2159: }
2162: VecRestoreArrayRead(bb,&b);
2163: VecRestoreArray(xx,&x);
2164: PetscLogFlops(4.0*(a->nz) - A->cmap->n);
2165: return(0);
2166: }
2168: PetscErrorCode MatBackwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2169: {
2170: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2171: const PetscInt n = a->mbs,*vi,*aj=a->j,*adiag=a->diag;
2172: PetscInt i,k,nz,idx,idt;
2173: PetscErrorCode ierr;
2174: const MatScalar *aa=a->a,*v;
2175: PetscScalar *x,s1,s2,x1,x2;
2176: const PetscScalar *b;
2179: VecGetArrayRead(bb,&b);
2180: VecGetArray(xx,&x);
2182: /* backward solve the upper triangular */
2183: for (i=n-1; i>=0; i--) {
2184: v = aa + 4*(adiag[i+1]+1);
2185: vi = aj + adiag[i+1]+1;
2186: nz = adiag[i] - adiag[i+1]-1;
2187: idt = 2*i;
2188: s1 = b[idt]; s2 = b[1+idt];
2189: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2190: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2191: for (k=0; k<nz; k++) {
2192: idx = 2*vi[k];
2193: x1 = x[idx]; x2 = x[1+idx];
2194: s1 -= v[0]*x1 + v[2]*x2;
2195: s2 -= v[1]*x1 + v[3]*x2;
2196: v += 4;
2197: }
2198: /* x = inv_diagonal*x */
2199: x[idt] = v[0]*s1 + v[2]*s2;
2200: x[1+idt] = v[1]*s1 + v[3]*s2;
2201: }
2203: VecRestoreArrayRead(bb,&b);
2204: VecRestoreArray(xx,&x);
2205: PetscLogFlops(4.0*a->nz - A->cmap->n);
2206: return(0);
2207: }
2209: /*
2210: Special case where the matrix was ILU(0) factored in the natural
2211: ordering. This eliminates the need for the column and row permutation.
2212: */
2213: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2214: {
2215: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2216: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2217: PetscErrorCode ierr;
2218: const MatScalar *aa=a->a,*v;
2219: PetscScalar *x;
2220: const PetscScalar *b;
2221: PetscScalar s1,x1;
2222: PetscInt jdx,idt,idx,nz,i;
2225: VecGetArrayRead(bb,&b);
2226: VecGetArray(xx,&x);
2228: /* forward solve the lower triangular */
2229: idx = 0;
2230: x[0] = b[0];
2231: for (i=1; i<n; i++) {
2232: v = aa + ai[i];
2233: vi = aj + ai[i];
2234: nz = diag[i] - ai[i];
2235: idx += 1;
2236: s1 = b[idx];
2237: while (nz--) {
2238: jdx = *vi++;
2239: x1 = x[jdx];
2240: s1 -= v[0]*x1;
2241: v += 1;
2242: }
2243: x[idx] = s1;
2244: }
2245: /* backward solve the upper triangular */
2246: for (i=n-1; i>=0; i--) {
2247: v = aa + diag[i] + 1;
2248: vi = aj + diag[i] + 1;
2249: nz = ai[i+1] - diag[i] - 1;
2250: idt = i;
2251: s1 = x[idt];
2252: while (nz--) {
2253: idx = *vi++;
2254: x1 = x[idx];
2255: s1 -= v[0]*x1;
2256: v += 1;
2257: }
2258: v = aa + diag[i];
2259: x[idt] = v[0]*s1;
2260: }
2261: VecRestoreArrayRead(bb,&b);
2262: VecRestoreArray(xx,&x);
2263: PetscLogFlops(2.0*(a->nz) - A->cmap->n);
2264: return(0);
2265: }
2268: PetscErrorCode MatForwardSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2269: {
2270: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2271: PetscErrorCode ierr;
2272: const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*vi;
2273: PetscScalar *x,sum;
2274: const PetscScalar *b;
2275: const MatScalar *aa = a->a,*v;
2276: PetscInt i,nz;
2279: if (!n) return(0);
2281: VecGetArrayRead(bb,&b);
2282: VecGetArray(xx,&x);
2284: /* forward solve the lower triangular */
2285: x[0] = b[0];
2286: v = aa;
2287: vi = aj;
2288: for (i=1; i<n; i++) {
2289: nz = ai[i+1] - ai[i];
2290: sum = b[i];
2291: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2292: v += nz;
2293: vi += nz;
2294: x[i] = sum;
2295: }
2296: PetscLogFlops(a->nz - A->cmap->n);
2297: VecRestoreArrayRead(bb,&b);
2298: VecRestoreArray(xx,&x);
2299: return(0);
2300: }
2302: PetscErrorCode MatBackwardSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2303: {
2304: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2305: PetscErrorCode ierr;
2306: const PetscInt n = a->mbs,*aj = a->j,*adiag = a->diag,*vi;
2307: PetscScalar *x,sum;
2308: const PetscScalar *b;
2309: const MatScalar *aa = a->a,*v;
2310: PetscInt i,nz;
2313: if (!n) return(0);
2315: VecGetArrayRead(bb,&b);
2316: VecGetArray(xx,&x);
2318: /* backward solve the upper triangular */
2319: for (i=n-1; i>=0; i--) {
2320: v = aa + adiag[i+1] + 1;
2321: vi = aj + adiag[i+1] + 1;
2322: nz = adiag[i] - adiag[i+1]-1;
2323: sum = b[i];
2324: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2325: x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
2326: }
2328: PetscLogFlops(2.0*a->nz - A->cmap->n);
2329: VecRestoreArrayRead(bb,&b);
2330: VecRestoreArray(xx,&x);
2331: return(0);
2332: }
2334: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2335: {
2336: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2337: PetscErrorCode ierr;
2338: const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
2339: PetscScalar *x,sum;
2340: const PetscScalar *b;
2341: const MatScalar *aa = a->a,*v;
2342: PetscInt i,nz;
2345: if (!n) return(0);
2347: VecGetArrayRead(bb,&b);
2348: VecGetArray(xx,&x);
2350: /* forward solve the lower triangular */
2351: x[0] = b[0];
2352: v = aa;
2353: vi = aj;
2354: for (i=1; i<n; i++) {
2355: nz = ai[i+1] - ai[i];
2356: sum = b[i];
2357: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2358: v += nz;
2359: vi += nz;
2360: x[i] = sum;
2361: }
2363: /* backward solve the upper triangular */
2364: for (i=n-1; i>=0; i--) {
2365: v = aa + adiag[i+1] + 1;
2366: vi = aj + adiag[i+1] + 1;
2367: nz = adiag[i] - adiag[i+1]-1;
2368: sum = x[i];
2369: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2370: x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
2371: }
2373: PetscLogFlops(2.0*a->nz - A->cmap->n);
2374: VecRestoreArrayRead(bb,&b);
2375: VecRestoreArray(xx,&x);
2376: return(0);
2377: }