Actual source code: baijsolvnat.c
petsc-3.7.3 2016-08-01
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */
8: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
9: {
10: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
11: PetscErrorCode ierr;
12: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
13: PetscInt i,nz,idx,idt,m;
14: const MatScalar *aa=a->a,*v;
15: PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
16: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
17: PetscScalar *x;
18: const PetscScalar *b;
21: VecGetArrayRead(bb,&b);
22: VecGetArray(xx,&x);
24: /* forward solve the lower triangular */
25: idx = 0;
26: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx];
27: x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] = b[8+idx]; x[9] = b[9+idx];
28: x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
30: for (i=1; i<n; i++) {
31: v = aa + bs2*ai[i];
32: vi = aj + ai[i];
33: nz = ai[i+1] - ai[i];
34: idt = bs*i;
35: s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt];
36: s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt];
37: s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
38: for (m=0; m<nz; m++) {
39: idx = bs*vi[m];
40: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
41: x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx];
42: x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
45: s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
46: s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
47: s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
48: s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
49: s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
50: s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
51: s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
52: s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
53: s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
54: s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
55: s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
56: s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
57: s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
58: s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
59: s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
61: v += bs2;
62: }
63: x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5;
64: x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10;
65: x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
67: }
68: /* backward solve the upper triangular */
69: for (i=n-1; i>=0; i--) {
70: v = aa + bs2*(adiag[i+1]+1);
71: vi = aj + adiag[i+1]+1;
72: nz = adiag[i] - adiag[i+1] - 1;
73: idt = bs*i;
74: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
75: s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt];
76: s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
78: for (m=0; m<nz; m++) {
79: idx = bs*vi[m];
80: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
81: x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx];
82: x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
84: s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
85: s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
86: s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
87: s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
88: s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
89: s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
90: s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
91: s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
92: s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
93: s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
94: s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
95: s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
96: s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
97: s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
98: s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
100: v += bs2;
101: }
103: x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
104: x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
105: x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
106: x[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
107: x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
108: x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
109: x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
110: x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
111: x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
112: x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
113: x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
114: x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
115: x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
116: x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
117: x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
119: }
121: VecRestoreArrayRead(bb,&b);
122: VecRestoreArray(xx,&x);
123: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
124: return(0);
125: }
127: /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at at time */
128: /* Default MatSolve for block size 15 */
132: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
133: {
134: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
135: PetscErrorCode ierr;
136: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
137: PetscInt i,k,nz,idx,idt,m;
138: const MatScalar *aa=a->a,*v;
139: PetscScalar s[15];
140: PetscScalar *x,xv;
141: const PetscScalar *b;
144: VecGetArrayRead(bb,&b);
145: VecGetArray(xx,&x);
147: /* forward solve the lower triangular */
148: for (i=0; i<n; i++) {
149: v = aa + bs2*ai[i];
150: vi = aj + ai[i];
151: nz = ai[i+1] - ai[i];
152: idt = bs*i;
153: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
154: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
155: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
156: for (m=0; m<nz; m++) {
157: idx = bs*vi[m];
158: for (k=0; k<15; k++) {
159: xv = x[k + idx];
160: x[idt] -= v[0]*xv;
161: x[1+idt] -= v[1]*xv;
162: x[2+idt] -= v[2]*xv;
163: x[3+idt] -= v[3]*xv;
164: x[4+idt] -= v[4]*xv;
165: x[5+idt] -= v[5]*xv;
166: x[6+idt] -= v[6]*xv;
167: x[7+idt] -= v[7]*xv;
168: x[8+idt] -= v[8]*xv;
169: x[9+idt] -= v[9]*xv;
170: x[10+idt] -= v[10]*xv;
171: x[11+idt] -= v[11]*xv;
172: x[12+idt] -= v[12]*xv;
173: x[13+idt] -= v[13]*xv;
174: x[14+idt] -= v[14]*xv;
175: v += 15;
176: }
177: }
178: }
179: /* backward solve the upper triangular */
180: for (i=n-1; i>=0; i--) {
181: v = aa + bs2*(adiag[i+1]+1);
182: vi = aj + adiag[i+1]+1;
183: nz = adiag[i] - adiag[i+1] - 1;
184: idt = bs*i;
185: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
186: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
187: s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
189: for (m=0; m<nz; m++) {
190: idx = bs*vi[m];
191: for (k=0; k<15; k++) {
192: xv = x[k + idx];
193: s[0] -= v[0]*xv;
194: s[1] -= v[1]*xv;
195: s[2] -= v[2]*xv;
196: s[3] -= v[3]*xv;
197: s[4] -= v[4]*xv;
198: s[5] -= v[5]*xv;
199: s[6] -= v[6]*xv;
200: s[7] -= v[7]*xv;
201: s[8] -= v[8]*xv;
202: s[9] -= v[9]*xv;
203: s[10] -= v[10]*xv;
204: s[11] -= v[11]*xv;
205: s[12] -= v[12]*xv;
206: s[13] -= v[13]*xv;
207: s[14] -= v[14]*xv;
208: v += 15;
209: }
210: }
211: PetscMemzero(x+idt,bs*sizeof(MatScalar));
212: for (k=0; k<15; k++) {
213: x[idt] += v[0]*s[k];
214: x[1+idt] += v[1]*s[k];
215: x[2+idt] += v[2]*s[k];
216: x[3+idt] += v[3]*s[k];
217: x[4+idt] += v[4]*s[k];
218: x[5+idt] += v[5]*s[k];
219: x[6+idt] += v[6]*s[k];
220: x[7+idt] += v[7]*s[k];
221: x[8+idt] += v[8]*s[k];
222: x[9+idt] += v[9]*s[k];
223: x[10+idt] += v[10]*s[k];
224: x[11+idt] += v[11]*s[k];
225: x[12+idt] += v[12]*s[k];
226: x[13+idt] += v[13]*s[k];
227: x[14+idt] += v[14]*s[k];
228: v += 15;
229: }
230: }
231: VecRestoreArrayRead(bb,&b);
232: VecRestoreArray(xx,&x);
233: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
234: return(0);
235: }
239: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
240: {
241: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
242: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
243: PetscErrorCode ierr;
244: PetscInt i,nz,idx,idt,jdx;
245: const MatScalar *aa=a->a,*v;
246: PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
247: const PetscScalar *b;
250: VecGetArrayRead(bb,&b);
251: VecGetArray(xx,&x);
252: /* forward solve the lower triangular */
253: idx = 0;
254: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
255: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
256: x[6] = b[6+idx];
257: for (i=1; i<n; i++) {
258: v = aa + 49*ai[i];
259: vi = aj + ai[i];
260: nz = diag[i] - ai[i];
261: idx = 7*i;
262: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
263: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
264: s7 = b[6+idx];
265: while (nz--) {
266: jdx = 7*(*vi++);
267: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
268: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
269: x7 = x[6+jdx];
270: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
271: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
272: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
273: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
274: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
275: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
276: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
277: v += 49;
278: }
279: x[idx] = s1;
280: x[1+idx] = s2;
281: x[2+idx] = s3;
282: x[3+idx] = s4;
283: x[4+idx] = s5;
284: x[5+idx] = s6;
285: x[6+idx] = s7;
286: }
287: /* backward solve the upper triangular */
288: for (i=n-1; i>=0; i--) {
289: v = aa + 49*diag[i] + 49;
290: vi = aj + diag[i] + 1;
291: nz = ai[i+1] - diag[i] - 1;
292: idt = 7*i;
293: s1 = x[idt]; s2 = x[1+idt];
294: s3 = x[2+idt]; s4 = x[3+idt];
295: s5 = x[4+idt]; s6 = x[5+idt];
296: s7 = x[6+idt];
297: while (nz--) {
298: idx = 7*(*vi++);
299: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
300: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
301: x7 = x[6+idx];
302: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
303: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
304: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
305: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
306: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
307: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
308: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
309: v += 49;
310: }
311: v = aa + 49*diag[i];
312: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
313: + v[28]*s5 + v[35]*s6 + v[42]*s7;
314: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
315: + v[29]*s5 + v[36]*s6 + v[43]*s7;
316: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
317: + v[30]*s5 + v[37]*s6 + v[44]*s7;
318: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
319: + v[31]*s5 + v[38]*s6 + v[45]*s7;
320: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
321: + v[32]*s5 + v[39]*s6 + v[46]*s7;
322: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
323: + v[33]*s5 + v[40]*s6 + v[47]*s7;
324: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
325: + v[34]*s5 + v[41]*s6 + v[48]*s7;
326: }
328: VecRestoreArrayRead(bb,&b);
329: VecRestoreArray(xx,&x);
330: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
331: return(0);
332: }
336: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
337: {
338: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
339: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
340: PetscErrorCode ierr;
341: PetscInt i,k,nz,idx,jdx,idt;
342: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
343: const MatScalar *aa=a->a,*v;
344: PetscScalar *x;
345: const PetscScalar *b;
346: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
349: VecGetArrayRead(bb,&b);
350: VecGetArray(xx,&x);
351: /* forward solve the lower triangular */
352: idx = 0;
353: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
354: x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
355: for (i=1; i<n; i++) {
356: v = aa + bs2*ai[i];
357: vi = aj + ai[i];
358: nz = ai[i+1] - ai[i];
359: idx = bs*i;
360: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
361: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
362: for (k=0; k<nz; k++) {
363: jdx = bs*vi[k];
364: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
365: x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
366: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
367: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
368: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
369: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
370: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
371: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
372: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
373: v += bs2;
374: }
376: x[idx] = s1;
377: x[1+idx] = s2;
378: x[2+idx] = s3;
379: x[3+idx] = s4;
380: x[4+idx] = s5;
381: x[5+idx] = s6;
382: x[6+idx] = s7;
383: }
385: /* backward solve the upper triangular */
386: for (i=n-1; i>=0; i--) {
387: v = aa + bs2*(adiag[i+1]+1);
388: vi = aj + adiag[i+1]+1;
389: nz = adiag[i] - adiag[i+1]-1;
390: idt = bs*i;
391: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
392: s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
393: for (k=0; k<nz; k++) {
394: idx = bs*vi[k];
395: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
396: x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
397: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
398: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
399: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
400: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
401: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
402: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
403: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
404: v += bs2;
405: }
406: /* x = inv_diagonal*x */
407: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7;
408: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7;
409: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7;
410: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7;
411: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7;
412: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7;
413: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7;
414: }
416: VecRestoreArrayRead(bb,&b);
417: VecRestoreArray(xx,&x);
418: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
419: return(0);
420: }
424: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
425: {
426: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
427: PetscInt i,nz,idx,idt,jdx;
428: PetscErrorCode ierr;
429: const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
430: const MatScalar *aa =a->a,*v;
431: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
432: const PetscScalar *b;
435: VecGetArrayRead(bb,&b);
436: VecGetArray(xx,&x);
437: /* forward solve the lower triangular */
438: idx = 0;
439: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
440: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
441: for (i=1; i<n; i++) {
442: v = aa + 36*ai[i];
443: vi = aj + ai[i];
444: nz = diag[i] - ai[i];
445: idx = 6*i;
446: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
447: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
448: while (nz--) {
449: jdx = 6*(*vi++);
450: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
451: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
452: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
453: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
454: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
455: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
456: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
457: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
458: v += 36;
459: }
460: x[idx] = s1;
461: x[1+idx] = s2;
462: x[2+idx] = s3;
463: x[3+idx] = s4;
464: x[4+idx] = s5;
465: x[5+idx] = s6;
466: }
467: /* backward solve the upper triangular */
468: for (i=n-1; i>=0; i--) {
469: v = aa + 36*diag[i] + 36;
470: vi = aj + diag[i] + 1;
471: nz = ai[i+1] - diag[i] - 1;
472: idt = 6*i;
473: s1 = x[idt]; s2 = x[1+idt];
474: s3 = x[2+idt]; s4 = x[3+idt];
475: s5 = x[4+idt]; s6 = x[5+idt];
476: while (nz--) {
477: idx = 6*(*vi++);
478: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
479: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
480: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
481: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
482: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
483: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
484: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
485: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
486: v += 36;
487: }
488: v = aa + 36*diag[i];
489: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
490: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
491: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
492: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
493: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
494: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
495: }
497: VecRestoreArrayRead(bb,&b);
498: VecRestoreArray(xx,&x);
499: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
500: return(0);
501: }
505: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
506: {
507: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
508: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
509: PetscErrorCode ierr;
510: PetscInt i,k,nz,idx,jdx,idt;
511: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
512: const MatScalar *aa=a->a,*v;
513: PetscScalar *x;
514: const PetscScalar *b;
515: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
518: VecGetArrayRead(bb,&b);
519: VecGetArray(xx,&x);
520: /* forward solve the lower triangular */
521: idx = 0;
522: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
523: x[4] = b[4+idx];x[5] = b[5+idx];
524: for (i=1; i<n; i++) {
525: v = aa + bs2*ai[i];
526: vi = aj + ai[i];
527: nz = ai[i+1] - ai[i];
528: idx = bs*i;
529: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
530: s5 = b[4+idx];s6 = b[5+idx];
531: for (k=0; k<nz; k++) {
532: jdx = bs*vi[k];
533: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
534: x5 = x[4+jdx]; x6 = x[5+jdx];
535: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
536: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;;
537: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
538: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
539: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
540: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
541: v += bs2;
542: }
544: x[idx] = s1;
545: x[1+idx] = s2;
546: x[2+idx] = s3;
547: x[3+idx] = s4;
548: x[4+idx] = s5;
549: x[5+idx] = s6;
550: }
552: /* backward solve the upper triangular */
553: for (i=n-1; i>=0; i--) {
554: v = aa + bs2*(adiag[i+1]+1);
555: vi = aj + adiag[i+1]+1;
556: nz = adiag[i] - adiag[i+1]-1;
557: idt = bs*i;
558: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
559: s5 = x[4+idt];s6 = x[5+idt];
560: for (k=0; k<nz; k++) {
561: idx = bs*vi[k];
562: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
563: x5 = x[4+idx];x6 = x[5+idx];
564: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
565: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;;
566: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
567: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
568: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
569: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
570: v += bs2;
571: }
572: /* x = inv_diagonal*x */
573: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
574: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
575: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
576: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
577: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
578: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
579: }
581: VecRestoreArrayRead(bb,&b);
582: VecRestoreArray(xx,&x);
583: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
584: return(0);
585: }
589: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
590: {
591: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
592: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
593: PetscInt i,nz,idx,idt,jdx;
594: PetscErrorCode ierr;
595: const MatScalar *aa=a->a,*v;
596: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
597: const PetscScalar *b;
600: VecGetArrayRead(bb,&b);
601: VecGetArray(xx,&x);
602: /* forward solve the lower triangular */
603: idx = 0;
604: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
605: for (i=1; i<n; i++) {
606: v = aa + 25*ai[i];
607: vi = aj + ai[i];
608: nz = diag[i] - ai[i];
609: idx = 5*i;
610: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
611: while (nz--) {
612: jdx = 5*(*vi++);
613: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
614: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
615: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
616: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
617: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
618: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
619: v += 25;
620: }
621: x[idx] = s1;
622: x[1+idx] = s2;
623: x[2+idx] = s3;
624: x[3+idx] = s4;
625: x[4+idx] = s5;
626: }
627: /* backward solve the upper triangular */
628: for (i=n-1; i>=0; i--) {
629: v = aa + 25*diag[i] + 25;
630: vi = aj + diag[i] + 1;
631: nz = ai[i+1] - diag[i] - 1;
632: idt = 5*i;
633: s1 = x[idt]; s2 = x[1+idt];
634: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
635: while (nz--) {
636: idx = 5*(*vi++);
637: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
638: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
639: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
640: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
641: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
642: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
643: v += 25;
644: }
645: v = aa + 25*diag[i];
646: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
647: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
648: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
649: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
650: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
651: }
653: VecRestoreArrayRead(bb,&b);
654: VecRestoreArray(xx,&x);
655: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
656: return(0);
657: }
661: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
662: {
663: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
664: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
665: PetscInt i,k,nz,idx,idt,jdx;
666: PetscErrorCode ierr;
667: const MatScalar *aa=a->a,*v;
668: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
669: const PetscScalar *b;
672: VecGetArrayRead(bb,&b);
673: VecGetArray(xx,&x);
674: /* forward solve the lower triangular */
675: idx = 0;
676: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
677: for (i=1; i<n; i++) {
678: v = aa + 25*ai[i];
679: vi = aj + ai[i];
680: nz = ai[i+1] - ai[i];
681: idx = 5*i;
682: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
683: for (k=0; k<nz; k++) {
684: jdx = 5*vi[k];
685: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
686: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
687: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
688: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
689: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
690: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
691: v += 25;
692: }
693: x[idx] = s1;
694: x[1+idx] = s2;
695: x[2+idx] = s3;
696: x[3+idx] = s4;
697: x[4+idx] = s5;
698: }
700: /* backward solve the upper triangular */
701: for (i=n-1; i>=0; i--) {
702: v = aa + 25*(adiag[i+1]+1);
703: vi = aj + adiag[i+1]+1;
704: nz = adiag[i] - adiag[i+1]-1;
705: idt = 5*i;
706: s1 = x[idt]; s2 = x[1+idt];
707: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
708: for (k=0; k<nz; k++) {
709: idx = 5*vi[k];
710: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
711: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
712: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
713: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
714: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
715: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
716: v += 25;
717: }
718: /* x = inv_diagonal*x */
719: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
720: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
721: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
722: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
723: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
724: }
726: VecRestoreArrayRead(bb,&b);
727: VecRestoreArray(xx,&x);
728: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
729: return(0);
730: }
732: /*
733: Special case where the matrix was ILU(0) factored in the natural
734: ordering. This eliminates the need for the column and row permutation.
735: */
738: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
739: {
740: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
741: PetscInt n =a->mbs;
742: const PetscInt *ai=a->i,*aj=a->j;
743: PetscErrorCode ierr;
744: const PetscInt *diag = a->diag;
745: const MatScalar *aa =a->a;
746: PetscScalar *x;
747: const PetscScalar *b;
750: VecGetArrayRead(bb,&b);
751: VecGetArray(xx,&x);
753: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
754: {
755: static PetscScalar w[2000]; /* very BAD need to fix */
756: fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
757: }
758: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
759: {
760: static PetscScalar w[2000]; /* very BAD need to fix */
761: fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
762: }
763: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
764: fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
765: #else
766: {
767: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
768: const MatScalar *v;
769: PetscInt jdx,idt,idx,nz,i,ai16;
770: const PetscInt *vi;
772: /* forward solve the lower triangular */
773: idx = 0;
774: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
775: for (i=1; i<n; i++) {
776: v = aa + 16*ai[i];
777: vi = aj + ai[i];
778: nz = diag[i] - ai[i];
779: idx += 4;
780: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
781: while (nz--) {
782: jdx = 4*(*vi++);
783: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
784: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
785: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
786: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
787: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
788: v += 16;
789: }
790: x[idx] = s1;
791: x[1+idx] = s2;
792: x[2+idx] = s3;
793: x[3+idx] = s4;
794: }
795: /* backward solve the upper triangular */
796: idt = 4*(n-1);
797: for (i=n-1; i>=0; i--) {
798: ai16 = 16*diag[i];
799: v = aa + ai16 + 16;
800: vi = aj + diag[i] + 1;
801: nz = ai[i+1] - diag[i] - 1;
802: s1 = x[idt]; s2 = x[1+idt];
803: s3 = x[2+idt];s4 = x[3+idt];
804: while (nz--) {
805: idx = 4*(*vi++);
806: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx];
807: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
808: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
809: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
810: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
811: v += 16;
812: }
813: v = aa + ai16;
814: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
815: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
816: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
817: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
818: idt -= 4;
819: }
820: }
821: #endif
823: VecRestoreArrayRead(bb,&b);
824: VecRestoreArray(xx,&x);
825: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
826: return(0);
827: }
831: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
832: {
833: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
834: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
835: PetscInt i,k,nz,idx,jdx,idt;
836: PetscErrorCode ierr;
837: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
838: const MatScalar *aa=a->a,*v;
839: PetscScalar *x;
840: const PetscScalar *b;
841: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
844: VecGetArrayRead(bb,&b);
845: VecGetArray(xx,&x);
846: /* forward solve the lower triangular */
847: idx = 0;
848: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
849: for (i=1; i<n; i++) {
850: v = aa + bs2*ai[i];
851: vi = aj + ai[i];
852: nz = ai[i+1] - ai[i];
853: idx = bs*i;
854: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
855: for (k=0; k<nz; k++) {
856: jdx = bs*vi[k];
857: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
858: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
859: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
860: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
861: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
863: v += bs2;
864: }
866: x[idx] = s1;
867: x[1+idx] = s2;
868: x[2+idx] = s3;
869: x[3+idx] = s4;
870: }
872: /* backward solve the upper triangular */
873: for (i=n-1; i>=0; i--) {
874: v = aa + bs2*(adiag[i+1]+1);
875: vi = aj + adiag[i+1]+1;
876: nz = adiag[i] - adiag[i+1]-1;
877: idt = bs*i;
878: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
880: for (k=0; k<nz; k++) {
881: idx = bs*vi[k];
882: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
883: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
884: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
885: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
886: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
888: v += bs2;
889: }
890: /* x = inv_diagonal*x */
891: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
892: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
893: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
894: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
896: }
898: VecRestoreArrayRead(bb,&b);
899: VecRestoreArray(xx,&x);
900: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
901: return(0);
902: }
906: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
907: {
908: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
909: const PetscInt n =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
910: PetscErrorCode ierr;
911: const MatScalar *aa=a->a;
912: const PetscScalar *b;
913: PetscScalar *x;
916: VecGetArrayRead(bb,&b);
917: VecGetArray(xx,&x);
919: {
920: MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
921: const MatScalar *v;
922: MatScalar *t=(MatScalar*)x;
923: PetscInt jdx,idt,idx,nz,i,ai16;
924: const PetscInt *vi;
926: /* forward solve the lower triangular */
927: idx = 0;
928: t[0] = (MatScalar)b[0];
929: t[1] = (MatScalar)b[1];
930: t[2] = (MatScalar)b[2];
931: t[3] = (MatScalar)b[3];
932: for (i=1; i<n; i++) {
933: v = aa + 16*ai[i];
934: vi = aj + ai[i];
935: nz = diag[i] - ai[i];
936: idx += 4;
937: s1 = (MatScalar)b[idx];
938: s2 = (MatScalar)b[1+idx];
939: s3 = (MatScalar)b[2+idx];
940: s4 = (MatScalar)b[3+idx];
941: while (nz--) {
942: jdx = 4*(*vi++);
943: x1 = t[jdx];
944: x2 = t[1+jdx];
945: x3 = t[2+jdx];
946: x4 = t[3+jdx];
947: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
948: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
949: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
950: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
951: v += 16;
952: }
953: t[idx] = s1;
954: t[1+idx] = s2;
955: t[2+idx] = s3;
956: t[3+idx] = s4;
957: }
958: /* backward solve the upper triangular */
959: idt = 4*(n-1);
960: for (i=n-1; i>=0; i--) {
961: ai16 = 16*diag[i];
962: v = aa + ai16 + 16;
963: vi = aj + diag[i] + 1;
964: nz = ai[i+1] - diag[i] - 1;
965: s1 = t[idt];
966: s2 = t[1+idt];
967: s3 = t[2+idt];
968: s4 = t[3+idt];
969: while (nz--) {
970: idx = 4*(*vi++);
971: x1 = (MatScalar)x[idx];
972: x2 = (MatScalar)x[1+idx];
973: x3 = (MatScalar)x[2+idx];
974: x4 = (MatScalar)x[3+idx];
975: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
976: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
977: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
978: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
979: v += 16;
980: }
981: v = aa + ai16;
982: x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4);
983: x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4);
984: x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
985: x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
986: idt -= 4;
987: }
988: }
990: VecRestoreArrayRead(bb,&b);
991: VecRestoreArray(xx,&x);
992: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
993: return(0);
994: }
996: #if defined(PETSC_HAVE_SSE)
998: #include PETSC_HAVE_SSE
1001: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
1002: {
1003: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1004: unsigned short *aj=(unsigned short*)a->j;
1006: int *ai=a->i,n=a->mbs,*diag = a->diag;
1007: MatScalar *aa=a->a;
1008: PetscScalar *x,*b;
1011: SSE_SCOPE_BEGIN;
1012: /*
1013: Note: This code currently uses demotion of double
1014: to float when performing the mixed-mode computation.
1015: This may not be numerically reasonable for all applications.
1016: */
1017: PREFETCH_NTA(aa+16*ai[1]);
1019: VecGetArray(bb,&b);
1020: VecGetArray(xx,&x);
1021: {
1022: /* x will first be computed in single precision then promoted inplace to double */
1023: MatScalar *v,*t=(MatScalar*)x;
1024: int nz,i,idt,ai16;
1025: unsigned int jdx,idx;
1026: unsigned short *vi;
1027: /* Forward solve the lower triangular factor. */
1029: /* First block is the identity. */
1030: idx = 0;
1031: CONVERT_DOUBLE4_FLOAT4(t,b);
1032: v = aa + 16*((unsigned int)ai[1]);
1034: for (i=1; i<n; ) {
1035: PREFETCH_NTA(&v[8]);
1036: vi = aj + ai[i];
1037: nz = diag[i] - ai[i];
1038: idx += 4;
1040: /* Demote RHS from double to float. */
1041: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
1042: LOAD_PS(&t[idx],XMM7);
1044: while (nz--) {
1045: PREFETCH_NTA(&v[16]);
1046: jdx = 4*((unsigned int)(*vi++));
1048: /* 4x4 Matrix-Vector product with negative accumulation: */
1049: SSE_INLINE_BEGIN_2(&t[jdx],v)
1050: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1052: /* First Column */
1053: SSE_COPY_PS(XMM0,XMM6)
1054: SSE_SHUFFLE(XMM0,XMM0,0x00)
1055: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1056: SSE_SUB_PS(XMM7,XMM0)
1058: /* Second Column */
1059: SSE_COPY_PS(XMM1,XMM6)
1060: SSE_SHUFFLE(XMM1,XMM1,0x55)
1061: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1062: SSE_SUB_PS(XMM7,XMM1)
1064: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1066: /* Third Column */
1067: SSE_COPY_PS(XMM2,XMM6)
1068: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1069: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1070: SSE_SUB_PS(XMM7,XMM2)
1072: /* Fourth Column */
1073: SSE_COPY_PS(XMM3,XMM6)
1074: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1075: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1076: SSE_SUB_PS(XMM7,XMM3)
1077: SSE_INLINE_END_2
1079: v += 16;
1080: }
1081: v = aa + 16*ai[++i];
1082: PREFETCH_NTA(v);
1083: STORE_PS(&t[idx],XMM7);
1084: }
1086: /* Backward solve the upper triangular factor.*/
1088: idt = 4*(n-1);
1089: ai16 = 16*diag[n-1];
1090: v = aa + ai16 + 16;
1091: for (i=n-1; i>=0; ) {
1092: PREFETCH_NTA(&v[8]);
1093: vi = aj + diag[i] + 1;
1094: nz = ai[i+1] - diag[i] - 1;
1096: LOAD_PS(&t[idt],XMM7);
1098: while (nz--) {
1099: PREFETCH_NTA(&v[16]);
1100: idx = 4*((unsigned int)(*vi++));
1102: /* 4x4 Matrix-Vector Product with negative accumulation: */
1103: SSE_INLINE_BEGIN_2(&t[idx],v)
1104: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1106: /* First Column */
1107: SSE_COPY_PS(XMM0,XMM6)
1108: SSE_SHUFFLE(XMM0,XMM0,0x00)
1109: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1110: SSE_SUB_PS(XMM7,XMM0)
1112: /* Second Column */
1113: SSE_COPY_PS(XMM1,XMM6)
1114: SSE_SHUFFLE(XMM1,XMM1,0x55)
1115: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1116: SSE_SUB_PS(XMM7,XMM1)
1118: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1120: /* Third Column */
1121: SSE_COPY_PS(XMM2,XMM6)
1122: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1123: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1124: SSE_SUB_PS(XMM7,XMM2)
1126: /* Fourth Column */
1127: SSE_COPY_PS(XMM3,XMM6)
1128: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1129: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1130: SSE_SUB_PS(XMM7,XMM3)
1131: SSE_INLINE_END_2
1132: v += 16;
1133: }
1134: v = aa + ai16;
1135: ai16 = 16*diag[--i];
1136: PREFETCH_NTA(aa+ai16+16);
1137: /*
1138: Scale the result by the diagonal 4x4 block,
1139: which was inverted as part of the factorization
1140: */
1141: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
1142: /* First Column */
1143: SSE_COPY_PS(XMM0,XMM7)
1144: SSE_SHUFFLE(XMM0,XMM0,0x00)
1145: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1147: /* Second Column */
1148: SSE_COPY_PS(XMM1,XMM7)
1149: SSE_SHUFFLE(XMM1,XMM1,0x55)
1150: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1151: SSE_ADD_PS(XMM0,XMM1)
1153: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1155: /* Third Column */
1156: SSE_COPY_PS(XMM2,XMM7)
1157: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1158: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1159: SSE_ADD_PS(XMM0,XMM2)
1161: /* Fourth Column */
1162: SSE_COPY_PS(XMM3,XMM7)
1163: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1164: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1165: SSE_ADD_PS(XMM0,XMM3)
1167: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1168: SSE_INLINE_END_3
1170: v = aa + ai16 + 16;
1171: idt -= 4;
1172: }
1174: /* Convert t from single precision back to double precision (inplace)*/
1175: idt = 4*(n-1);
1176: for (i=n-1; i>=0; i--) {
1177: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
1178: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
1179: PetscScalar *xtemp=&x[idt];
1180: MatScalar *ttemp=&t[idt];
1181: xtemp[3] = (PetscScalar)ttemp[3];
1182: xtemp[2] = (PetscScalar)ttemp[2];
1183: xtemp[1] = (PetscScalar)ttemp[1];
1184: xtemp[0] = (PetscScalar)ttemp[0];
1185: idt -= 4;
1186: }
1188: } /* End of artificial scope. */
1189: VecRestoreArray(bb,&b);
1190: VecRestoreArray(xx,&x);
1191: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1192: SSE_SCOPE_END;
1193: return(0);
1194: }
1198: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
1199: {
1200: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1201: int *aj=a->j;
1203: int *ai=a->i,n=a->mbs,*diag = a->diag;
1204: MatScalar *aa=a->a;
1205: PetscScalar *x,*b;
1208: SSE_SCOPE_BEGIN;
1209: /*
1210: Note: This code currently uses demotion of double
1211: to float when performing the mixed-mode computation.
1212: This may not be numerically reasonable for all applications.
1213: */
1214: PREFETCH_NTA(aa+16*ai[1]);
1216: VecGetArray(bb,&b);
1217: VecGetArray(xx,&x);
1218: {
1219: /* x will first be computed in single precision then promoted inplace to double */
1220: MatScalar *v,*t=(MatScalar*)x;
1221: int nz,i,idt,ai16;
1222: int jdx,idx;
1223: int *vi;
1224: /* Forward solve the lower triangular factor. */
1226: /* First block is the identity. */
1227: idx = 0;
1228: CONVERT_DOUBLE4_FLOAT4(t,b);
1229: v = aa + 16*ai[1];
1231: for (i=1; i<n; ) {
1232: PREFETCH_NTA(&v[8]);
1233: vi = aj + ai[i];
1234: nz = diag[i] - ai[i];
1235: idx += 4;
1237: /* Demote RHS from double to float. */
1238: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
1239: LOAD_PS(&t[idx],XMM7);
1241: while (nz--) {
1242: PREFETCH_NTA(&v[16]);
1243: jdx = 4*(*vi++);
1244: /* jdx = *vi++; */
1246: /* 4x4 Matrix-Vector product with negative accumulation: */
1247: SSE_INLINE_BEGIN_2(&t[jdx],v)
1248: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1250: /* First Column */
1251: SSE_COPY_PS(XMM0,XMM6)
1252: SSE_SHUFFLE(XMM0,XMM0,0x00)
1253: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1254: SSE_SUB_PS(XMM7,XMM0)
1256: /* Second Column */
1257: SSE_COPY_PS(XMM1,XMM6)
1258: SSE_SHUFFLE(XMM1,XMM1,0x55)
1259: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1260: SSE_SUB_PS(XMM7,XMM1)
1262: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1264: /* Third Column */
1265: SSE_COPY_PS(XMM2,XMM6)
1266: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1267: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1268: SSE_SUB_PS(XMM7,XMM2)
1270: /* Fourth Column */
1271: SSE_COPY_PS(XMM3,XMM6)
1272: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1273: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1274: SSE_SUB_PS(XMM7,XMM3)
1275: SSE_INLINE_END_2
1277: v += 16;
1278: }
1279: v = aa + 16*ai[++i];
1280: PREFETCH_NTA(v);
1281: STORE_PS(&t[idx],XMM7);
1282: }
1284: /* Backward solve the upper triangular factor.*/
1286: idt = 4*(n-1);
1287: ai16 = 16*diag[n-1];
1288: v = aa + ai16 + 16;
1289: for (i=n-1; i>=0; ) {
1290: PREFETCH_NTA(&v[8]);
1291: vi = aj + diag[i] + 1;
1292: nz = ai[i+1] - diag[i] - 1;
1294: LOAD_PS(&t[idt],XMM7);
1296: while (nz--) {
1297: PREFETCH_NTA(&v[16]);
1298: idx = 4*(*vi++);
1299: /* idx = *vi++; */
1301: /* 4x4 Matrix-Vector Product with negative accumulation: */
1302: SSE_INLINE_BEGIN_2(&t[idx],v)
1303: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1305: /* First Column */
1306: SSE_COPY_PS(XMM0,XMM6)
1307: SSE_SHUFFLE(XMM0,XMM0,0x00)
1308: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1309: SSE_SUB_PS(XMM7,XMM0)
1311: /* Second Column */
1312: SSE_COPY_PS(XMM1,XMM6)
1313: SSE_SHUFFLE(XMM1,XMM1,0x55)
1314: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1315: SSE_SUB_PS(XMM7,XMM1)
1317: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1319: /* Third Column */
1320: SSE_COPY_PS(XMM2,XMM6)
1321: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1322: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1323: SSE_SUB_PS(XMM7,XMM2)
1325: /* Fourth Column */
1326: SSE_COPY_PS(XMM3,XMM6)
1327: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1328: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1329: SSE_SUB_PS(XMM7,XMM3)
1330: SSE_INLINE_END_2
1331: v += 16;
1332: }
1333: v = aa + ai16;
1334: ai16 = 16*diag[--i];
1335: PREFETCH_NTA(aa+ai16+16);
1336: /*
1337: Scale the result by the diagonal 4x4 block,
1338: which was inverted as part of the factorization
1339: */
1340: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
1341: /* First Column */
1342: SSE_COPY_PS(XMM0,XMM7)
1343: SSE_SHUFFLE(XMM0,XMM0,0x00)
1344: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1346: /* Second Column */
1347: SSE_COPY_PS(XMM1,XMM7)
1348: SSE_SHUFFLE(XMM1,XMM1,0x55)
1349: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1350: SSE_ADD_PS(XMM0,XMM1)
1352: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1354: /* Third Column */
1355: SSE_COPY_PS(XMM2,XMM7)
1356: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1357: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1358: SSE_ADD_PS(XMM0,XMM2)
1360: /* Fourth Column */
1361: SSE_COPY_PS(XMM3,XMM7)
1362: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1363: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1364: SSE_ADD_PS(XMM0,XMM3)
1366: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1367: SSE_INLINE_END_3
1369: v = aa + ai16 + 16;
1370: idt -= 4;
1371: }
1373: /* Convert t from single precision back to double precision (inplace)*/
1374: idt = 4*(n-1);
1375: for (i=n-1; i>=0; i--) {
1376: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
1377: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
1378: PetscScalar *xtemp=&x[idt];
1379: MatScalar *ttemp=&t[idt];
1380: xtemp[3] = (PetscScalar)ttemp[3];
1381: xtemp[2] = (PetscScalar)ttemp[2];
1382: xtemp[1] = (PetscScalar)ttemp[1];
1383: xtemp[0] = (PetscScalar)ttemp[0];
1384: idt -= 4;
1385: }
1387: } /* End of artificial scope. */
1388: VecRestoreArray(bb,&b);
1389: VecRestoreArray(xx,&x);
1390: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1391: SSE_SCOPE_END;
1392: return(0);
1393: }
1395: #endif
1397: /*
1398: Special case where the matrix was ILU(0) factored in the natural
1399: ordering. This eliminates the need for the column and row permutation.
1400: */
1403: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1404: {
1405: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1406: const PetscInt n =a->mbs,*ai=a->i,*aj=a->j;
1407: PetscErrorCode ierr;
1408: const PetscInt *diag = a->diag,*vi;
1409: const MatScalar *aa =a->a,*v;
1410: PetscScalar *x,s1,s2,s3,x1,x2,x3;
1411: const PetscScalar *b;
1412: PetscInt jdx,idt,idx,nz,i;
1415: VecGetArrayRead(bb,&b);
1416: VecGetArray(xx,&x);
1418: /* forward solve the lower triangular */
1419: idx = 0;
1420: x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
1421: for (i=1; i<n; i++) {
1422: v = aa + 9*ai[i];
1423: vi = aj + ai[i];
1424: nz = diag[i] - ai[i];
1425: idx += 3;
1426: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1427: while (nz--) {
1428: jdx = 3*(*vi++);
1429: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
1430: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1431: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1432: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1433: v += 9;
1434: }
1435: x[idx] = s1;
1436: x[1+idx] = s2;
1437: x[2+idx] = s3;
1438: }
1439: /* backward solve the upper triangular */
1440: for (i=n-1; i>=0; i--) {
1441: v = aa + 9*diag[i] + 9;
1442: vi = aj + diag[i] + 1;
1443: nz = ai[i+1] - diag[i] - 1;
1444: idt = 3*i;
1445: s1 = x[idt]; s2 = x[1+idt];
1446: s3 = x[2+idt];
1447: while (nz--) {
1448: idx = 3*(*vi++);
1449: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx];
1450: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1451: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1452: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1453: v += 9;
1454: }
1455: v = aa + 9*diag[i];
1456: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1457: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1458: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1459: }
1461: VecRestoreArrayRead(bb,&b);
1462: VecRestoreArray(xx,&x);
1463: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1464: return(0);
1465: }
1469: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1470: {
1471: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1472: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1473: PetscErrorCode ierr;
1474: PetscInt i,k,nz,idx,jdx,idt;
1475: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
1476: const MatScalar *aa=a->a,*v;
1477: PetscScalar *x;
1478: const PetscScalar *b;
1479: PetscScalar s1,s2,s3,x1,x2,x3;
1482: VecGetArrayRead(bb,&b);
1483: VecGetArray(xx,&x);
1484: /* forward solve the lower triangular */
1485: idx = 0;
1486: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
1487: for (i=1; i<n; i++) {
1488: v = aa + bs2*ai[i];
1489: vi = aj + ai[i];
1490: nz = ai[i+1] - ai[i];
1491: idx = bs*i;
1492: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1493: for (k=0; k<nz; k++) {
1494: jdx = bs*vi[k];
1495: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
1496: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1497: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1498: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1500: v += bs2;
1501: }
1503: x[idx] = s1;
1504: x[1+idx] = s2;
1505: x[2+idx] = s3;
1506: }
1508: /* backward solve the upper triangular */
1509: for (i=n-1; i>=0; i--) {
1510: v = aa + bs2*(adiag[i+1]+1);
1511: vi = aj + adiag[i+1]+1;
1512: nz = adiag[i] - adiag[i+1]-1;
1513: idt = bs*i;
1514: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];
1516: for (k=0; k<nz; k++) {
1517: idx = bs*vi[k];
1518: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1519: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1520: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1521: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1523: v += bs2;
1524: }
1525: /* x = inv_diagonal*x */
1526: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1527: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1528: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1530: }
1532: VecRestoreArrayRead(bb,&b);
1533: VecRestoreArray(xx,&x);
1534: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1535: return(0);
1536: }
1540: PetscErrorCode MatForwardSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1541: {
1542: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1543: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j;
1544: PetscErrorCode ierr;
1545: PetscInt i,k,nz,idx,jdx;
1546: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
1547: const MatScalar *aa=a->a,*v;
1548: PetscScalar *x;
1549: const PetscScalar *b;
1550: PetscScalar s1,s2,s3,x1,x2,x3;
1553: VecGetArrayRead(bb,&b);
1554: VecGetArray(xx,&x);
1555: /* forward solve the lower triangular */
1556: idx = 0;
1557: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
1558: for (i=1; i<n; i++) {
1559: v = aa + bs2*ai[i];
1560: vi = aj + ai[i];
1561: nz = ai[i+1] - ai[i];
1562: idx = bs*i;
1563: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1564: for (k=0; k<nz; k++) {
1565: jdx = bs*vi[k];
1566: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
1567: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1568: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1569: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1571: v += bs2;
1572: }
1574: x[idx] = s1;
1575: x[1+idx] = s2;
1576: x[2+idx] = s3;
1577: }
1579: VecRestoreArrayRead(bb,&b);
1580: VecRestoreArray(xx,&x);
1581: PetscLogFlops(1.0*bs2*(a->nz) - bs*A->cmap->n);
1582: return(0);
1583: }
1588: PetscErrorCode MatBackwardSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1589: {
1590: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1591: const PetscInt n =a->mbs,*vi,*aj=a->j,*adiag=a->diag;
1592: PetscErrorCode ierr;
1593: PetscInt i,k,nz,idx,idt;
1594: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
1595: const MatScalar *aa=a->a,*v;
1596: PetscScalar *x;
1597: const PetscScalar *b;
1598: PetscScalar s1,s2,s3,x1,x2,x3;
1601: VecGetArrayRead(bb,&b);
1602: VecGetArray(xx,&x);
1604: /* backward solve the upper triangular */
1605: for (i=n-1; i>=0; i--) {
1606: v = aa + bs2*(adiag[i+1]+1);
1607: vi = aj + adiag[i+1]+1;
1608: nz = adiag[i] - adiag[i+1]-1;
1609: idt = bs*i;
1610: s1 = b[idt]; s2 = b[1+idt];s3 = b[2+idt];
1612: for (k=0; k<nz; k++) {
1613: idx = bs*vi[k];
1614: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1615: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1616: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1617: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1619: v += bs2;
1620: }
1621: /* x = inv_diagonal*x */
1622: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1623: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1624: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1626: }
1628: VecRestoreArrayRead(bb,&b);
1629: VecRestoreArray(xx,&x);
1630: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1631: return(0);
1632: }
1634: /*
1635: Special case where the matrix was ILU(0) factored in the natural
1636: ordering. This eliminates the need for the column and row permutation.
1637: */
1640: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1641: {
1642: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1643: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1644: PetscErrorCode ierr;
1645: const MatScalar *aa=a->a,*v;
1646: PetscScalar *x,s1,s2,x1,x2;
1647: const PetscScalar *b;
1648: PetscInt jdx,idt,idx,nz,i;
1651: VecGetArrayRead(bb,&b);
1652: VecGetArray(xx,&x);
1654: /* forward solve the lower triangular */
1655: idx = 0;
1656: x[0] = b[0]; x[1] = b[1];
1657: for (i=1; i<n; i++) {
1658: v = aa + 4*ai[i];
1659: vi = aj + ai[i];
1660: nz = diag[i] - ai[i];
1661: idx += 2;
1662: s1 = b[idx];s2 = b[1+idx];
1663: while (nz--) {
1664: jdx = 2*(*vi++);
1665: x1 = x[jdx];x2 = x[1+jdx];
1666: s1 -= v[0]*x1 + v[2]*x2;
1667: s2 -= v[1]*x1 + v[3]*x2;
1668: v += 4;
1669: }
1670: x[idx] = s1;
1671: x[1+idx] = s2;
1672: }
1673: /* backward solve the upper triangular */
1674: for (i=n-1; i>=0; i--) {
1675: v = aa + 4*diag[i] + 4;
1676: vi = aj + diag[i] + 1;
1677: nz = ai[i+1] - diag[i] - 1;
1678: idt = 2*i;
1679: s1 = x[idt]; s2 = x[1+idt];
1680: while (nz--) {
1681: idx = 2*(*vi++);
1682: x1 = x[idx]; x2 = x[1+idx];
1683: s1 -= v[0]*x1 + v[2]*x2;
1684: s2 -= v[1]*x1 + v[3]*x2;
1685: v += 4;
1686: }
1687: v = aa + 4*diag[i];
1688: x[idt] = v[0]*s1 + v[2]*s2;
1689: x[1+idt] = v[1]*s1 + v[3]*s2;
1690: }
1692: VecRestoreArrayRead(bb,&b);
1693: VecRestoreArray(xx,&x);
1694: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1695: return(0);
1696: }
1700: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1701: {
1702: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1703: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1704: PetscInt i,k,nz,idx,idt,jdx;
1705: PetscErrorCode ierr;
1706: const MatScalar *aa=a->a,*v;
1707: PetscScalar *x,s1,s2,x1,x2;
1708: const PetscScalar *b;
1711: VecGetArrayRead(bb,&b);
1712: VecGetArray(xx,&x);
1713: /* forward solve the lower triangular */
1714: idx = 0;
1715: x[0] = b[idx]; x[1] = b[1+idx];
1716: for (i=1; i<n; i++) {
1717: v = aa + 4*ai[i];
1718: vi = aj + ai[i];
1719: nz = ai[i+1] - ai[i];
1720: idx = 2*i;
1721: s1 = b[idx];s2 = b[1+idx];
1722: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
1723: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
1724: for (k=0; k<nz; k++) {
1725: jdx = 2*vi[k];
1726: x1 = x[jdx];x2 = x[1+jdx];
1727: s1 -= v[0]*x1 + v[2]*x2;
1728: s2 -= v[1]*x1 + v[3]*x2;
1729: v += 4;
1730: }
1731: x[idx] = s1;
1732: x[1+idx] = s2;
1733: }
1735: /* backward solve the upper triangular */
1736: for (i=n-1; i>=0; i--) {
1737: v = aa + 4*(adiag[i+1]+1);
1738: vi = aj + adiag[i+1]+1;
1739: nz = adiag[i] - adiag[i+1]-1;
1740: idt = 2*i;
1741: s1 = x[idt]; s2 = x[1+idt];
1742: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
1743: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
1744: for (k=0; k<nz; k++) {
1745: idx = 2*vi[k];
1746: x1 = x[idx]; x2 = x[1+idx];
1747: s1 -= v[0]*x1 + v[2]*x2;
1748: s2 -= v[1]*x1 + v[3]*x2;
1749: v += 4;
1750: }
1751: /* x = inv_diagonal*x */
1752: x[idt] = v[0]*s1 + v[2]*s2;
1753: x[1+idt] = v[1]*s1 + v[3]*s2;
1754: }
1756: VecRestoreArrayRead(bb,&b);
1757: VecRestoreArray(xx,&x);
1758: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1759: return(0);
1760: }
1764: PetscErrorCode MatForwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1765: {
1766: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1767: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j;
1768: PetscInt i,k,nz,idx,jdx;
1769: PetscErrorCode ierr;
1770: const MatScalar *aa=a->a,*v;
1771: PetscScalar *x,s1,s2,x1,x2;
1772: const PetscScalar *b;
1775: VecGetArrayRead(bb,&b);
1776: VecGetArray(xx,&x);
1777: /* forward solve the lower triangular */
1778: idx = 0;
1779: x[0] = b[idx]; x[1] = b[1+idx];
1780: for (i=1; i<n; i++) {
1781: v = aa + 4*ai[i];
1782: vi = aj + ai[i];
1783: nz = ai[i+1] - ai[i];
1784: idx = 2*i;
1785: s1 = b[idx];s2 = b[1+idx];
1786: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
1787: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
1788: for (k=0; k<nz; k++) {
1789: jdx = 2*vi[k];
1790: x1 = x[jdx];x2 = x[1+jdx];
1791: s1 -= v[0]*x1 + v[2]*x2;
1792: s2 -= v[1]*x1 + v[3]*x2;
1793: v += 4;
1794: }
1795: x[idx] = s1;
1796: x[1+idx] = s2;
1797: }
1800: VecRestoreArrayRead(bb,&b);
1801: VecRestoreArray(xx,&x);
1802: PetscLogFlops(4.0*(a->nz) - A->cmap->n);
1803: return(0);
1804: }
1808: PetscErrorCode MatBackwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1809: {
1810: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1811: const PetscInt n = a->mbs,*vi,*aj=a->j,*adiag=a->diag;
1812: PetscInt i,k,nz,idx,idt;
1813: PetscErrorCode ierr;
1814: const MatScalar *aa=a->a,*v;
1815: PetscScalar *x,s1,s2,x1,x2;
1816: const PetscScalar *b;
1819: VecGetArrayRead(bb,&b);
1820: VecGetArray(xx,&x);
1822: /* backward solve the upper triangular */
1823: for (i=n-1; i>=0; i--) {
1824: v = aa + 4*(adiag[i+1]+1);
1825: vi = aj + adiag[i+1]+1;
1826: nz = adiag[i] - adiag[i+1]-1;
1827: idt = 2*i;
1828: s1 = b[idt]; s2 = b[1+idt];
1829: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
1830: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
1831: for (k=0; k<nz; k++) {
1832: idx = 2*vi[k];
1833: x1 = x[idx]; x2 = x[1+idx];
1834: s1 -= v[0]*x1 + v[2]*x2;
1835: s2 -= v[1]*x1 + v[3]*x2;
1836: v += 4;
1837: }
1838: /* x = inv_diagonal*x */
1839: x[idt] = v[0]*s1 + v[2]*s2;
1840: x[1+idt] = v[1]*s1 + v[3]*s2;
1841: }
1843: VecRestoreArrayRead(bb,&b);
1844: VecRestoreArray(xx,&x);
1845: PetscLogFlops(4.0*a->nz - A->cmap->n);
1846: return(0);
1847: }
1849: /*
1850: Special case where the matrix was ILU(0) factored in the natural
1851: ordering. This eliminates the need for the column and row permutation.
1852: */
1855: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1856: {
1857: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1858: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1859: PetscErrorCode ierr;
1860: const MatScalar *aa=a->a,*v;
1861: PetscScalar *x;
1862: const PetscScalar *b;
1863: PetscScalar s1,x1;
1864: PetscInt jdx,idt,idx,nz,i;
1867: VecGetArrayRead(bb,&b);
1868: VecGetArray(xx,&x);
1870: /* forward solve the lower triangular */
1871: idx = 0;
1872: x[0] = b[0];
1873: for (i=1; i<n; i++) {
1874: v = aa + ai[i];
1875: vi = aj + ai[i];
1876: nz = diag[i] - ai[i];
1877: idx += 1;
1878: s1 = b[idx];
1879: while (nz--) {
1880: jdx = *vi++;
1881: x1 = x[jdx];
1882: s1 -= v[0]*x1;
1883: v += 1;
1884: }
1885: x[idx] = s1;
1886: }
1887: /* backward solve the upper triangular */
1888: for (i=n-1; i>=0; i--) {
1889: v = aa + diag[i] + 1;
1890: vi = aj + diag[i] + 1;
1891: nz = ai[i+1] - diag[i] - 1;
1892: idt = i;
1893: s1 = x[idt];
1894: while (nz--) {
1895: idx = *vi++;
1896: x1 = x[idx];
1897: s1 -= v[0]*x1;
1898: v += 1;
1899: }
1900: v = aa + diag[i];
1901: x[idt] = v[0]*s1;
1902: }
1903: VecRestoreArrayRead(bb,&b);
1904: VecRestoreArray(xx,&x);
1905: PetscLogFlops(2.0*(a->nz) - A->cmap->n);
1906: return(0);
1907: }
1912: PetscErrorCode MatForwardSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1913: {
1914: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1915: PetscErrorCode ierr;
1916: const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*vi;
1917: PetscScalar *x,sum;
1918: const PetscScalar *b;
1919: const MatScalar *aa = a->a,*v;
1920: PetscInt i,nz;
1923: if (!n) return(0);
1925: VecGetArrayRead(bb,&b);
1926: VecGetArray(xx,&x);
1928: /* forward solve the lower triangular */
1929: x[0] = b[0];
1930: v = aa;
1931: vi = aj;
1932: for (i=1; i<n; i++) {
1933: nz = ai[i+1] - ai[i];
1934: sum = b[i];
1935: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
1936: v += nz;
1937: vi += nz;
1938: x[i] = sum;
1939: }
1940: PetscLogFlops(a->nz - A->cmap->n);
1941: VecRestoreArrayRead(bb,&b);
1942: VecRestoreArray(xx,&x);
1943: return(0);
1944: }
1948: PetscErrorCode MatBackwardSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1949: {
1950: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1951: PetscErrorCode ierr;
1952: const PetscInt n = a->mbs,*aj = a->j,*adiag = a->diag,*vi;
1953: PetscScalar *x,sum;
1954: const PetscScalar *b;
1955: const MatScalar *aa = a->a,*v;
1956: PetscInt i,nz;
1959: if (!n) return(0);
1961: VecGetArrayRead(bb,&b);
1962: VecGetArray(xx,&x);
1964: /* backward solve the upper triangular */
1965: for (i=n-1; i>=0; i--) {
1966: v = aa + adiag[i+1] + 1;
1967: vi = aj + adiag[i+1] + 1;
1968: nz = adiag[i] - adiag[i+1]-1;
1969: sum = b[i];
1970: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
1971: x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
1972: }
1974: PetscLogFlops(2.0*a->nz - A->cmap->n);
1975: VecRestoreArrayRead(bb,&b);
1976: VecRestoreArray(xx,&x);
1977: return(0);
1978: }
1982: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1983: {
1984: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1985: PetscErrorCode ierr;
1986: const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
1987: PetscScalar *x,sum;
1988: const PetscScalar *b;
1989: const MatScalar *aa = a->a,*v;
1990: PetscInt i,nz;
1993: if (!n) return(0);
1995: VecGetArrayRead(bb,&b);
1996: VecGetArray(xx,&x);
1998: /* forward solve the lower triangular */
1999: x[0] = b[0];
2000: v = aa;
2001: vi = aj;
2002: for (i=1; i<n; i++) {
2003: nz = ai[i+1] - ai[i];
2004: sum = b[i];
2005: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2006: v += nz;
2007: vi += nz;
2008: x[i] = sum;
2009: }
2011: /* backward solve the upper triangular */
2012: for (i=n-1; i>=0; i--) {
2013: v = aa + adiag[i+1] + 1;
2014: vi = aj + adiag[i+1] + 1;
2015: nz = adiag[i] - adiag[i+1]-1;
2016: sum = x[i];
2017: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2018: x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
2019: }
2021: PetscLogFlops(2.0*a->nz - A->cmap->n);
2022: VecRestoreArrayRead(bb,&b);
2023: VecRestoreArray(xx,&x);
2024: return(0);
2025: }