Actual source code: baijsolvnat.c

petsc-3.6.1 2015-08-06
Report Typos and Errors
  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <petsc/private/kernels/blockinvert.h>

  4: /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns   of the block at once */

  8: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
  9: {
 10:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
 11:   PetscErrorCode    ierr;
 12:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
 13:   PetscInt          i,nz,idx,idt,m;
 14:   const MatScalar   *aa=a->a,*v;
 15:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
 16:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
 17:   PetscScalar       *x;
 18:   const PetscScalar *b;

 21:   VecGetArrayRead(bb,&b);
 22:   VecGetArray(xx,&x);

 24:   /* forward solve the lower triangular */
 25:   idx   = 0;
 26:   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
 27:   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
 28:   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];

 30:   for (i=1; i<n; i++) {
 31:     v   = aa + bs2*ai[i];
 32:     vi  = aj + ai[i];
 33:     nz  = ai[i+1] - ai[i];
 34:     idt = bs*i;
 35:     s1  = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
 36:     s6  = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
 37:     s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
 38:     for (m=0; m<nz; m++) {
 39:       idx = bs*vi[m];
 40:       x1  = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
 41:       x6  = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
 42:       x11 = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];


 45:       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
 46:       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
 47:       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
 48:       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
 49:       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
 50:       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
 51:       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
 52:       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
 53:       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
 54:       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
 55:       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
 56:       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
 57:       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
 58:       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
 59:       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;

 61:       v += bs2;
 62:     }
 63:     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
 64:     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
 65:     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;

 67:   }
 68:   /* backward solve the upper triangular */
 69:   for (i=n-1; i>=0; i--) {
 70:     v   = aa + bs2*(adiag[i+1]+1);
 71:     vi  = aj + adiag[i+1]+1;
 72:     nz  = adiag[i] - adiag[i+1] - 1;
 73:     idt = bs*i;
 74:     s1  = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
 75:     s6  = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
 76:     s11 = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];

 78:     for (m=0; m<nz; m++) {
 79:       idx = bs*vi[m];
 80:       x1  = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
 81:       x6  = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
 82:       x11 = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];

 84:       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
 85:       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
 86:       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
 87:       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
 88:       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
 89:       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
 90:       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
 91:       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
 92:       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
 93:       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
 94:       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
 95:       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
 96:       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
 97:       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
 98:       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;

100:       v += bs2;
101:     }

103:     x[idt]    = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
104:     x[1+idt]  = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
105:     x[2+idt]  = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
106:     x[3+idt]  = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
107:     x[4+idt]  = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
108:     x[5+idt]  = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
109:     x[6+idt]  = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
110:     x[7+idt]  = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
111:     x[8+idt]  = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
112:     x[9+idt]  = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
113:     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
114:     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
115:     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
116:     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
117:     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;

119:   }

121:   VecRestoreArrayRead(bb,&b);
122:   VecRestoreArray(xx,&x);
123:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
124:   return(0);
125: }

127: /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at at time */
128: /* Default MatSolve for block size 15 */

132: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
133: {
134:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
135:   PetscErrorCode    ierr;
136:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
137:   PetscInt          i,k,nz,idx,idt,m;
138:   const MatScalar   *aa=a->a,*v;
139:   PetscScalar       s[15];
140:   PetscScalar       *x,xv;
141:   const PetscScalar *b;

144:   VecGetArrayRead(bb,&b);
145:   VecGetArray(xx,&x);

147:   /* forward solve the lower triangular */
148:   for (i=0; i<n; i++) {
149:     v         = aa + bs2*ai[i];
150:     vi        = aj + ai[i];
151:     nz        = ai[i+1] - ai[i];
152:     idt       = bs*i;
153:     x[idt]    = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
154:     x[5+idt]  = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
155:     x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
156:     for (m=0; m<nz; m++) {
157:       idx = bs*vi[m];
158:       for (k=0; k<15; k++) {
159:         xv         = x[k + idx];
160:         x[idt]    -= v[0]*xv;
161:         x[1+idt]  -= v[1]*xv;
162:         x[2+idt]  -= v[2]*xv;
163:         x[3+idt]  -= v[3]*xv;
164:         x[4+idt]  -= v[4]*xv;
165:         x[5+idt]  -= v[5]*xv;
166:         x[6+idt]  -= v[6]*xv;
167:         x[7+idt]  -= v[7]*xv;
168:         x[8+idt]  -= v[8]*xv;
169:         x[9+idt]  -= v[9]*xv;
170:         x[10+idt] -= v[10]*xv;
171:         x[11+idt] -= v[11]*xv;
172:         x[12+idt] -= v[12]*xv;
173:         x[13+idt] -= v[13]*xv;
174:         x[14+idt] -= v[14]*xv;
175:         v         += 15;
176:       }
177:     }
178:   }
179:   /* backward solve the upper triangular */
180:   for (i=n-1; i>=0; i--) {
181:     v     = aa + bs2*(adiag[i+1]+1);
182:     vi    = aj + adiag[i+1]+1;
183:     nz    = adiag[i] - adiag[i+1] - 1;
184:     idt   = bs*i;
185:     s[0]  = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
186:     s[5]  = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
187:     s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];

189:     for (m=0; m<nz; m++) {
190:       idx = bs*vi[m];
191:       for (k=0; k<15; k++) {
192:         xv     = x[k + idx];
193:         s[0]  -= v[0]*xv;
194:         s[1]  -= v[1]*xv;
195:         s[2]  -= v[2]*xv;
196:         s[3]  -= v[3]*xv;
197:         s[4]  -= v[4]*xv;
198:         s[5]  -= v[5]*xv;
199:         s[6]  -= v[6]*xv;
200:         s[7]  -= v[7]*xv;
201:         s[8]  -= v[8]*xv;
202:         s[9]  -= v[9]*xv;
203:         s[10] -= v[10]*xv;
204:         s[11] -= v[11]*xv;
205:         s[12] -= v[12]*xv;
206:         s[13] -= v[13]*xv;
207:         s[14] -= v[14]*xv;
208:         v     += 15;
209:       }
210:     }
211:     PetscMemzero(x+idt,bs*sizeof(MatScalar));
212:     for (k=0; k<15; k++) {
213:       x[idt]    += v[0]*s[k];
214:       x[1+idt]  += v[1]*s[k];
215:       x[2+idt]  += v[2]*s[k];
216:       x[3+idt]  += v[3]*s[k];
217:       x[4+idt]  += v[4]*s[k];
218:       x[5+idt]  += v[5]*s[k];
219:       x[6+idt]  += v[6]*s[k];
220:       x[7+idt]  += v[7]*s[k];
221:       x[8+idt]  += v[8]*s[k];
222:       x[9+idt]  += v[9]*s[k];
223:       x[10+idt] += v[10]*s[k];
224:       x[11+idt] += v[11]*s[k];
225:       x[12+idt] += v[12]*s[k];
226:       x[13+idt] += v[13]*s[k];
227:       x[14+idt] += v[14]*s[k];
228:       v         += 15;
229:     }
230:   }
231:   VecRestoreArrayRead(bb,&b);
232:   VecRestoreArray(xx,&x);
233:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
234:   return(0);
235: }

239: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
240: {
241:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
242:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
243:   PetscErrorCode    ierr;
244:   PetscInt          i,nz,idx,idt,jdx;
245:   const MatScalar   *aa=a->a,*v;
246:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
247:   const PetscScalar *b;

250:   VecGetArrayRead(bb,&b);
251:   VecGetArray(xx,&x);
252:   /* forward solve the lower triangular */
253:   idx  = 0;
254:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
255:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
256:   x[6] = b[6+idx];
257:   for (i=1; i<n; i++) {
258:     v   =  aa + 49*ai[i];
259:     vi  =  aj + ai[i];
260:     nz  =  diag[i] - ai[i];
261:     idx =  7*i;
262:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
263:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
264:     s7  =  b[6+idx];
265:     while (nz--) {
266:       jdx = 7*(*vi++);
267:       x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
268:       x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
269:       x7  = x[6+jdx];
270:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
271:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
272:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
273:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
274:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
275:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
276:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
277:       v  += 49;
278:     }
279:     x[idx]   = s1;
280:     x[1+idx] = s2;
281:     x[2+idx] = s3;
282:     x[3+idx] = s4;
283:     x[4+idx] = s5;
284:     x[5+idx] = s6;
285:     x[6+idx] = s7;
286:   }
287:   /* backward solve the upper triangular */
288:   for (i=n-1; i>=0; i--) {
289:     v   = aa + 49*diag[i] + 49;
290:     vi  = aj + diag[i] + 1;
291:     nz  = ai[i+1] - diag[i] - 1;
292:     idt = 7*i;
293:     s1  = x[idt];   s2 = x[1+idt];
294:     s3  = x[2+idt]; s4 = x[3+idt];
295:     s5  = x[4+idt]; s6 = x[5+idt];
296:     s7  = x[6+idt];
297:     while (nz--) {
298:       idx = 7*(*vi++);
299:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
300:       x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
301:       x7  = x[6+idx];
302:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
303:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
304:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
305:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
306:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
307:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
308:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
309:       v  += 49;
310:     }
311:     v      = aa + 49*diag[i];
312:     x[idt] = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
313:              + v[28]*s5 + v[35]*s6 + v[42]*s7;
314:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
315:                + v[29]*s5 + v[36]*s6 + v[43]*s7;
316:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
317:                + v[30]*s5 + v[37]*s6 + v[44]*s7;
318:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
319:                + v[31]*s5 + v[38]*s6 + v[45]*s7;
320:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
321:                + v[32]*s5 + v[39]*s6 + v[46]*s7;
322:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
323:                + v[33]*s5 + v[40]*s6 + v[47]*s7;
324:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
325:                + v[34]*s5 + v[41]*s6 + v[48]*s7;
326:   }

328:   VecRestoreArrayRead(bb,&b);
329:   VecRestoreArray(xx,&x);
330:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
331:   return(0);
332: }

336: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
337: {
338:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
339:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
340:   PetscErrorCode    ierr;
341:   PetscInt          i,k,nz,idx,jdx,idt;
342:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
343:   const MatScalar   *aa=a->a,*v;
344:   PetscScalar       *x;
345:   const PetscScalar *b;
346:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

349:   VecGetArrayRead(bb,&b);
350:   VecGetArray(xx,&x);
351:   /* forward solve the lower triangular */
352:   idx  = 0;
353:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
354:   x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
355:   for (i=1; i<n; i++) {
356:     v   = aa + bs2*ai[i];
357:     vi  = aj + ai[i];
358:     nz  = ai[i+1] - ai[i];
359:     idx = bs*i;
360:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
361:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
362:     for (k=0; k<nz; k++) {
363:       jdx = bs*vi[k];
364:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
365:       x5  = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
366:       s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
367:       s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
368:       s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
369:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
370:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
371:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
372:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
373:       v  +=  bs2;
374:     }

376:     x[idx]   = s1;
377:     x[1+idx] = s2;
378:     x[2+idx] = s3;
379:     x[3+idx] = s4;
380:     x[4+idx] = s5;
381:     x[5+idx] = s6;
382:     x[6+idx] = s7;
383:   }

385:   /* backward solve the upper triangular */
386:   for (i=n-1; i>=0; i--) {
387:     v   = aa + bs2*(adiag[i+1]+1);
388:     vi  = aj + adiag[i+1]+1;
389:     nz  = adiag[i] - adiag[i+1]-1;
390:     idt = bs*i;
391:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
392:     s5  = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
393:     for (k=0; k<nz; k++) {
394:       idx = bs*vi[k];
395:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
396:       x5  = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
397:       s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
398:       s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
399:       s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
400:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
401:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
402:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
403:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
404:       v  +=  bs2;
405:     }
406:     /* x = inv_diagonal*x */
407:     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
408:     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
409:     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
410:     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
411:     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
412:     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
413:     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
414:   }

416:   VecRestoreArrayRead(bb,&b);
417:   VecRestoreArray(xx,&x);
418:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
419:   return(0);
420: }

424: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
425: {
426:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
427:   PetscInt          i,nz,idx,idt,jdx;
428:   PetscErrorCode    ierr;
429:   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
430:   const MatScalar   *aa   =a->a,*v;
431:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
432:   const PetscScalar *b;

435:   VecGetArrayRead(bb,&b);
436:   VecGetArray(xx,&x);
437:   /* forward solve the lower triangular */
438:   idx  = 0;
439:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
440:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
441:   for (i=1; i<n; i++) {
442:     v   =  aa + 36*ai[i];
443:     vi  =  aj + ai[i];
444:     nz  =  diag[i] - ai[i];
445:     idx =  6*i;
446:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
447:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
448:     while (nz--) {
449:       jdx = 6*(*vi++);
450:       x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
451:       x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
452:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
453:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
454:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
455:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
456:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
457:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
458:       v  += 36;
459:     }
460:     x[idx]   = s1;
461:     x[1+idx] = s2;
462:     x[2+idx] = s3;
463:     x[3+idx] = s4;
464:     x[4+idx] = s5;
465:     x[5+idx] = s6;
466:   }
467:   /* backward solve the upper triangular */
468:   for (i=n-1; i>=0; i--) {
469:     v   = aa + 36*diag[i] + 36;
470:     vi  = aj + diag[i] + 1;
471:     nz  = ai[i+1] - diag[i] - 1;
472:     idt = 6*i;
473:     s1  = x[idt];   s2 = x[1+idt];
474:     s3  = x[2+idt]; s4 = x[3+idt];
475:     s5  = x[4+idt]; s6 = x[5+idt];
476:     while (nz--) {
477:       idx = 6*(*vi++);
478:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
479:       x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
480:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
481:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
482:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
483:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
484:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
485:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
486:       v  += 36;
487:     }
488:     v        = aa + 36*diag[i];
489:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
490:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
491:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
492:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
493:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
494:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
495:   }

497:   VecRestoreArrayRead(bb,&b);
498:   VecRestoreArray(xx,&x);
499:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
500:   return(0);
501: }

505: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
506: {
507:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
508:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
509:   PetscErrorCode    ierr;
510:   PetscInt          i,k,nz,idx,jdx,idt;
511:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
512:   const MatScalar   *aa=a->a,*v;
513:   PetscScalar       *x;
514:   const PetscScalar *b;
515:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

518:   VecGetArrayRead(bb,&b);
519:   VecGetArray(xx,&x);
520:   /* forward solve the lower triangular */
521:   idx  = 0;
522:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
523:   x[4] = b[4+idx];x[5] = b[5+idx];
524:   for (i=1; i<n; i++) {
525:     v   = aa + bs2*ai[i];
526:     vi  = aj + ai[i];
527:     nz  = ai[i+1] - ai[i];
528:     idx = bs*i;
529:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
530:     s5  = b[4+idx];s6 = b[5+idx];
531:     for (k=0; k<nz; k++) {
532:       jdx = bs*vi[k];
533:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
534:       x5  = x[4+jdx]; x6 = x[5+jdx];
535:       s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
536:       s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
537:       s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
538:       s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
539:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
540:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
541:       v  +=  bs2;
542:     }

544:     x[idx]   = s1;
545:     x[1+idx] = s2;
546:     x[2+idx] = s3;
547:     x[3+idx] = s4;
548:     x[4+idx] = s5;
549:     x[5+idx] = s6;
550:   }

552:   /* backward solve the upper triangular */
553:   for (i=n-1; i>=0; i--) {
554:     v   = aa + bs2*(adiag[i+1]+1);
555:     vi  = aj + adiag[i+1]+1;
556:     nz  = adiag[i] - adiag[i+1]-1;
557:     idt = bs*i;
558:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
559:     s5  = x[4+idt];s6 = x[5+idt];
560:     for (k=0; k<nz; k++) {
561:       idx = bs*vi[k];
562:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
563:       x5  = x[4+idx];x6 = x[5+idx];
564:       s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
565:       s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
566:       s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
567:       s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
568:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
569:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
570:       v  +=  bs2;
571:     }
572:     /* x = inv_diagonal*x */
573:     x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
574:     x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
575:     x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
576:     x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
577:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
578:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
579:   }

581:   VecRestoreArrayRead(bb,&b);
582:   VecRestoreArray(xx,&x);
583:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
584:   return(0);
585: }

589: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
590: {
591:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
592:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
593:   PetscInt          i,nz,idx,idt,jdx;
594:   PetscErrorCode    ierr;
595:   const MatScalar   *aa=a->a,*v;
596:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
597:   const PetscScalar *b;

600:   VecGetArrayRead(bb,&b);
601:   VecGetArray(xx,&x);
602:   /* forward solve the lower triangular */
603:   idx  = 0;
604:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
605:   for (i=1; i<n; i++) {
606:     v   =  aa + 25*ai[i];
607:     vi  =  aj + ai[i];
608:     nz  =  diag[i] - ai[i];
609:     idx =  5*i;
610:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
611:     while (nz--) {
612:       jdx = 5*(*vi++);
613:       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
614:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
615:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
616:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
617:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
618:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
619:       v  += 25;
620:     }
621:     x[idx]   = s1;
622:     x[1+idx] = s2;
623:     x[2+idx] = s3;
624:     x[3+idx] = s4;
625:     x[4+idx] = s5;
626:   }
627:   /* backward solve the upper triangular */
628:   for (i=n-1; i>=0; i--) {
629:     v   = aa + 25*diag[i] + 25;
630:     vi  = aj + diag[i] + 1;
631:     nz  = ai[i+1] - diag[i] - 1;
632:     idt = 5*i;
633:     s1  = x[idt];  s2 = x[1+idt];
634:     s3  = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
635:     while (nz--) {
636:       idx = 5*(*vi++);
637:       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
638:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
639:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
640:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
641:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
642:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
643:       v  += 25;
644:     }
645:     v        = aa + 25*diag[i];
646:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
647:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
648:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
649:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
650:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
651:   }

653:   VecRestoreArrayRead(bb,&b);
654:   VecRestoreArray(xx,&x);
655:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
656:   return(0);
657: }

661: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
662: {
663:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
664:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
665:   PetscInt          i,k,nz,idx,idt,jdx;
666:   PetscErrorCode    ierr;
667:   const MatScalar   *aa=a->a,*v;
668:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
669:   const PetscScalar *b;

672:   VecGetArrayRead(bb,&b);
673:   VecGetArray(xx,&x);
674:   /* forward solve the lower triangular */
675:   idx  = 0;
676:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
677:   for (i=1; i<n; i++) {
678:     v   = aa + 25*ai[i];
679:     vi  = aj + ai[i];
680:     nz  = ai[i+1] - ai[i];
681:     idx = 5*i;
682:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
683:     for (k=0; k<nz; k++) {
684:       jdx = 5*vi[k];
685:       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
686:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
687:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
688:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
689:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
690:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
691:       v  += 25;
692:     }
693:     x[idx]   = s1;
694:     x[1+idx] = s2;
695:     x[2+idx] = s3;
696:     x[3+idx] = s4;
697:     x[4+idx] = s5;
698:   }

700:   /* backward solve the upper triangular */
701:   for (i=n-1; i>=0; i--) {
702:     v   = aa + 25*(adiag[i+1]+1);
703:     vi  = aj + adiag[i+1]+1;
704:     nz  = adiag[i] - adiag[i+1]-1;
705:     idt = 5*i;
706:     s1  = x[idt];  s2 = x[1+idt];
707:     s3  = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
708:     for (k=0; k<nz; k++) {
709:       idx = 5*vi[k];
710:       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
711:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
712:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
713:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
714:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
715:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
716:       v  += 25;
717:     }
718:     /* x = inv_diagonal*x */
719:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
720:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
721:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
722:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
723:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
724:   }

726:   VecRestoreArrayRead(bb,&b);
727:   VecRestoreArray(xx,&x);
728:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
729:   return(0);
730: }

732: /*
733:       Special case where the matrix was ILU(0) factored in the natural
734:    ordering. This eliminates the need for the column and row permutation.
735: */
738: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
739: {
740:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
741:   PetscInt          n  =a->mbs;
742:   const PetscInt    *ai=a->i,*aj=a->j;
743:   PetscErrorCode    ierr;
744:   const PetscInt    *diag = a->diag;
745:   const MatScalar   *aa   =a->a;
746:   PetscScalar       *x;
747:   const PetscScalar *b;

750:   VecGetArrayRead(bb,&b);
751:   VecGetArray(xx,&x);

753: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
754:   {
755:     static PetscScalar w[2000]; /* very BAD need to fix */
756:     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
757:   }
758: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
759:   {
760:     static PetscScalar w[2000]; /* very BAD need to fix */
761:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
762:   }
763: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
764:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
765: #else
766:   {
767:     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
768:     const MatScalar *v;
769:     PetscInt        jdx,idt,idx,nz,i,ai16;
770:     const PetscInt  *vi;

772:     /* forward solve the lower triangular */
773:     idx  = 0;
774:     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
775:     for (i=1; i<n; i++) {
776:       v    =  aa      + 16*ai[i];
777:       vi   =  aj      + ai[i];
778:       nz   =  diag[i] - ai[i];
779:       idx +=  4;
780:       s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
781:       while (nz--) {
782:         jdx = 4*(*vi++);
783:         x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
784:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
785:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
786:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
787:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
788:         v  += 16;
789:       }
790:       x[idx]   = s1;
791:       x[1+idx] = s2;
792:       x[2+idx] = s3;
793:       x[3+idx] = s4;
794:     }
795:     /* backward solve the upper triangular */
796:     idt = 4*(n-1);
797:     for (i=n-1; i>=0; i--) {
798:       ai16 = 16*diag[i];
799:       v    = aa + ai16 + 16;
800:       vi   = aj + diag[i] + 1;
801:       nz   = ai[i+1] - diag[i] - 1;
802:       s1   = x[idt];  s2 = x[1+idt];
803:       s3   = x[2+idt];s4 = x[3+idt];
804:       while (nz--) {
805:         idx = 4*(*vi++);
806:         x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
807:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
808:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
809:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
810:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
811:         v  += 16;
812:       }
813:       v        = aa + ai16;
814:       x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
815:       x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
816:       x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
817:       x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
818:       idt     -= 4;
819:     }
820:   }
821: #endif

823:   VecRestoreArrayRead(bb,&b);
824:   VecRestoreArray(xx,&x);
825:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
826:   return(0);
827: }

831: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
832: {
833:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
834:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
835:   PetscInt          i,k,nz,idx,jdx,idt;
836:   PetscErrorCode    ierr;
837:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
838:   const MatScalar   *aa=a->a,*v;
839:   PetscScalar       *x;
840:   const PetscScalar *b;
841:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;

844:   VecGetArrayRead(bb,&b);
845:   VecGetArray(xx,&x);
846:   /* forward solve the lower triangular */
847:   idx  = 0;
848:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
849:   for (i=1; i<n; i++) {
850:     v   = aa + bs2*ai[i];
851:     vi  = aj + ai[i];
852:     nz  = ai[i+1] - ai[i];
853:     idx = bs*i;
854:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
855:     for (k=0; k<nz; k++) {
856:       jdx = bs*vi[k];
857:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
858:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
859:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
860:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
861:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

863:       v +=  bs2;
864:     }

866:     x[idx]   = s1;
867:     x[1+idx] = s2;
868:     x[2+idx] = s3;
869:     x[3+idx] = s4;
870:   }

872:   /* backward solve the upper triangular */
873:   for (i=n-1; i>=0; i--) {
874:     v   = aa + bs2*(adiag[i+1]+1);
875:     vi  = aj + adiag[i+1]+1;
876:     nz  = adiag[i] - adiag[i+1]-1;
877:     idt = bs*i;
878:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];

880:     for (k=0; k<nz; k++) {
881:       idx = bs*vi[k];
882:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
883:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
884:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
885:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
886:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

888:       v +=  bs2;
889:     }
890:     /* x = inv_diagonal*x */
891:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
892:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
893:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
894:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;

896:   }

898:   VecRestoreArrayRead(bb,&b);
899:   VecRestoreArray(xx,&x);
900:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
901:   return(0);
902: }

906: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
907: {
908:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
909:   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
910:   PetscErrorCode    ierr;
911:   const MatScalar   *aa=a->a;
912:   const PetscScalar *b;
913:   PetscScalar       *x;

916:   VecGetArrayRead(bb,&b);
917:   VecGetArray(xx,&x);

919:   {
920:     MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
921:     const MatScalar *v;
922:     MatScalar       *t=(MatScalar*)x;
923:     PetscInt        jdx,idt,idx,nz,i,ai16;
924:     const PetscInt  *vi;

926:     /* forward solve the lower triangular */
927:     idx  = 0;
928:     t[0] = (MatScalar)b[0];
929:     t[1] = (MatScalar)b[1];
930:     t[2] = (MatScalar)b[2];
931:     t[3] = (MatScalar)b[3];
932:     for (i=1; i<n; i++) {
933:       v    =  aa      + 16*ai[i];
934:       vi   =  aj      + ai[i];
935:       nz   =  diag[i] - ai[i];
936:       idx +=  4;
937:       s1   = (MatScalar)b[idx];
938:       s2   = (MatScalar)b[1+idx];
939:       s3   = (MatScalar)b[2+idx];
940:       s4   = (MatScalar)b[3+idx];
941:       while (nz--) {
942:         jdx = 4*(*vi++);
943:         x1  = t[jdx];
944:         x2  = t[1+jdx];
945:         x3  = t[2+jdx];
946:         x4  = t[3+jdx];
947:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
948:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
949:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
950:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
951:         v  += 16;
952:       }
953:       t[idx]   = s1;
954:       t[1+idx] = s2;
955:       t[2+idx] = s3;
956:       t[3+idx] = s4;
957:     }
958:     /* backward solve the upper triangular */
959:     idt = 4*(n-1);
960:     for (i=n-1; i>=0; i--) {
961:       ai16 = 16*diag[i];
962:       v    = aa + ai16 + 16;
963:       vi   = aj + diag[i] + 1;
964:       nz   = ai[i+1] - diag[i] - 1;
965:       s1   = t[idt];
966:       s2   = t[1+idt];
967:       s3   = t[2+idt];
968:       s4   = t[3+idt];
969:       while (nz--) {
970:         idx = 4*(*vi++);
971:         x1  = (MatScalar)x[idx];
972:         x2  = (MatScalar)x[1+idx];
973:         x3  = (MatScalar)x[2+idx];
974:         x4  = (MatScalar)x[3+idx];
975:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
976:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
977:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
978:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
979:         v  += 16;
980:       }
981:       v        = aa + ai16;
982:       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
983:       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
984:       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
985:       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
986:       idt     -= 4;
987:     }
988:   }

990:   VecRestoreArrayRead(bb,&b);
991:   VecRestoreArray(xx,&x);
992:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
993:   return(0);
994: }

996: #if defined(PETSC_HAVE_SSE)

998: #include PETSC_HAVE_SSE
1001: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
1002: {
1003:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1004:   unsigned short *aj=(unsigned short*)a->j;
1006:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
1007:   MatScalar      *aa=a->a;
1008:   PetscScalar    *x,*b;

1011:   SSE_SCOPE_BEGIN;
1012:   /*
1013:      Note: This code currently uses demotion of double
1014:      to float when performing the mixed-mode computation.
1015:      This may not be numerically reasonable for all applications.
1016:   */
1017:   PREFETCH_NTA(aa+16*ai[1]);

1019:   VecGetArray(bb,&b);
1020:   VecGetArray(xx,&x);
1021:   {
1022:     /* x will first be computed in single precision then promoted inplace to double */
1023:     MatScalar      *v,*t=(MatScalar*)x;
1024:     int            nz,i,idt,ai16;
1025:     unsigned int   jdx,idx;
1026:     unsigned short *vi;
1027:     /* Forward solve the lower triangular factor. */

1029:     /* First block is the identity. */
1030:     idx = 0;
1031:     CONVERT_DOUBLE4_FLOAT4(t,b);
1032:     v =  aa + 16*((unsigned int)ai[1]);

1034:     for (i=1; i<n; ) {
1035:       PREFETCH_NTA(&v[8]);
1036:       vi   =  aj      + ai[i];
1037:       nz   =  diag[i] - ai[i];
1038:       idx +=  4;

1040:       /* Demote RHS from double to float. */
1041:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
1042:       LOAD_PS(&t[idx],XMM7);

1044:       while (nz--) {
1045:         PREFETCH_NTA(&v[16]);
1046:         jdx = 4*((unsigned int)(*vi++));

1048:         /* 4x4 Matrix-Vector product with negative accumulation: */
1049:         SSE_INLINE_BEGIN_2(&t[jdx],v)
1050:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1052:         /* First Column */
1053:         SSE_COPY_PS(XMM0,XMM6)
1054:         SSE_SHUFFLE(XMM0,XMM0,0x00)
1055:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1056:         SSE_SUB_PS(XMM7,XMM0)

1058:         /* Second Column */
1059:         SSE_COPY_PS(XMM1,XMM6)
1060:         SSE_SHUFFLE(XMM1,XMM1,0x55)
1061:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1062:         SSE_SUB_PS(XMM7,XMM1)

1064:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

1066:         /* Third Column */
1067:         SSE_COPY_PS(XMM2,XMM6)
1068:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1069:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1070:         SSE_SUB_PS(XMM7,XMM2)

1072:         /* Fourth Column */
1073:         SSE_COPY_PS(XMM3,XMM6)
1074:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1075:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1076:         SSE_SUB_PS(XMM7,XMM3)
1077:         SSE_INLINE_END_2

1079:         v += 16;
1080:       }
1081:       v =  aa + 16*ai[++i];
1082:       PREFETCH_NTA(v);
1083:       STORE_PS(&t[idx],XMM7);
1084:     }

1086:     /* Backward solve the upper triangular factor.*/

1088:     idt  = 4*(n-1);
1089:     ai16 = 16*diag[n-1];
1090:     v    = aa + ai16 + 16;
1091:     for (i=n-1; i>=0; ) {
1092:       PREFETCH_NTA(&v[8]);
1093:       vi = aj + diag[i] + 1;
1094:       nz = ai[i+1] - diag[i] - 1;

1096:       LOAD_PS(&t[idt],XMM7);

1098:       while (nz--) {
1099:         PREFETCH_NTA(&v[16]);
1100:         idx = 4*((unsigned int)(*vi++));

1102:         /* 4x4 Matrix-Vector Product with negative accumulation: */
1103:         SSE_INLINE_BEGIN_2(&t[idx],v)
1104:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1106:         /* First Column */
1107:         SSE_COPY_PS(XMM0,XMM6)
1108:         SSE_SHUFFLE(XMM0,XMM0,0x00)
1109:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1110:         SSE_SUB_PS(XMM7,XMM0)

1112:         /* Second Column */
1113:         SSE_COPY_PS(XMM1,XMM6)
1114:         SSE_SHUFFLE(XMM1,XMM1,0x55)
1115:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1116:         SSE_SUB_PS(XMM7,XMM1)

1118:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

1120:         /* Third Column */
1121:         SSE_COPY_PS(XMM2,XMM6)
1122:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1123:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1124:         SSE_SUB_PS(XMM7,XMM2)

1126:         /* Fourth Column */
1127:         SSE_COPY_PS(XMM3,XMM6)
1128:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1129:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1130:         SSE_SUB_PS(XMM7,XMM3)
1131:         SSE_INLINE_END_2
1132:         v += 16;
1133:       }
1134:       v    = aa + ai16;
1135:       ai16 = 16*diag[--i];
1136:       PREFETCH_NTA(aa+ai16+16);
1137:       /*
1138:          Scale the result by the diagonal 4x4 block,
1139:          which was inverted as part of the factorization
1140:       */
1141:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
1142:       /* First Column */
1143:       SSE_COPY_PS(XMM0,XMM7)
1144:       SSE_SHUFFLE(XMM0,XMM0,0x00)
1145:       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

1147:       /* Second Column */
1148:       SSE_COPY_PS(XMM1,XMM7)
1149:       SSE_SHUFFLE(XMM1,XMM1,0x55)
1150:       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1151:       SSE_ADD_PS(XMM0,XMM1)

1153:       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

1155:       /* Third Column */
1156:       SSE_COPY_PS(XMM2,XMM7)
1157:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
1158:       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1159:       SSE_ADD_PS(XMM0,XMM2)

1161:       /* Fourth Column */
1162:       SSE_COPY_PS(XMM3,XMM7)
1163:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
1164:       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1165:       SSE_ADD_PS(XMM0,XMM3)

1167:       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1168:       SSE_INLINE_END_3

1170:       v    = aa + ai16 + 16;
1171:       idt -= 4;
1172:     }

1174:     /* Convert t from single precision back to double precision (inplace)*/
1175:     idt = 4*(n-1);
1176:     for (i=n-1; i>=0; i--) {
1177:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
1178:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
1179:       PetscScalar *xtemp=&x[idt];
1180:       MatScalar   *ttemp=&t[idt];
1181:       xtemp[3] = (PetscScalar)ttemp[3];
1182:       xtemp[2] = (PetscScalar)ttemp[2];
1183:       xtemp[1] = (PetscScalar)ttemp[1];
1184:       xtemp[0] = (PetscScalar)ttemp[0];
1185:       idt     -= 4;
1186:     }

1188:   } /* End of artificial scope. */
1189:   VecRestoreArray(bb,&b);
1190:   VecRestoreArray(xx,&x);
1191:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1192:   SSE_SCOPE_END;
1193:   return(0);
1194: }

1198: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
1199: {
1200:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1201:   int            *aj=a->j;
1203:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
1204:   MatScalar      *aa=a->a;
1205:   PetscScalar    *x,*b;

1208:   SSE_SCOPE_BEGIN;
1209:   /*
1210:      Note: This code currently uses demotion of double
1211:      to float when performing the mixed-mode computation.
1212:      This may not be numerically reasonable for all applications.
1213:   */
1214:   PREFETCH_NTA(aa+16*ai[1]);

1216:   VecGetArray(bb,&b);
1217:   VecGetArray(xx,&x);
1218:   {
1219:     /* x will first be computed in single precision then promoted inplace to double */
1220:     MatScalar *v,*t=(MatScalar*)x;
1221:     int       nz,i,idt,ai16;
1222:     int       jdx,idx;
1223:     int       *vi;
1224:     /* Forward solve the lower triangular factor. */

1226:     /* First block is the identity. */
1227:     idx = 0;
1228:     CONVERT_DOUBLE4_FLOAT4(t,b);
1229:     v =  aa + 16*ai[1];

1231:     for (i=1; i<n; ) {
1232:       PREFETCH_NTA(&v[8]);
1233:       vi   =  aj      + ai[i];
1234:       nz   =  diag[i] - ai[i];
1235:       idx +=  4;

1237:       /* Demote RHS from double to float. */
1238:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
1239:       LOAD_PS(&t[idx],XMM7);

1241:       while (nz--) {
1242:         PREFETCH_NTA(&v[16]);
1243:         jdx = 4*(*vi++);
1244: /*          jdx = *vi++; */

1246:         /* 4x4 Matrix-Vector product with negative accumulation: */
1247:         SSE_INLINE_BEGIN_2(&t[jdx],v)
1248:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1250:         /* First Column */
1251:         SSE_COPY_PS(XMM0,XMM6)
1252:         SSE_SHUFFLE(XMM0,XMM0,0x00)
1253:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1254:         SSE_SUB_PS(XMM7,XMM0)

1256:         /* Second Column */
1257:         SSE_COPY_PS(XMM1,XMM6)
1258:         SSE_SHUFFLE(XMM1,XMM1,0x55)
1259:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1260:         SSE_SUB_PS(XMM7,XMM1)

1262:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

1264:         /* Third Column */
1265:         SSE_COPY_PS(XMM2,XMM6)
1266:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1267:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1268:         SSE_SUB_PS(XMM7,XMM2)

1270:         /* Fourth Column */
1271:         SSE_COPY_PS(XMM3,XMM6)
1272:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1273:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1274:         SSE_SUB_PS(XMM7,XMM3)
1275:         SSE_INLINE_END_2

1277:         v += 16;
1278:       }
1279:       v =  aa + 16*ai[++i];
1280:       PREFETCH_NTA(v);
1281:       STORE_PS(&t[idx],XMM7);
1282:     }

1284:     /* Backward solve the upper triangular factor.*/

1286:     idt  = 4*(n-1);
1287:     ai16 = 16*diag[n-1];
1288:     v    = aa + ai16 + 16;
1289:     for (i=n-1; i>=0; ) {
1290:       PREFETCH_NTA(&v[8]);
1291:       vi = aj + diag[i] + 1;
1292:       nz = ai[i+1] - diag[i] - 1;

1294:       LOAD_PS(&t[idt],XMM7);

1296:       while (nz--) {
1297:         PREFETCH_NTA(&v[16]);
1298:         idx = 4*(*vi++);
1299: /*          idx = *vi++; */

1301:         /* 4x4 Matrix-Vector Product with negative accumulation: */
1302:         SSE_INLINE_BEGIN_2(&t[idx],v)
1303:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1305:         /* First Column */
1306:         SSE_COPY_PS(XMM0,XMM6)
1307:         SSE_SHUFFLE(XMM0,XMM0,0x00)
1308:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1309:         SSE_SUB_PS(XMM7,XMM0)

1311:         /* Second Column */
1312:         SSE_COPY_PS(XMM1,XMM6)
1313:         SSE_SHUFFLE(XMM1,XMM1,0x55)
1314:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1315:         SSE_SUB_PS(XMM7,XMM1)

1317:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

1319:         /* Third Column */
1320:         SSE_COPY_PS(XMM2,XMM6)
1321:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1322:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1323:         SSE_SUB_PS(XMM7,XMM2)

1325:         /* Fourth Column */
1326:         SSE_COPY_PS(XMM3,XMM6)
1327:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1328:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1329:         SSE_SUB_PS(XMM7,XMM3)
1330:         SSE_INLINE_END_2
1331:         v += 16;
1332:       }
1333:       v    = aa + ai16;
1334:       ai16 = 16*diag[--i];
1335:       PREFETCH_NTA(aa+ai16+16);
1336:       /*
1337:          Scale the result by the diagonal 4x4 block,
1338:          which was inverted as part of the factorization
1339:       */
1340:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
1341:       /* First Column */
1342:       SSE_COPY_PS(XMM0,XMM7)
1343:       SSE_SHUFFLE(XMM0,XMM0,0x00)
1344:       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

1346:       /* Second Column */
1347:       SSE_COPY_PS(XMM1,XMM7)
1348:       SSE_SHUFFLE(XMM1,XMM1,0x55)
1349:       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1350:       SSE_ADD_PS(XMM0,XMM1)

1352:       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

1354:       /* Third Column */
1355:       SSE_COPY_PS(XMM2,XMM7)
1356:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
1357:       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1358:       SSE_ADD_PS(XMM0,XMM2)

1360:       /* Fourth Column */
1361:       SSE_COPY_PS(XMM3,XMM7)
1362:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
1363:       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1364:       SSE_ADD_PS(XMM0,XMM3)

1366:       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1367:       SSE_INLINE_END_3

1369:       v    = aa + ai16 + 16;
1370:       idt -= 4;
1371:     }

1373:     /* Convert t from single precision back to double precision (inplace)*/
1374:     idt = 4*(n-1);
1375:     for (i=n-1; i>=0; i--) {
1376:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
1377:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
1378:       PetscScalar *xtemp=&x[idt];
1379:       MatScalar   *ttemp=&t[idt];
1380:       xtemp[3] = (PetscScalar)ttemp[3];
1381:       xtemp[2] = (PetscScalar)ttemp[2];
1382:       xtemp[1] = (PetscScalar)ttemp[1];
1383:       xtemp[0] = (PetscScalar)ttemp[0];
1384:       idt     -= 4;
1385:     }

1387:   } /* End of artificial scope. */
1388:   VecRestoreArray(bb,&b);
1389:   VecRestoreArray(xx,&x);
1390:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1391:   SSE_SCOPE_END;
1392:   return(0);
1393: }

1395: #endif

1397: /*
1398:       Special case where the matrix was ILU(0) factored in the natural
1399:    ordering. This eliminates the need for the column and row permutation.
1400: */
1403: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1404: {
1405:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1406:   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j;
1407:   PetscErrorCode    ierr;
1408:   const PetscInt    *diag = a->diag,*vi;
1409:   const MatScalar   *aa   =a->a,*v;
1410:   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
1411:   const PetscScalar *b;
1412:   PetscInt          jdx,idt,idx,nz,i;

1415:   VecGetArrayRead(bb,&b);
1416:   VecGetArray(xx,&x);

1418:   /* forward solve the lower triangular */
1419:   idx  = 0;
1420:   x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
1421:   for (i=1; i<n; i++) {
1422:     v    =  aa      + 9*ai[i];
1423:     vi   =  aj      + ai[i];
1424:     nz   =  diag[i] - ai[i];
1425:     idx +=  3;
1426:     s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
1427:     while (nz--) {
1428:       jdx = 3*(*vi++);
1429:       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
1430:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1431:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1432:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1433:       v  += 9;
1434:     }
1435:     x[idx]   = s1;
1436:     x[1+idx] = s2;
1437:     x[2+idx] = s3;
1438:   }
1439:   /* backward solve the upper triangular */
1440:   for (i=n-1; i>=0; i--) {
1441:     v   = aa + 9*diag[i] + 9;
1442:     vi  = aj + diag[i] + 1;
1443:     nz  = ai[i+1] - diag[i] - 1;
1444:     idt = 3*i;
1445:     s1  = x[idt];  s2 = x[1+idt];
1446:     s3  = x[2+idt];
1447:     while (nz--) {
1448:       idx = 3*(*vi++);
1449:       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
1450:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1451:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1452:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1453:       v  += 9;
1454:     }
1455:     v        = aa +  9*diag[i];
1456:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1457:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1458:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1459:   }

1461:   VecRestoreArrayRead(bb,&b);
1462:   VecRestoreArray(xx,&x);
1463:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1464:   return(0);
1465: }

1469: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1470: {
1471:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1472:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1473:   PetscErrorCode    ierr;
1474:   PetscInt          i,k,nz,idx,jdx,idt;
1475:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
1476:   const MatScalar   *aa=a->a,*v;
1477:   PetscScalar       *x;
1478:   const PetscScalar *b;
1479:   PetscScalar       s1,s2,s3,x1,x2,x3;

1482:   VecGetArrayRead(bb,&b);
1483:   VecGetArray(xx,&x);
1484:   /* forward solve the lower triangular */
1485:   idx  = 0;
1486:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
1487:   for (i=1; i<n; i++) {
1488:     v   = aa + bs2*ai[i];
1489:     vi  = aj + ai[i];
1490:     nz  = ai[i+1] - ai[i];
1491:     idx = bs*i;
1492:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1493:     for (k=0; k<nz; k++) {
1494:       jdx = bs*vi[k];
1495:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
1496:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1497:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1498:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

1500:       v +=  bs2;
1501:     }

1503:     x[idx]   = s1;
1504:     x[1+idx] = s2;
1505:     x[2+idx] = s3;
1506:   }

1508:   /* backward solve the upper triangular */
1509:   for (i=n-1; i>=0; i--) {
1510:     v   = aa + bs2*(adiag[i+1]+1);
1511:     vi  = aj + adiag[i+1]+1;
1512:     nz  = adiag[i] - adiag[i+1]-1;
1513:     idt = bs*i;
1514:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];

1516:     for (k=0; k<nz; k++) {
1517:       idx = bs*vi[k];
1518:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1519:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1520:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1521:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

1523:       v +=  bs2;
1524:     }
1525:     /* x = inv_diagonal*x */
1526:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1527:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1528:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;

1530:   }

1532:   VecRestoreArrayRead(bb,&b);
1533:   VecRestoreArray(xx,&x);
1534:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1535:   return(0);
1536: }

1540: PetscErrorCode MatForwardSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1541: {
1542:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1543:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j;
1544:   PetscErrorCode    ierr;
1545:   PetscInt          i,k,nz,idx,jdx;
1546:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
1547:   const MatScalar   *aa=a->a,*v;
1548:   PetscScalar       *x;
1549:   const PetscScalar *b;
1550:   PetscScalar       s1,s2,s3,x1,x2,x3;

1553:   VecGetArrayRead(bb,&b);
1554:   VecGetArray(xx,&x);
1555:   /* forward solve the lower triangular */
1556:   idx  = 0;
1557:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
1558:   for (i=1; i<n; i++) {
1559:     v   = aa + bs2*ai[i];
1560:     vi  = aj + ai[i];
1561:     nz  = ai[i+1] - ai[i];
1562:     idx = bs*i;
1563:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1564:     for (k=0; k<nz; k++) {
1565:       jdx = bs*vi[k];
1566:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
1567:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1568:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1569:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

1571:       v +=  bs2;
1572:     }

1574:     x[idx]   = s1;
1575:     x[1+idx] = s2;
1576:     x[2+idx] = s3;
1577:   }

1579:   VecRestoreArrayRead(bb,&b);
1580:   VecRestoreArray(xx,&x);
1581:   PetscLogFlops(1.0*bs2*(a->nz) - bs*A->cmap->n);
1582:   return(0);
1583: }


1588: PetscErrorCode MatBackwardSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1589: {
1590:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1591:   const PetscInt    n  =a->mbs,*vi,*aj=a->j,*adiag=a->diag;
1592:   PetscErrorCode    ierr;
1593:   PetscInt          i,k,nz,idx,idt;
1594:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
1595:   const MatScalar   *aa=a->a,*v;
1596:   PetscScalar       *x;
1597:   const PetscScalar *b;
1598:   PetscScalar       s1,s2,s3,x1,x2,x3;

1601:   VecGetArrayRead(bb,&b);
1602:   VecGetArray(xx,&x);

1604:   /* backward solve the upper triangular */
1605:   for (i=n-1; i>=0; i--) {
1606:     v   = aa + bs2*(adiag[i+1]+1);
1607:     vi  = aj + adiag[i+1]+1;
1608:     nz  = adiag[i] - adiag[i+1]-1;
1609:     idt = bs*i;
1610:     s1  = b[idt];  s2 = b[1+idt];s3 = b[2+idt];

1612:     for (k=0; k<nz; k++) {
1613:       idx = bs*vi[k];
1614:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1615:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1616:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1617:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

1619:       v +=  bs2;
1620:     }
1621:     /* x = inv_diagonal*x */
1622:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1623:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1624:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;

1626:   }

1628:   VecRestoreArrayRead(bb,&b);
1629:   VecRestoreArray(xx,&x);
1630:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1631:   return(0);
1632: }

1634: /*
1635:       Special case where the matrix was ILU(0) factored in the natural
1636:    ordering. This eliminates the need for the column and row permutation.
1637: */
1640: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1641: {
1642:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1643:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1644:   PetscErrorCode    ierr;
1645:   const MatScalar   *aa=a->a,*v;
1646:   PetscScalar       *x,s1,s2,x1,x2;
1647:   const PetscScalar *b;
1648:   PetscInt          jdx,idt,idx,nz,i;

1651:   VecGetArrayRead(bb,&b);
1652:   VecGetArray(xx,&x);

1654:   /* forward solve the lower triangular */
1655:   idx  = 0;
1656:   x[0] = b[0]; x[1] = b[1];
1657:   for (i=1; i<n; i++) {
1658:     v    =  aa      + 4*ai[i];
1659:     vi   =  aj      + ai[i];
1660:     nz   =  diag[i] - ai[i];
1661:     idx +=  2;
1662:     s1   =  b[idx];s2 = b[1+idx];
1663:     while (nz--) {
1664:       jdx = 2*(*vi++);
1665:       x1  = x[jdx];x2 = x[1+jdx];
1666:       s1 -= v[0]*x1 + v[2]*x2;
1667:       s2 -= v[1]*x1 + v[3]*x2;
1668:       v  += 4;
1669:     }
1670:     x[idx]   = s1;
1671:     x[1+idx] = s2;
1672:   }
1673:   /* backward solve the upper triangular */
1674:   for (i=n-1; i>=0; i--) {
1675:     v   = aa + 4*diag[i] + 4;
1676:     vi  = aj + diag[i] + 1;
1677:     nz  = ai[i+1] - diag[i] - 1;
1678:     idt = 2*i;
1679:     s1  = x[idt];  s2 = x[1+idt];
1680:     while (nz--) {
1681:       idx = 2*(*vi++);
1682:       x1  = x[idx];   x2 = x[1+idx];
1683:       s1 -= v[0]*x1 + v[2]*x2;
1684:       s2 -= v[1]*x1 + v[3]*x2;
1685:       v  += 4;
1686:     }
1687:     v        = aa +  4*diag[i];
1688:     x[idt]   = v[0]*s1 + v[2]*s2;
1689:     x[1+idt] = v[1]*s1 + v[3]*s2;
1690:   }

1692:   VecRestoreArrayRead(bb,&b);
1693:   VecRestoreArray(xx,&x);
1694:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1695:   return(0);
1696: }

1700: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1701: {
1702:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1703:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1704:   PetscInt          i,k,nz,idx,idt,jdx;
1705:   PetscErrorCode    ierr;
1706:   const MatScalar   *aa=a->a,*v;
1707:   PetscScalar       *x,s1,s2,x1,x2;
1708:   const PetscScalar *b;

1711:   VecGetArrayRead(bb,&b);
1712:   VecGetArray(xx,&x);
1713:   /* forward solve the lower triangular */
1714:   idx  = 0;
1715:   x[0] = b[idx]; x[1] = b[1+idx];
1716:   for (i=1; i<n; i++) {
1717:     v   = aa + 4*ai[i];
1718:     vi  = aj + ai[i];
1719:     nz  = ai[i+1] - ai[i];
1720:     idx = 2*i;
1721:     s1  = b[idx];s2 = b[1+idx];
1722:     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
1723:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
1724:     for (k=0; k<nz; k++) {
1725:       jdx = 2*vi[k];
1726:       x1  = x[jdx];x2 = x[1+jdx];
1727:       s1 -= v[0]*x1 + v[2]*x2;
1728:       s2 -= v[1]*x1 + v[3]*x2;
1729:       v  +=  4;
1730:     }
1731:     x[idx]   = s1;
1732:     x[1+idx] = s2;
1733:   }

1735:   /* backward solve the upper triangular */
1736:   for (i=n-1; i>=0; i--) {
1737:     v   = aa + 4*(adiag[i+1]+1);
1738:     vi  = aj + adiag[i+1]+1;
1739:     nz  = adiag[i] - adiag[i+1]-1;
1740:     idt = 2*i;
1741:     s1  = x[idt];  s2 = x[1+idt];
1742:     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
1743:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
1744:     for (k=0; k<nz; k++) {
1745:       idx = 2*vi[k];
1746:       x1  = x[idx];   x2 = x[1+idx];
1747:       s1 -= v[0]*x1 + v[2]*x2;
1748:       s2 -= v[1]*x1 + v[3]*x2;
1749:       v  += 4;
1750:     }
1751:     /* x = inv_diagonal*x */
1752:     x[idt]   = v[0]*s1 + v[2]*s2;
1753:     x[1+idt] = v[1]*s1 + v[3]*s2;
1754:   }

1756:   VecRestoreArrayRead(bb,&b);
1757:   VecRestoreArray(xx,&x);
1758:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1759:   return(0);
1760: }

1764: PetscErrorCode MatForwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1765: {
1766:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1767:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j;
1768:   PetscInt          i,k,nz,idx,jdx;
1769:   PetscErrorCode    ierr;
1770:   const MatScalar   *aa=a->a,*v;
1771:   PetscScalar       *x,s1,s2,x1,x2;
1772:   const PetscScalar *b;

1775:   VecGetArrayRead(bb,&b);
1776:   VecGetArray(xx,&x);
1777:   /* forward solve the lower triangular */
1778:   idx  = 0;
1779:   x[0] = b[idx]; x[1] = b[1+idx];
1780:   for (i=1; i<n; i++) {
1781:     v   = aa + 4*ai[i];
1782:     vi  = aj + ai[i];
1783:     nz  = ai[i+1] - ai[i];
1784:     idx = 2*i;
1785:     s1  = b[idx];s2 = b[1+idx];
1786:     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
1787:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
1788:     for (k=0; k<nz; k++) {
1789:       jdx = 2*vi[k];
1790:       x1  = x[jdx];x2 = x[1+jdx];
1791:       s1 -= v[0]*x1 + v[2]*x2;
1792:       s2 -= v[1]*x1 + v[3]*x2;
1793:       v  +=  4;
1794:     }
1795:     x[idx]   = s1;
1796:     x[1+idx] = s2;
1797:   }


1800:   VecRestoreArrayRead(bb,&b);
1801:   VecRestoreArray(xx,&x);
1802:   PetscLogFlops(4.0*(a->nz) - A->cmap->n);
1803:   return(0);
1804: }

1808: PetscErrorCode MatBackwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1809: {
1810:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1811:   const PetscInt    n  = a->mbs,*vi,*aj=a->j,*adiag=a->diag;
1812:   PetscInt          i,k,nz,idx,idt;
1813:   PetscErrorCode    ierr;
1814:   const MatScalar   *aa=a->a,*v;
1815:   PetscScalar       *x,s1,s2,x1,x2;
1816:   const PetscScalar *b;

1819:   VecGetArrayRead(bb,&b);
1820:   VecGetArray(xx,&x);

1822:   /* backward solve the upper triangular */
1823:   for (i=n-1; i>=0; i--) {
1824:     v   = aa + 4*(adiag[i+1]+1);
1825:     vi  = aj + adiag[i+1]+1;
1826:     nz  = adiag[i] - adiag[i+1]-1;
1827:     idt = 2*i;
1828:     s1  = b[idt];  s2 = b[1+idt];
1829:     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
1830:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
1831:     for (k=0; k<nz; k++) {
1832:       idx = 2*vi[k];
1833:       x1  = x[idx];   x2 = x[1+idx];
1834:       s1 -= v[0]*x1 + v[2]*x2;
1835:       s2 -= v[1]*x1 + v[3]*x2;
1836:       v  += 4;
1837:     }
1838:     /* x = inv_diagonal*x */
1839:     x[idt]   = v[0]*s1 + v[2]*s2;
1840:     x[1+idt] = v[1]*s1 + v[3]*s2;
1841:   }

1843:   VecRestoreArrayRead(bb,&b);
1844:   VecRestoreArray(xx,&x);
1845:   PetscLogFlops(4.0*a->nz - A->cmap->n);
1846:   return(0);
1847: }

1849: /*
1850:       Special case where the matrix was ILU(0) factored in the natural
1851:    ordering. This eliminates the need for the column and row permutation.
1852: */
1855: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1856: {
1857:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1858:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1859:   PetscErrorCode    ierr;
1860:   const MatScalar   *aa=a->a,*v;
1861:   PetscScalar       *x;
1862:   const PetscScalar *b;
1863:   PetscScalar       s1,x1;
1864:   PetscInt          jdx,idt,idx,nz,i;

1867:   VecGetArrayRead(bb,&b);
1868:   VecGetArray(xx,&x);

1870:   /* forward solve the lower triangular */
1871:   idx  = 0;
1872:   x[0] = b[0];
1873:   for (i=1; i<n; i++) {
1874:     v    =  aa      + ai[i];
1875:     vi   =  aj      + ai[i];
1876:     nz   =  diag[i] - ai[i];
1877:     idx +=  1;
1878:     s1   =  b[idx];
1879:     while (nz--) {
1880:       jdx = *vi++;
1881:       x1  = x[jdx];
1882:       s1 -= v[0]*x1;
1883:       v  += 1;
1884:     }
1885:     x[idx] = s1;
1886:   }
1887:   /* backward solve the upper triangular */
1888:   for (i=n-1; i>=0; i--) {
1889:     v   = aa + diag[i] + 1;
1890:     vi  = aj + diag[i] + 1;
1891:     nz  = ai[i+1] - diag[i] - 1;
1892:     idt = i;
1893:     s1  = x[idt];
1894:     while (nz--) {
1895:       idx = *vi++;
1896:       x1  = x[idx];
1897:       s1 -= v[0]*x1;
1898:       v  += 1;
1899:     }
1900:     v      = aa +  diag[i];
1901:     x[idt] = v[0]*s1;
1902:   }
1903:   VecRestoreArrayRead(bb,&b);
1904:   VecRestoreArray(xx,&x);
1905:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
1906:   return(0);
1907: }


1912: PetscErrorCode MatForwardSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1913: {
1914:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1915:   PetscErrorCode    ierr;
1916:   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*vi;
1917:   PetscScalar       *x,sum;
1918:   const PetscScalar *b;
1919:   const MatScalar   *aa = a->a,*v;
1920:   PetscInt          i,nz;

1923:   if (!n) return(0);

1925:   VecGetArrayRead(bb,&b);
1926:   VecGetArray(xx,&x);

1928:   /* forward solve the lower triangular */
1929:   x[0] = b[0];
1930:   v    = aa;
1931:   vi   = aj;
1932:   for (i=1; i<n; i++) {
1933:     nz  = ai[i+1] - ai[i];
1934:     sum = b[i];
1935:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
1936:     v   += nz;
1937:     vi  += nz;
1938:     x[i] = sum;
1939:   }
1940:   PetscLogFlops(a->nz - A->cmap->n);
1941:   VecRestoreArrayRead(bb,&b);
1942:   VecRestoreArray(xx,&x);
1943:   return(0);
1944: }

1948: PetscErrorCode MatBackwardSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1949: {
1950:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1951:   PetscErrorCode    ierr;
1952:   const PetscInt    n = a->mbs,*aj = a->j,*adiag = a->diag,*vi;
1953:   PetscScalar       *x,sum;
1954:   const PetscScalar *b;
1955:   const MatScalar   *aa = a->a,*v;
1956:   PetscInt          i,nz;

1959:   if (!n) return(0);

1961:   VecGetArrayRead(bb,&b);
1962:   VecGetArray(xx,&x);

1964:   /* backward solve the upper triangular */
1965:   for (i=n-1; i>=0; i--) {
1966:     v   = aa + adiag[i+1] + 1;
1967:     vi  = aj + adiag[i+1] + 1;
1968:     nz  = adiag[i] - adiag[i+1]-1;
1969:     sum = b[i];
1970:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
1971:     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
1972:   }

1974:   PetscLogFlops(2.0*a->nz - A->cmap->n);
1975:   VecRestoreArrayRead(bb,&b);
1976:   VecRestoreArray(xx,&x);
1977:   return(0);
1978: }

1982: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1983: {
1984:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1985:   PetscErrorCode    ierr;
1986:   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
1987:   PetscScalar       *x,sum;
1988:   const PetscScalar *b;
1989:   const MatScalar   *aa = a->a,*v;
1990:   PetscInt          i,nz;

1993:   if (!n) return(0);

1995:   VecGetArrayRead(bb,&b);
1996:   VecGetArray(xx,&x);

1998:   /* forward solve the lower triangular */
1999:   x[0] = b[0];
2000:   v    = aa;
2001:   vi   = aj;
2002:   for (i=1; i<n; i++) {
2003:     nz  = ai[i+1] - ai[i];
2004:     sum = b[i];
2005:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2006:     v   += nz;
2007:     vi  += nz;
2008:     x[i] = sum;
2009:   }

2011:   /* backward solve the upper triangular */
2012:   for (i=n-1; i>=0; i--) {
2013:     v   = aa + adiag[i+1] + 1;
2014:     vi  = aj + adiag[i+1] + 1;
2015:     nz  = adiag[i] - adiag[i+1]-1;
2016:     sum = x[i];
2017:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2018:     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
2019:   }

2021:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2022:   VecRestoreArrayRead(bb,&b);
2023:   VecRestoreArray(xx,&x);
2024:   return(0);
2025: }