Actual source code: baijsolvnat.c

petsc-3.9.4 2018-09-11
Report Typos and Errors
  1:  #include <../src/mat/impls/baij/seq/baij.h>
  2:  #include <petsc/private/kernels/blockinvert.h>

  4: /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns   of the block at once */

  6: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
  7: {
  8:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
  9:   PetscErrorCode    ierr;
 10:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
 11:   PetscInt          i,nz,idx,idt,m;
 12:   const MatScalar   *aa=a->a,*v;
 13:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
 14:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
 15:   PetscScalar       *x;
 16:   const PetscScalar *b;

 19:   VecGetArrayRead(bb,&b);
 20:   VecGetArray(xx,&x);

 22:   /* forward solve the lower triangular */
 23:   idx   = 0;
 24:   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
 25:   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
 26:   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];

 28:   for (i=1; i<n; i++) {
 29:     v   = aa + bs2*ai[i];
 30:     vi  = aj + ai[i];
 31:     nz  = ai[i+1] - ai[i];
 32:     idt = bs*i;
 33:     s1  = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
 34:     s6  = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
 35:     s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
 36:     for (m=0; m<nz; m++) {
 37:       idx = bs*vi[m];
 38:       x1  = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
 39:       x6  = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
 40:       x11 = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];


 43:       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
 44:       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
 45:       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
 46:       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
 47:       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
 48:       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
 49:       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
 50:       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
 51:       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
 52:       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
 53:       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
 54:       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
 55:       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
 56:       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
 57:       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;

 59:       v += bs2;
 60:     }
 61:     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
 62:     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
 63:     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;

 65:   }
 66:   /* backward solve the upper triangular */
 67:   for (i=n-1; i>=0; i--) {
 68:     v   = aa + bs2*(adiag[i+1]+1);
 69:     vi  = aj + adiag[i+1]+1;
 70:     nz  = adiag[i] - adiag[i+1] - 1;
 71:     idt = bs*i;
 72:     s1  = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
 73:     s6  = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
 74:     s11 = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];

 76:     for (m=0; m<nz; m++) {
 77:       idx = bs*vi[m];
 78:       x1  = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
 79:       x6  = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
 80:       x11 = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];

 82:       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
 83:       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
 84:       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
 85:       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
 86:       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
 87:       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
 88:       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
 89:       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
 90:       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
 91:       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
 92:       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
 93:       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
 94:       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
 95:       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
 96:       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;

 98:       v += bs2;
 99:     }

101:     x[idt]    = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
102:     x[1+idt]  = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
103:     x[2+idt]  = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
104:     x[3+idt]  = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
105:     x[4+idt]  = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
106:     x[5+idt]  = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
107:     x[6+idt]  = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
108:     x[7+idt]  = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
109:     x[8+idt]  = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
110:     x[9+idt]  = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
111:     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
112:     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
113:     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
114:     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
115:     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;

117:   }

119:   VecRestoreArrayRead(bb,&b);
120:   VecRestoreArray(xx,&x);
121:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
122:   return(0);
123: }

125: /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at at time */
126: /* Default MatSolve for block size 15 */

128: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
129: {
130:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
131:   PetscErrorCode    ierr;
132:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
133:   PetscInt          i,k,nz,idx,idt,m;
134:   const MatScalar   *aa=a->a,*v;
135:   PetscScalar       s[15];
136:   PetscScalar       *x,xv;
137:   const PetscScalar *b;

140:   VecGetArrayRead(bb,&b);
141:   VecGetArray(xx,&x);

143:   /* forward solve the lower triangular */
144:   for (i=0; i<n; i++) {
145:     v         = aa + bs2*ai[i];
146:     vi        = aj + ai[i];
147:     nz        = ai[i+1] - ai[i];
148:     idt       = bs*i;
149:     x[idt]    = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
150:     x[5+idt]  = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
151:     x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
152:     for (m=0; m<nz; m++) {
153:       idx = bs*vi[m];
154:       for (k=0; k<15; k++) {
155:         xv         = x[k + idx];
156:         x[idt]    -= v[0]*xv;
157:         x[1+idt]  -= v[1]*xv;
158:         x[2+idt]  -= v[2]*xv;
159:         x[3+idt]  -= v[3]*xv;
160:         x[4+idt]  -= v[4]*xv;
161:         x[5+idt]  -= v[5]*xv;
162:         x[6+idt]  -= v[6]*xv;
163:         x[7+idt]  -= v[7]*xv;
164:         x[8+idt]  -= v[8]*xv;
165:         x[9+idt]  -= v[9]*xv;
166:         x[10+idt] -= v[10]*xv;
167:         x[11+idt] -= v[11]*xv;
168:         x[12+idt] -= v[12]*xv;
169:         x[13+idt] -= v[13]*xv;
170:         x[14+idt] -= v[14]*xv;
171:         v         += 15;
172:       }
173:     }
174:   }
175:   /* backward solve the upper triangular */
176:   for (i=n-1; i>=0; i--) {
177:     v     = aa + bs2*(adiag[i+1]+1);
178:     vi    = aj + adiag[i+1]+1;
179:     nz    = adiag[i] - adiag[i+1] - 1;
180:     idt   = bs*i;
181:     s[0]  = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
182:     s[5]  = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
183:     s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];

185:     for (m=0; m<nz; m++) {
186:       idx = bs*vi[m];
187:       for (k=0; k<15; k++) {
188:         xv     = x[k + idx];
189:         s[0]  -= v[0]*xv;
190:         s[1]  -= v[1]*xv;
191:         s[2]  -= v[2]*xv;
192:         s[3]  -= v[3]*xv;
193:         s[4]  -= v[4]*xv;
194:         s[5]  -= v[5]*xv;
195:         s[6]  -= v[6]*xv;
196:         s[7]  -= v[7]*xv;
197:         s[8]  -= v[8]*xv;
198:         s[9]  -= v[9]*xv;
199:         s[10] -= v[10]*xv;
200:         s[11] -= v[11]*xv;
201:         s[12] -= v[12]*xv;
202:         s[13] -= v[13]*xv;
203:         s[14] -= v[14]*xv;
204:         v     += 15;
205:       }
206:     }
207:     PetscMemzero(x+idt,bs*sizeof(MatScalar));
208:     for (k=0; k<15; k++) {
209:       x[idt]    += v[0]*s[k];
210:       x[1+idt]  += v[1]*s[k];
211:       x[2+idt]  += v[2]*s[k];
212:       x[3+idt]  += v[3]*s[k];
213:       x[4+idt]  += v[4]*s[k];
214:       x[5+idt]  += v[5]*s[k];
215:       x[6+idt]  += v[6]*s[k];
216:       x[7+idt]  += v[7]*s[k];
217:       x[8+idt]  += v[8]*s[k];
218:       x[9+idt]  += v[9]*s[k];
219:       x[10+idt] += v[10]*s[k];
220:       x[11+idt] += v[11]*s[k];
221:       x[12+idt] += v[12]*s[k];
222:       x[13+idt] += v[13]*s[k];
223:       x[14+idt] += v[14]*s[k];
224:       v         += 15;
225:     }
226:   }
227:   VecRestoreArrayRead(bb,&b);
228:   VecRestoreArray(xx,&x);
229:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
230:   return(0);
231: }

233: /* Block operations are done by accessing one column at at time */
234: /* Default MatSolve for block size 14 */

236: PetscErrorCode MatSolve_SeqBAIJ_14_NaturalOrdering(Mat A,Vec bb,Vec xx)
237: {
238:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
239:   PetscErrorCode    ierr;
240:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
241:   PetscInt          i,k,nz,idx,idt,m;
242:   const MatScalar   *aa=a->a,*v;
243:   PetscScalar       s[14];
244:   PetscScalar       *x,xv;
245:   const PetscScalar *b;

248:   VecGetArrayRead(bb,&b);
249:   VecGetArray(xx,&x);

251:   /* forward solve the lower triangular */
252:   for (i=0; i<n; i++) {
253:     v         = aa + bs2*ai[i];
254:     vi        = aj + ai[i];
255:     nz        = ai[i+1] - ai[i];
256:     idt       = bs*i;
257:     x[idt]    = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
258:     x[5+idt]  = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
259:     x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt];
260:     for (m=0; m<nz; m++) {
261:       idx = bs*vi[m];
262:       for (k=0; k<bs; k++) {
263:         xv         = x[k + idx];
264:         x[idt]    -= v[0]*xv;
265:         x[1+idt]  -= v[1]*xv;
266:         x[2+idt]  -= v[2]*xv;
267:         x[3+idt]  -= v[3]*xv;
268:         x[4+idt]  -= v[4]*xv;
269:         x[5+idt]  -= v[5]*xv;
270:         x[6+idt]  -= v[6]*xv;
271:         x[7+idt]  -= v[7]*xv;
272:         x[8+idt]  -= v[8]*xv;
273:         x[9+idt]  -= v[9]*xv;
274:         x[10+idt] -= v[10]*xv;
275:         x[11+idt] -= v[11]*xv;
276:         x[12+idt] -= v[12]*xv;
277:         x[13+idt] -= v[13]*xv;
278:         v         += 14;
279:       }
280:     }
281:   }
282:   /* backward solve the upper triangular */
283:   for (i=n-1; i>=0; i--) {
284:     v     = aa + bs2*(adiag[i+1]+1);
285:     vi    = aj + adiag[i+1]+1;
286:     nz    = adiag[i] - adiag[i+1] - 1;
287:     idt   = bs*i;
288:     s[0]  = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
289:     s[5]  = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
290:     s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt];

292:     for (m=0; m<nz; m++) {
293:       idx = bs*vi[m];
294:       for (k=0; k<bs; k++) {
295:         xv     = x[k + idx];
296:         s[0]  -= v[0]*xv;
297:         s[1]  -= v[1]*xv;
298:         s[2]  -= v[2]*xv;
299:         s[3]  -= v[3]*xv;
300:         s[4]  -= v[4]*xv;
301:         s[5]  -= v[5]*xv;
302:         s[6]  -= v[6]*xv;
303:         s[7]  -= v[7]*xv;
304:         s[8]  -= v[8]*xv;
305:         s[9]  -= v[9]*xv;
306:         s[10] -= v[10]*xv;
307:         s[11] -= v[11]*xv;
308:         s[12] -= v[12]*xv;
309:         s[13] -= v[13]*xv;
310:         v     += 14;
311:       }
312:     }
313:     PetscMemzero(x+idt,bs*sizeof(MatScalar));
314:     for (k=0; k<bs; k++) {
315:       x[idt]    += v[0]*s[k];
316:       x[1+idt]  += v[1]*s[k];
317:       x[2+idt]  += v[2]*s[k];
318:       x[3+idt]  += v[3]*s[k];
319:       x[4+idt]  += v[4]*s[k];
320:       x[5+idt]  += v[5]*s[k];
321:       x[6+idt]  += v[6]*s[k];
322:       x[7+idt]  += v[7]*s[k];
323:       x[8+idt]  += v[8]*s[k];
324:       x[9+idt]  += v[9]*s[k];
325:       x[10+idt] += v[10]*s[k];
326:       x[11+idt] += v[11]*s[k];
327:       x[12+idt] += v[12]*s[k];
328:       x[13+idt] += v[13]*s[k];
329:       v         += 14;
330:     }
331:   }
332:   VecRestoreArrayRead(bb,&b);
333:   VecRestoreArray(xx,&x);
334:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
335:   return(0);
336: }

338: /* Block operations are done by accessing one column at at time */
339: /* Default MatSolve for block size 13 */

341: PetscErrorCode MatSolve_SeqBAIJ_13_NaturalOrdering(Mat A,Vec bb,Vec xx)
342: {
343:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
344:   PetscErrorCode    ierr;
345:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
346:   PetscInt          i,k,nz,idx,idt,m;
347:   const MatScalar   *aa=a->a,*v;
348:   PetscScalar       s[13];
349:   PetscScalar       *x,xv;
350:   const PetscScalar *b;

353:   VecGetArrayRead(bb,&b);
354:   VecGetArray(xx,&x);

356:   /* forward solve the lower triangular */
357:   for (i=0; i<n; i++) {
358:     v         = aa + bs2*ai[i];
359:     vi        = aj + ai[i];
360:     nz        = ai[i+1] - ai[i];
361:     idt       = bs*i;
362:     x[idt]    = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
363:     x[5+idt]  = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
364:     x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt];
365:     for (m=0; m<nz; m++) {
366:       idx = bs*vi[m];
367:       for (k=0; k<bs; k++) {
368:         xv         = x[k + idx];
369:         x[idt]    -= v[0]*xv;
370:         x[1+idt]  -= v[1]*xv;
371:         x[2+idt]  -= v[2]*xv;
372:         x[3+idt]  -= v[3]*xv;
373:         x[4+idt]  -= v[4]*xv;
374:         x[5+idt]  -= v[5]*xv;
375:         x[6+idt]  -= v[6]*xv;
376:         x[7+idt]  -= v[7]*xv;
377:         x[8+idt]  -= v[8]*xv;
378:         x[9+idt]  -= v[9]*xv;
379:         x[10+idt] -= v[10]*xv;
380:         x[11+idt] -= v[11]*xv;
381:         x[12+idt] -= v[12]*xv;
382:         v         += 13;
383:       }
384:     }
385:   }
386:   /* backward solve the upper triangular */
387:   for (i=n-1; i>=0; i--) {
388:     v     = aa + bs2*(adiag[i+1]+1);
389:     vi    = aj + adiag[i+1]+1;
390:     nz    = adiag[i] - adiag[i+1] - 1;
391:     idt   = bs*i;
392:     s[0]  = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
393:     s[5]  = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
394:     s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt];

396:     for (m=0; m<nz; m++) {
397:       idx = bs*vi[m];
398:       for (k=0; k<bs; k++) {
399:         xv     = x[k + idx];
400:         s[0]  -= v[0]*xv;
401:         s[1]  -= v[1]*xv;
402:         s[2]  -= v[2]*xv;
403:         s[3]  -= v[3]*xv;
404:         s[4]  -= v[4]*xv;
405:         s[5]  -= v[5]*xv;
406:         s[6]  -= v[6]*xv;
407:         s[7]  -= v[7]*xv;
408:         s[8]  -= v[8]*xv;
409:         s[9]  -= v[9]*xv;
410:         s[10] -= v[10]*xv;
411:         s[11] -= v[11]*xv;
412:         s[12] -= v[12]*xv;
413:         v     += 13;
414:       }
415:     }
416:     PetscMemzero(x+idt,bs*sizeof(MatScalar));
417:     for (k=0; k<bs; k++) {
418:       x[idt]    += v[0]*s[k];
419:       x[1+idt]  += v[1]*s[k];
420:       x[2+idt]  += v[2]*s[k];
421:       x[3+idt]  += v[3]*s[k];
422:       x[4+idt]  += v[4]*s[k];
423:       x[5+idt]  += v[5]*s[k];
424:       x[6+idt]  += v[6]*s[k];
425:       x[7+idt]  += v[7]*s[k];
426:       x[8+idt]  += v[8]*s[k];
427:       x[9+idt]  += v[9]*s[k];
428:       x[10+idt] += v[10]*s[k];
429:       x[11+idt] += v[11]*s[k];
430:       x[12+idt] += v[12]*s[k];
431:       v         += 13;
432:     }
433:   }
434:   VecRestoreArrayRead(bb,&b);
435:   VecRestoreArray(xx,&x);
436:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
437:   return(0);
438: }

440: /* Block operations are done by accessing one column at at time */
441: /* Default MatSolve for block size 12 */

443: PetscErrorCode MatSolve_SeqBAIJ_12_NaturalOrdering(Mat A,Vec bb,Vec xx)
444: {
445:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
446:   PetscErrorCode    ierr;
447:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
448:   PetscInt          i,k,nz,idx,idt,m;
449:   const MatScalar   *aa=a->a,*v;
450:   PetscScalar       s[12];
451:   PetscScalar       *x,xv;
452:   const PetscScalar *b;

455:   VecGetArrayRead(bb,&b);
456:   VecGetArray(xx,&x);

458:   /* forward solve the lower triangular */
459:   for (i=0; i<n; i++) {
460:     v         = aa + bs2*ai[i];
461:     vi        = aj + ai[i];
462:     nz        = ai[i+1] - ai[i];
463:     idt       = bs*i;
464:     x[idt]    = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
465:     x[5+idt]  = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
466:     x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt];
467:     for (m=0; m<nz; m++) {
468:       idx = bs*vi[m];
469:       for (k=0; k<bs; k++) {
470:         xv         = x[k + idx];
471:         x[idt]    -= v[0]*xv;
472:         x[1+idt]  -= v[1]*xv;
473:         x[2+idt]  -= v[2]*xv;
474:         x[3+idt]  -= v[3]*xv;
475:         x[4+idt]  -= v[4]*xv;
476:         x[5+idt]  -= v[5]*xv;
477:         x[6+idt]  -= v[6]*xv;
478:         x[7+idt]  -= v[7]*xv;
479:         x[8+idt]  -= v[8]*xv;
480:         x[9+idt]  -= v[9]*xv;
481:         x[10+idt] -= v[10]*xv;
482:         x[11+idt] -= v[11]*xv;
483:         v         += 12;
484:       }
485:     }
486:   }
487:   /* backward solve the upper triangular */
488:   for (i=n-1; i>=0; i--) {
489:     v     = aa + bs2*(adiag[i+1]+1);
490:     vi    = aj + adiag[i+1]+1;
491:     nz    = adiag[i] - adiag[i+1] - 1;
492:     idt   = bs*i;
493:     s[0]  = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
494:     s[5]  = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
495:     s[10] = x[10+idt]; s[11] = x[11+idt];

497:     for (m=0; m<nz; m++) {
498:       idx = bs*vi[m];
499:       for (k=0; k<bs; k++) {
500:         xv     = x[k + idx];
501:         s[0]  -= v[0]*xv;
502:         s[1]  -= v[1]*xv;
503:         s[2]  -= v[2]*xv;
504:         s[3]  -= v[3]*xv;
505:         s[4]  -= v[4]*xv;
506:         s[5]  -= v[5]*xv;
507:         s[6]  -= v[6]*xv;
508:         s[7]  -= v[7]*xv;
509:         s[8]  -= v[8]*xv;
510:         s[9]  -= v[9]*xv;
511:         s[10] -= v[10]*xv;
512:         s[11] -= v[11]*xv;
513:         v     += 12;
514:       }
515:     }
516:     PetscMemzero(x+idt,bs*sizeof(MatScalar));
517:     for (k=0; k<bs; k++) {
518:       x[idt]    += v[0]*s[k];
519:       x[1+idt]  += v[1]*s[k];
520:       x[2+idt]  += v[2]*s[k];
521:       x[3+idt]  += v[3]*s[k];
522:       x[4+idt]  += v[4]*s[k];
523:       x[5+idt]  += v[5]*s[k];
524:       x[6+idt]  += v[6]*s[k];
525:       x[7+idt]  += v[7]*s[k];
526:       x[8+idt]  += v[8]*s[k];
527:       x[9+idt]  += v[9]*s[k];
528:       x[10+idt] += v[10]*s[k];
529:       x[11+idt] += v[11]*s[k];
530:       v         += 12;
531:     }
532:   }
533:   VecRestoreArrayRead(bb,&b);
534:   VecRestoreArray(xx,&x);
535:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
536:   return(0);
537: }

539: /* Block operations are done by accessing one column at at time */
540: /* Default MatSolve for block size 11 */

542: PetscErrorCode MatSolve_SeqBAIJ_11_NaturalOrdering(Mat A,Vec bb,Vec xx)
543: {
544:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
545:   PetscErrorCode    ierr;
546:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
547:   PetscInt          i,k,nz,idx,idt,m;
548:   const MatScalar   *aa=a->a,*v;
549:   PetscScalar       s[11];
550:   PetscScalar       *x,xv;
551:   const PetscScalar *b;

554:   VecGetArrayRead(bb,&b);
555:   VecGetArray(xx,&x);

557:   /* forward solve the lower triangular */
558:   for (i=0; i<n; i++) {
559:     v         = aa + bs2*ai[i];
560:     vi        = aj + ai[i];
561:     nz        = ai[i+1] - ai[i];
562:     idt       = bs*i;
563:     x[idt]    = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
564:     x[5+idt]  = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
565:     x[10+idt] = b[10+idt];
566:     for (m=0; m<nz; m++) {
567:       idx = bs*vi[m];
568:       for (k=0; k<11; k++) {
569:         xv         = x[k + idx];
570:         x[idt]    -= v[0]*xv;
571:         x[1+idt]  -= v[1]*xv;
572:         x[2+idt]  -= v[2]*xv;
573:         x[3+idt]  -= v[3]*xv;
574:         x[4+idt]  -= v[4]*xv;
575:         x[5+idt]  -= v[5]*xv;
576:         x[6+idt]  -= v[6]*xv;
577:         x[7+idt]  -= v[7]*xv;
578:         x[8+idt]  -= v[8]*xv;
579:         x[9+idt]  -= v[9]*xv;
580:         x[10+idt] -= v[10]*xv;
581:         v         += 11;
582:       }
583:     }
584:   }
585:   /* backward solve the upper triangular */
586:   for (i=n-1; i>=0; i--) {
587:     v     = aa + bs2*(adiag[i+1]+1);
588:     vi    = aj + adiag[i+1]+1;
589:     nz    = adiag[i] - adiag[i+1] - 1;
590:     idt   = bs*i;
591:     s[0]  = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
592:     s[5]  = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
593:     s[10] = x[10+idt];

595:     for (m=0; m<nz; m++) {
596:       idx = bs*vi[m];
597:       for (k=0; k<11; k++) {
598:         xv     = x[k + idx];
599:         s[0]  -= v[0]*xv;
600:         s[1]  -= v[1]*xv;
601:         s[2]  -= v[2]*xv;
602:         s[3]  -= v[3]*xv;
603:         s[4]  -= v[4]*xv;
604:         s[5]  -= v[5]*xv;
605:         s[6]  -= v[6]*xv;
606:         s[7]  -= v[7]*xv;
607:         s[8]  -= v[8]*xv;
608:         s[9]  -= v[9]*xv;
609:         s[10] -= v[10]*xv;
610:         v     += 11;
611:       }
612:     }
613:     PetscMemzero(x+idt,bs*sizeof(MatScalar));
614:     for (k=0; k<11; k++) {
615:       x[idt]    += v[0]*s[k];
616:       x[1+idt]  += v[1]*s[k];
617:       x[2+idt]  += v[2]*s[k];
618:       x[3+idt]  += v[3]*s[k];
619:       x[4+idt]  += v[4]*s[k];
620:       x[5+idt]  += v[5]*s[k];
621:       x[6+idt]  += v[6]*s[k];
622:       x[7+idt]  += v[7]*s[k];
623:       x[8+idt]  += v[8]*s[k];
624:       x[9+idt]  += v[9]*s[k];
625:       x[10+idt] += v[10]*s[k];
626:       v         += 11;
627:     }
628:   }
629:   VecRestoreArrayRead(bb,&b);
630:   VecRestoreArray(xx,&x);
631:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
632:   return(0);
633: }

635: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
636: {
637:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
638:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
639:   PetscErrorCode    ierr;
640:   PetscInt          i,nz,idx,idt,jdx;
641:   const MatScalar   *aa=a->a,*v;
642:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
643:   const PetscScalar *b;

646:   VecGetArrayRead(bb,&b);
647:   VecGetArray(xx,&x);
648:   /* forward solve the lower triangular */
649:   idx  = 0;
650:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
651:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
652:   x[6] = b[6+idx];
653:   for (i=1; i<n; i++) {
654:     v   =  aa + 49*ai[i];
655:     vi  =  aj + ai[i];
656:     nz  =  diag[i] - ai[i];
657:     idx =  7*i;
658:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
659:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
660:     s7  =  b[6+idx];
661:     while (nz--) {
662:       jdx = 7*(*vi++);
663:       x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
664:       x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
665:       x7  = x[6+jdx];
666:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
667:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
668:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
669:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
670:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
671:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
672:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
673:       v  += 49;
674:     }
675:     x[idx]   = s1;
676:     x[1+idx] = s2;
677:     x[2+idx] = s3;
678:     x[3+idx] = s4;
679:     x[4+idx] = s5;
680:     x[5+idx] = s6;
681:     x[6+idx] = s7;
682:   }
683:   /* backward solve the upper triangular */
684:   for (i=n-1; i>=0; i--) {
685:     v   = aa + 49*diag[i] + 49;
686:     vi  = aj + diag[i] + 1;
687:     nz  = ai[i+1] - diag[i] - 1;
688:     idt = 7*i;
689:     s1  = x[idt];   s2 = x[1+idt];
690:     s3  = x[2+idt]; s4 = x[3+idt];
691:     s5  = x[4+idt]; s6 = x[5+idt];
692:     s7  = x[6+idt];
693:     while (nz--) {
694:       idx = 7*(*vi++);
695:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
696:       x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
697:       x7  = x[6+idx];
698:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
699:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
700:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
701:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
702:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
703:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
704:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
705:       v  += 49;
706:     }
707:     v      = aa + 49*diag[i];
708:     x[idt] = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
709:              + v[28]*s5 + v[35]*s6 + v[42]*s7;
710:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
711:                + v[29]*s5 + v[36]*s6 + v[43]*s7;
712:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
713:                + v[30]*s5 + v[37]*s6 + v[44]*s7;
714:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
715:                + v[31]*s5 + v[38]*s6 + v[45]*s7;
716:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
717:                + v[32]*s5 + v[39]*s6 + v[46]*s7;
718:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
719:                + v[33]*s5 + v[40]*s6 + v[47]*s7;
720:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
721:                + v[34]*s5 + v[41]*s6 + v[48]*s7;
722:   }

724:   VecRestoreArrayRead(bb,&b);
725:   VecRestoreArray(xx,&x);
726:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
727:   return(0);
728: }

730: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
731: {
732:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
733:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
734:   PetscErrorCode    ierr;
735:   PetscInt          i,k,nz,idx,jdx,idt;
736:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
737:   const MatScalar   *aa=a->a,*v;
738:   PetscScalar       *x;
739:   const PetscScalar *b;
740:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

743:   VecGetArrayRead(bb,&b);
744:   VecGetArray(xx,&x);
745:   /* forward solve the lower triangular */
746:   idx  = 0;
747:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
748:   x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
749:   for (i=1; i<n; i++) {
750:     v   = aa + bs2*ai[i];
751:     vi  = aj + ai[i];
752:     nz  = ai[i+1] - ai[i];
753:     idx = bs*i;
754:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
755:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
756:     for (k=0; k<nz; k++) {
757:       jdx = bs*vi[k];
758:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
759:       x5  = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
760:       s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
761:       s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
762:       s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
763:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
764:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
765:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
766:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
767:       v  +=  bs2;
768:     }

770:     x[idx]   = s1;
771:     x[1+idx] = s2;
772:     x[2+idx] = s3;
773:     x[3+idx] = s4;
774:     x[4+idx] = s5;
775:     x[5+idx] = s6;
776:     x[6+idx] = s7;
777:   }

779:   /* backward solve the upper triangular */
780:   for (i=n-1; i>=0; i--) {
781:     v   = aa + bs2*(adiag[i+1]+1);
782:     vi  = aj + adiag[i+1]+1;
783:     nz  = adiag[i] - adiag[i+1]-1;
784:     idt = bs*i;
785:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
786:     s5  = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
787:     for (k=0; k<nz; k++) {
788:       idx = bs*vi[k];
789:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
790:       x5  = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
791:       s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
792:       s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
793:       s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
794:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
795:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
796:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
797:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
798:       v  +=  bs2;
799:     }
800:     /* x = inv_diagonal*x */
801:     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
802:     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
803:     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
804:     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
805:     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
806:     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
807:     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
808:   }

810:   VecRestoreArrayRead(bb,&b);
811:   VecRestoreArray(xx,&x);
812:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
813:   return(0);
814: }

816: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
817: {
818:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
819:   PetscInt          i,nz,idx,idt,jdx;
820:   PetscErrorCode    ierr;
821:   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
822:   const MatScalar   *aa   =a->a,*v;
823:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
824:   const PetscScalar *b;

827:   VecGetArrayRead(bb,&b);
828:   VecGetArray(xx,&x);
829:   /* forward solve the lower triangular */
830:   idx  = 0;
831:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
832:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
833:   for (i=1; i<n; i++) {
834:     v   =  aa + 36*ai[i];
835:     vi  =  aj + ai[i];
836:     nz  =  diag[i] - ai[i];
837:     idx =  6*i;
838:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
839:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
840:     while (nz--) {
841:       jdx = 6*(*vi++);
842:       x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
843:       x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
844:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
845:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
846:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
847:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
848:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
849:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
850:       v  += 36;
851:     }
852:     x[idx]   = s1;
853:     x[1+idx] = s2;
854:     x[2+idx] = s3;
855:     x[3+idx] = s4;
856:     x[4+idx] = s5;
857:     x[5+idx] = s6;
858:   }
859:   /* backward solve the upper triangular */
860:   for (i=n-1; i>=0; i--) {
861:     v   = aa + 36*diag[i] + 36;
862:     vi  = aj + diag[i] + 1;
863:     nz  = ai[i+1] - diag[i] - 1;
864:     idt = 6*i;
865:     s1  = x[idt];   s2 = x[1+idt];
866:     s3  = x[2+idt]; s4 = x[3+idt];
867:     s5  = x[4+idt]; s6 = x[5+idt];
868:     while (nz--) {
869:       idx = 6*(*vi++);
870:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
871:       x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
872:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
873:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
874:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
875:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
876:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
877:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
878:       v  += 36;
879:     }
880:     v        = aa + 36*diag[i];
881:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
882:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
883:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
884:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
885:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
886:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
887:   }

889:   VecRestoreArrayRead(bb,&b);
890:   VecRestoreArray(xx,&x);
891:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
892:   return(0);
893: }

895: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
896: {
897:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
898:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
899:   PetscErrorCode    ierr;
900:   PetscInt          i,k,nz,idx,jdx,idt;
901:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
902:   const MatScalar   *aa=a->a,*v;
903:   PetscScalar       *x;
904:   const PetscScalar *b;
905:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

908:   VecGetArrayRead(bb,&b);
909:   VecGetArray(xx,&x);
910:   /* forward solve the lower triangular */
911:   idx  = 0;
912:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
913:   x[4] = b[4+idx];x[5] = b[5+idx];
914:   for (i=1; i<n; i++) {
915:     v   = aa + bs2*ai[i];
916:     vi  = aj + ai[i];
917:     nz  = ai[i+1] - ai[i];
918:     idx = bs*i;
919:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
920:     s5  = b[4+idx];s6 = b[5+idx];
921:     for (k=0; k<nz; k++) {
922:       jdx = bs*vi[k];
923:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
924:       x5  = x[4+jdx]; x6 = x[5+jdx];
925:       s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
926:       s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
927:       s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
928:       s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
929:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
930:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
931:       v  +=  bs2;
932:     }

934:     x[idx]   = s1;
935:     x[1+idx] = s2;
936:     x[2+idx] = s3;
937:     x[3+idx] = s4;
938:     x[4+idx] = s5;
939:     x[5+idx] = s6;
940:   }

942:   /* backward solve the upper triangular */
943:   for (i=n-1; i>=0; i--) {
944:     v   = aa + bs2*(adiag[i+1]+1);
945:     vi  = aj + adiag[i+1]+1;
946:     nz  = adiag[i] - adiag[i+1]-1;
947:     idt = bs*i;
948:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
949:     s5  = x[4+idt];s6 = x[5+idt];
950:     for (k=0; k<nz; k++) {
951:       idx = bs*vi[k];
952:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
953:       x5  = x[4+idx];x6 = x[5+idx];
954:       s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
955:       s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
956:       s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
957:       s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
958:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
959:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
960:       v  +=  bs2;
961:     }
962:     /* x = inv_diagonal*x */
963:     x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
964:     x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
965:     x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
966:     x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
967:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
968:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
969:   }

971:   VecRestoreArrayRead(bb,&b);
972:   VecRestoreArray(xx,&x);
973:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
974:   return(0);
975: }

977: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
978: {
979:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
980:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
981:   PetscInt          i,nz,idx,idt,jdx;
982:   PetscErrorCode    ierr;
983:   const MatScalar   *aa=a->a,*v;
984:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
985:   const PetscScalar *b;

988:   VecGetArrayRead(bb,&b);
989:   VecGetArray(xx,&x);
990:   /* forward solve the lower triangular */
991:   idx  = 0;
992:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
993:   for (i=1; i<n; i++) {
994:     v   =  aa + 25*ai[i];
995:     vi  =  aj + ai[i];
996:     nz  =  diag[i] - ai[i];
997:     idx =  5*i;
998:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
999:     while (nz--) {
1000:       jdx = 5*(*vi++);
1001:       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1002:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1003:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1004:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1005:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1006:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1007:       v  += 25;
1008:     }
1009:     x[idx]   = s1;
1010:     x[1+idx] = s2;
1011:     x[2+idx] = s3;
1012:     x[3+idx] = s4;
1013:     x[4+idx] = s5;
1014:   }
1015:   /* backward solve the upper triangular */
1016:   for (i=n-1; i>=0; i--) {
1017:     v   = aa + 25*diag[i] + 25;
1018:     vi  = aj + diag[i] + 1;
1019:     nz  = ai[i+1] - diag[i] - 1;
1020:     idt = 5*i;
1021:     s1  = x[idt];  s2 = x[1+idt];
1022:     s3  = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1023:     while (nz--) {
1024:       idx = 5*(*vi++);
1025:       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1026:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1027:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1028:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1029:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1030:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1031:       v  += 25;
1032:     }
1033:     v        = aa + 25*diag[i];
1034:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1035:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1036:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1037:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1038:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1039:   }

1041:   VecRestoreArrayRead(bb,&b);
1042:   VecRestoreArray(xx,&x);
1043:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
1044:   return(0);
1045: }

1047: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1048: {
1049:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1050:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1051:   PetscInt          i,k,nz,idx,idt,jdx;
1052:   PetscErrorCode    ierr;
1053:   const MatScalar   *aa=a->a,*v;
1054:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1055:   const PetscScalar *b;

1058:   VecGetArrayRead(bb,&b);
1059:   VecGetArray(xx,&x);
1060:   /* forward solve the lower triangular */
1061:   idx  = 0;
1062:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1063:   for (i=1; i<n; i++) {
1064:     v   = aa + 25*ai[i];
1065:     vi  = aj + ai[i];
1066:     nz  = ai[i+1] - ai[i];
1067:     idx = 5*i;
1068:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1069:     for (k=0; k<nz; k++) {
1070:       jdx = 5*vi[k];
1071:       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1072:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1073:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1074:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1075:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1076:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1077:       v  += 25;
1078:     }
1079:     x[idx]   = s1;
1080:     x[1+idx] = s2;
1081:     x[2+idx] = s3;
1082:     x[3+idx] = s4;
1083:     x[4+idx] = s5;
1084:   }

1086:   /* backward solve the upper triangular */
1087:   for (i=n-1; i>=0; i--) {
1088:     v   = aa + 25*(adiag[i+1]+1);
1089:     vi  = aj + adiag[i+1]+1;
1090:     nz  = adiag[i] - adiag[i+1]-1;
1091:     idt = 5*i;
1092:     s1  = x[idt];  s2 = x[1+idt];
1093:     s3  = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1094:     for (k=0; k<nz; k++) {
1095:       idx = 5*vi[k];
1096:       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1097:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1098:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1099:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1100:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1101:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1102:       v  += 25;
1103:     }
1104:     /* x = inv_diagonal*x */
1105:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1106:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1107:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1108:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1109:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1110:   }

1112:   VecRestoreArrayRead(bb,&b);
1113:   VecRestoreArray(xx,&x);
1114:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
1115:   return(0);
1116: }

1118: /*
1119:       Special case where the matrix was ILU(0) factored in the natural
1120:    ordering. This eliminates the need for the column and row permutation.
1121: */
1122: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1123: {
1124:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1125:   PetscInt          n  =a->mbs;
1126:   const PetscInt    *ai=a->i,*aj=a->j;
1127:   PetscErrorCode    ierr;
1128:   const PetscInt    *diag = a->diag;
1129:   const MatScalar   *aa   =a->a;
1130:   PetscScalar       *x;
1131:   const PetscScalar *b;

1134:   VecGetArrayRead(bb,&b);
1135:   VecGetArray(xx,&x);

1137: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
1138:   {
1139:     static PetscScalar w[2000]; /* very BAD need to fix */
1140:     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
1141:   }
1142: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
1143:   {
1144:     static PetscScalar w[2000]; /* very BAD need to fix */
1145:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
1146:   }
1147: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
1148:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
1149: #else
1150:   {
1151:     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
1152:     const MatScalar *v;
1153:     PetscInt        jdx,idt,idx,nz,i,ai16;
1154:     const PetscInt  *vi;

1156:     /* forward solve the lower triangular */
1157:     idx  = 0;
1158:     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
1159:     for (i=1; i<n; i++) {
1160:       v    =  aa      + 16*ai[i];
1161:       vi   =  aj      + ai[i];
1162:       nz   =  diag[i] - ai[i];
1163:       idx +=  4;
1164:       s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1165:       while (nz--) {
1166:         jdx = 4*(*vi++);
1167:         x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
1168:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1169:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1170:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1171:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1172:         v  += 16;
1173:       }
1174:       x[idx]   = s1;
1175:       x[1+idx] = s2;
1176:       x[2+idx] = s3;
1177:       x[3+idx] = s4;
1178:     }
1179:     /* backward solve the upper triangular */
1180:     idt = 4*(n-1);
1181:     for (i=n-1; i>=0; i--) {
1182:       ai16 = 16*diag[i];
1183:       v    = aa + ai16 + 16;
1184:       vi   = aj + diag[i] + 1;
1185:       nz   = ai[i+1] - diag[i] - 1;
1186:       s1   = x[idt];  s2 = x[1+idt];
1187:       s3   = x[2+idt];s4 = x[3+idt];
1188:       while (nz--) {
1189:         idx = 4*(*vi++);
1190:         x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
1191:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1192:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1193:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1194:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1195:         v  += 16;
1196:       }
1197:       v        = aa + ai16;
1198:       x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
1199:       x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
1200:       x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1201:       x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
1202:       idt     -= 4;
1203:     }
1204:   }
1205: #endif

1207:   VecRestoreArrayRead(bb,&b);
1208:   VecRestoreArray(xx,&x);
1209:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1210:   return(0);
1211: }

1213: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1214: {
1215:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1216:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1217:   PetscInt          i,k,nz,idx,jdx,idt;
1218:   PetscErrorCode    ierr;
1219:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
1220:   const MatScalar   *aa=a->a,*v;
1221:   PetscScalar       *x;
1222:   const PetscScalar *b;
1223:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;

1226:   VecGetArrayRead(bb,&b);
1227:   VecGetArray(xx,&x);
1228:   /* forward solve the lower triangular */
1229:   idx  = 0;
1230:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1231:   for (i=1; i<n; i++) {
1232:     v   = aa + bs2*ai[i];
1233:     vi  = aj + ai[i];
1234:     nz  = ai[i+1] - ai[i];
1235:     idx = bs*i;
1236:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1237:     for (k=0; k<nz; k++) {
1238:       jdx = bs*vi[k];
1239:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1240:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1241:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1242:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1243:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

1245:       v +=  bs2;
1246:     }

1248:     x[idx]   = s1;
1249:     x[1+idx] = s2;
1250:     x[2+idx] = s3;
1251:     x[3+idx] = s4;
1252:   }

1254:   /* backward solve the upper triangular */
1255:   for (i=n-1; i>=0; i--) {
1256:     v   = aa + bs2*(adiag[i+1]+1);
1257:     vi  = aj + adiag[i+1]+1;
1258:     nz  = adiag[i] - adiag[i+1]-1;
1259:     idt = bs*i;
1260:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];

1262:     for (k=0; k<nz; k++) {
1263:       idx = bs*vi[k];
1264:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1265:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1266:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1267:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1268:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

1270:       v +=  bs2;
1271:     }
1272:     /* x = inv_diagonal*x */
1273:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
1274:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
1275:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1276:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;

1278:   }

1280:   VecRestoreArrayRead(bb,&b);
1281:   VecRestoreArray(xx,&x);
1282:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1283:   return(0);
1284: }

1286: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
1287: {
1288:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1289:   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
1290:   PetscErrorCode    ierr;
1291:   const MatScalar   *aa=a->a;
1292:   const PetscScalar *b;
1293:   PetscScalar       *x;

1296:   VecGetArrayRead(bb,&b);
1297:   VecGetArray(xx,&x);

1299:   {
1300:     MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
1301:     const MatScalar *v;
1302:     MatScalar       *t=(MatScalar*)x;
1303:     PetscInt        jdx,idt,idx,nz,i,ai16;
1304:     const PetscInt  *vi;

1306:     /* forward solve the lower triangular */
1307:     idx  = 0;
1308:     t[0] = (MatScalar)b[0];
1309:     t[1] = (MatScalar)b[1];
1310:     t[2] = (MatScalar)b[2];
1311:     t[3] = (MatScalar)b[3];
1312:     for (i=1; i<n; i++) {
1313:       v    =  aa      + 16*ai[i];
1314:       vi   =  aj      + ai[i];
1315:       nz   =  diag[i] - ai[i];
1316:       idx +=  4;
1317:       s1   = (MatScalar)b[idx];
1318:       s2   = (MatScalar)b[1+idx];
1319:       s3   = (MatScalar)b[2+idx];
1320:       s4   = (MatScalar)b[3+idx];
1321:       while (nz--) {
1322:         jdx = 4*(*vi++);
1323:         x1  = t[jdx];
1324:         x2  = t[1+jdx];
1325:         x3  = t[2+jdx];
1326:         x4  = t[3+jdx];
1327:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1328:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1329:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1330:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1331:         v  += 16;
1332:       }
1333:       t[idx]   = s1;
1334:       t[1+idx] = s2;
1335:       t[2+idx] = s3;
1336:       t[3+idx] = s4;
1337:     }
1338:     /* backward solve the upper triangular */
1339:     idt = 4*(n-1);
1340:     for (i=n-1; i>=0; i--) {
1341:       ai16 = 16*diag[i];
1342:       v    = aa + ai16 + 16;
1343:       vi   = aj + diag[i] + 1;
1344:       nz   = ai[i+1] - diag[i] - 1;
1345:       s1   = t[idt];
1346:       s2   = t[1+idt];
1347:       s3   = t[2+idt];
1348:       s4   = t[3+idt];
1349:       while (nz--) {
1350:         idx = 4*(*vi++);
1351:         x1  = (MatScalar)x[idx];
1352:         x2  = (MatScalar)x[1+idx];
1353:         x3  = (MatScalar)x[2+idx];
1354:         x4  = (MatScalar)x[3+idx];
1355:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1356:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1357:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1358:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1359:         v  += 16;
1360:       }
1361:       v        = aa + ai16;
1362:       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
1363:       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
1364:       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
1365:       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
1366:       idt     -= 4;
1367:     }
1368:   }

1370:   VecRestoreArrayRead(bb,&b);
1371:   VecRestoreArray(xx,&x);
1372:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1373:   return(0);
1374: }

1376: #if defined(PETSC_HAVE_SSE)

1378: #include PETSC_HAVE_SSE
1379: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
1380: {
1381:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1382:   unsigned short *aj=(unsigned short*)a->j;
1384:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
1385:   MatScalar      *aa=a->a;
1386:   PetscScalar    *x,*b;

1389:   SSE_SCOPE_BEGIN;
1390:   /*
1391:      Note: This code currently uses demotion of double
1392:      to float when performing the mixed-mode computation.
1393:      This may not be numerically reasonable for all applications.
1394:   */
1395:   PREFETCH_NTA(aa+16*ai[1]);

1397:   VecGetArray(bb,&b);
1398:   VecGetArray(xx,&x);
1399:   {
1400:     /* x will first be computed in single precision then promoted inplace to double */
1401:     MatScalar      *v,*t=(MatScalar*)x;
1402:     int            nz,i,idt,ai16;
1403:     unsigned int   jdx,idx;
1404:     unsigned short *vi;
1405:     /* Forward solve the lower triangular factor. */

1407:     /* First block is the identity. */
1408:     idx = 0;
1409:     CONVERT_DOUBLE4_FLOAT4(t,b);
1410:     v =  aa + 16*((unsigned int)ai[1]);

1412:     for (i=1; i<n; ) {
1413:       PREFETCH_NTA(&v[8]);
1414:       vi   =  aj      + ai[i];
1415:       nz   =  diag[i] - ai[i];
1416:       idx +=  4;

1418:       /* Demote RHS from double to float. */
1419:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
1420:       LOAD_PS(&t[idx],XMM7);

1422:       while (nz--) {
1423:         PREFETCH_NTA(&v[16]);
1424:         jdx = 4*((unsigned int)(*vi++));

1426:         /* 4x4 Matrix-Vector product with negative accumulation: */
1427:         SSE_INLINE_BEGIN_2(&t[jdx],v)
1428:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1430:         /* First Column */
1431:         SSE_COPY_PS(XMM0,XMM6)
1432:         SSE_SHUFFLE(XMM0,XMM0,0x00)
1433:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1434:         SSE_SUB_PS(XMM7,XMM0)

1436:         /* Second Column */
1437:         SSE_COPY_PS(XMM1,XMM6)
1438:         SSE_SHUFFLE(XMM1,XMM1,0x55)
1439:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1440:         SSE_SUB_PS(XMM7,XMM1)

1442:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

1444:         /* Third Column */
1445:         SSE_COPY_PS(XMM2,XMM6)
1446:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1447:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1448:         SSE_SUB_PS(XMM7,XMM2)

1450:         /* Fourth Column */
1451:         SSE_COPY_PS(XMM3,XMM6)
1452:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1453:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1454:         SSE_SUB_PS(XMM7,XMM3)
1455:         SSE_INLINE_END_2

1457:         v += 16;
1458:       }
1459:       v =  aa + 16*ai[++i];
1460:       PREFETCH_NTA(v);
1461:       STORE_PS(&t[idx],XMM7);
1462:     }

1464:     /* Backward solve the upper triangular factor.*/

1466:     idt  = 4*(n-1);
1467:     ai16 = 16*diag[n-1];
1468:     v    = aa + ai16 + 16;
1469:     for (i=n-1; i>=0; ) {
1470:       PREFETCH_NTA(&v[8]);
1471:       vi = aj + diag[i] + 1;
1472:       nz = ai[i+1] - diag[i] - 1;

1474:       LOAD_PS(&t[idt],XMM7);

1476:       while (nz--) {
1477:         PREFETCH_NTA(&v[16]);
1478:         idx = 4*((unsigned int)(*vi++));

1480:         /* 4x4 Matrix-Vector Product with negative accumulation: */
1481:         SSE_INLINE_BEGIN_2(&t[idx],v)
1482:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1484:         /* First Column */
1485:         SSE_COPY_PS(XMM0,XMM6)
1486:         SSE_SHUFFLE(XMM0,XMM0,0x00)
1487:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1488:         SSE_SUB_PS(XMM7,XMM0)

1490:         /* Second Column */
1491:         SSE_COPY_PS(XMM1,XMM6)
1492:         SSE_SHUFFLE(XMM1,XMM1,0x55)
1493:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1494:         SSE_SUB_PS(XMM7,XMM1)

1496:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

1498:         /* Third Column */
1499:         SSE_COPY_PS(XMM2,XMM6)
1500:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1501:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1502:         SSE_SUB_PS(XMM7,XMM2)

1504:         /* Fourth Column */
1505:         SSE_COPY_PS(XMM3,XMM6)
1506:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1507:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1508:         SSE_SUB_PS(XMM7,XMM3)
1509:         SSE_INLINE_END_2
1510:         v += 16;
1511:       }
1512:       v    = aa + ai16;
1513:       ai16 = 16*diag[--i];
1514:       PREFETCH_NTA(aa+ai16+16);
1515:       /*
1516:          Scale the result by the diagonal 4x4 block,
1517:          which was inverted as part of the factorization
1518:       */
1519:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
1520:       /* First Column */
1521:       SSE_COPY_PS(XMM0,XMM7)
1522:       SSE_SHUFFLE(XMM0,XMM0,0x00)
1523:       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

1525:       /* Second Column */
1526:       SSE_COPY_PS(XMM1,XMM7)
1527:       SSE_SHUFFLE(XMM1,XMM1,0x55)
1528:       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1529:       SSE_ADD_PS(XMM0,XMM1)

1531:       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

1533:       /* Third Column */
1534:       SSE_COPY_PS(XMM2,XMM7)
1535:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
1536:       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1537:       SSE_ADD_PS(XMM0,XMM2)

1539:       /* Fourth Column */
1540:       SSE_COPY_PS(XMM3,XMM7)
1541:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
1542:       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1543:       SSE_ADD_PS(XMM0,XMM3)

1545:       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1546:       SSE_INLINE_END_3

1548:       v    = aa + ai16 + 16;
1549:       idt -= 4;
1550:     }

1552:     /* Convert t from single precision back to double precision (inplace)*/
1553:     idt = 4*(n-1);
1554:     for (i=n-1; i>=0; i--) {
1555:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
1556:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
1557:       PetscScalar *xtemp=&x[idt];
1558:       MatScalar   *ttemp=&t[idt];
1559:       xtemp[3] = (PetscScalar)ttemp[3];
1560:       xtemp[2] = (PetscScalar)ttemp[2];
1561:       xtemp[1] = (PetscScalar)ttemp[1];
1562:       xtemp[0] = (PetscScalar)ttemp[0];
1563:       idt     -= 4;
1564:     }

1566:   } /* End of artificial scope. */
1567:   VecRestoreArray(bb,&b);
1568:   VecRestoreArray(xx,&x);
1569:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1570:   SSE_SCOPE_END;
1571:   return(0);
1572: }

1574: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
1575: {
1576:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1577:   int            *aj=a->j;
1579:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
1580:   MatScalar      *aa=a->a;
1581:   PetscScalar    *x,*b;

1584:   SSE_SCOPE_BEGIN;
1585:   /*
1586:      Note: This code currently uses demotion of double
1587:      to float when performing the mixed-mode computation.
1588:      This may not be numerically reasonable for all applications.
1589:   */
1590:   PREFETCH_NTA(aa+16*ai[1]);

1592:   VecGetArray(bb,&b);
1593:   VecGetArray(xx,&x);
1594:   {
1595:     /* x will first be computed in single precision then promoted inplace to double */
1596:     MatScalar *v,*t=(MatScalar*)x;
1597:     int       nz,i,idt,ai16;
1598:     int       jdx,idx;
1599:     int       *vi;
1600:     /* Forward solve the lower triangular factor. */

1602:     /* First block is the identity. */
1603:     idx = 0;
1604:     CONVERT_DOUBLE4_FLOAT4(t,b);
1605:     v =  aa + 16*ai[1];

1607:     for (i=1; i<n; ) {
1608:       PREFETCH_NTA(&v[8]);
1609:       vi   =  aj      + ai[i];
1610:       nz   =  diag[i] - ai[i];
1611:       idx +=  4;

1613:       /* Demote RHS from double to float. */
1614:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
1615:       LOAD_PS(&t[idx],XMM7);

1617:       while (nz--) {
1618:         PREFETCH_NTA(&v[16]);
1619:         jdx = 4*(*vi++);
1620: /*          jdx = *vi++; */

1622:         /* 4x4 Matrix-Vector product with negative accumulation: */
1623:         SSE_INLINE_BEGIN_2(&t[jdx],v)
1624:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1626:         /* First Column */
1627:         SSE_COPY_PS(XMM0,XMM6)
1628:         SSE_SHUFFLE(XMM0,XMM0,0x00)
1629:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1630:         SSE_SUB_PS(XMM7,XMM0)

1632:         /* Second Column */
1633:         SSE_COPY_PS(XMM1,XMM6)
1634:         SSE_SHUFFLE(XMM1,XMM1,0x55)
1635:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1636:         SSE_SUB_PS(XMM7,XMM1)

1638:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

1640:         /* Third Column */
1641:         SSE_COPY_PS(XMM2,XMM6)
1642:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1643:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1644:         SSE_SUB_PS(XMM7,XMM2)

1646:         /* Fourth Column */
1647:         SSE_COPY_PS(XMM3,XMM6)
1648:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1649:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1650:         SSE_SUB_PS(XMM7,XMM3)
1651:         SSE_INLINE_END_2

1653:         v += 16;
1654:       }
1655:       v =  aa + 16*ai[++i];
1656:       PREFETCH_NTA(v);
1657:       STORE_PS(&t[idx],XMM7);
1658:     }

1660:     /* Backward solve the upper triangular factor.*/

1662:     idt  = 4*(n-1);
1663:     ai16 = 16*diag[n-1];
1664:     v    = aa + ai16 + 16;
1665:     for (i=n-1; i>=0; ) {
1666:       PREFETCH_NTA(&v[8]);
1667:       vi = aj + diag[i] + 1;
1668:       nz = ai[i+1] - diag[i] - 1;

1670:       LOAD_PS(&t[idt],XMM7);

1672:       while (nz--) {
1673:         PREFETCH_NTA(&v[16]);
1674:         idx = 4*(*vi++);
1675: /*          idx = *vi++; */

1677:         /* 4x4 Matrix-Vector Product with negative accumulation: */
1678:         SSE_INLINE_BEGIN_2(&t[idx],v)
1679:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1681:         /* First Column */
1682:         SSE_COPY_PS(XMM0,XMM6)
1683:         SSE_SHUFFLE(XMM0,XMM0,0x00)
1684:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1685:         SSE_SUB_PS(XMM7,XMM0)

1687:         /* Second Column */
1688:         SSE_COPY_PS(XMM1,XMM6)
1689:         SSE_SHUFFLE(XMM1,XMM1,0x55)
1690:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1691:         SSE_SUB_PS(XMM7,XMM1)

1693:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

1695:         /* Third Column */
1696:         SSE_COPY_PS(XMM2,XMM6)
1697:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1698:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1699:         SSE_SUB_PS(XMM7,XMM2)

1701:         /* Fourth Column */
1702:         SSE_COPY_PS(XMM3,XMM6)
1703:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1704:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1705:         SSE_SUB_PS(XMM7,XMM3)
1706:         SSE_INLINE_END_2
1707:         v += 16;
1708:       }
1709:       v    = aa + ai16;
1710:       ai16 = 16*diag[--i];
1711:       PREFETCH_NTA(aa+ai16+16);
1712:       /*
1713:          Scale the result by the diagonal 4x4 block,
1714:          which was inverted as part of the factorization
1715:       */
1716:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
1717:       /* First Column */
1718:       SSE_COPY_PS(XMM0,XMM7)
1719:       SSE_SHUFFLE(XMM0,XMM0,0x00)
1720:       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

1722:       /* Second Column */
1723:       SSE_COPY_PS(XMM1,XMM7)
1724:       SSE_SHUFFLE(XMM1,XMM1,0x55)
1725:       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1726:       SSE_ADD_PS(XMM0,XMM1)

1728:       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

1730:       /* Third Column */
1731:       SSE_COPY_PS(XMM2,XMM7)
1732:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
1733:       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1734:       SSE_ADD_PS(XMM0,XMM2)

1736:       /* Fourth Column */
1737:       SSE_COPY_PS(XMM3,XMM7)
1738:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
1739:       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1740:       SSE_ADD_PS(XMM0,XMM3)

1742:       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1743:       SSE_INLINE_END_3

1745:       v    = aa + ai16 + 16;
1746:       idt -= 4;
1747:     }

1749:     /* Convert t from single precision back to double precision (inplace)*/
1750:     idt = 4*(n-1);
1751:     for (i=n-1; i>=0; i--) {
1752:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
1753:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
1754:       PetscScalar *xtemp=&x[idt];
1755:       MatScalar   *ttemp=&t[idt];
1756:       xtemp[3] = (PetscScalar)ttemp[3];
1757:       xtemp[2] = (PetscScalar)ttemp[2];
1758:       xtemp[1] = (PetscScalar)ttemp[1];
1759:       xtemp[0] = (PetscScalar)ttemp[0];
1760:       idt     -= 4;
1761:     }

1763:   } /* End of artificial scope. */
1764:   VecRestoreArray(bb,&b);
1765:   VecRestoreArray(xx,&x);
1766:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1767:   SSE_SCOPE_END;
1768:   return(0);
1769: }

1771: #endif

1773: /*
1774:       Special case where the matrix was ILU(0) factored in the natural
1775:    ordering. This eliminates the need for the column and row permutation.
1776: */
1777: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1778: {
1779:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1780:   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j;
1781:   PetscErrorCode    ierr;
1782:   const PetscInt    *diag = a->diag,*vi;
1783:   const MatScalar   *aa   =a->a,*v;
1784:   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
1785:   const PetscScalar *b;
1786:   PetscInt          jdx,idt,idx,nz,i;

1789:   VecGetArrayRead(bb,&b);
1790:   VecGetArray(xx,&x);

1792:   /* forward solve the lower triangular */
1793:   idx  = 0;
1794:   x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
1795:   for (i=1; i<n; i++) {
1796:     v    =  aa      + 9*ai[i];
1797:     vi   =  aj      + ai[i];
1798:     nz   =  diag[i] - ai[i];
1799:     idx +=  3;
1800:     s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
1801:     while (nz--) {
1802:       jdx = 3*(*vi++);
1803:       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
1804:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1805:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1806:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1807:       v  += 9;
1808:     }
1809:     x[idx]   = s1;
1810:     x[1+idx] = s2;
1811:     x[2+idx] = s3;
1812:   }
1813:   /* backward solve the upper triangular */
1814:   for (i=n-1; i>=0; i--) {
1815:     v   = aa + 9*diag[i] + 9;
1816:     vi  = aj + diag[i] + 1;
1817:     nz  = ai[i+1] - diag[i] - 1;
1818:     idt = 3*i;
1819:     s1  = x[idt];  s2 = x[1+idt];
1820:     s3  = x[2+idt];
1821:     while (nz--) {
1822:       idx = 3*(*vi++);
1823:       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
1824:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1825:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1826:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1827:       v  += 9;
1828:     }
1829:     v        = aa +  9*diag[i];
1830:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1831:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1832:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1833:   }

1835:   VecRestoreArrayRead(bb,&b);
1836:   VecRestoreArray(xx,&x);
1837:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1838:   return(0);
1839: }

1841: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1842: {
1843:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1844:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1845:   PetscErrorCode    ierr;
1846:   PetscInt          i,k,nz,idx,jdx,idt;
1847:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
1848:   const MatScalar   *aa=a->a,*v;
1849:   PetscScalar       *x;
1850:   const PetscScalar *b;
1851:   PetscScalar       s1,s2,s3,x1,x2,x3;

1854:   VecGetArrayRead(bb,&b);
1855:   VecGetArray(xx,&x);
1856:   /* forward solve the lower triangular */
1857:   idx  = 0;
1858:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
1859:   for (i=1; i<n; i++) {
1860:     v   = aa + bs2*ai[i];
1861:     vi  = aj + ai[i];
1862:     nz  = ai[i+1] - ai[i];
1863:     idx = bs*i;
1864:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1865:     for (k=0; k<nz; k++) {
1866:       jdx = bs*vi[k];
1867:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
1868:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1869:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1870:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

1872:       v +=  bs2;
1873:     }

1875:     x[idx]   = s1;
1876:     x[1+idx] = s2;
1877:     x[2+idx] = s3;
1878:   }

1880:   /* backward solve the upper triangular */
1881:   for (i=n-1; i>=0; i--) {
1882:     v   = aa + bs2*(adiag[i+1]+1);
1883:     vi  = aj + adiag[i+1]+1;
1884:     nz  = adiag[i] - adiag[i+1]-1;
1885:     idt = bs*i;
1886:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];

1888:     for (k=0; k<nz; k++) {
1889:       idx = bs*vi[k];
1890:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1891:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1892:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1893:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

1895:       v +=  bs2;
1896:     }
1897:     /* x = inv_diagonal*x */
1898:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1899:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1900:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;

1902:   }

1904:   VecRestoreArrayRead(bb,&b);
1905:   VecRestoreArray(xx,&x);
1906:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1907:   return(0);
1908: }

1910: PetscErrorCode MatForwardSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1911: {
1912:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1913:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j;
1914:   PetscErrorCode    ierr;
1915:   PetscInt          i,k,nz,idx,jdx;
1916:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
1917:   const MatScalar   *aa=a->a,*v;
1918:   PetscScalar       *x;
1919:   const PetscScalar *b;
1920:   PetscScalar       s1,s2,s3,x1,x2,x3;

1923:   VecGetArrayRead(bb,&b);
1924:   VecGetArray(xx,&x);
1925:   /* forward solve the lower triangular */
1926:   idx  = 0;
1927:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
1928:   for (i=1; i<n; i++) {
1929:     v   = aa + bs2*ai[i];
1930:     vi  = aj + ai[i];
1931:     nz  = ai[i+1] - ai[i];
1932:     idx = bs*i;
1933:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1934:     for (k=0; k<nz; k++) {
1935:       jdx = bs*vi[k];
1936:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
1937:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1938:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1939:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

1941:       v +=  bs2;
1942:     }

1944:     x[idx]   = s1;
1945:     x[1+idx] = s2;
1946:     x[2+idx] = s3;
1947:   }

1949:   VecRestoreArrayRead(bb,&b);
1950:   VecRestoreArray(xx,&x);
1951:   PetscLogFlops(1.0*bs2*(a->nz) - bs*A->cmap->n);
1952:   return(0);
1953: }


1956: PetscErrorCode MatBackwardSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1957: {
1958:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1959:   const PetscInt    n  =a->mbs,*vi,*aj=a->j,*adiag=a->diag;
1960:   PetscErrorCode    ierr;
1961:   PetscInt          i,k,nz,idx,idt;
1962:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
1963:   const MatScalar   *aa=a->a,*v;
1964:   PetscScalar       *x;
1965:   const PetscScalar *b;
1966:   PetscScalar       s1,s2,s3,x1,x2,x3;

1969:   VecGetArrayRead(bb,&b);
1970:   VecGetArray(xx,&x);

1972:   /* backward solve the upper triangular */
1973:   for (i=n-1; i>=0; i--) {
1974:     v   = aa + bs2*(adiag[i+1]+1);
1975:     vi  = aj + adiag[i+1]+1;
1976:     nz  = adiag[i] - adiag[i+1]-1;
1977:     idt = bs*i;
1978:     s1  = b[idt];  s2 = b[1+idt];s3 = b[2+idt];

1980:     for (k=0; k<nz; k++) {
1981:       idx = bs*vi[k];
1982:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1983:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1984:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1985:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

1987:       v +=  bs2;
1988:     }
1989:     /* x = inv_diagonal*x */
1990:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1991:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1992:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;

1994:   }

1996:   VecRestoreArrayRead(bb,&b);
1997:   VecRestoreArray(xx,&x);
1998:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1999:   return(0);
2000: }

2002: /*
2003:       Special case where the matrix was ILU(0) factored in the natural
2004:    ordering. This eliminates the need for the column and row permutation.
2005: */
2006: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2007: {
2008:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2009:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2010:   PetscErrorCode    ierr;
2011:   const MatScalar   *aa=a->a,*v;
2012:   PetscScalar       *x,s1,s2,x1,x2;
2013:   const PetscScalar *b;
2014:   PetscInt          jdx,idt,idx,nz,i;

2017:   VecGetArrayRead(bb,&b);
2018:   VecGetArray(xx,&x);

2020:   /* forward solve the lower triangular */
2021:   idx  = 0;
2022:   x[0] = b[0]; x[1] = b[1];
2023:   for (i=1; i<n; i++) {
2024:     v    =  aa      + 4*ai[i];
2025:     vi   =  aj      + ai[i];
2026:     nz   =  diag[i] - ai[i];
2027:     idx +=  2;
2028:     s1   =  b[idx];s2 = b[1+idx];
2029:     while (nz--) {
2030:       jdx = 2*(*vi++);
2031:       x1  = x[jdx];x2 = x[1+jdx];
2032:       s1 -= v[0]*x1 + v[2]*x2;
2033:       s2 -= v[1]*x1 + v[3]*x2;
2034:       v  += 4;
2035:     }
2036:     x[idx]   = s1;
2037:     x[1+idx] = s2;
2038:   }
2039:   /* backward solve the upper triangular */
2040:   for (i=n-1; i>=0; i--) {
2041:     v   = aa + 4*diag[i] + 4;
2042:     vi  = aj + diag[i] + 1;
2043:     nz  = ai[i+1] - diag[i] - 1;
2044:     idt = 2*i;
2045:     s1  = x[idt];  s2 = x[1+idt];
2046:     while (nz--) {
2047:       idx = 2*(*vi++);
2048:       x1  = x[idx];   x2 = x[1+idx];
2049:       s1 -= v[0]*x1 + v[2]*x2;
2050:       s2 -= v[1]*x1 + v[3]*x2;
2051:       v  += 4;
2052:     }
2053:     v        = aa +  4*diag[i];
2054:     x[idt]   = v[0]*s1 + v[2]*s2;
2055:     x[1+idt] = v[1]*s1 + v[3]*s2;
2056:   }

2058:   VecRestoreArrayRead(bb,&b);
2059:   VecRestoreArray(xx,&x);
2060:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
2061:   return(0);
2062: }

2064: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2065: {
2066:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2067:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2068:   PetscInt          i,k,nz,idx,idt,jdx;
2069:   PetscErrorCode    ierr;
2070:   const MatScalar   *aa=a->a,*v;
2071:   PetscScalar       *x,s1,s2,x1,x2;
2072:   const PetscScalar *b;

2075:   VecGetArrayRead(bb,&b);
2076:   VecGetArray(xx,&x);
2077:   /* forward solve the lower triangular */
2078:   idx  = 0;
2079:   x[0] = b[idx]; x[1] = b[1+idx];
2080:   for (i=1; i<n; i++) {
2081:     v   = aa + 4*ai[i];
2082:     vi  = aj + ai[i];
2083:     nz  = ai[i+1] - ai[i];
2084:     idx = 2*i;
2085:     s1  = b[idx];s2 = b[1+idx];
2086:     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2087:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2088:     for (k=0; k<nz; k++) {
2089:       jdx = 2*vi[k];
2090:       x1  = x[jdx];x2 = x[1+jdx];
2091:       s1 -= v[0]*x1 + v[2]*x2;
2092:       s2 -= v[1]*x1 + v[3]*x2;
2093:       v  +=  4;
2094:     }
2095:     x[idx]   = s1;
2096:     x[1+idx] = s2;
2097:   }

2099:   /* backward solve the upper triangular */
2100:   for (i=n-1; i>=0; i--) {
2101:     v   = aa + 4*(adiag[i+1]+1);
2102:     vi  = aj + adiag[i+1]+1;
2103:     nz  = adiag[i] - adiag[i+1]-1;
2104:     idt = 2*i;
2105:     s1  = x[idt];  s2 = x[1+idt];
2106:     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2107:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2108:     for (k=0; k<nz; k++) {
2109:       idx = 2*vi[k];
2110:       x1  = x[idx];   x2 = x[1+idx];
2111:       s1 -= v[0]*x1 + v[2]*x2;
2112:       s2 -= v[1]*x1 + v[3]*x2;
2113:       v  += 4;
2114:     }
2115:     /* x = inv_diagonal*x */
2116:     x[idt]   = v[0]*s1 + v[2]*s2;
2117:     x[1+idt] = v[1]*s1 + v[3]*s2;
2118:   }

2120:   VecRestoreArrayRead(bb,&b);
2121:   VecRestoreArray(xx,&x);
2122:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
2123:   return(0);
2124: }

2126: PetscErrorCode MatForwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2127: {
2128:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2129:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j;
2130:   PetscInt          i,k,nz,idx,jdx;
2131:   PetscErrorCode    ierr;
2132:   const MatScalar   *aa=a->a,*v;
2133:   PetscScalar       *x,s1,s2,x1,x2;
2134:   const PetscScalar *b;

2137:   VecGetArrayRead(bb,&b);
2138:   VecGetArray(xx,&x);
2139:   /* forward solve the lower triangular */
2140:   idx  = 0;
2141:   x[0] = b[idx]; x[1] = b[1+idx];
2142:   for (i=1; i<n; i++) {
2143:     v   = aa + 4*ai[i];
2144:     vi  = aj + ai[i];
2145:     nz  = ai[i+1] - ai[i];
2146:     idx = 2*i;
2147:     s1  = b[idx];s2 = b[1+idx];
2148:     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2149:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2150:     for (k=0; k<nz; k++) {
2151:       jdx = 2*vi[k];
2152:       x1  = x[jdx];x2 = x[1+jdx];
2153:       s1 -= v[0]*x1 + v[2]*x2;
2154:       s2 -= v[1]*x1 + v[3]*x2;
2155:       v  +=  4;
2156:     }
2157:     x[idx]   = s1;
2158:     x[1+idx] = s2;
2159:   }


2162:   VecRestoreArrayRead(bb,&b);
2163:   VecRestoreArray(xx,&x);
2164:   PetscLogFlops(4.0*(a->nz) - A->cmap->n);
2165:   return(0);
2166: }

2168: PetscErrorCode MatBackwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2169: {
2170:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2171:   const PetscInt    n  = a->mbs,*vi,*aj=a->j,*adiag=a->diag;
2172:   PetscInt          i,k,nz,idx,idt;
2173:   PetscErrorCode    ierr;
2174:   const MatScalar   *aa=a->a,*v;
2175:   PetscScalar       *x,s1,s2,x1,x2;
2176:   const PetscScalar *b;

2179:   VecGetArrayRead(bb,&b);
2180:   VecGetArray(xx,&x);

2182:   /* backward solve the upper triangular */
2183:   for (i=n-1; i>=0; i--) {
2184:     v   = aa + 4*(adiag[i+1]+1);
2185:     vi  = aj + adiag[i+1]+1;
2186:     nz  = adiag[i] - adiag[i+1]-1;
2187:     idt = 2*i;
2188:     s1  = b[idt];  s2 = b[1+idt];
2189:     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2190:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2191:     for (k=0; k<nz; k++) {
2192:       idx = 2*vi[k];
2193:       x1  = x[idx];   x2 = x[1+idx];
2194:       s1 -= v[0]*x1 + v[2]*x2;
2195:       s2 -= v[1]*x1 + v[3]*x2;
2196:       v  += 4;
2197:     }
2198:     /* x = inv_diagonal*x */
2199:     x[idt]   = v[0]*s1 + v[2]*s2;
2200:     x[1+idt] = v[1]*s1 + v[3]*s2;
2201:   }

2203:   VecRestoreArrayRead(bb,&b);
2204:   VecRestoreArray(xx,&x);
2205:   PetscLogFlops(4.0*a->nz - A->cmap->n);
2206:   return(0);
2207: }

2209: /*
2210:       Special case where the matrix was ILU(0) factored in the natural
2211:    ordering. This eliminates the need for the column and row permutation.
2212: */
2213: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2214: {
2215:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2216:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2217:   PetscErrorCode    ierr;
2218:   const MatScalar   *aa=a->a,*v;
2219:   PetscScalar       *x;
2220:   const PetscScalar *b;
2221:   PetscScalar       s1,x1;
2222:   PetscInt          jdx,idt,idx,nz,i;

2225:   VecGetArrayRead(bb,&b);
2226:   VecGetArray(xx,&x);

2228:   /* forward solve the lower triangular */
2229:   idx  = 0;
2230:   x[0] = b[0];
2231:   for (i=1; i<n; i++) {
2232:     v    =  aa      + ai[i];
2233:     vi   =  aj      + ai[i];
2234:     nz   =  diag[i] - ai[i];
2235:     idx +=  1;
2236:     s1   =  b[idx];
2237:     while (nz--) {
2238:       jdx = *vi++;
2239:       x1  = x[jdx];
2240:       s1 -= v[0]*x1;
2241:       v  += 1;
2242:     }
2243:     x[idx] = s1;
2244:   }
2245:   /* backward solve the upper triangular */
2246:   for (i=n-1; i>=0; i--) {
2247:     v   = aa + diag[i] + 1;
2248:     vi  = aj + diag[i] + 1;
2249:     nz  = ai[i+1] - diag[i] - 1;
2250:     idt = i;
2251:     s1  = x[idt];
2252:     while (nz--) {
2253:       idx = *vi++;
2254:       x1  = x[idx];
2255:       s1 -= v[0]*x1;
2256:       v  += 1;
2257:     }
2258:     v      = aa +  diag[i];
2259:     x[idt] = v[0]*s1;
2260:   }
2261:   VecRestoreArrayRead(bb,&b);
2262:   VecRestoreArray(xx,&x);
2263:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
2264:   return(0);
2265: }


2268: PetscErrorCode MatForwardSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2269: {
2270:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2271:   PetscErrorCode    ierr;
2272:   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*vi;
2273:   PetscScalar       *x,sum;
2274:   const PetscScalar *b;
2275:   const MatScalar   *aa = a->a,*v;
2276:   PetscInt          i,nz;

2279:   if (!n) return(0);

2281:   VecGetArrayRead(bb,&b);
2282:   VecGetArray(xx,&x);

2284:   /* forward solve the lower triangular */
2285:   x[0] = b[0];
2286:   v    = aa;
2287:   vi   = aj;
2288:   for (i=1; i<n; i++) {
2289:     nz  = ai[i+1] - ai[i];
2290:     sum = b[i];
2291:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2292:     v   += nz;
2293:     vi  += nz;
2294:     x[i] = sum;
2295:   }
2296:   PetscLogFlops(a->nz - A->cmap->n);
2297:   VecRestoreArrayRead(bb,&b);
2298:   VecRestoreArray(xx,&x);
2299:   return(0);
2300: }

2302: PetscErrorCode MatBackwardSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2303: {
2304:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2305:   PetscErrorCode    ierr;
2306:   const PetscInt    n = a->mbs,*aj = a->j,*adiag = a->diag,*vi;
2307:   PetscScalar       *x,sum;
2308:   const PetscScalar *b;
2309:   const MatScalar   *aa = a->a,*v;
2310:   PetscInt          i,nz;

2313:   if (!n) return(0);

2315:   VecGetArrayRead(bb,&b);
2316:   VecGetArray(xx,&x);

2318:   /* backward solve the upper triangular */
2319:   for (i=n-1; i>=0; i--) {
2320:     v   = aa + adiag[i+1] + 1;
2321:     vi  = aj + adiag[i+1] + 1;
2322:     nz  = adiag[i] - adiag[i+1]-1;
2323:     sum = b[i];
2324:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2325:     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
2326:   }

2328:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2329:   VecRestoreArrayRead(bb,&b);
2330:   VecRestoreArray(xx,&x);
2331:   return(0);
2332: }

2334: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2335: {
2336:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2337:   PetscErrorCode    ierr;
2338:   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
2339:   PetscScalar       *x,sum;
2340:   const PetscScalar *b;
2341:   const MatScalar   *aa = a->a,*v;
2342:   PetscInt          i,nz;

2345:   if (!n) return(0);

2347:   VecGetArrayRead(bb,&b);
2348:   VecGetArray(xx,&x);

2350:   /* forward solve the lower triangular */
2351:   x[0] = b[0];
2352:   v    = aa;
2353:   vi   = aj;
2354:   for (i=1; i<n; i++) {
2355:     nz  = ai[i+1] - ai[i];
2356:     sum = b[i];
2357:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2358:     v   += nz;
2359:     vi  += nz;
2360:     x[i] = sum;
2361:   }

2363:   /* backward solve the upper triangular */
2364:   for (i=n-1; i>=0; i--) {
2365:     v   = aa + adiag[i+1] + 1;
2366:     vi  = aj + adiag[i+1] + 1;
2367:     nz  = adiag[i] - adiag[i+1]-1;
2368:     sum = x[i];
2369:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2370:     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
2371:   }

2373:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2374:   VecRestoreArrayRead(bb,&b);
2375:   VecRestoreArray(xx,&x);
2376:   return(0);
2377: }