Actual source code: baijfact11.c

petsc-3.14.6 2021-03-30

  2: /*
  3:     Factorization code for BAIJ format.
  4: */
  5: #include <../src/mat/impls/baij/seq/baij.h>
  6: #include <petsc/private/kernels/blockinvert.h>
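
/*
   Usage sketch (illustrative only, not part of this file): the numeric kernels below are
   not called directly.  They are reached through the generic PETSc factorization
   interface once a MATSEQBAIJ matrix with block size 4 has been assembled; error
   checking is omitted for brevity.

       Mat           A,F;
       Vec           b,x;
       IS            rperm,cperm;
       MatFactorInfo info;
       ...
       MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);
       MatGetOrdering(A,MATORDERINGNATURAL,&rperm,&cperm);
       MatFactorInfoInitialize(&info);
       MatLUFactorSymbolic(F,A,rperm,cperm,&info);
       MatLUFactorNumeric(F,A,&info);      (dispatches to one of the routines in this file)
       MatSolve(F,b,x);
*/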

  8: /* ------------------------------------------------------------*/
  9: /*
 10:       Version for when blocks are 4 by 4
 11: */
 12: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
 13: {
 14:   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
 15:   IS             isrow = b->row,isicol = b->icol;
 17:   const PetscInt *r,*ic;
 18:   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
 19:   PetscInt       *ajtmpold,*ajtmp,nz,row;
 20:   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
 21:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
 22:   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
 23:   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
 24:   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
 25:   MatScalar      m13,m14,m15,m16;
 26:   MatScalar      *ba           = b->a,*aa = a->a;
 27:   PetscBool      pivotinblocks = b->pivotinblocks;
 28:   PetscReal      shift         = info->shiftamount;
 29:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

 32:   ISGetIndices(isrow,&r);
 33:   ISGetIndices(isicol,&ic);
 34:   PetscMalloc1(16*(n+1),&rtmp);
 35:   allowzeropivot = PetscNot(A->erroriffailure);

 37:   for (i=0; i<n; i++) {
 38:     nz    = bi[i+1] - bi[i];
 39:     ajtmp = bj + bi[i];
 40:     for  (j=0; j<nz; j++) {
 41:       x     = rtmp+16*ajtmp[j];
 42:       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
 43:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
 44:     }
 45:     /* load in initial (unfactored) row */
 46:     idx      = r[i];
 47:     nz       = ai[idx+1] - ai[idx];
 48:     ajtmpold = aj + ai[idx];
 49:     v        = aa + 16*ai[idx];
 50:     for (j=0; j<nz; j++) {
 51:       x     = rtmp+16*ic[ajtmpold[j]];
 52:       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
 53:       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
 54:       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
 55:       x[14] = v[14]; x[15] = v[15];
 56:       v    += 16;
 57:     }
 58:     row = *ajtmp++;
 59:     while (row < i) {
 60:       pc  = rtmp + 16*row;
 61:       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
 62:       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
 63:       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
 64:       p15 = pc[14]; p16 = pc[15];
 65:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
 66:           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
 67:           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
 68:           || p16 != 0.0) {
 69:         pv    = ba + 16*diag_offset[row];
 70:         pj    = bj + diag_offset[row] + 1;
 71:         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
 72:         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
 73:         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
 74:         x15   = pv[14]; x16 = pv[15];
 75:         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
 76:         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
 77:         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
 78:         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

 80:         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
 81:         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
 82:         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
 83:         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

 85:         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
 86:         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
 87:         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
 88:         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

 90:         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
 91:         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
 92:         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
 93:         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;

 95:         nz  = bi[row+1] - diag_offset[row] - 1;
 96:         pv += 16;
 97:         for (j=0; j<nz; j++) {
 98:           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
 99:           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
100:           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
101:           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
102:           x     = rtmp + 16*pj[j];
103:           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
104:           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
105:           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
106:           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

108:           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
109:           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
110:           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
111:           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

113:           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
114:           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
115:           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
116:           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

118:           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
119:           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
120:           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
121:           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

123:           pv += 16;
124:         }
125:         PetscLogFlops(128.0*nz+112.0);
126:       }
127:       row = *ajtmp++;
128:     }
129:     /* finished row so stick it into b->a */
130:     pv = ba + 16*bi[i];
131:     pj = bj + bi[i];
132:     nz = bi[i+1] - bi[i];
133:     for (j=0; j<nz; j++) {
134:       x      = rtmp+16*pj[j];
135:       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
136:       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
137:       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
138:       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
139:       pv    += 16;
140:     }
141:     /* invert diagonal block */
142:     w = ba + 16*diag_offset[i];
143:     if (pivotinblocks) {
144:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
145:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
146:     } else {
147:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
148:     }
149:   }

151:   PetscFree(rtmp);
152:   ISRestoreIndices(isicol,&ic);
153:   ISRestoreIndices(isrow,&r);

155:   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
156:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
157:   C->assembled           = PETSC_TRUE;

159:   PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
160:   return(0);
161: }
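
/*
   Layout note (added for orientation, not in the original file): the *_inplace routines
   in this file keep L and U in the usual row layout of b, with the U part of row i stored
   immediately after the diagonal block located by b->diag[i].  The non-inplace routines
   use the newer factor layout in which bdiag runs backwards, so the U entries of row i
   live at bdiag[i+1]+1 .. bdiag[i]-1 and the already-inverted diagonal block sits at
   bdiag[i].
*/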

163: /* MatLUFactorNumeric_SeqBAIJ_4 -
164:      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
165:        PetscKernel_A_gets_A_times_B()
166:        PetscKernel_A_gets_A_minus_B_times_C()
167:        PetscKernel_A_gets_inverse_A()
168: */
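
/*
   Illustrative sketch (hypothetical helpers, not the PETSc kernels themselves): the
   unrolled arithmetic below is equivalent to dense 4x4 operations on column-major
   blocks, i.e. entry (r,c) of a block lives at index 4*c+r.  The two reference loops
   show the semantics that are inlined: pc <- pc*pv (multiplier formation) and
   x <- x - m*v (update of a block in the active row).
*/
static void Sketch_4x4_A_gets_A_times_B(MatScalar *pc,const MatScalar *pv,MatScalar *work)
{
  PetscInt r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      work[4*c+r] = 0.0;
      for (k=0; k<4; k++) work[4*c+r] += pc[4*k+r]*pv[4*c+k];
    }
  }
  for (k=0; k<16; k++) pc[k] = work[k];   /* copy the product back into pc */
}

static void Sketch_4x4_A_gets_A_minus_B_times_C(MatScalar *x,const MatScalar *m,const MatScalar *v)
{
  PetscInt r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      for (k=0; k<4; k++) x[4*c+r] -= m[4*k+r]*v[4*c+k];
    }
  }
}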

170: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
171: {
172:   Mat            C     = B;
173:   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
174:   IS             isrow = b->row,isicol = b->icol;
176:   const PetscInt *r,*ic;
177:   PetscInt       i,j,k,nz,nzL,row;
178:   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
179:   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
180:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
181:   PetscInt       flg;
182:   PetscReal      shift;
183:   PetscBool      allowzeropivot,zeropivotdetected;

186:   allowzeropivot = PetscNot(A->erroriffailure);
187:   ISGetIndices(isrow,&r);
188:   ISGetIndices(isicol,&ic);

190:   if (info->shifttype == MAT_SHIFT_NONE) {
191:     shift = 0;
192:   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
193:     shift = info->shiftamount;
194:   }
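  /* Note (added): with MAT_SHIFT_INBLOCKS the shift amount chosen above is passed to
     PetscKernel_A_gets_inverse_A_4() below, which is expected to perturb a pivot that
     would otherwise be treated as zero; with MAT_SHIFT_NONE a zero pivot is instead
     reported through zeropivotdetected/factorerrortype. */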

196:   /* generate work space needed by the factorization */
197:   PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
198:   PetscArrayzero(rtmp,bs2*n);

200:   for (i=0; i<n; i++) {
201:     /* zero rtmp */
202:     /* L part */
203:     nz    = bi[i+1] - bi[i];
204:     bjtmp = bj + bi[i];
205:     for  (j=0; j<nz; j++) {
206:       PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
207:     }

209:     /* U part */
210:     nz    = bdiag[i]-bdiag[i+1];
211:     bjtmp = bj + bdiag[i+1]+1;
212:     for  (j=0; j<nz; j++) {
213:       PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
214:     }

216:     /* load in initial (unfactored) row */
217:     nz    = ai[r[i]+1] - ai[r[i]];
218:     ajtmp = aj + ai[r[i]];
219:     v     = aa + bs2*ai[r[i]];
220:     for (j=0; j<nz; j++) {
221:       PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2);
222:     }

224:     /* elimination */
225:     bjtmp = bj + bi[i];
226:     nzL   = bi[i+1] - bi[i];
227:     for (k=0; k < nzL; k++) {
228:       row = bjtmp[k];
229:       pc  = rtmp + bs2*row;
230:       for (flg=0,j=0; j<bs2; j++) {
231:         if (pc[j]!=0.0) {
232:           flg = 1;
233:           break;
234:         }
235:       }
236:       if (flg) {
237:         pv = b->a + bs2*bdiag[row];
238:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
239:         PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);

241:         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
242:         pv = b->a + bs2*(bdiag[row+1]+1);
243:         nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding the diagonal */
244:         for (j=0; j<nz; j++) {
245:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
246:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
247:           v    = rtmp + bs2*pj[j];
248:           PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
249:           pv  += bs2;
250:         }
251:         PetscLogFlops(128.0*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 = 128*nz + 112 for bs=4 */
252:       }
253:     }

255:     /* finished row so stick it into b->a */
256:     /* L part */
257:     pv = b->a + bs2*bi[i];
258:     pj = b->j + bi[i];
259:     nz = bi[i+1] - bi[i];
260:     for (j=0; j<nz; j++) {
261:       PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
262:     }

264:     /* Mark diagonal and invert diagonal for simpler triangular solves */
265:     pv   = b->a + bs2*bdiag[i];
266:     pj   = b->j + bdiag[i];
267:     PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);
268:     PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);
269:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

271:     /* U part */
272:     pv = b->a + bs2*(bdiag[i+1]+1);
273:     pj = b->j + bdiag[i+1]+1;
274:     nz = bdiag[i] - bdiag[i+1] - 1;
275:     for (j=0; j<nz; j++) {
276:       PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
277:     }
278:   }

280:   PetscFree2(rtmp,mwork);
281:   ISRestoreIndices(isicol,&ic);
282:   ISRestoreIndices(isrow,&r);

284:   C->ops->solve          = MatSolve_SeqBAIJ_4;
285:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
286:   C->assembled           = PETSC_TRUE;

288:   PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
289:   return(0);
290: }

292: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
293: {
294: /*
295:     Default version for when blocks are 4 by 4, using natural ordering
296: */
297:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
299:   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
300:   PetscInt       *ajtmpold,*ajtmp,nz,row;
301:   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
302:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
303:   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
304:   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
305:   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
306:   MatScalar      m13,m14,m15,m16;
307:   MatScalar      *ba           = b->a,*aa = a->a;
308:   PetscBool      pivotinblocks = b->pivotinblocks;
309:   PetscReal      shift         = info->shiftamount;
310:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

313:   allowzeropivot = PetscNot(A->erroriffailure);
314:   PetscMalloc1(16*(n+1),&rtmp);

316:   for (i=0; i<n; i++) {
317:     nz    = bi[i+1] - bi[i];
318:     ajtmp = bj + bi[i];
319:     for  (j=0; j<nz; j++) {
320:       x     = rtmp+16*ajtmp[j];
321:       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
322:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
323:     }
324:     /* load in initial (unfactored) row */
325:     nz       = ai[i+1] - ai[i];
326:     ajtmpold = aj + ai[i];
327:     v        = aa + 16*ai[i];
328:     for (j=0; j<nz; j++) {
329:       x     = rtmp+16*ajtmpold[j];
330:       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
331:       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
332:       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
333:       x[14] = v[14]; x[15] = v[15];
334:       v    += 16;
335:     }
336:     row = *ajtmp++;
337:     while (row < i) {
338:       pc  = rtmp + 16*row;
339:       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
340:       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
341:       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
342:       p15 = pc[14]; p16 = pc[15];
343:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
344:           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
345:           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
346:           || p16 != 0.0) {
347:         pv    = ba + 16*diag_offset[row];
348:         pj    = bj + diag_offset[row] + 1;
349:         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
350:         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
351:         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
352:         x15   = pv[14]; x16 = pv[15];
353:         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
354:         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
355:         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
356:         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

358:         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
359:         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
360:         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
361:         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

363:         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
364:         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
365:         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
366:         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

368:         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
369:         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
370:         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
371:         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
372:         nz     = bi[row+1] - diag_offset[row] - 1;
373:         pv    += 16;
374:         for (j=0; j<nz; j++) {
375:           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
376:           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
377:           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
378:           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
379:           x     = rtmp + 16*pj[j];
380:           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
381:           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
382:           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
383:           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

385:           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
386:           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
387:           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
388:           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

390:           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
391:           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
392:           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
393:           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

395:           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
396:           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
397:           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
398:           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

400:           pv += 16;
401:         }
402:         PetscLogFlops(128.0*nz+112.0);
403:       }
404:       row = *ajtmp++;
405:     }
406:     /* finished row so stick it into b->a */
407:     pv = ba + 16*bi[i];
408:     pj = bj + bi[i];
409:     nz = bi[i+1] - bi[i];
410:     for (j=0; j<nz; j++) {
411:       x      = rtmp+16*pj[j];
412:       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
413:       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
414:       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
415:       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
416:       pv    += 16;
417:     }
418:     /* invert diagonal block */
419:     w = ba + 16*diag_offset[i];
420:     if (pivotinblocks) {
421:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
422:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
423:     } else {
424:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
425:     }
426:   }

428:   PetscFree(rtmp);

430:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
431:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
432:   C->assembled           = PETSC_TRUE;

434:   PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
435:   return(0);
436: }

438: /*
439:   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
440:     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
441: */
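
/*
   Orientation note (added, not in the original file): compared with
   MatLUFactorNumeric_SeqBAIJ_4() above, this natural-ordering variant has no isrow/isicol
   index sets, so the initial row load copies row i of A directly,

       PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2);

   rather than scattering through the permutations as rtmp+bs2*ic[ajtmp[j]] with v taken
   from row r[i].  The elimination and the diagonal inversion are otherwise identical.
*/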
442: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
443: {
444:   Mat            C = B;
445:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
447:   PetscInt       i,j,k,nz,nzL,row;
448:   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
449:   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
450:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
451:   PetscInt       flg;
452:   PetscReal      shift;
453:   PetscBool      allowzeropivot,zeropivotdetected;

456:   allowzeropivot = PetscNot(A->erroriffailure);

458:   /* generate work space needed by the factorization */
459:   PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
460:   PetscArrayzero(rtmp,bs2*n);

462:   if (info->shifttype == MAT_SHIFT_NONE) {
463:     shift = 0;
464:   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
465:     shift = info->shiftamount;
466:   }

468:   for (i=0; i<n; i++) {
469:     /* zero rtmp */
470:     /* L part */
471:     nz    = bi[i+1] - bi[i];
472:     bjtmp = bj + bi[i];
473:     for  (j=0; j<nz; j++) {
474:       PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
475:     }

477:     /* U part */
478:     nz    = bdiag[i] - bdiag[i+1];
479:     bjtmp = bj + bdiag[i+1]+1;
480:     for  (j=0; j<nz; j++) {
481:       PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
482:     }

484:     /* load in initial (unfactored) row */
485:     nz    = ai[i+1] - ai[i];
486:     ajtmp = aj + ai[i];
487:     v     = aa + bs2*ai[i];
488:     for (j=0; j<nz; j++) {
489:       PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2);
490:     }

492:     /* elimination */
493:     bjtmp = bj + bi[i];
494:     nzL   = bi[i+1] - bi[i];
495:     for (k=0; k < nzL; k++) {
496:       row = bjtmp[k];
497:       pc  = rtmp + bs2*row;
498:       for (flg=0,j=0; j<bs2; j++) {
499:         if (pc[j]!=0.0) {
500:           flg = 1;
501:           break;
502:         }
503:       }
504:       if (flg) {
505:         pv = b->a + bs2*bdiag[row];
506:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
507:         PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);

509:         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
510:         pv = b->a + bs2*(bdiag[row+1]+1);
511:         nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding the diagonal */
512:         for (j=0; j<nz; j++) {
513:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
514:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
515:           v    = rtmp + bs2*pj[j];
516:           PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
517:           pv  += bs2;
518:         }
519:         PetscLogFlops(128.0*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 = 128*nz + 112 for bs=4 */
520:       }
521:     }

523:     /* finished row so stick it into b->a */
524:     /* L part */
525:     pv = b->a + bs2*bi[i];
526:     pj = b->j + bi[i];
527:     nz = bi[i+1] - bi[i];
528:     for (j=0; j<nz; j++) {
529:       PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
530:     }

532:     /* Mark diagonal and invert diagonal for simpler triangular solves */
533:     pv   = b->a + bs2*bdiag[i];
534:     pj   = b->j + bdiag[i];
535:     PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);
536:     PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);
537:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

539:     /* U part */
540:     pv = b->a + bs2*(bdiag[i+1]+1);
541:     pj = b->j + bdiag[i+1]+1;
542:     nz = bdiag[i] - bdiag[i+1] - 1;
543:     for (j=0; j<nz; j++) {
544:       PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
545:     }
546:   }
547:   PetscFree2(rtmp,mwork);

549:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
550:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
551:   C->assembled           = PETSC_TRUE;

553:   PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
554:   return(0);
555: }

557: #if defined(PETSC_HAVE_SSE)

559: #include PETSC_HAVE_SSE

561: /* SSE version for when blocks are 4 by 4, using natural ordering */
562: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C,Mat A,const MatFactorInfo *info)
563: {
564:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
566:   int            i,j,n = a->mbs;
567:   int            *bj = b->j,*bjtmp,*pj;
568:   int            row;
569:   int            *ajtmpold,nz,*bi=b->i;
570:   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
571:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
572:   MatScalar      *ba    = b->a,*aa = a->a;
573:   int            nonzero=0;
574:   PetscBool      pivotinblocks = b->pivotinblocks;
575:   PetscReal      shift         = info->shiftamount;
576:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

579:   allowzeropivot = PetscNot(A->erroriffailure);
580:   SSE_SCOPE_BEGIN;

582:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
583:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
584:   PetscMalloc1(16*(n+1),&rtmp);
585:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
586: /*    if ((unsigned long)bj==(unsigned long)aj) { */
587: /*      colscale = 4; */
588: /*    } */
589:   for (i=0; i<n; i++) {
590:     nz    = bi[i+1] - bi[i];
591:     bjtmp = bj + bi[i];
592:     /* zero out the 4x4 block accumulators */
593:     /* zero out one register */
594:     XOR_PS(XMM7,XMM7);
595:     for  (j=0; j<nz; j++) {
596:       x = rtmp+16*bjtmp[j];
597: /*        x = rtmp+4*bjtmp[j]; */
598:       SSE_INLINE_BEGIN_1(x)
599:       /* Copy zero register to memory locations */
600:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
601:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
602:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
603:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
604:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
605:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
606:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
607:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
608:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
609:       SSE_INLINE_END_1;
610:     }
611:     /* load in initial (unfactored) row */
612:     nz       = ai[i+1] - ai[i];
613:     ajtmpold = aj + ai[i];
614:     v        = aa + 16*ai[i];
615:     for (j=0; j<nz; j++) {
616:       x = rtmp+16*ajtmpold[j];
617: /*        x = rtmp+colscale*ajtmpold[j]; */
618:       /* Copy v block into x block */
619:       SSE_INLINE_BEGIN_2(v,x)
620:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
621:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
622:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

624:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
625:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

627:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
628:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

630:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
631:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

633:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
634:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

636:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
637:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

639:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
640:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

642:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
643:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
644:       SSE_INLINE_END_2;

646:       v += 16;
647:     }
648: /*      row = (*bjtmp++)/4; */
649:     row = *bjtmp++;
650:     while (row < i) {
651:       pc = rtmp + 16*row;
652:       SSE_INLINE_BEGIN_1(pc)
653:       /* Load block from lower triangle */
654:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
655:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
656:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

658:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
659:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

661:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
662:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

664:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
665:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

667:       /* Compare block to zero block */

669:       SSE_COPY_PS(XMM4,XMM7)
670:       SSE_CMPNEQ_PS(XMM4,XMM0)

672:       SSE_COPY_PS(XMM5,XMM7)
673:       SSE_CMPNEQ_PS(XMM5,XMM1)

675:       SSE_COPY_PS(XMM6,XMM7)
676:       SSE_CMPNEQ_PS(XMM6,XMM2)

678:       SSE_CMPNEQ_PS(XMM7,XMM3)

680:       /* Reduce the comparisons to one SSE register */
681:       SSE_OR_PS(XMM6,XMM7)
682:       SSE_OR_PS(XMM5,XMM4)
683:       SSE_OR_PS(XMM5,XMM6)
684:       SSE_INLINE_END_1;

686:       /* Reduce the one SSE register to an integer register for branching */
687:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
688:       MOVEMASK(nonzero,XMM5);

690:       /* If block is nonzero ... */
691:       if (nonzero) {
692:         pv = ba + 16*diag_offset[row];
693:         PREFETCH_L1(&pv[16]);
694:         pj = bj + diag_offset[row] + 1;

696:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
697:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
698:         /* but the diagonal was inverted already */
699:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
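        /* Scalar picture of the column-at-a-time product below (comment added for
           readability): with 4x4 column-major blocks, new pc[:,c] = sum_k pv[4*c+k] *
           old pc[:,k], i.e. the same pc <- pc*pv multiplier formation that the non-SSE
           routines spell out with m1..m16; each old column of pc sits in XMM0-XMM3. */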

701:         SSE_INLINE_BEGIN_2(pv,pc)
702:         /* Column 0, product is accumulated in XMM4 */
703:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
704:         SSE_SHUFFLE(XMM4,XMM4,0x00)
705:         SSE_MULT_PS(XMM4,XMM0)

707:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
708:         SSE_SHUFFLE(XMM5,XMM5,0x00)
709:         SSE_MULT_PS(XMM5,XMM1)
710:         SSE_ADD_PS(XMM4,XMM5)

712:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
713:         SSE_SHUFFLE(XMM6,XMM6,0x00)
714:         SSE_MULT_PS(XMM6,XMM2)
715:         SSE_ADD_PS(XMM4,XMM6)

717:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
718:         SSE_SHUFFLE(XMM7,XMM7,0x00)
719:         SSE_MULT_PS(XMM7,XMM3)
720:         SSE_ADD_PS(XMM4,XMM7)

722:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
723:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

725:         /* Column 1, product is accumulated in XMM5 */
726:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
727:         SSE_SHUFFLE(XMM5,XMM5,0x00)
728:         SSE_MULT_PS(XMM5,XMM0)

730:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
731:         SSE_SHUFFLE(XMM6,XMM6,0x00)
732:         SSE_MULT_PS(XMM6,XMM1)
733:         SSE_ADD_PS(XMM5,XMM6)

735:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
736:         SSE_SHUFFLE(XMM7,XMM7,0x00)
737:         SSE_MULT_PS(XMM7,XMM2)
738:         SSE_ADD_PS(XMM5,XMM7)

740:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
741:         SSE_SHUFFLE(XMM6,XMM6,0x00)
742:         SSE_MULT_PS(XMM6,XMM3)
743:         SSE_ADD_PS(XMM5,XMM6)

745:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
746:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

748:         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

750:         /* Column 2, product is accumulated in XMM6 */
751:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
752:         SSE_SHUFFLE(XMM6,XMM6,0x00)
753:         SSE_MULT_PS(XMM6,XMM0)

755:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
756:         SSE_SHUFFLE(XMM7,XMM7,0x00)
757:         SSE_MULT_PS(XMM7,XMM1)
758:         SSE_ADD_PS(XMM6,XMM7)

760:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
761:         SSE_SHUFFLE(XMM7,XMM7,0x00)
762:         SSE_MULT_PS(XMM7,XMM2)
763:         SSE_ADD_PS(XMM6,XMM7)

765:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
766:         SSE_SHUFFLE(XMM7,XMM7,0x00)
767:         SSE_MULT_PS(XMM7,XMM3)
768:         SSE_ADD_PS(XMM6,XMM7)

770:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
771:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

773:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
774:         /* Column 3, product is accumulated in XMM0 */
775:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
776:         SSE_SHUFFLE(XMM7,XMM7,0x00)
777:         SSE_MULT_PS(XMM0,XMM7)

779:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
780:         SSE_SHUFFLE(XMM7,XMM7,0x00)
781:         SSE_MULT_PS(XMM1,XMM7)
782:         SSE_ADD_PS(XMM0,XMM1)

784:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
785:         SSE_SHUFFLE(XMM1,XMM1,0x00)
786:         SSE_MULT_PS(XMM1,XMM2)
787:         SSE_ADD_PS(XMM0,XMM1)

789:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
790:         SSE_SHUFFLE(XMM7,XMM7,0x00)
791:         SSE_MULT_PS(XMM3,XMM7)
792:         SSE_ADD_PS(XMM0,XMM3)

794:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
795:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

797:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
798:         /* This is code to be maintained and read by humans after all. */
799:         /* Copy Multiplier Col 3 into XMM3 */
800:         SSE_COPY_PS(XMM3,XMM0)
801:         /* Copy Multiplier Col 2 into XMM2 */
802:         SSE_COPY_PS(XMM2,XMM6)
803:         /* Copy Multiplier Col 1 into XMM1 */
804:         SSE_COPY_PS(XMM1,XMM5)
805:         /* Copy Multiplier Col 0 into XMM0 */
806:         SSE_COPY_PS(XMM0,XMM4)
807:         SSE_INLINE_END_2;

809:         /* Update the row: */
810:         nz  = bi[row+1] - diag_offset[row] - 1;
811:         pv += 16;
812:         for (j=0; j<nz; j++) {
813:           PREFETCH_L1(&pv[16]);
814:           x = rtmp + 16*pj[j];
815: /*            x = rtmp + 4*pj[j]; */

817:           /* X:=X-M*PV, One column at a time */
818:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
819:           SSE_INLINE_BEGIN_2(x,pv)
820:           /* Load First Column of X*/
821:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
822:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

824:           /* Matrix-Vector Product: */
825:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
826:           SSE_SHUFFLE(XMM5,XMM5,0x00)
827:           SSE_MULT_PS(XMM5,XMM0)
828:           SSE_SUB_PS(XMM4,XMM5)

830:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
831:           SSE_SHUFFLE(XMM6,XMM6,0x00)
832:           SSE_MULT_PS(XMM6,XMM1)
833:           SSE_SUB_PS(XMM4,XMM6)

835:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
836:           SSE_SHUFFLE(XMM7,XMM7,0x00)
837:           SSE_MULT_PS(XMM7,XMM2)
838:           SSE_SUB_PS(XMM4,XMM7)

840:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
841:           SSE_SHUFFLE(XMM5,XMM5,0x00)
842:           SSE_MULT_PS(XMM5,XMM3)
843:           SSE_SUB_PS(XMM4,XMM5)

845:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
846:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

848:           /* Second Column */
849:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
850:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

852:           /* Matrix-Vector Product: */
853:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
854:           SSE_SHUFFLE(XMM6,XMM6,0x00)
855:           SSE_MULT_PS(XMM6,XMM0)
856:           SSE_SUB_PS(XMM5,XMM6)

858:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
859:           SSE_SHUFFLE(XMM7,XMM7,0x00)
860:           SSE_MULT_PS(XMM7,XMM1)
861:           SSE_SUB_PS(XMM5,XMM7)

863:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
864:           SSE_SHUFFLE(XMM4,XMM4,0x00)
865:           SSE_MULT_PS(XMM4,XMM2)
866:           SSE_SUB_PS(XMM5,XMM4)

868:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
869:           SSE_SHUFFLE(XMM6,XMM6,0x00)
870:           SSE_MULT_PS(XMM6,XMM3)
871:           SSE_SUB_PS(XMM5,XMM6)

873:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
874:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

876:           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

878:           /* Third Column */
879:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
880:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

882:           /* Matrix-Vector Product: */
883:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
884:           SSE_SHUFFLE(XMM7,XMM7,0x00)
885:           SSE_MULT_PS(XMM7,XMM0)
886:           SSE_SUB_PS(XMM6,XMM7)

888:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
889:           SSE_SHUFFLE(XMM4,XMM4,0x00)
890:           SSE_MULT_PS(XMM4,XMM1)
891:           SSE_SUB_PS(XMM6,XMM4)

893:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
894:           SSE_SHUFFLE(XMM5,XMM5,0x00)
895:           SSE_MULT_PS(XMM5,XMM2)
896:           SSE_SUB_PS(XMM6,XMM5)

898:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
899:           SSE_SHUFFLE(XMM7,XMM7,0x00)
900:           SSE_MULT_PS(XMM7,XMM3)
901:           SSE_SUB_PS(XMM6,XMM7)

903:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
904:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

906:           /* Fourth Column */
907:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
908:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

910:           /* Matrix-Vector Product: */
911:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
912:           SSE_SHUFFLE(XMM5,XMM5,0x00)
913:           SSE_MULT_PS(XMM5,XMM0)
914:           SSE_SUB_PS(XMM4,XMM5)

916:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
917:           SSE_SHUFFLE(XMM6,XMM6,0x00)
918:           SSE_MULT_PS(XMM6,XMM1)
919:           SSE_SUB_PS(XMM4,XMM6)

921:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
922:           SSE_SHUFFLE(XMM7,XMM7,0x00)
923:           SSE_MULT_PS(XMM7,XMM2)
924:           SSE_SUB_PS(XMM4,XMM7)

926:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
927:           SSE_SHUFFLE(XMM5,XMM5,0x00)
928:           SSE_MULT_PS(XMM5,XMM3)
929:           SSE_SUB_PS(XMM4,XMM5)

931:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
932:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
933:           SSE_INLINE_END_2;
934:           pv += 16;
935:         }
936:         PetscLogFlops(128.0*nz+112.0);
937:       }
938:       row = *bjtmp++;
939: /*        row = (*bjtmp++)/4; */
940:     }
941:     /* finished row so stick it into b->a */
942:     pv = ba + 16*bi[i];
943:     pj = bj + bi[i];
944:     nz = bi[i+1] - bi[i];

946:     /* Copy x block back into pv block */
947:     for (j=0; j<nz; j++) {
948:       x = rtmp+16*pj[j];
949: /*        x  = rtmp+4*pj[j]; */

951:       SSE_INLINE_BEGIN_2(x,pv)
952:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
953:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
954:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

956:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
957:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

959:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
960:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

962:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
963:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

965:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
966:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

968:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
969:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

971:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
972:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

974:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
975:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
976:       SSE_INLINE_END_2;
977:       pv += 16;
978:     }
979:     /* invert diagonal block */
980:     w = ba + 16*diag_offset[i];
981:     if (pivotinblocks) {
982:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
983:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
984:     } else {
985:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
986:     }
987: /*      PetscKernel_A_gets_inverse_A_4_SSE(w); */
988:     /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
989:   }

991:   PetscFree(rtmp);

993:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
994:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
995:   C->assembled           = PETSC_TRUE;

997:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
998:   /* Flop Count from inverting diagonal blocks */
999:   SSE_SCOPE_END;
1000:   return(0);
1001: }
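
/*
   Reader's note (added): the "_usj" variants below have the same structure as the routine
   above but walk b->j (and, in the in-place variant, a->j) through (unsigned short *)
   column arrays, which is why each bjtmp[j]/pj[j] access is cast back to unsigned int
   before being scaled by 16.
*/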

1003: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
1004: {
1005:   Mat            A  = C;
1006:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1008:   int            i,j,n = a->mbs;
1009:   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1010:   unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
1011:   unsigned int   row;
1012:   int            nz,*bi=b->i;
1013:   int            *diag_offset = b->diag,*ai=a->i;
1014:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1015:   MatScalar      *ba    = b->a,*aa = a->a;
1016:   int            nonzero=0;
1017:   PetscBool      pivotinblocks = b->pivotinblocks;
1018:   PetscReal      shift = info->shiftamount;
1019:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

1022:   allowzeropivot = PetscNot(A->erroriffailure);
1023:   SSE_SCOPE_BEGIN;

1025:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1026:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1027:   PetscMalloc1(16*(n+1),&rtmp);
1028:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1029: /*    if ((unsigned long)bj==(unsigned long)aj) { */
1030: /*      colscale = 4; */
1031: /*    } */

1033:   for (i=0; i<n; i++) {
1034:     nz    = bi[i+1] - bi[i];
1035:     bjtmp = bj + bi[i];
1036:     /* zero out the 4x4 block accumulators */
1037:     /* zero out one register */
1038:     XOR_PS(XMM7,XMM7);
1039:     for  (j=0; j<nz; j++) {
1040:       x = rtmp+16*((unsigned int)bjtmp[j]);
1041: /*        x = rtmp+4*bjtmp[j]; */
1042:       SSE_INLINE_BEGIN_1(x)
1043:       /* Copy zero register to memory locations */
1044:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1045:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1046:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1047:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1048:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1049:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1050:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1051:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1052:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1053:       SSE_INLINE_END_1;
1054:     }
1055:     /* load in initial (unfactored) row */
1056:     nz    = ai[i+1] - ai[i];
1057:     ajtmp = aj + ai[i];
1058:     v     = aa + 16*ai[i];
1059:     for (j=0; j<nz; j++) {
1060:       x = rtmp+16*((unsigned int)ajtmp[j]);
1061: /*        x = rtmp+colscale*ajtmp[j]; */
1062:       /* Copy v block into x block */
1063:       SSE_INLINE_BEGIN_2(v,x)
1064:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1065:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1066:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

1068:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1069:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

1071:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1072:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

1074:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1075:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

1077:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1078:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

1080:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1081:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

1083:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1084:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

1086:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1087:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1088:       SSE_INLINE_END_2;

1090:       v += 16;
1091:     }
1092: /*      row = (*bjtmp++)/4; */
1093:     row = (unsigned int)(*bjtmp++);
1094:     while (row < i) {
1095:       pc = rtmp + 16*row;
1096:       SSE_INLINE_BEGIN_1(pc)
1097:       /* Load block from lower triangle */
1098:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1099:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1100:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

1102:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1103:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

1105:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1106:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

1108:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1109:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

1111:       /* Compare block to zero block */

1113:       SSE_COPY_PS(XMM4,XMM7)
1114:       SSE_CMPNEQ_PS(XMM4,XMM0)

1116:       SSE_COPY_PS(XMM5,XMM7)
1117:       SSE_CMPNEQ_PS(XMM5,XMM1)

1119:       SSE_COPY_PS(XMM6,XMM7)
1120:       SSE_CMPNEQ_PS(XMM6,XMM2)

1122:       SSE_CMPNEQ_PS(XMM7,XMM3)

1124:       /* Reduce the comparisons to one SSE register */
1125:       SSE_OR_PS(XMM6,XMM7)
1126:       SSE_OR_PS(XMM5,XMM4)
1127:       SSE_OR_PS(XMM5,XMM6)
1128:       SSE_INLINE_END_1;

1130:       /* Reduce the one SSE register to an integer register for branching */
1131:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1132:       MOVEMASK(nonzero,XMM5);

1134:       /* If block is nonzero ... */
1135:       if (nonzero) {
1136:         pv = ba + 16*diag_offset[row];
1137:         PREFETCH_L1(&pv[16]);
1138:         pj = bj + diag_offset[row] + 1;

1140:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1141:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1142:         /* but the diagonal was inverted already */
1143:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

1145:         SSE_INLINE_BEGIN_2(pv,pc)
1146:         /* Column 0, product is accumulated in XMM4 */
1147:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1148:         SSE_SHUFFLE(XMM4,XMM4,0x00)
1149:         SSE_MULT_PS(XMM4,XMM0)

1151:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1152:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1153:         SSE_MULT_PS(XMM5,XMM1)
1154:         SSE_ADD_PS(XMM4,XMM5)

1156:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1157:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1158:         SSE_MULT_PS(XMM6,XMM2)
1159:         SSE_ADD_PS(XMM4,XMM6)

1161:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1162:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1163:         SSE_MULT_PS(XMM7,XMM3)
1164:         SSE_ADD_PS(XMM4,XMM7)

1166:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1167:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

1169:         /* Column 1, product is accumulated in XMM5 */
1170:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1171:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1172:         SSE_MULT_PS(XMM5,XMM0)

1174:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1175:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1176:         SSE_MULT_PS(XMM6,XMM1)
1177:         SSE_ADD_PS(XMM5,XMM6)

1179:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1180:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1181:         SSE_MULT_PS(XMM7,XMM2)
1182:         SSE_ADD_PS(XMM5,XMM7)

1184:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1185:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1186:         SSE_MULT_PS(XMM6,XMM3)
1187:         SSE_ADD_PS(XMM5,XMM6)

1189:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1190:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

1192:         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

1194:         /* Column 2, product is accumulated in XMM6 */
1195:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1196:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1197:         SSE_MULT_PS(XMM6,XMM0)

1199:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1200:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1201:         SSE_MULT_PS(XMM7,XMM1)
1202:         SSE_ADD_PS(XMM6,XMM7)

1204:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1205:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1206:         SSE_MULT_PS(XMM7,XMM2)
1207:         SSE_ADD_PS(XMM6,XMM7)

1209:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1210:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1211:         SSE_MULT_PS(XMM7,XMM3)
1212:         SSE_ADD_PS(XMM6,XMM7)

1214:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1215:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1217:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1218:         /* Column 3, product is accumulated in XMM0 */
1219:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1220:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1221:         SSE_MULT_PS(XMM0,XMM7)

1223:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1224:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1225:         SSE_MULT_PS(XMM1,XMM7)
1226:         SSE_ADD_PS(XMM0,XMM1)

1228:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1229:         SSE_SHUFFLE(XMM1,XMM1,0x00)
1230:         SSE_MULT_PS(XMM1,XMM2)
1231:         SSE_ADD_PS(XMM0,XMM1)

1233:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1234:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1235:         SSE_MULT_PS(XMM3,XMM7)
1236:         SSE_ADD_PS(XMM0,XMM3)

1238:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1239:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

1241:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1242:         /* This is code to be maintained and read by humans after all. */
1243:         /* Copy Multiplier Col 3 into XMM3 */
1244:         SSE_COPY_PS(XMM3,XMM0)
1245:         /* Copy Multiplier Col 2 into XMM2 */
1246:         SSE_COPY_PS(XMM2,XMM6)
1247:         /* Copy Multiplier Col 1 into XMM1 */
1248:         SSE_COPY_PS(XMM1,XMM5)
1249:         /* Copy Multiplier Col 0 into XMM0 */
1250:         SSE_COPY_PS(XMM0,XMM4)
1251:         SSE_INLINE_END_2;

1253:         /* Update the row: */
1254:         nz  = bi[row+1] - diag_offset[row] - 1;
1255:         pv += 16;
1256:         for (j=0; j<nz; j++) {
1257:           PREFETCH_L1(&pv[16]);
1258:           x = rtmp + 16*((unsigned int)pj[j]);
1259: /*            x = rtmp + 4*pj[j]; */

1261:           /* X:=X-M*PV, One column at a time */
1262:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1263:           SSE_INLINE_BEGIN_2(x,pv)
1264:           /* Load First Column of X */
1265:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1266:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1268:           /* Matrix-Vector Product: */
1269:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1270:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1271:           SSE_MULT_PS(XMM5,XMM0)
1272:           SSE_SUB_PS(XMM4,XMM5)

1274:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1275:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1276:           SSE_MULT_PS(XMM6,XMM1)
1277:           SSE_SUB_PS(XMM4,XMM6)

1279:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1280:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1281:           SSE_MULT_PS(XMM7,XMM2)
1282:           SSE_SUB_PS(XMM4,XMM7)

1284:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1285:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1286:           SSE_MULT_PS(XMM5,XMM3)
1287:           SSE_SUB_PS(XMM4,XMM5)

1289:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1290:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1292:           /* Second Column */
1293:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1294:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1296:           /* Matrix-Vector Product: */
1297:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1298:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1299:           SSE_MULT_PS(XMM6,XMM0)
1300:           SSE_SUB_PS(XMM5,XMM6)

1302:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1303:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1304:           SSE_MULT_PS(XMM7,XMM1)
1305:           SSE_SUB_PS(XMM5,XMM7)

1307:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1308:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1309:           SSE_MULT_PS(XMM4,XMM2)
1310:           SSE_SUB_PS(XMM5,XMM4)

1312:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1313:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1314:           SSE_MULT_PS(XMM6,XMM3)
1315:           SSE_SUB_PS(XMM5,XMM6)

1317:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1318:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1320:           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

1322:           /* Third Column */
1323:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1324:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1326:           /* Matrix-Vector Product: */
1327:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1328:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1329:           SSE_MULT_PS(XMM7,XMM0)
1330:           SSE_SUB_PS(XMM6,XMM7)

1332:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1333:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1334:           SSE_MULT_PS(XMM4,XMM1)
1335:           SSE_SUB_PS(XMM6,XMM4)

1337:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1338:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1339:           SSE_MULT_PS(XMM5,XMM2)
1340:           SSE_SUB_PS(XMM6,XMM5)

1342:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1343:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1344:           SSE_MULT_PS(XMM7,XMM3)
1345:           SSE_SUB_PS(XMM6,XMM7)

1347:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1348:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1350:           /* Fourth Column */
1351:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1352:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

1354:           /* Matrix-Vector Product: */
1355:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1356:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1357:           SSE_MULT_PS(XMM5,XMM0)
1358:           SSE_SUB_PS(XMM4,XMM5)

1360:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1361:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1362:           SSE_MULT_PS(XMM6,XMM1)
1363:           SSE_SUB_PS(XMM4,XMM6)

1365:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1366:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1367:           SSE_MULT_PS(XMM7,XMM2)
1368:           SSE_SUB_PS(XMM4,XMM7)

1370:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1371:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1372:           SSE_MULT_PS(XMM5,XMM3)
1373:           SSE_SUB_PS(XMM4,XMM5)

1375:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1376:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1377:           SSE_INLINE_END_2;
1378:           pv += 16;
1379:         }
1380:         PetscLogFlops(128.0*nz+112.0);
1381:       }
1382:       row = (unsigned int)(*bjtmp++);
1383: /*        row = (*bjtmp++)/4; */
1384: /*        bjtmp++; */
1385:     }
1386:     /* finished row so stick it into b->a */
1387:     pv = ba + 16*bi[i];
1388:     pj = bj + bi[i];
1389:     nz = bi[i+1] - bi[i];

1391:     /* Copy x block back into pv block */
1392:     for (j=0; j<nz; j++) {
1393:       x = rtmp+16*((unsigned int)pj[j]);
1394: /*        x  = rtmp+4*pj[j]; */

1396:       SSE_INLINE_BEGIN_2(x,pv)
1397:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1398:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1399:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

1401:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1402:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

1404:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1405:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

1407:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1408:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

1410:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1411:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

1413:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1414:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1416:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1417:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

1419:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1420:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1421:       SSE_INLINE_END_2;
1422:       pv += 16;
1423:     }
1424:     /* invert diagonal block */
1425:     w = ba + 16*diag_offset[i];
1426:     if (pivotinblocks) {
1427:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
1428:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1429:     } else {
1430:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
1431:     }
1432:     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1433:   }

1435:   PetscFree(rtmp);

1437:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1438:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1439:   C->assembled           = PETSC_TRUE;

1441:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1442:   /* Flop Count from inverting diagonal blocks */
1443:   SSE_SCOPE_END;
1444:   return(0);
1445: }

1447: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1448: {
1449:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1451:   int            i,j,n = a->mbs;
1452:   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1453:   unsigned int   row;
1454:   int            *ajtmpold,nz,*bi=b->i;
1455:   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1456:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1457:   MatScalar      *ba    = b->a,*aa = a->a;
1458:   int            nonzero=0;
1459:   PetscBool      pivotinblocks = b->pivotinblocks;
1460:   PetscReal      shift = info->shiftamount;
1461:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

1464:   allowzeropivot = PetscNot(A->erroriffailure);
1465:   SSE_SCOPE_BEGIN;

1467:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1468:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1469:   PetscMalloc1(16*(n+1),&rtmp);
1470:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1471: /*    if ((unsigned long)bj==(unsigned long)aj) { */
1472: /*      colscale = 4; */
1473: /*    } */
1474:   if ((unsigned long)bj==(unsigned long)aj) {
1475:     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
1476:   }

1478:   for (i=0; i<n; i++) {
1479:     nz    = bi[i+1] - bi[i];
1480:     bjtmp = bj + bi[i];
1481:     /* zero out the 4x4 block accumulators */
1482:     /* zero out one register */
1483:     XOR_PS(XMM7,XMM7);
1484:     for  (j=0; j<nz; j++) {
1485:       x = rtmp+16*((unsigned int)bjtmp[j]);
1486: /*        x = rtmp+4*bjtmp[j]; */
1487:       SSE_INLINE_BEGIN_1(x)
1488:       /* Copy zero register to memory locations */
1489:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1490:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1491:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1492:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1493:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1494:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1495:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1496:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1497:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1498:       SSE_INLINE_END_1;
1499:     }
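          /* The eight half-register stores of the zeroed XMM7 above clear all 16 entries
             of each block, i.e. the scalar equivalent of: for (k=0; k<16; k++) x[k] = 0.0; */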
1500:     /* load in initial (unfactored row) */
1501:     nz       = ai[i+1] - ai[i];
1502:     ajtmpold = aj + ai[i];
1503:     v        = aa + 16*ai[i];
1504:     for (j=0; j<nz; j++) {
1505:       x = rtmp+16*ajtmpold[j];
1506: /*        x = rtmp+colscale*ajtmpold[j]; */
1507:       /* Copy v block into x block */
1508:       SSE_INLINE_BEGIN_2(v,x)
1509:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1510:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1511:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

1513:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1514:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

1516:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1517:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

1519:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1520:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

1522:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1523:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

1525:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1526:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

1528:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1529:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

1531:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1532:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1533:       SSE_INLINE_END_2;

1535:       v += 16;
1536:     }
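          /* Row i of A has now been scattered into the dense work row rtmp, one 16-entry
             block per nonzero block column; with natural ordering there is no r[]/ic[]
             permutation indirection, so aj indexes rtmp directly. */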
1537: /*      row = (*bjtmp++)/4; */
1538:     row = (unsigned int)(*bjtmp++);
1539:     while (row < i) {
1540:       pc = rtmp + 16*row;
1541:       SSE_INLINE_BEGIN_1(pc)
1542:       /* Load block from lower triangle */
1543:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1544:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1545:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

1547:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1548:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

1550:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1551:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

1553:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1554:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

1556:       /* Compare block to zero block */

1558:       SSE_COPY_PS(XMM4,XMM7)
1559:       SSE_CMPNEQ_PS(XMM4,XMM0)

1561:       SSE_COPY_PS(XMM5,XMM7)
1562:       SSE_CMPNEQ_PS(XMM5,XMM1)

1564:       SSE_COPY_PS(XMM6,XMM7)
1565:       SSE_CMPNEQ_PS(XMM6,XMM2)

1567:       SSE_CMPNEQ_PS(XMM7,XMM3)

1569:       /* Reduce the comparisons to one SSE register */
1570:       SSE_OR_PS(XMM6,XMM7)
1571:       SSE_OR_PS(XMM5,XMM4)
1572:       SSE_OR_PS(XMM5,XMM6)
1573:       SSE_INLINE_END_1;

1575:       /* Reduce the one SSE register to an integer register for branching */
1576:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1577:       MOVEMASK(nonzero,XMM5);
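            /* MOVEMASK collapses the CMPNEQ/OR result to an integer that is nonzero exactly
               when some entry of the 4x4 block at pc differs from zero -- the vector analogue
               of the 16-term (p1 != 0.0 || ... || p16 != 0.0) test in the scalar kernel. */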

1579:       /* If block is nonzero ... */
1580:       if (nonzero) {
1581:         pv = ba + 16*diag_offset[row];
1582:         PREFETCH_L1(&pv[16]);
1583:         pj = bj + diag_offset[row] + 1;

1585:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1586:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1587:         /* but the diagonal was inverted already */
1588:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
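              /* Concretely, with blocks stored column-major, column c of the multiplier is
                     M(:,c) = pv[4c]*L(:,0) + pv[4c+1]*L(:,1) + pv[4c+2]*L(:,2) + pv[4c+3]*L(:,3)
                 where L(:,k) is held in XMMk and pv points at the inverted diagonal block. */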

1590:         SSE_INLINE_BEGIN_2(pv,pc)
1591:         /* Column 0, product is accumulated in XMM4 */
1592:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1593:         SSE_SHUFFLE(XMM4,XMM4,0x00)
1594:         SSE_MULT_PS(XMM4,XMM0)

1596:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1597:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1598:         SSE_MULT_PS(XMM5,XMM1)
1599:         SSE_ADD_PS(XMM4,XMM5)

1601:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1602:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1603:         SSE_MULT_PS(XMM6,XMM2)
1604:         SSE_ADD_PS(XMM4,XMM6)

1606:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1607:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1608:         SSE_MULT_PS(XMM7,XMM3)
1609:         SSE_ADD_PS(XMM4,XMM7)

1611:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1612:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

1614:         /* Column 1, product is accumulated in XMM5 */
1615:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1616:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1617:         SSE_MULT_PS(XMM5,XMM0)

1619:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1620:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1621:         SSE_MULT_PS(XMM6,XMM1)
1622:         SSE_ADD_PS(XMM5,XMM6)

1624:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1625:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1626:         SSE_MULT_PS(XMM7,XMM2)
1627:         SSE_ADD_PS(XMM5,XMM7)

1629:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1630:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1631:         SSE_MULT_PS(XMM6,XMM3)
1632:         SSE_ADD_PS(XMM5,XMM6)

1634:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1635:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

1637:         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

1639:         /* Column 2, product is accumulated in XMM6 */
1640:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1641:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1642:         SSE_MULT_PS(XMM6,XMM0)

1644:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1645:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1646:         SSE_MULT_PS(XMM7,XMM1)
1647:         SSE_ADD_PS(XMM6,XMM7)

1649:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1650:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1651:         SSE_MULT_PS(XMM7,XMM2)
1652:         SSE_ADD_PS(XMM6,XMM7)

1654:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1655:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1656:         SSE_MULT_PS(XMM7,XMM3)
1657:         SSE_ADD_PS(XMM6,XMM7)

1659:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1660:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1662:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1663:         /* Column 3, product is accumulated in XMM0 */
1664:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1665:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1666:         SSE_MULT_PS(XMM0,XMM7)

1668:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1669:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1670:         SSE_MULT_PS(XMM1,XMM7)
1671:         SSE_ADD_PS(XMM0,XMM1)

1673:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1674:         SSE_SHUFFLE(XMM1,XMM1,0x00)
1675:         SSE_MULT_PS(XMM1,XMM2)
1676:         SSE_ADD_PS(XMM0,XMM1)

1678:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1679:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1680:         SSE_MULT_PS(XMM3,XMM7)
1681:         SSE_ADD_PS(XMM0,XMM3)

1683:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1684:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

1686:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1687:         /* This is code to be maintained and read by humans, after all. */
1688:         /* Copy Multiplier Col 3 into XMM3 */
1689:         SSE_COPY_PS(XMM3,XMM0)
1690:         /* Copy Multiplier Col 2 into XMM2 */
1691:         SSE_COPY_PS(XMM2,XMM6)
1692:         /* Copy Multiplier Col 1 into XMM1 */
1693:         SSE_COPY_PS(XMM1,XMM5)
1694:         /* Copy Multiplier Col 0 into XMM0 */
1695:         SSE_COPY_PS(XMM0,XMM4)
1696:         SSE_INLINE_END_2;

1698:         /* Update the row: */
1699:         nz  = bi[row+1] - diag_offset[row] - 1;
1700:         pv += 16;
1701:         for (j=0; j<nz; j++) {
1702:           PREFETCH_L1(&pv[16]);
1703:           x = rtmp + 16*((unsigned int)pj[j]);
1704: /*            x = rtmp + 4*pj[j]; */

1706:           /* X:=X-M*PV, One column at a time */
1707:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
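                /* In scalar terms, column by column:
                       x(:,c) -= pv[4c]*M(:,0) + pv[4c+1]*M(:,1) + pv[4c+2]*M(:,2) + pv[4c+3]*M(:,3)
                   i.e. the 4x4 block update X := X - M*PV. */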
1708:           SSE_INLINE_BEGIN_2(x,pv)
1709:           /* Load First Column of X*/
1710:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1711:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1713:           /* Matrix-Vector Product: */
1714:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1715:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1716:           SSE_MULT_PS(XMM5,XMM0)
1717:           SSE_SUB_PS(XMM4,XMM5)

1719:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1720:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1721:           SSE_MULT_PS(XMM6,XMM1)
1722:           SSE_SUB_PS(XMM4,XMM6)

1724:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1725:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1726:           SSE_MULT_PS(XMM7,XMM2)
1727:           SSE_SUB_PS(XMM4,XMM7)

1729:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1730:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1731:           SSE_MULT_PS(XMM5,XMM3)
1732:           SSE_SUB_PS(XMM4,XMM5)

1734:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1735:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1737:           /* Second Column */
1738:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1739:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1741:           /* Matrix-Vector Product: */
1742:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1743:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1744:           SSE_MULT_PS(XMM6,XMM0)
1745:           SSE_SUB_PS(XMM5,XMM6)

1747:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1748:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1749:           SSE_MULT_PS(XMM7,XMM1)
1750:           SSE_SUB_PS(XMM5,XMM7)

1752:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1753:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1754:           SSE_MULT_PS(XMM4,XMM2)
1755:           SSE_SUB_PS(XMM5,XMM4)

1757:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1758:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1759:           SSE_MULT_PS(XMM6,XMM3)
1760:           SSE_SUB_PS(XMM5,XMM6)

1762:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1763:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1765:           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

1767:           /* Third Column */
1768:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1769:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1771:           /* Matrix-Vector Product: */
1772:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1773:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1774:           SSE_MULT_PS(XMM7,XMM0)
1775:           SSE_SUB_PS(XMM6,XMM7)

1777:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1778:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1779:           SSE_MULT_PS(XMM4,XMM1)
1780:           SSE_SUB_PS(XMM6,XMM4)

1782:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1783:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1784:           SSE_MULT_PS(XMM5,XMM2)
1785:           SSE_SUB_PS(XMM6,XMM5)

1787:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1788:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1789:           SSE_MULT_PS(XMM7,XMM3)
1790:           SSE_SUB_PS(XMM6,XMM7)

1792:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1793:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1795:           /* Fourth Column */
1796:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1797:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

1799:           /* Matrix-Vector Product: */
1800:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1801:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1802:           SSE_MULT_PS(XMM5,XMM0)
1803:           SSE_SUB_PS(XMM4,XMM5)

1805:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1806:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1807:           SSE_MULT_PS(XMM6,XMM1)
1808:           SSE_SUB_PS(XMM4,XMM6)

1810:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1811:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1812:           SSE_MULT_PS(XMM7,XMM2)
1813:           SSE_SUB_PS(XMM4,XMM7)

1815:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1816:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1817:           SSE_MULT_PS(XMM5,XMM3)
1818:           SSE_SUB_PS(XMM4,XMM5)

1820:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1821:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1822:           SSE_INLINE_END_2;
1823:           pv += 16;
1824:         }
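              /* Flop accounting for the line below: forming the multiplier costs
                 4 columns x (16 multiplies + 12 adds) = 112 flops, and each of the nz
                 updated blocks costs 64 multiplies + 64 subtractions = 128 flops. */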
1825:         PetscLogFlops(128.0*nz+112.0);
1826:       }
1827:       row = (unsigned int)(*bjtmp++);
1828: /*        row = (*bjtmp++)/4; */
1829: /*        bjtmp++; */
1830:     }
1831:     /* finished row so stick it into b->a */
1832:     pv = ba + 16*bi[i];
1833:     pj = bj + bi[i];
1834:     nz = bi[i+1] - bi[i];

1836:     /* Copy x block back into pv block */
1837:     for (j=0; j<nz; j++) {
1838:       x = rtmp+16*((unsigned int)pj[j]);
1839: /*        x  = rtmp+4*pj[j]; */

1841:       SSE_INLINE_BEGIN_2(x,pv)
1842:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1843:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1844:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

1846:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1847:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

1849:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1850:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

1852:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1853:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

1855:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1856:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

1858:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1859:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1861:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1862:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

1864:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1865:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1866:       SSE_INLINE_END_2;
1867:       pv += 16;
1868:     }
1869:     /* invert diagonal block */
1870:     w = ba + 16*diag_offset[i];
1871:     if (pivotinblocks) {
1872:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
1873:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1874:     } else {
1875:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
1876:     }
1877: /*      PetscKernel_A_gets_inverse_A_4_SSE(w); */
1878:     /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1879:   }

1881:   PetscFree(rtmp);

1883:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1884:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1885:   C->assembled           = PETSC_TRUE;

1887:   PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* 4/3*bs^3 with bs = 4 */
1888:   /* Flop Count from inverting diagonal blocks */
1889:   SSE_SCOPE_END;
1890:   return(0);
1891: }

1893: #endif
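      /*
         For orientation only: a minimal sketch (illustrative, not part of this file) of how
         a numeric LU factorization of a bs=4 SEQBAIJ matrix is reached through the public
         API; PETSc then dispatches to one of the MatLUFactorNumeric_SeqBAIJ_4* kernels
         above, depending on ordering, block size and configuration.  Error checking and
         the usual ierr/CHKERRQ idiom are omitted to keep the sketch short.
      */
      #if 0
      #include <petscmat.h>

      static PetscErrorCode FactorExample(Mat A)  /* A: assembled MATSEQBAIJ, bs = 4 */
      {
        Mat           F;
        IS            rperm,cperm;
        MatFactorInfo info;

        MatGetOrdering(A,MATORDERINGNATURAL,&rperm,&cperm);
        MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);
        MatFactorInfoInitialize(&info);
        MatLUFactorSymbolic(F,A,rperm,cperm,&info);
        MatLUFactorNumeric(F,A,&info);            /* ends up in a kernel from this file */
        ISDestroy(&rperm);
        ISDestroy(&cperm);
        MatDestroy(&F);
        return(0);
      }
      #endif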