Actual source code: baijfact11.c

petsc-3.7.3 2016-08-01
  2: /*
  3:     Factorization code for BAIJ format.
  4: */
  5: #include <../src/mat/impls/baij/seq/baij.h>
  6: #include <petsc/private/kernels/blockinvert.h>

  8: /* ------------------------------------------------------------*/
  9: /*
 10:       Version for when blocks are 4 by 4
 11: */
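/*
   Illustrative note added for clarity (not part of the original file): every 4x4
   block occupies bs2 = 16 consecutive MatScalar entries stored column-major, so
   entry (r,c) of the k-th block of a value array sits at vals[16*k + r + 4*c].
   The unrolled index arithmetic below (x = rtmp + 16*col; pc[0..3] holding the
   first column; ...) relies on this layout.  A hypothetical helper making the
   addressing explicit:
*/
static inline MatScalar *BAIJ4BlockEntry_Sketch(MatScalar *vals,PetscInt k,PetscInt r,PetscInt c)
{
  return vals + 16*k + r + 4*c; /* 16 = bs2 scalars per 4x4 block */
}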
 14: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
 15: {
 16:   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
 17:   IS             isrow = b->row,isicol = b->icol;
 19:   const PetscInt *r,*ic;
 20:   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
 21:   PetscInt       *ajtmpold,*ajtmp,nz,row;
 22:   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
 23:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
 24:   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
 25:   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
 26:   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
 27:   MatScalar      m13,m14,m15,m16;
 28:   MatScalar      *ba           = b->a,*aa = a->a;
 29:   PetscBool      pivotinblocks = b->pivotinblocks;
 30:   PetscReal      shift         = info->shiftamount;
 31:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

 34:   ISGetIndices(isrow,&r);
 35:   ISGetIndices(isicol,&ic);
 36:   PetscMalloc1(16*(n+1),&rtmp);
 37:   allowzeropivot = PetscNot(A->erroriffailure);

 39:   for (i=0; i<n; i++) {
 40:     nz    = bi[i+1] - bi[i];
 41:     ajtmp = bj + bi[i];
 42:     for  (j=0; j<nz; j++) {
 43:       x     = rtmp+16*ajtmp[j];
 44:       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
 45:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
 46:     }
 47:     /* load in initial (unfactored row) */
 48:     idx      = r[i];
 49:     nz       = ai[idx+1] - ai[idx];
 50:     ajtmpold = aj + ai[idx];
 51:     v        = aa + 16*ai[idx];
 52:     for (j=0; j<nz; j++) {
 53:       x     = rtmp+16*ic[ajtmpold[j]];
 54:       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
 55:       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
 56:       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
 57:       x[14] = v[14]; x[15] = v[15];
 58:       v    += 16;
 59:     }
 60:     row = *ajtmp++;
 61:     while (row < i) {
 62:       pc  = rtmp + 16*row;
 63:       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
 64:       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
 65:       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
 66:       p15 = pc[14]; p16 = pc[15];
 67:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
 68:           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
 69:           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
 70:           || p16 != 0.0) {
 71:         pv    = ba + 16*diag_offset[row];
 72:         pj    = bj + diag_offset[row] + 1;
 73:         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
 74:         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
 75:         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
 76:         x15   = pv[14]; x16 = pv[15];
 77:         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
 78:         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
 79:         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
 80:         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

 82:         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
 83:         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
 84:         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
 85:         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

 87:         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
 88:         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
 89:         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
 90:         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

 92:         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
 93:         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
 94:         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
 95:         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;

 97:         nz  = bi[row+1] - diag_offset[row] - 1;
 98:         pv += 16;
 99:         for (j=0; j<nz; j++) {
100:           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
101:           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
102:           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
103:           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
104:           x     = rtmp + 16*pj[j];
105:           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
106:           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
107:           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
108:           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

110:           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
111:           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
112:           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
113:           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

115:           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
116:           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
117:           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
118:           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

120:           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
121:           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
122:           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
123:           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

125:           pv += 16;
126:         }
127:         PetscLogFlops(128.0*nz+112.0);
128:       }
129:       row = *ajtmp++;
130:     }
131:     /* finished row so stick it into b->a */
132:     pv = ba + 16*bi[i];
133:     pj = bj + bi[i];
134:     nz = bi[i+1] - bi[i];
135:     for (j=0; j<nz; j++) {
136:       x      = rtmp+16*pj[j];
137:       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
138:       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
139:       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
140:       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
141:       pv    += 16;
142:     }
143:     /* invert diagonal block */
144:     w = ba + 16*diag_offset[i];
145:     if (pivotinblocks) {
146:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
147:       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
148:     } else {
149:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
150:     }
151:   }

153:   PetscFree(rtmp);
154:   ISRestoreIndices(isicol,&ic);
155:   ISRestoreIndices(isrow,&r);

157:   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
158:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
159:   C->assembled           = PETSC_TRUE;

161:   PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
162:   return(0);
163: }

165: /* MatLUFactorNumeric_SeqBAIJ_4 -
166:      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
167:        PetscKernel_A_gets_A_times_B()
168:        PetscKernel_A_gets_A_minus_B_times_C()
169:        PetscKernel_A_gets_inverse_A()
170: */
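/*
   Reference sketches added for clarity (these are NOT the PETSc kernels, which
   live in petsc/private/kernels/blockinvert.h and are fully unrolled).  On
   column-major 4x4 blocks the calls made below amount to:
     PetscKernel_A_gets_A_times_B_4(pc,pv,mwork)      : pc = pc * pv
     PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv)  : v  = v - pc * pv
     PetscKernel_A_gets_inverse_A_4(pv,shift,...)     : pv = inv(pv), with optional
                                                        shift and zero-pivot detection
*/
static void Sketch_A_gets_A_times_B_4(MatScalar *A,const MatScalar *B,MatScalar *W)
{
  PetscInt r,c,k;
  for (k=0; k<16; k++) W[k] = A[k];                      /* W = copy of A */
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      A[r+4*c] = 0.0;
      for (k=0; k<4; k++) A[r+4*c] += W[r+4*k]*B[k+4*c]; /* A = W*B */
    }
  }
}

static void Sketch_A_gets_A_minus_B_times_C_4(MatScalar *A,const MatScalar *B,const MatScalar *C)
{
  PetscInt r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      for (k=0; k<4; k++) A[r+4*c] -= B[r+4*k]*C[k+4*c]; /* A = A - B*C */
    }
  }
}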

174: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
175: {
176:   Mat            C     = B;
177:   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
178:   IS             isrow = b->row,isicol = b->icol;
180:   const PetscInt *r,*ic;
181:   PetscInt       i,j,k,nz,nzL,row;
182:   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
183:   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
184:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
185:   PetscInt       flg;
186:   PetscReal      shift;
187:   PetscBool      allowzeropivot,zeropivotdetected;

190:   allowzeropivot = PetscNot(A->erroriffailure);
191:   ISGetIndices(isrow,&r);
192:   ISGetIndices(isicol,&ic);

194:   if (info->shifttype == MAT_SHIFT_NONE) {
195:     shift = 0;
196:   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
197:     shift = info->shiftamount;
198:   }

200:   /* generate work space needed by the factorization */
201:   PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
202:   PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));

204:   for (i=0; i<n; i++) {
205:     /* zero rtmp */
206:     /* L part */
207:     nz    = bi[i+1] - bi[i];
208:     bjtmp = bj + bi[i];
209:     for  (j=0; j<nz; j++) {
210:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
211:     }

213:     /* U part */
214:     nz    = bdiag[i]-bdiag[i+1];
215:     bjtmp = bj + bdiag[i+1]+1;
216:     for  (j=0; j<nz; j++) {
217:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
218:     }

220:     /* load in initial (unfactored row) */
221:     nz    = ai[r[i]+1] - ai[r[i]];
222:     ajtmp = aj + ai[r[i]];
223:     v     = aa + bs2*ai[r[i]];
224:     for (j=0; j<nz; j++) {
225:       PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
226:     }

228:     /* elimination */
229:     bjtmp = bj + bi[i];
230:     nzL   = bi[i+1] - bi[i];
231:     for (k=0; k < nzL; k++) {
232:       row = bjtmp[k];
233:       pc  = rtmp + bs2*row;
234:       for (flg=0,j=0; j<bs2; j++) {
235:         if (pc[j]!=0.0) {
236:           flg = 1;
237:           break;
238:         }
239:       }
240:       if (flg) {
241:         pv = b->a + bs2*bdiag[row];
242:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
243:         PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);

245:         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
246:         pv = b->a + bs2*(bdiag[row+1]+1);
247:         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
248:         for (j=0; j<nz; j++) {
249:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
250:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
251:           v    = rtmp + bs2*pj[j];
252:           PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
253:           pv  += bs2;
254:         }
255:         PetscLogFlops(128*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 = 128*nz + 112 for bs=4 */
256:       }
257:     }

259:     /* finished row so stick it into b->a */
260:     /* L part */
261:     pv = b->a + bs2*bi[i];
262:     pj = b->j + bi[i];
263:     nz = bi[i+1] - bi[i];
264:     for (j=0; j<nz; j++) {
265:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
266:     }

268:     /* Mark diagonal and invert diagonal for simpler triangular solves */
269:     pv   = b->a + bs2*bdiag[i];
270:     pj   = b->j + bdiag[i];
271:     PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
272:     PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);
273:     if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

275:     /* U part */
276:     pv = b->a + bs2*(bdiag[i+1]+1);
277:     pj = b->j + bdiag[i+1]+1;
278:     nz = bdiag[i] - bdiag[i+1] - 1;
279:     for (j=0; j<nz; j++) {
280:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
281:     }
282:   }

284:   PetscFree2(rtmp,mwork);
285:   ISRestoreIndices(isicol,&ic);
286:   ISRestoreIndices(isrow,&r);

288:   C->ops->solve          = MatSolve_SeqBAIJ_4;
289:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
290:   C->assembled           = PETSC_TRUE;

292:   PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
293:   return(0);
294: }

298: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
299: {
300: /*
301:     Default Version for when blocks are 4 by 4 Using natural ordering
302: */
303:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
305:   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
306:   PetscInt       *ajtmpold,*ajtmp,nz,row;
307:   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
308:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
309:   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
310:   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
311:   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
312:   MatScalar      m13,m14,m15,m16;
313:   MatScalar      *ba           = b->a,*aa = a->a;
314:   PetscBool      pivotinblocks = b->pivotinblocks;
315:   PetscReal      shift         = info->shiftamount;
316:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

319:   allowzeropivot = PetscNot(A->erroriffailure);
320:   PetscMalloc1(16*(n+1),&rtmp);

322:   for (i=0; i<n; i++) {
323:     nz    = bi[i+1] - bi[i];
324:     ajtmp = bj + bi[i];
325:     for  (j=0; j<nz; j++) {
326:       x     = rtmp+16*ajtmp[j];
327:       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
328:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
329:     }
330:     /* load in initial (unfactored row) */
331:     nz       = ai[i+1] - ai[i];
332:     ajtmpold = aj + ai[i];
333:     v        = aa + 16*ai[i];
334:     for (j=0; j<nz; j++) {
335:       x     = rtmp+16*ajtmpold[j];
336:       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
337:       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
338:       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
339:       x[14] = v[14]; x[15] = v[15];
340:       v    += 16;
341:     }
342:     row = *ajtmp++;
343:     while (row < i) {
344:       pc  = rtmp + 16*row;
345:       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
346:       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
347:       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
348:       p15 = pc[14]; p16 = pc[15];
349:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
350:           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
351:           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
352:           || p16 != 0.0) {
353:         pv    = ba + 16*diag_offset[row];
354:         pj    = bj + diag_offset[row] + 1;
355:         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
356:         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
357:         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
358:         x15   = pv[14]; x16 = pv[15];
359:         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
360:         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
361:         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
362:         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

364:         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
365:         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
366:         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
367:         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

369:         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
370:         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
371:         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
372:         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

374:         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
375:         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
376:         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
377:         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
378:         nz     = bi[row+1] - diag_offset[row] - 1;
379:         pv    += 16;
380:         for (j=0; j<nz; j++) {
381:           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
382:           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
383:           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
384:           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
385:           x     = rtmp + 16*pj[j];
386:           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
387:           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
388:           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
389:           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

391:           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
392:           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
393:           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
394:           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

396:           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
397:           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
398:           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
399:           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

401:           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
402:           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
403:           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
404:           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

406:           pv += 16;
407:         }
408:         PetscLogFlops(128.0*nz+112.0);
409:       }
410:       row = *ajtmp++;
411:     }
412:     /* finished row so stick it into b->a */
413:     pv = ba + 16*bi[i];
414:     pj = bj + bi[i];
415:     nz = bi[i+1] - bi[i];
416:     for (j=0; j<nz; j++) {
417:       x      = rtmp+16*pj[j];
418:       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
419:       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
420:       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
421:       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
422:       pv    += 16;
423:     }
424:     /* invert diagonal block */
425:     w = ba + 16*diag_offset[i];
426:     if (pivotinblocks) {
427:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
428:       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
429:     } else {
430:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
431:     }
432:   }

434:   PetscFree(rtmp);

436:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
437:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
438:   C->assembled           = PETSC_TRUE;

440:   PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
441:   return(0);
442: }
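/*
   Layout note added for clarity: in the non-inplace factored matrix b used by the
   routines above and below, block row i is stored as
     L part:             bj[bi[i] .. bi[i+1]-1]
     inverted diagonal:  bj[bdiag[i]]
     U part:             bj[bdiag[i+1]+1 .. bdiag[i]-1]
   so bdiag[] decreases with i and U(i,:) is addressed relative to bdiag[i+1].
   A hypothetical helper counting the blocks of one factored row, using only
   expressions that already appear in the loops above:
*/
static PetscInt Sketch_FactoredRowBlockCount(const Mat_SeqBAIJ *b,PetscInt i)
{
  const PetscInt *bi = b->i,*bdiag = b->diag;
  PetscInt       nzL = bi[i+1] - bi[i];           /* blocks in L(i,:)                */
  PetscInt       nzU = bdiag[i] - bdiag[i+1] - 1; /* blocks in U(i,:), diag excluded */
  return nzL + 1 + nzU;                           /* + the inverted diagonal block   */
}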

444: /*
445:   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
446:     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
447: */
450: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
451: {
452:   Mat            C =B;
453:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
455:   PetscInt       i,j,k,nz,nzL,row;
456:   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
457:   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
458:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
459:   PetscInt       flg;
460:   PetscReal      shift;
461:   PetscBool      allowzeropivot,zeropivotdetected;

464:   allowzeropivot = PetscNot(A->erroriffailure);

466:   /* generate work space needed by the factorization */
467:   PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
468:   PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));

470:   if (info->shifttype == MAT_SHIFT_NONE) {
471:     shift = 0;
472:   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
473:     shift = info->shiftamount;
474:   }

476:   for (i=0; i<n; i++) {
477:     /* zero rtmp */
478:     /* L part */
479:     nz    = bi[i+1] - bi[i];
480:     bjtmp = bj + bi[i];
481:     for  (j=0; j<nz; j++) {
482:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
483:     }

485:     /* U part */
486:     nz    = bdiag[i] - bdiag[i+1];
487:     bjtmp = bj + bdiag[i+1]+1;
488:     for  (j=0; j<nz; j++) {
489:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
490:     }

492:     /* load in initial (unfactored row) */
493:     nz    = ai[i+1] - ai[i];
494:     ajtmp = aj + ai[i];
495:     v     = aa + bs2*ai[i];
496:     for (j=0; j<nz; j++) {
497:       PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
498:     }

500:     /* elimination */
501:     bjtmp = bj + bi[i];
502:     nzL   = bi[i+1] - bi[i];
503:     for (k=0; k < nzL; k++) {
504:       row = bjtmp[k];
505:       pc  = rtmp + bs2*row;
506:       for (flg=0,j=0; j<bs2; j++) {
507:         if (pc[j]!=0.0) {
508:           flg = 1;
509:           break;
510:         }
511:       }
512:       if (flg) {
513:         pv = b->a + bs2*bdiag[row];
514:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
515:         PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);

517:         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
518:         pv = b->a + bs2*(bdiag[row+1]+1);
519:         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
520:         for (j=0; j<nz; j++) {
521:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
522:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
523:           v    = rtmp + bs2*pj[j];
524:           PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
525:           pv  += bs2;
526:         }
527:         PetscLogFlops(128*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 = 128*nz + 112 for bs=4 */
528:       }
529:     }

531:     /* finished row so stick it into b->a */
532:     /* L part */
533:     pv = b->a + bs2*bi[i];
534:     pj = b->j + bi[i];
535:     nz = bi[i+1] - bi[i];
536:     for (j=0; j<nz; j++) {
537:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
538:     }

540:     /* Mark diagonal and invert diagonal for simpler triangular solves */
541:     pv   = b->a + bs2*bdiag[i];
542:     pj   = b->j + bdiag[i];
543:     PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
544:     PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);
545:     if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

547:     /* U part */
548:     pv = b->a + bs2*(bdiag[i+1]+1);
549:     pj = b->j + bdiag[i+1]+1;
550:     nz = bdiag[i] - bdiag[i+1] - 1;
551:     for (j=0; j<nz; j++) {
552:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
553:     }
554:   }
555:   PetscFree2(rtmp,mwork);

557:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
558:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
559:   C->assembled           = PETSC_TRUE;

561:   PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
562:   return(0);
563: }
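/*
   Usage sketch added for illustration (not in the original file; n and nzPerRow
   are placeholders): the numeric routines above are reached automatically when an
   LU factorization is requested for a SeqBAIJ matrix with block size 4, e.g.
*/
#if 0
  Mat           A,F;
  IS            isrow,iscol;
  MatFactorInfo info;
  MatCreateSeqBAIJ(PETSC_COMM_SELF,4,n,n,nzPerRow,NULL,&A);
  /* ... set values and assemble A ... */
  MatFactorInfoInitialize(&info);
  MatGetOrdering(A,MATORDERINGNATURAL,&isrow,&iscol);
  MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);
  MatLUFactorSymbolic(F,A,isrow,iscol,&info);
  MatLUFactorNumeric(F,A,&info); /* dispatches to one of the MatLUFactorNumeric_SeqBAIJ_4* routines above */
#endif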

565: #if defined(PETSC_HAVE_SSE)

567: #include PETSC_HAVE_SSE

569: /* SSE Version for when blocks are 4 by 4 Using natural ordering */
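/*
   Sketch added for clarity (not the expansion of the SSE_* macros): the routine
   below forms each column of a 4x4 block product by broadcasting one scalar of pv
   and accumulating it against a whole column of pc.  In plain intrinsics, and
   assuming single-precision MatScalar (the *_PS instructions operate on 4 floats),
   one column of M = P*X would look roughly like:
*/
#include <xmmintrin.h> /* for the illustrative intrinsics only */
static void Sketch_Column_P_times_X(const float *P,const float *X,float *M,int c)
{
  __m128 acc = _mm_mul_ps(_mm_set1_ps(X[4*c+0]),_mm_load_ps(P+0));           /* X(0,c)*P(:,0)  */
  acc = _mm_add_ps(acc,_mm_mul_ps(_mm_set1_ps(X[4*c+1]),_mm_load_ps(P+4)));  /* +X(1,c)*P(:,1) */
  acc = _mm_add_ps(acc,_mm_mul_ps(_mm_set1_ps(X[4*c+2]),_mm_load_ps(P+8)));  /* +X(2,c)*P(:,2) */
  acc = _mm_add_ps(acc,_mm_mul_ps(_mm_set1_ps(X[4*c+3]),_mm_load_ps(P+12))); /* +X(3,c)*P(:,3) */
  _mm_store_ps(M+4*c,acc);                                                   /* M(:,c)         */
}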
572: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
573: {
574:   Mat            C  = B;               /* C aliases the factor matrix B */
575:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
576:   int            i,j,n = a->mbs;
577:   int            *bj = b->j,*bjtmp,*pj;
578:   int            row;
579:   int            *ajtmpold,nz,*bi=b->i;
580:   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
581:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
582:   MatScalar      *ba    = b->a,*aa = a->a;
583:   int            nonzero=0;
584:   PetscBool      pivotinblocks = b->pivotinblocks;
585:   PetscReal      shift         = info->shiftamount;
586:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

589:   allowzeropivot = PetscNot(A->erroriffailure);
590:   SSE_SCOPE_BEGIN;

592:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
593:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
594:   PetscMalloc1(16*(n+1),&rtmp);
595:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
596: /*    if ((unsigned long)bj==(unsigned long)aj) { */
597: /*      colscale = 4; */
598: /*    } */
599:   for (i=0; i<n; i++) {
600:     nz    = bi[i+1] - bi[i];
601:     bjtmp = bj + bi[i];
602:     /* zero out the 4x4 block accumulators */
603:     /* zero out one register */
604:     XOR_PS(XMM7,XMM7);
605:     for  (j=0; j<nz; j++) {
606:       x = rtmp+16*bjtmp[j];
607: /*        x = rtmp+4*bjtmp[j]; */
608:       SSE_INLINE_BEGIN_1(x)
609:       /* Copy zero register to memory locations */
610:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
611:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
612:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
613:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
614:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
615:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
616:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
617:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
618:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
619:       SSE_INLINE_END_1;
620:     }
621:     /* load in initial (unfactored row) */
622:     nz       = ai[i+1] - ai[i];
623:     ajtmpold = aj + ai[i];
624:     v        = aa + 16*ai[i];
625:     for (j=0; j<nz; j++) {
626:       x = rtmp+16*ajtmpold[j];
627: /*        x = rtmp+colscale*ajtmpold[j]; */
628:       /* Copy v block into x block */
629:       SSE_INLINE_BEGIN_2(v,x)
630:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
631:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
632:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

634:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
635:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

637:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
638:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

640:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
641:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

643:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
644:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

646:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
647:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

649:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
650:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

652:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
653:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
654:       SSE_INLINE_END_2;

656:       v += 16;
657:     }
658: /*      row = (*bjtmp++)/4; */
659:     row = *bjtmp++;
660:     while (row < i) {
661:       pc = rtmp + 16*row;
662:       SSE_INLINE_BEGIN_1(pc)
663:       /* Load block from lower triangle */
664:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
665:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
666:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

668:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
669:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

671:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
672:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

674:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
675:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

677:       /* Compare block to zero block */

679:       SSE_COPY_PS(XMM4,XMM7)
680:       SSE_CMPNEQ_PS(XMM4,XMM0)

682:       SSE_COPY_PS(XMM5,XMM7)
683:       SSE_CMPNEQ_PS(XMM5,XMM1)

685:       SSE_COPY_PS(XMM6,XMM7)
686:       SSE_CMPNEQ_PS(XMM6,XMM2)

688:       SSE_CMPNEQ_PS(XMM7,XMM3)

690:       /* Reduce the comparisons to one SSE register */
691:       SSE_OR_PS(XMM6,XMM7)
692:       SSE_OR_PS(XMM5,XMM4)
693:       SSE_OR_PS(XMM5,XMM6)
694:       SSE_INLINE_END_1;

696:       /* Reduce the one SSE register to an integer register for branching */
697:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
698:       MOVEMASK(nonzero,XMM5);

700:       /* If block is nonzero ... */
701:       if (nonzero) {
702:         pv = ba + 16*diag_offset[row];
703:         PREFETCH_L1(&pv[16]);
704:         pj = bj + diag_offset[row] + 1;

706:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
707:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
708:         /* but the diagonal was inverted already */
709:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

711:         SSE_INLINE_BEGIN_2(pv,pc)
712:         /* Column 0, product is accumulated in XMM4 */
713:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
714:         SSE_SHUFFLE(XMM4,XMM4,0x00)
715:         SSE_MULT_PS(XMM4,XMM0)

717:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
718:         SSE_SHUFFLE(XMM5,XMM5,0x00)
719:         SSE_MULT_PS(XMM5,XMM1)
720:         SSE_ADD_PS(XMM4,XMM5)

722:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
723:         SSE_SHUFFLE(XMM6,XMM6,0x00)
724:         SSE_MULT_PS(XMM6,XMM2)
725:         SSE_ADD_PS(XMM4,XMM6)

727:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
728:         SSE_SHUFFLE(XMM7,XMM7,0x00)
729:         SSE_MULT_PS(XMM7,XMM3)
730:         SSE_ADD_PS(XMM4,XMM7)

732:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
733:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

735:         /* Column 1, product is accumulated in XMM5 */
736:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
737:         SSE_SHUFFLE(XMM5,XMM5,0x00)
738:         SSE_MULT_PS(XMM5,XMM0)

740:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
741:         SSE_SHUFFLE(XMM6,XMM6,0x00)
742:         SSE_MULT_PS(XMM6,XMM1)
743:         SSE_ADD_PS(XMM5,XMM6)

745:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
746:         SSE_SHUFFLE(XMM7,XMM7,0x00)
747:         SSE_MULT_PS(XMM7,XMM2)
748:         SSE_ADD_PS(XMM5,XMM7)

750:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
751:         SSE_SHUFFLE(XMM6,XMM6,0x00)
752:         SSE_MULT_PS(XMM6,XMM3)
753:         SSE_ADD_PS(XMM5,XMM6)

755:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
756:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

758:         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

760:         /* Column 2, product is accumulated in XMM6 */
761:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
762:         SSE_SHUFFLE(XMM6,XMM6,0x00)
763:         SSE_MULT_PS(XMM6,XMM0)

765:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
766:         SSE_SHUFFLE(XMM7,XMM7,0x00)
767:         SSE_MULT_PS(XMM7,XMM1)
768:         SSE_ADD_PS(XMM6,XMM7)

770:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
771:         SSE_SHUFFLE(XMM7,XMM7,0x00)
772:         SSE_MULT_PS(XMM7,XMM2)
773:         SSE_ADD_PS(XMM6,XMM7)

775:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
776:         SSE_SHUFFLE(XMM7,XMM7,0x00)
777:         SSE_MULT_PS(XMM7,XMM3)
778:         SSE_ADD_PS(XMM6,XMM7)

780:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
781:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

783:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
784:         /* Column 3, product is accumulated in XMM0 */
785:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
786:         SSE_SHUFFLE(XMM7,XMM7,0x00)
787:         SSE_MULT_PS(XMM0,XMM7)

789:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
790:         SSE_SHUFFLE(XMM7,XMM7,0x00)
791:         SSE_MULT_PS(XMM1,XMM7)
792:         SSE_ADD_PS(XMM0,XMM1)

794:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
795:         SSE_SHUFFLE(XMM1,XMM1,0x00)
796:         SSE_MULT_PS(XMM1,XMM2)
797:         SSE_ADD_PS(XMM0,XMM1)

799:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
800:         SSE_SHUFFLE(XMM7,XMM7,0x00)
801:         SSE_MULT_PS(XMM3,XMM7)
802:         SSE_ADD_PS(XMM0,XMM3)

804:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
805:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

807:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
808:         /* This is code to be maintained and read by humans after all. */
809:         /* Copy Multiplier Col 3 into XMM3 */
810:         SSE_COPY_PS(XMM3,XMM0)
811:         /* Copy Multiplier Col 2 into XMM2 */
812:         SSE_COPY_PS(XMM2,XMM6)
813:         /* Copy Multiplier Col 1 into XMM1 */
814:         SSE_COPY_PS(XMM1,XMM5)
815:         /* Copy Multiplier Col 0 into XMM0 */
816:         SSE_COPY_PS(XMM0,XMM4)
817:         SSE_INLINE_END_2;

819:         /* Update the row: */
820:         nz  = bi[row+1] - diag_offset[row] - 1;
821:         pv += 16;
822:         for (j=0; j<nz; j++) {
823:           PREFETCH_L1(&pv[16]);
824:           x = rtmp + 16*pj[j];
825: /*            x = rtmp + 4*pj[j]; */

827:           /* X:=X-M*PV, One column at a time */
828:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
829:           SSE_INLINE_BEGIN_2(x,pv)
830:           /* Load First Column of X*/
831:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
832:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

834:           /* Matrix-Vector Product: */
835:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
836:           SSE_SHUFFLE(XMM5,XMM5,0x00)
837:           SSE_MULT_PS(XMM5,XMM0)
838:           SSE_SUB_PS(XMM4,XMM5)

840:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
841:           SSE_SHUFFLE(XMM6,XMM6,0x00)
842:           SSE_MULT_PS(XMM6,XMM1)
843:           SSE_SUB_PS(XMM4,XMM6)

845:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
846:           SSE_SHUFFLE(XMM7,XMM7,0x00)
847:           SSE_MULT_PS(XMM7,XMM2)
848:           SSE_SUB_PS(XMM4,XMM7)

850:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
851:           SSE_SHUFFLE(XMM5,XMM5,0x00)
852:           SSE_MULT_PS(XMM5,XMM3)
853:           SSE_SUB_PS(XMM4,XMM5)

855:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
856:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

858:           /* Second Column */
859:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
860:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

862:           /* Matrix-Vector Product: */
863:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
864:           SSE_SHUFFLE(XMM6,XMM6,0x00)
865:           SSE_MULT_PS(XMM6,XMM0)
866:           SSE_SUB_PS(XMM5,XMM6)

868:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
869:           SSE_SHUFFLE(XMM7,XMM7,0x00)
870:           SSE_MULT_PS(XMM7,XMM1)
871:           SSE_SUB_PS(XMM5,XMM7)

873:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
874:           SSE_SHUFFLE(XMM4,XMM4,0x00)
875:           SSE_MULT_PS(XMM4,XMM2)
876:           SSE_SUB_PS(XMM5,XMM4)

878:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
879:           SSE_SHUFFLE(XMM6,XMM6,0x00)
880:           SSE_MULT_PS(XMM6,XMM3)
881:           SSE_SUB_PS(XMM5,XMM6)

883:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
884:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

886:           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

888:           /* Third Column */
889:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
890:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

892:           /* Matrix-Vector Product: */
893:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
894:           SSE_SHUFFLE(XMM7,XMM7,0x00)
895:           SSE_MULT_PS(XMM7,XMM0)
896:           SSE_SUB_PS(XMM6,XMM7)

898:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
899:           SSE_SHUFFLE(XMM4,XMM4,0x00)
900:           SSE_MULT_PS(XMM4,XMM1)
901:           SSE_SUB_PS(XMM6,XMM4)

903:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
904:           SSE_SHUFFLE(XMM5,XMM5,0x00)
905:           SSE_MULT_PS(XMM5,XMM2)
906:           SSE_SUB_PS(XMM6,XMM5)

908:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
909:           SSE_SHUFFLE(XMM7,XMM7,0x00)
910:           SSE_MULT_PS(XMM7,XMM3)
911:           SSE_SUB_PS(XMM6,XMM7)

913:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
914:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

916:           /* Fourth Column */
917:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
918:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

920:           /* Matrix-Vector Product: */
921:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
922:           SSE_SHUFFLE(XMM5,XMM5,0x00)
923:           SSE_MULT_PS(XMM5,XMM0)
924:           SSE_SUB_PS(XMM4,XMM5)

926:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
927:           SSE_SHUFFLE(XMM6,XMM6,0x00)
928:           SSE_MULT_PS(XMM6,XMM1)
929:           SSE_SUB_PS(XMM4,XMM6)

931:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
932:           SSE_SHUFFLE(XMM7,XMM7,0x00)
933:           SSE_MULT_PS(XMM7,XMM2)
934:           SSE_SUB_PS(XMM4,XMM7)

936:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
937:           SSE_SHUFFLE(XMM5,XMM5,0x00)
938:           SSE_MULT_PS(XMM5,XMM3)
939:           SSE_SUB_PS(XMM4,XMM5)

941:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
942:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
943:           SSE_INLINE_END_2;
944:           pv += 16;
945:         }
946:         PetscLogFlops(128.0*nz+112.0);
947:       }
948:       row = *bjtmp++;
949: /*        row = (*bjtmp++)/4; */
950:     }
951:     /* finished row so stick it into b->a */
952:     pv = ba + 16*bi[i];
953:     pj = bj + bi[i];
954:     nz = bi[i+1] - bi[i];

956:     /* Copy x block back into pv block */
957:     for (j=0; j<nz; j++) {
958:       x = rtmp+16*pj[j];
959: /*        x  = rtmp+4*pj[j]; */

961:       SSE_INLINE_BEGIN_2(x,pv)
962:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
963:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
964:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

966:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
967:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

969:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
970:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

972:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
973:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

975:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
976:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

978:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
979:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

981:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
982:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

984:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
985:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
986:       SSE_INLINE_END_2;
987:       pv += 16;
988:     }
989:     /* invert diagonal block */
990:     w = ba + 16*diag_offset[i];
991:     if (pivotinblocks) {
992:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
993:       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
994:     } else {
995:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
996:     }
997: /*      PetscKernel_A_gets_inverse_A_4_SSE(w); */
998:     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
999:   }

1001:   PetscFree(rtmp);

1003:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1004:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1005:   C->assembled           = PETSC_TRUE;

1007:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1008:   /* Flop Count from inverting diagonal blocks */
1009:   SSE_SCOPE_END;
1010:   return(0);
1011: }

1015: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
1016: {
1017:   Mat            A  =C;
1018:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1020:   int            i,j,n = a->mbs;
1021:   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1022:   unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
1023:   unsigned int   row;
1024:   int            nz,*bi=b->i;
1025:   int            *diag_offset = b->diag,*ai=a->i;
1026:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1027:   MatScalar      *ba    = b->a,*aa = a->a;
1028:   int            nonzero=0;
1029:   PetscBool      pivotinblocks = b->pivotinblocks;
1030:   PetscReal      shift = info->shiftamount;
1031:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

1034:   allowzeropivot = PetscNot(A->erroriffailure);
1035:   SSE_SCOPE_BEGIN;

1037:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1038:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1039:   PetscMalloc1(16*(n+1),&rtmp);
1040:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1041: /*    if ((unsigned long)bj==(unsigned long)aj) { */
1042: /*      colscale = 4; */
1043: /*    } */

1045:   for (i=0; i<n; i++) {
1046:     nz    = bi[i+1] - bi[i];
1047:     bjtmp = bj + bi[i];
1048:     /* zero out the 4x4 block accumulators */
1049:     /* zero out one register */
1050:     XOR_PS(XMM7,XMM7);
1051:     for  (j=0; j<nz; j++) {
1052:       x = rtmp+16*((unsigned int)bjtmp[j]);
1053: /*        x = rtmp+4*bjtmp[j]; */
1054:       SSE_INLINE_BEGIN_1(x)
1055:       /* Copy zero register to memory locations */
1056:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1057:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1058:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1059:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1060:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1061:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1062:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1063:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1064:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1065:       SSE_INLINE_END_1;
1066:     }
1067:     /* load in initial (unfactored row) */
1068:     nz    = ai[i+1] - ai[i];
1069:     ajtmp = aj + ai[i];
1070:     v     = aa + 16*ai[i];
1071:     for (j=0; j<nz; j++) {
1072:       x = rtmp+16*((unsigned int)ajtmp[j]);
1073: /*        x = rtmp+colscale*ajtmp[j]; */
1074:       /* Copy v block into x block */
1075:       SSE_INLINE_BEGIN_2(v,x)
1076:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1077:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1078:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

1080:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1081:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

1083:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1084:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

1086:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1087:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

1089:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1090:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

1092:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1093:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

1095:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1096:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

1098:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1099:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1100:       SSE_INLINE_END_2;

1102:       v += 16;
1103:     }
1104: /*      row = (*bjtmp++)/4; */
1105:     row = (unsigned int)(*bjtmp++);
1106:     while (row < i) {
1107:       pc = rtmp + 16*row;
1108:       SSE_INLINE_BEGIN_1(pc)
1109:       /* Load block from lower triangle */
1110:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1111:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1112:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

1114:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1115:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

1117:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1118:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

1120:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1121:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

1123:       /* Compare block to zero block */

1125:       SSE_COPY_PS(XMM4,XMM7)
1126:       SSE_CMPNEQ_PS(XMM4,XMM0)

1128:       SSE_COPY_PS(XMM5,XMM7)
1129:       SSE_CMPNEQ_PS(XMM5,XMM1)

1131:       SSE_COPY_PS(XMM6,XMM7)
1132:       SSE_CMPNEQ_PS(XMM6,XMM2)

1134:       SSE_CMPNEQ_PS(XMM7,XMM3)

1136:       /* Reduce the comparisons to one SSE register */
1137:       SSE_OR_PS(XMM6,XMM7)
1138:       SSE_OR_PS(XMM5,XMM4)
1139:       SSE_OR_PS(XMM5,XMM6)
1140:       SSE_INLINE_END_1;

1142:       /* Reduce the one SSE register to an integer register for branching */
1143:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1144:       MOVEMASK(nonzero,XMM5);

1146:       /* If block is nonzero ... */
1147:       if (nonzero) {
1148:         pv = ba + 16*diag_offset[row];
1149:         PREFETCH_L1(&pv[16]);
1150:         pj = bj + diag_offset[row] + 1;

1152:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1153:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1154:         /* but the diagonal was inverted already */
1155:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

1157:         SSE_INLINE_BEGIN_2(pv,pc)
1158:         /* Column 0, product is accumulated in XMM4 */
1159:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1160:         SSE_SHUFFLE(XMM4,XMM4,0x00)
1161:         SSE_MULT_PS(XMM4,XMM0)

1163:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1164:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1165:         SSE_MULT_PS(XMM5,XMM1)
1166:         SSE_ADD_PS(XMM4,XMM5)

1168:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1169:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1170:         SSE_MULT_PS(XMM6,XMM2)
1171:         SSE_ADD_PS(XMM4,XMM6)

1173:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1174:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1175:         SSE_MULT_PS(XMM7,XMM3)
1176:         SSE_ADD_PS(XMM4,XMM7)

1178:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1179:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

1181:         /* Column 1, product is accumulated in XMM5 */
1182:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1183:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1184:         SSE_MULT_PS(XMM5,XMM0)

1186:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1187:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1188:         SSE_MULT_PS(XMM6,XMM1)
1189:         SSE_ADD_PS(XMM5,XMM6)

1191:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1192:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1193:         SSE_MULT_PS(XMM7,XMM2)
1194:         SSE_ADD_PS(XMM5,XMM7)

1196:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1197:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1198:         SSE_MULT_PS(XMM6,XMM3)
1199:         SSE_ADD_PS(XMM5,XMM6)

1201:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1202:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

1204:         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

1206:         /* Column 2, product is accumulated in XMM6 */
1207:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1208:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1209:         SSE_MULT_PS(XMM6,XMM0)

1211:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1212:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1213:         SSE_MULT_PS(XMM7,XMM1)
1214:         SSE_ADD_PS(XMM6,XMM7)

1216:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1217:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1218:         SSE_MULT_PS(XMM7,XMM2)
1219:         SSE_ADD_PS(XMM6,XMM7)

1221:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1222:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1223:         SSE_MULT_PS(XMM7,XMM3)
1224:         SSE_ADD_PS(XMM6,XMM7)

1226:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1227:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1229:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1230:         /* Column 3, product is accumulated in XMM0 */
1231:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1232:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1233:         SSE_MULT_PS(XMM0,XMM7)

1235:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1236:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1237:         SSE_MULT_PS(XMM1,XMM7)
1238:         SSE_ADD_PS(XMM0,XMM1)

1240:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1241:         SSE_SHUFFLE(XMM1,XMM1,0x00)
1242:         SSE_MULT_PS(XMM1,XMM2)
1243:         SSE_ADD_PS(XMM0,XMM1)

1245:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1246:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1247:         SSE_MULT_PS(XMM3,XMM7)
1248:         SSE_ADD_PS(XMM0,XMM3)

1250:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1251:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

1253:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1254:         /* This is code to be maintained and read by humans after all. */
1255:         /* Copy Multiplier Col 3 into XMM3 */
1256:         SSE_COPY_PS(XMM3,XMM0)
1257:         /* Copy Multiplier Col 2 into XMM2 */
1258:         SSE_COPY_PS(XMM2,XMM6)
1259:         /* Copy Multiplier Col 1 into XMM1 */
1260:         SSE_COPY_PS(XMM1,XMM5)
1261:         /* Copy Multiplier Col 0 into XMM0 */
1262:         SSE_COPY_PS(XMM0,XMM4)
1263:         SSE_INLINE_END_2;

1265:         /* Update the row: */
1266:         nz  = bi[row+1] - diag_offset[row] - 1;
1267:         pv += 16;
1268:         for (j=0; j<nz; j++) {
1269:           PREFETCH_L1(&pv[16]);
1270:           x = rtmp + 16*((unsigned int)pj[j]);
1271: /*            x = rtmp + 4*pj[j]; */

1273:           /* X:=X-M*PV, One column at a time */
1274:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1275:           SSE_INLINE_BEGIN_2(x,pv)
1276:           /* Load First Column of X*/
1277:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1278:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1280:           /* Matrix-Vector Product: */
1281:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1282:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1283:           SSE_MULT_PS(XMM5,XMM0)
1284:           SSE_SUB_PS(XMM4,XMM5)

1286:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1287:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1288:           SSE_MULT_PS(XMM6,XMM1)
1289:           SSE_SUB_PS(XMM4,XMM6)

1291:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1292:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1293:           SSE_MULT_PS(XMM7,XMM2)
1294:           SSE_SUB_PS(XMM4,XMM7)

1296:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1297:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1298:           SSE_MULT_PS(XMM5,XMM3)
1299:           SSE_SUB_PS(XMM4,XMM5)

1301:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1302:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1304:           /* Second Column */
1305:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1306:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1308:           /* Matrix-Vector Product: */
1309:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1310:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1311:           SSE_MULT_PS(XMM6,XMM0)
1312:           SSE_SUB_PS(XMM5,XMM6)

1314:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1315:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1316:           SSE_MULT_PS(XMM7,XMM1)
1317:           SSE_SUB_PS(XMM5,XMM7)

1319:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1320:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1321:           SSE_MULT_PS(XMM4,XMM2)
1322:           SSE_SUB_PS(XMM5,XMM4)

1324:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1325:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1326:           SSE_MULT_PS(XMM6,XMM3)
1327:           SSE_SUB_PS(XMM5,XMM6)

1329:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1330:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1332:           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

1334:           /* Third Column */
1335:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1336:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1338:           /* Matrix-Vector Product: */
1339:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1340:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1341:           SSE_MULT_PS(XMM7,XMM0)
1342:           SSE_SUB_PS(XMM6,XMM7)

1344:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1345:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1346:           SSE_MULT_PS(XMM4,XMM1)
1347:           SSE_SUB_PS(XMM6,XMM4)

1349:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1350:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1351:           SSE_MULT_PS(XMM5,XMM2)
1352:           SSE_SUB_PS(XMM6,XMM5)

1354:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1355:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1356:           SSE_MULT_PS(XMM7,XMM3)
1357:           SSE_SUB_PS(XMM6,XMM7)

1359:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1360:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1362:           /* Fourth Column */
1363:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1364:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

1366:           /* Matrix-Vector Product: */
1367:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1368:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1369:           SSE_MULT_PS(XMM5,XMM0)
1370:           SSE_SUB_PS(XMM4,XMM5)

1372:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1373:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1374:           SSE_MULT_PS(XMM6,XMM1)
1375:           SSE_SUB_PS(XMM4,XMM6)

1377:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1378:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1379:           SSE_MULT_PS(XMM7,XMM2)
1380:           SSE_SUB_PS(XMM4,XMM7)

1382:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1383:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1384:           SSE_MULT_PS(XMM5,XMM3)
1385:           SSE_SUB_PS(XMM4,XMM5)

1387:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1388:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1389:           SSE_INLINE_END_2;
1390:           pv += 16;
1391:         }
1392:         PetscLogFlops(128.0*nz+112.0);
1393:       }
1394:       row = (unsigned int)(*bjtmp++);
1395: /*        row = (*bjtmp++)/4; */
1396: /*        bjtmp++; */
1397:     }
1398:     /* finished row so stick it into b->a */
1399:     pv = ba + 16*bi[i];
1400:     pj = bj + bi[i];
1401:     nz = bi[i+1] - bi[i];

1403:     /* Copy x block back into pv block */
1404:     for (j=0; j<nz; j++) {
1405:       x = rtmp+16*((unsigned int)pj[j]);
1406: /*        x  = rtmp+4*pj[j]; */

1408:       SSE_INLINE_BEGIN_2(x,pv)
1409:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1410:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1411:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

1413:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1414:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

1416:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1417:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

1419:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1420:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

1422:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1423:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

1425:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1426:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1428:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1429:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

1431:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1432:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1433:       SSE_INLINE_END_2;
1434:       pv += 16;
1435:     }
1436:     /* invert diagonal block */
1437:     w = ba + 16*diag_offset[i];
1438:     if (pivotinblocks) {
1439:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
1440:       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1441:     } else {
1442:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
1443:     }
1444:     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1445:   }

1447:   PetscFree(rtmp);

1449:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1450:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1451:   C->assembled           = PETSC_TRUE;

1453:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1454:   /* Flop Count from inverting diagonal blocks */
1455:   SSE_SCOPE_END;
1456:   return(0);
1457: }

1461: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1462: {
1463:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1465:   int            i,j,n = a->mbs;
1466:   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1467:   unsigned int   row;
1468:   int            *ajtmpold,nz,*bi=b->i;
1469:   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1470:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1471:   MatScalar      *ba    = b->a,*aa = a->a;
1472:   int            nonzero=0;
1473:   PetscBool      pivotinblocks = b->pivotinblocks;
1474:   PetscReal      shift = info->shiftamount;
1475:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

1478:   allowzeropivot = PetscNot(A->erroriffailure);
1479:   SSE_SCOPE_BEGIN;

1481:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1482:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1483:   PetscMalloc1(16*(n+1),&rtmp);
1484:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1485: /*    if ((unsigned long)bj==(unsigned long)aj) { */
1486: /*      colscale = 4; */
1487: /*    } */
1488:   if ((unsigned long)bj==(unsigned long)aj) {
1489:     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1490:   }

1492:   for (i=0; i<n; i++) {
1493:     nz    = bi[i+1] - bi[i];
1494:     bjtmp = bj + bi[i];
1495:     /* zero out the 4x4 block accumulators */
1496:     /* zero out one register */
1497:     XOR_PS(XMM7,XMM7);
1498:     for  (j=0; j<nz; j++) {
1499:       x = rtmp+16*((unsigned int)bjtmp[j]);
1500: /*        x = rtmp+4*bjtmp[j]; */
1501:       SSE_INLINE_BEGIN_1(x)
1502:       /* Copy zero register to memory locations */
1503:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1504:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1505:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1506:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1507:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1508:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1509:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1510:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1511:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1512:       SSE_INLINE_END_1;
1513:     }
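    /* Editor's sketch (illustrative only): XMM7 was zeroed once above, and the
       eight STOREL/STOREH pairs write it out, so each pass zeroes the 16
       floats of one 4x4 accumulator block; a scalar equivalent would be

           PetscInt k;
           for (k=0; k<16; k++) x[k] = 0.0;

       or, equivalently, PetscMemzero(x,16*sizeof(MatScalar)) per block. */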
1514:     /* load in initial (unfactored row) */
1515:     nz       = ai[i+1] - ai[i];
1516:     ajtmpold = aj + ai[i];
1517:     v        = aa + 16*ai[i];
1518:     for (j=0; j<nz; j++) {
1519:       x = rtmp+16*ajtmpold[j];
1520: /*        x = rtmp+colscale*ajtmpold[j]; */
1521:       /* Copy v block into x block */
1522:       SSE_INLINE_BEGIN_2(v,x)
1523:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1524:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1525:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

1527:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1528:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

1530:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1531:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

1533:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1534:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

1536:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1537:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

1539:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1540:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

1542:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1543:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

1545:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1546:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1547:       SSE_INLINE_END_2;

1549:       v += 16;
1550:     }
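    /* Editor's sketch: as with the copy-back at the end of the row, the SSE
       block above amounts to a straight 16-float copy of the unfactored block,
       i.e. for (k=0; k<16; k++) x[k] = v[k]; before v is advanced by 16. */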
1551: /*      row = (*bjtmp++)/4; */
1552:     row = (unsigned int)(*bjtmp++);
1553:     while (row < i) {
1554:       pc = rtmp + 16*row;
1555:       SSE_INLINE_BEGIN_1(pc)
1556:       /* Load block from lower triangle */
1557:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1558:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1559:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

1561:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1562:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

1564:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1565:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

1567:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1568:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

1570:       /* Compare block to zero block */

1572:       SSE_COPY_PS(XMM4,XMM7)
1573:       SSE_CMPNEQ_PS(XMM4,XMM0)

1575:       SSE_COPY_PS(XMM5,XMM7)
1576:       SSE_CMPNEQ_PS(XMM5,XMM1)

1578:       SSE_COPY_PS(XMM6,XMM7)
1579:       SSE_CMPNEQ_PS(XMM6,XMM2)

1581:       SSE_CMPNEQ_PS(XMM7,XMM3)

1583:       /* Reduce the comparisons to one SSE register */
1584:       SSE_OR_PS(XMM6,XMM7)
1585:       SSE_OR_PS(XMM5,XMM4)
1586:       SSE_OR_PS(XMM5,XMM6)
1587:       SSE_INLINE_END_1;

1589:       /* Reduce the one SSE register to an integer register for branching */
1590:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1591:       MOVEMASK(nonzero,XMM5);
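      /* Editor's sketch (illustrative only): the CMPNEQ/OR/MOVEMASK sequence
         above leaves 'nonzero' != 0 exactly when some entry of the 4x4 block
         pc differs from zero; for branching purposes it is equivalent to

             PetscInt k;
             nonzero = 0;
             for (k=0; k<16; k++) if (pc[k] != 0.0) {nonzero = 1; break;}
      */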

1593:       /* If block is nonzero ... */
1594:       if (nonzero) {
1595:         pv = ba + 16*diag_offset[row];
1596:         PREFETCH_L1(&pv[16]);
1597:         pj = bj + diag_offset[row] + 1;

1599:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1600:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1601:         /* but the diagonal was inverted already */
1602:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

1604:         SSE_INLINE_BEGIN_2(pv,pc)
1605:         /* Column 0, product is accumulated in XMM4 */
1606:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1607:         SSE_SHUFFLE(XMM4,XMM4,0x00)
1608:         SSE_MULT_PS(XMM4,XMM0)

1610:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1611:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1612:         SSE_MULT_PS(XMM5,XMM1)
1613:         SSE_ADD_PS(XMM4,XMM5)

1615:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1616:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1617:         SSE_MULT_PS(XMM6,XMM2)
1618:         SSE_ADD_PS(XMM4,XMM6)

1620:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1621:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1622:         SSE_MULT_PS(XMM7,XMM3)
1623:         SSE_ADD_PS(XMM4,XMM7)

1625:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1626:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

1628:         /* Column 1, product is accumulated in XMM5 */
1629:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1630:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1631:         SSE_MULT_PS(XMM5,XMM0)

1633:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1634:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1635:         SSE_MULT_PS(XMM6,XMM1)
1636:         SSE_ADD_PS(XMM5,XMM6)

1638:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1639:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1640:         SSE_MULT_PS(XMM7,XMM2)
1641:         SSE_ADD_PS(XMM5,XMM7)

1643:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1644:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1645:         SSE_MULT_PS(XMM6,XMM3)
1646:         SSE_ADD_PS(XMM5,XMM6)

1648:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1649:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

1651:         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

1653:         /* Column 2, product is accumulated in XMM6 */
1654:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1655:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1656:         SSE_MULT_PS(XMM6,XMM0)

1658:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1659:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1660:         SSE_MULT_PS(XMM7,XMM1)
1661:         SSE_ADD_PS(XMM6,XMM7)

1663:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1664:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1665:         SSE_MULT_PS(XMM7,XMM2)
1666:         SSE_ADD_PS(XMM6,XMM7)

1668:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1669:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1670:         SSE_MULT_PS(XMM7,XMM3)
1671:         SSE_ADD_PS(XMM6,XMM7)

1673:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1674:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1676:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1677:         /* Column 3, product is accumulated in XMM0 */
1678:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1679:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1680:         SSE_MULT_PS(XMM0,XMM7)

1682:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1683:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1684:         SSE_MULT_PS(XMM1,XMM7)
1685:         SSE_ADD_PS(XMM0,XMM1)

1687:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1688:         SSE_SHUFFLE(XMM1,XMM1,0x00)
1689:         SSE_MULT_PS(XMM1,XMM2)
1690:         SSE_ADD_PS(XMM0,XMM1)

1692:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1693:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1694:         SSE_MULT_PS(XMM3,XMM7)
1695:         SSE_ADD_PS(XMM0,XMM3)

1697:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1698:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

1700:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1701:         /* This is code to be maintained and read by humans, after all. */
1702:         /* Copy Multiplier Col 3 into XMM3 */
1703:         SSE_COPY_PS(XMM3,XMM0)
1704:         /* Copy Multiplier Col 2 into XMM2 */
1705:         SSE_COPY_PS(XMM2,XMM6)
1706:         /* Copy Multiplier Col 1 into XMM1 */
1707:         SSE_COPY_PS(XMM1,XMM5)
1708:         /* Copy Multiplier Col 0 into XMM0 */
1709:         SSE_COPY_PS(XMM0,XMM4)
1710:         SSE_INLINE_END_2;
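        /* Editor's sketch (illustrative only): the column-at-a-time products
           above form the multiplier block pc := pc*pv, where pv holds the
           already-inverted 4x4 diagonal block and both blocks are stored
           column-major; a scalar equivalent, using a temporary m[16], would be

               PetscInt  r,c;
               MatScalar m[16];
               for (c=0; c<4; c++) {
                 for (r=0; r<4; r++) {
                   m[4*c+r] = pc[r]*pv[4*c] + pc[4+r]*pv[4*c+1] + pc[8+r]*pv[4*c+2] + pc[12+r]*pv[4*c+3];
                 }
               }
               for (c=0; c<16; c++) pc[c] = m[c];

           After the SSE version, the same multiplier is left in XMM0-XMM3 for
           the row update below. */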

1712:         /* Update the row: */
1713:         nz  = bi[row+1] - diag_offset[row] - 1;
1714:         pv += 16;
1715:         for (j=0; j<nz; j++) {
1716:           PREFETCH_L1(&pv[16]);
1717:           x = rtmp + 16*((unsigned int)pj[j]);
1718: /*            x = rtmp + 4*pj[j]; */

1720:           /* X:=X-M*PV, One column at a time */
1721:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1722:           SSE_INLINE_BEGIN_2(x,pv)
1723:           /* Load First Column of X*/
1724:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1725:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1727:           /* Matrix-Vector Product: */
1728:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1729:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1730:           SSE_MULT_PS(XMM5,XMM0)
1731:           SSE_SUB_PS(XMM4,XMM5)

1733:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1734:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1735:           SSE_MULT_PS(XMM6,XMM1)
1736:           SSE_SUB_PS(XMM4,XMM6)

1738:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1739:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1740:           SSE_MULT_PS(XMM7,XMM2)
1741:           SSE_SUB_PS(XMM4,XMM7)

1743:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1744:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1745:           SSE_MULT_PS(XMM5,XMM3)
1746:           SSE_SUB_PS(XMM4,XMM5)

1748:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1749:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1751:           /* Second Column */
1752:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1753:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1755:           /* Matrix-Vector Product: */
1756:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1757:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1758:           SSE_MULT_PS(XMM6,XMM0)
1759:           SSE_SUB_PS(XMM5,XMM6)

1761:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1762:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1763:           SSE_MULT_PS(XMM7,XMM1)
1764:           SSE_SUB_PS(XMM5,XMM7)

1766:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1767:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1768:           SSE_MULT_PS(XMM4,XMM2)
1769:           SSE_SUB_PS(XMM5,XMM4)

1771:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1772:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1773:           SSE_MULT_PS(XMM6,XMM3)
1774:           SSE_SUB_PS(XMM5,XMM6)

1776:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1777:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1779:           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

1781:           /* Third Column */
1782:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1783:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1785:           /* Matrix-Vector Product: */
1786:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1787:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1788:           SSE_MULT_PS(XMM7,XMM0)
1789:           SSE_SUB_PS(XMM6,XMM7)

1791:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1792:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1793:           SSE_MULT_PS(XMM4,XMM1)
1794:           SSE_SUB_PS(XMM6,XMM4)

1796:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1797:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1798:           SSE_MULT_PS(XMM5,XMM2)
1799:           SSE_SUB_PS(XMM6,XMM5)

1801:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1802:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1803:           SSE_MULT_PS(XMM7,XMM3)
1804:           SSE_SUB_PS(XMM6,XMM7)

1806:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1807:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1809:           /* Fourth Column */
1810:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1811:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

1813:           /* Matrix-Vector Product: */
1814:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1815:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1816:           SSE_MULT_PS(XMM5,XMM0)
1817:           SSE_SUB_PS(XMM4,XMM5)

1819:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1820:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1821:           SSE_MULT_PS(XMM6,XMM1)
1822:           SSE_SUB_PS(XMM4,XMM6)

1824:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1825:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1826:           SSE_MULT_PS(XMM7,XMM2)
1827:           SSE_SUB_PS(XMM4,XMM7)

1829:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1830:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1831:           SSE_MULT_PS(XMM5,XMM3)
1832:           SSE_SUB_PS(XMM4,XMM5)

1834:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1835:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1836:           SSE_INLINE_END_2;
1837:           pv += 16;
1838:         }
1839:         PetscLogFlops(128.0*nz+112.0);
1840:       }
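      /* Editor's sketch (illustrative only): each pass of the update loop
         above applied x := x - m*pv for one 4x4 block of U's row, with the
         multiplier m held column-major in XMM0-XMM3; a scalar equivalent,
         with m[16] the multiplier block, would be

             PetscInt r,c;
             for (c=0; c<4; c++) {
               for (r=0; r<4; r++) {
                 x[4*c+r] -= m[r]*pv[4*c] + m[4+r]*pv[4*c+1] + m[8+r]*pv[4*c+2] + m[12+r]*pv[4*c+3];
               }
             }

         which accounts for the 128 flops per updated block, plus 112 for
         forming the multiplier, logged above. */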
1841:       row = (unsigned int)(*bjtmp++);
1842: /*        row = (*bjtmp++)/4; */
1843: /*        bjtmp++; */
1844:     }
1845:     /* finished row so stick it into b->a */
1846:     pv = ba + 16*bi[i];
1847:     pj = bj + bi[i];
1848:     nz = bi[i+1] - bi[i];

1850:     /* Copy x block back into pv block */
1851:     for (j=0; j<nz; j++) {
1852:       x = rtmp+16*((unsigned int)pj[j]);
1853: /*        x  = rtmp+4*pj[j]; */

1855:       SSE_INLINE_BEGIN_2(x,pv)
1856:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1857:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1858:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

1860:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1861:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

1863:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1864:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

1866:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1867:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

1869:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1870:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

1872:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1873:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1875:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1876:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

1878:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1879:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1880:       SSE_INLINE_END_2;
1881:       pv += 16;
1882:     }
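    /* Editor's note: the same 16-float block copy as in the preceding routine;
       see the scalar sketch given there. */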
1883:     /* invert diagonal block */
1884:     w = ba + 16*diag_offset[i];
1885:     if (pivotinblocks) {
1886:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
1887:       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1888:     } else {
1889:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
1890:     }
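    /* Editor's note (hedged): the diagonal block is stored as its inverse, so
       the solve routines installed below apply it with a 4x4 matrix-vector
       multiply instead of a small triangular solve.  If a zero pivot occurs
       and A->erroriffailure is false, the factorization still completes but
       C->errortype records the failure; a caller could query it roughly as

           MatFactorError err;
           MatFactorGetError(F,&err);
           if (err == MAT_FACTOR_NUMERIC_ZEROPIVOT) { ...handle it, or re-factor with a shift... }

       where F is the factored matrix (C here) and MatFactorGetError is the
       public accessor for this field. */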
1891: /*      PetscKernel_A_gets_inverse_A_4_SSE(w); */
1892:     /* Note: since Cramer's rule is used, the flop count below may be unfairly high or low */
1893:   }

1895:   PetscFree(rtmp);

1897:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1898:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1899:   C->assembled           = PETSC_TRUE;

1901:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1902:   /* Flop Count from inverting diagonal blocks */
1903:   SSE_SCOPE_END;
1904:   return(0);
1905: }

1907: #endif