Actual source code: baijfact11.c

petsc-3.4.5 2014-06-29
  2: /*
  3:     Factorization code for BAIJ format.
  4: */
  5: #include <../src/mat/impls/baij/seq/baij.h>
  6: #include <petsc-private/kernels/blockinvert.h>

  8: /* ------------------------------------------------------------*/
  9: /*
 10:       Version for when blocks are 4 by 4
 11: */
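/*
      Each 4x4 block is stored column major in 16 consecutive MatScalars, so
      entry (r,c) of a block x lives at x[4*c+r].  With that convention the
      multiplier update below, e.g.

          pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4,

      is entry (0,0) of the product pc*pv, where pv is the already-inverted
      pivot block of row `row'; the inner j loop then subtracts multiplier
      times each U(row,:) block from the corresponding block of the working
      row rtmp.  (This is only a sketch of the convention; the unrolled loops
      below are the actual implementation.)
*/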
 14: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
 15: {
 16:   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
 17:   IS             isrow = b->row,isicol = b->icol;
 19:   const PetscInt *r,*ic;
 20:   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
 21:   PetscInt       *ajtmpold,*ajtmp,nz,row;
 22:   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
 23:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
 24:   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
 25:   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
 26:   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
 27:   MatScalar      m13,m14,m15,m16;
 28:   MatScalar      *ba           = b->a,*aa = a->a;
 29:   PetscBool      pivotinblocks = b->pivotinblocks;
 30:   PetscReal      shift         = info->shiftamount;

 33:   ISGetIndices(isrow,&r);
 34:   ISGetIndices(isicol,&ic);
 35:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);

 37:   for (i=0; i<n; i++) {
 38:     nz    = bi[i+1] - bi[i];
 39:     ajtmp = bj + bi[i];
 40:     for  (j=0; j<nz; j++) {
 41:       x     = rtmp+16*ajtmp[j];
 42:       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
 43:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
 44:     }
 45:     /* load in initial (unfactored row) */
 46:     idx      = r[i];
 47:     nz       = ai[idx+1] - ai[idx];
 48:     ajtmpold = aj + ai[idx];
 49:     v        = aa + 16*ai[idx];
 50:     for (j=0; j<nz; j++) {
 51:       x     = rtmp+16*ic[ajtmpold[j]];
 52:       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
 53:       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
 54:       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
 55:       x[14] = v[14]; x[15] = v[15];
 56:       v    += 16;
 57:     }
 58:     row = *ajtmp++;
 59:     while (row < i) {
 60:       pc  = rtmp + 16*row;
 61:       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
 62:       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
 63:       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
 64:       p15 = pc[14]; p16 = pc[15];
 65:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
 66:           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
 67:           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
 68:           || p16 != 0.0) {
 69:         pv    = ba + 16*diag_offset[row];
 70:         pj    = bj + diag_offset[row] + 1;
 71:         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
 72:         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
 73:         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
 74:         x15   = pv[14]; x16 = pv[15];
 75:         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
 76:         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
 77:         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
 78:         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

 80:         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
 81:         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
 82:         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
 83:         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

 85:         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
 86:         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
 87:         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
 88:         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

 90:         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
 91:         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
 92:         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
 93:         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;

 95:         nz  = bi[row+1] - diag_offset[row] - 1;
 96:         pv += 16;
 97:         for (j=0; j<nz; j++) {
 98:           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
 99:           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
100:           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
101:           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
102:           x     = rtmp + 16*pj[j];
103:           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
104:           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
105:           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
106:           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

108:           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
109:           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
110:           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
111:           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

113:           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
114:           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
115:           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
116:           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

118:           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
119:           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
120:           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
121:           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

123:           pv += 16;
124:         }
125:         PetscLogFlops(128.0*nz+112.0);
126:       }
127:       row = *ajtmp++;
128:     }
129:     /* finished row so stick it into b->a */
130:     pv = ba + 16*bi[i];
131:     pj = bj + bi[i];
132:     nz = bi[i+1] - bi[i];
133:     for (j=0; j<nz; j++) {
134:       x      = rtmp+16*pj[j];
135:       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
136:       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
137:       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
138:       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
139:       pv    += 16;
140:     }
141:     /* invert diagonal block */
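    /* PetscKernel_A_gets_inverse_A_4() pivots within the 4x4 block and uses
       `shift' to guard against a zero pivot; the _nopivot variant skips the
       pivot search, which is cheaper but assumes the block can be inverted
       safely without it.  b->pivotinblocks selects between the two. */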
142:     w = ba + 16*diag_offset[i];
143:     if (pivotinblocks) {
144:       PetscKernel_A_gets_inverse_A_4(w,shift);
145:     } else {
146:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
147:     }
148:   }

150:   PetscFree(rtmp);
151:   ISRestoreIndices(isicol,&ic);
152:   ISRestoreIndices(isrow,&r);

154:   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
155:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
156:   C->assembled           = PETSC_TRUE;

158:   PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
159:   return(0);
160: }
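/*
   For context: these numeric factorization routines are not called directly;
   they are reached through the generic interface, roughly as in the sketch
   below (standard PETSc calls; isrow, iscol, info, b and x are placeholders).

       Mat F;
       MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);
       MatLUFactorSymbolic(F,A,isrow,iscol,&info);
       MatLUFactorNumeric(F,A,&info);   <-- lands here for bs=4 SeqBAIJ
       MatSolve(F,b,x);                 <-- uses the C->ops->solve set above
*/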

162: /* MatLUFactorNumeric_SeqBAIJ_4 -
163:      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
164:        PetscKernel_A_gets_A_times_B()
165:        PetscKernel_A_gets_A_minus_B_times_C()
166:        PetscKernel_A_gets_inverse_A()
167: */
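/*
   With the 4x4 column-major block layout these kernels amount to the
   following scalar operations (an illustrative sketch, not the actual macro
   bodies):

     PetscKernel_A_gets_A_times_B_4(A,B,W)         : A <- A*B   (W = 16-entry scratch)
     PetscKernel_A_gets_A_minus_B_times_C_4(A,B,C) : A <- A - B*C
     PetscKernel_A_gets_inverse_A_4(A,shift)       : A <- inv(A), with `shift'
                                                     guarding against zero pivots

   For example, a scalar equivalent of the second kernel would be

     for (c=0; c<4; c++)
       for (r=0; r<4; r++)
         for (k=0; k<4; k++) A[4*c+r] -= B[4*k+r]*C[4*c+k];
*/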

171: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
172: {
173:   Mat            C     = B;
174:   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
175:   IS             isrow = b->row,isicol = b->icol;
177:   const PetscInt *r,*ic;
178:   PetscInt       i,j,k,nz,nzL,row;
179:   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
180:   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
181:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
182:   PetscInt       flg;
183:   PetscReal      shift;

186:   ISGetIndices(isrow,&r);
187:   ISGetIndices(isicol,&ic);

189:   if (info->shifttype == MAT_SHIFT_NONE) {
190:     shift = 0;
191:   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
192:     shift = info->shiftamount;
193:   }
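  /* The shift type and amount normally come from the user, e.g. via
     PCFactorSetShiftType()/PCFactorSetShiftAmount() or the options
     -pc_factor_shift_type inblocks -pc_factor_shift_amount <v>, which fill
     in info->shifttype and info->shiftamount. */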

195:   /* generate work space needed by the factorization */
196:   PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
197:   PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));

199:   for (i=0; i<n; i++) {
200:     /* zero rtmp */
201:     /* L part */
202:     nz    = bi[i+1] - bi[i];
203:     bjtmp = bj + bi[i];
204:     for  (j=0; j<nz; j++) {
205:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
206:     }

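    /* In this (non-inplace) factor the U rows are stored in reverse order:
       bdiag[] decreases with i, and row i of U, including its diagonal block
       at position bdiag[i], occupies positions bdiag[i+1]+1 .. bdiag[i] of
       b->j/b->a -- hence the index arithmetic below. */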
208:     /* U part */
209:     nz    = bdiag[i]-bdiag[i+1];
210:     bjtmp = bj + bdiag[i+1]+1;
211:     for  (j=0; j<nz; j++) {
212:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
213:     }

215:     /* load in initial (unfactored row) */
216:     nz    = ai[r[i]+1] - ai[r[i]];
217:     ajtmp = aj + ai[r[i]];
218:     v     = aa + bs2*ai[r[i]];
219:     for (j=0; j<nz; j++) {
220:       PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
221:     }

223:     /* elimination */
224:     bjtmp = bj + bi[i];
225:     nzL   = bi[i+1] - bi[i];
226:     for (k=0; k < nzL; k++) {
227:       row = bjtmp[k];
228:       pc  = rtmp + bs2*row;
229:       for (flg=0,j=0; j<bs2; j++) {
230:         if (pc[j]!=0.0) {
231:           flg = 1;
232:           break;
233:         }
234:       }
235:       if (flg) {
236:         pv = b->a + bs2*bdiag[row];
237:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
238:         PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);

240:         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
241:         pv = b->a + bs2*(bdiag[row+1]+1);
242:         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
243:         for (j=0; j<nz; j++) {
244:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
245:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
246:           v    = rtmp + bs2*pj[j];
247:           PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
248:           pv  += bs2;
249:         }
250:         PetscLogFlops(128*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
251:       }
252:     }

254:     /* finished row so stick it into b->a */
255:     /* L part */
256:     pv = b->a + bs2*bi[i];
257:     pj = b->j + bi[i];
258:     nz = bi[i+1] - bi[i];
259:     for (j=0; j<nz; j++) {
260:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
261:     }

263:     /* Mark diagonal and invert diagonal for simpler triangular solves */
264:     pv   = b->a + bs2*bdiag[i];
265:     pj   = b->j + bdiag[i];
266:     PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
267:     PetscKernel_A_gets_inverse_A_4(pv,shift);

269:     /* U part */
270:     pv = b->a + bs2*(bdiag[i+1]+1);
271:     pj = b->j + bdiag[i+1]+1;
272:     nz = bdiag[i] - bdiag[i+1] - 1;
273:     for (j=0; j<nz; j++) {
274:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
275:     }
276:   }

278:   PetscFree2(rtmp,mwork);
279:   ISRestoreIndices(isicol,&ic);
280:   ISRestoreIndices(isrow,&r);

282:   C->ops->solve          = MatSolve_SeqBAIJ_4;
283:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
284:   C->assembled           = PETSC_TRUE;

286:   PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
287:   return(0);
288: }

292: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
293: {
294: /*
295:     Default version for when blocks are 4 by 4, using natural ordering
296: */
297:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
299:   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
300:   PetscInt       *ajtmpold,*ajtmp,nz,row;
301:   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
302:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
303:   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
304:   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
305:   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
306:   MatScalar      m13,m14,m15,m16;
307:   MatScalar      *ba           = b->a,*aa = a->a;
308:   PetscBool      pivotinblocks = b->pivotinblocks;
309:   PetscReal      shift         = info->shiftamount;

312:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);

314:   for (i=0; i<n; i++) {
315:     nz    = bi[i+1] - bi[i];
316:     ajtmp = bj + bi[i];
317:     for  (j=0; j<nz; j++) {
318:       x     = rtmp+16*ajtmp[j];
319:       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
320:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
321:     }
322:     /* load in initial (unfactored row) */
323:     nz       = ai[i+1] - ai[i];
324:     ajtmpold = aj + ai[i];
325:     v        = aa + 16*ai[i];
326:     for (j=0; j<nz; j++) {
327:       x     = rtmp+16*ajtmpold[j];
328:       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
329:       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
330:       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
331:       x[14] = v[14]; x[15] = v[15];
332:       v    += 16;
333:     }
334:     row = *ajtmp++;
335:     while (row < i) {
336:       pc  = rtmp + 16*row;
337:       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
338:       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
339:       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
340:       p15 = pc[14]; p16 = pc[15];
341:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
342:           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
343:           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
344:           || p16 != 0.0) {
345:         pv    = ba + 16*diag_offset[row];
346:         pj    = bj + diag_offset[row] + 1;
347:         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
348:         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
349:         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
350:         x15   = pv[14]; x16 = pv[15];
351:         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
352:         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
353:         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
354:         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

356:         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
357:         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
358:         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
359:         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

361:         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
362:         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
363:         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
364:         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

366:         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
367:         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
368:         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
369:         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
370:         nz     = bi[row+1] - diag_offset[row] - 1;
371:         pv    += 16;
372:         for (j=0; j<nz; j++) {
373:           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
374:           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
375:           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
376:           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
377:           x     = rtmp + 16*pj[j];
378:           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
379:           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
380:           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
381:           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

383:           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
384:           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
385:           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
386:           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

388:           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
389:           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
390:           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
391:           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

393:           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
394:           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
395:           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
396:           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

398:           pv += 16;
399:         }
400:         PetscLogFlops(128.0*nz+112.0);
401:       }
402:       row = *ajtmp++;
403:     }
404:     /* finished row so stick it into b->a */
405:     pv = ba + 16*bi[i];
406:     pj = bj + bi[i];
407:     nz = bi[i+1] - bi[i];
408:     for (j=0; j<nz; j++) {
409:       x      = rtmp+16*pj[j];
410:       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
411:       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
412:       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
413:       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
414:       pv    += 16;
415:     }
416:     /* invert diagonal block */
417:     w = ba + 16*diag_offset[i];
418:     if (pivotinblocks) {
419:       PetscKernel_A_gets_inverse_A_4(w,shift);
420:     } else {
421:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
422:     }
423:   }

425:   PetscFree(rtmp);

427:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
428:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
429:   C->assembled           = PETSC_TRUE;

431:   PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
432:   return(0);
433: }

435: /*
436:   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
437:     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
438: */
441: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
442: {
443:   Mat            C =B;
444:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
446:   PetscInt       i,j,k,nz,nzL,row;
447:   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
448:   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
449:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
450:   PetscInt       flg;
451:   PetscReal      shift;

454:   /* generate work space needed by the factorization */
455:   PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
456:   PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));

458:   if (info->shifttype == MAT_SHIFT_NONE) {
459:     shift = 0;
460:   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
461:     shift = info->shiftamount;
462:   }

464:   for (i=0; i<n; i++) {
465:     /* zero rtmp */
466:     /* L part */
467:     nz    = bi[i+1] - bi[i];
468:     bjtmp = bj + bi[i];
469:     for  (j=0; j<nz; j++) {
470:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
471:     }

473:     /* U part */
474:     nz    = bdiag[i] - bdiag[i+1];
475:     bjtmp = bj + bdiag[i+1]+1;
476:     for  (j=0; j<nz; j++) {
477:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
478:     }

480:     /* load in initial (unfactored row) */
481:     nz    = ai[i+1] - ai[i];
482:     ajtmp = aj + ai[i];
483:     v     = aa + bs2*ai[i];
484:     for (j=0; j<nz; j++) {
485:       PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
486:     }

488:     /* elimination */
489:     bjtmp = bj + bi[i];
490:     nzL   = bi[i+1] - bi[i];
491:     for (k=0; k < nzL; k++) {
492:       row = bjtmp[k];
493:       pc  = rtmp + bs2*row;
494:       for (flg=0,j=0; j<bs2; j++) {
495:         if (pc[j]!=0.0) {
496:           flg = 1;
497:           break;
498:         }
499:       }
500:       if (flg) {
501:         pv = b->a + bs2*bdiag[row];
502:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
503:         PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);

505:         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
506:         pv = b->a + bs2*(bdiag[row+1]+1);
507:         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
508:         for (j=0; j<nz; j++) {
509:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
510:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
511:           v    = rtmp + bs2*pj[j];
512:           PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
513:           pv  += bs2;
514:         }
515:         PetscLogFlops(128*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
516:       }
517:     }

519:     /* finished row so stick it into b->a */
520:     /* L part */
521:     pv = b->a + bs2*bi[i];
522:     pj = b->j + bi[i];
523:     nz = bi[i+1] - bi[i];
524:     for (j=0; j<nz; j++) {
525:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
526:     }

528:     /* Mark diagonal and invert diagonal for simpler triangular solves */
529:     pv   = b->a + bs2*bdiag[i];
530:     pj   = b->j + bdiag[i];
531:     PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
532:     PetscKernel_A_gets_inverse_A_4(pv,shift);

534:     /* U part */
535:     pv = b->a + bs2*(bdiag[i+1]+1);
536:     pj = b->j + bdiag[i+1]+1;
537:     nz = bdiag[i] - bdiag[i+1] - 1;
538:     for (j=0; j<nz; j++) {
539:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
540:     }
541:   }
542:   PetscFree2(rtmp,mwork);

544:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
545:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
546:   C->assembled           = PETSC_TRUE;

548:   PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
549:   return(0);
550: }

552: #if defined(PETSC_HAVE_SSE)

554: #include PETSC_HAVE_SSE
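/* Note: PETSC_HAVE_SSE doubles as the name of the header that provides the
   SSE_* and XMM* macros used below, so this include only resolves in builds
   where configure detected SSE support (the exact header is configuration
   dependent). */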

556: /* SSE version for when blocks are 4 by 4, using natural ordering */
559: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
560: {
561:   Mat            C  = B;
562:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
563:   int            i,j,n = a->mbs;
564:   int            *bj = b->j,*bjtmp,*pj;
565:   int            row;
566:   int            *ajtmpold,nz,*bi=b->i;
567:   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
568:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
569:   MatScalar      *ba    = b->a,*aa = a->a;
570:   int            nonzero=0;
571: /*    int            nonzero=0,colscale = 16; */
572:   PetscBool pivotinblocks = b->pivotinblocks;
573:   PetscReal shift         = info->shiftamount;

576:   SSE_SCOPE_BEGIN;

578:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
579:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
580:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
581:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
582: /*    if ((unsigned long)bj==(unsigned long)aj) { */
583: /*      colscale = 4; */
584: /*    } */
585:   for (i=0; i<n; i++) {
586:     nz    = bi[i+1] - bi[i];
587:     bjtmp = bj + bi[i];
588:     /* zero out the 4x4 block accumulators */
589:     /* zero out one register */
590:     XOR_PS(XMM7,XMM7);
591:     for  (j=0; j<nz; j++) {
592:       x = rtmp+16*bjtmp[j];
593: /*        x = rtmp+4*bjtmp[j]; */
594:       SSE_INLINE_BEGIN_1(x)
595:       /* Copy zero register to memory locations */
596:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
597:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
598:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
599:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
600:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
601:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
602:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
603:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
604:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
605:       SSE_INLINE_END_1;
606:     }
607:     /* load in initial (unfactored row) */
608:     nz       = ai[i+1] - ai[i];
609:     ajtmpold = aj + ai[i];
610:     v        = aa + 16*ai[i];
611:     for (j=0; j<nz; j++) {
612:       x = rtmp+16*ajtmpold[j];
613: /*        x = rtmp+colscale*ajtmpold[j]; */
614:       /* Copy v block into x block */
615:       SSE_INLINE_BEGIN_2(v,x)
616:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
617:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
618:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

620:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
621:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

623:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
624:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

626:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
627:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

629:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
630:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

632:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
633:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

635:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
636:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

638:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
639:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
640:       SSE_INLINE_END_2;

642:       v += 16;
643:     }
644: /*      row = (*bjtmp++)/4; */
645:     row = *bjtmp++;
646:     while (row < i) {
647:       pc = rtmp + 16*row;
648:       SSE_INLINE_BEGIN_1(pc)
649:       /* Load block from lower triangle */
650:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
651:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
652:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

654:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
655:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

657:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
658:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

660:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
661:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

663:       /* Compare block to zero block */

665:       SSE_COPY_PS(XMM4,XMM7)
666:       SSE_CMPNEQ_PS(XMM4,XMM0)

668:       SSE_COPY_PS(XMM5,XMM7)
669:       SSE_CMPNEQ_PS(XMM5,XMM1)

671:       SSE_COPY_PS(XMM6,XMM7)
672:       SSE_CMPNEQ_PS(XMM6,XMM2)

674:       SSE_CMPNEQ_PS(XMM7,XMM3)

676:       /* Reduce the comparisons to one SSE register */
677:       SSE_OR_PS(XMM6,XMM7)
678:       SSE_OR_PS(XMM5,XMM4)
679:       SSE_OR_PS(XMM5,XMM6)
680:       SSE_INLINE_END_1;

682:       /* Reduce the one SSE register to an integer register for branching */
683:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
684:       MOVEMASK(nonzero,XMM5);

686:       /* If block is nonzero ... */
687:       if (nonzero) {
688:         pv = ba + 16*diag_offset[row];
689:         PREFETCH_L1(&pv[16]);
690:         pj = bj + diag_offset[row] + 1;

692:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
693:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
694:         /* but the diagonal was inverted already */
695:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
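        /* Scalar picture of the four column blocks below (illustration only):
           with XMM0..XMM3 holding columns 0..3 of the old multiplier M and pv
           the already-inverted pivot block, new column c of M is
               M(:,c) = pv[4c+0]*XMM0 + pv[4c+1]*XMM1 + pv[4c+2]*XMM2 + pv[4c+3]*XMM3,
           i.e. M := M*pv, matching the scalar kernel in the functions above. */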

697:         SSE_INLINE_BEGIN_2(pv,pc)
698:         /* Column 0, product is accumulated in XMM4 */
699:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
700:         SSE_SHUFFLE(XMM4,XMM4,0x00)
701:         SSE_MULT_PS(XMM4,XMM0)

703:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
704:         SSE_SHUFFLE(XMM5,XMM5,0x00)
705:         SSE_MULT_PS(XMM5,XMM1)
706:         SSE_ADD_PS(XMM4,XMM5)

708:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
709:         SSE_SHUFFLE(XMM6,XMM6,0x00)
710:         SSE_MULT_PS(XMM6,XMM2)
711:         SSE_ADD_PS(XMM4,XMM6)

713:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
714:         SSE_SHUFFLE(XMM7,XMM7,0x00)
715:         SSE_MULT_PS(XMM7,XMM3)
716:         SSE_ADD_PS(XMM4,XMM7)

718:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
719:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

721:         /* Column 1, product is accumulated in XMM5 */
722:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
723:         SSE_SHUFFLE(XMM5,XMM5,0x00)
724:         SSE_MULT_PS(XMM5,XMM0)

726:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
727:         SSE_SHUFFLE(XMM6,XMM6,0x00)
728:         SSE_MULT_PS(XMM6,XMM1)
729:         SSE_ADD_PS(XMM5,XMM6)

731:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
732:         SSE_SHUFFLE(XMM7,XMM7,0x00)
733:         SSE_MULT_PS(XMM7,XMM2)
734:         SSE_ADD_PS(XMM5,XMM7)

736:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
737:         SSE_SHUFFLE(XMM6,XMM6,0x00)
738:         SSE_MULT_PS(XMM6,XMM3)
739:         SSE_ADD_PS(XMM5,XMM6)

741:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
742:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

744:         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

746:         /* Column 2, product is accumulated in XMM6 */
747:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
748:         SSE_SHUFFLE(XMM6,XMM6,0x00)
749:         SSE_MULT_PS(XMM6,XMM0)

751:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
752:         SSE_SHUFFLE(XMM7,XMM7,0x00)
753:         SSE_MULT_PS(XMM7,XMM1)
754:         SSE_ADD_PS(XMM6,XMM7)

756:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
757:         SSE_SHUFFLE(XMM7,XMM7,0x00)
758:         SSE_MULT_PS(XMM7,XMM2)
759:         SSE_ADD_PS(XMM6,XMM7)

761:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
762:         SSE_SHUFFLE(XMM7,XMM7,0x00)
763:         SSE_MULT_PS(XMM7,XMM3)
764:         SSE_ADD_PS(XMM6,XMM7)

766:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
767:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

769:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
770:         /* Column 3, product is accumulated in XMM0 */
771:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
772:         SSE_SHUFFLE(XMM7,XMM7,0x00)
773:         SSE_MULT_PS(XMM0,XMM7)

775:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
776:         SSE_SHUFFLE(XMM7,XMM7,0x00)
777:         SSE_MULT_PS(XMM1,XMM7)
778:         SSE_ADD_PS(XMM0,XMM1)

780:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
781:         SSE_SHUFFLE(XMM1,XMM1,0x00)
782:         SSE_MULT_PS(XMM1,XMM2)
783:         SSE_ADD_PS(XMM0,XMM1)

785:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
786:         SSE_SHUFFLE(XMM7,XMM7,0x00)
787:         SSE_MULT_PS(XMM3,XMM7)
788:         SSE_ADD_PS(XMM0,XMM3)

790:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
791:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

793:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
794:         /* This is code to be maintained and read by humans after all. */
795:         /* Copy Multiplier Col 3 into XMM3 */
796:         SSE_COPY_PS(XMM3,XMM0)
797:         /* Copy Multiplier Col 2 into XMM2 */
798:         SSE_COPY_PS(XMM2,XMM6)
799:         /* Copy Multiplier Col 1 into XMM1 */
800:         SSE_COPY_PS(XMM1,XMM5)
801:         /* Copy Multiplier Col 0 into XMM0 */
802:         SSE_COPY_PS(XMM0,XMM4)
803:         SSE_INLINE_END_2;

805:         /* Update the row: */
806:         nz  = bi[row+1] - diag_offset[row] - 1;
807:         pv += 16;
808:         for (j=0; j<nz; j++) {
809:           PREFETCH_L1(&pv[16]);
810:           x = rtmp + 16*pj[j];
811: /*            x = rtmp + 4*pj[j]; */

813:           /* X:=X-M*PV, One column at a time */
814:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
815:           SSE_INLINE_BEGIN_2(x,pv)
816:           /* Load First Column of X */
817:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
818:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

820:           /* Matrix-Vector Product: */
821:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
822:           SSE_SHUFFLE(XMM5,XMM5,0x00)
823:           SSE_MULT_PS(XMM5,XMM0)
824:           SSE_SUB_PS(XMM4,XMM5)

826:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
827:           SSE_SHUFFLE(XMM6,XMM6,0x00)
828:           SSE_MULT_PS(XMM6,XMM1)
829:           SSE_SUB_PS(XMM4,XMM6)

831:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
832:           SSE_SHUFFLE(XMM7,XMM7,0x00)
833:           SSE_MULT_PS(XMM7,XMM2)
834:           SSE_SUB_PS(XMM4,XMM7)

836:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
837:           SSE_SHUFFLE(XMM5,XMM5,0x00)
838:           SSE_MULT_PS(XMM5,XMM3)
839:           SSE_SUB_PS(XMM4,XMM5)

841:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
842:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

844:           /* Second Column */
845:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
846:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

848:           /* Matrix-Vector Product: */
849:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
850:           SSE_SHUFFLE(XMM6,XMM6,0x00)
851:           SSE_MULT_PS(XMM6,XMM0)
852:           SSE_SUB_PS(XMM5,XMM6)

854:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
855:           SSE_SHUFFLE(XMM7,XMM7,0x00)
856:           SSE_MULT_PS(XMM7,XMM1)
857:           SSE_SUB_PS(XMM5,XMM7)

859:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
860:           SSE_SHUFFLE(XMM4,XMM4,0x00)
861:           SSE_MULT_PS(XMM4,XMM2)
862:           SSE_SUB_PS(XMM5,XMM4)

864:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
865:           SSE_SHUFFLE(XMM6,XMM6,0x00)
866:           SSE_MULT_PS(XMM6,XMM3)
867:           SSE_SUB_PS(XMM5,XMM6)

869:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
870:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

872:           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

874:           /* Third Column */
875:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
876:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

878:           /* Matrix-Vector Product: */
879:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
880:           SSE_SHUFFLE(XMM7,XMM7,0x00)
881:           SSE_MULT_PS(XMM7,XMM0)
882:           SSE_SUB_PS(XMM6,XMM7)

884:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
885:           SSE_SHUFFLE(XMM4,XMM4,0x00)
886:           SSE_MULT_PS(XMM4,XMM1)
887:           SSE_SUB_PS(XMM6,XMM4)

889:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
890:           SSE_SHUFFLE(XMM5,XMM5,0x00)
891:           SSE_MULT_PS(XMM5,XMM2)
892:           SSE_SUB_PS(XMM6,XMM5)

894:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
895:           SSE_SHUFFLE(XMM7,XMM7,0x00)
896:           SSE_MULT_PS(XMM7,XMM3)
897:           SSE_SUB_PS(XMM6,XMM7)

899:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
900:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

902:           /* Fourth Column */
903:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
904:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

906:           /* Matrix-Vector Product: */
907:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
908:           SSE_SHUFFLE(XMM5,XMM5,0x00)
909:           SSE_MULT_PS(XMM5,XMM0)
910:           SSE_SUB_PS(XMM4,XMM5)

912:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
913:           SSE_SHUFFLE(XMM6,XMM6,0x00)
914:           SSE_MULT_PS(XMM6,XMM1)
915:           SSE_SUB_PS(XMM4,XMM6)

917:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
918:           SSE_SHUFFLE(XMM7,XMM7,0x00)
919:           SSE_MULT_PS(XMM7,XMM2)
920:           SSE_SUB_PS(XMM4,XMM7)

922:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
923:           SSE_SHUFFLE(XMM5,XMM5,0x00)
924:           SSE_MULT_PS(XMM5,XMM3)
925:           SSE_SUB_PS(XMM4,XMM5)

927:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
928:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
929:           SSE_INLINE_END_2;
930:           pv += 16;
931:         }
932:         PetscLogFlops(128.0*nz+112.0);
933:       }
934:       row = *bjtmp++;
935: /*        row = (*bjtmp++)/4; */
936:     }
937:     /* finished row so stick it into b->a */
938:     pv = ba + 16*bi[i];
939:     pj = bj + bi[i];
940:     nz = bi[i+1] - bi[i];

942:     /* Copy x block back into pv block */
943:     for (j=0; j<nz; j++) {
944:       x = rtmp+16*pj[j];
945: /*        x  = rtmp+4*pj[j]; */

947:       SSE_INLINE_BEGIN_2(x,pv)
948:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
949:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
950:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

952:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
953:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

955:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
956:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

958:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
959:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

961:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
962:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

964:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
965:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

967:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
968:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

970:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
971:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
972:       SSE_INLINE_END_2;
973:       pv += 16;
974:     }
975:     /* invert diagonal block */
976:     w = ba + 16*diag_offset[i];
977:     if (pivotinblocks) {
978:       PetscKernel_A_gets_inverse_A_4(w,shift);
979:     } else {
980:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
981:     }
982: /*      PetscKernel_A_gets_inverse_A_4_SSE(w); */
983:     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
984:   }

986:   PetscFree(rtmp);

988:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
989:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
990:   C->assembled           = PETSC_TRUE;

992:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
993:   /* Flop Count from inverting diagonal blocks */
994:   SSE_SCOPE_END;
995:   return(0);
996: }

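/* The _usj variants below read the block column indices as unsigned short
   (note the casts applied to b->j and a->j), halving the index bandwidth;
   `usj' presumably stands for "unsigned short j". */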
1000: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
1001: {
1002:   Mat            A  = C;
1003:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1005:   int            i,j,n = a->mbs;
1006:   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1007:   unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
1008:   unsigned int   row;
1009:   int            nz,*bi=b->i;
1010:   int            *diag_offset = b->diag,*ai=a->i;
1011:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1012:   MatScalar      *ba    = b->a,*aa = a->a;
1013:   int            nonzero=0;
1014: /*    int            nonzero=0,colscale = 16; */
1015:   PetscBool pivotinblocks = b->pivotinblocks;
1016:   PetscReal shift         = info->shiftamount;

1019:   SSE_SCOPE_BEGIN;

1021:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1022:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1023:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
1024:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1025: /*    if ((unsigned long)bj==(unsigned long)aj) { */
1026: /*      colscale = 4; */
1027: /*    } */

1029:   for (i=0; i<n; i++) {
1030:     nz    = bi[i+1] - bi[i];
1031:     bjtmp = bj + bi[i];
1032:     /* zero out the 4x4 block accumulators */
1033:     /* zero out one register */
1034:     XOR_PS(XMM7,XMM7);
1035:     for  (j=0; j<nz; j++) {
1036:       x = rtmp+16*((unsigned int)bjtmp[j]);
1037: /*        x = rtmp+4*bjtmp[j]; */
1038:       SSE_INLINE_BEGIN_1(x)
1039:       /* Copy zero register to memory locations */
1040:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1041:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1042:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1043:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1044:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1045:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1046:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1047:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1048:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1049:       SSE_INLINE_END_1;
1050:     }
1051:     /* load in initial (unfactored row) */
1052:     nz    = ai[i+1] - ai[i];
1053:     ajtmp = aj + ai[i];
1054:     v     = aa + 16*ai[i];
1055:     for (j=0; j<nz; j++) {
1056:       x = rtmp+16*((unsigned int)ajtmp[j]);
1057: /*        x = rtmp+colscale*ajtmp[j]; */
1058:       /* Copy v block into x block */
1059:       SSE_INLINE_BEGIN_2(v,x)
1060:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1061:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1062:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

1064:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1065:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

1067:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1068:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

1070:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1071:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

1073:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1074:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

1076:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1077:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

1079:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1080:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

1082:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1083:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1084:       SSE_INLINE_END_2;

1086:       v += 16;
1087:     }
1088: /*      row = (*bjtmp++)/4; */
1089:     row = (unsigned int)(*bjtmp++);
1090:     while (row < i) {
1091:       pc = rtmp + 16*row;
1092:       SSE_INLINE_BEGIN_1(pc)
1093:       /* Load block from lower triangle */
1094:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1095:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1096:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

1098:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1099:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

1101:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1102:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

1104:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1105:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

1107:       /* Compare block to zero block */

1109:       SSE_COPY_PS(XMM4,XMM7)
1110:       SSE_CMPNEQ_PS(XMM4,XMM0)

1112:       SSE_COPY_PS(XMM5,XMM7)
1113:       SSE_CMPNEQ_PS(XMM5,XMM1)

1115:       SSE_COPY_PS(XMM6,XMM7)
1116:       SSE_CMPNEQ_PS(XMM6,XMM2)

1118:       SSE_CMPNEQ_PS(XMM7,XMM3)

1120:       /* Reduce the comparisons to one SSE register */
1121:       SSE_OR_PS(XMM6,XMM7)
1122:       SSE_OR_PS(XMM5,XMM4)
1123:       SSE_OR_PS(XMM5,XMM6)
1124:       SSE_INLINE_END_1;

1126:       /* Reduce the one SSE register to an integer register for branching */
1127:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1128:       MOVEMASK(nonzero,XMM5);

1130:       /* If block is nonzero ... */
1131:       if (nonzero) {
1132:         pv = ba + 16*diag_offset[row];
1133:         PREFETCH_L1(&pv[16]);
1134:         pj = bj + diag_offset[row] + 1;

1136:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1137:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1138:         /* but the diagonal was inverted already */
1139:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

1141:         SSE_INLINE_BEGIN_2(pv,pc)
1142:         /* Column 0, product is accumulated in XMM4 */
1143:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1144:         SSE_SHUFFLE(XMM4,XMM4,0x00)
1145:         SSE_MULT_PS(XMM4,XMM0)

1147:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1148:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1149:         SSE_MULT_PS(XMM5,XMM1)
1150:         SSE_ADD_PS(XMM4,XMM5)

1152:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1153:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1154:         SSE_MULT_PS(XMM6,XMM2)
1155:         SSE_ADD_PS(XMM4,XMM6)

1157:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1158:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1159:         SSE_MULT_PS(XMM7,XMM3)
1160:         SSE_ADD_PS(XMM4,XMM7)

1162:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1163:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

1165:         /* Column 1, product is accumulated in XMM5 */
1166:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1167:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1168:         SSE_MULT_PS(XMM5,XMM0)

1170:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1171:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1172:         SSE_MULT_PS(XMM6,XMM1)
1173:         SSE_ADD_PS(XMM5,XMM6)

1175:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1176:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1177:         SSE_MULT_PS(XMM7,XMM2)
1178:         SSE_ADD_PS(XMM5,XMM7)

1180:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1181:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1182:         SSE_MULT_PS(XMM6,XMM3)
1183:         SSE_ADD_PS(XMM5,XMM6)

1185:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1186:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

1188:         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

1190:         /* Column 2, product is accumulated in XMM6 */
1191:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1192:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1193:         SSE_MULT_PS(XMM6,XMM0)

1195:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1196:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1197:         SSE_MULT_PS(XMM7,XMM1)
1198:         SSE_ADD_PS(XMM6,XMM7)

1200:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1201:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1202:         SSE_MULT_PS(XMM7,XMM2)
1203:         SSE_ADD_PS(XMM6,XMM7)

1205:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1206:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1207:         SSE_MULT_PS(XMM7,XMM3)
1208:         SSE_ADD_PS(XMM6,XMM7)

1210:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1211:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1213:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1214:         /* Column 3, product is accumulated in XMM0 */
1215:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1216:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1217:         SSE_MULT_PS(XMM0,XMM7)

1219:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1220:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1221:         SSE_MULT_PS(XMM1,XMM7)
1222:         SSE_ADD_PS(XMM0,XMM1)

1224:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1225:         SSE_SHUFFLE(XMM1,XMM1,0x00)
1226:         SSE_MULT_PS(XMM1,XMM2)
1227:         SSE_ADD_PS(XMM0,XMM1)

1229:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1230:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1231:         SSE_MULT_PS(XMM3,XMM7)
1232:         SSE_ADD_PS(XMM0,XMM3)

1234:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1235:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

1237:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1238:         /* This is code to be maintained and read by humans after all. */
1239:         /* Copy Multiplier Col 3 into XMM3 */
1240:         SSE_COPY_PS(XMM3,XMM0)
1241:         /* Copy Multiplier Col 2 into XMM2 */
1242:         SSE_COPY_PS(XMM2,XMM6)
1243:         /* Copy Multiplier Col 1 into XMM1 */
1244:         SSE_COPY_PS(XMM1,XMM5)
1245:         /* Copy Multiplier Col 0 into XMM0 */
1246:         SSE_COPY_PS(XMM0,XMM4)
1247:         SSE_INLINE_END_2;

1249:         /* Update the row: */
1250:         nz  = bi[row+1] - diag_offset[row] - 1;
1251:         pv += 16;
1252:         for (j=0; j<nz; j++) {
1253:           PREFETCH_L1(&pv[16]);
1254:           x = rtmp + 16*((unsigned int)pj[j]);
1255: /*            x = rtmp + 4*pj[j]; */

1257:           /* X:=X-M*PV, One column at a time */
1258:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1259:           SSE_INLINE_BEGIN_2(x,pv)
1260:           /* Load First Column of X */
1261:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1262:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1264:           /* Matrix-Vector Product: */
1265:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1266:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1267:           SSE_MULT_PS(XMM5,XMM0)
1268:           SSE_SUB_PS(XMM4,XMM5)

1270:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1271:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1272:           SSE_MULT_PS(XMM6,XMM1)
1273:           SSE_SUB_PS(XMM4,XMM6)

1275:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1276:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1277:           SSE_MULT_PS(XMM7,XMM2)
1278:           SSE_SUB_PS(XMM4,XMM7)

1280:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1281:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1282:           SSE_MULT_PS(XMM5,XMM3)
1283:           SSE_SUB_PS(XMM4,XMM5)

1285:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1286:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1288:           /* Second Column */
1289:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1290:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1292:           /* Matrix-Vector Product: */
1293:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1294:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1295:           SSE_MULT_PS(XMM6,XMM0)
1296:           SSE_SUB_PS(XMM5,XMM6)

1298:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1299:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1300:           SSE_MULT_PS(XMM7,XMM1)
1301:           SSE_SUB_PS(XMM5,XMM7)

1303:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1304:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1305:           SSE_MULT_PS(XMM4,XMM2)
1306:           SSE_SUB_PS(XMM5,XMM4)

1308:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1309:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1310:           SSE_MULT_PS(XMM6,XMM3)
1311:           SSE_SUB_PS(XMM5,XMM6)

1313:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1314:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1316:           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

1318:           /* Third Column */
1319:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1320:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1322:           /* Matrix-Vector Product: */
1323:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1324:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1325:           SSE_MULT_PS(XMM7,XMM0)
1326:           SSE_SUB_PS(XMM6,XMM7)

1328:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1329:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1330:           SSE_MULT_PS(XMM4,XMM1)
1331:           SSE_SUB_PS(XMM6,XMM4)

1333:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1334:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1335:           SSE_MULT_PS(XMM5,XMM2)
1336:           SSE_SUB_PS(XMM6,XMM5)

1338:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1339:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1340:           SSE_MULT_PS(XMM7,XMM3)
1341:           SSE_SUB_PS(XMM6,XMM7)

1343:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1344:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1346:           /* Fourth Column */
1347:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1348:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

1350:           /* Matrix-Vector Product: */
1351:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1352:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1353:           SSE_MULT_PS(XMM5,XMM0)
1354:           SSE_SUB_PS(XMM4,XMM5)

1356:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1357:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1358:           SSE_MULT_PS(XMM6,XMM1)
1359:           SSE_SUB_PS(XMM4,XMM6)

1361:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1362:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1363:           SSE_MULT_PS(XMM7,XMM2)
1364:           SSE_SUB_PS(XMM4,XMM7)

1366:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1367:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1368:           SSE_MULT_PS(XMM5,XMM3)
1369:           SSE_SUB_PS(XMM4,XMM5)

1371:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1372:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1373:           SSE_INLINE_END_2;
1374:           pv += 16;
1375:         }
1376:         PetscLogFlops(128.0*nz+112.0);
1377:       }
1378:       row = (unsigned int)(*bjtmp++);
1379: /*        row = (*bjtmp++)/4; */
1380: /*        bjtmp++; */
1381:     }
1382:     /* finished row so stick it into b->a */
1383:     pv = ba + 16*bi[i];
1384:     pj = bj + bi[i];
1385:     nz = bi[i+1] - bi[i];

1387:     /* Copy x block back into pv block */
1388:     for (j=0; j<nz; j++) {
1389:       x = rtmp+16*((unsigned int)pj[j]);
1390: /*        x  = rtmp+4*pj[j]; */

1392:       SSE_INLINE_BEGIN_2(x,pv)
1393:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1394:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1395:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

1397:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1398:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

1400:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1401:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

1403:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1404:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

1406:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1407:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

1409:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1410:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1412:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1413:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

1415:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1416:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1417:       SSE_INLINE_END_2;
1418:       pv += 16;
1419:     }
1420:     /* invert diagonal block */
1421:     w = ba + 16*diag_offset[i];
1422:     if (pivotinblocks) {
1423:       PetscKernel_A_gets_inverse_A_4(w,shift);
1424:     } else {
1425:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
1426:     }
1427: /*      PetscKernel_A_gets_inverse_A_4_SSE(w); */
1428:     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1429:   }

1431:   PetscFree(rtmp);

1433:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1434:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1435:   C->assembled           = PETSC_TRUE;

1437:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1438:   /* Flop Count from inverting diagonal blocks */
1439:   SSE_SCOPE_END;
1440:   return(0);
1441: }

1445: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1446: {
1447:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1449:   int            i,j,n = a->mbs;
1450:   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1451:   unsigned int   row;
1452:   int            *ajtmpold,nz,*bi=b->i;
1453:   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1454:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1455:   MatScalar      *ba    = b->a,*aa = a->a;
1456:   int            nonzero=0;
1457: /*    int            nonzero=0,colscale = 16; */
1458:   PetscBool pivotinblocks = b->pivotinblocks;
1459:   PetscReal shift         = info->shiftamount;

1462:   SSE_SCOPE_BEGIN;

1464:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1465:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1466:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
1467:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1468: /*    if ((unsigned long)bj==(unsigned long)aj) { */
1469: /*      colscale = 4; */
1470: /*    } */
1471:   if ((unsigned long)bj==(unsigned long)aj) {
1472:     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
1473:   }

1475:   for (i=0; i<n; i++) {
1476:     nz    = bi[i+1] - bi[i];
1477:     bjtmp = bj + bi[i];
1478:     /* zero out the 4x4 block accumulators */
1479:     /* zero out one register */
1480:     XOR_PS(XMM7,XMM7);
1481:     for  (j=0; j<nz; j++) {
1482:       x = rtmp+16*((unsigned int)bjtmp[j]);
1483: /*        x = rtmp+4*bjtmp[j]; */
1484:       SSE_INLINE_BEGIN_1(x)
1485:       /* Copy zero register to memory locations */
1486:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1487:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1488:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1489:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1490:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1491:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1492:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1493:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1494:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1495:       SSE_INLINE_END_1;
1496:     }
1497:     /* load in initial (unfactored row) */
1498:     nz       = ai[i+1] - ai[i];
1499:     ajtmpold = aj + ai[i];
1500:     v        = aa + 16*ai[i];
1501:     for (j=0; j<nz; j++) {
1502:       x = rtmp+16*ajtmpold[j];
1503: /*        x = rtmp+colscale*ajtmpold[j]; */
1504:       /* Copy v block into x block */
1505:       SSE_INLINE_BEGIN_2(v,x)
1506:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1507:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1508:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

1510:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1511:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

1513:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1514:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

1516:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1517:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

1519:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1520:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

1522:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1523:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

1525:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1526:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

1528:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1529:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1530:       SSE_INLINE_END_2;

1532:       v += 16;
1533:     }
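    /*
       Added note (not in the original source): in this natural-ordering kernel row i of A
       is loaded directly through ai/aj/aa, with no row or column permutation applied. The
       while loop that follows walks the column indices of row i in the factor and, for
       every block column 'row' < i, eliminates it against the already-factored row 'row'.
    */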
1534: /*      row = (*bjtmp++)/4; */
1535:     row = (unsigned int)(*bjtmp++);
1536:     while (row < i) {
1537:       pc = rtmp + 16*row;
1538:       SSE_INLINE_BEGIN_1(pc)
1539:       /* Load block from lower triangle */
1540:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1541:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1542:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

1544:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1545:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

1547:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1548:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

1550:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1551:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

1553:       /* Compare block to zero block */

1555:       SSE_COPY_PS(XMM4,XMM7)
1556:       SSE_CMPNEQ_PS(XMM4,XMM0)

1558:       SSE_COPY_PS(XMM5,XMM7)
1559:       SSE_CMPNEQ_PS(XMM5,XMM1)

1561:       SSE_COPY_PS(XMM6,XMM7)
1562:       SSE_CMPNEQ_PS(XMM6,XMM2)

1564:       SSE_CMPNEQ_PS(XMM7,XMM3)

1566:       /* Reduce the comparisons to one SSE register */
1567:       SSE_OR_PS(XMM6,XMM7)
1568:       SSE_OR_PS(XMM5,XMM4)
1569:       SSE_OR_PS(XMM5,XMM6)
1570:       SSE_INLINE_END_1;

1572:       /* Reduce the one SSE register to an integer register for branching */
1573:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1574:       MOVEMASK(nonzero,XMM5);
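      /*
         Added note (not in the original source): the compare/or/movemask sequence above
         collapses the 4x4 block held column-wise in XMM0-XMM3 into a single integer that is
         nonzero iff any entry of the block is nonzero, i.e. a vectorized form of

             nonzero = 0;
             for (k=0; k<16; k++) if (pc[k] != 0.0) { nonzero = 1; break; }

         so identically zero fill blocks skip the multiplier formation and row update below.
      */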

1576:       /* If block is nonzero ... */
1577:       if (nonzero) {
1578:         pv = ba + 16*diag_offset[row];
1579:         PREFETCH_L1(&pv[16]);
1580:         pj = bj + diag_offset[row] + 1;

1582:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1583:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1584:         /* but the diagonal was inverted already */
1585:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
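        /*
           Added note (not in the original source): in scalar form the inline block below is
           a 4x4 (column-major) matrix-matrix product,

               for (c=0; c<4; c++)
                 for (r=0; r<4; r++)
                   m[4*c+r] = pc[r]*pv[4*c] + pc[4+r]*pv[4*c+1] + pc[8+r]*pv[4*c+2] + pc[12+r]*pv[4*c+3];

           followed by pc[0..15] = m[0..15], i.e. L_ij := L_ij * inv(L_jj) with the already
           inverted pivot block stored at pv.
        */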

1587:         SSE_INLINE_BEGIN_2(pv,pc)
1588:         /* Column 0, product is accumulated in XMM4 */
1589:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1590:         SSE_SHUFFLE(XMM4,XMM4,0x00)
1591:         SSE_MULT_PS(XMM4,XMM0)

1593:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1594:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1595:         SSE_MULT_PS(XMM5,XMM1)
1596:         SSE_ADD_PS(XMM4,XMM5)

1598:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1599:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1600:         SSE_MULT_PS(XMM6,XMM2)
1601:         SSE_ADD_PS(XMM4,XMM6)

1603:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1604:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1605:         SSE_MULT_PS(XMM7,XMM3)
1606:         SSE_ADD_PS(XMM4,XMM7)

1608:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1609:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

1611:         /* Column 1, product is accumulated in XMM5 */
1612:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1613:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1614:         SSE_MULT_PS(XMM5,XMM0)

1616:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1617:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1618:         SSE_MULT_PS(XMM6,XMM1)
1619:         SSE_ADD_PS(XMM5,XMM6)

1621:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1622:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1623:         SSE_MULT_PS(XMM7,XMM2)
1624:         SSE_ADD_PS(XMM5,XMM7)

1626:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1627:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1628:         SSE_MULT_PS(XMM6,XMM3)
1629:         SSE_ADD_PS(XMM5,XMM6)

1631:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1632:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

1634:         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

1636:         /* Column 2, product is accumulated in XMM6 */
1637:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1638:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1639:         SSE_MULT_PS(XMM6,XMM0)

1641:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1642:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1643:         SSE_MULT_PS(XMM7,XMM1)
1644:         SSE_ADD_PS(XMM6,XMM7)

1646:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1647:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1648:         SSE_MULT_PS(XMM7,XMM2)
1649:         SSE_ADD_PS(XMM6,XMM7)

1651:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1652:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1653:         SSE_MULT_PS(XMM7,XMM3)
1654:         SSE_ADD_PS(XMM6,XMM7)

1656:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1657:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1659:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1660:         /* Column 3, product is accumulated in XMM0 */
1661:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1662:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1663:         SSE_MULT_PS(XMM0,XMM7)

1665:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1666:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1667:         SSE_MULT_PS(XMM1,XMM7)
1668:         SSE_ADD_PS(XMM0,XMM1)

1670:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1671:         SSE_SHUFFLE(XMM1,XMM1,0x00)
1672:         SSE_MULT_PS(XMM1,XMM2)
1673:         SSE_ADD_PS(XMM0,XMM1)

1675:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1676:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1677:         SSE_MULT_PS(XMM3,XMM7)
1678:         SSE_ADD_PS(XMM0,XMM3)

1680:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1681:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

1683:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1684:         /* This is code to be maintained and read by humans, after all. */
1685:         /* Copy Multiplier Col 3 into XMM3 */
1686:         SSE_COPY_PS(XMM3,XMM0)
1687:         /* Copy Multiplier Col 2 into XMM2 */
1688:         SSE_COPY_PS(XMM2,XMM6)
1689:         /* Copy Multiplier Col 1 into XMM1 */
1690:         SSE_COPY_PS(XMM1,XMM5)
1691:         /* Copy Multiplier Col 0 into XMM0 */
1692:         SSE_COPY_PS(XMM0,XMM4)
1693:         SSE_INLINE_END_2;

1695:         /* Update the row: */
1696:         nz  = bi[row+1] - diag_offset[row] - 1;
1697:         pv += 16;
1698:         for (j=0; j<nz; j++) {
1699:           PREFETCH_L1(&pv[16]);
1700:           x = rtmp + 16*((unsigned int)pj[j]);
1701: /*            x = rtmp + 4*pj[j]; */

1703:           /* X:=X-M*PV, One column at a time */
1704:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
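          /*
             Added note (not in the original source): each pass of this loop updates one
             trailing 4x4 block of the active row; nz = bi[row+1]-diag_offset[row]-1 is the
             number of blocks strictly to the right of the diagonal in row 'row'. In scalar
             form the update is roughly

                 for (c=0; c<4; c++)
                   for (r=0; r<4; r++)
                     x[4*c+r] -= m[r]*pv[4*c] + m[4+r]*pv[4*c+1] + m[8+r]*pv[4*c+2] + m[12+r]*pv[4*c+3];

             with m[] the multiplier block just written back into pc.
          */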
1705:           SSE_INLINE_BEGIN_2(x,pv)
1706:           /* Load First Column of X*/
1707:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1708:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1710:           /* Matrix-Vector Product: */
1711:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1712:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1713:           SSE_MULT_PS(XMM5,XMM0)
1714:           SSE_SUB_PS(XMM4,XMM5)

1716:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1717:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1718:           SSE_MULT_PS(XMM6,XMM1)
1719:           SSE_SUB_PS(XMM4,XMM6)

1721:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1722:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1723:           SSE_MULT_PS(XMM7,XMM2)
1724:           SSE_SUB_PS(XMM4,XMM7)

1726:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1727:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1728:           SSE_MULT_PS(XMM5,XMM3)
1729:           SSE_SUB_PS(XMM4,XMM5)

1731:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1732:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1734:           /* Second Column */
1735:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1736:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1738:           /* Matrix-Vector Product: */
1739:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1740:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1741:           SSE_MULT_PS(XMM6,XMM0)
1742:           SSE_SUB_PS(XMM5,XMM6)

1744:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1745:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1746:           SSE_MULT_PS(XMM7,XMM1)
1747:           SSE_SUB_PS(XMM5,XMM7)

1749:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1750:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1751:           SSE_MULT_PS(XMM4,XMM2)
1752:           SSE_SUB_PS(XMM5,XMM4)

1754:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1755:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1756:           SSE_MULT_PS(XMM6,XMM3)
1757:           SSE_SUB_PS(XMM5,XMM6)

1759:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1760:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1762:           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

1764:           /* Third Column */
1765:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1766:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1768:           /* Matrix-Vector Product: */
1769:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1770:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1771:           SSE_MULT_PS(XMM7,XMM0)
1772:           SSE_SUB_PS(XMM6,XMM7)

1774:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1775:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1776:           SSE_MULT_PS(XMM4,XMM1)
1777:           SSE_SUB_PS(XMM6,XMM4)

1779:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1780:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1781:           SSE_MULT_PS(XMM5,XMM2)
1782:           SSE_SUB_PS(XMM6,XMM5)

1784:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1785:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1786:           SSE_MULT_PS(XMM7,XMM3)
1787:           SSE_SUB_PS(XMM6,XMM7)

1789:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1790:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1792:           /* Fourth Column */
1793:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1794:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

1796:           /* Matrix-Vector Product: */
1797:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1798:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1799:           SSE_MULT_PS(XMM5,XMM0)
1800:           SSE_SUB_PS(XMM4,XMM5)

1802:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1803:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1804:           SSE_MULT_PS(XMM6,XMM1)
1805:           SSE_SUB_PS(XMM4,XMM6)

1807:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1808:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1809:           SSE_MULT_PS(XMM7,XMM2)
1810:           SSE_SUB_PS(XMM4,XMM7)

1812:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1813:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1814:           SSE_MULT_PS(XMM5,XMM3)
1815:           SSE_SUB_PS(XMM4,XMM5)

1817:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1818:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1819:           SSE_INLINE_END_2;
1820:           pv += 16;
1821:         }
1822:         PetscLogFlops(128.0*nz+112.0);
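        /*
           Added note (not in the original source): the 112 accounts for forming the 4x4
           multiplier (16 entries, 4 multiplies + 3 adds each = 64 + 48 flops); the 128*nz
           accounts for the nz trailing block updates (16 entries, 4 multiplies + 4
           subtractions each = 128 flops per block).
        */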
1823:       }
1824:       row = (unsigned int)(*bjtmp++);
1825: /*        row = (*bjtmp++)/4; */
1826: /*        bjtmp++; */
1827:     }
1828:     /* finished row so stick it into b->a */
1829:     pv = ba + 16*bi[i];
1830:     pj = bj + bi[i];
1831:     nz = bi[i+1] - bi[i];

1833:     /* Copy x block back into pv block */
1834:     for (j=0; j<nz; j++) {
1835:       x = rtmp+16*((unsigned int)pj[j]);
1836: /*        x  = rtmp+4*pj[j]; */

1838:       SSE_INLINE_BEGIN_2(x,pv)
1839:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1840:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1841:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

1843:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1844:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

1846:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1847:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

1849:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1850:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

1852:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1853:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

1855:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1856:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1858:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1859:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

1861:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1862:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1863:       SSE_INLINE_END_2;
1864:       pv += 16;
1865:     }
1866:     /* invert diagonal block */
1867:     w = ba + 16*diag_offset[i];
1868:     if (pivotinblocks) {
1869:       PetscKernel_A_gets_inverse_A_4(w,shift);
1870:     } else {
1871:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
1872:     }
1873: /*      PetscKernel_A_gets_inverse_A_4_SSE(w); */
1874:     /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1875:   }

1877:   PetscFree(rtmp);

1879:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1880:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1881:   C->assembled           = PETSC_TRUE;

1883:   PetscLogFlops(1.333333333333*4.0*16.0*b->mbs); /* bs = 4, bs2 = 16 for this fixed-block-size kernel */
1884:   /* Flop Count from inverting diagonal blocks */
1885:   SSE_SCOPE_END;
1886:   return(0);
1887: }

1889: #endif
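
/*
   Added usage note (not part of the original source): these numeric kernels are not called
   directly; the symbolic factorization presumably installs them in C->ops->lufactornumeric,
   and they are reached through the standard PETSc factorization sequence, roughly

       Mat            A,F;
       MatFactorInfo  info;
       IS             rperm,cperm;
       Vec            b,x;

       MatCreateSeqBAIJ(PETSC_COMM_SELF,4,m,m,0,nnz,&A);    block size 4 selects the 4x4 kernels
       ... assemble A ...
       MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);
       MatGetOrdering(A,MATORDERINGNATURAL,&rperm,&cperm);  natural ordering picks the
                                                            _NaturalOrdering variants
       MatFactorInfoInitialize(&info);
       MatLUFactorSymbolic(F,A,rperm,cperm,&info);
       MatLUFactorNumeric(F,A,&info);
       MatSolve(F,b,x);

   (illustrative call sequence only; m, nnz, b and x are assumed to be set up elsewhere, and
   error checking and object destruction are omitted).
*/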