Actual source code: baijfact11.c

petsc-3.3-p7 2013-05-11
  2: /*
  3:     Factorization code for BAIJ format. 
  4: */
  5: #include <../src/mat/impls/baij/seq/baij.h>
  6: #include <../src/mat/blockinvert.h>

  8: /* ------------------------------------------------------------*/
  9: /*
 10:       Version for when blocks are 4 by 4
 11: */
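/*
   Added sketch (not part of the PETSc source): the unrolled arithmetic in the routines
   below assumes each 4x4 block is stored column-major, i.e. entry (r,c) of a block sits
   at offset r + 4*c.  With that layout the p/x products form the multiplier
   pc := pc * pv, where pv already holds the inverted diagonal block of the pivot row.
   A plain-C equivalent of that block product:
*/
static void Sketch_4x4_A_gets_A_times_B(MatScalar *A,const MatScalar *B)
{
  MatScalar T[16];
  PetscInt  r,c,k;
  for (r=0; r<16; r++) T[r] = A[r];   /* copy A, since it is overwritten below */
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      A[r+4*c] = 0.0;
      for (k=0; k<4; k++) A[r+4*c] += T[r+4*k]*B[k+4*c];
    }
  }
}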
 14: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
 15: {
 16:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
 17:   IS             isrow = b->row,isicol = b->icol;
 19:   const PetscInt *r,*ic;
 20:   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
 21:   PetscInt       *ajtmpold,*ajtmp,nz,row;
 22:   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
 23:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
 24:   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
 25:   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
 26:   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
 27:   MatScalar      m13,m14,m15,m16;
 28:   MatScalar      *ba = b->a,*aa = a->a;
 29:   PetscBool      pivotinblocks = b->pivotinblocks;
 30:   PetscReal      shift = info->shiftamount;

 33:   ISGetIndices(isrow,&r);
 34:   ISGetIndices(isicol,&ic);
 35:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);

 37:   for (i=0; i<n; i++) {
 38:     nz    = bi[i+1] - bi[i];
 39:     ajtmp = bj + bi[i];
 40:     for  (j=0; j<nz; j++) {
 41:       x = rtmp+16*ajtmp[j];
 42:       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
 43:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
 44:     }
 45:     /* load in initial (unfactored row) */
 46:     idx      = r[i];
 47:     nz       = ai[idx+1] - ai[idx];
 48:     ajtmpold = aj + ai[idx];
 49:     v        = aa + 16*ai[idx];
 50:     for (j=0; j<nz; j++) {
 51:       x    = rtmp+16*ic[ajtmpold[j]];
 52:       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
 53:       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
 54:       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
 55:       x[14] = v[14]; x[15] = v[15];
 56:       v    += 16;
 57:     }
 58:     row = *ajtmp++;
 59:     while (row < i) {
 60:       pc  = rtmp + 16*row;
 61:       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
 62:       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
 63:       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
 64:       p15 = pc[14]; p16 = pc[15];
 65:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
 66:           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
 67:           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
 68:           || p16 != 0.0) {
 69:         pv = ba + 16*diag_offset[row];
 70:         pj = bj + diag_offset[row] + 1;
 71:         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
 72:         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
 73:         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
 74:         x15 = pv[14]; x16 = pv[15];
 75:         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
 76:         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
 77:         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
 78:         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

 80:         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
 81:         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
 82:         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
 83:         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

 85:         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
 86:         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
 87:         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
 88:         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

 90:         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
 91:         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
 92:         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
 93:         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;

 95:         nz = bi[row+1] - diag_offset[row] - 1;
 96:         pv += 16;
 97:         for (j=0; j<nz; j++) {
 98:           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
 99:           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
100:           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
101:           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
102:           x    = rtmp + 16*pj[j];
103:           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
104:           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
105:           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
106:           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

108:           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
109:           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
110:           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
111:           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

113:           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
114:           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
115:           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
116:           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

118:           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
119:           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
120:           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
121:           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

123:           pv   += 16;
124:         }
125:         PetscLogFlops(128.0*nz+112.0);
126:       }
127:       row = *ajtmp++;
128:     }
129:     /* finished row so stick it into b->a */
130:     pv = ba + 16*bi[i];
131:     pj = bj + bi[i];
132:     nz = bi[i+1] - bi[i];
133:     for (j=0; j<nz; j++) {
134:       x      = rtmp+16*pj[j];
135:       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
136:       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
137:       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
138:       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
139:       pv   += 16;
140:     }
141:     /* invert diagonal block */
142:     w    = ba + 16*diag_offset[i];
143:     if (pivotinblocks) {
144:       PetscKernel_A_gets_inverse_A_4(w,shift);
145:     } else {
146:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
147:     }
148:   }

150:   PetscFree(rtmp);
151:   ISRestoreIndices(isicol,&ic);
152:   ISRestoreIndices(isrow,&r);
153:   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
154:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
155:   C->assembled = PETSC_TRUE;
156:   PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
157:   return(0);
158: }

160: /* MatLUFactorNumeric_SeqBAIJ_4 - 
161:      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 
162:        PetscKernel_A_gets_A_times_B()
163:        PetscKernel_A_gets_A_minus_B_times_C()
164:        PetscKernel_A_gets_inverse_A()
165: */
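/*
   Added sketch (not the actual kernel definitions): for bs = 4 the kernels named above are
   assumed to compute, on column-major 4x4 blocks,

     PetscKernel_A_gets_A_times_B_4(A,B,W)          A = A*B        (W is 4x4 workspace)
     PetscKernel_A_gets_A_minus_B_times_C_4(A,B,C)  A = A - B*C
     PetscKernel_A_gets_inverse_A_4(A,shift)        A = inv(A), shifting zero pivots by 'shift'

   matching the commented-out generic calls kept inside the routine below.  A plain-C
   sketch of the rank-update kernel:
*/
static void Sketch_A_gets_A_minus_B_times_C_4(MatScalar *A,const MatScalar *B,const MatScalar *C)
{
  PetscInt r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      for (k=0; k<4; k++) A[r+4*c] -= B[r+4*k]*C[k+4*c];  /* A(r,c) -= sum_k B(r,k)*C(k,c) */
    }
  }
}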

169: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
170: {
171:   Mat            C=B;
172:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
173:   IS             isrow = b->row,isicol = b->icol;
175:   const PetscInt *r,*ic;
176:   PetscInt       i,j,k,nz,nzL,row;
177:   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
178:   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
179:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
180:   PetscInt       flg;
181:   PetscReal      shift;

184:   ISGetIndices(isrow,&r);
185:   ISGetIndices(isicol,&ic);

187:   if (info->shifttype == MAT_SHIFT_NONE){
188:     shift = 0;
189:   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
190:     shift = info->shiftamount;
191:   }

193:   /* generate work space needed by the factorization */
194:   PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
195:   PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));

197:   for (i=0; i<n; i++){
198:     /* zero rtmp */
199:     /* L part */
200:     nz    = bi[i+1] - bi[i];
201:     bjtmp = bj + bi[i];
202:     for  (j=0; j<nz; j++){
203:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
204:     }

206:     /* U part */
207:     nz = bdiag[i]-bdiag[i+1];
208:     bjtmp = bj + bdiag[i+1]+1;
209:     for  (j=0; j<nz; j++){
210:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
211:     }
212: 
213:     /* load in initial (unfactored row) */
214:     nz    = ai[r[i]+1] - ai[r[i]];
215:     ajtmp = aj + ai[r[i]];
216:     v     = aa + bs2*ai[r[i]];
217:     for (j=0; j<nz; j++) {
218:       PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
219:     }

221:     /* elimination */
222:     bjtmp = bj + bi[i];
223:     nzL   = bi[i+1] - bi[i];
224:     for(k=0;k < nzL;k++) {
225:       row = bjtmp[k];
226:       pc = rtmp + bs2*row;
227:       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
228:       if (flg) {
229:         pv = b->a + bs2*bdiag[row];
230:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
231:         PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);
232: 
233:         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
234:         pv = b->a + bs2*(bdiag[row+1]+1);
235:         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
236:         for (j=0; j<nz; j++) {
237:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
238:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
239:           v    = rtmp + bs2*pj[j];
240:           PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
241:           pv  += bs2;
242:         }
243:         PetscLogFlops(128*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 = 128*nz + 112 for bs=4 */
244:       }
245:     }

247:     /* finished row so stick it into b->a */
248:     /* L part */
249:     pv   = b->a + bs2*bi[i] ;
250:     pj   = b->j + bi[i] ;
251:     nz   = bi[i+1] - bi[i];
252:     for (j=0; j<nz; j++) {
253:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
254:     }

256:     /* Mark diagonal and invert diagonal for simpler triangular solves */
257:     pv   = b->a + bs2*bdiag[i];
258:     pj   = b->j + bdiag[i];
259:     PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
260:     PetscKernel_A_gets_inverse_A_4(pv,shift);
261: 
262:     /* U part */
263:     pv = b->a + bs2*(bdiag[i+1]+1);
264:     pj = b->j + bdiag[i+1]+1;
265:     nz = bdiag[i] - bdiag[i+1] - 1;
266:     for (j=0; j<nz; j++){
267:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
268:     }
269:   }

271:   PetscFree2(rtmp,mwork);
272:   ISRestoreIndices(isicol,&ic);
273:   ISRestoreIndices(isrow,&r);
274:   C->ops->solve          = MatSolve_SeqBAIJ_4;
275:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
276:   C->assembled = PETSC_TRUE;
277:   PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
278:   return(0);
279: }
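/*
   Added note (assumed behaviour of the solve stage, which is implemented elsewhere):
   because every diagonal block is stored already inverted, MatSolve_SeqBAIJ_4 can apply
   it with a plain 4x4 matrix-vector product,

       x_i = inv(U_ii) * ( b_i - sum_{j>i} U_ij * x_j ),

   i.e. 16 multiply-adds per block row instead of a dense 4x4 solve, which is why the
   factorization above marks and inverts the diagonal blocks.
*/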

283: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
284: {
285: /*
286:     Default Version for when blocks are 4 by 4 Using natural ordering
287: */
288:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
290:   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
291:   PetscInt       *ajtmpold,*ajtmp,nz,row;
292:   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
293:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
294:   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
295:   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
296:   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
297:   MatScalar      m13,m14,m15,m16;
298:   MatScalar      *ba = b->a,*aa = a->a;
299:   PetscBool      pivotinblocks = b->pivotinblocks;
300:   PetscReal      shift = info->shiftamount;

303:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);

305:   for (i=0; i<n; i++) {
306:     nz    = bi[i+1] - bi[i];
307:     ajtmp = bj + bi[i];
308:     for  (j=0; j<nz; j++) {
309:       x = rtmp+16*ajtmp[j];
310:       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
311:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
312:     }
313:     /* load in initial (unfactored row) */
314:     nz       = ai[i+1] - ai[i];
315:     ajtmpold = aj + ai[i];
316:     v        = aa + 16*ai[i];
317:     for (j=0; j<nz; j++) {
318:       x    = rtmp+16*ajtmpold[j];
319:       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
320:       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
321:       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
322:       x[14] = v[14]; x[15] = v[15];
323:       v    += 16;
324:     }
325:     row = *ajtmp++;
326:     while (row < i) {
327:       pc  = rtmp + 16*row;
328:       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
329:       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
330:       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
331:       p15 = pc[14]; p16 = pc[15];
332:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
333:           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
334:           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
335:           || p16 != 0.0) {
336:         pv = ba + 16*diag_offset[row];
337:         pj = bj + diag_offset[row] + 1;
338:         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
339:         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
340:         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
341:         x15 = pv[14]; x16 = pv[15];
342:         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
343:         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
344:         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
345:         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

347:         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
348:         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
349:         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
350:         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

352:         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
353:         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
354:         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
355:         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

357:         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
358:         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
359:         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
360:         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
361:         nz = bi[row+1] - diag_offset[row] - 1;
362:         pv += 16;
363:         for (j=0; j<nz; j++) {
364:           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
365:           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
366:           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
367:           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
368:           x    = rtmp + 16*pj[j];
369:           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
370:           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
371:           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
372:           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

374:           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
375:           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
376:           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
377:           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

379:           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
380:           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
381:           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
382:           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

384:           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
385:           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
386:           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
387:           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

389:           pv   += 16;
390:         }
391:         PetscLogFlops(128.0*nz+112.0);
392:       }
393:       row = *ajtmp++;
394:     }
395:     /* finished row so stick it into b->a */
396:     pv = ba + 16*bi[i];
397:     pj = bj + bi[i];
398:     nz = bi[i+1] - bi[i];
399:     for (j=0; j<nz; j++) {
400:       x      = rtmp+16*pj[j];
401:       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
402:       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
403:       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
404:       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
405:       pv   += 16;
406:     }
407:     /* invert diagonal block */
408:     w = ba + 16*diag_offset[i];
409:     if (pivotinblocks) {
410:       PetscKernel_A_gets_inverse_A_4(w,shift);
411:     } else {
412:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
413:     }
414:   }

416:   PetscFree(rtmp);
417:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
418:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
419:   C->assembled = PETSC_TRUE;
420:   PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
421:   return(0);
422: }

424: /*
425:   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
426:     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
427: */
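/*
   Added note: "natural ordering" means the row permutation r[] and column permutation ic[]
   used by MatLUFactorNumeric_SeqBAIJ_4() above are both the identity, so the routine below
   drops them and addresses row i directly, e.g.

     nz = ai[i+1] - ai[i];                           instead of   nz = ai[r[i]+1] - ai[r[i]];
     PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,...);     instead of   PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,...);

   The elimination, the inversion of the diagonal blocks, and the copy back into b->a are
   otherwise identical.
*/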
430: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
431: {
432:   Mat            C=B;
433:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
435:   PetscInt       i,j,k,nz,nzL,row;
436:   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
437:   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
438:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
439:   PetscInt       flg;
440:   PetscReal      shift;

443:   /* generate work space needed by the factorization */
444:   PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
445:   PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));

447:   if (info->shifttype == MAT_SHIFT_NONE){
448:     shift = 0;
449:   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
450:     shift = info->shiftamount;
451:   }

453:   for (i=0; i<n; i++){
454:     /* zero rtmp */
455:     /* L part */
456:     nz    = bi[i+1] - bi[i];
457:     bjtmp = bj + bi[i];
458:     for  (j=0; j<nz; j++){
459:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
460:     }

462:     /* U part */
463:     nz = bdiag[i] - bdiag[i+1];
464:     bjtmp = bj + bdiag[i+1]+1;
465:     for  (j=0; j<nz; j++){
466:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
467:     }
468: 
469:     /* load in initial (unfactored row) */
470:     nz    = ai[i+1] - ai[i];
471:     ajtmp = aj + ai[i];
472:     v     = aa + bs2*ai[i];
473:     for (j=0; j<nz; j++) {
474:       PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
475:     }

477:     /* elimination */
478:     bjtmp = bj + bi[i];
479:     nzL   = bi[i+1] - bi[i];
480:     for(k=0;k < nzL;k++) {
481:       row = bjtmp[k];
482:       pc = rtmp + bs2*row;
483:       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
484:       if (flg) {
485:         pv = b->a + bs2*bdiag[row];
486:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
487:         PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);
488: 
489:         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
490:         pv = b->a + bs2*(bdiag[row+1]+1);
491:         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
492:         for (j=0; j<nz; j++) {
493:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
494:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
495:           v    = rtmp + bs2*pj[j];
496:           PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
497:           pv  += bs2;
498:         }
499:         PetscLogFlops(128*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 = 128*nz + 112 for bs=4 */
500:       }
501:     }

503:     /* finished row so stick it into b->a */
504:     /* L part */
505:     pv   = b->a + bs2*bi[i] ;
506:     pj   = b->j + bi[i] ;
507:     nz   = bi[i+1] - bi[i];
508:     for (j=0; j<nz; j++) {
509:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
510:     }

512:     /* Mark diagonal and invert diagonal for simpler triangular solves */
513:     pv   = b->a + bs2*bdiag[i];
514:     pj   = b->j + bdiag[i];
515:     PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
516:     PetscKernel_A_gets_inverse_A_4(pv,shift);
517: 
518:     /* U part */
519:     pv = b->a + bs2*(bdiag[i+1]+1);
520:     pj = b->j + bdiag[i+1]+1;
521:     nz = bdiag[i] - bdiag[i+1] - 1;
522:     for (j=0; j<nz; j++){
523:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
524:     }
525:   }
526:   PetscFree2(rtmp,mwork);
527:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
528:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
529:   C->assembled = PETSC_TRUE;
530:   PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
531:   return(0);
532: }

534: #if defined(PETSC_HAVE_SSE)

536: #include PETSC_HAVE_SSE

538: /* SSE Version for when blocks are 4 by 4 Using natural ordering */
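/*
   Added sketch: the CMPNEQ/OR/MOVEMASK sequence inside the elimination loop below is the
   SSE analogue of the scalar "is this 4x4 block entirely zero?" test used in the routines
   above (assuming single-precision MatScalar, four values per XMM register).  In plain C:
*/
static int Sketch_Block4x4_IsNonzero(const MatScalar *pc)
{
  int j,nonzero = 0;
  for (j=0; j<16; j++) {
    if (pc[j] != 0.0) { nonzero = 1; break; }  /* any nonzero entry means the block must be eliminated */
  }
  return nonzero;
}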
541: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
542: {
543:   Mat         C = B;
544:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
545:   int i,j,n = a->mbs;
546:   int         *bj = b->j,*bjtmp,*pj;
547:   int         row;
548:   int         *ajtmpold,nz,*bi=b->i;
549:   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j;
550:   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
551:   MatScalar   *ba = b->a,*aa = a->a;
552:   int         nonzero=0;
553: /*    int            nonzero=0,colscale = 16; */
554:   PetscBool   pivotinblocks = b->pivotinblocks;
555:   PetscReal      shift = info->shiftamount;

558:   SSE_SCOPE_BEGIN;

560:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
561:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
562:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
563:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
564: /*    if ((unsigned long)bj==(unsigned long)aj) { */
565: /*      colscale = 4; */
566: /*    } */
567:   for (i=0; i<n; i++) {
568:     nz    = bi[i+1] - bi[i];
569:     bjtmp = bj + bi[i];
570:     /* zero out the 4x4 block accumulators */
571:     /* zero out one register */
572:     XOR_PS(XMM7,XMM7);
573:     for  (j=0; j<nz; j++) {
574:       x = rtmp+16*bjtmp[j];
575: /*        x = rtmp+4*bjtmp[j]; */
576:       SSE_INLINE_BEGIN_1(x)
577:         /* Copy zero register to memory locations */
578:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
579:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
580:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
581:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
582:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
583:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
584:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
585:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
586:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
587:       SSE_INLINE_END_1;
588:     }
589:     /* load in initial (unfactored row) */
590:     nz       = ai[i+1] - ai[i];
591:     ajtmpold = aj + ai[i];
592:     v        = aa + 16*ai[i];
593:     for (j=0; j<nz; j++) {
594:       x = rtmp+16*ajtmpold[j];
595: /*        x = rtmp+colscale*ajtmpold[j]; */
596:       /* Copy v block into x block */
597:       SSE_INLINE_BEGIN_2(v,x)
598:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
599:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
600:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

602:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
603:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

605:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
606:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

608:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
609:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

611:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
612:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

614:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
615:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

617:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
618:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

620:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
621:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
622:       SSE_INLINE_END_2;

624:       v += 16;
625:     }
626: /*      row = (*bjtmp++)/4; */
627:     row = *bjtmp++;
628:     while (row < i) {
629:       pc  = rtmp + 16*row;
630:       SSE_INLINE_BEGIN_1(pc)
631:         /* Load block from lower triangle */
632:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
633:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
634:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

636:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
637:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

639:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
640:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

642:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
643:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

645:         /* Compare block to zero block */

647:         SSE_COPY_PS(XMM4,XMM7)
648:         SSE_CMPNEQ_PS(XMM4,XMM0)

650:         SSE_COPY_PS(XMM5,XMM7)
651:         SSE_CMPNEQ_PS(XMM5,XMM1)

653:         SSE_COPY_PS(XMM6,XMM7)
654:         SSE_CMPNEQ_PS(XMM6,XMM2)

656:         SSE_CMPNEQ_PS(XMM7,XMM3)

658:         /* Reduce the comparisons to one SSE register */
659:         SSE_OR_PS(XMM6,XMM7)
660:         SSE_OR_PS(XMM5,XMM4)
661:         SSE_OR_PS(XMM5,XMM6)
662:       SSE_INLINE_END_1;

664:       /* Reduce the one SSE register to an integer register for branching */
665:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
666:       MOVEMASK(nonzero,XMM5);

668:       /* If block is nonzero ... */
669:       if (nonzero) {
670:         pv = ba + 16*diag_offset[row];
671:         PREFETCH_L1(&pv[16]);
672:         pj = bj + diag_offset[row] + 1;

674:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
675:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
676:         /* but the diagonal was inverted already */
677:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

679:         SSE_INLINE_BEGIN_2(pv,pc)
680:           /* Column 0, product is accumulated in XMM4 */
681:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
682:           SSE_SHUFFLE(XMM4,XMM4,0x00)
683:           SSE_MULT_PS(XMM4,XMM0)

685:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
686:           SSE_SHUFFLE(XMM5,XMM5,0x00)
687:           SSE_MULT_PS(XMM5,XMM1)
688:           SSE_ADD_PS(XMM4,XMM5)

690:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
691:           SSE_SHUFFLE(XMM6,XMM6,0x00)
692:           SSE_MULT_PS(XMM6,XMM2)
693:           SSE_ADD_PS(XMM4,XMM6)

695:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
696:           SSE_SHUFFLE(XMM7,XMM7,0x00)
697:           SSE_MULT_PS(XMM7,XMM3)
698:           SSE_ADD_PS(XMM4,XMM7)

700:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
701:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

703:           /* Column 1, product is accumulated in XMM5 */
704:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
705:           SSE_SHUFFLE(XMM5,XMM5,0x00)
706:           SSE_MULT_PS(XMM5,XMM0)

708:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
709:           SSE_SHUFFLE(XMM6,XMM6,0x00)
710:           SSE_MULT_PS(XMM6,XMM1)
711:           SSE_ADD_PS(XMM5,XMM6)

713:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
714:           SSE_SHUFFLE(XMM7,XMM7,0x00)
715:           SSE_MULT_PS(XMM7,XMM2)
716:           SSE_ADD_PS(XMM5,XMM7)

718:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
719:           SSE_SHUFFLE(XMM6,XMM6,0x00)
720:           SSE_MULT_PS(XMM6,XMM3)
721:           SSE_ADD_PS(XMM5,XMM6)

723:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
724:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

726:           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

728:           /* Column 2, product is accumulated in XMM6 */
729:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
730:           SSE_SHUFFLE(XMM6,XMM6,0x00)
731:           SSE_MULT_PS(XMM6,XMM0)

733:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
734:           SSE_SHUFFLE(XMM7,XMM7,0x00)
735:           SSE_MULT_PS(XMM7,XMM1)
736:           SSE_ADD_PS(XMM6,XMM7)

738:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
739:           SSE_SHUFFLE(XMM7,XMM7,0x00)
740:           SSE_MULT_PS(XMM7,XMM2)
741:           SSE_ADD_PS(XMM6,XMM7)

743:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
744:           SSE_SHUFFLE(XMM7,XMM7,0x00)
745:           SSE_MULT_PS(XMM7,XMM3)
746:           SSE_ADD_PS(XMM6,XMM7)
747: 
748:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
749:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

751:           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
752:           /* Column 3, product is accumulated in XMM0 */
753:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
754:           SSE_SHUFFLE(XMM7,XMM7,0x00)
755:           SSE_MULT_PS(XMM0,XMM7)

757:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
758:           SSE_SHUFFLE(XMM7,XMM7,0x00)
759:           SSE_MULT_PS(XMM1,XMM7)
760:           SSE_ADD_PS(XMM0,XMM1)

762:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
763:           SSE_SHUFFLE(XMM1,XMM1,0x00)
764:           SSE_MULT_PS(XMM1,XMM2)
765:           SSE_ADD_PS(XMM0,XMM1)

767:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
768:           SSE_SHUFFLE(XMM7,XMM7,0x00)
769:           SSE_MULT_PS(XMM3,XMM7)
770:           SSE_ADD_PS(XMM0,XMM3)

772:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
773:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

775:           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
776:           /* This is code to be maintained and read by humans after all. */
777:           /* Copy Multiplier Col 3 into XMM3 */
778:           SSE_COPY_PS(XMM3,XMM0)
779:           /* Copy Multiplier Col 2 into XMM2 */
780:           SSE_COPY_PS(XMM2,XMM6)
781:           /* Copy Multiplier Col 1 into XMM1 */
782:           SSE_COPY_PS(XMM1,XMM5)
783:           /* Copy Multiplier Col 0 into XMM0 */
784:           SSE_COPY_PS(XMM0,XMM4)
785:         SSE_INLINE_END_2;

787:         /* Update the row: */
788:         nz = bi[row+1] - diag_offset[row] - 1;
789:         pv += 16;
790:         for (j=0; j<nz; j++) {
791:           PREFETCH_L1(&pv[16]);
792:           x = rtmp + 16*pj[j];
793: /*            x = rtmp + 4*pj[j]; */

795:           /* X:=X-M*PV, One column at a time */
796:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
797:           SSE_INLINE_BEGIN_2(x,pv)
798:             /* Load First Column of X*/
799:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
800:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

802:             /* Matrix-Vector Product: */
803:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
804:             SSE_SHUFFLE(XMM5,XMM5,0x00)
805:             SSE_MULT_PS(XMM5,XMM0)
806:             SSE_SUB_PS(XMM4,XMM5)

808:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
809:             SSE_SHUFFLE(XMM6,XMM6,0x00)
810:             SSE_MULT_PS(XMM6,XMM1)
811:             SSE_SUB_PS(XMM4,XMM6)

813:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
814:             SSE_SHUFFLE(XMM7,XMM7,0x00)
815:             SSE_MULT_PS(XMM7,XMM2)
816:             SSE_SUB_PS(XMM4,XMM7)

818:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
819:             SSE_SHUFFLE(XMM5,XMM5,0x00)
820:             SSE_MULT_PS(XMM5,XMM3)
821:             SSE_SUB_PS(XMM4,XMM5)

823:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
824:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

826:             /* Second Column */
827:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
828:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

830:             /* Matrix-Vector Product: */
831:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
832:             SSE_SHUFFLE(XMM6,XMM6,0x00)
833:             SSE_MULT_PS(XMM6,XMM0)
834:             SSE_SUB_PS(XMM5,XMM6)

836:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
837:             SSE_SHUFFLE(XMM7,XMM7,0x00)
838:             SSE_MULT_PS(XMM7,XMM1)
839:             SSE_SUB_PS(XMM5,XMM7)

841:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
842:             SSE_SHUFFLE(XMM4,XMM4,0x00)
843:             SSE_MULT_PS(XMM4,XMM2)
844:             SSE_SUB_PS(XMM5,XMM4)

846:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
847:             SSE_SHUFFLE(XMM6,XMM6,0x00)
848:             SSE_MULT_PS(XMM6,XMM3)
849:             SSE_SUB_PS(XMM5,XMM6)
850: 
851:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
852:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

854:             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

856:             /* Third Column */
857:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
858:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

860:             /* Matrix-Vector Product: */
861:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
862:             SSE_SHUFFLE(XMM7,XMM7,0x00)
863:             SSE_MULT_PS(XMM7,XMM0)
864:             SSE_SUB_PS(XMM6,XMM7)

866:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
867:             SSE_SHUFFLE(XMM4,XMM4,0x00)
868:             SSE_MULT_PS(XMM4,XMM1)
869:             SSE_SUB_PS(XMM6,XMM4)

871:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
872:             SSE_SHUFFLE(XMM5,XMM5,0x00)
873:             SSE_MULT_PS(XMM5,XMM2)
874:             SSE_SUB_PS(XMM6,XMM5)

876:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
877:             SSE_SHUFFLE(XMM7,XMM7,0x00)
878:             SSE_MULT_PS(XMM7,XMM3)
879:             SSE_SUB_PS(XMM6,XMM7)
880: 
881:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
882:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
883: 
884:             /* Fourth Column */
885:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
886:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

888:             /* Matrix-Vector Product: */
889:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
890:             SSE_SHUFFLE(XMM5,XMM5,0x00)
891:             SSE_MULT_PS(XMM5,XMM0)
892:             SSE_SUB_PS(XMM4,XMM5)

894:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
895:             SSE_SHUFFLE(XMM6,XMM6,0x00)
896:             SSE_MULT_PS(XMM6,XMM1)
897:             SSE_SUB_PS(XMM4,XMM6)

899:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
900:             SSE_SHUFFLE(XMM7,XMM7,0x00)
901:             SSE_MULT_PS(XMM7,XMM2)
902:             SSE_SUB_PS(XMM4,XMM7)

904:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
905:             SSE_SHUFFLE(XMM5,XMM5,0x00)
906:             SSE_MULT_PS(XMM5,XMM3)
907:             SSE_SUB_PS(XMM4,XMM5)
908: 
909:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
910:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
911:           SSE_INLINE_END_2;
912:           pv   += 16;
913:         }
914:         PetscLogFlops(128.0*nz+112.0);
915:       }
916:       row = *bjtmp++;
917: /*        row = (*bjtmp++)/4; */
918:     }
919:     /* finished row so stick it into b->a */
920:     pv = ba + 16*bi[i];
921:     pj = bj + bi[i];
922:     nz = bi[i+1] - bi[i];

924:     /* Copy x block back into pv block */
925:     for (j=0; j<nz; j++) {
926:       x  = rtmp+16*pj[j];
927: /*        x  = rtmp+4*pj[j]; */

929:       SSE_INLINE_BEGIN_2(x,pv)
930:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
931:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
932:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

934:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
935:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

937:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
938:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

940:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
941:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

943:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
944:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

946:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
947:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

949:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
950:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

952:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
953:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
954:       SSE_INLINE_END_2;
955:       pv += 16;
956:     }
957:     /* invert diagonal block */
958:     w = ba + 16*diag_offset[i];
959:     if (pivotinblocks) {
960:       PetscKernel_A_gets_inverse_A_4(w,shift);
961:     } else {
962:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
963:     }
964: /*      PetscKernel_A_gets_inverse_A_4_SSE(w); */
965:     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
966:   }

968:   PetscFree(rtmp);
969:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
970:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
971:   C->assembled = PETSC_TRUE;
972:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
973:   /* Flop Count from inverting diagonal blocks */
974:   SSE_SCOPE_END;
975:   return(0);
976: }

980: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
981: {
982:   Mat            A=C;
983:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
985:   int i,j,n = a->mbs;
986:   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
987:   unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
988:   unsigned int   row;
989:   int            nz,*bi=b->i;
990:   int            *diag_offset = b->diag,*ai=a->i;
991:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
992:   MatScalar      *ba = b->a,*aa = a->a;
993:   int            nonzero=0;
994: /*    int            nonzero=0,colscale = 16; */
995:   PetscBool      pivotinblocks = b->pivotinblocks;
996:   PetscReal      shift = info->shiftamount;

999:   SSE_SCOPE_BEGIN;

1001:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1002:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1003:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
1004:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1005: /*    if ((unsigned long)bj==(unsigned long)aj) { */
1006: /*      colscale = 4; */
1007: /*    } */
1008: 
1009:   for (i=0; i<n; i++) {
1010:     nz    = bi[i+1] - bi[i];
1011:     bjtmp = bj + bi[i];
1012:     /* zero out the 4x4 block accumulators */
1013:     /* zero out one register */
1014:     XOR_PS(XMM7,XMM7);
1015:     for  (j=0; j<nz; j++) {
1016:       x = rtmp+16*((unsigned int)bjtmp[j]);
1017: /*        x = rtmp+4*bjtmp[j]; */
1018:       SSE_INLINE_BEGIN_1(x)
1019:         /* Copy zero register to memory locations */
1020:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1021:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1022:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1023:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1024:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1025:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1026:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1027:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1028:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1029:       SSE_INLINE_END_1;
1030:     }
1031:     /* load in initial (unfactored row) */
1032:     nz    = ai[i+1] - ai[i];
1033:     ajtmp = aj + ai[i];
1034:     v     = aa + 16*ai[i];
1035:     for (j=0; j<nz; j++) {
1036:       x = rtmp+16*((unsigned int)ajtmp[j]);
1037: /*        x = rtmp+colscale*ajtmp[j]; */
1038:       /* Copy v block into x block */
1039:       SSE_INLINE_BEGIN_2(v,x)
1040:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1041:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1042:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

1044:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1045:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

1047:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1048:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

1050:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1051:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

1053:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1054:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

1056:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1057:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

1059:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1060:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

1062:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1063:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1064:       SSE_INLINE_END_2;

1066:       v += 16;
1067:     }
1068: /*      row = (*bjtmp++)/4; */
1069:     row = (unsigned int)(*bjtmp++);
1070:     while (row < i) {
1071:       pc  = rtmp + 16*row;
1072:       SSE_INLINE_BEGIN_1(pc)
1073:         /* Load block from lower triangle */
1074:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1075:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1076:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

1078:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1079:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

1081:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1082:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

1084:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1085:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

1087:         /* Compare block to zero block */

1089:         SSE_COPY_PS(XMM4,XMM7)
1090:         SSE_CMPNEQ_PS(XMM4,XMM0)

1092:         SSE_COPY_PS(XMM5,XMM7)
1093:         SSE_CMPNEQ_PS(XMM5,XMM1)

1095:         SSE_COPY_PS(XMM6,XMM7)
1096:         SSE_CMPNEQ_PS(XMM6,XMM2)

1098:         SSE_CMPNEQ_PS(XMM7,XMM3)

1100:         /* Reduce the comparisons to one SSE register */
1101:         SSE_OR_PS(XMM6,XMM7)
1102:         SSE_OR_PS(XMM5,XMM4)
1103:         SSE_OR_PS(XMM5,XMM6)
1104:       SSE_INLINE_END_1;

1106:       /* Reduce the one SSE register to an integer register for branching */
1107:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1108:       MOVEMASK(nonzero,XMM5);

1110:       /* If block is nonzero ... */
1111:       if (nonzero) {
1112:         pv = ba + 16*diag_offset[row];
1113:         PREFETCH_L1(&pv[16]);
1114:         pj = bj + diag_offset[row] + 1;

1116:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1117:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1118:         /* but the diagonal was inverted already */
1119:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

1121:         SSE_INLINE_BEGIN_2(pv,pc)
1122:           /* Column 0, product is accumulated in XMM4 */
1123:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1124:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1125:           SSE_MULT_PS(XMM4,XMM0)

1127:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1128:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1129:           SSE_MULT_PS(XMM5,XMM1)
1130:           SSE_ADD_PS(XMM4,XMM5)

1132:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1133:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1134:           SSE_MULT_PS(XMM6,XMM2)
1135:           SSE_ADD_PS(XMM4,XMM6)

1137:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1138:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1139:           SSE_MULT_PS(XMM7,XMM3)
1140:           SSE_ADD_PS(XMM4,XMM7)

1142:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1143:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

1145:           /* Column 1, product is accumulated in XMM5 */
1146:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1147:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1148:           SSE_MULT_PS(XMM5,XMM0)

1150:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1151:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1152:           SSE_MULT_PS(XMM6,XMM1)
1153:           SSE_ADD_PS(XMM5,XMM6)

1155:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1156:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1157:           SSE_MULT_PS(XMM7,XMM2)
1158:           SSE_ADD_PS(XMM5,XMM7)

1160:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1161:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1162:           SSE_MULT_PS(XMM6,XMM3)
1163:           SSE_ADD_PS(XMM5,XMM6)

1165:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1166:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

1168:           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

1170:           /* Column 2, product is accumulated in XMM6 */
1171:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1172:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1173:           SSE_MULT_PS(XMM6,XMM0)

1175:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1176:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1177:           SSE_MULT_PS(XMM7,XMM1)
1178:           SSE_ADD_PS(XMM6,XMM7)

1180:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1181:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1182:           SSE_MULT_PS(XMM7,XMM2)
1183:           SSE_ADD_PS(XMM6,XMM7)

1185:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1186:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1187:           SSE_MULT_PS(XMM7,XMM3)
1188:           SSE_ADD_PS(XMM6,XMM7)
1189: 
1190:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1191:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1193:           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1194:           /* Column 3, product is accumulated in XMM0 */
1195:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1196:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1197:           SSE_MULT_PS(XMM0,XMM7)

1199:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1200:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1201:           SSE_MULT_PS(XMM1,XMM7)
1202:           SSE_ADD_PS(XMM0,XMM1)

1204:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1205:           SSE_SHUFFLE(XMM1,XMM1,0x00)
1206:           SSE_MULT_PS(XMM1,XMM2)
1207:           SSE_ADD_PS(XMM0,XMM1)

1209:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1210:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1211:           SSE_MULT_PS(XMM3,XMM7)
1212:           SSE_ADD_PS(XMM0,XMM3)

1214:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1215:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

1217:           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1218:           /* This is code to be maintained and read by humans after all. */
1219:           /* Copy Multiplier Col 3 into XMM3 */
1220:           SSE_COPY_PS(XMM3,XMM0)
1221:           /* Copy Multiplier Col 2 into XMM2 */
1222:           SSE_COPY_PS(XMM2,XMM6)
1223:           /* Copy Multiplier Col 1 into XMM1 */
1224:           SSE_COPY_PS(XMM1,XMM5)
1225:           /* Copy Multiplier Col 0 into XMM0 */
1226:           SSE_COPY_PS(XMM0,XMM4)
1227:         SSE_INLINE_END_2;

1229:         /* Update the row: */
1230:         nz = bi[row+1] - diag_offset[row] - 1;
1231:         pv += 16;
1232:         for (j=0; j<nz; j++) {
1233:           PREFETCH_L1(&pv[16]);
1234:           x = rtmp + 16*((unsigned int)pj[j]);
1235: /*            x = rtmp + 4*pj[j]; */

1237:           /* X:=X-M*PV, One column at a time */
1238:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1239:           SSE_INLINE_BEGIN_2(x,pv)
1240:             /* Load First Column of X*/
1241:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1242:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1244:             /* Matrix-Vector Product: */
1245:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1246:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1247:             SSE_MULT_PS(XMM5,XMM0)
1248:             SSE_SUB_PS(XMM4,XMM5)

1250:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1251:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1252:             SSE_MULT_PS(XMM6,XMM1)
1253:             SSE_SUB_PS(XMM4,XMM6)

1255:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1256:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1257:             SSE_MULT_PS(XMM7,XMM2)
1258:             SSE_SUB_PS(XMM4,XMM7)

1260:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1261:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1262:             SSE_MULT_PS(XMM5,XMM3)
1263:             SSE_SUB_PS(XMM4,XMM5)

1265:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1266:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1268:             /* Second Column */
1269:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1270:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1272:             /* Matrix-Vector Product: */
1273:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1274:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1275:             SSE_MULT_PS(XMM6,XMM0)
1276:             SSE_SUB_PS(XMM5,XMM6)

1278:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1279:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1280:             SSE_MULT_PS(XMM7,XMM1)
1281:             SSE_SUB_PS(XMM5,XMM7)

1283:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1284:             SSE_SHUFFLE(XMM4,XMM4,0x00)
1285:             SSE_MULT_PS(XMM4,XMM2)
1286:             SSE_SUB_PS(XMM5,XMM4)

1288:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1289:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1290:             SSE_MULT_PS(XMM6,XMM3)
1291:             SSE_SUB_PS(XMM5,XMM6)
1292: 
1293:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1294:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1296:             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

1298:             /* Third Column */
1299:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1300:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1302:             /* Matrix-Vector Product: */
1303:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1304:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1305:             SSE_MULT_PS(XMM7,XMM0)
1306:             SSE_SUB_PS(XMM6,XMM7)

1308:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1309:             SSE_SHUFFLE(XMM4,XMM4,0x00)
1310:             SSE_MULT_PS(XMM4,XMM1)
1311:             SSE_SUB_PS(XMM6,XMM4)

1313:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1314:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1315:             SSE_MULT_PS(XMM5,XMM2)
1316:             SSE_SUB_PS(XMM6,XMM5)

1318:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1319:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1320:             SSE_MULT_PS(XMM7,XMM3)
1321:             SSE_SUB_PS(XMM6,XMM7)
1322: 
1323:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1324:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1325: 
1326:             /* Fourth Column */
1327:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1328:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

1330:             /* Matrix-Vector Product: */
1331:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1332:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1333:             SSE_MULT_PS(XMM5,XMM0)
1334:             SSE_SUB_PS(XMM4,XMM5)

1336:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1337:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1338:             SSE_MULT_PS(XMM6,XMM1)
1339:             SSE_SUB_PS(XMM4,XMM6)

1341:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1342:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1343:             SSE_MULT_PS(XMM7,XMM2)
1344:             SSE_SUB_PS(XMM4,XMM7)

1346:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1347:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1348:             SSE_MULT_PS(XMM5,XMM3)
1349:             SSE_SUB_PS(XMM4,XMM5)
1350: 
1351:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1352:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1353:           SSE_INLINE_END_2;
1354:           pv   += 16;
1355:         }
1356:         PetscLogFlops(128.0*nz+112.0);
1357:       }
1358:       row = (unsigned int)(*bjtmp++);
1359: /*        row = (*bjtmp++)/4; */
1360: /*        bjtmp++; */
1361:     }
1362:     /* finished row so stick it into b->a */
1363:     pv = ba + 16*bi[i];
1364:     pj = bj + bi[i];
1365:     nz = bi[i+1] - bi[i];

1367:     /* Copy x block back into pv block */
1368:     for (j=0; j<nz; j++) {
1369:       x  = rtmp+16*((unsigned int)pj[j]);
1370: /*        x  = rtmp+4*pj[j]; */

1372:       SSE_INLINE_BEGIN_2(x,pv)
1373:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1374:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1375:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

1377:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1378:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

1380:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1381:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

1383:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1384:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

1386:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1387:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

1389:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1390:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1392:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1393:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

1395:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1396:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1397:       SSE_INLINE_END_2;
1398:       pv += 16;
1399:     }
1400:     /* invert diagonal block */
1401:     w = ba + 16*diag_offset[i];
1402:     if (pivotinblocks) {
1403:       PetscKernel_A_gets_inverse_A_4(w,shift);
1404:     } else {
1405:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
1406:     }
1407: /*      PetscKernel_A_gets_inverse_A_4_SSE(w); */
1408:     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1409:   }

1411:   PetscFree(rtmp);
1412:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1413:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1414:   C->assembled = PETSC_TRUE;
1415:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1416:   /* Flop Count from inverting diagonal blocks */
1417:   SSE_SCOPE_END;
1418:   return(0);
1419: }

1423: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1424: {
1425:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1427:   int  i,j,n = a->mbs;
1428:   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1429:   unsigned int   row;
1430:   int            *ajtmpold,nz,*bi=b->i;
1431:   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1432:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1433:   MatScalar      *ba = b->a,*aa = a->a;
1434:   int            nonzero=0;
1435: /*    int            nonzero=0,colscale = 16; */
1436:   PetscBool      pivotinblocks = b->pivotinblocks;
1437:   PetscReal      shift = info->shiftamount;

1440:   SSE_SCOPE_BEGIN;

1442:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1443:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1444:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
1445:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1446: /*    if ((unsigned long)bj==(unsigned long)aj) { */
1447: /*      colscale = 4; */
1448: /*    } */
1449:   if ((unsigned long)bj==(unsigned long)aj) {
1450:     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
1451:   }
1452: 
1453:   for (i=0; i<n; i++) {
1454:     nz    = bi[i+1] - bi[i];
1455:     bjtmp = bj + bi[i];
1456:     /* zero out the 4x4 block accumulators */
1457:     /* zero out one register */
1458:     XOR_PS(XMM7,XMM7);
1459:     for  (j=0; j<nz; j++) {
1460:       x = rtmp+16*((unsigned int)bjtmp[j]);
1461: /*        x = rtmp+4*bjtmp[j]; */
1462:       SSE_INLINE_BEGIN_1(x)
1463:         /* Copy zero register to memory locations */
1464:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1465:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1466:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1467:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1468:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1469:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1470:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1471:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1472:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1473:       SSE_INLINE_END_1;
1474:     }
1475:     /* load in initial (unfactored row) */
1476:     nz       = ai[i+1] - ai[i];
1477:     ajtmpold = aj + ai[i];
1478:     v        = aa + 16*ai[i];
1479:     for (j=0; j<nz; j++) {
1480:       x = rtmp+16*ajtmpold[j];
1481: /*        x = rtmp+colscale*ajtmpold[j]; */
1482:       /* Copy v block into x block */
1483:       SSE_INLINE_BEGIN_2(v,x)
1484:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1485:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1486:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

1488:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1489:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

1491:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1492:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

1494:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1495:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

1497:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1498:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

1500:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1501:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

1503:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1504:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

1506:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1507:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1508:       SSE_INLINE_END_2;

1510:       v += 16;
1511:     }
1512: /*      row = (*bjtmp++)/4; */
1513:     row = (unsigned int)(*bjtmp++);
1514:     while (row < i) {
1515:       pc  = rtmp + 16*row;
1516:       SSE_INLINE_BEGIN_1(pc)
1517:         /* Load block from lower triangle */
1518:         /* Note: on future SSE architectures, LOAD might be more efficient than LOADL/H */
1519:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1520:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

1522:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1523:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

1525:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1526:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

1528:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1529:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

1531:         /* Compare block to zero block */

1533:         SSE_COPY_PS(XMM4,XMM7)
1534:         SSE_CMPNEQ_PS(XMM4,XMM0)

1536:         SSE_COPY_PS(XMM5,XMM7)
1537:         SSE_CMPNEQ_PS(XMM5,XMM1)

1539:         SSE_COPY_PS(XMM6,XMM7)
1540:         SSE_CMPNEQ_PS(XMM6,XMM2)

1542:         SSE_CMPNEQ_PS(XMM7,XMM3)

1544:         /* Reduce the comparisons to one SSE register */
1545:         SSE_OR_PS(XMM6,XMM7)
1546:         SSE_OR_PS(XMM5,XMM4)
1547:         SSE_OR_PS(XMM5,XMM6)
1548:       SSE_INLINE_END_1;

1550:       /* Reduce the one SSE register to an integer register for branching */
1551:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1552:       MOVEMASK(nonzero,XMM5);
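      /* MOVEMASK collects the sign bit of each of XMM5's four lanes into the low bits of
         nonzero; the CMPNEQ results above are all ones in any lane that differs from zero,
         so nonzero != 0 exactly when this 4x4 block has a nonzero entry. */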

1554:       /* If block is nonzero ... */
1555:       if (nonzero) {
1556:         pv = ba + 16*diag_offset[row];
1557:         PREFETCH_L1(&pv[16]);
1558:         pj = bj + diag_offset[row] + 1;

1560:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1561:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1562:         /* but the diagonal was inverted already */
1563:         /* and L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
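        /* In scalar terms (blocks stored column-major), with M holding the old value of pc:
             for (c=0; c<4; c++) for (r=0; r<4; r++)
               pc[4*c+r] = M[r]*pv[4*c] + M[r+4]*pv[4*c+1] + M[r+8]*pv[4*c+2] + M[r+12]*pv[4*c+3];
           i.e. pc <- pc*pv, with pv the already-inverted diagonal block. */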

1565:         SSE_INLINE_BEGIN_2(pv,pc)
1566:           /* Column 0, product is accumulated in XMM4 */
1567:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1568:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1569:           SSE_MULT_PS(XMM4,XMM0)

1571:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1572:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1573:           SSE_MULT_PS(XMM5,XMM1)
1574:           SSE_ADD_PS(XMM4,XMM5)

1576:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1577:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1578:           SSE_MULT_PS(XMM6,XMM2)
1579:           SSE_ADD_PS(XMM4,XMM6)

1581:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1582:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1583:           SSE_MULT_PS(XMM7,XMM3)
1584:           SSE_ADD_PS(XMM4,XMM7)

1586:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1587:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

1589:           /* Column 1, product is accumulated in XMM5 */
1590:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1591:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1592:           SSE_MULT_PS(XMM5,XMM0)

1594:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1595:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1596:           SSE_MULT_PS(XMM6,XMM1)
1597:           SSE_ADD_PS(XMM5,XMM6)

1599:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1600:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1601:           SSE_MULT_PS(XMM7,XMM2)
1602:           SSE_ADD_PS(XMM5,XMM7)

1604:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1605:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1606:           SSE_MULT_PS(XMM6,XMM3)
1607:           SSE_ADD_PS(XMM5,XMM6)

1609:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1610:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

1612:           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

1614:           /* Column 2, product is accumulated in XMM6 */
1615:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1616:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1617:           SSE_MULT_PS(XMM6,XMM0)

1619:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1620:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1621:           SSE_MULT_PS(XMM7,XMM1)
1622:           SSE_ADD_PS(XMM6,XMM7)

1624:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1625:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1626:           SSE_MULT_PS(XMM7,XMM2)
1627:           SSE_ADD_PS(XMM6,XMM7)

1629:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1630:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1631:           SSE_MULT_PS(XMM7,XMM3)
1632:           SSE_ADD_PS(XMM6,XMM7)
1633: 
1634:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1635:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1637:           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1638:           /* Column 3, product is accumulated in XMM0 */
1639:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1640:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1641:           SSE_MULT_PS(XMM0,XMM7)

1643:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1644:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1645:           SSE_MULT_PS(XMM1,XMM7)
1646:           SSE_ADD_PS(XMM0,XMM1)

1648:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1649:           SSE_SHUFFLE(XMM1,XMM1,0x00)
1650:           SSE_MULT_PS(XMM1,XMM2)
1651:           SSE_ADD_PS(XMM0,XMM1)

1653:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1654:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1655:           SSE_MULT_PS(XMM3,XMM7)
1656:           SSE_ADD_PS(XMM0,XMM3)

1658:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1659:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

1661:           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1662:           /* This is code to be maintained and read by humans after all. */
1663:           /* Copy Multiplier Col 3 into XMM3 */
1664:           SSE_COPY_PS(XMM3,XMM0)
1665:           /* Copy Multiplier Col 2 into XMM2 */
1666:           SSE_COPY_PS(XMM2,XMM6)
1667:           /* Copy Multiplier Col 1 into XMM1 */
1668:           SSE_COPY_PS(XMM1,XMM5)
1669:           /* Copy Multiplier Col 0 into XMM0 */
1670:           SSE_COPY_PS(XMM0,XMM4)
1671:         SSE_INLINE_END_2;

1673:         /* Update the row: */
1674:         nz = bi[row+1] - diag_offset[row] - 1;
1675:         pv += 16;
1676:         for (j=0; j<nz; j++) {
1677:           PREFETCH_L1(&pv[16]);
1678:           x = rtmp + 16*((unsigned int)pj[j]);
1679: /*            x = rtmp + 4*pj[j]; */

1681:           /* X:=X-M*PV, One column at a time */
1682:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
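          /* In scalar terms (blocks stored column-major), with M the multiplier held in XMM0-XMM3:
               for (c=0; c<4; c++) for (r=0; r<4; r++)
                 x[4*c+r] -= M[r]*pv[4*c] + M[r+4]*pv[4*c+1] + M[r+8]*pv[4*c+2] + M[r+12]*pv[4*c+3]; */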
1683:           SSE_INLINE_BEGIN_2(x,pv)
1684:             /* Load First Column of X */
1685:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1686:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1688:             /* Matrix-Vector Product: */
1689:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1690:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1691:             SSE_MULT_PS(XMM5,XMM0)
1692:             SSE_SUB_PS(XMM4,XMM5)

1694:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1695:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1696:             SSE_MULT_PS(XMM6,XMM1)
1697:             SSE_SUB_PS(XMM4,XMM6)

1699:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1700:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1701:             SSE_MULT_PS(XMM7,XMM2)
1702:             SSE_SUB_PS(XMM4,XMM7)

1704:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1705:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1706:             SSE_MULT_PS(XMM5,XMM3)
1707:             SSE_SUB_PS(XMM4,XMM5)

1709:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1710:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1712:             /* Second Column */
1713:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1714:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1716:             /* Matrix-Vector Product: */
1717:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1718:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1719:             SSE_MULT_PS(XMM6,XMM0)
1720:             SSE_SUB_PS(XMM5,XMM6)

1722:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1723:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1724:             SSE_MULT_PS(XMM7,XMM1)
1725:             SSE_SUB_PS(XMM5,XMM7)

1727:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1728:             SSE_SHUFFLE(XMM4,XMM4,0x00)
1729:             SSE_MULT_PS(XMM4,XMM2)
1730:             SSE_SUB_PS(XMM5,XMM4)

1732:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1733:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1734:             SSE_MULT_PS(XMM6,XMM3)
1735:             SSE_SUB_PS(XMM5,XMM6)
1736: 
1737:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1738:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1740:             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

1742:             /* Third Column */
1743:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1744:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1746:             /* Matrix-Vector Product: */
1747:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1748:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1749:             SSE_MULT_PS(XMM7,XMM0)
1750:             SSE_SUB_PS(XMM6,XMM7)

1752:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1753:             SSE_SHUFFLE(XMM4,XMM4,0x00)
1754:             SSE_MULT_PS(XMM4,XMM1)
1755:             SSE_SUB_PS(XMM6,XMM4)

1757:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1758:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1759:             SSE_MULT_PS(XMM5,XMM2)
1760:             SSE_SUB_PS(XMM6,XMM5)

1762:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1763:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1764:             SSE_MULT_PS(XMM7,XMM3)
1765:             SSE_SUB_PS(XMM6,XMM7)
1766: 
1767:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1768:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1769: 
1770:             /* Fourth Column */
1771:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1772:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

1774:             /* Matrix-Vector Product: */
1775:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1776:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1777:             SSE_MULT_PS(XMM5,XMM0)
1778:             SSE_SUB_PS(XMM4,XMM5)

1780:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1781:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1782:             SSE_MULT_PS(XMM6,XMM1)
1783:             SSE_SUB_PS(XMM4,XMM6)

1785:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1786:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1787:             SSE_MULT_PS(XMM7,XMM2)
1788:             SSE_SUB_PS(XMM4,XMM7)

1790:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1791:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1792:             SSE_MULT_PS(XMM5,XMM3)
1793:             SSE_SUB_PS(XMM4,XMM5)
1794: 
1795:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1796:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1797:           SSE_INLINE_END_2;
1798:           pv   += 16;
1799:         }
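        /* Flop count: forming the 4x4 multiplier is 16*(4 mults + 3 adds) = 112 flops;
           each of the nz trailing block updates is 16*(4 mults + 4 subs) = 128 flops. */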
1800:         PetscLogFlops(128.0*nz+112.0);
1801:       }
1802:       row = (unsigned int)(*bjtmp++);
1803: /*        row = (*bjtmp++)/4; */
1804: /*        bjtmp++; */
1805:     }
1806:     /* finished row so stick it into b->a */
1807:     pv = ba + 16*bi[i];
1808:     pj = bj + bi[i];
1809:     nz = bi[i+1] - bi[i];

1811:     /* Copy x block back into pv block */
1812:     for (j=0; j<nz; j++) {
1813:       x  = rtmp+16*((unsigned int)pj[j]);
1814: /*        x  = rtmp+4*pj[j]; */

1816:       SSE_INLINE_BEGIN_2(x,pv)
1817:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1818:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1819:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

1821:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1822:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

1824:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1825:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

1827:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1828:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

1830:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1831:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

1833:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1834:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1836:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1837:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

1839:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1840:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1841:       SSE_INLINE_END_2;
1842:       pv += 16;
1843:     }
1844:     /* invert diagonal block */
1845:     w = ba + 16*diag_offset[i];
1846:     if (pivotinblocks) {
1847:       PetscKernel_A_gets_inverse_A_4(w,shift);
1848:     } else {
1849:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
1850:     }
1851: /*      PetscKernel_A_gets_inverse_A_4_SSE(w); */
1852:     /* Note: Using Cramer's rule, the flop count below may be unfairly high or low. */
1853:   }

1855:   PetscFree(rtmp);
1856:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1857:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1858:   C->assembled = PETSC_TRUE;
1859:   PetscLogFlops(1.333333333333*4*16*b->mbs);
1860:   /* Flop Count from inverting diagonal blocks */
1861:   SSE_SCOPE_END;
1862:   return(0);
1863: }

1865: #endif