Actual source code: baijfact11.c
petsc-3.3-p7 2013-05-11
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include <../src/mat/impls/baij/seq/baij.h>
6: #include <../src/mat/blockinvert.h>
8: /* ------------------------------------------------------------*/
9: /*
10: Version for when blocks are 4 by 4
11: */
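/*
   Throughout this file a 4x4 block is 16 consecutive MatScalars stored
   column-major (entries 0-3 form column 0, 4-7 column 1, ...), which is the
   convention the unrolled arithmetic below assumes.  A hypothetical accessor,
   purely to make the index expressions readable:
*/
static MatScalar Sketch_Block4Get(const MatScalar *blk,int i,int j)
{
  return blk[i + 4*j];   /* row i, column j of one 4x4 block */
}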
14: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
15: {
16: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
17: IS isrow = b->row,isicol = b->icol;
19: const PetscInt *r,*ic;
20: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
21: PetscInt *ajtmpold,*ajtmp,nz,row;
22: PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
23: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
24: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
25: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
26: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
27: MatScalar m13,m14,m15,m16;
28: MatScalar *ba = b->a,*aa = a->a;
29: PetscBool pivotinblocks = b->pivotinblocks;
30: PetscReal shift = info->shiftamount;
33: ISGetIndices(isrow,&r);
34: ISGetIndices(isicol,&ic);
35: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
37: for (i=0; i<n; i++) {
38: nz = bi[i+1] - bi[i];
39: ajtmp = bj + bi[i];
40: for (j=0; j<nz; j++) {
41: x = rtmp+16*ajtmp[j];
42: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
43: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
44: }
45: /* load in initial (unfactored row) */
46: idx = r[i];
47: nz = ai[idx+1] - ai[idx];
48: ajtmpold = aj + ai[idx];
49: v = aa + 16*ai[idx];
50: for (j=0; j<nz; j++) {
51: x = rtmp+16*ic[ajtmpold[j]];
52: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
53: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
54: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
55: x[14] = v[14]; x[15] = v[15];
56: v += 16;
57: }
58: row = *ajtmp++;
59: while (row < i) {
60: pc = rtmp + 16*row;
61: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
62: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
63: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
64: p15 = pc[14]; p16 = pc[15];
65: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
66: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
67: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
68: || p16 != 0.0) {
69: pv = ba + 16*diag_offset[row];
70: pj = bj + diag_offset[row] + 1;
71: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
72: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
73: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
74: x15 = pv[14]; x16 = pv[15];
75: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
76: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
77: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
78: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
80: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
81: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
82: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
83: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
85: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
86: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
87: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
88: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
90: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
91: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
92: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
93: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
95: nz = bi[row+1] - diag_offset[row] - 1;
96: pv += 16;
97: for (j=0; j<nz; j++) {
98: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
99: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
100: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
101: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
102: x = rtmp + 16*pj[j];
103: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
104: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
105: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
106: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
108: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
109: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
110: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
111: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
113: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
114: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
115: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
116: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
118: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
119: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
120: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
121: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
123: pv += 16;
124: }
125: PetscLogFlops(128.0*nz+112.0);
126: }
127: row = *ajtmp++;
128: }
129: /* finished row so stick it into b->a */
130: pv = ba + 16*bi[i];
131: pj = bj + bi[i];
132: nz = bi[i+1] - bi[i];
133: for (j=0; j<nz; j++) {
134: x = rtmp+16*pj[j];
135: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
136: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
137: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
138: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
139: pv += 16;
140: }
141: /* invert diagonal block */
142: w = ba + 16*diag_offset[i];
143: if (pivotinblocks) {
144: PetscKernel_A_gets_inverse_A_4(w,shift);
145: } else {
146: PetscKernel_A_gets_inverse_A_4_nopivot(w);
147: }
148: }
150: PetscFree(rtmp);
151: ISRestoreIndices(isicol,&ic);
152: ISRestoreIndices(isrow,&r);
153: C->ops->solve = MatSolve_SeqBAIJ_4_inplace;
154: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
155: C->assembled = PETSC_TRUE;
156: PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
157: return(0);
158: }
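/*
   In the routine above, r[] (from isrow) gives the original row that becomes
   factored row i, and ic[] (from isicol) maps original column numbers to
   factored positions, so the loops factor the permuted matrix without ever
   forming it; the _NaturalOrdering variants later in this file are the same
   algorithm with that indirection stripped out.
*/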
160: /* MatLUFactorNumeric_SeqBAIJ_4 -
161: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
162: PetscKernel_A_gets_A_times_B()
163: PetscKernel_A_gets_A_minus_B_times_C()
164: PetscKernel_A_gets_inverse_A()
165: */
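/*
   For reference, a minimal plain-C sketch of the semantics of the first two
   kernels named above, assuming the column-major 4x4 block layout used
   throughout this file.  The real kernels are fully unrolled macros from
   blockinvert.h; the Sketch_* names are illustrative only.
*/
static void Sketch_A_gets_A_times_B_4(MatScalar *a,const MatScalar *b,MatScalar *w)
{
  int i,j,k;
  for (i=0; i<16; i++) w[i] = a[i];                 /* w = copy of A, since A is overwritten */
  for (j=0; j<4; j++) {
    for (i=0; i<4; i++) {
      MatScalar s = 0.0;
      for (k=0; k<4; k++) s += w[i+4*k]*b[k+4*j];   /* (W*B)(i,j), column-major */
      a[i+4*j] = s;
    }
  }
}

static void Sketch_A_gets_A_minus_B_times_C_4(MatScalar *a,const MatScalar *b,const MatScalar *c)
{
  int i,j,k;
  for (j=0; j<4; j++) {
    for (i=0; i<4; i++) {
      MatScalar s = 0.0;
      for (k=0; k<4; k++) s += b[i+4*k]*c[k+4*j];
      a[i+4*j] -= s;                                /* A -= B*C */
    }
  }
}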
169: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
170: {
171: Mat C=B;
172: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
173: IS isrow = b->row,isicol = b->icol;
175: const PetscInt *r,*ic;
176: PetscInt i,j,k,nz,nzL,row;
177: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
178: const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
179: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
180: PetscInt flg;
181: PetscReal shift;
184: ISGetIndices(isrow,&r);
185: ISGetIndices(isicol,&ic);
187: if (info->shifttype == MAT_SHIFT_NONE){
188: shift = 0;
189: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
190: shift = info->shiftamount;
191: }
193: /* generate work space needed by the factorization */
194: PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
195: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
197: for (i=0; i<n; i++){
198: /* zero rtmp */
199: /* L part */
200: nz = bi[i+1] - bi[i];
201: bjtmp = bj + bi[i];
202: for (j=0; j<nz; j++){
203: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
204: }
206: /* U part */
207: nz = bdiag[i]-bdiag[i+1];
208: bjtmp = bj + bdiag[i+1]+1;
209: for (j=0; j<nz; j++){
210: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
211: }
212:
213: /* load in initial (unfactored row) */
214: nz = ai[r[i]+1] - ai[r[i]];
215: ajtmp = aj + ai[r[i]];
216: v = aa + bs2*ai[r[i]];
217: for (j=0; j<nz; j++) {
218: PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
219: }
221: /* elimination */
222: bjtmp = bj + bi[i];
223: nzL = bi[i+1] - bi[i];
224: for(k=0;k < nzL;k++) {
225: row = bjtmp[k];
226: pc = rtmp + bs2*row;
227: for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
228: if (flg) {
229: pv = b->a + bs2*bdiag[row];
230: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
231: PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);
232:
233:         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
234: pv = b->a + bs2*(bdiag[row+1]+1);
235:         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
236: for (j=0; j<nz; j++) {
237: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
238: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
239: v = rtmp + bs2*pj[j];
240: PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
241: pv += bs2;
242: }
243:         PetscLogFlops(128*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 = 128*nz + 112 for bs=4 */
244: }
245: }
247: /* finished row so stick it into b->a */
248: /* L part */
249: pv = b->a + bs2*bi[i] ;
250: pj = b->j + bi[i] ;
251: nz = bi[i+1] - bi[i];
252: for (j=0; j<nz; j++) {
253: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
254: }
256:     /* Mark diagonal and invert diagonal for simpler triangular solves */
257: pv = b->a + bs2*bdiag[i];
258: pj = b->j + bdiag[i];
259: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
260: PetscKernel_A_gets_inverse_A_4(pv,shift);
261:
262: /* U part */
263: pv = b->a + bs2*(bdiag[i+1]+1);
264: pj = b->j + bdiag[i+1]+1;
265: nz = bdiag[i] - bdiag[i+1] - 1;
266: for (j=0; j<nz; j++){
267: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
268: }
269: }
271: PetscFree2(rtmp,mwork);
272: ISRestoreIndices(isicol,&ic);
273: ISRestoreIndices(isrow,&r);
274: C->ops->solve = MatSolve_SeqBAIJ_4;
275: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
276: C->assembled = PETSC_TRUE;
277: PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
278: return(0);
279: }
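/*
   For orientation, a numeric factorization like the ones in this file is
   reached from user code roughly as follows, given a SEQBAIJ matrix with
   bs=4.  This is an illustrative helper (error checking elided, as in this
   listing; variable names are placeholders):
*/
#include <petscmat.h>
PetscErrorCode Sketch_FactorBAIJ4(Mat A,IS row,IS col)
{
  Mat           F;
  MatFactorInfo info;

  MatFactorInfoInitialize(&info);
  info.shifttype   = (PetscReal)MAT_SHIFT_INBLOCKS;   /* enable the in-block shift used above */
  info.shiftamount = 1.e-12;
  MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);
  MatLUFactorSymbolic(F,A,row,col,&info);
  MatLUFactorNumeric(F,A,&info);                      /* dispatches to a routine in this file */
  MatDestroy(&F);
  return(0);
}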
283: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
284: {
285: /*
286:      Default version for when blocks are 4 by 4, using natural ordering
287: */
288: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
290: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
291: PetscInt *ajtmpold,*ajtmp,nz,row;
292: PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
293: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
294: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
295: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
296: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
297: MatScalar m13,m14,m15,m16;
298: MatScalar *ba = b->a,*aa = a->a;
299: PetscBool pivotinblocks = b->pivotinblocks;
300: PetscReal shift = info->shiftamount;
303: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
305: for (i=0; i<n; i++) {
306: nz = bi[i+1] - bi[i];
307: ajtmp = bj + bi[i];
308: for (j=0; j<nz; j++) {
309: x = rtmp+16*ajtmp[j];
310: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
311: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
312: }
313: /* load in initial (unfactored row) */
314: nz = ai[i+1] - ai[i];
315: ajtmpold = aj + ai[i];
316: v = aa + 16*ai[i];
317: for (j=0; j<nz; j++) {
318: x = rtmp+16*ajtmpold[j];
319: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
320: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
321: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
322: x[14] = v[14]; x[15] = v[15];
323: v += 16;
324: }
325: row = *ajtmp++;
326: while (row < i) {
327: pc = rtmp + 16*row;
328: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
329: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
330: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
331: p15 = pc[14]; p16 = pc[15];
332: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
333: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
334: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
335: || p16 != 0.0) {
336: pv = ba + 16*diag_offset[row];
337: pj = bj + diag_offset[row] + 1;
338: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
339: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
340: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
341: x15 = pv[14]; x16 = pv[15];
342: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
343: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
344: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
345: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
347: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
348: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
349: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
350: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
352: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
353: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
354: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
355: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
357: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
358: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
359: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
360: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
361: nz = bi[row+1] - diag_offset[row] - 1;
362: pv += 16;
363: for (j=0; j<nz; j++) {
364: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
365: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
366: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
367: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
368: x = rtmp + 16*pj[j];
369: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
370: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
371: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
372: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
374: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
375: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
376: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
377: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
379: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
380: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
381: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
382: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
384: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
385: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
386: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
387: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
389: pv += 16;
390: }
391: PetscLogFlops(128.0*nz+112.0);
392: }
393: row = *ajtmp++;
394: }
395: /* finished row so stick it into b->a */
396: pv = ba + 16*bi[i];
397: pj = bj + bi[i];
398: nz = bi[i+1] - bi[i];
399: for (j=0; j<nz; j++) {
400: x = rtmp+16*pj[j];
401: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
402: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
403: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
404: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
405: pv += 16;
406: }
407: /* invert diagonal block */
408: w = ba + 16*diag_offset[i];
409: if (pivotinblocks) {
410: PetscKernel_A_gets_inverse_A_4(w,shift);
411: } else {
412: PetscKernel_A_gets_inverse_A_4_nopivot(w);
413: }
414: }
416: PetscFree(rtmp);
417: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
418: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
419: C->assembled = PETSC_TRUE;
420: PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
421: return(0);
422: }
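/*
   PetscKernel_A_gets_inverse_A_4(w,shift) and its _nopivot variant (from
   blockinvert.h) replace the 4x4 diagonal block with its inverse in place.
   Below, a minimal plain-C sketch of a no-pivot Gauss-Jordan inversion with
   the in-block shift idea, assuming column-major storage; the actual kernels
   are unrolled and handle pivoting and shift details differently, so this is
   reference semantics only (Sketch_* is an illustrative name):
*/
static void Sketch_Invert4x4_NoPivot(MatScalar *a,PetscReal shift)
{
  int       i,j,k;
  MatScalar p,f;
  for (k=0; k<4; k++) {
    p = a[k+4*k];
    if (p == 0.0) p = shift;                    /* zero-pivot fallback, cf. MAT_SHIFT_INBLOCKS */
    a[k+4*k] = 1.0/p;
    for (j=0; j<4; j++) if (j != k) a[k+4*j] /= p;              /* scale pivot row */
    for (i=0; i<4; i++) {
      if (i == k) continue;
      f = a[i+4*k];
      for (j=0; j<4; j++) if (j != k) a[i+4*j] -= f*a[k+4*j];   /* eliminate other rows */
      a[i+4*k] = -f/p;                                          /* scale pivot column */
    }
  }
}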
424: /*
425: MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
426: copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
427: */
430: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
431: {
432: Mat C=B;
433: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
435: PetscInt i,j,k,nz,nzL,row;
436: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
437: const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
438: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
439: PetscInt flg;
440: PetscReal shift;
443: /* generate work space needed by the factorization */
444: PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
445: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
447: if (info->shifttype == MAT_SHIFT_NONE){
448: shift = 0;
449: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
450: shift = info->shiftamount;
451: }
453: for (i=0; i<n; i++){
454: /* zero rtmp */
455: /* L part */
456: nz = bi[i+1] - bi[i];
457: bjtmp = bj + bi[i];
458: for (j=0; j<nz; j++){
459: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
460: }
462: /* U part */
463: nz = bdiag[i] - bdiag[i+1];
464: bjtmp = bj + bdiag[i+1]+1;
465: for (j=0; j<nz; j++){
466: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
467: }
468:
469: /* load in initial (unfactored row) */
470: nz = ai[i+1] - ai[i];
471: ajtmp = aj + ai[i];
472: v = aa + bs2*ai[i];
473: for (j=0; j<nz; j++) {
474: PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
475: }
477: /* elimination */
478: bjtmp = bj + bi[i];
479: nzL = bi[i+1] - bi[i];
480: for(k=0;k < nzL;k++) {
481: row = bjtmp[k];
482: pc = rtmp + bs2*row;
483: for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
484: if (flg) {
485: pv = b->a + bs2*bdiag[row];
486: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
487: PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);
488:
489:         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
490: pv = b->a + bs2*(bdiag[row+1]+1);
491:         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
492: for (j=0; j<nz; j++) {
493: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
494: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
495: v = rtmp + bs2*pj[j];
496: PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
497: pv += bs2;
498: }
499:         PetscLogFlops(128*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 = 128*nz + 112 for bs=4 */
500: }
501: }
503: /* finished row so stick it into b->a */
504: /* L part */
505: pv = b->a + bs2*bi[i] ;
506: pj = b->j + bi[i] ;
507: nz = bi[i+1] - bi[i];
508: for (j=0; j<nz; j++) {
509: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
510: }
512:     /* Mark diagonal and invert diagonal for simpler triangular solves */
513: pv = b->a + bs2*bdiag[i];
514: pj = b->j + bdiag[i];
515: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
516: PetscKernel_A_gets_inverse_A_4(pv,shift);
517:
518: /* U part */
519: pv = b->a + bs2*(bdiag[i+1]+1);
520: pj = b->j + bdiag[i+1]+1;
521: nz = bdiag[i] - bdiag[i+1] - 1;
522: for (j=0; j<nz; j++){
523: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
524: }
525: }
526: PetscFree2(rtmp,mwork);
527: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
528: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
529: C->assembled = PETSC_TRUE;
530: PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
531: return(0);
532: }
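/*
   Layout note for the two non-inplace routines above, as the index
   arithmetic implies: b->diag[] decreases with i, the diagonal block of row
   i sits at bdiag[i], and row i of U occupies positions bdiag[i+1]+1 ..
   bdiag[i]-1 of b->j/b->a, hence nz = bdiag[i] - bdiag[i+1] - 1.  For
   example (hypothetical numbers), bdiag[i] = 40 and bdiag[i+1] = 36 give
   three U blocks at positions 37,38,39 plus the diagonal at 40.
*/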
534: #if defined(PETSC_HAVE_SSE)
536: #include PETSC_HAVE_SSE
538: /* SSE version for when blocks are 4 by 4, using natural ordering */
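/*
   In the routine below, each 4x4 block lives in four XMM registers, one
   column per register (so MatScalar must be single precision here), and a
   block product is formed one column at a time: broadcast a scalar with
   SSE_LOAD_SS + SSE_SHUFFLE(...,0x00), then SSE_MULT_PS/SSE_ADD_PS against
   the resident columns.  An illustrative <xmmintrin.h> equivalent of one
   such column (Sketch_* is a hypothetical helper, not part of PETSc):
*/
#include <xmmintrin.h>
static void Sketch_BlockColumnProduct(const float P[16],const float Xcol[4],float Mcol[4])
{
  __m128 p0 = _mm_loadu_ps(P+0);                             /* the four columns of P */
  __m128 p1 = _mm_loadu_ps(P+4);
  __m128 p2 = _mm_loadu_ps(P+8);
  __m128 p3 = _mm_loadu_ps(P+12);
  __m128 acc;
  acc = _mm_mul_ps(_mm_set1_ps(Xcol[0]),p0);                 /* LOAD_SS+SHUFFLE(0x00)+MULT_PS */
  acc = _mm_add_ps(acc,_mm_mul_ps(_mm_set1_ps(Xcol[1]),p1));
  acc = _mm_add_ps(acc,_mm_mul_ps(_mm_set1_ps(Xcol[2]),p2));
  acc = _mm_add_ps(acc,_mm_mul_ps(_mm_set1_ps(Xcol[3]),p3));
  _mm_storeu_ps(Mcol,acc);                                   /* STOREL_PS + STOREH_PS */
}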
541: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
542: {
543:   Mat            C = B;
544:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
545: int i,j,n = a->mbs;
546: int *bj = b->j,*bjtmp,*pj;
547: int row;
548: int *ajtmpold,nz,*bi=b->i;
549: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
550: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
551: MatScalar *ba = b->a,*aa = a->a;
552: int nonzero=0;
553: /* int nonzero=0,colscale = 16; */
554: PetscBool pivotinblocks = b->pivotinblocks;
555: PetscReal shift = info->shiftamount;
558: SSE_SCOPE_BEGIN;
560: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
561: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
562: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
563: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
564: /* if ((unsigned long)bj==(unsigned long)aj) { */
565: /* colscale = 4; */
566: /* } */
567: for (i=0; i<n; i++) {
568: nz = bi[i+1] - bi[i];
569: bjtmp = bj + bi[i];
570: /* zero out the 4x4 block accumulators */
571: /* zero out one register */
572: XOR_PS(XMM7,XMM7);
573: for (j=0; j<nz; j++) {
574: x = rtmp+16*bjtmp[j];
575: /* x = rtmp+4*bjtmp[j]; */
576: SSE_INLINE_BEGIN_1(x)
577: /* Copy zero register to memory locations */
578: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
579: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
580: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
581: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
582: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
583: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
584: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
585: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
586: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
587: SSE_INLINE_END_1;
588: }
589: /* load in initial (unfactored row) */
590: nz = ai[i+1] - ai[i];
591: ajtmpold = aj + ai[i];
592: v = aa + 16*ai[i];
593: for (j=0; j<nz; j++) {
594: x = rtmp+16*ajtmpold[j];
595: /* x = rtmp+colscale*ajtmpold[j]; */
596: /* Copy v block into x block */
597: SSE_INLINE_BEGIN_2(v,x)
598: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
599: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
600: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
602: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
603: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
605: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
606: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
608: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
609: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
611: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
612: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
614: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
615: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
617: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
618: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
620: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
621: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
622: SSE_INLINE_END_2;
624: v += 16;
625: }
626: /* row = (*bjtmp++)/4; */
627: row = *bjtmp++;
628: while (row < i) {
629: pc = rtmp + 16*row;
630: SSE_INLINE_BEGIN_1(pc)
631: /* Load block from lower triangle */
632: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
633: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
634: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
636: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
637: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
639: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
640: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
642: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
643: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
645: /* Compare block to zero block */
647: SSE_COPY_PS(XMM4,XMM7)
648: SSE_CMPNEQ_PS(XMM4,XMM0)
650: SSE_COPY_PS(XMM5,XMM7)
651: SSE_CMPNEQ_PS(XMM5,XMM1)
653: SSE_COPY_PS(XMM6,XMM7)
654: SSE_CMPNEQ_PS(XMM6,XMM2)
656: SSE_CMPNEQ_PS(XMM7,XMM3)
658: /* Reduce the comparisons to one SSE register */
659: SSE_OR_PS(XMM6,XMM7)
660: SSE_OR_PS(XMM5,XMM4)
661: SSE_OR_PS(XMM5,XMM6)
662: SSE_INLINE_END_1;
664: /* Reduce the one SSE register to an integer register for branching */
665: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
666: MOVEMASK(nonzero,XMM5);
668: /* If block is nonzero ... */
669: if (nonzero) {
670: pv = ba + 16*diag_offset[row];
671: PREFETCH_L1(&pv[16]);
672: pj = bj + diag_offset[row] + 1;
674: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
675: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
676: /* but the diagonal was inverted already */
677: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
679: SSE_INLINE_BEGIN_2(pv,pc)
680: /* Column 0, product is accumulated in XMM4 */
681: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
682: SSE_SHUFFLE(XMM4,XMM4,0x00)
683: SSE_MULT_PS(XMM4,XMM0)
685: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
686: SSE_SHUFFLE(XMM5,XMM5,0x00)
687: SSE_MULT_PS(XMM5,XMM1)
688: SSE_ADD_PS(XMM4,XMM5)
690: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
691: SSE_SHUFFLE(XMM6,XMM6,0x00)
692: SSE_MULT_PS(XMM6,XMM2)
693: SSE_ADD_PS(XMM4,XMM6)
695: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
696: SSE_SHUFFLE(XMM7,XMM7,0x00)
697: SSE_MULT_PS(XMM7,XMM3)
698: SSE_ADD_PS(XMM4,XMM7)
700: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
701: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
703: /* Column 1, product is accumulated in XMM5 */
704: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
705: SSE_SHUFFLE(XMM5,XMM5,0x00)
706: SSE_MULT_PS(XMM5,XMM0)
708: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
709: SSE_SHUFFLE(XMM6,XMM6,0x00)
710: SSE_MULT_PS(XMM6,XMM1)
711: SSE_ADD_PS(XMM5,XMM6)
713: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
714: SSE_SHUFFLE(XMM7,XMM7,0x00)
715: SSE_MULT_PS(XMM7,XMM2)
716: SSE_ADD_PS(XMM5,XMM7)
718: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
719: SSE_SHUFFLE(XMM6,XMM6,0x00)
720: SSE_MULT_PS(XMM6,XMM3)
721: SSE_ADD_PS(XMM5,XMM6)
723: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
724: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
726: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
728: /* Column 2, product is accumulated in XMM6 */
729: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
730: SSE_SHUFFLE(XMM6,XMM6,0x00)
731: SSE_MULT_PS(XMM6,XMM0)
733: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
734: SSE_SHUFFLE(XMM7,XMM7,0x00)
735: SSE_MULT_PS(XMM7,XMM1)
736: SSE_ADD_PS(XMM6,XMM7)
738: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
739: SSE_SHUFFLE(XMM7,XMM7,0x00)
740: SSE_MULT_PS(XMM7,XMM2)
741: SSE_ADD_PS(XMM6,XMM7)
743: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
744: SSE_SHUFFLE(XMM7,XMM7,0x00)
745: SSE_MULT_PS(XMM7,XMM3)
746: SSE_ADD_PS(XMM6,XMM7)
747:
748: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
749: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
751: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
752: /* Column 3, product is accumulated in XMM0 */
753: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
754: SSE_SHUFFLE(XMM7,XMM7,0x00)
755: SSE_MULT_PS(XMM0,XMM7)
757: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
758: SSE_SHUFFLE(XMM7,XMM7,0x00)
759: SSE_MULT_PS(XMM1,XMM7)
760: SSE_ADD_PS(XMM0,XMM1)
762: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
763: SSE_SHUFFLE(XMM1,XMM1,0x00)
764: SSE_MULT_PS(XMM1,XMM2)
765: SSE_ADD_PS(XMM0,XMM1)
767: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
768: SSE_SHUFFLE(XMM7,XMM7,0x00)
769: SSE_MULT_PS(XMM3,XMM7)
770: SSE_ADD_PS(XMM0,XMM3)
772: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
773: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
775: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
776:           /* This is code to be maintained and read by humans after all. */
777: /* Copy Multiplier Col 3 into XMM3 */
778: SSE_COPY_PS(XMM3,XMM0)
779: /* Copy Multiplier Col 2 into XMM2 */
780: SSE_COPY_PS(XMM2,XMM6)
781: /* Copy Multiplier Col 1 into XMM1 */
782: SSE_COPY_PS(XMM1,XMM5)
783: /* Copy Multiplier Col 0 into XMM0 */
784: SSE_COPY_PS(XMM0,XMM4)
785: SSE_INLINE_END_2;
787: /* Update the row: */
788: nz = bi[row+1] - diag_offset[row] - 1;
789: pv += 16;
790: for (j=0; j<nz; j++) {
791: PREFETCH_L1(&pv[16]);
792: x = rtmp + 16*pj[j];
793: /* x = rtmp + 4*pj[j]; */
795: /* X:=X-M*PV, One column at a time */
796: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
797: SSE_INLINE_BEGIN_2(x,pv)
798:             /* Load First Column of X */
799: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
800: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
802: /* Matrix-Vector Product: */
803: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
804: SSE_SHUFFLE(XMM5,XMM5,0x00)
805: SSE_MULT_PS(XMM5,XMM0)
806: SSE_SUB_PS(XMM4,XMM5)
808: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
809: SSE_SHUFFLE(XMM6,XMM6,0x00)
810: SSE_MULT_PS(XMM6,XMM1)
811: SSE_SUB_PS(XMM4,XMM6)
813: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
814: SSE_SHUFFLE(XMM7,XMM7,0x00)
815: SSE_MULT_PS(XMM7,XMM2)
816: SSE_SUB_PS(XMM4,XMM7)
818: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
819: SSE_SHUFFLE(XMM5,XMM5,0x00)
820: SSE_MULT_PS(XMM5,XMM3)
821: SSE_SUB_PS(XMM4,XMM5)
823: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
824: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
826: /* Second Column */
827: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
828: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
830: /* Matrix-Vector Product: */
831: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
832: SSE_SHUFFLE(XMM6,XMM6,0x00)
833: SSE_MULT_PS(XMM6,XMM0)
834: SSE_SUB_PS(XMM5,XMM6)
836: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
837: SSE_SHUFFLE(XMM7,XMM7,0x00)
838: SSE_MULT_PS(XMM7,XMM1)
839: SSE_SUB_PS(XMM5,XMM7)
841: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
842: SSE_SHUFFLE(XMM4,XMM4,0x00)
843: SSE_MULT_PS(XMM4,XMM2)
844: SSE_SUB_PS(XMM5,XMM4)
846: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
847: SSE_SHUFFLE(XMM6,XMM6,0x00)
848: SSE_MULT_PS(XMM6,XMM3)
849: SSE_SUB_PS(XMM5,XMM6)
850:
851: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
852: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
854: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
856: /* Third Column */
857: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
858: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
860: /* Matrix-Vector Product: */
861: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
862: SSE_SHUFFLE(XMM7,XMM7,0x00)
863: SSE_MULT_PS(XMM7,XMM0)
864: SSE_SUB_PS(XMM6,XMM7)
866: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
867: SSE_SHUFFLE(XMM4,XMM4,0x00)
868: SSE_MULT_PS(XMM4,XMM1)
869: SSE_SUB_PS(XMM6,XMM4)
871: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
872: SSE_SHUFFLE(XMM5,XMM5,0x00)
873: SSE_MULT_PS(XMM5,XMM2)
874: SSE_SUB_PS(XMM6,XMM5)
876: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
877: SSE_SHUFFLE(XMM7,XMM7,0x00)
878: SSE_MULT_PS(XMM7,XMM3)
879: SSE_SUB_PS(XMM6,XMM7)
880:
881: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
882: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
883:
884: /* Fourth Column */
885: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
886: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
888: /* Matrix-Vector Product: */
889: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
890: SSE_SHUFFLE(XMM5,XMM5,0x00)
891: SSE_MULT_PS(XMM5,XMM0)
892: SSE_SUB_PS(XMM4,XMM5)
894: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
895: SSE_SHUFFLE(XMM6,XMM6,0x00)
896: SSE_MULT_PS(XMM6,XMM1)
897: SSE_SUB_PS(XMM4,XMM6)
899: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
900: SSE_SHUFFLE(XMM7,XMM7,0x00)
901: SSE_MULT_PS(XMM7,XMM2)
902: SSE_SUB_PS(XMM4,XMM7)
904: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
905: SSE_SHUFFLE(XMM5,XMM5,0x00)
906: SSE_MULT_PS(XMM5,XMM3)
907: SSE_SUB_PS(XMM4,XMM5)
908:
909: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
910: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
911: SSE_INLINE_END_2;
912: pv += 16;
913: }
914: PetscLogFlops(128.0*nz+112.0);
915: }
916: row = *bjtmp++;
917: /* row = (*bjtmp++)/4; */
918: }
919: /* finished row so stick it into b->a */
920: pv = ba + 16*bi[i];
921: pj = bj + bi[i];
922: nz = bi[i+1] - bi[i];
924: /* Copy x block back into pv block */
925: for (j=0; j<nz; j++) {
926: x = rtmp+16*pj[j];
927: /* x = rtmp+4*pj[j]; */
929: SSE_INLINE_BEGIN_2(x,pv)
930: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
931: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
932: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
934: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
935: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
937: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
938: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
940: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
941: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
943: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
944: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
946: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
947: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
949: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
950: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
952: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
953: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
954: SSE_INLINE_END_2;
955: pv += 16;
956: }
957: /* invert diagonal block */
958: w = ba + 16*diag_offset[i];
959: if (pivotinblocks) {
960: PetscKernel_A_gets_inverse_A_4(w,shift);
961: } else {
962: PetscKernel_A_gets_inverse_A_4_nopivot(w);
963: }
964: /* PetscKernel_A_gets_inverse_A_4_SSE(w); */
965:     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
966: }
968: PetscFree(rtmp);
969: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
970: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
971: C->assembled = PETSC_TRUE;
972:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
973: /* Flop Count from inverting diagonal blocks */
974: SSE_SCOPE_END;
975: return(0);
976: }
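/*
   The zero-block test in the routine above costs no per-entry branches: the
   four columns are compared lane-wise against zero, the comparison masks are
   OR-ed together, and MOVEMASK collapses the sign bits into one int.  An
   illustrative <xmmintrin.h> equivalent (assumes single precision; Sketch_*
   is a hypothetical helper):
*/
#include <xmmintrin.h>
static int Sketch_Block4x4IsNonzero(const float blk[16])
{
  __m128 zero = _mm_setzero_ps();
  __m128 m    = _mm_cmpneq_ps(_mm_loadu_ps(blk+0),zero);
  m = _mm_or_ps(m,_mm_cmpneq_ps(_mm_loadu_ps(blk+4),zero));
  m = _mm_or_ps(m,_mm_cmpneq_ps(_mm_loadu_ps(blk+8),zero));
  m = _mm_or_ps(m,_mm_cmpneq_ps(_mm_loadu_ps(blk+12),zero));
  return _mm_movemask_ps(m);   /* nonzero iff some lane compared not-equal to zero */
}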
980: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
981: {
982: Mat A=C;
983: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
985: int i,j,n = a->mbs;
986: unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
987: unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
988: unsigned int row;
989: int nz,*bi=b->i;
990: int *diag_offset = b->diag,*ai=a->i;
991: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
992: MatScalar *ba = b->a,*aa = a->a;
993: int nonzero=0;
994: /* int nonzero=0,colscale = 16; */
995: PetscBool pivotinblocks = b->pivotinblocks;
996: PetscReal shift = info->shiftamount;
999: SSE_SCOPE_BEGIN;
1001: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1002: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1003: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
1004: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1005: /* if ((unsigned long)bj==(unsigned long)aj) { */
1006: /* colscale = 4; */
1007: /* } */
1008:
1009: for (i=0; i<n; i++) {
1010: nz = bi[i+1] - bi[i];
1011: bjtmp = bj + bi[i];
1012: /* zero out the 4x4 block accumulators */
1013: /* zero out one register */
1014: XOR_PS(XMM7,XMM7);
1015: for (j=0; j<nz; j++) {
1016: x = rtmp+16*((unsigned int)bjtmp[j]);
1017: /* x = rtmp+4*bjtmp[j]; */
1018: SSE_INLINE_BEGIN_1(x)
1019: /* Copy zero register to memory locations */
1020: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1021: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1022: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1023: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1024: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1025: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1026: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1027: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1028: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1029: SSE_INLINE_END_1;
1030: }
1031: /* load in initial (unfactored row) */
1032: nz = ai[i+1] - ai[i];
1033: ajtmp = aj + ai[i];
1034: v = aa + 16*ai[i];
1035: for (j=0; j<nz; j++) {
1036: x = rtmp+16*((unsigned int)ajtmp[j]);
1037: /* x = rtmp+colscale*ajtmp[j]; */
1038: /* Copy v block into x block */
1039: SSE_INLINE_BEGIN_2(v,x)
1040: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1041: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1042: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1044: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1045: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1047: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1048: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1050: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1051: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1053: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1054: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1056: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1057: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1059: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1060: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1062: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1063: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1064: SSE_INLINE_END_2;
1066: v += 16;
1067: }
1068: /* row = (*bjtmp++)/4; */
1069: row = (unsigned int)(*bjtmp++);
1070: while (row < i) {
1071: pc = rtmp + 16*row;
1072: SSE_INLINE_BEGIN_1(pc)
1073: /* Load block from lower triangle */
1074: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1075: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1076: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1078: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1079: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1081: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1082: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1084: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1085: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1087: /* Compare block to zero block */
1089: SSE_COPY_PS(XMM4,XMM7)
1090: SSE_CMPNEQ_PS(XMM4,XMM0)
1092: SSE_COPY_PS(XMM5,XMM7)
1093: SSE_CMPNEQ_PS(XMM5,XMM1)
1095: SSE_COPY_PS(XMM6,XMM7)
1096: SSE_CMPNEQ_PS(XMM6,XMM2)
1098: SSE_CMPNEQ_PS(XMM7,XMM3)
1100: /* Reduce the comparisons to one SSE register */
1101: SSE_OR_PS(XMM6,XMM7)
1102: SSE_OR_PS(XMM5,XMM4)
1103: SSE_OR_PS(XMM5,XMM6)
1104: SSE_INLINE_END_1;
1106: /* Reduce the one SSE register to an integer register for branching */
1107: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1108: MOVEMASK(nonzero,XMM5);
1110: /* If block is nonzero ... */
1111: if (nonzero) {
1112: pv = ba + 16*diag_offset[row];
1113: PREFETCH_L1(&pv[16]);
1114: pj = bj + diag_offset[row] + 1;
1116: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1117: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1118: /* but the diagonal was inverted already */
1119: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1121: SSE_INLINE_BEGIN_2(pv,pc)
1122: /* Column 0, product is accumulated in XMM4 */
1123: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1124: SSE_SHUFFLE(XMM4,XMM4,0x00)
1125: SSE_MULT_PS(XMM4,XMM0)
1127: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1128: SSE_SHUFFLE(XMM5,XMM5,0x00)
1129: SSE_MULT_PS(XMM5,XMM1)
1130: SSE_ADD_PS(XMM4,XMM5)
1132: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1133: SSE_SHUFFLE(XMM6,XMM6,0x00)
1134: SSE_MULT_PS(XMM6,XMM2)
1135: SSE_ADD_PS(XMM4,XMM6)
1137: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1138: SSE_SHUFFLE(XMM7,XMM7,0x00)
1139: SSE_MULT_PS(XMM7,XMM3)
1140: SSE_ADD_PS(XMM4,XMM7)
1142: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1143: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1145: /* Column 1, product is accumulated in XMM5 */
1146: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1147: SSE_SHUFFLE(XMM5,XMM5,0x00)
1148: SSE_MULT_PS(XMM5,XMM0)
1150: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1151: SSE_SHUFFLE(XMM6,XMM6,0x00)
1152: SSE_MULT_PS(XMM6,XMM1)
1153: SSE_ADD_PS(XMM5,XMM6)
1155: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1156: SSE_SHUFFLE(XMM7,XMM7,0x00)
1157: SSE_MULT_PS(XMM7,XMM2)
1158: SSE_ADD_PS(XMM5,XMM7)
1160: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1161: SSE_SHUFFLE(XMM6,XMM6,0x00)
1162: SSE_MULT_PS(XMM6,XMM3)
1163: SSE_ADD_PS(XMM5,XMM6)
1165: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1166: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1168: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1170: /* Column 2, product is accumulated in XMM6 */
1171: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1172: SSE_SHUFFLE(XMM6,XMM6,0x00)
1173: SSE_MULT_PS(XMM6,XMM0)
1175: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1176: SSE_SHUFFLE(XMM7,XMM7,0x00)
1177: SSE_MULT_PS(XMM7,XMM1)
1178: SSE_ADD_PS(XMM6,XMM7)
1180: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1181: SSE_SHUFFLE(XMM7,XMM7,0x00)
1182: SSE_MULT_PS(XMM7,XMM2)
1183: SSE_ADD_PS(XMM6,XMM7)
1185: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1186: SSE_SHUFFLE(XMM7,XMM7,0x00)
1187: SSE_MULT_PS(XMM7,XMM3)
1188: SSE_ADD_PS(XMM6,XMM7)
1189:
1190: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1191: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1193: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1194: /* Column 3, product is accumulated in XMM0 */
1195: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1196: SSE_SHUFFLE(XMM7,XMM7,0x00)
1197: SSE_MULT_PS(XMM0,XMM7)
1199: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1200: SSE_SHUFFLE(XMM7,XMM7,0x00)
1201: SSE_MULT_PS(XMM1,XMM7)
1202: SSE_ADD_PS(XMM0,XMM1)
1204: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1205: SSE_SHUFFLE(XMM1,XMM1,0x00)
1206: SSE_MULT_PS(XMM1,XMM2)
1207: SSE_ADD_PS(XMM0,XMM1)
1209: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1210: SSE_SHUFFLE(XMM7,XMM7,0x00)
1211: SSE_MULT_PS(XMM3,XMM7)
1212: SSE_ADD_PS(XMM0,XMM3)
1214: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1215: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1217: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1218:           /* This is code to be maintained and read by humans after all. */
1219: /* Copy Multiplier Col 3 into XMM3 */
1220: SSE_COPY_PS(XMM3,XMM0)
1221: /* Copy Multiplier Col 2 into XMM2 */
1222: SSE_COPY_PS(XMM2,XMM6)
1223: /* Copy Multiplier Col 1 into XMM1 */
1224: SSE_COPY_PS(XMM1,XMM5)
1225: /* Copy Multiplier Col 0 into XMM0 */
1226: SSE_COPY_PS(XMM0,XMM4)
1227: SSE_INLINE_END_2;
1229: /* Update the row: */
1230: nz = bi[row+1] - diag_offset[row] - 1;
1231: pv += 16;
1232: for (j=0; j<nz; j++) {
1233: PREFETCH_L1(&pv[16]);
1234: x = rtmp + 16*((unsigned int)pj[j]);
1235: /* x = rtmp + 4*pj[j]; */
1237: /* X:=X-M*PV, One column at a time */
1238: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1239: SSE_INLINE_BEGIN_2(x,pv)
1241:             /* Load First Column of X */
1241: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1242: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1244: /* Matrix-Vector Product: */
1245: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1246: SSE_SHUFFLE(XMM5,XMM5,0x00)
1247: SSE_MULT_PS(XMM5,XMM0)
1248: SSE_SUB_PS(XMM4,XMM5)
1250: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1251: SSE_SHUFFLE(XMM6,XMM6,0x00)
1252: SSE_MULT_PS(XMM6,XMM1)
1253: SSE_SUB_PS(XMM4,XMM6)
1255: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1256: SSE_SHUFFLE(XMM7,XMM7,0x00)
1257: SSE_MULT_PS(XMM7,XMM2)
1258: SSE_SUB_PS(XMM4,XMM7)
1260: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1261: SSE_SHUFFLE(XMM5,XMM5,0x00)
1262: SSE_MULT_PS(XMM5,XMM3)
1263: SSE_SUB_PS(XMM4,XMM5)
1265: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1266: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1268: /* Second Column */
1269: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1270: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1272: /* Matrix-Vector Product: */
1273: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1274: SSE_SHUFFLE(XMM6,XMM6,0x00)
1275: SSE_MULT_PS(XMM6,XMM0)
1276: SSE_SUB_PS(XMM5,XMM6)
1278: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1279: SSE_SHUFFLE(XMM7,XMM7,0x00)
1280: SSE_MULT_PS(XMM7,XMM1)
1281: SSE_SUB_PS(XMM5,XMM7)
1283: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1284: SSE_SHUFFLE(XMM4,XMM4,0x00)
1285: SSE_MULT_PS(XMM4,XMM2)
1286: SSE_SUB_PS(XMM5,XMM4)
1288: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1289: SSE_SHUFFLE(XMM6,XMM6,0x00)
1290: SSE_MULT_PS(XMM6,XMM3)
1291: SSE_SUB_PS(XMM5,XMM6)
1292:
1293: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1294: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1296: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1298: /* Third Column */
1299: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1300: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1302: /* Matrix-Vector Product: */
1303: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1304: SSE_SHUFFLE(XMM7,XMM7,0x00)
1305: SSE_MULT_PS(XMM7,XMM0)
1306: SSE_SUB_PS(XMM6,XMM7)
1308: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1309: SSE_SHUFFLE(XMM4,XMM4,0x00)
1310: SSE_MULT_PS(XMM4,XMM1)
1311: SSE_SUB_PS(XMM6,XMM4)
1313: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1314: SSE_SHUFFLE(XMM5,XMM5,0x00)
1315: SSE_MULT_PS(XMM5,XMM2)
1316: SSE_SUB_PS(XMM6,XMM5)
1318: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1319: SSE_SHUFFLE(XMM7,XMM7,0x00)
1320: SSE_MULT_PS(XMM7,XMM3)
1321: SSE_SUB_PS(XMM6,XMM7)
1322:
1323: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1324: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1325:
1326: /* Fourth Column */
1327: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1328: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1330: /* Matrix-Vector Product: */
1331: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1332: SSE_SHUFFLE(XMM5,XMM5,0x00)
1333: SSE_MULT_PS(XMM5,XMM0)
1334: SSE_SUB_PS(XMM4,XMM5)
1336: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1337: SSE_SHUFFLE(XMM6,XMM6,0x00)
1338: SSE_MULT_PS(XMM6,XMM1)
1339: SSE_SUB_PS(XMM4,XMM6)
1341: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1342: SSE_SHUFFLE(XMM7,XMM7,0x00)
1343: SSE_MULT_PS(XMM7,XMM2)
1344: SSE_SUB_PS(XMM4,XMM7)
1346: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1347: SSE_SHUFFLE(XMM5,XMM5,0x00)
1348: SSE_MULT_PS(XMM5,XMM3)
1349: SSE_SUB_PS(XMM4,XMM5)
1350:
1351: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1352: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1353: SSE_INLINE_END_2;
1354: pv += 16;
1355: }
1356: PetscLogFlops(128.0*nz+112.0);
1357: }
1358: row = (unsigned int)(*bjtmp++);
1359: /* row = (*bjtmp++)/4; */
1360: /* bjtmp++; */
1361: }
1362: /* finished row so stick it into b->a */
1363: pv = ba + 16*bi[i];
1364: pj = bj + bi[i];
1365: nz = bi[i+1] - bi[i];
1367: /* Copy x block back into pv block */
1368: for (j=0; j<nz; j++) {
1369: x = rtmp+16*((unsigned int)pj[j]);
1370: /* x = rtmp+4*pj[j]; */
1372: SSE_INLINE_BEGIN_2(x,pv)
1373: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1374: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1375: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1377: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1378: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1380: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1381: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1383: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1384: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1386: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1387: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1389: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1390: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1392: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1393: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1395: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1396: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1397: SSE_INLINE_END_2;
1398: pv += 16;
1399: }
1400: /* invert diagonal block */
1401: w = ba + 16*diag_offset[i];
1402: if (pivotinblocks) {
1403: PetscKernel_A_gets_inverse_A_4(w,shift);
1404: } else {
1405: PetscKernel_A_gets_inverse_A_4_nopivot(w);
1406: }
1407: /* PetscKernel_A_gets_inverse_A_4_SSE(w); */
1408:     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1409: }
1411: PetscFree(rtmp);
1412: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1413: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1414: C->assembled = PETSC_TRUE;
1415:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1416: /* Flop Count from inverting diagonal blocks */
1417: SSE_SCOPE_END;
1418: return(0);
1419: }
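/*
   The _usj ("unsigned short j") variants above and below read the block
   column indices through an unsigned short pointer and widen each entry at
   the point of use, halving index-fetch bandwidth; this is only valid while
   every block column index fits in 16 bits (mbs < 65536).  A minimal sketch
   of the addressing idiom (illustrative helper, not part of PETSc):
*/
static const MatScalar *Sketch_UsjBlock(const MatScalar *rtmp,const unsigned short *pj,int j)
{
  return rtmp + 16*((unsigned int)pj[j]);   /* widen the 16-bit index before scaling */
}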
1423: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1424: {
1425: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1427: int i,j,n = a->mbs;
1428: unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1429: unsigned int row;
1430: int *ajtmpold,nz,*bi=b->i;
1431: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1432: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1433: MatScalar *ba = b->a,*aa = a->a;
1434: int nonzero=0;
1435: /* int nonzero=0,colscale = 16; */
1436: PetscBool pivotinblocks = b->pivotinblocks;
1437: PetscReal shift = info->shiftamount;
1440: SSE_SCOPE_BEGIN;
1442: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1443: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1444: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
1445: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1446: /* if ((unsigned long)bj==(unsigned long)aj) { */
1447: /* colscale = 4; */
1448: /* } */
1449: if ((unsigned long)bj==(unsigned long)aj) {
1450:     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
1451: }
1452:
1453: for (i=0; i<n; i++) {
1454: nz = bi[i+1] - bi[i];
1455: bjtmp = bj + bi[i];
1456: /* zero out the 4x4 block accumulators */
1457: /* zero out one register */
1458: XOR_PS(XMM7,XMM7);
1459: for (j=0; j<nz; j++) {
1460: x = rtmp+16*((unsigned int)bjtmp[j]);
1461: /* x = rtmp+4*bjtmp[j]; */
1462: SSE_INLINE_BEGIN_1(x)
1463: /* Copy zero register to memory locations */
1464: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1465: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1466: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1467: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1468: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1469: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1470: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1471: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1472: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1473: SSE_INLINE_END_1;
1474: }
1475: /* load in initial (unfactored row) */
1476: nz = ai[i+1] - ai[i];
1477: ajtmpold = aj + ai[i];
1478: v = aa + 16*ai[i];
1479: for (j=0; j<nz; j++) {
1480: x = rtmp+16*ajtmpold[j];
1481: /* x = rtmp+colscale*ajtmpold[j]; */
1482: /* Copy v block into x block */
1483: SSE_INLINE_BEGIN_2(v,x)
1484: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1485: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1486: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1488: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1489: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1491: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1492: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1494: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1495: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1497: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1498: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1500: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1501: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1503: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1504: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1506: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1507: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1508: SSE_INLINE_END_2;
1510: v += 16;
1511: }
1512: /* row = (*bjtmp++)/4; */
1513: row = (unsigned int)(*bjtmp++);
1514: while (row < i) {
1515: pc = rtmp + 16*row;
1516: SSE_INLINE_BEGIN_1(pc)
1517: /* Load block from lower triangle */
1518: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1519: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1520: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1522: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1523: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1525: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1526: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1528: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1529: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1531: /* Compare block to zero block */
1533: SSE_COPY_PS(XMM4,XMM7)
1534: SSE_CMPNEQ_PS(XMM4,XMM0)
1536: SSE_COPY_PS(XMM5,XMM7)
1537: SSE_CMPNEQ_PS(XMM5,XMM1)
1539: SSE_COPY_PS(XMM6,XMM7)
1540: SSE_CMPNEQ_PS(XMM6,XMM2)
1542: SSE_CMPNEQ_PS(XMM7,XMM3)
1544: /* Reduce the comparisons to one SSE register */
1545: SSE_OR_PS(XMM6,XMM7)
1546: SSE_OR_PS(XMM5,XMM4)
1547: SSE_OR_PS(XMM5,XMM6)
1548: SSE_INLINE_END_1;
1550: /* Reduce the one SSE register to an integer register for branching */
1551: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1552: MOVEMASK(nonzero,XMM5);
1554: /* If block is nonzero ... */
1555: if (nonzero) {
1556: pv = ba + 16*diag_offset[row];
1557: PREFETCH_L1(&pv[16]);
1558: pj = bj + diag_offset[row] + 1;
1560: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1561: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1562: /* but the diagonal was inverted already */
1563: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1565: SSE_INLINE_BEGIN_2(pv,pc)
1566: /* Column 0, product is accumulated in XMM4 */
1567: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1568: SSE_SHUFFLE(XMM4,XMM4,0x00)
1569: SSE_MULT_PS(XMM4,XMM0)
1571: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1572: SSE_SHUFFLE(XMM5,XMM5,0x00)
1573: SSE_MULT_PS(XMM5,XMM1)
1574: SSE_ADD_PS(XMM4,XMM5)
1576: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1577: SSE_SHUFFLE(XMM6,XMM6,0x00)
1578: SSE_MULT_PS(XMM6,XMM2)
1579: SSE_ADD_PS(XMM4,XMM6)
1581: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1582: SSE_SHUFFLE(XMM7,XMM7,0x00)
1583: SSE_MULT_PS(XMM7,XMM3)
1584: SSE_ADD_PS(XMM4,XMM7)
1586: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1587: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1589: /* Column 1, product is accumulated in XMM5 */
1590: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1591: SSE_SHUFFLE(XMM5,XMM5,0x00)
1592: SSE_MULT_PS(XMM5,XMM0)
1594: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1595: SSE_SHUFFLE(XMM6,XMM6,0x00)
1596: SSE_MULT_PS(XMM6,XMM1)
1597: SSE_ADD_PS(XMM5,XMM6)
1599: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1600: SSE_SHUFFLE(XMM7,XMM7,0x00)
1601: SSE_MULT_PS(XMM7,XMM2)
1602: SSE_ADD_PS(XMM5,XMM7)
1604: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1605: SSE_SHUFFLE(XMM6,XMM6,0x00)
1606: SSE_MULT_PS(XMM6,XMM3)
1607: SSE_ADD_PS(XMM5,XMM6)
1609: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1610: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1612: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1614: /* Column 2, product is accumulated in XMM6 */
1615: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1616: SSE_SHUFFLE(XMM6,XMM6,0x00)
1617: SSE_MULT_PS(XMM6,XMM0)
1619: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1620: SSE_SHUFFLE(XMM7,XMM7,0x00)
1621: SSE_MULT_PS(XMM7,XMM1)
1622: SSE_ADD_PS(XMM6,XMM7)
1624: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1625: SSE_SHUFFLE(XMM7,XMM7,0x00)
1626: SSE_MULT_PS(XMM7,XMM2)
1627: SSE_ADD_PS(XMM6,XMM7)
1629: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1630: SSE_SHUFFLE(XMM7,XMM7,0x00)
1631: SSE_MULT_PS(XMM7,XMM3)
1632: SSE_ADD_PS(XMM6,XMM7)
1633:
1634: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1635: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1637: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1638: /* Column 3, product is accumulated in XMM0 */
1639: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1640: SSE_SHUFFLE(XMM7,XMM7,0x00)
1641: SSE_MULT_PS(XMM0,XMM7)
1643: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1644: SSE_SHUFFLE(XMM7,XMM7,0x00)
1645: SSE_MULT_PS(XMM1,XMM7)
1646: SSE_ADD_PS(XMM0,XMM1)
1648: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1649: SSE_SHUFFLE(XMM1,XMM1,0x00)
1650: SSE_MULT_PS(XMM1,XMM2)
1651: SSE_ADD_PS(XMM0,XMM1)
1653: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1654: SSE_SHUFFLE(XMM7,XMM7,0x00)
1655: SSE_MULT_PS(XMM3,XMM7)
1656: SSE_ADD_PS(XMM0,XMM3)
1658: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1659: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1661: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1662:           /* This is code to be maintained and read by humans after all. */
1663: /* Copy Multiplier Col 3 into XMM3 */
1664: SSE_COPY_PS(XMM3,XMM0)
1665: /* Copy Multiplier Col 2 into XMM2 */
1666: SSE_COPY_PS(XMM2,XMM6)
1667: /* Copy Multiplier Col 1 into XMM1 */
1668: SSE_COPY_PS(XMM1,XMM5)
1669: /* Copy Multiplier Col 0 into XMM0 */
1670: SSE_COPY_PS(XMM0,XMM4)
1671: SSE_INLINE_END_2;
1673: /* Update the row: */
1674: nz = bi[row+1] - diag_offset[row] - 1;
1675: pv += 16;
1676: for (j=0; j<nz; j++) {
1677: PREFETCH_L1(&pv[16]);
1678: x = rtmp + 16*((unsigned int)pj[j]);
1679: /* x = rtmp + 4*pj[j]; */
1681: /* X:=X-M*PV, One column at a time */
1682: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1683: SSE_INLINE_BEGIN_2(x,pv)
1685:             /* Load First Column of X */
1685: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1686: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1688: /* Matrix-Vector Product: */
1689: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1690: SSE_SHUFFLE(XMM5,XMM5,0x00)
1691: SSE_MULT_PS(XMM5,XMM0)
1692: SSE_SUB_PS(XMM4,XMM5)
1694: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1695: SSE_SHUFFLE(XMM6,XMM6,0x00)
1696: SSE_MULT_PS(XMM6,XMM1)
1697: SSE_SUB_PS(XMM4,XMM6)
1699: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1700: SSE_SHUFFLE(XMM7,XMM7,0x00)
1701: SSE_MULT_PS(XMM7,XMM2)
1702: SSE_SUB_PS(XMM4,XMM7)
1704: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1705: SSE_SHUFFLE(XMM5,XMM5,0x00)
1706: SSE_MULT_PS(XMM5,XMM3)
1707: SSE_SUB_PS(XMM4,XMM5)
1709: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1710: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1712: /* Second Column */
1713: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1714: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1716: /* Matrix-Vector Product: */
1717: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1718: SSE_SHUFFLE(XMM6,XMM6,0x00)
1719: SSE_MULT_PS(XMM6,XMM0)
1720: SSE_SUB_PS(XMM5,XMM6)
1722: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1723: SSE_SHUFFLE(XMM7,XMM7,0x00)
1724: SSE_MULT_PS(XMM7,XMM1)
1725: SSE_SUB_PS(XMM5,XMM7)
1727: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1728: SSE_SHUFFLE(XMM4,XMM4,0x00)
1729: SSE_MULT_PS(XMM4,XMM2)
1730: SSE_SUB_PS(XMM5,XMM4)
1732: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1733: SSE_SHUFFLE(XMM6,XMM6,0x00)
1734: SSE_MULT_PS(XMM6,XMM3)
1735: SSE_SUB_PS(XMM5,XMM6)
1736:
1737: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1738: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1740: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1742: /* Third Column */
1743: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1744: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1746: /* Matrix-Vector Product: */
1747: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1748: SSE_SHUFFLE(XMM7,XMM7,0x00)
1749: SSE_MULT_PS(XMM7,XMM0)
1750: SSE_SUB_PS(XMM6,XMM7)
1752: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1753: SSE_SHUFFLE(XMM4,XMM4,0x00)
1754: SSE_MULT_PS(XMM4,XMM1)
1755: SSE_SUB_PS(XMM6,XMM4)
1757: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1758: SSE_SHUFFLE(XMM5,XMM5,0x00)
1759: SSE_MULT_PS(XMM5,XMM2)
1760: SSE_SUB_PS(XMM6,XMM5)
1762: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1763: SSE_SHUFFLE(XMM7,XMM7,0x00)
1764: SSE_MULT_PS(XMM7,XMM3)
1765: SSE_SUB_PS(XMM6,XMM7)
1766:
1767: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1768: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1769:
1770: /* Fourth Column */
1771: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1772: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1774: /* Matrix-Vector Product: */
1775: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1776: SSE_SHUFFLE(XMM5,XMM5,0x00)
1777: SSE_MULT_PS(XMM5,XMM0)
1778: SSE_SUB_PS(XMM4,XMM5)
1780: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1781: SSE_SHUFFLE(XMM6,XMM6,0x00)
1782: SSE_MULT_PS(XMM6,XMM1)
1783: SSE_SUB_PS(XMM4,XMM6)
1785: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1786: SSE_SHUFFLE(XMM7,XMM7,0x00)
1787: SSE_MULT_PS(XMM7,XMM2)
1788: SSE_SUB_PS(XMM4,XMM7)
1790: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1791: SSE_SHUFFLE(XMM5,XMM5,0x00)
1792: SSE_MULT_PS(XMM5,XMM3)
1793: SSE_SUB_PS(XMM4,XMM5)
1794:
1795: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1796: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1797: SSE_INLINE_END_2;
1798: pv += 16;
1799: }
1800: PetscLogFlops(128.0*nz+112.0);
1801: }
1802: row = (unsigned int)(*bjtmp++);
1803: /* row = (*bjtmp++)/4; */
1804: /* bjtmp++; */
1805: }
1806: /* finished row so stick it into b->a */
1807: pv = ba + 16*bi[i];
1808: pj = bj + bi[i];
1809: nz = bi[i+1] - bi[i];
1811: /* Copy x block back into pv block */
1812: for (j=0; j<nz; j++) {
1813: x = rtmp+16*((unsigned int)pj[j]);
1814: /* x = rtmp+4*pj[j]; */
1816: SSE_INLINE_BEGIN_2(x,pv)
1817: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1818: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1819: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1821: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1822: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1824: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1825: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1827: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1828: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1830: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1831: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1833: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1834: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1836: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1837: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1839: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1840: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1841: SSE_INLINE_END_2;
1842: pv += 16;
1843: }
1844: /* invert diagonal block */
1845: w = ba + 16*diag_offset[i];
1846: if (pivotinblocks) {
1847: PetscKernel_A_gets_inverse_A_4(w,shift);
1848: } else {
1849: PetscKernel_A_gets_inverse_A_4_nopivot(w);
1850: }
1851: /* PetscKernel_A_gets_inverse_A_4_SSE(w); */
1852:     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1853: }
1855: PetscFree(rtmp);
1856: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1857: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1858: C->assembled = PETSC_TRUE;
1859:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1860: /* Flop Count from inverting diagonal blocks */
1861: SSE_SCOPE_END;
1862: return(0);
1863: }
1865: #endif