Actual source code: baijfact11.c
petsc-3.6.1 2015-08-06
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include <../src/mat/impls/baij/seq/baij.h>
6: #include <petsc/private/kernels/blockinvert.h>
8: /* ------------------------------------------------------------*/
9: /*
10: Version for when blocks are 4 by 4
11: */
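/*
   Overview of the factorization loop below (block size 4, so each block holds 16 MatScalars):
   for every block row i of the factor:
     - zero the 4x4 work blocks in rtmp for each block column present in row i of the factor;
     - load block row r[i] of A into rtmp, remapping its column indices through ic[];
     - for each already-factored block row 'row' < i appearing in this row: if its 4x4 block is
       nonzero, multiply it by the (already inverted) diagonal block of 'row' to form the
       multiplier, store the multiplier back into rtmp, and subtract multiplier*U(row,:) from
       the remaining blocks of the row;
     - copy the finished row from rtmp into b->a and invert its 4x4 diagonal block in place,
       with or without pivoting inside the block depending on pivotinblocks.
*/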
14: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
15: {
16: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
17: IS isrow = b->row,isicol = b->icol;
19: const PetscInt *r,*ic;
20: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
21: PetscInt *ajtmpold,*ajtmp,nz,row;
22: PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
23: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
24: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
25: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
26: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
27: MatScalar m13,m14,m15,m16;
28: MatScalar *ba = b->a,*aa = a->a;
29: PetscBool pivotinblocks = b->pivotinblocks;
30: PetscReal shift = info->shiftamount;
33: ISGetIndices(isrow,&r);
34: ISGetIndices(isicol,&ic);
35: PetscMalloc1(16*(n+1),&rtmp);
37: for (i=0; i<n; i++) {
38: nz = bi[i+1] - bi[i];
39: ajtmp = bj + bi[i];
40: for (j=0; j<nz; j++) {
41: x = rtmp+16*ajtmp[j];
42: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
43: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
44: }
45: /* load in initial (unfactored row) */
46: idx = r[i];
47: nz = ai[idx+1] - ai[idx];
48: ajtmpold = aj + ai[idx];
49: v = aa + 16*ai[idx];
50: for (j=0; j<nz; j++) {
51: x = rtmp+16*ic[ajtmpold[j]];
52: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
53: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
54: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
55: x[14] = v[14]; x[15] = v[15];
56: v += 16;
57: }
58: row = *ajtmp++;
59: while (row < i) {
60: pc = rtmp + 16*row;
61: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
62: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
63: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
64: p15 = pc[14]; p16 = pc[15];
65: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
66: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
67: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
68: || p16 != 0.0) {
69: pv = ba + 16*diag_offset[row];
70: pj = bj + diag_offset[row] + 1;
71: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
72: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
73: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
74: x15 = pv[14]; x16 = pv[15];
75: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
76: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
77: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
78: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
80: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
81: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
82: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
83: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
85: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
86: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
87: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
88: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
90: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
91: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
92: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
93: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
95: nz = bi[row+1] - diag_offset[row] - 1;
96: pv += 16;
97: for (j=0; j<nz; j++) {
98: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
99: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
100: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
101: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
102: x = rtmp + 16*pj[j];
103: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
104: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
105: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
106: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
108: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
109: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
110: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
111: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
113: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
114: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
115: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
116: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
118: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
119: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
120: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
121: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
123: pv += 16;
124: }
125: PetscLogFlops(128.0*nz+112.0);
126: }
127: row = *ajtmp++;
128: }
129: /* finished row so stick it into b->a */
130: pv = ba + 16*bi[i];
131: pj = bj + bi[i];
132: nz = bi[i+1] - bi[i];
133: for (j=0; j<nz; j++) {
134: x = rtmp+16*pj[j];
135: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
136: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
137: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
138: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
139: pv += 16;
140: }
141: /* invert diagonal block */
142: w = ba + 16*diag_offset[i];
143: if (pivotinblocks) {
144: PetscKernel_A_gets_inverse_A_4(w,shift);
145: } else {
146: PetscKernel_A_gets_inverse_A_4_nopivot(w);
147: }
148: }
150: PetscFree(rtmp);
151: ISRestoreIndices(isicol,&ic);
152: ISRestoreIndices(isrow,&r);
154: C->ops->solve = MatSolve_SeqBAIJ_4_inplace;
155: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
156: C->assembled = PETSC_TRUE;
158: PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
159: return(0);
160: }
162: /* MatLUFactorNumeric_SeqBAIJ_4 -
163: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
164: PetscKernel_A_gets_A_times_B()
165: PetscKernel_A_gets_A_minus_B_times_C()
166: PetscKernel_A_gets_inverse_A()
167: */
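/*
   For orientation, a plain-C sketch (not part of the numbered source) of what the unrolled 4x4
   kernels from blockinvert.h compute on the column-wise stored 4x4 blocks used here, i.e. entry
   (r,c) of a block at index r+4*c; the real kernels are fully unrolled macros:

     void A_gets_A_times_B_4(MatScalar *A,const MatScalar *B,MatScalar *W)
     {                                          // A := A*B, W is a 16-entry scratch block
       PetscInt r,c,k;
       for (k=0; k<16; k++) W[k] = A[k];        // save old A
       for (c=0; c<4; c++) {
         for (r=0; r<4; r++) {
           A[r+4*c] = 0.0;
           for (k=0; k<4; k++) A[r+4*c] += W[r+4*k]*B[k+4*c];
         }
       }
     }

     void A_gets_A_minus_B_times_C_4(MatScalar *A,const MatScalar *B,const MatScalar *C)
     {                                          // A := A - B*C
       PetscInt r,c,k;
       for (c=0; c<4; c++)
         for (r=0; r<4; r++)
           for (k=0; k<4; k++) A[r+4*c] -= B[r+4*k]*C[k+4*c];
     }

   PetscKernel_A_gets_inverse_A_4(A,shift) inverts a 4x4 block in place; the shift argument is
   used to guard against zero pivots inside the block when MAT_SHIFT_INBLOCKS is selected.

   Storage convention of the non-inplace layout used by this routine: for block row i the L part
   sits at b->j/b->a positions bi[i] .. bi[i+1]-1, the inverted diagonal block at bdiag[i], and
   the U part (excluding the diagonal) at bdiag[i+1]+1 .. bdiag[i]-1, so bdiag[] decreases with i.
*/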
171: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
172: {
173: Mat C = B;
174: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
175: IS isrow = b->row,isicol = b->icol;
177: const PetscInt *r,*ic;
178: PetscInt i,j,k,nz,nzL,row;
179: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
180: const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
181: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
182: PetscInt flg;
183: PetscReal shift;
186: ISGetIndices(isrow,&r);
187: ISGetIndices(isicol,&ic);
189: if (info->shifttype == MAT_SHIFT_NONE) {
190: shift = 0;
191: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
192: shift = info->shiftamount;
193: }
195: /* generate work space needed by the factorization */
196: PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
197: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
199: for (i=0; i<n; i++) {
200: /* zero rtmp */
201: /* L part */
202: nz = bi[i+1] - bi[i];
203: bjtmp = bj + bi[i];
204: for (j=0; j<nz; j++) {
205: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
206: }
208: /* U part */
209: nz = bdiag[i]-bdiag[i+1];
210: bjtmp = bj + bdiag[i+1]+1;
211: for (j=0; j<nz; j++) {
212: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
213: }
215: /* load in initial (unfactored row) */
216: nz = ai[r[i]+1] - ai[r[i]];
217: ajtmp = aj + ai[r[i]];
218: v = aa + bs2*ai[r[i]];
219: for (j=0; j<nz; j++) {
220: PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
221: }
223: /* elimination */
224: bjtmp = bj + bi[i];
225: nzL = bi[i+1] - bi[i];
226: for (k=0; k < nzL; k++) {
227: row = bjtmp[k];
228: pc = rtmp + bs2*row;
229: for (flg=0,j=0; j<bs2; j++) {
230: if (pc[j]!=0.0) {
231: flg = 1;
232: break;
233: }
234: }
235: if (flg) {
236: pv = b->a + bs2*bdiag[row];
237: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
238: PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);
240: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
241: pv = b->a + bs2*(bdiag[row+1]+1);
242: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
243: for (j=0; j<nz; j++) {
244: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
245: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
246: v = rtmp + bs2*pj[j];
247: PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
248: pv += bs2;
249: }
250: PetscLogFlops(128.0*nz+112.0); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
251: }
252: }
254: /* finished row so stick it into b->a */
255: /* L part */
256: pv = b->a + bs2*bi[i];
257: pj = b->j + bi[i];
258: nz = bi[i+1] - bi[i];
259: for (j=0; j<nz; j++) {
260: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
261: }
263: /* Mark diagonal and invert diagonal for simpler triangular solves */
264: pv = b->a + bs2*bdiag[i];
265: pj = b->j + bdiag[i];
266: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
267: PetscKernel_A_gets_inverse_A_4(pv,shift);
269: /* U part */
270: pv = b->a + bs2*(bdiag[i+1]+1);
271: pj = b->j + bdiag[i+1]+1;
272: nz = bdiag[i] - bdiag[i+1] - 1;
273: for (j=0; j<nz; j++) {
274: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
275: }
276: }
278: PetscFree2(rtmp,mwork);
279: ISRestoreIndices(isicol,&ic);
280: ISRestoreIndices(isrow,&r);
282: C->ops->solve = MatSolve_SeqBAIJ_4;
283: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
284: C->assembled = PETSC_TRUE;
286: PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
287: return(0);
288: }
292: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
293: {
294: /*
295: Default version for when blocks are 4 by 4, using natural ordering
296: */
297: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
299: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
300: PetscInt *ajtmpold,*ajtmp,nz,row;
301: PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
302: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
303: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
304: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
305: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
306: MatScalar m13,m14,m15,m16;
307: MatScalar *ba = b->a,*aa = a->a;
308: PetscBool pivotinblocks = b->pivotinblocks;
309: PetscReal shift = info->shiftamount;
312: PetscMalloc1(16*(n+1),&rtmp);
314: for (i=0; i<n; i++) {
315: nz = bi[i+1] - bi[i];
316: ajtmp = bj + bi[i];
317: for (j=0; j<nz; j++) {
318: x = rtmp+16*ajtmp[j];
319: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
320: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
321: }
322: /* load in initial (unfactored row) */
323: nz = ai[i+1] - ai[i];
324: ajtmpold = aj + ai[i];
325: v = aa + 16*ai[i];
326: for (j=0; j<nz; j++) {
327: x = rtmp+16*ajtmpold[j];
328: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
329: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
330: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
331: x[14] = v[14]; x[15] = v[15];
332: v += 16;
333: }
334: row = *ajtmp++;
335: while (row < i) {
336: pc = rtmp + 16*row;
337: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
338: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
339: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
340: p15 = pc[14]; p16 = pc[15];
341: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
342: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
343: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
344: || p16 != 0.0) {
345: pv = ba + 16*diag_offset[row];
346: pj = bj + diag_offset[row] + 1;
347: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
348: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
349: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
350: x15 = pv[14]; x16 = pv[15];
351: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
352: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
353: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
354: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
356: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
357: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
358: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
359: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
361: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
362: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
363: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
364: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
366: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
367: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
368: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
369: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
370: nz = bi[row+1] - diag_offset[row] - 1;
371: pv += 16;
372: for (j=0; j<nz; j++) {
373: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
374: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
375: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
376: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
377: x = rtmp + 16*pj[j];
378: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
379: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
380: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
381: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
383: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
384: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
385: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
386: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
388: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
389: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
390: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
391: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
393: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
394: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
395: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
396: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
398: pv += 16;
399: }
400: PetscLogFlops(128.0*nz+112.0);
401: }
402: row = *ajtmp++;
403: }
404: /* finished row so stick it into b->a */
405: pv = ba + 16*bi[i];
406: pj = bj + bi[i];
407: nz = bi[i+1] - bi[i];
408: for (j=0; j<nz; j++) {
409: x = rtmp+16*pj[j];
410: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
411: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
412: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
413: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
414: pv += 16;
415: }
416: /* invert diagonal block */
417: w = ba + 16*diag_offset[i];
418: if (pivotinblocks) {
419: PetscKernel_A_gets_inverse_A_4(w,shift);
420: } else {
421: PetscKernel_A_gets_inverse_A_4_nopivot(w);
422: }
423: }
425: PetscFree(rtmp);
427: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
428: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
429: C->assembled = PETSC_TRUE;
431: PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
432: return(0);
433: }
435: /*
436: MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
437: copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
438: */
441: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
442: {
443: Mat C =B;
444: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
446: PetscInt i,j,k,nz,nzL,row;
447: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
448: const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
449: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
450: PetscInt flg;
451: PetscReal shift;
454: /* generate work space needed by the factorization */
455: PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
456: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
458: if (info->shifttype == MAT_SHIFT_NONE) {
459: shift = 0;
460: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
461: shift = info->shiftamount;
462: }
464: for (i=0; i<n; i++) {
465: /* zero rtmp */
466: /* L part */
467: nz = bi[i+1] - bi[i];
468: bjtmp = bj + bi[i];
469: for (j=0; j<nz; j++) {
470: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
471: }
473: /* U part */
474: nz = bdiag[i] - bdiag[i+1];
475: bjtmp = bj + bdiag[i+1]+1;
476: for (j=0; j<nz; j++) {
477: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
478: }
480: /* load in initial (unfactored row) */
481: nz = ai[i+1] - ai[i];
482: ajtmp = aj + ai[i];
483: v = aa + bs2*ai[i];
484: for (j=0; j<nz; j++) {
485: PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
486: }
488: /* elimination */
489: bjtmp = bj + bi[i];
490: nzL = bi[i+1] - bi[i];
491: for (k=0; k < nzL; k++) {
492: row = bjtmp[k];
493: pc = rtmp + bs2*row;
494: for (flg=0,j=0; j<bs2; j++) {
495: if (pc[j]!=0.0) {
496: flg = 1;
497: break;
498: }
499: }
500: if (flg) {
501: pv = b->a + bs2*bdiag[row];
502: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
503: PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);
505: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
506: pv = b->a + bs2*(bdiag[row+1]+1);
507: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
508: for (j=0; j<nz; j++) {
509: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
510: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
511: v = rtmp + bs2*pj[j];
512: PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
513: pv += bs2;
514: }
515: PetscLogFlops(128.0*nz+112.0); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
516: }
517: }
519: /* finished row so stick it into b->a */
520: /* L part */
521: pv = b->a + bs2*bi[i];
522: pj = b->j + bi[i];
523: nz = bi[i+1] - bi[i];
524: for (j=0; j<nz; j++) {
525: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
526: }
528: /* Mark diagonal and invert diagonal for simpler triangular solves */
529: pv = b->a + bs2*bdiag[i];
530: pj = b->j + bdiag[i];
531: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
532: PetscKernel_A_gets_inverse_A_4(pv,shift);
534: /* U part */
535: pv = b->a + bs2*(bdiag[i+1]+1);
536: pj = b->j + bdiag[i+1]+1;
537: nz = bdiag[i] - bdiag[i+1] - 1;
538: for (j=0; j<nz; j++) {
539: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
540: }
541: }
542: PetscFree2(rtmp,mwork);
544: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
545: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
546: C->assembled = PETSC_TRUE;
548: PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
549: return(0);
550: }
552: #if defined(PETSC_HAVE_SSE)
554: #include PETSC_HAVE_SSE
556: /* SSE Version for when blocks are 4 by 4 Using natural ordering */
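/*
   The SSE kernels below keep a 4x4 block in four XMM registers, one column of four packed
   single-precision values (the _PS operations) per register.  A block is tested against zero by
   comparing each column with CMPNEQ against a zeroed register, OR-ing the results and extracting
   a bitmask with MOVEMASK; the multiplier and the subsequent row updates are then formed one
   column at a time by broadcasting one scalar of the inverted diagonal (LOAD_SS followed by
   SHUFFLE 0x00) and accumulating with packed multiply/add or multiply/subtract, while the next
   16-entry block is prefetched into L1.
*/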
559: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
560: {
561: Mat C = B; Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
563: int i,j,n = a->mbs;
564: int *bj = b->j,*bjtmp,*pj;
565: int row;
566: int *ajtmpold,nz,*bi=b->i;
567: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
568: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
569: MatScalar *ba = b->a,*aa = a->a;
570: int nonzero=0;
571: /* int nonzero=0,colscale = 16; */
572: PetscBool pivotinblocks = b->pivotinblocks;
573: PetscReal shift = info->shiftamount;
576: SSE_SCOPE_BEGIN;
578: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
579: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
580: PetscMalloc1(16*(n+1),&rtmp);
581: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
582: /* if ((unsigned long)bj==(unsigned long)aj) { */
583: /* colscale = 4; */
584: /* } */
585: for (i=0; i<n; i++) {
586: nz = bi[i+1] - bi[i];
587: bjtmp = bj + bi[i];
588: /* zero out the 4x4 block accumulators */
589: /* zero out one register */
590: XOR_PS(XMM7,XMM7);
591: for (j=0; j<nz; j++) {
592: x = rtmp+16*bjtmp[j];
593: /* x = rtmp+4*bjtmp[j]; */
594: SSE_INLINE_BEGIN_1(x)
595: /* Copy zero register to memory locations */
596: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
597: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
598: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
599: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
600: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
601: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
602: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
603: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
604: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
605: SSE_INLINE_END_1;
606: }
607: /* load in initial (unfactored row) */
608: nz = ai[i+1] - ai[i];
609: ajtmpold = aj + ai[i];
610: v = aa + 16*ai[i];
611: for (j=0; j<nz; j++) {
612: x = rtmp+16*ajtmpold[j];
613: /* x = rtmp+colscale*ajtmpold[j]; */
614: /* Copy v block into x block */
615: SSE_INLINE_BEGIN_2(v,x)
616: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
617: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
618: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
620: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
621: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
623: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
624: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
626: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
627: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
629: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
630: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
632: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
633: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
635: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
636: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
638: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
639: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
640: SSE_INLINE_END_2;
642: v += 16;
643: }
644: /* row = (*bjtmp++)/4; */
645: row = *bjtmp++;
646: while (row < i) {
647: pc = rtmp + 16*row;
648: SSE_INLINE_BEGIN_1(pc)
649: /* Load block from lower triangle */
650: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
651: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
652: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
654: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
655: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
657: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
658: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
660: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
661: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
663: /* Compare block to zero block */
665: SSE_COPY_PS(XMM4,XMM7)
666: SSE_CMPNEQ_PS(XMM4,XMM0)
668: SSE_COPY_PS(XMM5,XMM7)
669: SSE_CMPNEQ_PS(XMM5,XMM1)
671: SSE_COPY_PS(XMM6,XMM7)
672: SSE_CMPNEQ_PS(XMM6,XMM2)
674: SSE_CMPNEQ_PS(XMM7,XMM3)
676: /* Reduce the comparisons to one SSE register */
677: SSE_OR_PS(XMM6,XMM7)
678: SSE_OR_PS(XMM5,XMM4)
679: SSE_OR_PS(XMM5,XMM6)
680: SSE_INLINE_END_1;
682: /* Reduce the one SSE register to an integer register for branching */
683: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
684: MOVEMASK(nonzero,XMM5);
686: /* If block is nonzero ... */
687: if (nonzero) {
688: pv = ba + 16*diag_offset[row];
689: PREFETCH_L1(&pv[16]);
690: pj = bj + diag_offset[row] + 1;
692: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
693: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
694: /* but the diagonal was inverted already */
695: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
697: SSE_INLINE_BEGIN_2(pv,pc)
698: /* Column 0, product is accumulated in XMM4 */
699: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
700: SSE_SHUFFLE(XMM4,XMM4,0x00)
701: SSE_MULT_PS(XMM4,XMM0)
703: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
704: SSE_SHUFFLE(XMM5,XMM5,0x00)
705: SSE_MULT_PS(XMM5,XMM1)
706: SSE_ADD_PS(XMM4,XMM5)
708: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
709: SSE_SHUFFLE(XMM6,XMM6,0x00)
710: SSE_MULT_PS(XMM6,XMM2)
711: SSE_ADD_PS(XMM4,XMM6)
713: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
714: SSE_SHUFFLE(XMM7,XMM7,0x00)
715: SSE_MULT_PS(XMM7,XMM3)
716: SSE_ADD_PS(XMM4,XMM7)
718: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
719: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
721: /* Column 1, product is accumulated in XMM5 */
722: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
723: SSE_SHUFFLE(XMM5,XMM5,0x00)
724: SSE_MULT_PS(XMM5,XMM0)
726: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
727: SSE_SHUFFLE(XMM6,XMM6,0x00)
728: SSE_MULT_PS(XMM6,XMM1)
729: SSE_ADD_PS(XMM5,XMM6)
731: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
732: SSE_SHUFFLE(XMM7,XMM7,0x00)
733: SSE_MULT_PS(XMM7,XMM2)
734: SSE_ADD_PS(XMM5,XMM7)
736: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
737: SSE_SHUFFLE(XMM6,XMM6,0x00)
738: SSE_MULT_PS(XMM6,XMM3)
739: SSE_ADD_PS(XMM5,XMM6)
741: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
742: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
744: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
746: /* Column 2, product is accumulated in XMM6 */
747: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
748: SSE_SHUFFLE(XMM6,XMM6,0x00)
749: SSE_MULT_PS(XMM6,XMM0)
751: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
752: SSE_SHUFFLE(XMM7,XMM7,0x00)
753: SSE_MULT_PS(XMM7,XMM1)
754: SSE_ADD_PS(XMM6,XMM7)
756: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
757: SSE_SHUFFLE(XMM7,XMM7,0x00)
758: SSE_MULT_PS(XMM7,XMM2)
759: SSE_ADD_PS(XMM6,XMM7)
761: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
762: SSE_SHUFFLE(XMM7,XMM7,0x00)
763: SSE_MULT_PS(XMM7,XMM3)
764: SSE_ADD_PS(XMM6,XMM7)
766: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
767: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
769: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
770: /* Column 3, product is accumulated in XMM0 */
771: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
772: SSE_SHUFFLE(XMM7,XMM7,0x00)
773: SSE_MULT_PS(XMM0,XMM7)
775: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
776: SSE_SHUFFLE(XMM7,XMM7,0x00)
777: SSE_MULT_PS(XMM1,XMM7)
778: SSE_ADD_PS(XMM0,XMM1)
780: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
781: SSE_SHUFFLE(XMM1,XMM1,0x00)
782: SSE_MULT_PS(XMM1,XMM2)
783: SSE_ADD_PS(XMM0,XMM1)
785: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
786: SSE_SHUFFLE(XMM7,XMM7,0x00)
787: SSE_MULT_PS(XMM3,XMM7)
788: SSE_ADD_PS(XMM0,XMM3)
790: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
791: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
793: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
794: /* This is code to be maintained and read by humans after all. */
795: /* Copy Multiplier Col 3 into XMM3 */
796: SSE_COPY_PS(XMM3,XMM0)
797: /* Copy Multiplier Col 2 into XMM2 */
798: SSE_COPY_PS(XMM2,XMM6)
799: /* Copy Multiplier Col 1 into XMM1 */
800: SSE_COPY_PS(XMM1,XMM5)
801: /* Copy Multiplier Col 0 into XMM0 */
802: SSE_COPY_PS(XMM0,XMM4)
803: SSE_INLINE_END_2;
805: /* Update the row: */
806: nz = bi[row+1] - diag_offset[row] - 1;
807: pv += 16;
808: for (j=0; j<nz; j++) {
809: PREFETCH_L1(&pv[16]);
810: x = rtmp + 16*pj[j];
811: /* x = rtmp + 4*pj[j]; */
813: /* X:=X-M*PV, One column at a time */
814: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
815: SSE_INLINE_BEGIN_2(x,pv)
816: /* Load First Column of X*/
817: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
818: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
820: /* Matrix-Vector Product: */
821: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
822: SSE_SHUFFLE(XMM5,XMM5,0x00)
823: SSE_MULT_PS(XMM5,XMM0)
824: SSE_SUB_PS(XMM4,XMM5)
826: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
827: SSE_SHUFFLE(XMM6,XMM6,0x00)
828: SSE_MULT_PS(XMM6,XMM1)
829: SSE_SUB_PS(XMM4,XMM6)
831: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
832: SSE_SHUFFLE(XMM7,XMM7,0x00)
833: SSE_MULT_PS(XMM7,XMM2)
834: SSE_SUB_PS(XMM4,XMM7)
836: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
837: SSE_SHUFFLE(XMM5,XMM5,0x00)
838: SSE_MULT_PS(XMM5,XMM3)
839: SSE_SUB_PS(XMM4,XMM5)
841: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
842: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
844: /* Second Column */
845: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
846: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
848: /* Matrix-Vector Product: */
849: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
850: SSE_SHUFFLE(XMM6,XMM6,0x00)
851: SSE_MULT_PS(XMM6,XMM0)
852: SSE_SUB_PS(XMM5,XMM6)
854: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
855: SSE_SHUFFLE(XMM7,XMM7,0x00)
856: SSE_MULT_PS(XMM7,XMM1)
857: SSE_SUB_PS(XMM5,XMM7)
859: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
860: SSE_SHUFFLE(XMM4,XMM4,0x00)
861: SSE_MULT_PS(XMM4,XMM2)
862: SSE_SUB_PS(XMM5,XMM4)
864: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
865: SSE_SHUFFLE(XMM6,XMM6,0x00)
866: SSE_MULT_PS(XMM6,XMM3)
867: SSE_SUB_PS(XMM5,XMM6)
869: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
870: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
872: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
874: /* Third Column */
875: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
876: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
878: /* Matrix-Vector Product: */
879: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
880: SSE_SHUFFLE(XMM7,XMM7,0x00)
881: SSE_MULT_PS(XMM7,XMM0)
882: SSE_SUB_PS(XMM6,XMM7)
884: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
885: SSE_SHUFFLE(XMM4,XMM4,0x00)
886: SSE_MULT_PS(XMM4,XMM1)
887: SSE_SUB_PS(XMM6,XMM4)
889: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
890: SSE_SHUFFLE(XMM5,XMM5,0x00)
891: SSE_MULT_PS(XMM5,XMM2)
892: SSE_SUB_PS(XMM6,XMM5)
894: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
895: SSE_SHUFFLE(XMM7,XMM7,0x00)
896: SSE_MULT_PS(XMM7,XMM3)
897: SSE_SUB_PS(XMM6,XMM7)
899: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
900: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
902: /* Fourth Column */
903: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
904: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
906: /* Matrix-Vector Product: */
907: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
908: SSE_SHUFFLE(XMM5,XMM5,0x00)
909: SSE_MULT_PS(XMM5,XMM0)
910: SSE_SUB_PS(XMM4,XMM5)
912: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
913: SSE_SHUFFLE(XMM6,XMM6,0x00)
914: SSE_MULT_PS(XMM6,XMM1)
915: SSE_SUB_PS(XMM4,XMM6)
917: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
918: SSE_SHUFFLE(XMM7,XMM7,0x00)
919: SSE_MULT_PS(XMM7,XMM2)
920: SSE_SUB_PS(XMM4,XMM7)
922: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
923: SSE_SHUFFLE(XMM5,XMM5,0x00)
924: SSE_MULT_PS(XMM5,XMM3)
925: SSE_SUB_PS(XMM4,XMM5)
927: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
928: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
929: SSE_INLINE_END_2;
930: pv += 16;
931: }
932: PetscLogFlops(128.0*nz+112.0);
933: }
934: row = *bjtmp++;
935: /* row = (*bjtmp++)/4; */
936: }
937: /* finished row so stick it into b->a */
938: pv = ba + 16*bi[i];
939: pj = bj + bi[i];
940: nz = bi[i+1] - bi[i];
942: /* Copy x block back into pv block */
943: for (j=0; j<nz; j++) {
944: x = rtmp+16*pj[j];
945: /* x = rtmp+4*pj[j]; */
947: SSE_INLINE_BEGIN_2(x,pv)
948: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
949: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
950: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
952: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
953: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
955: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
956: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
958: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
959: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
961: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
962: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
964: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
965: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
967: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
968: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
970: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
971: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
972: SSE_INLINE_END_2;
973: pv += 16;
974: }
975: /* invert diagonal block */
976: w = ba + 16*diag_offset[i];
977: if (pivotinblocks) {
978: PetscKernel_A_gets_inverse_A_4(w,shift);
979: } else {
980: PetscKernel_A_gets_inverse_A_4_nopivot(w);
981: }
982: /* PetscKernel_A_gets_inverse_A_4_SSE(w); */
983: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
984: }
986: PetscFree(rtmp);
988: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
989: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
990: C->assembled = PETSC_TRUE;
992: PetscLogFlops(1.333333333333*4*4*4*b->mbs);
993: /* Flop Count from inverting diagonal blocks */
994: SSE_SCOPE_END;
995: return(0);
996: }
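/*
   The _usj ("unsigned short j") variants below are the same factorization, but they read the
   block column indices through unsigned short pointers (see the casts of b->j and a->j), which
   assumes those index arrays hold 16-bit values and so requires the block dimensions to fit in
   an unsigned short; presumably this reduces index memory traffic.
*/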
1000: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
1001: {
1002: Mat A =C;
1003: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1005: int i,j,n = a->mbs;
1006: unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1007: unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
1008: unsigned int row;
1009: int nz,*bi=b->i;
1010: int *diag_offset = b->diag,*ai=a->i;
1011: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1012: MatScalar *ba = b->a,*aa = a->a;
1013: int nonzero=0;
1014: /* int nonzero=0,colscale = 16; */
1015: PetscBool pivotinblocks = b->pivotinblocks;
1016: PetscReal shift = info->shiftamount;
1019: SSE_SCOPE_BEGIN;
1021: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1022: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1023: PetscMalloc1(16*(n+1),&rtmp);
1024: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1025: /* if ((unsigned long)bj==(unsigned long)aj) { */
1026: /* colscale = 4; */
1027: /* } */
1029: for (i=0; i<n; i++) {
1030: nz = bi[i+1] - bi[i];
1031: bjtmp = bj + bi[i];
1032: /* zero out the 4x4 block accumulators */
1033: /* zero out one register */
1034: XOR_PS(XMM7,XMM7);
1035: for (j=0; j<nz; j++) {
1036: x = rtmp+16*((unsigned int)bjtmp[j]);
1037: /* x = rtmp+4*bjtmp[j]; */
1038: SSE_INLINE_BEGIN_1(x)
1039: /* Copy zero register to memory locations */
1040: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1041: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1042: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1043: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1044: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1045: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1046: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1047: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1048: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1049: SSE_INLINE_END_1;
1050: }
1051: /* load in initial (unfactored row) */
1052: nz = ai[i+1] - ai[i];
1053: ajtmp = aj + ai[i];
1054: v = aa + 16*ai[i];
1055: for (j=0; j<nz; j++) {
1056: x = rtmp+16*((unsigned int)ajtmp[j]);
1057: /* x = rtmp+colscale*ajtmp[j]; */
1058: /* Copy v block into x block */
1059: SSE_INLINE_BEGIN_2(v,x)
1060: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1061: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1062: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1064: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1065: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1067: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1068: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1070: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1071: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1073: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1074: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1076: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1077: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1079: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1080: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1082: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1083: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1084: SSE_INLINE_END_2;
1086: v += 16;
1087: }
1088: /* row = (*bjtmp++)/4; */
1089: row = (unsigned int)(*bjtmp++);
1090: while (row < i) {
1091: pc = rtmp + 16*row;
1092: SSE_INLINE_BEGIN_1(pc)
1093: /* Load block from lower triangle */
1094: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1095: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1096: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1098: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1099: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1101: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1102: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1104: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1105: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1107: /* Compare block to zero block */
1109: SSE_COPY_PS(XMM4,XMM7)
1110: SSE_CMPNEQ_PS(XMM4,XMM0)
1112: SSE_COPY_PS(XMM5,XMM7)
1113: SSE_CMPNEQ_PS(XMM5,XMM1)
1115: SSE_COPY_PS(XMM6,XMM7)
1116: SSE_CMPNEQ_PS(XMM6,XMM2)
1118: SSE_CMPNEQ_PS(XMM7,XMM3)
1120: /* Reduce the comparisons to one SSE register */
1121: SSE_OR_PS(XMM6,XMM7)
1122: SSE_OR_PS(XMM5,XMM4)
1123: SSE_OR_PS(XMM5,XMM6)
1124: SSE_INLINE_END_1;
1126: /* Reduce the one SSE register to an integer register for branching */
1127: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1128: MOVEMASK(nonzero,XMM5);
1130: /* If block is nonzero ... */
1131: if (nonzero) {
1132: pv = ba + 16*diag_offset[row];
1133: PREFETCH_L1(&pv[16]);
1134: pj = bj + diag_offset[row] + 1;
1136: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1137: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1138: /* but the diagonal was inverted already */
1139: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1141: SSE_INLINE_BEGIN_2(pv,pc)
1142: /* Column 0, product is accumulated in XMM4 */
1143: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1144: SSE_SHUFFLE(XMM4,XMM4,0x00)
1145: SSE_MULT_PS(XMM4,XMM0)
1147: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1148: SSE_SHUFFLE(XMM5,XMM5,0x00)
1149: SSE_MULT_PS(XMM5,XMM1)
1150: SSE_ADD_PS(XMM4,XMM5)
1152: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1153: SSE_SHUFFLE(XMM6,XMM6,0x00)
1154: SSE_MULT_PS(XMM6,XMM2)
1155: SSE_ADD_PS(XMM4,XMM6)
1157: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1158: SSE_SHUFFLE(XMM7,XMM7,0x00)
1159: SSE_MULT_PS(XMM7,XMM3)
1160: SSE_ADD_PS(XMM4,XMM7)
1162: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1163: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1165: /* Column 1, product is accumulated in XMM5 */
1166: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1167: SSE_SHUFFLE(XMM5,XMM5,0x00)
1168: SSE_MULT_PS(XMM5,XMM0)
1170: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1171: SSE_SHUFFLE(XMM6,XMM6,0x00)
1172: SSE_MULT_PS(XMM6,XMM1)
1173: SSE_ADD_PS(XMM5,XMM6)
1175: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1176: SSE_SHUFFLE(XMM7,XMM7,0x00)
1177: SSE_MULT_PS(XMM7,XMM2)
1178: SSE_ADD_PS(XMM5,XMM7)
1180: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1181: SSE_SHUFFLE(XMM6,XMM6,0x00)
1182: SSE_MULT_PS(XMM6,XMM3)
1183: SSE_ADD_PS(XMM5,XMM6)
1185: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1186: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1188: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1190: /* Column 2, product is accumulated in XMM6 */
1191: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1192: SSE_SHUFFLE(XMM6,XMM6,0x00)
1193: SSE_MULT_PS(XMM6,XMM0)
1195: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1196: SSE_SHUFFLE(XMM7,XMM7,0x00)
1197: SSE_MULT_PS(XMM7,XMM1)
1198: SSE_ADD_PS(XMM6,XMM7)
1200: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1201: SSE_SHUFFLE(XMM7,XMM7,0x00)
1202: SSE_MULT_PS(XMM7,XMM2)
1203: SSE_ADD_PS(XMM6,XMM7)
1205: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1206: SSE_SHUFFLE(XMM7,XMM7,0x00)
1207: SSE_MULT_PS(XMM7,XMM3)
1208: SSE_ADD_PS(XMM6,XMM7)
1210: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1211: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1213: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1214: /* Column 3, product is accumulated in XMM0 */
1215: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1216: SSE_SHUFFLE(XMM7,XMM7,0x00)
1217: SSE_MULT_PS(XMM0,XMM7)
1219: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1220: SSE_SHUFFLE(XMM7,XMM7,0x00)
1221: SSE_MULT_PS(XMM1,XMM7)
1222: SSE_ADD_PS(XMM0,XMM1)
1224: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1225: SSE_SHUFFLE(XMM1,XMM1,0x00)
1226: SSE_MULT_PS(XMM1,XMM2)
1227: SSE_ADD_PS(XMM0,XMM1)
1229: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1230: SSE_SHUFFLE(XMM7,XMM7,0x00)
1231: SSE_MULT_PS(XMM3,XMM7)
1232: SSE_ADD_PS(XMM0,XMM3)
1234: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1235: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1237: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1238: /* This is code to be maintained and read by humans after all. */
1239: /* Copy Multiplier Col 3 into XMM3 */
1240: SSE_COPY_PS(XMM3,XMM0)
1241: /* Copy Multiplier Col 2 into XMM2 */
1242: SSE_COPY_PS(XMM2,XMM6)
1243: /* Copy Multiplier Col 1 into XMM1 */
1244: SSE_COPY_PS(XMM1,XMM5)
1245: /* Copy Multiplier Col 0 into XMM0 */
1246: SSE_COPY_PS(XMM0,XMM4)
1247: SSE_INLINE_END_2;
1249: /* Update the row: */
1250: nz = bi[row+1] - diag_offset[row] - 1;
1251: pv += 16;
1252: for (j=0; j<nz; j++) {
1253: PREFETCH_L1(&pv[16]);
1254: x = rtmp + 16*((unsigned int)pj[j]);
1255: /* x = rtmp + 4*pj[j]; */
1257: /* X:=X-M*PV, One column at a time */
1258: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1259: SSE_INLINE_BEGIN_2(x,pv)
1260: /* Load First Column of X*/
1261: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1262: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1264: /* Matrix-Vector Product: */
1265: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1266: SSE_SHUFFLE(XMM5,XMM5,0x00)
1267: SSE_MULT_PS(XMM5,XMM0)
1268: SSE_SUB_PS(XMM4,XMM5)
1270: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1271: SSE_SHUFFLE(XMM6,XMM6,0x00)
1272: SSE_MULT_PS(XMM6,XMM1)
1273: SSE_SUB_PS(XMM4,XMM6)
1275: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1276: SSE_SHUFFLE(XMM7,XMM7,0x00)
1277: SSE_MULT_PS(XMM7,XMM2)
1278: SSE_SUB_PS(XMM4,XMM7)
1280: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1281: SSE_SHUFFLE(XMM5,XMM5,0x00)
1282: SSE_MULT_PS(XMM5,XMM3)
1283: SSE_SUB_PS(XMM4,XMM5)
1285: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1286: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1288: /* Second Column */
1289: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1290: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1292: /* Matrix-Vector Product: */
1293: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1294: SSE_SHUFFLE(XMM6,XMM6,0x00)
1295: SSE_MULT_PS(XMM6,XMM0)
1296: SSE_SUB_PS(XMM5,XMM6)
1298: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1299: SSE_SHUFFLE(XMM7,XMM7,0x00)
1300: SSE_MULT_PS(XMM7,XMM1)
1301: SSE_SUB_PS(XMM5,XMM7)
1303: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1304: SSE_SHUFFLE(XMM4,XMM4,0x00)
1305: SSE_MULT_PS(XMM4,XMM2)
1306: SSE_SUB_PS(XMM5,XMM4)
1308: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1309: SSE_SHUFFLE(XMM6,XMM6,0x00)
1310: SSE_MULT_PS(XMM6,XMM3)
1311: SSE_SUB_PS(XMM5,XMM6)
1313: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1314: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1316: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1318: /* Third Column */
1319: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1320: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1322: /* Matrix-Vector Product: */
1323: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1324: SSE_SHUFFLE(XMM7,XMM7,0x00)
1325: SSE_MULT_PS(XMM7,XMM0)
1326: SSE_SUB_PS(XMM6,XMM7)
1328: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1329: SSE_SHUFFLE(XMM4,XMM4,0x00)
1330: SSE_MULT_PS(XMM4,XMM1)
1331: SSE_SUB_PS(XMM6,XMM4)
1333: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1334: SSE_SHUFFLE(XMM5,XMM5,0x00)
1335: SSE_MULT_PS(XMM5,XMM2)
1336: SSE_SUB_PS(XMM6,XMM5)
1338: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1339: SSE_SHUFFLE(XMM7,XMM7,0x00)
1340: SSE_MULT_PS(XMM7,XMM3)
1341: SSE_SUB_PS(XMM6,XMM7)
1343: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1344: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1346: /* Fourth Column */
1347: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1348: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1350: /* Matrix-Vector Product: */
1351: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1352: SSE_SHUFFLE(XMM5,XMM5,0x00)
1353: SSE_MULT_PS(XMM5,XMM0)
1354: SSE_SUB_PS(XMM4,XMM5)
1356: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1357: SSE_SHUFFLE(XMM6,XMM6,0x00)
1358: SSE_MULT_PS(XMM6,XMM1)
1359: SSE_SUB_PS(XMM4,XMM6)
1361: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1362: SSE_SHUFFLE(XMM7,XMM7,0x00)
1363: SSE_MULT_PS(XMM7,XMM2)
1364: SSE_SUB_PS(XMM4,XMM7)
1366: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1367: SSE_SHUFFLE(XMM5,XMM5,0x00)
1368: SSE_MULT_PS(XMM5,XMM3)
1369: SSE_SUB_PS(XMM4,XMM5)
1371: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1372: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1373: SSE_INLINE_END_2;
1374: pv += 16;
1375: }
1376: PetscLogFlops(128.0*nz+112.0);
1377: }
1378: row = (unsigned int)(*bjtmp++);
1379: /* row = (*bjtmp++)/4; */
1380: /* bjtmp++; */
1381: }
1382: /* finished row so stick it into b->a */
1383: pv = ba + 16*bi[i];
1384: pj = bj + bi[i];
1385: nz = bi[i+1] - bi[i];
1387: /* Copy x block back into pv block */
1388: for (j=0; j<nz; j++) {
1389: x = rtmp+16*((unsigned int)pj[j]);
1390: /* x = rtmp+4*pj[j]; */
1392: SSE_INLINE_BEGIN_2(x,pv)
1393: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1394: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1395: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1397: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1398: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1400: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1401: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1403: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1404: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1406: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1407: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1409: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1410: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1412: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1413: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1415: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1416: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1417: SSE_INLINE_END_2;
1418: pv += 16;
1419: }
1420: /* invert diagonal block */
1421: w = ba + 16*diag_offset[i];
1422: if (pivotinblocks) {
1423: PetscKernel_A_gets_inverse_A_4(w,shift);
1424: } else {
1425: PetscKernel_A_gets_inverse_A_4_nopivot(w);
1426: }
1427: /* PetscKernel_A_gets_inverse_A_4_SSE(w); */
1428: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1429: }
1431: PetscFree(rtmp);
1433: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1434: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1435: C->assembled = PETSC_TRUE;
1437: PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1438: /* Flop Count from inverting diagonal blocks */
1439: SSE_SCOPE_END;
1440: return(0);
1441: }
1445: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1446: {
1447: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1449: int i,j,n = a->mbs;
1450: unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1451: unsigned int row;
1452: int *ajtmpold,nz,*bi=b->i;
1453: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1454: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1455: MatScalar *ba = b->a,*aa = a->a;
1456: int nonzero=0;
1457: /* int nonzero=0,colscale = 16; */
1458: PetscBool pivotinblocks = b->pivotinblocks;
1459: PetscReal shift = info->shiftamount;
1462: SSE_SCOPE_BEGIN;
1464: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1465: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1466: PetscMalloc1(16*(n+1),&rtmp);
1467: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1468: /* if ((unsigned long)bj==(unsigned long)aj) { */
1469: /* colscale = 4; */
1470: /* } */
1471: if ((unsigned long)bj==(unsigned long)aj) {
1472: return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
1473: }
1475: for (i=0; i<n; i++) {
1476: nz = bi[i+1] - bi[i];
1477: bjtmp = bj + bi[i];
1478: /* zero out the 4x4 block accumulators */
1479: /* zero out one register */
1480: XOR_PS(XMM7,XMM7);
1481: for (j=0; j<nz; j++) {
1482: x = rtmp+16*((unsigned int)bjtmp[j]);
1483: /* x = rtmp+4*bjtmp[j]; */
1484: SSE_INLINE_BEGIN_1(x)
1485: /* Copy zero register to memory locations */
1486: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1487: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1488: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1489: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1490: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1491: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1492: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1493: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1494: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1495: SSE_INLINE_END_1;
1496: }
1497: /* load in initial (unfactored row) */
1498: nz = ai[i+1] - ai[i];
1499: ajtmpold = aj + ai[i];
1500: v = aa + 16*ai[i];
1501: for (j=0; j<nz; j++) {
1502: x = rtmp+16*ajtmpold[j];
1503: /* x = rtmp+colscale*ajtmpold[j]; */
1504: /* Copy v block into x block */
1505: SSE_INLINE_BEGIN_2(v,x)
1506: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1507: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1508: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1510: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1511: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1513: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1514: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1516: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1517: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1519: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1520: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1522: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1523: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1525: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1526: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1528: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1529: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1530: SSE_INLINE_END_2;
1532: v += 16;
1533: }
1534: /* row = (*bjtmp++)/4; */
1535: row = (unsigned int)(*bjtmp++);
1536: while (row < i) {
1537: pc = rtmp + 16*row;
1538: SSE_INLINE_BEGIN_1(pc)
1539: /* Load block from lower triangle */
1540: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1541: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1542: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1544: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1545: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1547: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1548: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1550: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1551: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1553: /* Compare block to zero block */
1555: SSE_COPY_PS(XMM4,XMM7)
1556: SSE_CMPNEQ_PS(XMM4,XMM0)
1558: SSE_COPY_PS(XMM5,XMM7)
1559: SSE_CMPNEQ_PS(XMM5,XMM1)
1561: SSE_COPY_PS(XMM6,XMM7)
1562: SSE_CMPNEQ_PS(XMM6,XMM2)
1564: SSE_CMPNEQ_PS(XMM7,XMM3)
1566: /* Reduce the comparisons to one SSE register */
1567: SSE_OR_PS(XMM6,XMM7)
1568: SSE_OR_PS(XMM5,XMM4)
1569: SSE_OR_PS(XMM5,XMM6)
1570: SSE_INLINE_END_1;
1572: /* Reduce the one SSE register to an integer register for branching */
1573: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1574: MOVEMASK(nonzero,XMM5);
1576: /* If block is nonzero ... */
1577: if (nonzero) {
1578: pv = ba + 16*diag_offset[row];
1579: PREFETCH_L1(&pv[16]);
1580: pj = bj + diag_offset[row] + 1;
1582: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1583: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1584: /* but the diagonal was inverted already */
1585: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1587: SSE_INLINE_BEGIN_2(pv,pc)
1588: /* Column 0, product is accumulated in XMM4 */
1589: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1590: SSE_SHUFFLE(XMM4,XMM4,0x00)
1591: SSE_MULT_PS(XMM4,XMM0)
1593: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1594: SSE_SHUFFLE(XMM5,XMM5,0x00)
1595: SSE_MULT_PS(XMM5,XMM1)
1596: SSE_ADD_PS(XMM4,XMM5)
1598: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1599: SSE_SHUFFLE(XMM6,XMM6,0x00)
1600: SSE_MULT_PS(XMM6,XMM2)
1601: SSE_ADD_PS(XMM4,XMM6)
1603: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1604: SSE_SHUFFLE(XMM7,XMM7,0x00)
1605: SSE_MULT_PS(XMM7,XMM3)
1606: SSE_ADD_PS(XMM4,XMM7)
1608: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1609: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1611: /* Column 1, product is accumulated in XMM5 */
1612: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1613: SSE_SHUFFLE(XMM5,XMM5,0x00)
1614: SSE_MULT_PS(XMM5,XMM0)
1616: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1617: SSE_SHUFFLE(XMM6,XMM6,0x00)
1618: SSE_MULT_PS(XMM6,XMM1)
1619: SSE_ADD_PS(XMM5,XMM6)
1621: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1622: SSE_SHUFFLE(XMM7,XMM7,0x00)
1623: SSE_MULT_PS(XMM7,XMM2)
1624: SSE_ADD_PS(XMM5,XMM7)
1626: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1627: SSE_SHUFFLE(XMM6,XMM6,0x00)
1628: SSE_MULT_PS(XMM6,XMM3)
1629: SSE_ADD_PS(XMM5,XMM6)
1631: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1632: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1634: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1636: /* Column 2, product is accumulated in XMM6 */
1637: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1638: SSE_SHUFFLE(XMM6,XMM6,0x00)
1639: SSE_MULT_PS(XMM6,XMM0)
1641: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1642: SSE_SHUFFLE(XMM7,XMM7,0x00)
1643: SSE_MULT_PS(XMM7,XMM1)
1644: SSE_ADD_PS(XMM6,XMM7)
1646: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1647: SSE_SHUFFLE(XMM7,XMM7,0x00)
1648: SSE_MULT_PS(XMM7,XMM2)
1649: SSE_ADD_PS(XMM6,XMM7)
1651: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1652: SSE_SHUFFLE(XMM7,XMM7,0x00)
1653: SSE_MULT_PS(XMM7,XMM3)
1654: SSE_ADD_PS(XMM6,XMM7)
1656: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1657: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1659: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1660: /* Column 3, product is accumulated in XMM0 */
1661: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1662: SSE_SHUFFLE(XMM7,XMM7,0x00)
1663: SSE_MULT_PS(XMM0,XMM7)
1665: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1666: SSE_SHUFFLE(XMM7,XMM7,0x00)
1667: SSE_MULT_PS(XMM1,XMM7)
1668: SSE_ADD_PS(XMM0,XMM1)
1670: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1671: SSE_SHUFFLE(XMM1,XMM1,0x00)
1672: SSE_MULT_PS(XMM1,XMM2)
1673: SSE_ADD_PS(XMM0,XMM1)
1675: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1676: SSE_SHUFFLE(XMM7,XMM7,0x00)
1677: SSE_MULT_PS(XMM3,XMM7)
1678: SSE_ADD_PS(XMM0,XMM3)
1680: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1681: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1683: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1684: /* This is code to be maintained and read by humans after all. */
1685: /* Copy Multiplier Col 3 into XMM3 */
1686: SSE_COPY_PS(XMM3,XMM0)
1687: /* Copy Multiplier Col 2 into XMM2 */
1688: SSE_COPY_PS(XMM2,XMM6)
1689: /* Copy Multiplier Col 1 into XMM1 */
1690: SSE_COPY_PS(XMM1,XMM5)
1691: /* Copy Multiplier Col 0 into XMM0 */
1692: SSE_COPY_PS(XMM0,XMM4)
1693: SSE_INLINE_END_2;
1695: /* Update the row: */
1696: nz = bi[row+1] - diag_offset[row] - 1;
1697: pv += 16;
1698: for (j=0; j<nz; j++) {
1699: PREFETCH_L1(&pv[16]);
1700: x = rtmp + 16*((unsigned int)pj[j]);
1701: /* x = rtmp + 4*pj[j]; */
1703: /* X:=X-M*PV, One column at a time */
1704: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1705: SSE_INLINE_BEGIN_2(x,pv)
1706: /* Load First Column of X*/
1707: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1708: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1710: /* Matrix-Vector Product: */
1711: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1712: SSE_SHUFFLE(XMM5,XMM5,0x00)
1713: SSE_MULT_PS(XMM5,XMM0)
1714: SSE_SUB_PS(XMM4,XMM5)
1716: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1717: SSE_SHUFFLE(XMM6,XMM6,0x00)
1718: SSE_MULT_PS(XMM6,XMM1)
1719: SSE_SUB_PS(XMM4,XMM6)
1721: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1722: SSE_SHUFFLE(XMM7,XMM7,0x00)
1723: SSE_MULT_PS(XMM7,XMM2)
1724: SSE_SUB_PS(XMM4,XMM7)
1726: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1727: SSE_SHUFFLE(XMM5,XMM5,0x00)
1728: SSE_MULT_PS(XMM5,XMM3)
1729: SSE_SUB_PS(XMM4,XMM5)
1731: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1732: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1734: /* Second Column */
1735: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1736: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1738: /* Matrix-Vector Product: */
1739: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1740: SSE_SHUFFLE(XMM6,XMM6,0x00)
1741: SSE_MULT_PS(XMM6,XMM0)
1742: SSE_SUB_PS(XMM5,XMM6)
1744: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1745: SSE_SHUFFLE(XMM7,XMM7,0x00)
1746: SSE_MULT_PS(XMM7,XMM1)
1747: SSE_SUB_PS(XMM5,XMM7)
1749: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1750: SSE_SHUFFLE(XMM4,XMM4,0x00)
1751: SSE_MULT_PS(XMM4,XMM2)
1752: SSE_SUB_PS(XMM5,XMM4)
1754: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1755: SSE_SHUFFLE(XMM6,XMM6,0x00)
1756: SSE_MULT_PS(XMM6,XMM3)
1757: SSE_SUB_PS(XMM5,XMM6)
1759: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1760: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1762: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1764: /* Third Column */
1765: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1766: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1768: /* Matrix-Vector Product: */
1769: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1770: SSE_SHUFFLE(XMM7,XMM7,0x00)
1771: SSE_MULT_PS(XMM7,XMM0)
1772: SSE_SUB_PS(XMM6,XMM7)
1774: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1775: SSE_SHUFFLE(XMM4,XMM4,0x00)
1776: SSE_MULT_PS(XMM4,XMM1)
1777: SSE_SUB_PS(XMM6,XMM4)
1779: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1780: SSE_SHUFFLE(XMM5,XMM5,0x00)
1781: SSE_MULT_PS(XMM5,XMM2)
1782: SSE_SUB_PS(XMM6,XMM5)
1784: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1785: SSE_SHUFFLE(XMM7,XMM7,0x00)
1786: SSE_MULT_PS(XMM7,XMM3)
1787: SSE_SUB_PS(XMM6,XMM7)
1789: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1790: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1792: /* Fourth Column */
1793: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1794: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1796: /* Matrix-Vector Product: */
1797: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1798: SSE_SHUFFLE(XMM5,XMM5,0x00)
1799: SSE_MULT_PS(XMM5,XMM0)
1800: SSE_SUB_PS(XMM4,XMM5)
1802: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1803: SSE_SHUFFLE(XMM6,XMM6,0x00)
1804: SSE_MULT_PS(XMM6,XMM1)
1805: SSE_SUB_PS(XMM4,XMM6)
1807: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1808: SSE_SHUFFLE(XMM7,XMM7,0x00)
1809: SSE_MULT_PS(XMM7,XMM2)
1810: SSE_SUB_PS(XMM4,XMM7)
1812: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1813: SSE_SHUFFLE(XMM5,XMM5,0x00)
1814: SSE_MULT_PS(XMM5,XMM3)
1815: SSE_SUB_PS(XMM4,XMM5)
1817: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1818: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1819: SSE_INLINE_END_2;
1820: pv += 16;
1821: }
1822: PetscLogFlops(128.0*nz+112.0);
1823: }
1824: row = (unsigned int)(*bjtmp++);
1825: /* row = (*bjtmp++)/4; */
1826: /* bjtmp++; */
1827: }
1828: /* finished row so stick it into b->a */
1829: pv = ba + 16*bi[i];
1830: pj = bj + bi[i];
1831: nz = bi[i+1] - bi[i];
1833: /* Copy x block back into pv block */
1834: for (j=0; j<nz; j++) {
1835: x = rtmp+16*((unsigned int)pj[j]);
1836: /* x = rtmp+4*pj[j]; */
1838: SSE_INLINE_BEGIN_2(x,pv)
1839: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1840: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1841: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1843: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1844: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1846: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1847: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1849: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1850: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1852: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1853: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1855: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1856: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1858: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1859: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1861: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1862: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1863: SSE_INLINE_END_2;
1864: pv += 16;
1865: }
1866: /* invert diagonal block */
1867: w = ba + 16*diag_offset[i];
1868: if (pivotinblocks) {
1869: PetscKernel_A_gets_inverse_A_4(w,shift);
1870: } else {
1871: PetscKernel_A_gets_inverse_A_4_nopivot(w);
1872: }
1873: /* PetscKernel_A_gets_inverse_A_4_SSE(w); */
1874: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1875: }
1877: PetscFree(rtmp);
1879: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1880: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1881: C->assembled = PETSC_TRUE;
1883: PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1884: /* Flop Count from inverting diagonal blocks */
1885: SSE_SCOPE_END;
1886: return(0);
1887: }
1889: #endif