Actual source code: baijfact11.c
petsc-3.7.3 2016-08-01
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include <../src/mat/impls/baij/seq/baij.h>
6: #include <petsc/private/kernels/blockinvert.h>
8: /* ------------------------------------------------------------*/
9: /*
10: Version for when blocks are 4 by 4
11: */
14: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
15: {
16: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
17: IS isrow = b->row,isicol = b->icol;
19: const PetscInt *r,*ic;
20: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
21: PetscInt *ajtmpold,*ajtmp,nz,row;
22: PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
23: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
24: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
25: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
26: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
27: MatScalar m13,m14,m15,m16;
28: MatScalar *ba = b->a,*aa = a->a;
29: PetscBool pivotinblocks = b->pivotinblocks;
30: PetscReal shift = info->shiftamount;
31: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
34: ISGetIndices(isrow,&r);
35: ISGetIndices(isicol,&ic);
36: PetscMalloc1(16*(n+1),&rtmp);
37: allowzeropivot = PetscNot(A->erroriffailure);
39: for (i=0; i<n; i++) {
40: nz = bi[i+1] - bi[i];
41: ajtmp = bj + bi[i];
42: for (j=0; j<nz; j++) {
43: x = rtmp+16*ajtmp[j];
44: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
45: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
46: }
47: /* load in initial (unfactored row) */
48: idx = r[i];
49: nz = ai[idx+1] - ai[idx];
50: ajtmpold = aj + ai[idx];
51: v = aa + 16*ai[idx];
52: for (j=0; j<nz; j++) {
53: x = rtmp+16*ic[ajtmpold[j]];
54: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
55: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
56: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
57: x[14] = v[14]; x[15] = v[15];
58: v += 16;
59: }
60: row = *ajtmp++;
61: while (row < i) {
62: pc = rtmp + 16*row;
63: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
64: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
65: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
66: p15 = pc[14]; p16 = pc[15];
67: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
68: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
69: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
70: || p16 != 0.0) {
71: pv = ba + 16*diag_offset[row];
72: pj = bj + diag_offset[row] + 1;
73: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
74: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
75: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
76: x15 = pv[14]; x16 = pv[15];
77: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
78: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
79: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
80: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
82: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
83: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
84: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
85: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
87: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
88: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
89: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
90: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
92: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
93: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
94: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
95: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
97: nz = bi[row+1] - diag_offset[row] - 1;
98: pv += 16;
99: for (j=0; j<nz; j++) {
100: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
101: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
102: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
103: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
104: x = rtmp + 16*pj[j];
105: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
106: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
107: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
108: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
110: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
111: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
112: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
113: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
115: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
116: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
117: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
118: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
120: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
121: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
122: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
123: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
125: pv += 16;
126: }
127: PetscLogFlops(128.0*nz+112.0);
128: }
129: row = *ajtmp++;
130: }
131: /* finished row so stick it into b->a */
132: pv = ba + 16*bi[i];
133: pj = bj + bi[i];
134: nz = bi[i+1] - bi[i];
135: for (j=0; j<nz; j++) {
136: x = rtmp+16*pj[j];
137: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
138: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
139: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
140: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
141: pv += 16;
142: }
143: /* invert diagonal block */
144: w = ba + 16*diag_offset[i];
145: if (pivotinblocks) {
146: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
147: if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
148: } else {
149: PetscKernel_A_gets_inverse_A_4_nopivot(w);
150: }
151: }
153: PetscFree(rtmp);
154: ISRestoreIndices(isicol,&ic);
155: ISRestoreIndices(isrow,&r);
157: C->ops->solve = MatSolve_SeqBAIJ_4_inplace;
158: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
159: C->assembled = PETSC_TRUE;
161: PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
162: return(0);
163: }
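/*
   Editor's illustrative sketch -- not part of the original file.  It shows one way the
   bs=4 numeric factorization above is typically reached through the public API, assuming
   A is an assembled MATSEQBAIJ matrix with block size 4 (for example created with
   MatCreateSeqBAIJ).  Error checking is elided to match the style of this listing, and
   the fill value is only a placeholder.
*/
PETSC_STATIC_INLINE PetscErrorCode SketchFactorSeqBAIJ4(Mat A,Mat *F)
{
  IS            row,col;
  MatFactorInfo info;

  MatFactorInfoInitialize(&info);
  info.fill = 2.0;                                   /* placeholder fill estimate */
  MatGetOrdering(A,MATORDERINGNATURAL,&row,&col);
  MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,F);
  MatLUFactorSymbolic(*F,A,row,col,&info);
  MatLUFactorNumeric(*F,A,&info);                    /* dispatches to a bs=4 routine in this file */
  ISDestroy(&row);
  ISDestroy(&col);
  return(0);
}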
165: /* MatLUFactorNumeric_SeqBAIJ_4 -
166:     copied from MatLUFactorNumeric_SeqBAIJ_N_inplace(), with the block kernels
167:       PetscKernel_A_gets_A_times_B()
168:       PetscKernel_A_gets_A_minus_B_times_C()
169:       PetscKernel_A_gets_inverse_A()
170:     manually unrolled for the 4x4 block size (a scalar sketch of the first two follows this routine) */
174: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
175: {
176: Mat C = B;
177: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
178: IS isrow = b->row,isicol = b->icol;
180: const PetscInt *r,*ic;
181: PetscInt i,j,k,nz,nzL,row;
182: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
183: const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
184: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
185: PetscInt flg;
186: PetscReal shift;
187: PetscBool allowzeropivot,zeropivotdetected;
190: allowzeropivot = PetscNot(A->erroriffailure);
191: ISGetIndices(isrow,&r);
192: ISGetIndices(isicol,&ic);
194: if (info->shifttype == MAT_SHIFT_NONE) {
195: shift = 0;
196: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
197: shift = info->shiftamount;
198: }
200: /* generate work space needed by the factorization */
201: PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
202: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
204: for (i=0; i<n; i++) {
205: /* zero rtmp */
206: /* L part */
207: nz = bi[i+1] - bi[i];
208: bjtmp = bj + bi[i];
209: for (j=0; j<nz; j++) {
210: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
211: }
213: /* U part */
214: nz = bdiag[i]-bdiag[i+1];
215: bjtmp = bj + bdiag[i+1]+1;
216: for (j=0; j<nz; j++) {
217: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
218: }
220: /* load in initial (unfactored row) */
221: nz = ai[r[i]+1] - ai[r[i]];
222: ajtmp = aj + ai[r[i]];
223: v = aa + bs2*ai[r[i]];
224: for (j=0; j<nz; j++) {
225: PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
226: }
228: /* elimination */
229: bjtmp = bj + bi[i];
230: nzL = bi[i+1] - bi[i];
231: for (k=0; k < nzL; k++) {
232: row = bjtmp[k];
233: pc = rtmp + bs2*row;
234: for (flg=0,j=0; j<bs2; j++) {
235: if (pc[j]!=0.0) {
236: flg = 1;
237: break;
238: }
239: }
240: if (flg) {
241: pv = b->a + bs2*bdiag[row];
242: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
243: PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);
245: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
246: pv = b->a + bs2*(bdiag[row+1]+1);
247: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
248: for (j=0; j<nz; j++) {
249: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
250: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
251: v = rtmp + bs2*pj[j];
252: PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
253: pv += bs2;
254: }
255: PetscLogFlops(128.0*nz+112.0); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
256: }
257: }
259: /* finished row so stick it into b->a */
260: /* L part */
261: pv = b->a + bs2*bi[i];
262: pj = b->j + bi[i];
263: nz = bi[i+1] - bi[i];
264: for (j=0; j<nz; j++) {
265: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
266: }
268: /* Mark diagonal and invert diagonal for simpler triangular solves */
269: pv = b->a + bs2*bdiag[i];
270: pj = b->j + bdiag[i];
271: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
272: PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);
273: if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
275: /* U part */
276: pv = b->a + bs2*(bdiag[i+1]+1);
277: pj = b->j + bdiag[i+1]+1;
278: nz = bdiag[i] - bdiag[i+1] - 1;
279: for (j=0; j<nz; j++) {
280: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
281: }
282: }
284: PetscFree2(rtmp,mwork);
285: ISRestoreIndices(isicol,&ic);
286: ISRestoreIndices(isrow,&r);
288: C->ops->solve = MatSolve_SeqBAIJ_4;
289: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
290: C->assembled = PETSC_TRUE;
292: PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
293: return(0);
294: }
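/*
   Editor's illustrative sketch -- not part of the original file.  Scalar reference for
   the two unrolled 4x4 block kernels used above (PetscKernel_A_gets_A_times_B_4 and
   PetscKernel_A_gets_A_minus_B_times_C_4): a block entry (i,j) lives at offset i+4*j,
   matching the index pattern of the hand-unrolled code in this file.  The Sketch_* names
   are placeholders; the real kernels live under petsc/private/kernels.
*/
PETSC_STATIC_INLINE void Sketch_A_gets_A_times_B_4(MatScalar *A,const MatScalar *B,MatScalar *W)
{
  PetscInt i,j,k;
  for (j=0; j<4; j++) {                       /* W = A*B, one column of the product at a time */
    for (i=0; i<4; i++) {
      W[i+4*j] = 0.0;
      for (k=0; k<4; k++) W[i+4*j] += A[i+4*k]*B[k+4*j];
    }
  }
  for (i=0; i<16; i++) A[i] = W[i];           /* overwrite A with the product, as the kernel name says */
}

PETSC_STATIC_INLINE void Sketch_A_gets_A_minus_B_times_C_4(MatScalar *A,const MatScalar *B,const MatScalar *C)
{
  PetscInt i,j,k;
  for (j=0; j<4; j++) {                       /* A -= B*C, used for the U(row,:) updates above */
    for (i=0; i<4; i++) {
      for (k=0; k<4; k++) A[i+4*j] -= B[i+4*k]*C[k+4*j];
    }
  }
}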
298: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
299: {
300: /*
301: Default Version for when blocks are 4 by 4 Using natural ordering
302: */
303: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
305: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
306: PetscInt *ajtmpold,*ajtmp,nz,row;
307: PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
308: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
309: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
310: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
311: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
312: MatScalar m13,m14,m15,m16;
313: MatScalar *ba = b->a,*aa = a->a;
314: PetscBool pivotinblocks = b->pivotinblocks;
315: PetscReal shift = info->shiftamount;
316: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
319: allowzeropivot = PetscNot(A->erroriffailure);
320: PetscMalloc1(16*(n+1),&rtmp);
322: for (i=0; i<n; i++) {
323: nz = bi[i+1] - bi[i];
324: ajtmp = bj + bi[i];
325: for (j=0; j<nz; j++) {
326: x = rtmp+16*ajtmp[j];
327: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
328: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
329: }
330: /* load in initial (unfactored row) */
331: nz = ai[i+1] - ai[i];
332: ajtmpold = aj + ai[i];
333: v = aa + 16*ai[i];
334: for (j=0; j<nz; j++) {
335: x = rtmp+16*ajtmpold[j];
336: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
337: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
338: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
339: x[14] = v[14]; x[15] = v[15];
340: v += 16;
341: }
342: row = *ajtmp++;
343: while (row < i) {
344: pc = rtmp + 16*row;
345: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
346: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
347: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
348: p15 = pc[14]; p16 = pc[15];
349: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
350: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
351: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
352: || p16 != 0.0) {
353: pv = ba + 16*diag_offset[row];
354: pj = bj + diag_offset[row] + 1;
355: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
356: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
357: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
358: x15 = pv[14]; x16 = pv[15];
359: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
360: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
361: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
362: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
364: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
365: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
366: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
367: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
369: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
370: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
371: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
372: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
374: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
375: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
376: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
377: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
378: nz = bi[row+1] - diag_offset[row] - 1;
379: pv += 16;
380: for (j=0; j<nz; j++) {
381: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
382: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
383: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
384: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
385: x = rtmp + 16*pj[j];
386: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
387: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
388: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
389: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
391: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
392: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
393: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
394: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
396: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
397: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
398: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
399: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
401: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
402: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
403: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
404: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
406: pv += 16;
407: }
408: PetscLogFlops(128.0*nz+112.0);
409: }
410: row = *ajtmp++;
411: }
412: /* finished row so stick it into b->a */
413: pv = ba + 16*bi[i];
414: pj = bj + bi[i];
415: nz = bi[i+1] - bi[i];
416: for (j=0; j<nz; j++) {
417: x = rtmp+16*pj[j];
418: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
419: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
420: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
421: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
422: pv += 16;
423: }
424: /* invert diagonal block */
425: w = ba + 16*diag_offset[i];
426: if (pivotinblocks) {
427: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
428: if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
429: } else {
430: PetscKernel_A_gets_inverse_A_4_nopivot(w);
431: }
432: }
434: PetscFree(rtmp);
436: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
437: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
438: C->assembled = PETSC_TRUE;
440: PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
441: return(0);
442: }
444: /*
445: MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
446: copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
447: */
450: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
451: {
452: Mat C = B;
453: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
455: PetscInt i,j,k,nz,nzL,row;
456: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
457: const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
458: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
459: PetscInt flg;
460: PetscReal shift;
461: PetscBool allowzeropivot,zeropivotdetected;
464: allowzeropivot = PetscNot(A->erroriffailure);
466: /* generate work space needed by the factorization */
467: PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
468: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
470: if (info->shifttype == MAT_SHIFT_NONE) {
471: shift = 0;
472: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
473: shift = info->shiftamount;
474: }
476: for (i=0; i<n; i++) {
477: /* zero rtmp */
478: /* L part */
479: nz = bi[i+1] - bi[i];
480: bjtmp = bj + bi[i];
481: for (j=0; j<nz; j++) {
482: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
483: }
485: /* U part */
486: nz = bdiag[i] - bdiag[i+1];
487: bjtmp = bj + bdiag[i+1]+1;
488: for (j=0; j<nz; j++) {
489: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
490: }
492: /* load in initial (unfactored row) */
493: nz = ai[i+1] - ai[i];
494: ajtmp = aj + ai[i];
495: v = aa + bs2*ai[i];
496: for (j=0; j<nz; j++) {
497: PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
498: }
500: /* elimination */
501: bjtmp = bj + bi[i];
502: nzL = bi[i+1] - bi[i];
503: for (k=0; k < nzL; k++) {
504: row = bjtmp[k];
505: pc = rtmp + bs2*row;
506: for (flg=0,j=0; j<bs2; j++) {
507: if (pc[j]!=0.0) {
508: flg = 1;
509: break;
510: }
511: }
512: if (flg) {
513: pv = b->a + bs2*bdiag[row];
514: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
515: PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);
517: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
518: pv = b->a + bs2*(bdiag[row+1]+1);
519: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
520: for (j=0; j<nz; j++) {
521: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
522: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
523: v = rtmp + bs2*pj[j];
524: PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
525: pv += bs2;
526: }
527: PetscLogFlops(128.0*nz+112.0); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
528: }
529: }
531: /* finished row so stick it into b->a */
532: /* L part */
533: pv = b->a + bs2*bi[i];
534: pj = b->j + bi[i];
535: nz = bi[i+1] - bi[i];
536: for (j=0; j<nz; j++) {
537: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
538: }
540: /* Mark diagonal and invert diagonal for simpler triangular solves */
541: pv = b->a + bs2*bdiag[i];
542: pj = b->j + bdiag[i];
543: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
544: PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);
545: if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
547: /* U part */
548: pv = b->a + bs2*(bdiag[i+1]+1);
549: pj = b->j + bdiag[i+1]+1;
550: nz = bdiag[i] - bdiag[i+1] - 1;
551: for (j=0; j<nz; j++) {
552: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
553: }
554: }
555: PetscFree2(rtmp,mwork);
557: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
558: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
559: C->assembled = PETSC_TRUE;
561: PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
562: return(0);
563: }
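/*
   Editor's illustrative sketch -- not part of the original file.  It spells out how a
   factored row is addressed in the two routines above: for block row i the L part is
   stored forward at bj[bi[i]..bi[i+1]-1], the (already inverted) diagonal block at
   bdiag[i], and the off-diagonal U part at bj[bdiag[i+1]+1..bdiag[i]-1]; bdiag decreases
   with i, which is why counts appear as bdiag[i]-bdiag[i+1]-1.  The Sketch_* name is a
   placeholder.
*/
PETSC_STATIC_INLINE PetscInt Sketch_RowBlockCount_SeqBAIJ(const Mat_SeqBAIJ *b,PetscInt i)
{
  const PetscInt *bi = b->i,*bdiag = b->diag;
  PetscInt       nzL = bi[i+1] - bi[i];             /* blocks stored in L(i,:)       */
  PetscInt       nzU = bdiag[i] - bdiag[i+1] - 1;   /* off-diagonal blocks in U(i,:) */

  return nzL + 1 + nzU;                             /* plus the inverted diagonal    */
}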
565: #if defined(PETSC_HAVE_SSE)
567: #include PETSC_HAVE_SSE
569: /* SSE Version for when blocks are 4 by 4 Using natural ordering */
572: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
573: {
574: Mat C = B; Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; /* C aliases the factor B, as in the non-SSE routines */
576: int i,j,n = a->mbs;
577: int *bj = b->j,*bjtmp,*pj;
578: int row;
579: int *ajtmpold,nz,*bi=b->i;
580: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
581: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
582: MatScalar *ba = b->a,*aa = a->a;
583: int nonzero=0;
584: PetscBool pivotinblocks = b->pivotinblocks;
585: PetscReal shift = info->shiftamount;
586: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
589: allowzeropivot = PetscNot(A->erroriffailure);
590: SSE_SCOPE_BEGIN;
592: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
593: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
594: PetscMalloc1(16*(n+1),&rtmp);
595: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
596: /* if ((unsigned long)bj==(unsigned long)aj) { */
597: /* colscale = 4; */
598: /* } */
599: for (i=0; i<n; i++) {
600: nz = bi[i+1] - bi[i];
601: bjtmp = bj + bi[i];
602: /* zero out the 4x4 block accumulators */
603: /* zero out one register */
604: XOR_PS(XMM7,XMM7);
605: for (j=0; j<nz; j++) {
606: x = rtmp+16*bjtmp[j];
607: /* x = rtmp+4*bjtmp[j]; */
608: SSE_INLINE_BEGIN_1(x)
609: /* Copy zero register to memory locations */
610: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
611: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
612: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
613: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
614: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
615: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
616: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
617: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
618: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
619: SSE_INLINE_END_1;
620: }
621: /* load in initial (unfactored row) */
622: nz = ai[i+1] - ai[i];
623: ajtmpold = aj + ai[i];
624: v = aa + 16*ai[i];
625: for (j=0; j<nz; j++) {
626: x = rtmp+16*ajtmpold[j];
627: /* x = rtmp+colscale*ajtmpold[j]; */
628: /* Copy v block into x block */
629: SSE_INLINE_BEGIN_2(v,x)
630: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
631: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
632: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
634: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
635: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
637: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
638: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
640: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
641: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
643: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
644: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
646: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
647: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
649: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
650: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
652: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
653: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
654: SSE_INLINE_END_2;
656: v += 16;
657: }
658: /* row = (*bjtmp++)/4; */
659: row = *bjtmp++;
660: while (row < i) {
661: pc = rtmp + 16*row;
662: SSE_INLINE_BEGIN_1(pc)
663: /* Load block from lower triangle */
664: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
665: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
666: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
668: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
669: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
671: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
672: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
674: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
675: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
677: /* Compare block to zero block */
679: SSE_COPY_PS(XMM4,XMM7)
680: SSE_CMPNEQ_PS(XMM4,XMM0)
682: SSE_COPY_PS(XMM5,XMM7)
683: SSE_CMPNEQ_PS(XMM5,XMM1)
685: SSE_COPY_PS(XMM6,XMM7)
686: SSE_CMPNEQ_PS(XMM6,XMM2)
688: SSE_CMPNEQ_PS(XMM7,XMM3)
690: /* Reduce the comparisons to one SSE register */
691: SSE_OR_PS(XMM6,XMM7)
692: SSE_OR_PS(XMM5,XMM4)
693: SSE_OR_PS(XMM5,XMM6)
694: SSE_INLINE_END_1;
696: /* Reduce the one SSE register to an integer register for branching */
697: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
698: MOVEMASK(nonzero,XMM5);
700: /* If block is nonzero ... */
701: if (nonzero) {
702: pv = ba + 16*diag_offset[row];
703: PREFETCH_L1(&pv[16]);
704: pj = bj + diag_offset[row] + 1;
706: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
707: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
708: /* but the diagonal was inverted already */
709: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
711: SSE_INLINE_BEGIN_2(pv,pc)
712: /* Column 0, product is accumulated in XMM4 */
713: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
714: SSE_SHUFFLE(XMM4,XMM4,0x00)
715: SSE_MULT_PS(XMM4,XMM0)
717: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
718: SSE_SHUFFLE(XMM5,XMM5,0x00)
719: SSE_MULT_PS(XMM5,XMM1)
720: SSE_ADD_PS(XMM4,XMM5)
722: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
723: SSE_SHUFFLE(XMM6,XMM6,0x00)
724: SSE_MULT_PS(XMM6,XMM2)
725: SSE_ADD_PS(XMM4,XMM6)
727: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
728: SSE_SHUFFLE(XMM7,XMM7,0x00)
729: SSE_MULT_PS(XMM7,XMM3)
730: SSE_ADD_PS(XMM4,XMM7)
732: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
733: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
735: /* Column 1, product is accumulated in XMM5 */
736: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
737: SSE_SHUFFLE(XMM5,XMM5,0x00)
738: SSE_MULT_PS(XMM5,XMM0)
740: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
741: SSE_SHUFFLE(XMM6,XMM6,0x00)
742: SSE_MULT_PS(XMM6,XMM1)
743: SSE_ADD_PS(XMM5,XMM6)
745: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
746: SSE_SHUFFLE(XMM7,XMM7,0x00)
747: SSE_MULT_PS(XMM7,XMM2)
748: SSE_ADD_PS(XMM5,XMM7)
750: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
751: SSE_SHUFFLE(XMM6,XMM6,0x00)
752: SSE_MULT_PS(XMM6,XMM3)
753: SSE_ADD_PS(XMM5,XMM6)
755: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
756: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
758: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
760: /* Column 2, product is accumulated in XMM6 */
761: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
762: SSE_SHUFFLE(XMM6,XMM6,0x00)
763: SSE_MULT_PS(XMM6,XMM0)
765: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
766: SSE_SHUFFLE(XMM7,XMM7,0x00)
767: SSE_MULT_PS(XMM7,XMM1)
768: SSE_ADD_PS(XMM6,XMM7)
770: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
771: SSE_SHUFFLE(XMM7,XMM7,0x00)
772: SSE_MULT_PS(XMM7,XMM2)
773: SSE_ADD_PS(XMM6,XMM7)
775: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
776: SSE_SHUFFLE(XMM7,XMM7,0x00)
777: SSE_MULT_PS(XMM7,XMM3)
778: SSE_ADD_PS(XMM6,XMM7)
780: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
781: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
783: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
784: /* Column 3, product is accumulated in XMM0 */
785: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
786: SSE_SHUFFLE(XMM7,XMM7,0x00)
787: SSE_MULT_PS(XMM0,XMM7)
789: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
790: SSE_SHUFFLE(XMM7,XMM7,0x00)
791: SSE_MULT_PS(XMM1,XMM7)
792: SSE_ADD_PS(XMM0,XMM1)
794: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
795: SSE_SHUFFLE(XMM1,XMM1,0x00)
796: SSE_MULT_PS(XMM1,XMM2)
797: SSE_ADD_PS(XMM0,XMM1)
799: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
800: SSE_SHUFFLE(XMM7,XMM7,0x00)
801: SSE_MULT_PS(XMM3,XMM7)
802: SSE_ADD_PS(XMM0,XMM3)
804: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
805: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
807: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
808: /* This is code to be maintained and read by humans after all. */
809: /* Copy Multiplier Col 3 into XMM3 */
810: SSE_COPY_PS(XMM3,XMM0)
811: /* Copy Multiplier Col 2 into XMM2 */
812: SSE_COPY_PS(XMM2,XMM6)
813: /* Copy Multiplier Col 1 into XMM1 */
814: SSE_COPY_PS(XMM1,XMM5)
815: /* Copy Multiplier Col 0 into XMM0 */
816: SSE_COPY_PS(XMM0,XMM4)
817: SSE_INLINE_END_2;
819: /* Update the row: */
820: nz = bi[row+1] - diag_offset[row] - 1;
821: pv += 16;
822: for (j=0; j<nz; j++) {
823: PREFETCH_L1(&pv[16]);
824: x = rtmp + 16*pj[j];
825: /* x = rtmp + 4*pj[j]; */
827: /* X:=X-M*PV, One column at a time */
828: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
829: SSE_INLINE_BEGIN_2(x,pv)
830: /* Load First Column of X*/
831: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
832: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
834: /* Matrix-Vector Product: */
835: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
836: SSE_SHUFFLE(XMM5,XMM5,0x00)
837: SSE_MULT_PS(XMM5,XMM0)
838: SSE_SUB_PS(XMM4,XMM5)
840: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
841: SSE_SHUFFLE(XMM6,XMM6,0x00)
842: SSE_MULT_PS(XMM6,XMM1)
843: SSE_SUB_PS(XMM4,XMM6)
845: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
846: SSE_SHUFFLE(XMM7,XMM7,0x00)
847: SSE_MULT_PS(XMM7,XMM2)
848: SSE_SUB_PS(XMM4,XMM7)
850: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
851: SSE_SHUFFLE(XMM5,XMM5,0x00)
852: SSE_MULT_PS(XMM5,XMM3)
853: SSE_SUB_PS(XMM4,XMM5)
855: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
856: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
858: /* Second Column */
859: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
860: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
862: /* Matrix-Vector Product: */
863: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
864: SSE_SHUFFLE(XMM6,XMM6,0x00)
865: SSE_MULT_PS(XMM6,XMM0)
866: SSE_SUB_PS(XMM5,XMM6)
868: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
869: SSE_SHUFFLE(XMM7,XMM7,0x00)
870: SSE_MULT_PS(XMM7,XMM1)
871: SSE_SUB_PS(XMM5,XMM7)
873: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
874: SSE_SHUFFLE(XMM4,XMM4,0x00)
875: SSE_MULT_PS(XMM4,XMM2)
876: SSE_SUB_PS(XMM5,XMM4)
878: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
879: SSE_SHUFFLE(XMM6,XMM6,0x00)
880: SSE_MULT_PS(XMM6,XMM3)
881: SSE_SUB_PS(XMM5,XMM6)
883: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
884: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
886: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
888: /* Third Column */
889: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
890: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
892: /* Matrix-Vector Product: */
893: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
894: SSE_SHUFFLE(XMM7,XMM7,0x00)
895: SSE_MULT_PS(XMM7,XMM0)
896: SSE_SUB_PS(XMM6,XMM7)
898: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
899: SSE_SHUFFLE(XMM4,XMM4,0x00)
900: SSE_MULT_PS(XMM4,XMM1)
901: SSE_SUB_PS(XMM6,XMM4)
903: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
904: SSE_SHUFFLE(XMM5,XMM5,0x00)
905: SSE_MULT_PS(XMM5,XMM2)
906: SSE_SUB_PS(XMM6,XMM5)
908: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
909: SSE_SHUFFLE(XMM7,XMM7,0x00)
910: SSE_MULT_PS(XMM7,XMM3)
911: SSE_SUB_PS(XMM6,XMM7)
913: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
914: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
916: /* Fourth Column */
917: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
918: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
920: /* Matrix-Vector Product: */
921: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
922: SSE_SHUFFLE(XMM5,XMM5,0x00)
923: SSE_MULT_PS(XMM5,XMM0)
924: SSE_SUB_PS(XMM4,XMM5)
926: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
927: SSE_SHUFFLE(XMM6,XMM6,0x00)
928: SSE_MULT_PS(XMM6,XMM1)
929: SSE_SUB_PS(XMM4,XMM6)
931: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
932: SSE_SHUFFLE(XMM7,XMM7,0x00)
933: SSE_MULT_PS(XMM7,XMM2)
934: SSE_SUB_PS(XMM4,XMM7)
936: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
937: SSE_SHUFFLE(XMM5,XMM5,0x00)
938: SSE_MULT_PS(XMM5,XMM3)
939: SSE_SUB_PS(XMM4,XMM5)
941: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
942: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
943: SSE_INLINE_END_2;
944: pv += 16;
945: }
946: PetscLogFlops(128.0*nz+112.0);
947: }
948: row = *bjtmp++;
949: /* row = (*bjtmp++)/4; */
950: }
951: /* finished row so stick it into b->a */
952: pv = ba + 16*bi[i];
953: pj = bj + bi[i];
954: nz = bi[i+1] - bi[i];
956: /* Copy x block back into pv block */
957: for (j=0; j<nz; j++) {
958: x = rtmp+16*pj[j];
959: /* x = rtmp+4*pj[j]; */
961: SSE_INLINE_BEGIN_2(x,pv)
962: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
963: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
964: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
966: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
967: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
969: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
970: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
972: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
973: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
975: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
976: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
978: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
979: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
981: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
982: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
984: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
985: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
986: SSE_INLINE_END_2;
987: pv += 16;
988: }
989: /* invert diagonal block */
990: w = ba + 16*diag_offset[i];
991: if (pivotinblocks) {
992: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
993: if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
994: } else {
995: PetscKernel_A_gets_inverse_A_4_nopivot(w);
996: }
997: /* PetscKernel_A_gets_inverse_A_4_SSE(w); */
998: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
999: }
1001: PetscFree(rtmp);
1003: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1004: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1005: C->assembled = PETSC_TRUE;
1007: PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1008: /* Flop Count from inverting diagonal blocks */
1009: SSE_SCOPE_END;
1010: return(0);
1011: }
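/*
   Editor's illustrative sketch -- not part of the original file.  Scalar equivalent of
   the CMPNEQ/OR/MOVEMASK sequence used above: it only decides whether a 4x4 block of the
   work row is entirely zero, so the expensive block update can be skipped.  The Sketch_*
   name is a placeholder.
*/
PETSC_STATIC_INLINE int Sketch_BlockIsNonzero_4(const MatScalar *pc)
{
  int k;
  for (k=0; k<16; k++) {
    if (pc[k] != 0.0) return 1;   /* any nonzero entry forces the update */
  }
  return 0;
}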
1015: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
1016: {
1017: Mat A = C;
1018: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1020: int i,j,n = a->mbs;
1021: unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1022: unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
1023: unsigned int row;
1024: int nz,*bi=b->i;
1025: int *diag_offset = b->diag,*ai=a->i;
1026: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1027: MatScalar *ba = b->a,*aa = a->a;
1028: int nonzero=0;
1029: PetscBool pivotinblocks = b->pivotinblocks;
1030: PetscReal shift = info->shiftamount;
1031: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
1034: allowzeropivot = PetscNot(A->erroriffailure);
1035: SSE_SCOPE_BEGIN;
1037: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1038: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1039: PetscMalloc1(16*(n+1),&rtmp);
1040: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1041: /* if ((unsigned long)bj==(unsigned long)aj) { */
1042: /* colscale = 4; */
1043: /* } */
1045: for (i=0; i<n; i++) {
1046: nz = bi[i+1] - bi[i];
1047: bjtmp = bj + bi[i];
1048: /* zero out the 4x4 block accumulators */
1049: /* zero out one register */
1050: XOR_PS(XMM7,XMM7);
1051: for (j=0; j<nz; j++) {
1052: x = rtmp+16*((unsigned int)bjtmp[j]);
1053: /* x = rtmp+4*bjtmp[j]; */
1054: SSE_INLINE_BEGIN_1(x)
1055: /* Copy zero register to memory locations */
1056: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1057: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1058: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1059: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1060: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1061: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1062: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1063: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1064: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1065: SSE_INLINE_END_1;
1066: }
1067: /* load in initial (unfactored row) */
1068: nz = ai[i+1] - ai[i];
1069: ajtmp = aj + ai[i];
1070: v = aa + 16*ai[i];
1071: for (j=0; j<nz; j++) {
1072: x = rtmp+16*((unsigned int)ajtmp[j]);
1073: /* x = rtmp+colscale*ajtmp[j]; */
1074: /* Copy v block into x block */
1075: SSE_INLINE_BEGIN_2(v,x)
1076: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1077: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1078: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1080: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1081: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1083: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1084: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1086: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1087: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1089: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1090: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1092: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1093: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1095: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1096: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1098: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1099: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1100: SSE_INLINE_END_2;
1102: v += 16;
1103: }
1104: /* row = (*bjtmp++)/4; */
1105: row = (unsigned int)(*bjtmp++);
1106: while (row < i) {
1107: pc = rtmp + 16*row;
1108: SSE_INLINE_BEGIN_1(pc)
1109: /* Load block from lower triangle */
1110: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1111: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1112: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1114: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1115: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1117: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1118: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1120: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1121: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1123: /* Compare block to zero block */
1125: SSE_COPY_PS(XMM4,XMM7)
1126: SSE_CMPNEQ_PS(XMM4,XMM0)
1128: SSE_COPY_PS(XMM5,XMM7)
1129: SSE_CMPNEQ_PS(XMM5,XMM1)
1131: SSE_COPY_PS(XMM6,XMM7)
1132: SSE_CMPNEQ_PS(XMM6,XMM2)
1134: SSE_CMPNEQ_PS(XMM7,XMM3)
1136: /* Reduce the comparisons to one SSE register */
1137: SSE_OR_PS(XMM6,XMM7)
1138: SSE_OR_PS(XMM5,XMM4)
1139: SSE_OR_PS(XMM5,XMM6)
1140: SSE_INLINE_END_1;
1142: /* Reduce the one SSE register to an integer register for branching */
1143: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1144: MOVEMASK(nonzero,XMM5);
1146: /* If block is nonzero ... */
1147: if (nonzero) {
1148: pv = ba + 16*diag_offset[row];
1149: PREFETCH_L1(&pv[16]);
1150: pj = bj + diag_offset[row] + 1;
1152: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1153: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1154: /* but the diagonal was inverted already */
1155: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1157: SSE_INLINE_BEGIN_2(pv,pc)
1158: /* Column 0, product is accumulated in XMM4 */
1159: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1160: SSE_SHUFFLE(XMM4,XMM4,0x00)
1161: SSE_MULT_PS(XMM4,XMM0)
1163: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1164: SSE_SHUFFLE(XMM5,XMM5,0x00)
1165: SSE_MULT_PS(XMM5,XMM1)
1166: SSE_ADD_PS(XMM4,XMM5)
1168: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1169: SSE_SHUFFLE(XMM6,XMM6,0x00)
1170: SSE_MULT_PS(XMM6,XMM2)
1171: SSE_ADD_PS(XMM4,XMM6)
1173: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1174: SSE_SHUFFLE(XMM7,XMM7,0x00)
1175: SSE_MULT_PS(XMM7,XMM3)
1176: SSE_ADD_PS(XMM4,XMM7)
1178: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1179: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1181: /* Column 1, product is accumulated in XMM5 */
1182: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1183: SSE_SHUFFLE(XMM5,XMM5,0x00)
1184: SSE_MULT_PS(XMM5,XMM0)
1186: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1187: SSE_SHUFFLE(XMM6,XMM6,0x00)
1188: SSE_MULT_PS(XMM6,XMM1)
1189: SSE_ADD_PS(XMM5,XMM6)
1191: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1192: SSE_SHUFFLE(XMM7,XMM7,0x00)
1193: SSE_MULT_PS(XMM7,XMM2)
1194: SSE_ADD_PS(XMM5,XMM7)
1196: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1197: SSE_SHUFFLE(XMM6,XMM6,0x00)
1198: SSE_MULT_PS(XMM6,XMM3)
1199: SSE_ADD_PS(XMM5,XMM6)
1201: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1202: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1204: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1206: /* Column 2, product is accumulated in XMM6 */
1207: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1208: SSE_SHUFFLE(XMM6,XMM6,0x00)
1209: SSE_MULT_PS(XMM6,XMM0)
1211: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1212: SSE_SHUFFLE(XMM7,XMM7,0x00)
1213: SSE_MULT_PS(XMM7,XMM1)
1214: SSE_ADD_PS(XMM6,XMM7)
1216: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1217: SSE_SHUFFLE(XMM7,XMM7,0x00)
1218: SSE_MULT_PS(XMM7,XMM2)
1219: SSE_ADD_PS(XMM6,XMM7)
1221: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1222: SSE_SHUFFLE(XMM7,XMM7,0x00)
1223: SSE_MULT_PS(XMM7,XMM3)
1224: SSE_ADD_PS(XMM6,XMM7)
1226: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1227: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1229: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1230: /* Column 3, product is accumulated in XMM0 */
1231: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1232: SSE_SHUFFLE(XMM7,XMM7,0x00)
1233: SSE_MULT_PS(XMM0,XMM7)
1235: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1236: SSE_SHUFFLE(XMM7,XMM7,0x00)
1237: SSE_MULT_PS(XMM1,XMM7)
1238: SSE_ADD_PS(XMM0,XMM1)
1240: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1241: SSE_SHUFFLE(XMM1,XMM1,0x00)
1242: SSE_MULT_PS(XMM1,XMM2)
1243: SSE_ADD_PS(XMM0,XMM1)
1245: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1246: SSE_SHUFFLE(XMM7,XMM7,0x00)
1247: SSE_MULT_PS(XMM3,XMM7)
1248: SSE_ADD_PS(XMM0,XMM3)
1250: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1251: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1253: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1254: /* This is code to be maintained and read by humans after all. */
1255: /* Copy Multiplier Col 3 into XMM3 */
1256: SSE_COPY_PS(XMM3,XMM0)
1257: /* Copy Multiplier Col 2 into XMM2 */
1258: SSE_COPY_PS(XMM2,XMM6)
1259: /* Copy Multiplier Col 1 into XMM1 */
1260: SSE_COPY_PS(XMM1,XMM5)
1261: /* Copy Multiplier Col 0 into XMM0 */
1262: SSE_COPY_PS(XMM0,XMM4)
1263: SSE_INLINE_END_2;
1265: /* Update the row: */
1266: nz = bi[row+1] - diag_offset[row] - 1;
1267: pv += 16;
1268: for (j=0; j<nz; j++) {
1269: PREFETCH_L1(&pv[16]);
1270: x = rtmp + 16*((unsigned int)pj[j]);
1271: /* x = rtmp + 4*pj[j]; */
1273: /* X:=X-M*PV, One column at a time */
1274: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1275: SSE_INLINE_BEGIN_2(x,pv)
1276: /* Load First Column of X*/
1277: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1278: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1280: /* Matrix-Vector Product: */
1281: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1282: SSE_SHUFFLE(XMM5,XMM5,0x00)
1283: SSE_MULT_PS(XMM5,XMM0)
1284: SSE_SUB_PS(XMM4,XMM5)
1286: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1287: SSE_SHUFFLE(XMM6,XMM6,0x00)
1288: SSE_MULT_PS(XMM6,XMM1)
1289: SSE_SUB_PS(XMM4,XMM6)
1291: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1292: SSE_SHUFFLE(XMM7,XMM7,0x00)
1293: SSE_MULT_PS(XMM7,XMM2)
1294: SSE_SUB_PS(XMM4,XMM7)
1296: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1297: SSE_SHUFFLE(XMM5,XMM5,0x00)
1298: SSE_MULT_PS(XMM5,XMM3)
1299: SSE_SUB_PS(XMM4,XMM5)
1301: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1302: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1304: /* Second Column */
1305: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1306: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1308: /* Matrix-Vector Product: */
1309: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1310: SSE_SHUFFLE(XMM6,XMM6,0x00)
1311: SSE_MULT_PS(XMM6,XMM0)
1312: SSE_SUB_PS(XMM5,XMM6)
1314: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1315: SSE_SHUFFLE(XMM7,XMM7,0x00)
1316: SSE_MULT_PS(XMM7,XMM1)
1317: SSE_SUB_PS(XMM5,XMM7)
1319: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1320: SSE_SHUFFLE(XMM4,XMM4,0x00)
1321: SSE_MULT_PS(XMM4,XMM2)
1322: SSE_SUB_PS(XMM5,XMM4)
1324: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1325: SSE_SHUFFLE(XMM6,XMM6,0x00)
1326: SSE_MULT_PS(XMM6,XMM3)
1327: SSE_SUB_PS(XMM5,XMM6)
1329: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1330: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1332: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1334: /* Third Column */
1335: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1336: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1338: /* Matrix-Vector Product: */
1339: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1340: SSE_SHUFFLE(XMM7,XMM7,0x00)
1341: SSE_MULT_PS(XMM7,XMM0)
1342: SSE_SUB_PS(XMM6,XMM7)
1344: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1345: SSE_SHUFFLE(XMM4,XMM4,0x00)
1346: SSE_MULT_PS(XMM4,XMM1)
1347: SSE_SUB_PS(XMM6,XMM4)
1349: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1350: SSE_SHUFFLE(XMM5,XMM5,0x00)
1351: SSE_MULT_PS(XMM5,XMM2)
1352: SSE_SUB_PS(XMM6,XMM5)
1354: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1355: SSE_SHUFFLE(XMM7,XMM7,0x00)
1356: SSE_MULT_PS(XMM7,XMM3)
1357: SSE_SUB_PS(XMM6,XMM7)
1359: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1360: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1362: /* Fourth Column */
1363: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1364: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1366: /* Matrix-Vector Product: */
1367: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1368: SSE_SHUFFLE(XMM5,XMM5,0x00)
1369: SSE_MULT_PS(XMM5,XMM0)
1370: SSE_SUB_PS(XMM4,XMM5)
1372: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1373: SSE_SHUFFLE(XMM6,XMM6,0x00)
1374: SSE_MULT_PS(XMM6,XMM1)
1375: SSE_SUB_PS(XMM4,XMM6)
1377: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1378: SSE_SHUFFLE(XMM7,XMM7,0x00)
1379: SSE_MULT_PS(XMM7,XMM2)
1380: SSE_SUB_PS(XMM4,XMM7)
1382: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1383: SSE_SHUFFLE(XMM5,XMM5,0x00)
1384: SSE_MULT_PS(XMM5,XMM3)
1385: SSE_SUB_PS(XMM4,XMM5)
1387: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1388: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1389: SSE_INLINE_END_2;
1390: pv += 16;
1391: }
1392: PetscLogFlops(128.0*nz+112.0);
1393: }
1394: row = (unsigned int)(*bjtmp++);
1395: /* row = (*bjtmp++)/4; */
1396: /* bjtmp++; */
1397: }
1398: /* finished row so stick it into b->a */
1399: pv = ba + 16*bi[i];
1400: pj = bj + bi[i];
1401: nz = bi[i+1] - bi[i];
1403: /* Copy x block back into pv block */
1404: for (j=0; j<nz; j++) {
1405: x = rtmp+16*((unsigned int)pj[j]);
1406: /* x = rtmp+4*pj[j]; */
1408: SSE_INLINE_BEGIN_2(x,pv)
1409: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1410: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1411: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1413: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1414: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1416: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1417: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1419: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1420: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1422: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1423: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1425: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1426: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1428: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1429: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1431: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1432: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1433: SSE_INLINE_END_2;
1434: pv += 16;
1435: }
1436: /* invert diagonal block */
1437: w = ba + 16*diag_offset[i];
1438: if (pivotinblocks) {
1439: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
1440: if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1441: } else {
1442: PetscKernel_A_gets_inverse_A_4_nopivot(w);
1443: }
1444: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1445: }
1447: PetscFree(rtmp);
1449: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1450: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1451: C->assembled = PETSC_TRUE;
1453: PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1454: /* Flop Count from inverting diagonal blocks */
1455: SSE_SCOPE_END;
1456: return(0);
1457: }
1461: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1462: {
1463: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1465: int i,j,n = a->mbs;
1466: unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1467: unsigned int row;
1468: int *ajtmpold,nz,*bi=b->i;
1469: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1470: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1471: MatScalar *ba = b->a,*aa = a->a;
1472: int nonzero=0;
1473: PetscBool pivotinblocks = b->pivotinblocks;
1474: PetscReal shift = info->shiftamount;
1475: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
1478: allowzeropivot = PetscNot(A->erroriffailure);
1479: SSE_SCOPE_BEGIN;
1481: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1482: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1483: PetscMalloc1(16*(n+1),&rtmp);
1484: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1485: /* if ((unsigned long)bj==(unsigned long)aj) { */
1486: /* colscale = 4; */
1487: /* } */
1488: if ((unsigned long)bj==(unsigned long)aj) {
1489: return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
1490: }
1492: for (i=0; i<n; i++) {
1493: nz = bi[i+1] - bi[i];
1494: bjtmp = bj + bi[i];
1495: /* zero out the 4x4 block accumulators */
1496: /* zero out one register */
1497: XOR_PS(XMM7,XMM7);
1498: for (j=0; j<nz; j++) {
1499: x = rtmp+16*((unsigned int)bjtmp[j]);
1500: /* x = rtmp+4*bjtmp[j]; */
1501: SSE_INLINE_BEGIN_1(x)
1502: /* Copy zero register to memory locations */
1503: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1504: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1505: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1506: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1507: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1508: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1509: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1510: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1511: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1512: SSE_INLINE_END_1;
1513: }
1514: /* load in initial (unfactored row) */
1515: nz = ai[i+1] - ai[i];
1516: ajtmpold = aj + ai[i];
1517: v = aa + 16*ai[i];
1518: for (j=0; j<nz; j++) {
1519: x = rtmp+16*ajtmpold[j];
1520: /* x = rtmp+colscale*ajtmpold[j]; */
1521: /* Copy v block into x block */
1522: SSE_INLINE_BEGIN_2(v,x)
1523: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1524: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1525: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1527: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1528: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1530: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1531: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1533: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1534: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1536: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1537: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1539: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1540: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1542: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1543: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1545: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1546: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1547: SSE_INLINE_END_2;
1549: v += 16;
1550: }
1551: /* row = (*bjtmp++)/4; */
1552: row = (unsigned int)(*bjtmp++);
1553: while (row < i) {
1554: pc = rtmp + 16*row;
1555: SSE_INLINE_BEGIN_1(pc)
1556: /* Load block from lower triangle */
1557: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1558: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1559: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1561: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1562: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1564: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1565: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1567: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1568: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1570: /* Compare block to zero block */
1572: SSE_COPY_PS(XMM4,XMM7)
1573: SSE_CMPNEQ_PS(XMM4,XMM0)
1575: SSE_COPY_PS(XMM5,XMM7)
1576: SSE_CMPNEQ_PS(XMM5,XMM1)
1578: SSE_COPY_PS(XMM6,XMM7)
1579: SSE_CMPNEQ_PS(XMM6,XMM2)
1581: SSE_CMPNEQ_PS(XMM7,XMM3)
1583: /* Reduce the comparisons to one SSE register */
1584: SSE_OR_PS(XMM6,XMM7)
1585: SSE_OR_PS(XMM5,XMM4)
1586: SSE_OR_PS(XMM5,XMM6)
1587: SSE_INLINE_END_1;
1589: /* Reduce the one SSE register to an integer register for branching */
1590: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1591: MOVEMASK(nonzero,XMM5);
1593: /* If block is nonzero ... */
1594: if (nonzero) {
1595: pv = ba + 16*diag_offset[row];
1596: PREFETCH_L1(&pv[16]);
1597: pj = bj + diag_offset[row] + 1;
1599: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1600: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1601: /* but the diagonal was inverted already */
1602: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1604: SSE_INLINE_BEGIN_2(pv,pc)
1605: /* Column 0, product is accumulated in XMM4 */
1606: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1607: SSE_SHUFFLE(XMM4,XMM4,0x00)
1608: SSE_MULT_PS(XMM4,XMM0)
1610: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1611: SSE_SHUFFLE(XMM5,XMM5,0x00)
1612: SSE_MULT_PS(XMM5,XMM1)
1613: SSE_ADD_PS(XMM4,XMM5)
1615: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1616: SSE_SHUFFLE(XMM6,XMM6,0x00)
1617: SSE_MULT_PS(XMM6,XMM2)
1618: SSE_ADD_PS(XMM4,XMM6)
1620: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1621: SSE_SHUFFLE(XMM7,XMM7,0x00)
1622: SSE_MULT_PS(XMM7,XMM3)
1623: SSE_ADD_PS(XMM4,XMM7)
1625: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1626: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1628: /* Column 1, product is accumulated in XMM5 */
1629: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1630: SSE_SHUFFLE(XMM5,XMM5,0x00)
1631: SSE_MULT_PS(XMM5,XMM0)
1633: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1634: SSE_SHUFFLE(XMM6,XMM6,0x00)
1635: SSE_MULT_PS(XMM6,XMM1)
1636: SSE_ADD_PS(XMM5,XMM6)
1638: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1639: SSE_SHUFFLE(XMM7,XMM7,0x00)
1640: SSE_MULT_PS(XMM7,XMM2)
1641: SSE_ADD_PS(XMM5,XMM7)
1643: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1644: SSE_SHUFFLE(XMM6,XMM6,0x00)
1645: SSE_MULT_PS(XMM6,XMM3)
1646: SSE_ADD_PS(XMM5,XMM6)
1648: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1649: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1651: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1653: /* Column 2, product is accumulated in XMM6 */
1654: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1655: SSE_SHUFFLE(XMM6,XMM6,0x00)
1656: SSE_MULT_PS(XMM6,XMM0)
1658: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1659: SSE_SHUFFLE(XMM7,XMM7,0x00)
1660: SSE_MULT_PS(XMM7,XMM1)
1661: SSE_ADD_PS(XMM6,XMM7)
1663: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1664: SSE_SHUFFLE(XMM7,XMM7,0x00)
1665: SSE_MULT_PS(XMM7,XMM2)
1666: SSE_ADD_PS(XMM6,XMM7)
1668: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1669: SSE_SHUFFLE(XMM7,XMM7,0x00)
1670: SSE_MULT_PS(XMM7,XMM3)
1671: SSE_ADD_PS(XMM6,XMM7)
1673: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1674: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1676: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1677: /* Column 3, product is accumulated in XMM0 */
1678: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1679: SSE_SHUFFLE(XMM7,XMM7,0x00)
1680: SSE_MULT_PS(XMM0,XMM7)
1682: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1683: SSE_SHUFFLE(XMM7,XMM7,0x00)
1684: SSE_MULT_PS(XMM1,XMM7)
1685: SSE_ADD_PS(XMM0,XMM1)
1687: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1688: SSE_SHUFFLE(XMM1,XMM1,0x00)
1689: SSE_MULT_PS(XMM1,XMM2)
1690: SSE_ADD_PS(XMM0,XMM1)
1692: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1693: SSE_SHUFFLE(XMM7,XMM7,0x00)
1694: SSE_MULT_PS(XMM3,XMM7)
1695: SSE_ADD_PS(XMM0,XMM3)
1697: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1698: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1700: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1701: /* This is code to be maintained and read by humans after all. */
1702: /* Copy Multiplier Col 3 into XMM3 */
1703: SSE_COPY_PS(XMM3,XMM0)
1704: /* Copy Multiplier Col 2 into XMM2 */
1705: SSE_COPY_PS(XMM2,XMM6)
1706: /* Copy Multiplier Col 1 into XMM1 */
1707: SSE_COPY_PS(XMM1,XMM5)
1708: /* Copy Multiplier Col 0 into XMM0 */
1709: SSE_COPY_PS(XMM0,XMM4)
1710: SSE_INLINE_END_2;
1712: /* Update the row: */
1713: nz = bi[row+1] - diag_offset[row] - 1;
1714: pv += 16;
1715: for (j=0; j<nz; j++) {
1716: PREFETCH_L1(&pv[16]);
1717: x = rtmp + 16*((unsigned int)pj[j]);
1718: /* x = rtmp + 4*pj[j]; */
1720: /* X:=X-M*PV, One column at a time */
1721: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1722: SSE_INLINE_BEGIN_2(x,pv)
1723: /* Load First Column of X*/
1724: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1725: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1727: /* Matrix-Vector Product: */
1728: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1729: SSE_SHUFFLE(XMM5,XMM5,0x00)
1730: SSE_MULT_PS(XMM5,XMM0)
1731: SSE_SUB_PS(XMM4,XMM5)
1733: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1734: SSE_SHUFFLE(XMM6,XMM6,0x00)
1735: SSE_MULT_PS(XMM6,XMM1)
1736: SSE_SUB_PS(XMM4,XMM6)
1738: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1739: SSE_SHUFFLE(XMM7,XMM7,0x00)
1740: SSE_MULT_PS(XMM7,XMM2)
1741: SSE_SUB_PS(XMM4,XMM7)
1743: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1744: SSE_SHUFFLE(XMM5,XMM5,0x00)
1745: SSE_MULT_PS(XMM5,XMM3)
1746: SSE_SUB_PS(XMM4,XMM5)
1748: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1749: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1751: /* Second Column */
1752: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1753: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1755: /* Matrix-Vector Product: */
1756: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1757: SSE_SHUFFLE(XMM6,XMM6,0x00)
1758: SSE_MULT_PS(XMM6,XMM0)
1759: SSE_SUB_PS(XMM5,XMM6)
1761: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1762: SSE_SHUFFLE(XMM7,XMM7,0x00)
1763: SSE_MULT_PS(XMM7,XMM1)
1764: SSE_SUB_PS(XMM5,XMM7)
1766: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1767: SSE_SHUFFLE(XMM4,XMM4,0x00)
1768: SSE_MULT_PS(XMM4,XMM2)
1769: SSE_SUB_PS(XMM5,XMM4)
1771: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1772: SSE_SHUFFLE(XMM6,XMM6,0x00)
1773: SSE_MULT_PS(XMM6,XMM3)
1774: SSE_SUB_PS(XMM5,XMM6)
1776: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1777: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1779: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1781: /* Third Column */
1782: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1783: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1785: /* Matrix-Vector Product: */
1786: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1787: SSE_SHUFFLE(XMM7,XMM7,0x00)
1788: SSE_MULT_PS(XMM7,XMM0)
1789: SSE_SUB_PS(XMM6,XMM7)
1791: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1792: SSE_SHUFFLE(XMM4,XMM4,0x00)
1793: SSE_MULT_PS(XMM4,XMM1)
1794: SSE_SUB_PS(XMM6,XMM4)
1796: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1797: SSE_SHUFFLE(XMM5,XMM5,0x00)
1798: SSE_MULT_PS(XMM5,XMM2)
1799: SSE_SUB_PS(XMM6,XMM5)
1801: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1802: SSE_SHUFFLE(XMM7,XMM7,0x00)
1803: SSE_MULT_PS(XMM7,XMM3)
1804: SSE_SUB_PS(XMM6,XMM7)
1806: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1807: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1809: /* Fourth Column */
1810: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1811: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1813: /* Matrix-Vector Product: */
1814: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1815: SSE_SHUFFLE(XMM5,XMM5,0x00)
1816: SSE_MULT_PS(XMM5,XMM0)
1817: SSE_SUB_PS(XMM4,XMM5)
1819: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1820: SSE_SHUFFLE(XMM6,XMM6,0x00)
1821: SSE_MULT_PS(XMM6,XMM1)
1822: SSE_SUB_PS(XMM4,XMM6)
1824: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1825: SSE_SHUFFLE(XMM7,XMM7,0x00)
1826: SSE_MULT_PS(XMM7,XMM2)
1827: SSE_SUB_PS(XMM4,XMM7)
1829: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1830: SSE_SHUFFLE(XMM5,XMM5,0x00)
1831: SSE_MULT_PS(XMM5,XMM3)
1832: SSE_SUB_PS(XMM4,XMM5)
1834: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1835: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1836: SSE_INLINE_END_2;
1837: pv += 16;
1838: }
1839: PetscLogFlops(128.0*nz+112.0);
1840: }
1841: row = (unsigned int)(*bjtmp++);
1842: /* row = (*bjtmp++)/4; */
1843: /* bjtmp++; */
1844: }
1845: /* finished row so stick it into b->a */
1846: pv = ba + 16*bi[i];
1847: pj = bj + bi[i];
1848: nz = bi[i+1] - bi[i];
1850: /* Copy x block back into pv block */
1851: for (j=0; j<nz; j++) {
1852: x = rtmp+16*((unsigned int)pj[j]);
1853: /* x = rtmp+4*pj[j]; */
1855: SSE_INLINE_BEGIN_2(x,pv)
1856: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1857: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1858: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1860: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1861: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1863: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1864: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1866: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1867: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1869: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1870: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1872: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1873: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1875: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1876: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1878: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1879: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1880: SSE_INLINE_END_2;
1881: pv += 16;
1882: }
1883: /* invert diagonal block */
1884: w = ba + 16*diag_offset[i];
1885: if (pivotinblocks) {
1886: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
1887: if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1888: } else {
1889: PetscKernel_A_gets_inverse_A_4_nopivot(w);
1890: }
1891: /* PetscKernel_A_gets_inverse_A_4_SSE(w); */
1892: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1893: }
1895: PetscFree(rtmp);
1897: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1898: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1899: C->assembled = PETSC_TRUE;
1901: PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1902: /* Flop Count from inverting diagonal blocks */
1903: SSE_SCOPE_END;
1904: return(0);
1905: }
1907: #endif