Actual source code: baijfact11.c
petsc-3.12.5 2020-03-29
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include <../src/mat/impls/baij/seq/baij.h>
6: #include <petsc/private/kernels/blockinvert.h>
8: /* ------------------------------------------------------------*/
9: /*
10: Version for when blocks are 4 by 4
11: */
12: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
13: {
14: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
15: IS isrow = b->row,isicol = b->icol;
17: const PetscInt *r,*ic;
18: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
19: PetscInt *ajtmpold,*ajtmp,nz,row;
20: PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
21: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
22: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
23: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
24: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
25: MatScalar m13,m14,m15,m16;
26: MatScalar *ba = b->a,*aa = a->a;
27: PetscBool pivotinblocks = b->pivotinblocks;
28: PetscReal shift = info->shiftamount;
29: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
32: ISGetIndices(isrow,&r);
33: ISGetIndices(isicol,&ic);
34: PetscMalloc1(16*(n+1),&rtmp);
35: allowzeropivot = PetscNot(A->erroriffailure);
37: for (i=0; i<n; i++) {
38: nz = bi[i+1] - bi[i];
39: ajtmp = bj + bi[i];
40: for (j=0; j<nz; j++) {
41: x = rtmp+16*ajtmp[j];
42: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
43: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
44: }
45: /* load in initial (unfactored row) */
46: idx = r[i];
47: nz = ai[idx+1] - ai[idx];
48: ajtmpold = aj + ai[idx];
49: v = aa + 16*ai[idx];
50: for (j=0; j<nz; j++) {
51: x = rtmp+16*ic[ajtmpold[j]];
52: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
53: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
54: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
55: x[14] = v[14]; x[15] = v[15];
56: v += 16;
57: }
58: row = *ajtmp++;
59: while (row < i) {
60: pc = rtmp + 16*row;
61: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
62: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
63: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
64: p15 = pc[14]; p16 = pc[15];
65: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
66: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
67: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
68: || p16 != 0.0) {
69: pv = ba + 16*diag_offset[row];
70: pj = bj + diag_offset[row] + 1;
71: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
72: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
73: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
74: x15 = pv[14]; x16 = pv[15];
75: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
76: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
77: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
78: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
80: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
81: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
82: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
83: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
85: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
86: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
87: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
88: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
90: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
91: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
92: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
93: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
95: nz = bi[row+1] - diag_offset[row] - 1;
96: pv += 16;
97: for (j=0; j<nz; j++) {
98: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
99: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
100: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
101: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
102: x = rtmp + 16*pj[j];
103: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
104: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
105: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
106: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
108: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
109: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
110: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
111: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
113: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
114: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
115: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
116: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
118: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
119: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
120: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
121: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
123: pv += 16;
124: }
125: PetscLogFlops(128.0*nz+112.0);
126: }
127: row = *ajtmp++;
128: }
129: /* finished row so stick it into b->a */
130: pv = ba + 16*bi[i];
131: pj = bj + bi[i];
132: nz = bi[i+1] - bi[i];
133: for (j=0; j<nz; j++) {
134: x = rtmp+16*pj[j];
135: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
136: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
137: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
138: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
139: pv += 16;
140: }
141: /* invert diagonal block */
142: w = ba + 16*diag_offset[i];
143: if (pivotinblocks) {
144: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
145: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
146: } else {
147: PetscKernel_A_gets_inverse_A_4_nopivot(w);
148: }
149: }
151: PetscFree(rtmp);
152: ISRestoreIndices(isicol,&ic);
153: ISRestoreIndices(isrow,&r);
155: C->ops->solve = MatSolve_SeqBAIJ_4_inplace;
156: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
157: C->assembled = PETSC_TRUE;
159: PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
160: return(0);
161: }
163: /* MatLUFactorNumeric_SeqBAIJ_4 -
164: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
165: PetscKernel_A_gets_A_times_B()
166: PetscKernel_A_gets_A_minus_B_times_C()
167: PetscKernel_A_gets_inverse_A()
168: */
170: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
171: {
172: Mat C = B;
173: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
174: IS isrow = b->row,isicol = b->icol;
176: const PetscInt *r,*ic;
177: PetscInt i,j,k,nz,nzL,row;
178: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
179: const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
180: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
181: PetscInt flg;
182: PetscReal shift;
183: PetscBool allowzeropivot,zeropivotdetected;
186: allowzeropivot = PetscNot(A->erroriffailure);
187: ISGetIndices(isrow,&r);
188: ISGetIndices(isicol,&ic);
190: if (info->shifttype == MAT_SHIFT_NONE) {
191: shift = 0;
192: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
193: shift = info->shiftamount;
194: }
196: /* generate work space needed by the factorization */
197: PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
198: PetscArrayzero(rtmp,bs2*n);
200: for (i=0; i<n; i++) {
201: /* zero rtmp */
202: /* L part */
203: nz = bi[i+1] - bi[i];
204: bjtmp = bj + bi[i];
205: for (j=0; j<nz; j++) {
206: PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
207: }
209: /* U part */
210: nz = bdiag[i]-bdiag[i+1];
211: bjtmp = bj + bdiag[i+1]+1;
212: for (j=0; j<nz; j++) {
213: PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
214: }
216: /* load in initial (unfactored row) */
217: nz = ai[r[i]+1] - ai[r[i]];
218: ajtmp = aj + ai[r[i]];
219: v = aa + bs2*ai[r[i]];
220: for (j=0; j<nz; j++) {
221: PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2);
222: }
224: /* elimination */
225: bjtmp = bj + bi[i];
226: nzL = bi[i+1] - bi[i];
227: for (k=0; k < nzL; k++) {
228: row = bjtmp[k];
229: pc = rtmp + bs2*row;
230: for (flg=0,j=0; j<bs2; j++) {
231: if (pc[j]!=0.0) {
232: flg = 1;
233: break;
234: }
235: }
236: if (flg) {
237: pv = b->a + bs2*bdiag[row];
238: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
239: PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);
241: pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
242: pv = b->a + bs2*(bdiag[row+1]+1);
243: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
244: for (j=0; j<nz; j++) {
245: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
246: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
247: v = rtmp + bs2*pj[j];
248: PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
249: pv += bs2;
250: }
251: PetscLogFlops(128*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
252: }
253: }
255: /* finished row so stick it into b->a */
256: /* L part */
257: pv = b->a + bs2*bi[i];
258: pj = b->j + bi[i];
259: nz = bi[i+1] - bi[i];
260: for (j=0; j<nz; j++) {
261: PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
262: }
264: /* Mark diagonal and invert diagonal for simplier triangular solves */
265: pv = b->a + bs2*bdiag[i];
266: pj = b->j + bdiag[i];
267: PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);
268: PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);
269: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
271: /* U part */
272: pv = b->a + bs2*(bdiag[i+1]+1);
273: pj = b->j + bdiag[i+1]+1;
274: nz = bdiag[i] - bdiag[i+1] - 1;
275: for (j=0; j<nz; j++) {
276: PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
277: }
278: }
280: PetscFree2(rtmp,mwork);
281: ISRestoreIndices(isicol,&ic);
282: ISRestoreIndices(isrow,&r);
284: C->ops->solve = MatSolve_SeqBAIJ_4;
285: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
286: C->assembled = PETSC_TRUE;
288: PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
289: return(0);
290: }
292: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
293: {
294: /*
295: Default Version for when blocks are 4 by 4 Using natural ordering
296: */
297: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
299: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
300: PetscInt *ajtmpold,*ajtmp,nz,row;
301: PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
302: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
303: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
304: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
305: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
306: MatScalar m13,m14,m15,m16;
307: MatScalar *ba = b->a,*aa = a->a;
308: PetscBool pivotinblocks = b->pivotinblocks;
309: PetscReal shift = info->shiftamount;
310: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
313: allowzeropivot = PetscNot(A->erroriffailure);
314: PetscMalloc1(16*(n+1),&rtmp);
316: for (i=0; i<n; i++) {
317: nz = bi[i+1] - bi[i];
318: ajtmp = bj + bi[i];
319: for (j=0; j<nz; j++) {
320: x = rtmp+16*ajtmp[j];
321: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
322: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
323: }
324: /* load in initial (unfactored row) */
325: nz = ai[i+1] - ai[i];
326: ajtmpold = aj + ai[i];
327: v = aa + 16*ai[i];
328: for (j=0; j<nz; j++) {
329: x = rtmp+16*ajtmpold[j];
330: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
331: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
332: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
333: x[14] = v[14]; x[15] = v[15];
334: v += 16;
335: }
336: row = *ajtmp++;
337: while (row < i) {
338: pc = rtmp + 16*row;
339: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
340: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
341: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
342: p15 = pc[14]; p16 = pc[15];
343: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
344: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
345: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
346: || p16 != 0.0) {
347: pv = ba + 16*diag_offset[row];
348: pj = bj + diag_offset[row] + 1;
349: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
350: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
351: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
352: x15 = pv[14]; x16 = pv[15];
353: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
354: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
355: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
356: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
358: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
359: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
360: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
361: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
363: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
364: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
365: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
366: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
368: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
369: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
370: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
371: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
372: nz = bi[row+1] - diag_offset[row] - 1;
373: pv += 16;
374: for (j=0; j<nz; j++) {
375: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
376: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
377: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
378: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
379: x = rtmp + 16*pj[j];
380: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
381: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
382: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
383: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
385: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
386: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
387: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
388: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
390: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
391: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
392: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
393: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
395: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
396: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
397: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
398: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
400: pv += 16;
401: }
402: PetscLogFlops(128.0*nz+112.0);
403: }
404: row = *ajtmp++;
405: }
406: /* finished row so stick it into b->a */
407: pv = ba + 16*bi[i];
408: pj = bj + bi[i];
409: nz = bi[i+1] - bi[i];
410: for (j=0; j<nz; j++) {
411: x = rtmp+16*pj[j];
412: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
413: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
414: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
415: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
416: pv += 16;
417: }
418: /* invert diagonal block */
419: w = ba + 16*diag_offset[i];
420: if (pivotinblocks) {
421: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
422: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
423: } else {
424: PetscKernel_A_gets_inverse_A_4_nopivot(w);
425: }
426: }
428: PetscFree(rtmp);
430: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
431: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
432: C->assembled = PETSC_TRUE;
434: PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
435: return(0);
436: }
438: /*
439: MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
440: copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
441: */
442: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
443: {
444: Mat C =B;
445: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
447: PetscInt i,j,k,nz,nzL,row;
448: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
449: const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
450: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
451: PetscInt flg;
452: PetscReal shift;
453: PetscBool allowzeropivot,zeropivotdetected;
456: allowzeropivot = PetscNot(A->erroriffailure);
458: /* generate work space needed by the factorization */
459: PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
460: PetscArrayzero(rtmp,bs2*n);
462: if (info->shifttype == MAT_SHIFT_NONE) {
463: shift = 0;
464: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
465: shift = info->shiftamount;
466: }
468: for (i=0; i<n; i++) {
469: /* zero rtmp */
470: /* L part */
471: nz = bi[i+1] - bi[i];
472: bjtmp = bj + bi[i];
473: for (j=0; j<nz; j++) {
474: PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
475: }
477: /* U part */
478: nz = bdiag[i] - bdiag[i+1];
479: bjtmp = bj + bdiag[i+1]+1;
480: for (j=0; j<nz; j++) {
481: PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
482: }
484: /* load in initial (unfactored row) */
485: nz = ai[i+1] - ai[i];
486: ajtmp = aj + ai[i];
487: v = aa + bs2*ai[i];
488: for (j=0; j<nz; j++) {
489: PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2);
490: }
492: /* elimination */
493: bjtmp = bj + bi[i];
494: nzL = bi[i+1] - bi[i];
495: for (k=0; k < nzL; k++) {
496: row = bjtmp[k];
497: pc = rtmp + bs2*row;
498: for (flg=0,j=0; j<bs2; j++) {
499: if (pc[j]!=0.0) {
500: flg = 1;
501: break;
502: }
503: }
504: if (flg) {
505: pv = b->a + bs2*bdiag[row];
506: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
507: PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);
509: pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
510: pv = b->a + bs2*(bdiag[row+1]+1);
511: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
512: for (j=0; j<nz; j++) {
513: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
514: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
515: v = rtmp + bs2*pj[j];
516: PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
517: pv += bs2;
518: }
519: PetscLogFlops(128*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
520: }
521: }
523: /* finished row so stick it into b->a */
524: /* L part */
525: pv = b->a + bs2*bi[i];
526: pj = b->j + bi[i];
527: nz = bi[i+1] - bi[i];
528: for (j=0; j<nz; j++) {
529: PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
530: }
532: /* Mark diagonal and invert diagonal for simplier triangular solves */
533: pv = b->a + bs2*bdiag[i];
534: pj = b->j + bdiag[i];
535: PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);
536: PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);
537: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
539: /* U part */
540: pv = b->a + bs2*(bdiag[i+1]+1);
541: pj = b->j + bdiag[i+1]+1;
542: nz = bdiag[i] - bdiag[i+1] - 1;
543: for (j=0; j<nz; j++) {
544: PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
545: }
546: }
547: PetscFree2(rtmp,mwork);
549: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
550: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
551: C->assembled = PETSC_TRUE;
553: PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
554: return(0);
555: }
557: #if defined(PETSC_HAVE_SSE)
559: #include PETSC_HAVE_SSE
561: /* SSE Version for when blocks are 4 by 4 Using natural ordering */
562: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
563: {
564: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
566: int i,j,n = a->mbs;
567: int *bj = b->j,*bjtmp,*pj;
568: int row;
569: int *ajtmpold,nz,*bi=b->i;
570: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
571: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
572: MatScalar *ba = b->a,*aa = a->a;
573: int nonzero=0;
574: PetscBool pivotinblocks = b->pivotinblocks;
575: PetscReal shift = info->shiftamount;
576: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
579: allowzeropivot = PetscNot(A->erroriffailure);
580: SSE_SCOPE_BEGIN;
582: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
583: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
584: PetscMalloc1(16*(n+1),&rtmp);
585: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
586: /* if ((unsigned long)bj==(unsigned long)aj) { */
587: /* colscale = 4; */
588: /* } */
589: for (i=0; i<n; i++) {
590: nz = bi[i+1] - bi[i];
591: bjtmp = bj + bi[i];
592: /* zero out the 4x4 block accumulators */
593: /* zero out one register */
594: XOR_PS(XMM7,XMM7);
595: for (j=0; j<nz; j++) {
596: x = rtmp+16*bjtmp[j];
597: /* x = rtmp+4*bjtmp[j]; */
598: SSE_INLINE_BEGIN_1(x)
599: /* Copy zero register to memory locations */
600: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
601: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
602: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
603: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
604: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
605: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
606: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
607: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
608: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
609: SSE_INLINE_END_1;
610: }
611: /* load in initial (unfactored row) */
612: nz = ai[i+1] - ai[i];
613: ajtmpold = aj + ai[i];
614: v = aa + 16*ai[i];
615: for (j=0; j<nz; j++) {
616: x = rtmp+16*ajtmpold[j];
617: /* x = rtmp+colscale*ajtmpold[j]; */
618: /* Copy v block into x block */
619: SSE_INLINE_BEGIN_2(v,x)
620: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
621: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
622: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
624: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
625: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
627: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
628: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
630: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
631: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
633: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
634: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
636: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
637: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
639: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
640: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
642: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
643: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
644: SSE_INLINE_END_2;
646: v += 16;
647: }
648: /* row = (*bjtmp++)/4; */
649: row = *bjtmp++;
650: while (row < i) {
651: pc = rtmp + 16*row;
652: SSE_INLINE_BEGIN_1(pc)
653: /* Load block from lower triangle */
654: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
655: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
656: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
658: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
659: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
661: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
662: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
664: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
665: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
667: /* Compare block to zero block */
669: SSE_COPY_PS(XMM4,XMM7)
670: SSE_CMPNEQ_PS(XMM4,XMM0)
672: SSE_COPY_PS(XMM5,XMM7)
673: SSE_CMPNEQ_PS(XMM5,XMM1)
675: SSE_COPY_PS(XMM6,XMM7)
676: SSE_CMPNEQ_PS(XMM6,XMM2)
678: SSE_CMPNEQ_PS(XMM7,XMM3)
680: /* Reduce the comparisons to one SSE register */
681: SSE_OR_PS(XMM6,XMM7)
682: SSE_OR_PS(XMM5,XMM4)
683: SSE_OR_PS(XMM5,XMM6)
684: SSE_INLINE_END_1;
686: /* Reduce the one SSE register to an integer register for branching */
687: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
688: MOVEMASK(nonzero,XMM5);
690: /* If block is nonzero ... */
691: if (nonzero) {
692: pv = ba + 16*diag_offset[row];
693: PREFETCH_L1(&pv[16]);
694: pj = bj + diag_offset[row] + 1;
696: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
697: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
698: /* but the diagonal was inverted already */
699: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
701: SSE_INLINE_BEGIN_2(pv,pc)
702: /* Column 0, product is accumulated in XMM4 */
703: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
704: SSE_SHUFFLE(XMM4,XMM4,0x00)
705: SSE_MULT_PS(XMM4,XMM0)
707: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
708: SSE_SHUFFLE(XMM5,XMM5,0x00)
709: SSE_MULT_PS(XMM5,XMM1)
710: SSE_ADD_PS(XMM4,XMM5)
712: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
713: SSE_SHUFFLE(XMM6,XMM6,0x00)
714: SSE_MULT_PS(XMM6,XMM2)
715: SSE_ADD_PS(XMM4,XMM6)
717: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
718: SSE_SHUFFLE(XMM7,XMM7,0x00)
719: SSE_MULT_PS(XMM7,XMM3)
720: SSE_ADD_PS(XMM4,XMM7)
722: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
723: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
725: /* Column 1, product is accumulated in XMM5 */
726: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
727: SSE_SHUFFLE(XMM5,XMM5,0x00)
728: SSE_MULT_PS(XMM5,XMM0)
730: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
731: SSE_SHUFFLE(XMM6,XMM6,0x00)
732: SSE_MULT_PS(XMM6,XMM1)
733: SSE_ADD_PS(XMM5,XMM6)
735: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
736: SSE_SHUFFLE(XMM7,XMM7,0x00)
737: SSE_MULT_PS(XMM7,XMM2)
738: SSE_ADD_PS(XMM5,XMM7)
740: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
741: SSE_SHUFFLE(XMM6,XMM6,0x00)
742: SSE_MULT_PS(XMM6,XMM3)
743: SSE_ADD_PS(XMM5,XMM6)
745: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
746: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
748: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
750: /* Column 2, product is accumulated in XMM6 */
751: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
752: SSE_SHUFFLE(XMM6,XMM6,0x00)
753: SSE_MULT_PS(XMM6,XMM0)
755: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
756: SSE_SHUFFLE(XMM7,XMM7,0x00)
757: SSE_MULT_PS(XMM7,XMM1)
758: SSE_ADD_PS(XMM6,XMM7)
760: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
761: SSE_SHUFFLE(XMM7,XMM7,0x00)
762: SSE_MULT_PS(XMM7,XMM2)
763: SSE_ADD_PS(XMM6,XMM7)
765: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
766: SSE_SHUFFLE(XMM7,XMM7,0x00)
767: SSE_MULT_PS(XMM7,XMM3)
768: SSE_ADD_PS(XMM6,XMM7)
770: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
771: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
773: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
774: /* Column 3, product is accumulated in XMM0 */
775: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
776: SSE_SHUFFLE(XMM7,XMM7,0x00)
777: SSE_MULT_PS(XMM0,XMM7)
779: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
780: SSE_SHUFFLE(XMM7,XMM7,0x00)
781: SSE_MULT_PS(XMM1,XMM7)
782: SSE_ADD_PS(XMM0,XMM1)
784: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
785: SSE_SHUFFLE(XMM1,XMM1,0x00)
786: SSE_MULT_PS(XMM1,XMM2)
787: SSE_ADD_PS(XMM0,XMM1)
789: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
790: SSE_SHUFFLE(XMM7,XMM7,0x00)
791: SSE_MULT_PS(XMM3,XMM7)
792: SSE_ADD_PS(XMM0,XMM3)
794: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
795: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
797: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
798: /* This is code to be maintained and read by humans afterall. */
799: /* Copy Multiplier Col 3 into XMM3 */
800: SSE_COPY_PS(XMM3,XMM0)
801: /* Copy Multiplier Col 2 into XMM2 */
802: SSE_COPY_PS(XMM2,XMM6)
803: /* Copy Multiplier Col 1 into XMM1 */
804: SSE_COPY_PS(XMM1,XMM5)
805: /* Copy Multiplier Col 0 into XMM0 */
806: SSE_COPY_PS(XMM0,XMM4)
807: SSE_INLINE_END_2;
809: /* Update the row: */
810: nz = bi[row+1] - diag_offset[row] - 1;
811: pv += 16;
812: for (j=0; j<nz; j++) {
813: PREFETCH_L1(&pv[16]);
814: x = rtmp + 16*pj[j];
815: /* x = rtmp + 4*pj[j]; */
817: /* X:=X-M*PV, One column at a time */
818: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
819: SSE_INLINE_BEGIN_2(x,pv)
820: /* Load First Column of X*/
821: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
822: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
824: /* Matrix-Vector Product: */
825: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
826: SSE_SHUFFLE(XMM5,XMM5,0x00)
827: SSE_MULT_PS(XMM5,XMM0)
828: SSE_SUB_PS(XMM4,XMM5)
830: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
831: SSE_SHUFFLE(XMM6,XMM6,0x00)
832: SSE_MULT_PS(XMM6,XMM1)
833: SSE_SUB_PS(XMM4,XMM6)
835: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
836: SSE_SHUFFLE(XMM7,XMM7,0x00)
837: SSE_MULT_PS(XMM7,XMM2)
838: SSE_SUB_PS(XMM4,XMM7)
840: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
841: SSE_SHUFFLE(XMM5,XMM5,0x00)
842: SSE_MULT_PS(XMM5,XMM3)
843: SSE_SUB_PS(XMM4,XMM5)
845: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
846: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
848: /* Second Column */
849: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
850: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
852: /* Matrix-Vector Product: */
853: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
854: SSE_SHUFFLE(XMM6,XMM6,0x00)
855: SSE_MULT_PS(XMM6,XMM0)
856: SSE_SUB_PS(XMM5,XMM6)
858: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
859: SSE_SHUFFLE(XMM7,XMM7,0x00)
860: SSE_MULT_PS(XMM7,XMM1)
861: SSE_SUB_PS(XMM5,XMM7)
863: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
864: SSE_SHUFFLE(XMM4,XMM4,0x00)
865: SSE_MULT_PS(XMM4,XMM2)
866: SSE_SUB_PS(XMM5,XMM4)
868: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
869: SSE_SHUFFLE(XMM6,XMM6,0x00)
870: SSE_MULT_PS(XMM6,XMM3)
871: SSE_SUB_PS(XMM5,XMM6)
873: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
874: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
876: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
878: /* Third Column */
879: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
880: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
882: /* Matrix-Vector Product: */
883: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
884: SSE_SHUFFLE(XMM7,XMM7,0x00)
885: SSE_MULT_PS(XMM7,XMM0)
886: SSE_SUB_PS(XMM6,XMM7)
888: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
889: SSE_SHUFFLE(XMM4,XMM4,0x00)
890: SSE_MULT_PS(XMM4,XMM1)
891: SSE_SUB_PS(XMM6,XMM4)
893: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
894: SSE_SHUFFLE(XMM5,XMM5,0x00)
895: SSE_MULT_PS(XMM5,XMM2)
896: SSE_SUB_PS(XMM6,XMM5)
898: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
899: SSE_SHUFFLE(XMM7,XMM7,0x00)
900: SSE_MULT_PS(XMM7,XMM3)
901: SSE_SUB_PS(XMM6,XMM7)
903: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
904: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
906: /* Fourth Column */
907: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
908: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
910: /* Matrix-Vector Product: */
911: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
912: SSE_SHUFFLE(XMM5,XMM5,0x00)
913: SSE_MULT_PS(XMM5,XMM0)
914: SSE_SUB_PS(XMM4,XMM5)
916: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
917: SSE_SHUFFLE(XMM6,XMM6,0x00)
918: SSE_MULT_PS(XMM6,XMM1)
919: SSE_SUB_PS(XMM4,XMM6)
921: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
922: SSE_SHUFFLE(XMM7,XMM7,0x00)
923: SSE_MULT_PS(XMM7,XMM2)
924: SSE_SUB_PS(XMM4,XMM7)
926: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
927: SSE_SHUFFLE(XMM5,XMM5,0x00)
928: SSE_MULT_PS(XMM5,XMM3)
929: SSE_SUB_PS(XMM4,XMM5)
931: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
932: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
933: SSE_INLINE_END_2;
934: pv += 16;
935: }
936: PetscLogFlops(128.0*nz+112.0);
937: }
938: row = *bjtmp++;
939: /* row = (*bjtmp++)/4; */
940: }
941: /* finished row so stick it into b->a */
942: pv = ba + 16*bi[i];
943: pj = bj + bi[i];
944: nz = bi[i+1] - bi[i];
946: /* Copy x block back into pv block */
947: for (j=0; j<nz; j++) {
948: x = rtmp+16*pj[j];
949: /* x = rtmp+4*pj[j]; */
951: SSE_INLINE_BEGIN_2(x,pv)
952: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
953: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
954: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
956: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
957: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
959: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
960: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
962: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
963: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
965: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
966: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
968: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
969: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
971: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
972: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
974: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
975: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
976: SSE_INLINE_END_2;
977: pv += 16;
978: }
979: /* invert diagonal block */
980: w = ba + 16*diag_offset[i];
981: if (pivotinblocks) {
982: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
983: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
984: } else {
985: PetscKernel_A_gets_inverse_A_4_nopivot(w);
986: }
987: /* PetscKernel_A_gets_inverse_A_4_SSE(w); */
988: /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
989: }
991: PetscFree(rtmp);
993: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
994: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
995: C->assembled = PETSC_TRUE;
997: PetscLogFlops(1.333333333333*bs*bs2*b->mbs);
998: /* Flop Count from inverting diagonal blocks */
999: SSE_SCOPE_END;
1000: return(0);
1001: }
1003: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
1004: {
1005: Mat A =C;
1006: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1008: int i,j,n = a->mbs;
1009: unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1010: unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
1011: unsigned int row;
1012: int nz,*bi=b->i;
1013: int *diag_offset = b->diag,*ai=a->i;
1014: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1015: MatScalar *ba = b->a,*aa = a->a;
1016: int nonzero=0;
1017: PetscBool pivotinblocks = b->pivotinblocks;
1018: PetscReal shift = info->shiftamount;
1019: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
1022: allowzeropivot = PetscNot(A->erroriffailure);
1023: SSE_SCOPE_BEGIN;
1025: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1026: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1027: PetscMalloc1(16*(n+1),&rtmp);
1028: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1029: /* if ((unsigned long)bj==(unsigned long)aj) { */
1030: /* colscale = 4; */
1031: /* } */
1033: for (i=0; i<n; i++) {
1034: nz = bi[i+1] - bi[i];
1035: bjtmp = bj + bi[i];
1036: /* zero out the 4x4 block accumulators */
1037: /* zero out one register */
1038: XOR_PS(XMM7,XMM7);
1039: for (j=0; j<nz; j++) {
1040: x = rtmp+16*((unsigned int)bjtmp[j]);
1041: /* x = rtmp+4*bjtmp[j]; */
1042: SSE_INLINE_BEGIN_1(x)
1043: /* Copy zero register to memory locations */
1044: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1045: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1046: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1047: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1048: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1049: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1050: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1051: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1052: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1053: SSE_INLINE_END_1;
1054: }
1055: /* load in initial (unfactored row) */
1056: nz = ai[i+1] - ai[i];
1057: ajtmp = aj + ai[i];
1058: v = aa + 16*ai[i];
1059: for (j=0; j<nz; j++) {
1060: x = rtmp+16*((unsigned int)ajtmp[j]);
1061: /* x = rtmp+colscale*ajtmp[j]; */
1062: /* Copy v block into x block */
1063: SSE_INLINE_BEGIN_2(v,x)
1064: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1065: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1066: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1068: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1069: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1071: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1072: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1074: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1075: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1077: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1078: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1080: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1081: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1083: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1084: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1086: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1087: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1088: SSE_INLINE_END_2;
1090: v += 16;
1091: }
1092: /* row = (*bjtmp++)/4; */
1093: row = (unsigned int)(*bjtmp++);
1094: while (row < i) {
1095: pc = rtmp + 16*row;
1096: SSE_INLINE_BEGIN_1(pc)
1097: /* Load block from lower triangle */
1098: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1099: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1100: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1102: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1103: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1105: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1106: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1108: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1109: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1111: /* Compare block to zero block */
1113: SSE_COPY_PS(XMM4,XMM7)
1114: SSE_CMPNEQ_PS(XMM4,XMM0)
1116: SSE_COPY_PS(XMM5,XMM7)
1117: SSE_CMPNEQ_PS(XMM5,XMM1)
1119: SSE_COPY_PS(XMM6,XMM7)
1120: SSE_CMPNEQ_PS(XMM6,XMM2)
1122: SSE_CMPNEQ_PS(XMM7,XMM3)
1124: /* Reduce the comparisons to one SSE register */
1125: SSE_OR_PS(XMM6,XMM7)
1126: SSE_OR_PS(XMM5,XMM4)
1127: SSE_OR_PS(XMM5,XMM6)
1128: SSE_INLINE_END_1;
1130: /* Reduce the one SSE register to an integer register for branching */
1131: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1132: MOVEMASK(nonzero,XMM5);
1134: /* If block is nonzero ... */
1135: if (nonzero) {
1136: pv = ba + 16*diag_offset[row];
1137: PREFETCH_L1(&pv[16]);
1138: pj = bj + diag_offset[row] + 1;
1140: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1141: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1142: /* but the diagonal was inverted already */
1143: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1145: SSE_INLINE_BEGIN_2(pv,pc)
1146: /* Column 0, product is accumulated in XMM4 */
1147: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1148: SSE_SHUFFLE(XMM4,XMM4,0x00)
1149: SSE_MULT_PS(XMM4,XMM0)
1151: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1152: SSE_SHUFFLE(XMM5,XMM5,0x00)
1153: SSE_MULT_PS(XMM5,XMM1)
1154: SSE_ADD_PS(XMM4,XMM5)
1156: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1157: SSE_SHUFFLE(XMM6,XMM6,0x00)
1158: SSE_MULT_PS(XMM6,XMM2)
1159: SSE_ADD_PS(XMM4,XMM6)
1161: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1162: SSE_SHUFFLE(XMM7,XMM7,0x00)
1163: SSE_MULT_PS(XMM7,XMM3)
1164: SSE_ADD_PS(XMM4,XMM7)
1166: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1167: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1169: /* Column 1, product is accumulated in XMM5 */
1170: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1171: SSE_SHUFFLE(XMM5,XMM5,0x00)
1172: SSE_MULT_PS(XMM5,XMM0)
1174: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1175: SSE_SHUFFLE(XMM6,XMM6,0x00)
1176: SSE_MULT_PS(XMM6,XMM1)
1177: SSE_ADD_PS(XMM5,XMM6)
1179: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1180: SSE_SHUFFLE(XMM7,XMM7,0x00)
1181: SSE_MULT_PS(XMM7,XMM2)
1182: SSE_ADD_PS(XMM5,XMM7)
1184: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1185: SSE_SHUFFLE(XMM6,XMM6,0x00)
1186: SSE_MULT_PS(XMM6,XMM3)
1187: SSE_ADD_PS(XMM5,XMM6)
1189: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1190: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1192: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1194: /* Column 2, product is accumulated in XMM6 */
1195: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1196: SSE_SHUFFLE(XMM6,XMM6,0x00)
1197: SSE_MULT_PS(XMM6,XMM0)
1199: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1200: SSE_SHUFFLE(XMM7,XMM7,0x00)
1201: SSE_MULT_PS(XMM7,XMM1)
1202: SSE_ADD_PS(XMM6,XMM7)
1204: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1205: SSE_SHUFFLE(XMM7,XMM7,0x00)
1206: SSE_MULT_PS(XMM7,XMM2)
1207: SSE_ADD_PS(XMM6,XMM7)
1209: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1210: SSE_SHUFFLE(XMM7,XMM7,0x00)
1211: SSE_MULT_PS(XMM7,XMM3)
1212: SSE_ADD_PS(XMM6,XMM7)
1214: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1215: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1217: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1218: /* Column 3, product is accumulated in XMM0 */
1219: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1220: SSE_SHUFFLE(XMM7,XMM7,0x00)
1221: SSE_MULT_PS(XMM0,XMM7)
1223: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1224: SSE_SHUFFLE(XMM7,XMM7,0x00)
1225: SSE_MULT_PS(XMM1,XMM7)
1226: SSE_ADD_PS(XMM0,XMM1)
1228: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1229: SSE_SHUFFLE(XMM1,XMM1,0x00)
1230: SSE_MULT_PS(XMM1,XMM2)
1231: SSE_ADD_PS(XMM0,XMM1)
1233: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1234: SSE_SHUFFLE(XMM7,XMM7,0x00)
1235: SSE_MULT_PS(XMM3,XMM7)
1236: SSE_ADD_PS(XMM0,XMM3)
1238: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1239: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1241: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1242: /* This is code to be maintained and read by humans afterall. */
1243: /* Copy Multiplier Col 3 into XMM3 */
1244: SSE_COPY_PS(XMM3,XMM0)
1245: /* Copy Multiplier Col 2 into XMM2 */
1246: SSE_COPY_PS(XMM2,XMM6)
1247: /* Copy Multiplier Col 1 into XMM1 */
1248: SSE_COPY_PS(XMM1,XMM5)
1249: /* Copy Multiplier Col 0 into XMM0 */
1250: SSE_COPY_PS(XMM0,XMM4)
1251: SSE_INLINE_END_2;
1253: /* Update the row: */
1254: nz = bi[row+1] - diag_offset[row] - 1;
1255: pv += 16;
1256: for (j=0; j<nz; j++) {
1257: PREFETCH_L1(&pv[16]);
1258: x = rtmp + 16*((unsigned int)pj[j]);
1259: /* x = rtmp + 4*pj[j]; */
1261: /* X:=X-M*PV, One column at a time */
1262: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1263: SSE_INLINE_BEGIN_2(x,pv)
1264: /* Load First Column of X*/
1265: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1266: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1268: /* Matrix-Vector Product: */
1269: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1270: SSE_SHUFFLE(XMM5,XMM5,0x00)
1271: SSE_MULT_PS(XMM5,XMM0)
1272: SSE_SUB_PS(XMM4,XMM5)
1274: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1275: SSE_SHUFFLE(XMM6,XMM6,0x00)
1276: SSE_MULT_PS(XMM6,XMM1)
1277: SSE_SUB_PS(XMM4,XMM6)
1279: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1280: SSE_SHUFFLE(XMM7,XMM7,0x00)
1281: SSE_MULT_PS(XMM7,XMM2)
1282: SSE_SUB_PS(XMM4,XMM7)
1284: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1285: SSE_SHUFFLE(XMM5,XMM5,0x00)
1286: SSE_MULT_PS(XMM5,XMM3)
1287: SSE_SUB_PS(XMM4,XMM5)
1289: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1290: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1292: /* Second Column */
1293: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1294: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1296: /* Matrix-Vector Product: */
1297: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1298: SSE_SHUFFLE(XMM6,XMM6,0x00)
1299: SSE_MULT_PS(XMM6,XMM0)
1300: SSE_SUB_PS(XMM5,XMM6)
1302: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1303: SSE_SHUFFLE(XMM7,XMM7,0x00)
1304: SSE_MULT_PS(XMM7,XMM1)
1305: SSE_SUB_PS(XMM5,XMM7)
1307: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1308: SSE_SHUFFLE(XMM4,XMM4,0x00)
1309: SSE_MULT_PS(XMM4,XMM2)
1310: SSE_SUB_PS(XMM5,XMM4)
1312: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1313: SSE_SHUFFLE(XMM6,XMM6,0x00)
1314: SSE_MULT_PS(XMM6,XMM3)
1315: SSE_SUB_PS(XMM5,XMM6)
1317: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1318: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1320: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1322: /* Third Column */
1323: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1324: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1326: /* Matrix-Vector Product: */
1327: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1328: SSE_SHUFFLE(XMM7,XMM7,0x00)
1329: SSE_MULT_PS(XMM7,XMM0)
1330: SSE_SUB_PS(XMM6,XMM7)
1332: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1333: SSE_SHUFFLE(XMM4,XMM4,0x00)
1334: SSE_MULT_PS(XMM4,XMM1)
1335: SSE_SUB_PS(XMM6,XMM4)
1337: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1338: SSE_SHUFFLE(XMM5,XMM5,0x00)
1339: SSE_MULT_PS(XMM5,XMM2)
1340: SSE_SUB_PS(XMM6,XMM5)
1342: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1343: SSE_SHUFFLE(XMM7,XMM7,0x00)
1344: SSE_MULT_PS(XMM7,XMM3)
1345: SSE_SUB_PS(XMM6,XMM7)
1347: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1348: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1350: /* Fourth Column */
1351: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1352: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1354: /* Matrix-Vector Product: */
1355: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1356: SSE_SHUFFLE(XMM5,XMM5,0x00)
1357: SSE_MULT_PS(XMM5,XMM0)
1358: SSE_SUB_PS(XMM4,XMM5)
1360: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1361: SSE_SHUFFLE(XMM6,XMM6,0x00)
1362: SSE_MULT_PS(XMM6,XMM1)
1363: SSE_SUB_PS(XMM4,XMM6)
1365: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1366: SSE_SHUFFLE(XMM7,XMM7,0x00)
1367: SSE_MULT_PS(XMM7,XMM2)
1368: SSE_SUB_PS(XMM4,XMM7)
1370: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1371: SSE_SHUFFLE(XMM5,XMM5,0x00)
1372: SSE_MULT_PS(XMM5,XMM3)
1373: SSE_SUB_PS(XMM4,XMM5)
1375: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1376: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1377: SSE_INLINE_END_2;
1378: pv += 16;
1379: }
1380: PetscLogFlops(128.0*nz+112.0);
1381: }
1382: row = (unsigned int)(*bjtmp++);
1383: /* row = (*bjtmp++)/4; */
1384: /* bjtmp++; */
1385: }
1386: /* finished row so stick it into b->a */
1387: pv = ba + 16*bi[i];
1388: pj = bj + bi[i];
1389: nz = bi[i+1] - bi[i];
1391: /* Copy x block back into pv block */
1392: for (j=0; j<nz; j++) {
1393: x = rtmp+16*((unsigned int)pj[j]);
1394: /* x = rtmp+4*pj[j]; */
1396: SSE_INLINE_BEGIN_2(x,pv)
1397: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1398: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1399: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1401: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1402: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1404: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1405: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1407: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1408: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1410: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1411: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1413: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1414: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1416: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1417: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1419: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1420: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1421: SSE_INLINE_END_2;
1422: pv += 16;
1423: }
1424: /* invert diagonal block */
1425: w = ba + 16*diag_offset[i];
1426: if (pivotinblocks) {
1427: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
1428: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1429: } else {
1430: PetscKernel_A_gets_inverse_A_4_nopivot(w);
1431: }
1432: /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
1433: }
1435: PetscFree(rtmp);
1437: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1438: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1439: C->assembled = PETSC_TRUE;
1441: PetscLogFlops(1.333333333333*bs*bs2*b->mbs);
1442: /* Flop Count from inverting diagonal blocks */
1443: SSE_SCOPE_END;
1444: return(0);
1445: }
1447: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1448: {
1449: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1451: int i,j,n = a->mbs;
1452: unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1453: unsigned int row;
1454: int *ajtmpold,nz,*bi=b->i;
1455: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1456: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1457: MatScalar *ba = b->a,*aa = a->a;
1458: int nonzero=0;
1459: PetscBool pivotinblocks = b->pivotinblocks;
1460: PetscReal shift = info->shiftamount;
1461: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
1464: allowzeropivot = PetscNot(A->erroriffailure);
1465: SSE_SCOPE_BEGIN;
1467: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1468: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1469: PetscMalloc1(16*(n+1),&rtmp);
1470: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1471: /* if ((unsigned long)bj==(unsigned long)aj) { */
1472: /* colscale = 4; */
1473: /* } */
1474: if ((unsigned long)bj==(unsigned long)aj) {
1475: return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1476: }
1478: for (i=0; i<n; i++) {
1479: nz = bi[i+1] - bi[i];
1480: bjtmp = bj + bi[i];
1481: /* zero out the 4x4 block accumulators */
1482: /* zero out one register */
1483: XOR_PS(XMM7,XMM7);
1484: for (j=0; j<nz; j++) {
1485: x = rtmp+16*((unsigned int)bjtmp[j]);
1486: /* x = rtmp+4*bjtmp[j]; */
1487: SSE_INLINE_BEGIN_1(x)
1488: /* Copy zero register to memory locations */
1489: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1490: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1491: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1492: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1493: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1494: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1495: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1496: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1497: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1498: SSE_INLINE_END_1;
1499: }
1500: /* load in initial (unfactored row) */
1501: nz = ai[i+1] - ai[i];
1502: ajtmpold = aj + ai[i];
1503: v = aa + 16*ai[i];
1504: for (j=0; j<nz; j++) {
1505: x = rtmp+16*ajtmpold[j];
1506: /* x = rtmp+colscale*ajtmpold[j]; */
1507: /* Copy v block into x block */
1508: SSE_INLINE_BEGIN_2(v,x)
1509: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1510: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1511: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1513: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1514: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1516: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1517: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1519: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1520: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1522: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1523: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1525: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1526: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1528: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1529: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1531: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1532: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1533: SSE_INLINE_END_2;
1535: v += 16;
1536: }
1537: /* row = (*bjtmp++)/4; */
1538: row = (unsigned int)(*bjtmp++);
1539: while (row < i) {
1540: pc = rtmp + 16*row;
1541: SSE_INLINE_BEGIN_1(pc)
1542: /* Load block from lower triangle */
1543: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1544: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1545: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1547: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1548: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1550: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1551: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1553: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1554: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1556: /* Compare block to zero block */
1558: SSE_COPY_PS(XMM4,XMM7)
1559: SSE_CMPNEQ_PS(XMM4,XMM0)
1561: SSE_COPY_PS(XMM5,XMM7)
1562: SSE_CMPNEQ_PS(XMM5,XMM1)
1564: SSE_COPY_PS(XMM6,XMM7)
1565: SSE_CMPNEQ_PS(XMM6,XMM2)
1567: SSE_CMPNEQ_PS(XMM7,XMM3)
1569: /* Reduce the comparisons to one SSE register */
1570: SSE_OR_PS(XMM6,XMM7)
1571: SSE_OR_PS(XMM5,XMM4)
1572: SSE_OR_PS(XMM5,XMM6)
1573: SSE_INLINE_END_1;
1575: /* Reduce the one SSE register to an integer register for branching */
1576: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1577: MOVEMASK(nonzero,XMM5);
1579: /* If block is nonzero ... */
1580: if (nonzero) {
1581: pv = ba + 16*diag_offset[row];
1582: PREFETCH_L1(&pv[16]);
1583: pj = bj + diag_offset[row] + 1;
1585: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1586: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1587: /* but the diagonal was inverted already */
1588: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1590: SSE_INLINE_BEGIN_2(pv,pc)
1591: /* Column 0, product is accumulated in XMM4 */
1592: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1593: SSE_SHUFFLE(XMM4,XMM4,0x00)
1594: SSE_MULT_PS(XMM4,XMM0)
1596: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1597: SSE_SHUFFLE(XMM5,XMM5,0x00)
1598: SSE_MULT_PS(XMM5,XMM1)
1599: SSE_ADD_PS(XMM4,XMM5)
1601: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1602: SSE_SHUFFLE(XMM6,XMM6,0x00)
1603: SSE_MULT_PS(XMM6,XMM2)
1604: SSE_ADD_PS(XMM4,XMM6)
1606: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1607: SSE_SHUFFLE(XMM7,XMM7,0x00)
1608: SSE_MULT_PS(XMM7,XMM3)
1609: SSE_ADD_PS(XMM4,XMM7)
1611: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1612: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1614: /* Column 1, product is accumulated in XMM5 */
1615: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1616: SSE_SHUFFLE(XMM5,XMM5,0x00)
1617: SSE_MULT_PS(XMM5,XMM0)
1619: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1620: SSE_SHUFFLE(XMM6,XMM6,0x00)
1621: SSE_MULT_PS(XMM6,XMM1)
1622: SSE_ADD_PS(XMM5,XMM6)
1624: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1625: SSE_SHUFFLE(XMM7,XMM7,0x00)
1626: SSE_MULT_PS(XMM7,XMM2)
1627: SSE_ADD_PS(XMM5,XMM7)
1629: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1630: SSE_SHUFFLE(XMM6,XMM6,0x00)
1631: SSE_MULT_PS(XMM6,XMM3)
1632: SSE_ADD_PS(XMM5,XMM6)
1634: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1635: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1637: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1639: /* Column 2, product is accumulated in XMM6 */
1640: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1641: SSE_SHUFFLE(XMM6,XMM6,0x00)
1642: SSE_MULT_PS(XMM6,XMM0)
1644: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1645: SSE_SHUFFLE(XMM7,XMM7,0x00)
1646: SSE_MULT_PS(XMM7,XMM1)
1647: SSE_ADD_PS(XMM6,XMM7)
1649: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1650: SSE_SHUFFLE(XMM7,XMM7,0x00)
1651: SSE_MULT_PS(XMM7,XMM2)
1652: SSE_ADD_PS(XMM6,XMM7)
1654: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1655: SSE_SHUFFLE(XMM7,XMM7,0x00)
1656: SSE_MULT_PS(XMM7,XMM3)
1657: SSE_ADD_PS(XMM6,XMM7)
1659: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1660: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1662: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1663: /* Column 3, product is accumulated in XMM0 */
1664: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1665: SSE_SHUFFLE(XMM7,XMM7,0x00)
1666: SSE_MULT_PS(XMM0,XMM7)
1668: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1669: SSE_SHUFFLE(XMM7,XMM7,0x00)
1670: SSE_MULT_PS(XMM1,XMM7)
1671: SSE_ADD_PS(XMM0,XMM1)
1673: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1674: SSE_SHUFFLE(XMM1,XMM1,0x00)
1675: SSE_MULT_PS(XMM1,XMM2)
1676: SSE_ADD_PS(XMM0,XMM1)
1678: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1679: SSE_SHUFFLE(XMM7,XMM7,0x00)
1680: SSE_MULT_PS(XMM3,XMM7)
1681: SSE_ADD_PS(XMM0,XMM3)
1683: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1684: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1686: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1687: /* This is code to be maintained and read by humans afterall. */
1688: /* Copy Multiplier Col 3 into XMM3 */
1689: SSE_COPY_PS(XMM3,XMM0)
1690: /* Copy Multiplier Col 2 into XMM2 */
1691: SSE_COPY_PS(XMM2,XMM6)
1692: /* Copy Multiplier Col 1 into XMM1 */
1693: SSE_COPY_PS(XMM1,XMM5)
1694: /* Copy Multiplier Col 0 into XMM0 */
1695: SSE_COPY_PS(XMM0,XMM4)
1696: SSE_INLINE_END_2;
1698: /* Update the row: */
1699: nz = bi[row+1] - diag_offset[row] - 1;
1700: pv += 16;
1701: for (j=0; j<nz; j++) {
1702: PREFETCH_L1(&pv[16]);
1703: x = rtmp + 16*((unsigned int)pj[j]);
1704: /* x = rtmp + 4*pj[j]; */
1706: /* X:=X-M*PV, One column at a time */
1707: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1708: SSE_INLINE_BEGIN_2(x,pv)
1709: /* Load First Column of X*/
1710: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1711: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1713: /* Matrix-Vector Product: */
1714: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1715: SSE_SHUFFLE(XMM5,XMM5,0x00)
1716: SSE_MULT_PS(XMM5,XMM0)
1717: SSE_SUB_PS(XMM4,XMM5)
1719: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1720: SSE_SHUFFLE(XMM6,XMM6,0x00)
1721: SSE_MULT_PS(XMM6,XMM1)
1722: SSE_SUB_PS(XMM4,XMM6)
1724: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1725: SSE_SHUFFLE(XMM7,XMM7,0x00)
1726: SSE_MULT_PS(XMM7,XMM2)
1727: SSE_SUB_PS(XMM4,XMM7)
1729: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1730: SSE_SHUFFLE(XMM5,XMM5,0x00)
1731: SSE_MULT_PS(XMM5,XMM3)
1732: SSE_SUB_PS(XMM4,XMM5)
1734: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1735: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1737: /* Second Column */
1738: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1739: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1741: /* Matrix-Vector Product: */
1742: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1743: SSE_SHUFFLE(XMM6,XMM6,0x00)
1744: SSE_MULT_PS(XMM6,XMM0)
1745: SSE_SUB_PS(XMM5,XMM6)
1747: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1748: SSE_SHUFFLE(XMM7,XMM7,0x00)
1749: SSE_MULT_PS(XMM7,XMM1)
1750: SSE_SUB_PS(XMM5,XMM7)
1752: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1753: SSE_SHUFFLE(XMM4,XMM4,0x00)
1754: SSE_MULT_PS(XMM4,XMM2)
1755: SSE_SUB_PS(XMM5,XMM4)
1757: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1758: SSE_SHUFFLE(XMM6,XMM6,0x00)
1759: SSE_MULT_PS(XMM6,XMM3)
1760: SSE_SUB_PS(XMM5,XMM6)
1762: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1763: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1765: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1767: /* Third Column */
1768: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1769: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1771: /* Matrix-Vector Product: */
1772: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1773: SSE_SHUFFLE(XMM7,XMM7,0x00)
1774: SSE_MULT_PS(XMM7,XMM0)
1775: SSE_SUB_PS(XMM6,XMM7)
1777: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1778: SSE_SHUFFLE(XMM4,XMM4,0x00)
1779: SSE_MULT_PS(XMM4,XMM1)
1780: SSE_SUB_PS(XMM6,XMM4)
1782: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1783: SSE_SHUFFLE(XMM5,XMM5,0x00)
1784: SSE_MULT_PS(XMM5,XMM2)
1785: SSE_SUB_PS(XMM6,XMM5)
1787: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1788: SSE_SHUFFLE(XMM7,XMM7,0x00)
1789: SSE_MULT_PS(XMM7,XMM3)
1790: SSE_SUB_PS(XMM6,XMM7)
1792: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1793: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1795: /* Fourth Column */
1796: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1797: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1799: /* Matrix-Vector Product: */
1800: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1801: SSE_SHUFFLE(XMM5,XMM5,0x00)
1802: SSE_MULT_PS(XMM5,XMM0)
1803: SSE_SUB_PS(XMM4,XMM5)
1805: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1806: SSE_SHUFFLE(XMM6,XMM6,0x00)
1807: SSE_MULT_PS(XMM6,XMM1)
1808: SSE_SUB_PS(XMM4,XMM6)
1810: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1811: SSE_SHUFFLE(XMM7,XMM7,0x00)
1812: SSE_MULT_PS(XMM7,XMM2)
1813: SSE_SUB_PS(XMM4,XMM7)
1815: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1816: SSE_SHUFFLE(XMM5,XMM5,0x00)
1817: SSE_MULT_PS(XMM5,XMM3)
1818: SSE_SUB_PS(XMM4,XMM5)
1820: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1821: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1822: SSE_INLINE_END_2;
1823: pv += 16;
1824: }
1825: PetscLogFlops(128.0*nz+112.0);
1826: }
1827: row = (unsigned int)(*bjtmp++);
1828: /* row = (*bjtmp++)/4; */
1829: /* bjtmp++; */
1830: }
1831: /* finished row so stick it into b->a */
1832: pv = ba + 16*bi[i];
1833: pj = bj + bi[i];
1834: nz = bi[i+1] - bi[i];
1836: /* Copy x block back into pv block */
1837: for (j=0; j<nz; j++) {
1838: x = rtmp+16*((unsigned int)pj[j]);
1839: /* x = rtmp+4*pj[j]; */
1841: SSE_INLINE_BEGIN_2(x,pv)
1842: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1843: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1844: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1846: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1847: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1849: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1850: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1852: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1853: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1855: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1856: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1858: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1859: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1861: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1862: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1864: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1865: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1866: SSE_INLINE_END_2;
1867: pv += 16;
1868: }
1869: /* invert diagonal block */
1870: w = ba + 16*diag_offset[i];
1871: if (pivotinblocks) {
1872: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,,&zeropivotdetected);
1873: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1874: } else {
1875: PetscKernel_A_gets_inverse_A_4_nopivot(w);
1876: }
1877: /* PetscKernel_A_gets_inverse_A_4_SSE(w); */
1878: /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
1879: }
1881: PetscFree(rtmp);
1883: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1884: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1885: C->assembled = PETSC_TRUE;
1887: PetscLogFlops(1.333333333333*bs*bs2*b->mbs);
1888: /* Flop Count from inverting diagonal blocks */
1889: SSE_SCOPE_END;
1890: return(0);
1891: }
1893: #endif