Actual source code: baijsolvnat4.c (petsc-3.14.6, 2021-03-30)
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: /*
5: Special case where the matrix was ILU(0) factored in the natural
6: ordering. This eliminates the need for the column and row permutation.
7: */
8: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
9: {
10: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
11: PetscInt n =a->mbs;
12: const PetscInt *ai=a->i,*aj=a->j;
13: PetscErrorCode ierr;
14: const PetscInt *diag = a->diag;
15: const MatScalar *aa =a->a;
16: PetscScalar *x;
17: const PetscScalar *b;
20: VecGetArrayRead(bb,&b);
21: VecGetArray(xx,&x);
23: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
24: {
25: static PetscScalar w[2000]; /* very BAD need to fix */
26: fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
27: }
28: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
29: fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
30: #else
31: {
32: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
33: const MatScalar *v;
34: PetscInt jdx,idt,idx,nz,i,ai16;
35: const PetscInt *vi;
37: /* forward solve the lower triangular */
38: idx = 0;
39: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
40: for (i=1; i<n; i++) {
41: v = aa + 16*ai[i];
42: vi = aj + ai[i];
43: nz = diag[i] - ai[i];
44: idx += 4;
45: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
46: while (nz--) {
47: jdx = 4*(*vi++);
48: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
49: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
50: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
51: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
52: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
53: v += 16;
54: }
55: x[idx] = s1;
56: x[1+idx] = s2;
57: x[2+idx] = s3;
58: x[3+idx] = s4;
59: }
60: /* backward solve the upper triangular */
61: idt = 4*(n-1);
62: for (i=n-1; i>=0; i--) {
63: ai16 = 16*diag[i];
64: v = aa + ai16 + 16;
65: vi = aj + diag[i] + 1;
66: nz = ai[i+1] - diag[i] - 1;
67: s1 = x[idt]; s2 = x[1+idt];
68: s3 = x[2+idt];s4 = x[3+idt];
69: while (nz--) {
70: idx = 4*(*vi++);
71: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx];
72: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
73: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
74: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
75: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
76: v += 16;
77: }
78: v = aa + ai16;
79: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
80: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
81: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
82: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
83: idt -= 4;
84: }
85: }
86: #endif
88: VecRestoreArrayRead(bb,&b);
89: VecRestoreArray(xx,&x);
90: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
91: return(0);
92: }
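/*
   The inner loops above treat each 4x4 block as a column-major array of 16
   MatScalars: v[0..3] is the first column, v[4..7] the second, and so on.
   The following standalone helper (a hypothetical name, not part of PETSc)
   is a minimal sketch of the "s -= B*x" update those loops perform on one
   block, assuming that same column-major layout.
*/
static inline void BlockMultSub4(PetscScalar s[4],const MatScalar v[16],const PetscScalar x[4])
{
  s[0] -= v[0]*x[0] + v[4]*x[1] + v[8]*x[2]  + v[12]*x[3];
  s[1] -= v[1]*x[0] + v[5]*x[1] + v[9]*x[2]  + v[13]*x[3];
  s[2] -= v[2]*x[0] + v[6]*x[1] + v[10]*x[2] + v[14]*x[3];
  s[3] -= v[3]*x[0] + v[7]*x[1] + v[11]*x[2] + v[15]*x[3];
}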
94: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
95: {
96: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
97: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
98: PetscInt i,k,nz,idx,jdx,idt;
99: PetscErrorCode ierr;
100: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
101: const MatScalar *aa=a->a,*v;
102: PetscScalar *x;
103: const PetscScalar *b;
104: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
107: VecGetArrayRead(bb,&b);
108: VecGetArray(xx,&x);
109: /* forward solve the lower triangular */
110: idx = 0;
111: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
112: for (i=1; i<n; i++) {
113: v = aa + bs2*ai[i];
114: vi = aj + ai[i];
115: nz = ai[i+1] - ai[i];
116: idx = bs*i;
117: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
118: for (k=0; k<nz; k++) {
119: jdx = bs*vi[k];
120: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
121: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
122: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
123: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
124: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
126: v += bs2;
127: }
129: x[idx] = s1;
130: x[1+idx] = s2;
131: x[2+idx] = s3;
132: x[3+idx] = s4;
133: }
135: /* backward solve the upper triangular */
136: for (i=n-1; i>=0; i--) {
137: v = aa + bs2*(adiag[i+1]+1);
138: vi = aj + adiag[i+1]+1;
139: nz = adiag[i] - adiag[i+1]-1;
140: idt = bs*i;
141: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
143: for (k=0; k<nz; k++) {
144: idx = bs*vi[k];
145: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
146: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
147: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
148: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
149: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
151: v += bs2;
152: }
153: /* x = inv_diagonal*x */
154: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
155: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
156: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
157: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
159: }
161: VecRestoreArrayRead(bb,&b);
162: VecRestoreArray(xx,&x);
163: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
164: return(0);
165: }
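/*
   The index arithmetic in MatSolve_SeqBAIJ_4_NaturalOrdering suggests how the
   factored matrix is laid out: for block row i the strictly lower triangular
   blocks of L occupy positions ai[i] .. ai[i+1]-1 (the unit diagonal is not
   stored), the strictly upper triangular blocks of U sit between
   adiag[i+1]+1 and adiag[i]-1, and the already-inverted diagonal block is at
   adiag[i].  A block-size-1 sketch of the same two sweeps (illustration only,
   not a PETSc routine, and assuming that layout):
*/
static void SolveNaturalOrderingSketch_bs1(PetscInt n,const PetscInt *ai,const PetscInt *aj,
                                           const PetscInt *adiag,const MatScalar *aa,
                                           const PetscScalar *b,PetscScalar *x)
{
  PetscInt    i,k;
  PetscScalar s;

  x[0] = b[0];
  for (i=1; i<n; i++) {               /* forward solve with the unit diagonal of L */
    s = b[i];
    for (k=ai[i]; k<ai[i+1]; k++) s -= aa[k]*x[aj[k]];
    x[i] = s;
  }
  for (i=n-1; i>=0; i--) {            /* backward solve, then scale by inv(U_ii) */
    s = x[i];
    for (k=adiag[i+1]+1; k<adiag[i]; k++) s -= aa[k]*x[aj[k]];
    x[i] = aa[adiag[i]]*s;
  }
}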
167: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
168: {
169: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
170: const PetscInt n =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
171: PetscErrorCode ierr;
172: const MatScalar *aa=a->a;
173: const PetscScalar *b;
174: PetscScalar *x;
177: VecGetArrayRead(bb,&b);
178: VecGetArray(xx,&x);
180: {
181: MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
182: const MatScalar *v;
183: MatScalar *t=(MatScalar*)x;
184: PetscInt jdx,idt,idx,nz,i,ai16;
185: const PetscInt *vi;
187: /* forward solve the lower triangular */
188: idx = 0;
189: t[0] = (MatScalar)b[0];
190: t[1] = (MatScalar)b[1];
191: t[2] = (MatScalar)b[2];
192: t[3] = (MatScalar)b[3];
193: for (i=1; i<n; i++) {
194: v = aa + 16*ai[i];
195: vi = aj + ai[i];
196: nz = diag[i] - ai[i];
197: idx += 4;
198: s1 = (MatScalar)b[idx];
199: s2 = (MatScalar)b[1+idx];
200: s3 = (MatScalar)b[2+idx];
201: s4 = (MatScalar)b[3+idx];
202: while (nz--) {
203: jdx = 4*(*vi++);
204: x1 = t[jdx];
205: x2 = t[1+jdx];
206: x3 = t[2+jdx];
207: x4 = t[3+jdx];
208: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
209: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
210: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
211: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
212: v += 16;
213: }
214: t[idx] = s1;
215: t[1+idx] = s2;
216: t[2+idx] = s3;
217: t[3+idx] = s4;
218: }
219: /* backward solve the upper triangular */
220: idt = 4*(n-1);
221: for (i=n-1; i>=0; i--) {
222: ai16 = 16*diag[i];
223: v = aa + ai16 + 16;
224: vi = aj + diag[i] + 1;
225: nz = ai[i+1] - diag[i] - 1;
226: s1 = t[idt];
227: s2 = t[1+idt];
228: s3 = t[2+idt];
229: s4 = t[3+idt];
230: while (nz--) {
231: idx = 4*(*vi++);
232: x1 = (MatScalar)x[idx];
233: x2 = (MatScalar)x[1+idx];
234: x3 = (MatScalar)x[2+idx];
235: x4 = (MatScalar)x[3+idx];
236: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
237: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
238: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
239: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
240: v += 16;
241: }
242: v = aa + ai16;
243: x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4);
244: x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4);
245: x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
246: x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
247: idt -= 4;
248: }
249: }
251: VecRestoreArrayRead(bb,&b);
252: VecRestoreArray(xx,&x);
253: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
254: return(0);
255: }
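/*
   The _Demotion variant above is only interesting when MatScalar is a narrower
   type than PetscScalar (single-precision matrix data with double-precision
   vectors): the solution vector x doubles as a packed float workspace t, the
   sweeps run in the matrix precision, and the results are widened back to
   PetscScalar (block by block during the backward sweep above; as a separate
   final pass in the SSE variants below).  A minimal standalone sketch of that
   demote/compute/promote trick on a plain array (illustration only; note that
   the promotion must run back to front so the packed floats are not
   overwritten before they are read):
*/
static void DemotePromoteSketch(double *x,const double *b,int n)
{
  float *t = (float*)x;                        /* reuse the double buffer as packed floats */
  int   i;

  for (i=0; i<n; i++) t[i] = (float)b[i];      /* demote the right-hand side */
  /* ... the single-precision triangular sweeps would update t[] here ... */
  for (i=n-1; i>=0; i--) x[i] = (double)t[i];  /* promote in place, back to front */
}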
257: #if defined(PETSC_HAVE_SSE)
259: #include PETSC_HAVE_SSE
260: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
261: {
262: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
263: unsigned short *aj=(unsigned short*)a->j;
265: int *ai=a->i,n=a->mbs,*diag = a->diag;
266: MatScalar *aa=a->a;
267: PetscScalar *x,*b;
270: SSE_SCOPE_BEGIN;
271: /*
272: Note: This code currently uses demotion of double
273: to float when performing the mixed-mode computation.
274: This may not be numerically reasonable for all applications.
275: */
276: PREFETCH_NTA(aa+16*ai[1]);
278: VecGetArray(bb,&b);
279: VecGetArray(xx,&x);
280: {
281: /* x will first be computed in single precision, then promoted in place to double */
282: MatScalar *v,*t=(MatScalar*)x;
283: int nz,i,idt,ai16;
284: unsigned int jdx,idx;
285: unsigned short *vi;
286: /* Forward solve the lower triangular factor. */
288: /* First block is the identity. */
289: idx = 0;
290: CONVERT_DOUBLE4_FLOAT4(t,b);
291: v = aa + 16*((unsigned int)ai[1]);
293: for (i=1; i<n;) {
294: PREFETCH_NTA(&v[8]);
295: vi = aj + ai[i];
296: nz = diag[i] - ai[i];
297: idx += 4;
299: /* Demote RHS from double to float. */
300: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
301: LOAD_PS(&t[idx],XMM7);
303: while (nz--) {
304: PREFETCH_NTA(&v[16]);
305: jdx = 4*((unsigned int)(*vi++));
307: /* 4x4 Matrix-Vector product with negative accumulation: */
308: SSE_INLINE_BEGIN_2(&t[jdx],v)
309: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
311: /* First Column */
312: SSE_COPY_PS(XMM0,XMM6)
313: SSE_SHUFFLE(XMM0,XMM0,0x00)
314: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
315: SSE_SUB_PS(XMM7,XMM0)
317: /* Second Column */
318: SSE_COPY_PS(XMM1,XMM6)
319: SSE_SHUFFLE(XMM1,XMM1,0x55)
320: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
321: SSE_SUB_PS(XMM7,XMM1)
323: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
325: /* Third Column */
326: SSE_COPY_PS(XMM2,XMM6)
327: SSE_SHUFFLE(XMM2,XMM2,0xAA)
328: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
329: SSE_SUB_PS(XMM7,XMM2)
331: /* Fourth Column */
332: SSE_COPY_PS(XMM3,XMM6)
333: SSE_SHUFFLE(XMM3,XMM3,0xFF)
334: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
335: SSE_SUB_PS(XMM7,XMM3)
336: SSE_INLINE_END_2
338: v += 16;
339: }
340: v = aa + 16*ai[++i];
341: PREFETCH_NTA(v);
342: STORE_PS(&t[idx],XMM7);
343: }
345: /* Backward solve the upper triangular factor.*/
347: idt = 4*(n-1);
348: ai16 = 16*diag[n-1];
349: v = aa + ai16 + 16;
350: for (i=n-1; i>=0;) {
351: PREFETCH_NTA(&v[8]);
352: vi = aj + diag[i] + 1;
353: nz = ai[i+1] - diag[i] - 1;
355: LOAD_PS(&t[idt],XMM7);
357: while (nz--) {
358: PREFETCH_NTA(&v[16]);
359: idx = 4*((unsigned int)(*vi++));
361: /* 4x4 Matrix-Vector Product with negative accumulation: */
362: SSE_INLINE_BEGIN_2(&t[idx],v)
363: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
365: /* First Column */
366: SSE_COPY_PS(XMM0,XMM6)
367: SSE_SHUFFLE(XMM0,XMM0,0x00)
368: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
369: SSE_SUB_PS(XMM7,XMM0)
371: /* Second Column */
372: SSE_COPY_PS(XMM1,XMM6)
373: SSE_SHUFFLE(XMM1,XMM1,0x55)
374: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
375: SSE_SUB_PS(XMM7,XMM1)
377: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
379: /* Third Column */
380: SSE_COPY_PS(XMM2,XMM6)
381: SSE_SHUFFLE(XMM2,XMM2,0xAA)
382: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
383: SSE_SUB_PS(XMM7,XMM2)
385: /* Fourth Column */
386: SSE_COPY_PS(XMM3,XMM6)
387: SSE_SHUFFLE(XMM3,XMM3,0xFF)
388: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
389: SSE_SUB_PS(XMM7,XMM3)
390: SSE_INLINE_END_2
391: v += 16;
392: }
393: v = aa + ai16;
394: ai16 = 16*diag[--i];
395: PREFETCH_NTA(aa+ai16+16);
396: /*
397: Scale the result by the diagonal 4x4 block,
398: which was inverted as part of the factorization
399: */
400: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
401: /* First Column */
402: SSE_COPY_PS(XMM0,XMM7)
403: SSE_SHUFFLE(XMM0,XMM0,0x00)
404: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
406: /* Second Column */
407: SSE_COPY_PS(XMM1,XMM7)
408: SSE_SHUFFLE(XMM1,XMM1,0x55)
409: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
410: SSE_ADD_PS(XMM0,XMM1)
412: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
414: /* Third Column */
415: SSE_COPY_PS(XMM2,XMM7)
416: SSE_SHUFFLE(XMM2,XMM2,0xAA)
417: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
418: SSE_ADD_PS(XMM0,XMM2)
420: /* Fourth Column */
421: SSE_COPY_PS(XMM3,XMM7)
422: SSE_SHUFFLE(XMM3,XMM3,0xFF)
423: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
424: SSE_ADD_PS(XMM0,XMM3)
426: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
427: SSE_INLINE_END_3
429: v = aa + ai16 + 16;
430: idt -= 4;
431: }
433: /* Convert t from single precision back to double precision (in place) */
434: idt = 4*(n-1);
435: for (i=n-1; i>=0; i--) {
436: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
437: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
438: PetscScalar *xtemp=&x[idt];
439: MatScalar *ttemp=&t[idt];
440: xtemp[3] = (PetscScalar)ttemp[3];
441: xtemp[2] = (PetscScalar)ttemp[2];
442: xtemp[1] = (PetscScalar)ttemp[1];
443: xtemp[0] = (PetscScalar)ttemp[0];
444: idt -= 4;
445: }
447: } /* End of artificial scope. */
448: VecRestoreArray(bb,&b);
449: VecRestoreArray(xx,&x);
450: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
451: SSE_SCOPE_END;
452: return(0);
453: }
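/*
   The _usj variant above differs from the plain SSE variant that follows only
   in reading the block-column indices through an unsigned short pointer, which
   halves the index memory traffic.  It therefore appears to assume that a
   companion factorization has stored a->j as 16-bit values, i.e. that the
   number of block rows fits in an unsigned short.
*/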
455: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
456: {
457: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
458: int *aj=a->j;
460: int *ai=a->i,n=a->mbs,*diag = a->diag;
461: MatScalar *aa=a->a;
462: PetscScalar *x,*b;
465: SSE_SCOPE_BEGIN;
466: /*
467: Note: This code currently uses demotion of double
468: to float when performing the mixed-mode computation.
469: This may not be numerically reasonable for all applications.
470: */
471: PREFETCH_NTA(aa+16*ai[1]);
473: VecGetArray(bb,&b);
474: VecGetArray(xx,&x);
475: {
476: /* x will first be computed in single precision, then promoted in place to double */
477: MatScalar *v,*t=(MatScalar*)x;
478: int nz,i,idt,ai16;
479: int jdx,idx;
480: int *vi;
481: /* Forward solve the lower triangular factor. */
483: /* First block is the identity. */
484: idx = 0;
485: CONVERT_DOUBLE4_FLOAT4(t,b);
486: v = aa + 16*ai[1];
488: for (i=1; i<n;) {
489: PREFETCH_NTA(&v[8]);
490: vi = aj + ai[i];
491: nz = diag[i] - ai[i];
492: idx += 4;
494: /* Demote RHS from double to float. */
495: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
496: LOAD_PS(&t[idx],XMM7);
498: while (nz--) {
499: PREFETCH_NTA(&v[16]);
500: jdx = 4*(*vi++);
501: /* jdx = *vi++; */
503: /* 4x4 Matrix-Vector product with negative accumulation: */
504: SSE_INLINE_BEGIN_2(&t[jdx],v)
505: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
507: /* First Column */
508: SSE_COPY_PS(XMM0,XMM6)
509: SSE_SHUFFLE(XMM0,XMM0,0x00)
510: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
511: SSE_SUB_PS(XMM7,XMM0)
513: /* Second Column */
514: SSE_COPY_PS(XMM1,XMM6)
515: SSE_SHUFFLE(XMM1,XMM1,0x55)
516: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
517: SSE_SUB_PS(XMM7,XMM1)
519: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
521: /* Third Column */
522: SSE_COPY_PS(XMM2,XMM6)
523: SSE_SHUFFLE(XMM2,XMM2,0xAA)
524: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
525: SSE_SUB_PS(XMM7,XMM2)
527: /* Fourth Column */
528: SSE_COPY_PS(XMM3,XMM6)
529: SSE_SHUFFLE(XMM3,XMM3,0xFF)
530: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
531: SSE_SUB_PS(XMM7,XMM3)
532: SSE_INLINE_END_2
534: v += 16;
535: }
536: v = aa + 16*ai[++i];
537: PREFETCH_NTA(v);
538: STORE_PS(&t[idx],XMM7);
539: }
541: /* Backward solve the upper triangular factor.*/
543: idt = 4*(n-1);
544: ai16 = 16*diag[n-1];
545: v = aa + ai16 + 16;
546: for (i=n-1; i>=0;) {
547: PREFETCH_NTA(&v[8]);
548: vi = aj + diag[i] + 1;
549: nz = ai[i+1] - diag[i] - 1;
551: LOAD_PS(&t[idt],XMM7);
553: while (nz--) {
554: PREFETCH_NTA(&v[16]);
555: idx = 4*(*vi++);
556: /* idx = *vi++; */
558: /* 4x4 Matrix-Vector Product with negative accumulation: */
559: SSE_INLINE_BEGIN_2(&t[idx],v)
560: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
562: /* First Column */
563: SSE_COPY_PS(XMM0,XMM6)
564: SSE_SHUFFLE(XMM0,XMM0,0x00)
565: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
566: SSE_SUB_PS(XMM7,XMM0)
568: /* Second Column */
569: SSE_COPY_PS(XMM1,XMM6)
570: SSE_SHUFFLE(XMM1,XMM1,0x55)
571: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
572: SSE_SUB_PS(XMM7,XMM1)
574: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
576: /* Third Column */
577: SSE_COPY_PS(XMM2,XMM6)
578: SSE_SHUFFLE(XMM2,XMM2,0xAA)
579: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
580: SSE_SUB_PS(XMM7,XMM2)
582: /* Fourth Column */
583: SSE_COPY_PS(XMM3,XMM6)
584: SSE_SHUFFLE(XMM3,XMM3,0xFF)
585: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
586: SSE_SUB_PS(XMM7,XMM3)
587: SSE_INLINE_END_2
588: v += 16;
589: }
590: v = aa + ai16;
591: ai16 = 16*diag[--i];
592: PREFETCH_NTA(aa+ai16+16);
593: /*
594: Scale the result by the diagonal 4x4 block,
595: which was inverted as part of the factorization
596: */
597: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
598: /* First Column */
599: SSE_COPY_PS(XMM0,XMM7)
600: SSE_SHUFFLE(XMM0,XMM0,0x00)
601: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
603: /* Second Column */
604: SSE_COPY_PS(XMM1,XMM7)
605: SSE_SHUFFLE(XMM1,XMM1,0x55)
606: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
607: SSE_ADD_PS(XMM0,XMM1)
609: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
611: /* Third Column */
612: SSE_COPY_PS(XMM2,XMM7)
613: SSE_SHUFFLE(XMM2,XMM2,0xAA)
614: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
615: SSE_ADD_PS(XMM0,XMM2)
617: /* Fourth Column */
618: SSE_COPY_PS(XMM3,XMM7)
619: SSE_SHUFFLE(XMM3,XMM3,0xFF)
620: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
621: SSE_ADD_PS(XMM0,XMM3)
623: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
624: SSE_INLINE_END_3
626: v = aa + ai16 + 16;
627: idt -= 4;
628: }
630: /* Convert t from single precision back to double precision (in place) */
631: idt = 4*(n-1);
632: for (i=n-1; i>=0; i--) {
633: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
634: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
635: PetscScalar *xtemp=&x[idt];
636: MatScalar *ttemp=&t[idt];
637: xtemp[3] = (PetscScalar)ttemp[3];
638: xtemp[2] = (PetscScalar)ttemp[2];
639: xtemp[1] = (PetscScalar)ttemp[1];
640: xtemp[0] = (PetscScalar)ttemp[0];
641: idt -= 4;
642: }
644: } /* End of artificial scope. */
645: VecRestoreArray(bb,&b);
646: VecRestoreArray(xx,&x);
647: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
648: SSE_SCOPE_END;
649: return(0);
650: }
652: #endif
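/*
   None of these kernels is called directly.  They are installed as the
   MatSolve implementation of the factored matrix when a block-size-4 SeqBAIJ
   matrix is ILU(0)-factored in the natural ordering.  A sketch of a solve that
   would typically reach this path (SolveWithNaturalILU0 is a hypothetical
   example routine; error handling is abbreviated and A is assumed to be a
   MATSEQBAIJ matrix with block size 4, with conforming vectors b and x):
*/
#include <petscksp.h>

PetscErrorCode SolveWithNaturalILU0(Mat A,Vec b,Vec x)
{
  KSP            ksp;
  PC             pc;
  PetscErrorCode ierr;

  ierr = KSPCreate(PETSC_COMM_SELF,&ksp);CHKERRQ(ierr);
  ierr = KSPSetOperators(ksp,A,A);CHKERRQ(ierr);
  ierr = KSPGetPC(ksp,&pc);CHKERRQ(ierr);
  ierr = PCSetType(pc,PCILU);CHKERRQ(ierr);                               /* ILU(0) by default */
  ierr = PCFactorSetMatOrderingType(pc,MATORDERINGNATURAL);CHKERRQ(ierr); /* keep natural ordering */
  ierr = KSPSolve(ksp,b,x);CHKERRQ(ierr);       /* triangular solves dispatch to the kernels above */
  ierr = KSPDestroy(&ksp);CHKERRQ(ierr);
  return 0;
}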