Actual source code: petsc-blockinvert.h
petsc-3.4.5 2014-06-29
1: /*
2: Kernels used in sparse ILU (and LU) and in the resulting triangular
3: solves. These are for block algorithms where the block sizes are on
4: the order of 2-6+.
6: There are TWO versions of the macros below.
7: 1) standard for MatScalar == PetscScalar use the standard BLAS for
8: block size larger than 7 and
9: 2) handcoded Fortran single precision for the matrices, since BLAS
10: does not have some arguments in single and some in double.
12: */
15: #include <petscblaslapack.h>
17: /*
18: These are C kernels,they are contained in
19: src/mat/impls/baij/seq
20: */
22: extern PetscErrorCode PetscLINPACKgefa(MatScalar*,PetscInt,PetscInt*);
23: extern PetscErrorCode PetscLINPACKgedi(MatScalar*,PetscInt,PetscInt*,MatScalar*);
24: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_2(MatScalar*,PetscReal);
25: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_3(MatScalar*,PetscReal);
27: #define PetscKernel_A_gets_inverse_A_4_nopivot(mat) 0; \
28: { \
29: MatScalar d, di; \
30: \
31: di = mat[0]; \
32: mat[0] = d = 1.0 / di; \
33: mat[4] *= -d; \
34: mat[8] *= -d; \
35: mat[12] *= -d; \
36: mat[1] *= d; \
37: mat[2] *= d; \
38: mat[3] *= d; \
39: mat[5] += mat[4] * mat[1] * di; \
40: mat[6] += mat[4] * mat[2] * di; \
41: mat[7] += mat[4] * mat[3] * di; \
42: mat[9] += mat[8] * mat[1] * di; \
43: mat[10] += mat[8] * mat[2] * di; \
44: mat[11] += mat[8] * mat[3] * di; \
45: mat[13] += mat[12] * mat[1] * di; \
46: mat[14] += mat[12] * mat[2] * di; \
47: mat[15] += mat[12] * mat[3] * di; \
48: di = mat[5]; \
49: mat[5] = d = 1.0 / di; \
50: mat[1] *= -d; \
51: mat[9] *= -d; \
52: mat[13] *= -d; \
53: mat[4] *= d; \
54: mat[6] *= d; \
55: mat[7] *= d; \
56: mat[0] += mat[1] * mat[4] * di; \
57: mat[2] += mat[1] * mat[6] * di; \
58: mat[3] += mat[1] * mat[7] * di; \
59: mat[8] += mat[9] * mat[4] * di; \
60: mat[10] += mat[9] * mat[6] * di; \
61: mat[11] += mat[9] * mat[7] * di; \
62: mat[12] += mat[13] * mat[4] * di; \
63: mat[14] += mat[13] * mat[6] * di; \
64: mat[15] += mat[13] * mat[7] * di; \
65: di = mat[10]; \
66: mat[10] = d = 1.0 / di; \
67: mat[2] *= -d; \
68: mat[6] *= -d; \
69: mat[14] *= -d; \
70: mat[8] *= d; \
71: mat[9] *= d; \
72: mat[11] *= d; \
73: mat[0] += mat[2] * mat[8] * di; \
74: mat[1] += mat[2] * mat[9] * di; \
75: mat[3] += mat[2] * mat[11] * di; \
76: mat[4] += mat[6] * mat[8] * di; \
77: mat[5] += mat[6] * mat[9] * di; \
78: mat[7] += mat[6] * mat[11] * di; \
79: mat[12] += mat[14] * mat[8] * di; \
80: mat[13] += mat[14] * mat[9] * di; \
81: mat[15] += mat[14] * mat[11] * di; \
82: di = mat[15]; \
83: mat[15] = d = 1.0 / di; \
84: mat[3] *= -d; \
85: mat[7] *= -d; \
86: mat[11] *= -d; \
87: mat[12] *= d; \
88: mat[13] *= d; \
89: mat[14] *= d; \
90: mat[0] += mat[3] * mat[12] * di; \
91: mat[1] += mat[3] * mat[13] * di; \
92: mat[2] += mat[3] * mat[14] * di; \
93: mat[4] += mat[7] * mat[12] * di; \
94: mat[5] += mat[7] * mat[13] * di; \
95: mat[6] += mat[7] * mat[14] * di; \
96: mat[8] += mat[11] * mat[12] * di; \
97: mat[9] += mat[11] * mat[13] * di; \
98: mat[10] += mat[11] * mat[14] * di; \
99: }
101: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4(MatScalar*,PetscReal);
102: # if defined(PETSC_HAVE_SSE)
103: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4_SSE(MatScalar*);
104: # endif
105: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_5(MatScalar*,PetscInt*,MatScalar*,PetscReal);
106: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_6(MatScalar*,PetscReal);
107: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_7(MatScalar*,PetscReal);
108: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_9(MatScalar*,PetscReal);
109: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_15(MatScalar*,PetscInt*,MatScalar*,PetscReal);
111: /*
112: A = inv(A) A_gets_inverse_A
114: A - square bs by bs array stored in column major order
115: pivots - integer work array of length bs
116: W - bs by 1 work array
117: */
118: #define PetscKernel_A_gets_inverse_A(bs,A,pivots,W) (PetscLINPACKgefa((A),(bs),(pivots)) || PetscLINPACKgedi((A),(bs),(pivots),(W)))
120: /* -----------------------------------------------------------------------*/
122: #if !defined(PETSC_USE_REAL_MAT_SINGLE)
123: /*
124: Version that calls the BLAS directly
125: */
126: /*
127: A = A * B A_gets_A_times_B
129: A, B - square bs by bs arrays stored in column major order
130: W - square bs by bs work array
132: */
133: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) \
134: { \
135: PetscBLASInt _bbs; \
136: PetscScalar _one = 1.0,_zero = 0.0; \
137: PetscErrorCode _ierr; \
138: _PetscBLASIntCast(bs,&_bbs); \
139: _PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \
140: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_one,(W),&(_bbs),(B),&(_bbs),&_zero,(A),&(_bbs))); \
141: }
143: /*
145: A = A - B * C A_gets_A_minus_B_times_C
147: A, B, C - square bs by bs arrays stored in column major order
148: */
149: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
150: { \
151: PetscScalar _mone = -1.0,_one = 1.0; \
152: PetscBLASInt _bbs; \
153: PetscErrorCode _ierr; \
154: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
155: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_mone,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
156: }
158: /*
160: A = A + B^T * C A_gets_A_plus_Btranspose_times_C
162: A, B, C - square bs by bs arrays stored in column major order
163: */
164: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) \
165: { \
166: PetscScalar _one = 1.0; \
167: PetscBLASInt _bbs; \
168: PetscErrorCode _ierr; \
169: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
170: PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&(_bbs),&(_bbs),&(_bbs),&_one,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
171: }
173: /*
174: v = v + A^T w v_gets_v_plus_Atranspose_times_w
176: v - array of length bs
177: A - square bs by bs array
178: w - array of length bs
179: */
180: #define PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) \
181: { \
182: PetscScalar _one = 1.0; \
183: PetscBLASInt _ione = 1, _bbs; \
184: PetscErrorCode _ierr; \
185: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
186: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
187: }
189: /*
190: v = v - A w v_gets_v_minus_A_times_w
192: v - array of length bs
193: A - square bs by bs array
194: w - array of length bs
195: */
196: #define PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
197: { \
198: PetscScalar _mone = -1.0,_one = 1.0; \
199: PetscBLASInt _ione = 1,_bbs; \
200: PetscErrorCode _ierr; \
201: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
202: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
203: }
205: /*
206: v = v - A w v_gets_v_minus_transA_times_w
208: v - array of length bs
209: A - square bs by bs array
210: w - array of length bs
211: */
212: #define PetscKernel_v_gets_v_minus_transA_times_w(bs,v,A,w) \
213: { \
214: PetscScalar _mone = -1.0,_one = 1.0; \
215: PetscBLASInt _ione = 1,_bbs; \
216: PetscErrorCode _ierr; \
217: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
218: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
219: }
221: /*
222: v = v + A w v_gets_v_plus_A_times_w
224: v - array of length bs
225: A - square bs by bs array
226: w - array of length bs
227: */
228: #define PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
229: { \
230: PetscScalar _one = 1.0; \
231: PetscBLASInt _ione = 1,_bbs; \
232: PetscErrorCode _ierr; \
233: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
234: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
235: }
237: /*
238: v = v + A w v_gets_v_plus_Ar_times_w
240: v - array of length bs
241: A - square bs by bs array
242: w - array of length bs
243: */
244: #define PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w) \
245: { \
246: PetscScalar _one = 1.0; \
247: PetscBLASInt _ione = 1,_bbs,_bncols; \
248: PetscErrorCode _ierr; \
249: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
250: _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
251: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_one,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
252: }
254: /*
255: v = v - A w v_gets_v_minus_Ar_times_w
257: v - array of length ncol
258: A(bs,ncols)
259: w - array of length bs
260: */
261: #define PetscKernel_w_gets_w_minus_Ar_times_v(bs,ncols,w,A,v) \
262: { \
263: PetscScalar _one = 1.0,_mone = -1.0; \
264: PetscBLASInt _ione = 1,_bbs,_bncols; \
265: PetscErrorCode _ierr; \
266: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
267: _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
268: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_mone,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
269: }
271: /*
272: w = A v w_gets_A_times_v
274: v - array of length bs
275: A - square bs by bs array
276: w - array of length bs
277: */
278: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) \
279: { \
280: PetscScalar _zero = 0.0,_one = 1.0; \
281: PetscBLASInt _ione = 1,_bbs; \
282: PetscErrorCode _ierr; \
283: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
284: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
285: }
287: /*
288: w = A' v w_gets_transA_times_v
290: v - array of length bs
291: A - square bs by bs array
292: w - array of length bs
293: */
294: #define PetscKernel_w_gets_transA_times_v(bs,v,A,w) \
295: { \
296: PetscScalar _zero = 0.0,_one = 1.0; \
297: PetscBLASInt _ione = 1,_bbs; \
298: PetscErrorCode _ierr; \
299: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
300: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
301: }
303: /*
304: z = A*x
305: */
306: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
307: { \
308: PetscScalar _one = 1.0,_zero = 0.0; \
309: PetscBLASInt _ione = 1,_bbs,_bncols; \
310: PetscErrorCode _ierr; \
311: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
312: _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
313: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&_bncols,&_one,A,&(_bbs),x,&_ione,&_zero,z,&_ione)); \
314: }
316: /*
317: z = trans(A)*x
318: */
319: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
320: { \
321: PetscScalar _one = 1.0; \
322: PetscBLASInt _ione = 1,_bbs,_bncols; \
323: PetscErrorCode _ierr; \
324: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
325: _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
326: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&_bbs,&_bncols,&_one,A,&_bbs,x,&_ione,&_one,z,&_ione)); \
327: }
329: #else
330: /*
331: Version that calls Fortran routines; can handle different precision
332: of matrix (array) and vectors
333: */
334: /*
335: These are Fortran kernels: They replace certain BLAS routines but
336: have some arguments that may be single precision,rather than double
337: These routines are provided in src/fortran/kernels/sgemv.F
338: They are pretty pitiful but get the job done. The intention is
339: that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
340: code is used.
341: */
343: #if defined(PETSC_HAVE_FORTRAN_CAPS)
344: #define msgemv_ MSGEMV
345: #define msgemvp_ MSGEMVP
346: #define msgemvm_ MSGEMVM
347: #define msgemvt_ MSGEMVT
348: #define msgemmi_ MSGEMMI
349: #define msgemm_ MSGEMM
350: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
351: #define msgemv_ msgemv
352: #define msgemvp_ msgemvp
353: #define msgemvm_ msgemvm
354: #define msgemvt_ msgemvt
355: #define msgemmi_ msgemmi
356: #define msgemm_ msgemm
357: #endif
359: PETSC_EXTERN void msgemv_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
360: PETSC_EXTERN void msgemvp_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
361: PETSC_EXTERN void msgemvm_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
362: PETSC_EXTERN void msgemvt_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
363: PETSC_EXTERN void msgemmi_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);
364: PETSC_EXTERN void msgemm_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);
366: /*
367: A = A * B A_gets_A_times_B
369: A, B - square bs by bs arrays stored in column major order
370: W - square bs by bs work array
372: */
373: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) \
374: { \
375: PetscErrorCode _PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \
376: msgemmi_(&bs,A,B,W); \
377: }
379: /*
381: A = A - B * C A_gets_A_minus_B_times_C
383: A, B, C - square bs by bs arrays stored in column major order
384: */
385: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
386: { \
387: msgemm_(&bs,A,B,C); \
388: }
390: /*
391: v = v - A w v_gets_v_minus_A_times_w
393: v - array of length bs
394: A - square bs by bs array
395: w - array of length bs
396: */
397: #define PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
398: { \
399: msgemvm_(&bs,&bs,A,w,v); \
400: }
402: /*
403: v = v + A w v_gets_v_plus_A_times_w
405: v - array of length bs
406: A - square bs by bs array
407: w - array of length bs
408: */
409: #define PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
410: { \
411: msgemvp_(&bs,&bs,A,w,v); \
412: }
414: /*
415: v = v + A w v_gets_v_plus_Ar_times_w
417: v - array of length bs
418: A - square bs by bs array
419: w - array of length bs
420: */
421: #define PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w) \
422: { \
423: msgemvp_(&bs,&ncol,A,v,w); \
424: }
426: /*
427: w = A v w_gets_A_times_v
429: v - array of length bs
430: A - square bs by bs array
431: w - array of length bs
432: */
433: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) \
434: { \
435: msgemv_(&bs,&bs,A,v,w); \
436: }
438: /*
439: z = A*x
440: */
441: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
442: { \
443: msgemv_(&bs,&ncols,A,x,z); \
444: }
446: /*
447: z = trans(A)*x
448: */
449: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
450: { \
451: msgemvt_(&bs,&ncols,A,x,z); \
452: }
454: /* These do not work yet */
455: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
456: #define PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)
459: #endif
461: #endif