Actual source code: petscaxpy.h


  2: /*
  3:     PetscKernelAXPY -  X = X + alpha * Y

  5:    Input Parameters:
  6: +    X, Y - arrays
  7: .    alpha - scalar
  8: -    n - length of arrays

 10:    Also PetscKernelAXPY2(), PetscKernelAXPY3(), PetscKernelAXPY4()

 12: */

 14: #ifndef PetscKernelAXPY

 16: #if defined(PETSC_USE_FORTRAN_KERNEL_MAXPY)
 17: #if defined(PETSC_HAVE_FORTRAN_CAPS)
 18: #define fortranmaxpy4_ FORTRANMAXPY4
 19: #define fortranmaxpy3_ FORTRANMAXPY3
 20: #define fortranmaxpy2_ FORTRANMAXPY2
 21: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
 22: #define fortranmaxpy4_ fortranmaxpy4
 23: #define fortranmaxpy3_ fortranmaxpy3
 24: #define fortranmaxpy2_ fortranmaxpy2
 25: #endif
 26: PETSC_EXTERN void fortranmaxpy4_(void*,void*,void*,void*,void*,const void*,const void*,const void*,const void*,PetscInt*);
 27: PETSC_EXTERN void fortranmaxpy3_(void*,void*,void*,void*,const void*,const void*,const void*,PetscInt*);
 28: PETSC_EXTERN void fortranmaxpy2_(void*,void*,void*,const void*,const void*,PetscInt*);
 29: #endif
 30: #include <petscblaslapack.h>

 32: #if defined(PETSC_USE_FORTRAN_KERNEL_MAXPY)
 33: #define PetscKernelAXPY(U,a1,p1,n)                   {PetscBLASInt one=1; PetscBLASInt nn = (PetscBLASInt) n;  PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&nn,&a1,p1,&one,U,&one));}
 34: #define PetscKernelAXPY2(U,a1,a2,p1,p2,n)            {fortranmaxpy2_(U,&a1,&a2,p1,p2,&n);}
 35: #define PetscKernelAXPY3(U,a1,a2,a3,p1,p2,p3,n)      {fortranmaxpy3_(U,&a1,&a2,&a3,p1,p2,p3,&n);}
 36: #define PetscKernelAXPY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){fortranmaxpy4_(U,&a1,&a2,&a3,&a4,p1,p2,p3,p4,&n);}

 38: #elif defined(PETSC_USE_UNROLL_KERNELS)

 40: #define PetscKernelAXPY(U,Alpha,P,n) {\
 41:   switch (n & 0x3) {\
 42:   case 3: *U++    += Alpha * *P++;\
 43:   case 2: *U++    += Alpha * *P++;\
 44:   case 1: *U++    += Alpha * *P++;\
 45:   n -= 4;case 0: break;}while (n>0) {U[0] += Alpha * P[0];U[1] += Alpha * P[1]; U[2] += Alpha * P[2]; U[3] += Alpha * P[3];U += 4; P += 4; n -= 4;}}
 46: #define PetscKernelAXPY2(U,a1,a2,p1,p2,n) {\
 47:   switch (n & 0x3) {\
 48:   case 3: *U++    += a1 * *p1++ + a2 * *p2++;\
 49:   case 2: *U++    += a1 * *p1++ + a2 * *p2++;\
 50:   case 1: *U++    += a1 * *p1++ + a2 * *p2++;\
 51:   n -= 4;case 0: break;}\
 52:   while (n>0) {U[0]+=a1*p1[0]+a2*p2[0];U[1]+=a1*p1[1]+a2*p2[1]; U[2]+=a1*p1[2]+a2*p2[2];U[3]+=a1*p1[3]+a2*p2[3];U+=4;p1+=4;p2+=4;n -= 4;}}
 53: #define PetscKernelAXPY3(U,a1,a2,a3,p1,p2,p3,n) {\
 54:   switch (n & 0x3) {\
 55:   case 3: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;\
 56:   case 2: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;\
 57:   case 1: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;\
 58:   n -= 4;case 0:break;}\
 59:   while (n>0) {U[0]+=a1*p1[0]+a2*p2[0]+a3*p3[0];U[1]+=a1*p1[1]+a2*p2[1]+a3*p3[1];U[2]+=a1*p1[2]+a2*p2[2]+a3*p3[2];U[3]+=a1*p1[3]+a2*p2[3]+a3*p3[3];U+=4;p1+=4;p2+=4;p3+=4;n-=4;}}
 60: #define PetscKernelAXPY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {\
 61:   switch (n & 0x3) {\
 62:   case 3: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;\
 63:   case 2: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;\
 64:   case 1: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;\
 65:   n -= 4;case 0:break;}\
 66:   while (n>0) {U[0]+=a1*p1[0]+a2*p2[0]+a3*p3[0]+a4*p4[0];U[1]+=a1*p1[1]+a2*p2[1]+a3*p3[1]+a4*p4[1];U[2]+=a1*p1[2]+a2*p2[2]+a3*p3[2]+a4*p4[2];U[3]+=a1*p1[3]+a2*p2[3]+a3*p3[3]+a4*p4[3];U+=4;p1+=4;p2+=4;p3+=4;p4+=4;n-=4;}}

 68: #elif defined(PETSC_USE_WHILE_KERNELS)

 70: #define PetscKernelAXPY(U,a1,p1,n)  {while (n--) *U++ += a1 * *p1++;}
 71: #define PetscKernelAXPY2(U,a1,a2,p1,p2,n)  {while (n--) *U++ += a1 * *p1++ + a2 * *p2++;}
 72: #define PetscKernelAXPY3(U,a1,a2,a3,p1,p2,p3,n) {while (n--) *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;}
 73: #define PetscKernelAXPY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {while (n--) *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;}

 75: #elif defined(PETSC_USE_BLAS_KERNELS)

 77: #define PetscKernelAXPY(U,a1,p1,n)  {PetscBLASInt one=1; PetscBLASInt nn = (PetscBLASInt) n; PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&nn,&a1,p1,&one,U,&one));}
 78: #define PetscKernelAXPY2(U,a1,a2,p1,p2,n){PetscKernelAXPY(U,a1,p1,n); PetscKernelAXPY(U,a2,p2,n);}
 79: #define PetscKernelAXPY3(U,a1,a2,a3,p1,p2,p3,n){PetscKernelAXPY2(U,a1,a2,p1,p2,n); PetscKernelAXPY(U,a3,p3,n);}
 80: #define PetscKernelAXPY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){PetscKernelAXPY2(U,a1,a2,p1,p2,n); PetscKernelAXPY2(U,a3,a4,p3,p4,n);}

 82: #elif defined(PETSC_USE_FOR_KERNELS)

 84: #define PetscKernelAXPY(U,a1,p1,n)  {PetscInt __i;PetscScalar __s1,__s2; \
 85:   for (__i=0;__i<n-1;__i+=2){\
 86:    __s1=a1*p1[__i];__s2=a1*p1[__i+1]; __s1+=U[__i];__s2+=U[__i+1];U[__i]=__s1;U[__i+1]=__s2;}\
 87:   if (n & 0x1) U[__i] += a1 * p1[__i];}
 88: #define PetscKernelAXPY2(U,a1,a2,p1,p2,n) {PetscInt __i; for (__i=0;__i<n;__i++) U[__i] += a1 * p1[__i] + a2 * p2[__i];}
 89: #define PetscKernelAXPY3(U,a1,a2,a3,p1,p2,p3,n){PetscInt __i; for (__i=0;__i<n;__i++)U[__i]+=a1*p1[__i]+a2*p2[__i]+a3*p3[__i];}
 90: #define PetscKernelAXPY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){PetscInt __i;for (__i=0;__i<n;__i++)U[__i]+=a1*p1[__i]+a2*p2[__i]+a3*p3[__i]+a4*p4[__i];}

 92: #else

 94: #define PetscKernelAXPY(U,a1,p1,n)  {PetscInt __i;PetscScalar _a1=a1; for (__i=0;__i<n;__i++)U[__i]+=_a1 * p1[__i];}
 95: #define PetscKernelAXPY2(U,a1,a2,p1,p2,n) {PetscInt __i; for (__i=0;__i<n;__i++)U[__i] += a1 * p1[__i] + a2 * p2[__i];}
 96: #define PetscKernelAXPY3(U,a1,a2,a3,p1,p2,p3,n){PetscInt __i; for (__i=0;__i<n;__i++)U[__i]+=a1*p1[__i]+a2*p2[__i]+a3*p3[__i];}
 97: #define PetscKernelAXPY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){PetscInt __i;for (__i=0;__i<n;__i++)U[__i]+=a1*p1[__i]+a2*p2[__i]+a3*p3[__i]+a4*p4[__i];}

 99: #endif

101: #endif