Actual source code: petscaijdevice.h

  1: #ifndef PETSCAIJDEVICE_H
  2: #define PETSCAIJDEVICE_H

  4: #include <petscmat.h>

  6: /* SUBMANSEC = Mat */

  8: #define CSRDataStructure(datatype) \
  9:   PetscInt *i; \
 10:   PetscInt *j; \
 11:   datatype *a; \
 12:   PetscInt  n; \
 13:   PetscInt  ignorezeroentries;

 15: typedef struct {
 16:   CSRDataStructure(PetscScalar)
 17: } PetscCSRDataStructure;

 19: struct _n_SplitCSRMat {
 20:   PetscInt              cstart, cend, rstart, rend;
 21:   PetscCSRDataStructure diag, offdiag;
 22:   PetscInt             *colmap;
 23:   PetscInt              M; // number of columns for out of bounds check
 24:   PetscMPIInt           rank;
 25:   PetscBool             allocated_indices;
 26: };

 28: /* 64-bit floating-point version of atomicAdd() is only natively supported by
 29:    CUDA devices of compute capability 6.x and higher. See also sfcuda.cu
 30: */
 31: #if defined(PETSC_USE_REAL_DOUBLE) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
 32: __device__ double atomicAdd(double *x, double y)
 33: {
 34:   typedef unsigned long long int ullint;
 35:   double                        *address = x, val = y;
 36:   ullint                        *address_as_ull = (ullint *)address;
 37:   ullint                         old            = *address_as_ull, assumed;
 38:   do {
 39:     assumed = old;
 40:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
 41:   } while (assumed != old);
 42:   return __longlong_as_double(old);
 43: }
 44: #endif

 46: #if defined(KOKKOS_INLINE_FUNCTION)
 47:   #define PetscAtomicAdd(a, b) Kokkos::atomic_fetch_add(a, b)
 48: #elif defined(__CUDA_ARCH__)
 49:   #if defined(PETSC_USE_COMPLEX)
 50:     #define PetscAtomicAdd(a, b) \
 51:       { \
 52:         PetscReal *_a = (PetscReal *)(a); \
 53:         PetscReal *_b = (PetscReal *)&(b); \
 54:         atomicAdd(&_a[0], _b[0]); \
 55:         atomicAdd(&_a[1], _b[1]); \
 56:       }
 57:   #else
 58:     #define PetscAtomicAdd(a, b) atomicAdd(a, b)
 59:   #endif
 60: #else
 61:   /* TODO: support devices other than CUDA and Kokkos */
 62:   #define PetscAtomicAdd(a, b) *(a) += b
 63: #endif

 65: #define MatSetValues_SeqAIJ_A_Private(row, col, value, addv) \
 66:   { \
 67:     inserted = 0; \
 68:     if (col <= lastcol1) low1 = 0; \
 69:     else high1 = nrow1; \
 70:     lastcol1 = col; \
 71:     while (high1 - low1 > 5) { \
 72:       t = (low1 + high1) / 2; \
 73:       if (rp1[t] > col) high1 = t; \
 74:       else low1 = t; \
 75:     } \
 76:     for (_i = low1; _i < high1; _i++) { \
 77:       if (rp1[_i] > col) break; \
 78:       if (rp1[_i] == col) { \
 79:         if (addv == ADD_VALUES) { \
 80:           PetscAtomicAdd(&ap1[_i], value); \
 81:         } else ap1[_i] = value; \
 82:         inserted = 1; \
 83:         break; \
 84:       } \
 85:     } \
 86:   }

 88: #define MatSetValues_SeqAIJ_B_Private(row, col, value, addv) \
 89:   { \
 90:     inserted = 0; \
 91:     if (col <= lastcol2) low2 = 0; \
 92:     else high2 = nrow2; \
 93:     lastcol2 = col; \
 94:     while (high2 - low2 > 5) { \
 95:       t = (low2 + high2) / 2; \
 96:       if (rp2[t] > col) high2 = t; \
 97:       else low2 = t; \
 98:     } \
 99:     for (_i = low2; _i < high2; _i++) { \
100:       if (rp2[_i] > col) break; \
101:       if (rp2[_i] == col) { \
102:         if (addv == ADD_VALUES) { \
103:           PetscAtomicAdd(&ap2[_i], value); \
104:         } else ap2[_i] = value; \
105:         inserted = 1; \
106:         break; \
107:       } \
108:     } \
109:   }

111: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
112:   #define SETERR return (printf("[%d] ERROR in %s() at %s:%d: Location (%ld,%ld) not found! v=%g\n", d_mat->rank, __func__, __FILE__, __LINE__, (long int)im[i], (long int)in[j], PetscRealPart(value)), PETSC_ERR_ARG_OUTOFRANGE)
113: #else
114:   #define SETERR return PETSC_ERR_ARG_OUTOFRANGE
115: #endif

117: #if defined(__CUDA_ARCH__)
118: __device__
119: #elif defined(KOKKOS_INLINE_FUNCTION)
120: KOKKOS_INLINE_FUNCTION
121: #else
122: static
123: #endif

125:   /*@C
126:        MatSetValuesDevice - sets a set of values into a matrix, this may be called by CUDA or KOKKOS kernels

128:     Input Parameters:
129: +   d_mat - an object obtained with `MatCUSPARSEGetDeviceMatWrite()` or `MatKokkosGetDeviceMatWrite()`
130: .   m - the number of rows to insert or add to
131: .   im - the rows to insert or add to
132: .   n - number of columns to insert or add to
133: .   in - the columns to insert or add to
134: .   v - the values to insert or add to the matrix (treated as a  by n row oriented dense array
135: -   is - either `INSERT_VALUES` or `ADD_VALUES`

137:     Notes:
138:     Any row or column indices that are outside the bounds of the matrix on the rank are discarded

140:     It is recommended that `MatSetValuesCOO()` be used instead of this routine for efficiency

142:    Level: advanced

144: .seealso: `MatSetValues()`, `MatCreate()`, `MatCreateDenseCUDA()`, `MatCreateAIJCUSPARSE()`, `MatKokkosGetDeviceMatWrite()`,
145:           `MatCUSPARSEGetDeviceMatWrite()`, `MatSetValuesCOO()`
146: @*/
147:   PetscErrorCode
148:   MatSetValuesDevice(PetscSplitCSRDataStructure d_mat, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
149: {
150:   MatScalar       value;
151:   const PetscInt *rp1, *rp2 = NULL, *ai = d_mat->diag.i, *aj = d_mat->diag.j;
152:   const PetscInt *bi = d_mat->offdiag.i, *bj = d_mat->offdiag.j;
153:   MatScalar      *ba = d_mat->offdiag.a, *aa = d_mat->diag.a;
154:   PetscInt        nrow1, nrow2 = 0, _i, low1, high1, low2 = 0, high2 = 0, t, lastcol1, lastcol2 = 0, inserted;
155:   MatScalar      *ap1, *ap2 = NULL;
156:   PetscBool       roworiented = PETSC_TRUE;
157:   PetscInt        i, j, row, col;
158:   const PetscInt  rstart = d_mat->rstart, rend = d_mat->rend, cstart = d_mat->cstart, cend = d_mat->cend, M = d_mat->M;

160:   for (i = 0; i < m; i++) {
161:     if (im[i] >= rstart && im[i] < rend) { // silently ignore off processor rows
162:       row      = (int)(im[i] - rstart);
163:       lastcol1 = -1;
164:       rp1      = aj + ai[row];
165:       ap1      = aa + ai[row];
166:       nrow1    = ai[row + 1] - ai[row];
167:       low1     = 0;
168:       high1    = nrow1;
169:       if (bj) {
170:         lastcol2 = -1;
171:         rp2      = bj + bi[row];
172:         ap2      = ba + bi[row];
173:         nrow2    = bi[row + 1] - bi[row];
174:         low2     = 0;
175:         high2    = nrow2;
176:       } else {
177:         high2 = low2 = 0;
178:       }
179:       for (j = 0; j < n; j++) {
180:         value = roworiented ? v[i * n + j] : v[i + j * m];
181:         if (in[j] >= cstart && in[j] < cend) {
182:           col = (in[j] - cstart);
183:           MatSetValues_SeqAIJ_A_Private(row, col, value, is);
184:           if (!inserted) SETERR;
185:         } else if (in[j] < 0) { // silently ignore off processor rows
186:           continue;
187:         } else if (in[j] >= M) SETERR;
188:         else {
189:           col = d_mat->colmap[in[j]] - 1;
190:           if (col < 0) SETERR;
191:           MatSetValues_SeqAIJ_B_Private(row, col, value, is);
192:           if (!inserted) SETERR;
193:         }
194:       }
195:     }
196:   }
197:   return 0;
198: }

200: #undef MatSetValues_SeqAIJ_A_Private
201: #undef MatSetValues_SeqAIJ_B_Private
202: #undef SETERR
203: #undef PetscAtomicAdd
204: #undef PetscCSRDataStructure_

206: #endif