Actual source code: ex52_integrateElement.cu
petsc-3.4.5 2014-06-29
1: #include <petscsys.h>
3: #include "ex52_gpu.h"
5: __device__ vecType f1_laplacian(realType u[], vecType gradU[], int comp)
6: {
7: return gradU[comp];
8: }
10: #if (SPATIAL_DIM_0 == 2)
12: __device__ vecType f1_elasticity(realType u[], vecType gradU[], int comp)
13: {
14: vecType f1;
16: switch (comp) {
17: case 0:
18: f1.x = 0.5*(gradU[0].x + gradU[0].x);
19: f1.y = 0.5*(gradU[0].y + gradU[1].x);
20: break;
21: case 1:
22: f1.x = 0.5*(gradU[1].x + gradU[0].y);
23: f1.y = 0.5*(gradU[1].y + gradU[1].y);
24: }
25: return f1;
26: }
28: #elif (SPATIAL_DIM_0 == 3)
30: __device__ vecType f1_elasticity(realType u[], vecType gradU[], int comp)
31: {
32: vecType f1;
34: switch (comp) {
35: case 0:
36: f1.x = 0.5*(gradU[0].x + gradU[0].x);
37: f1.y = 0.5*(gradU[0].y + gradU[1].x);
38: f1.z = 0.5*(gradU[0].z + gradU[2].x);
39: break;
40: case 1:
41: f1.x = 0.5*(gradU[1].x + gradU[0].y);
42: f1.y = 0.5*(gradU[1].y + gradU[1].y);
43: f1.z = 0.5*(gradU[1].z + gradU[2].y);
44: break;
45: case 2:
46: f1.x = 0.5*(gradU[2].x + gradU[0].z);
47: f1.y = 0.5*(gradU[2].y + gradU[1].z);
48: f1.z = 0.5*(gradU[2].z + gradU[2].z);
49: }
50: return f1;
51: }
53: #else
55: #error "Invalid spatial dimension"
57: #endif
59: // dim Number of spatial dimensions: 2
60: // N_b Number of basis functions: generated
61: // N_{bt} Number of total basis functions: N_b * N_{comp}
62: // N_q Number of quadrature points: generated
63: // N_{bs} Number of block cells LCM(N_b, N_q)
64: // N_{bst} Number of block cell components LCM(N_{bt}, N_q)
65: // N_{bl} Number of concurrent blocks generated
66: // N_t Number of threads: N_{bl} * N_{bs}
67: // N_{cbc} Number of concurrent basis cells: N_{bl} * N_q
68: // N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b
69: // N_{sbc} Number of serial basis cells: N_{bs} / N_q
70: // N_{sqc} Number of serial quadrature cells: N_{bs} / N_b
71: // N_{cb} Number of serial cell batches: input
72: // N_c Number of total cells: N_{cb}*N_{t}/N_{comp}
74: __global__ void integrateElementQuadrature(int N_cb, realType *coefficients, realType *jacobianInverses, realType *jacobianDeterminants, realType *elemVec)
75: {
76: #include "ex52_gpu_inline.h"
77: const int dim = SPATIAL_DIM_0;
78: const int N_b = numBasisFunctions_0; // The number of basis functions
79: const int N_comp = numBasisComponents_0; // The number of basis function components
80: const int N_bt = N_b*N_comp; // The total number of scalar basis functions
81: const int N_q = numQuadraturePoints_0; // The number of quadrature points
82: const int N_bst = N_bt*N_q; // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously
83: const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl
84: const int N_bc = N_t/N_comp; // The number of cells per batch (N_b*N_q*N_bl)
85: const int N_c = N_cb * N_bc;
86: const int N_sbc = N_bst / (N_q * N_comp);
87: const int N_sqc = N_bst / N_bt;
89: /* Calculated indices */
90: const int tidx = threadIdx.x + blockDim.x*threadIdx.y;
91: const int blidx = tidx / N_bst; // Block number for this thread
92: const int bidx = tidx % N_bt; // Basis function mapped to this thread
93: const int cidx = tidx % N_comp; // Basis component mapped to this thread
94: const int qidx = tidx % N_q; // Quadrature point mapped to this thread
95: const int blbidx = tidx % N_q + blidx*N_q; // Cell mapped to this thread in the basis phase
96: const int blqidx = tidx % N_b + blidx*N_b; // Cell mapped to this thread in the quadrature phase
97: const int gidx = blockIdx.y*gridDim.x + blockIdx.x;
98: const int Goffset = gidx*N_c;
99: const int Coffset = gidx*N_c*N_bt;
100: const int Eoffset = gidx*N_c*N_bt;
102: /* Quadrature data */
103: realType w; // $w_q$, Quadrature weight at $x_q$
104: //__shared__ realType phi_i[N_bt*N_q]; // $\phi_i(x_q)$, Value of the basis function $i$ at $x_q$
105: __shared__ vecType phiDer_i[N_bt*N_q]; // $\frac{\partial\phi_i(x_q)}{\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$
106: /* Geometric data */
107: __shared__ realType detJ[N_t]; // $|J(x_q)|$, Jacobian determinant at $x_q$
108: __shared__ realType invJ[N_t*dim*dim]; // $J^{-1}(x_q)$, Jacobian inverse at $x_q$
109: /* FEM data */
110: __shared__ realType u_i[N_t*N_bt]; // Coefficients $u_i$ of the field $u|_{\mathcal{T}} = \sum_i u_i \phi_i$
111: /* Intermediate calculations */
112: //__shared__ realType f_0[N_t*N_sqc]; // $f_0(u(x_q), \nabla u(x_q)) |J(x_q)| w_q$
113: __shared__ vecType f_1[N_t*N_sqc]; // $f_1(u(x_q), \nabla u(x_q)) |J(x_q)| w_q$
114: /* Output data */
115: realType e_i; // Coefficient $e_i$ of the residual
117: /* These should be generated inline */
118: /* Load quadrature weights */
119: w = weights_0[qidx];
120: /* Load basis tabulation \phi_i for this cell */
121: if (tidx < N_bt*N_q) {
122: // phi_i[tidx] = Basis_0[tidx];
123: phiDer_i[tidx] = BasisDerivatives_0[tidx];
124: }
126: for (int batch = 0; batch < N_cb; ++batch) {
127: /* Load geometry */
128: detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];
129: for (int n = 0; n < dim*dim; ++n) {
130: const int offset = n*N_t;
131: invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];
132: }
133: /* Load coefficients u_i for this cell */
134: for (int n = 0; n < N_bt; ++n) {
135: const int offset = n*N_t;
136: u_i[offset+tidx] = coefficients[Coffset+batch*N_t*N_b+offset+tidx];
137: }
139: /* Map coefficients to values at quadrature points */
140: for (int c = 0; c < N_sqc; ++c) {
141: realType u[N_comp]; // $u(x_q)$, Value of the field at $x_q$
142: vecType gradU[N_comp]; // $\nabla u(x_q)$, Value of the field gradient at $x_q$
143: // vecType x = {0.0, 0.0}; // Quadrature point $x_q$
144: const int cell = c*N_bl*N_b + blqidx;
145: const int fidx = (cell*N_q + qidx)*N_comp + cidx;
147: for (int comp = 0; comp < N_comp; ++comp) {
148: //u[comp] = 0.0;
149: #if SPATIAL_DIM_0 == 2
150: gradU[comp].x = 0.0; gradU[comp].y = 0.0;
151: #elif SPATIAL_DIM_0 == 3
152: gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;
153: #endif
154: }
155: /* Get field and derivatives at this quadrature point */
156: for (int i = 0; i < N_b; ++i) {
157: for (int comp = 0; comp < N_comp; ++comp) {
158: const int b = i*N_comp+comp;
159: const int pidx = qidx*N_bt + b;
160: const int uidx = cell*N_bt + b;
161: vecType realSpaceDer;
163: // u[comp] += u_i[uidx]*phi_i[qidx*N_bt+bbidx];
164: #if SPATIAL_DIM_0 == 2
165: realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;
166: gradU[comp].x += u_i[uidx]*realSpaceDer.x;
167: realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;
168: gradU[comp].y += u_i[uidx]*realSpaceDer.y;
169: #elif SPATIAL_DIM_0 == 3
170: realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;
171: gradU[comp].x += u_i[uidx]*realSpaceDer.x;
172: realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;
173: gradU[comp].y += u_i[uidx]*realSpaceDer.y;
174: realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;
175: gradU[comp].z += u_i[uidx]*realSpaceDer.z;
176: #endif
177: }
178: }
179: /* Process values at quadrature points */
180: f_1[fidx] = f1_func(u, gradU, cidx);
181: #if SPATIAL_DIM_0 == 2
182: f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;
183: #elif SPATIAL_DIM_0 == 3
184: f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;
185: #endif
186: }
188: /* ==== TRANSPOSE THREADS ==== */
189: __syncthreads();
191: /* Map values at quadrature points to coefficients */
192: for (int c = 0; c < N_sbc; ++c) {
193: const int cell = c*N_bl*N_q + blbidx;
195: e_i = 0.0;
196: for (int q = 0; q < N_q; ++q) {
197: const int pidx = q*N_bt + bidx;
198: const int fidx = (cell*N_q + q)*N_comp + cidx;
199: vecType realSpaceDer;
201: // e_i += phi_i[pidx]*f_0[fidx];
202: #if SPATIAL_DIM_0 == 2
203: realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;
204: e_i += realSpaceDer.x*f_1[fidx].x;
205: realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;
206: e_i += realSpaceDer.y*f_1[fidx].y;
207: #elif SPATIAL_DIM_0 == 3
208: realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;
209: e_i += realSpaceDer.x*f_1[fidx].x;
210: realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;
211: e_i += realSpaceDer.y*f_1[fidx].y;
212: realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;
213: e_i += realSpaceDer.z*f_1[fidx].z;
214: #endif
215: }
216: #if 0
217: // Check f_1
218: {
219: const int q = 0;
220: const int i = bidx/N_comp;
221: // Prints f1[0].x, f1[1].x, f1[0].y, f1[1].y
222: switch (i) {
223: case 0:
224: e_i = f_1[(cell*N_q+q)*N_comp+cidx].x;break;
225: case 1:
226: e_i = f_1[(cell*N_q+q)*N_comp+cidx].y;break;
227: //case 2:
228: //e_i = f_1[(cell*N_q+q)*N_comp+cidx].z;break;
229: default:
230: e_i = 0.0;
231: }
232: }
233: // Check that u_i is being used correctly
234: //e_i = u_i[cell*N_bt+bidx];
235: e_i = detJ[cell];
236: //e_i = coefficients[Coffset+(batch*N_sbc+c)*N_t+tidx];
237: //e_i = Coffset+(batch*N_sbc+c)*N_t+tidx;
238: //e_i = cell*N_bt+bidx;
239: #endif
240: /* Write element vector for N_{cbc} cells at a time */
241: elemVec[Eoffset+(batch*N_sbc+c)*N_t+tidx] = e_i;
242: }
243: /* ==== Could do one write per batch ==== */
244: }
245: return;
246: }
248: __global__ void integrateLaplacianJacobianQuadrature()
249: {
250: /* Map coefficients to values at quadrature points */
251: /* Process values at quadrature points */
252: /* Map values at quadrature points to coefficients */
253: return;
254: }
256: // Calculate a conforming thread grid for N kernels
259: PetscErrorCode calculateGrid(const int N, const int blockSize, unsigned int& x, unsigned int& y, unsigned int& z)
260: {
262: z = 1;
263: if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
264: const int Nblocks = N/blockSize;
265: for (x = (int) (sqrt(Nblocks) + 0.5); x > 0; --x) {
266: y = Nblocks/x;
267: if (x*y == Nblocks) break;
268: }
269: if (x*y != Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %d with block size %d", N, blockSize);
270: return(0);
271: }
275: /*
276: IntegrateElementBatchGPU - Produces element vectors from input element solution and geometric information via quadrature
278: Input Parameters:
279: + Ne - The total number of cells, Nchunk * Ncb * Nbc
280: . Ncb - The number of serial cell batches
281: . Nbc - The number of cells per batch
282: . Nbl - The number of concurrent cells blocks per thread block
283: . coefficients - An array of the solution vector for each cell
284: . jacobianInverses - An array of the inverse Jacobian for each cell
285: . jacobianDeterminants - An array of the Jacobian determinant for each cell
286: . event - A PetscEvent, used to log flops
287: - debug - A flag for debugging information
289: Output Parameter:
290: . elemVec - An array of the element vectors for each cell
291: */
292: PETSC_EXTERN PetscErrorCode IntegrateElementBatchGPU(PetscInt Ne, PetscInt Ncb, PetscInt Nbc, PetscInt Nbl, const PetscScalar coefficients[],
293: const PetscReal jacobianInverses[], const PetscReal jacobianDeterminants[], PetscScalar elemVec[],
294: PetscLogEvent event, PetscInt debug)
295: {
296: #include "ex52_gpu_inline.h"
297: const int dim = SPATIAL_DIM_0;
298: const int N_b = numBasisFunctions_0; // The number of basis functions
299: const int N_comp = numBasisComponents_0; // The number of basis function components
300: const int N_bt = N_b*N_comp; // The total number of scalar basis functions
301: const int N_q = numQuadraturePoints_0; // The number of quadrature points
302: const int N_bst = N_bt*N_q; // The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously
303: const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl
305: realType *d_coefficients;
306: realType *d_jacobianInverses;
307: realType *d_jacobianDeterminants;
308: realType *d_elemVec;
312: if (Nbl != N_bl) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inconsisten block size %d should be %d", Nbl, N_bl);
313: if (Nbc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, Nbc, N_comp);
314: if (!Ne) {
315: PetscStageLog stageLog;
316: PetscEventPerfLog eventLog = NULL;
317: PetscInt stage;
319: PetscLogGetStageLog(&stageLog);
320: PetscStageLogGetCurrent(stageLog, &stage);
321: PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
322: /* Log performance info */
323: eventLog->eventInfo[event].count++;
324: eventLog->eventInfo[event].time += 0.0;
325: eventLog->eventInfo[event].flops += 0;
326: return(0);
327: }
328: // Marshalling
329: cudaMalloc((void**) &d_coefficients, Ne*N_bt * sizeof(realType));
330: cudaMalloc((void**) &d_jacobianInverses, Ne*dim*dim * sizeof(realType));
331: cudaMalloc((void**) &d_jacobianDeterminants, Ne * sizeof(realType));
332: cudaMalloc((void**) &d_elemVec, Ne*N_bt * sizeof(realType));
333: if (sizeof(PetscReal) == sizeof(realType)) {
334: cudaMemcpy(d_coefficients, coefficients, Ne*N_bt * sizeof(realType), cudaMemcpyHostToDevice);
335: cudaMemcpy(d_jacobianInverses, jacobianInverses, Ne*dim*dim * sizeof(realType), cudaMemcpyHostToDevice);
336: cudaMemcpy(d_jacobianDeterminants, jacobianDeterminants, Ne * sizeof(realType), cudaMemcpyHostToDevice);
337: } else {
338: realType *c, *jI, *jD;
339: PetscInt i;
341: PetscMalloc3(Ne*N_bt,realType,&c,Ne*dim*dim,realType,&jI,Ne,realType,&jD);
342: for (i = 0; i < Ne*N_bt; ++i) c[i] = coefficients[i];
343: for (i = 0; i < Ne*dim*dim; ++i) jI[i] = jacobianInverses[i];
344: for (i = 0; i < Ne; ++i) jD[i] = jacobianDeterminants[i];
345: cudaMemcpy(d_coefficients, c, Ne*N_bt * sizeof(realType), cudaMemcpyHostToDevice);
346: cudaMemcpy(d_jacobianInverses, jI, Ne*dim*dim * sizeof(realType), cudaMemcpyHostToDevice);
347: cudaMemcpy(d_jacobianDeterminants, jD, Ne * sizeof(realType), cudaMemcpyHostToDevice);
348: PetscFree3(c,jI,jD);
349: }
350: // Kernel launch
351: unsigned int x, y, z;
352: calculateGrid(Ne, Ncb*Nbc, x, y, z);
353: dim3 grid(x, y, z);
354: dim3 block(Nbc*N_comp, 1, 1);
355: cudaEvent_t start, stop;
356: float msElapsedTime;
358: cudaEventCreate(&start);
359: cudaEventCreate(&stop);
360: // if (debug) {
361: PetscPrintf(PETSC_COMM_SELF, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n",
362: grid.x, grid.y, grid.z, block.x, block.y, block.z, Ncb);
363: PetscPrintf(PETSC_COMM_SELF, " N_t: %d, N_cb: %d\n", N_t, Ncb);
364: // }
365: cudaEventRecord(start, 0);
366: integrateElementQuadrature<<<grid, block>>>(Ncb, d_coefficients, d_jacobianInverses, d_jacobianDeterminants, d_elemVec);
367: cudaEventRecord(stop, 0);
368: cudaEventSynchronize(stop);
369: cudaEventElapsedTime(&msElapsedTime, start, stop);
370: cudaEventDestroy(start);
371: cudaEventDestroy(stop);
372: // Marshalling
373: if (sizeof(PetscReal) == sizeof(realType)) {
374: cudaMemcpy(elemVec, d_elemVec, Ne*N_bt * sizeof(realType), cudaMemcpyDeviceToHost);
375: } else {
376: realType *eV;
377: PetscInt i;
379: PetscMalloc(Ne*N_bt * sizeof(realType), &eV);
380: cudaMemcpy(eV, d_elemVec, Ne*N_bt * sizeof(realType), cudaMemcpyDeviceToHost);
381: for (i = 0; i < Ne*N_bt; ++i) elemVec[i] = eV[i];
382: PetscFree(eV);
383: }
384: cudaFree(d_coefficients);
385: cudaFree(d_jacobianInverses);
386: cudaFree(d_jacobianDeterminants);
387: cudaFree(d_elemVec);
388: {
389: PetscStageLog stageLog;
390: PetscEventPerfLog eventLog = NULL;
391: PetscInt stage;
393: PetscLogGetStageLog(&stageLog);
394: PetscStageLogGetCurrent(stageLog, &stage);
395: PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
396: /* Log performance info */
397: eventLog->eventInfo[event].count++;
398: eventLog->eventInfo[event].time += msElapsedTime*1.0e-3;
399: eventLog->eventInfo[event].flops += (((2+(2+2*dim)*dim)*N_comp*N_b+(2+2)*dim*N_comp)*N_q + (2+2*dim)*dim*N_q*N_comp*N_b)*Ne;
400: }
401: return(0);
402: }