Actual source code: feopencl.c
petsc-3.13.6 2020-09-29
1: #include <petsc/private/petscfeimpl.h>
3: #if defined(PETSC_HAVE_OPENCL)
5: static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem)
6: {
7: PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
8: PetscErrorCode ierr;
11: clReleaseCommandQueue(ocl->queue_id);
12: ocl->queue_id = 0;
13: clReleaseContext(ocl->ctx_id);
14: ocl->ctx_id = 0;
15: PetscFree(ocl);
16: return(0);
17: }
19: #define STRING_ERROR_CHECK(MSG) do { string_tail += count; if (string_tail == end_of_buffer) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, MSG);} while(0)
20: enum {LAPLACIAN = 0, ELASTICITY = 1};
22: /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */
23: /* dim Number of spatial dimensions: 2 */
24: /* N_b Number of basis functions: generated */
25: /* N_{bt} Number of total basis functions: N_b * N_{comp} */
26: /* N_q Number of quadrature points: generated */
27: /* N_{bs} Number of block cells LCM(N_b, N_q) */
28: /* N_{bst} Number of block cell components LCM(N_{bt}, N_q) */
29: /* N_{bl} Number of concurrent blocks generated */
30: /* N_t Number of threads: N_{bl} * N_{bs} */
31: /* N_{cbc} Number of concurrent basis cells: N_{bl} * N_q */
32: /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b */
33: /* N_{sbc} Number of serial basis cells: N_{bs} / N_q */
34: /* N_{sqc} Number of serial quadrature cells: N_{bs} / N_b */
35: /* N_{cb} Number of serial cell batches: input */
36: /* N_c Number of total cells: N_{cb}*N_{t}/N_{comp} */
37: static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl)
38: {
39: PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
40: PetscQuadrature q;
41: char *string_tail = *string_buffer;
42: char *end_of_buffer = *string_buffer + buffer_length;
43: char float_str[] = "float", double_str[] = "double";
44: char *numeric_str = &(float_str[0]);
45: PetscInt op = ocl->op;
46: PetscBool useField = PETSC_FALSE;
47: PetscBool useFieldDer = PETSC_TRUE;
48: PetscBool useFieldAux = useAux;
49: PetscBool useFieldDerAux= PETSC_FALSE;
50: PetscBool useF0 = PETSC_TRUE;
51: PetscBool useF1 = PETSC_TRUE;
52: const PetscReal *points, *weights;
53: PetscTabulation T;
54: PetscInt dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c;
55: size_t count;
56: PetscErrorCode ierr;
59: PetscFEGetSpatialDimension(fem, &dim);
60: PetscFEGetDimension(fem, &N_b);
61: PetscFEGetNumComponents(fem, &N_c);
62: PetscFEGetQuadrature(fem, &q);
63: PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);
64: if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc);
65: N_t = N_b * N_c * N_q * N_bl;
66: /* Enable device extension for double precision */
67: if (ocl->realType == PETSC_DOUBLE) {
68: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
69: "#if defined(cl_khr_fp64)\n"
70: "# pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
71: "#elif defined(cl_amd_fp64)\n"
72: "# pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
73: "#endif\n",
74: &count);STRING_ERROR_CHECK("Message to short");
75: numeric_str = &(double_str[0]);
76: }
77: /* Kernel API */
78: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
79: "\n"
80: "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n"
81: "{\n",
82: &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str);STRING_ERROR_CHECK("Message to short");
83: /* Quadrature */
84: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
85: " /* Quadrature points\n"
86: " - (x1,y1,x2,y2,...) */\n"
87: " const %s points[%d] = {\n",
88: &count, numeric_str, N_q*dim);STRING_ERROR_CHECK("Message to short");
89: for (p = 0; p < N_q; ++p) {
90: for (d = 0; d < dim; ++d) {
91: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p*dim+d]);STRING_ERROR_CHECK("Message to short");
92: }
93: }
94: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short");
95: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
96: " /* Quadrature weights\n"
97: " - (v1,v2,...) */\n"
98: " const %s weights[%d] = {\n",
99: &count, numeric_str, N_q);STRING_ERROR_CHECK("Message to short");
100: for (p = 0; p < N_q; ++p) {
101: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]);STRING_ERROR_CHECK("Message to short");
102: }
103: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short");
104: /* Basis Functions */
105: PetscFEGetCellTabulation(fem, &T);
106: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
107: " /* Nodal basis function evaluations\n"
108: " - basis component is fastest varying, the basis function, then point */\n"
109: " const %s Basis[%d] = {\n",
110: &count, numeric_str, N_q*N_b*N_c);STRING_ERROR_CHECK("Message to short");
111: for (p = 0; p < N_q; ++p) {
112: for (b = 0; b < N_b; ++b) {
113: for (c = 0; c < N_c; ++c) {
114: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p*N_b + b)*N_c + c]);STRING_ERROR_CHECK("Message to short");
115: }
116: }
117: }
118: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short");
119: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
120: "\n"
121: " /* Nodal basis function derivative evaluations,\n"
122: " - derivative direction is fastest varying, then basis component, then basis function, then point */\n"
123: " const %s%d BasisDerivatives[%d] = {\n",
124: &count, numeric_str, dim, N_q*N_b*N_c);STRING_ERROR_CHECK("Message to short");
125: for (p = 0; p < N_q; ++p) {
126: for (b = 0; b < N_b; ++b) {
127: for (c = 0; c < N_c; ++c) {
128: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim);STRING_ERROR_CHECK("Message to short");
129: for (d = 0; d < dim; ++d) {
130: if (d > 0) {
131: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);STRING_ERROR_CHECK("Message to short");
132: } else {
133: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);STRING_ERROR_CHECK("Message to short");
134: }
135: }
136: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count);STRING_ERROR_CHECK("Message to short");
137: }
138: }
139: }
140: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short");
141: /* Sizes */
142: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
143: " const int dim = %d; // The spatial dimension\n"
144: " const int N_bl = %d; // The number of concurrent blocks\n"
145: " const int N_b = %d; // The number of basis functions\n"
146: " const int N_comp = %d; // The number of basis function components\n"
147: " const int N_bt = N_b*N_comp; // The total number of scalar basis functions\n"
148: " const int N_q = %d; // The number of quadrature points\n"
149: " const int N_bst = N_bt*N_q; // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n"
150: " const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl\n"
151: " const int N_bc = N_t/N_comp; // The number of cells per batch (N_b*N_q*N_bl)\n"
152: " const int N_sbc = N_bst / (N_q * N_comp);\n"
153: " const int N_sqc = N_bst / N_bt;\n"
154: " /*const int N_c = N_cb * N_bc;*/\n"
155: "\n"
156: " /* Calculated indices */\n"
157: " /*const int tidx = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n"
158: " const int tidx = get_local_id(0);\n"
159: " const int blidx = tidx / N_bst; // Block number for this thread\n"
160: " const int bidx = tidx %% N_bt; // Basis function mapped to this thread\n"
161: " const int cidx = tidx %% N_comp; // Basis component mapped to this thread\n"
162: " const int qidx = tidx %% N_q; // Quadrature point mapped to this thread\n"
163: " const int blbidx = tidx %% N_q + blidx*N_q; // Cell mapped to this thread in the basis phase\n"
164: " const int blqidx = tidx %% N_b + blidx*N_b; // Cell mapped to this thread in the quadrature phase\n"
165: " const int gidx = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n"
166: " const int Goffset = gidx*N_cb*N_bc;\n",
167: &count, dim, N_bl, N_b, N_c, N_q);STRING_ERROR_CHECK("Message to short");
168: /* Local memory */
169: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
170: "\n"
171: " /* Quadrature data */\n"
172: " %s w; // $w_q$, Quadrature weight at $x_q$\n"
173: " __local %s phi_i[%d]; //[N_bt*N_q]; // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n"
174: " __local %s%d phiDer_i[%d]; //[N_bt*N_q]; // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n"
175: " /* Geometric data */\n"
176: " __local %s detJ[%d]; //[N_t]; // $|J(x_q)|$, Jacobian determinant at $x_q$\n"
177: " __local %s invJ[%d];//[N_t*dim*dim]; // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n",
178: &count, numeric_str, numeric_str, N_b*N_c*N_q, numeric_str, dim, N_b*N_c*N_q, numeric_str, N_t,
179: numeric_str, N_t*dim*dim, numeric_str, N_t*N_b*N_c);STRING_ERROR_CHECK("Message to short");
180: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
181: " /* FEM data */\n"
182: " __local %s u_i[%d]; //[N_t*N_bt]; // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n",
183: &count, numeric_str, N_t*N_b*N_c);STRING_ERROR_CHECK("Message to short");
184: if (useAux) {
185: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
186: " __local %s a_i[%d]; //[N_t]; // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n",
187: &count, numeric_str, N_t);STRING_ERROR_CHECK("Message to short");
188: }
189: if (useF0) {
190: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
191: " /* Intermediate calculations */\n"
192: " __local %s f_0[%d]; //[N_t*N_sqc]; // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
193: &count, numeric_str, N_t*N_q);STRING_ERROR_CHECK("Message to short");
194: }
195: if (useF1) {
196: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
197: " __local %s%d f_1[%d]; //[N_t*N_sqc]; // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
198: &count, numeric_str, dim, N_t*N_q);STRING_ERROR_CHECK("Message to short");
199: }
200: /* TODO: If using elasticity, put in mu/lambda coefficients */
201: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
202: " /* Output data */\n"
203: " %s e_i; // Coefficient $e_i$ of the residual\n\n",
204: &count, numeric_str);STRING_ERROR_CHECK("Message to short");
205: /* One-time loads */
206: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
207: " /* These should be generated inline */\n"
208: " /* Load quadrature weights */\n"
209: " w = weights[qidx];\n"
210: " /* Load basis tabulation \\phi_i for this cell */\n"
211: " if (tidx < N_bt*N_q) {\n"
212: " phi_i[tidx] = Basis[tidx];\n"
213: " phiDer_i[tidx] = BasisDerivatives[tidx];\n"
214: " }\n\n",
215: &count);STRING_ERROR_CHECK("Message to short");
216: /* Batch loads */
217: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
218: " for (int batch = 0; batch < N_cb; ++batch) {\n"
219: " /* Load geometry */\n"
220: " detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n"
221: " for (int n = 0; n < dim*dim; ++n) {\n"
222: " const int offset = n*N_t;\n"
223: " invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n"
224: " }\n"
225: " /* Load coefficients u_i for this cell */\n"
226: " for (int n = 0; n < N_bt; ++n) {\n"
227: " const int offset = n*N_t;\n"
228: " u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n"
229: " }\n",
230: &count);STRING_ERROR_CHECK("Message to short");
231: if (useAux) {
232: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
233: " /* Load coefficients a_i for this cell */\n"
234: " /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n"
235: " a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n",
236: &count);STRING_ERROR_CHECK("Message to short");
237: }
238: /* Quadrature phase */
239: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
240: " barrier(CLK_LOCAL_MEM_FENCE);\n"
241: "\n"
242: " /* Map coefficients to values at quadrature points */\n"
243: " for (int c = 0; c < N_sqc; ++c) {\n"
244: " const int cell = c*N_bl*N_b + blqidx;\n"
245: " const int fidx = (cell*N_q + qidx)*N_comp + cidx;\n",
246: &count);STRING_ERROR_CHECK("Message to short");
247: if (useField) {
248: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
249: " %s u[%d]; //[N_comp]; // $u(x_q)$, Value of the field at $x_q$\n",
250: &count, numeric_str, N_c);STRING_ERROR_CHECK("Message to short");
251: }
252: if (useFieldDer) {
253: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
254: " %s%d gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n",
255: &count, numeric_str, dim, N_c);STRING_ERROR_CHECK("Message to short");
256: }
257: if (useFieldAux) {
258: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
259: " %s a[%d]; //[1]; // $a(x_q)$, Value of the auxiliary fields at $x_q$\n",
260: &count, numeric_str, 1);STRING_ERROR_CHECK("Message to short");
261: }
262: if (useFieldDerAux) {
263: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
264: " %s%d gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n",
265: &count, numeric_str, dim, 1);STRING_ERROR_CHECK("Message to short");
266: }
267: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
268: "\n"
269: " for (int comp = 0; comp < N_comp; ++comp) {\n",
270: &count);STRING_ERROR_CHECK("Message to short");
271: if (useField) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");}
272: if (useFieldDer) {
273: switch (dim) {
274: case 1:
275: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break;
276: case 2:
277: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break;
278: case 3:
279: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break;
280: }
281: }
282: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
283: " }\n",
284: &count);STRING_ERROR_CHECK("Message to short");
285: if (useFieldAux) {
286: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");
287: }
288: if (useFieldDerAux) {
289: switch (dim) {
290: case 1:
291: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break;
292: case 2:
293: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break;
294: case 3:
295: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break;
296: }
297: }
298: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
299: " /* Get field and derivatives at this quadrature point */\n"
300: " for (int i = 0; i < N_b; ++i) {\n"
301: " for (int comp = 0; comp < N_comp; ++comp) {\n"
302: " const int b = i*N_comp+comp;\n"
303: " const int pidx = qidx*N_bt + b;\n"
304: " const int uidx = cell*N_bt + b;\n"
305: " %s%d realSpaceDer;\n\n",
306: &count, numeric_str, dim);STRING_ERROR_CHECK("Message to short");
307: if (useField) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," u[comp] += u_i[uidx]*phi_i[pidx];\n", &count);STRING_ERROR_CHECK("Message to short");}
308: if (useFieldDer) {
309: switch (dim) {
310: case 2:
311: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
312: " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
313: " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
314: " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
315: " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n",
316: &count);STRING_ERROR_CHECK("Message to short");break;
317: case 3:
318: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
319: " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
320: " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
321: " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
322: " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n"
323: " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
324: " gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n",
325: &count);STRING_ERROR_CHECK("Message to short");break;
326: }
327: }
328: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
329: " }\n"
330: " }\n",
331: &count);STRING_ERROR_CHECK("Message to short");
332: if (useFieldAux) {
333: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," a[0] += a_i[cell];\n", &count);STRING_ERROR_CHECK("Message to short");
334: }
335: /* Calculate residual at quadrature points: Should be generated by an weak form egine */
336: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
337: " /* Process values at quadrature points */\n",
338: &count);STRING_ERROR_CHECK("Message to short");
339: switch (op) {
340: case LAPLACIAN:
341: if (useF0) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count);STRING_ERROR_CHECK("Message to short");}
342: if (useF1) {
343: if (useAux) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = a[0]*gradU[cidx];\n", &count);STRING_ERROR_CHECK("Message to short");}
344: else {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = gradU[cidx];\n", &count);STRING_ERROR_CHECK("Message to short");}
345: }
346: break;
347: case ELASTICITY:
348: if (useF0) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count);STRING_ERROR_CHECK("Message to short");}
349: if (useF1) {
350: switch (dim) {
351: case 2:
352: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
353: " switch (cidx) {\n"
354: " case 0:\n"
355: " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n"
356: " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n"
357: " break;\n"
358: " case 1:\n"
359: " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n"
360: " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n"
361: " }\n",
362: &count);STRING_ERROR_CHECK("Message to short");break;
363: case 3:
364: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
365: " switch (cidx) {\n"
366: " case 0:\n"
367: " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n"
368: " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n"
369: " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n"
370: " break;\n"
371: " case 1:\n"
372: " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n"
373: " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n"
374: " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n"
375: " break;\n"
376: " case 2:\n"
377: " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n"
378: " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n"
379: " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n"
380: " }\n",
381: &count);STRING_ERROR_CHECK("Message to short");break;
382: }}
383: break;
384: default:
385: SETERRQ1(PETSC_COMM_WORLD, PETSC_ERR_SUP, "PDE operator %d is not supported", op);
386: }
387: if (useF0) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_0[fidx] *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");}
388: if (useF1) {
389: switch (dim) {
390: case 1:
391: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");break;
392: case 2:
393: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");break;
394: case 3:
395: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");break;
396: }
397: }
398: /* Thread transpose */
399: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
400: " }\n\n"
401: " /* ==== TRANSPOSE THREADS ==== */\n"
402: " barrier(CLK_LOCAL_MEM_FENCE);\n\n",
403: &count);STRING_ERROR_CHECK("Message to short");
404: /* Basis phase */
405: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
406: " /* Map values at quadrature points to coefficients */\n"
407: " for (int c = 0; c < N_sbc; ++c) {\n"
408: " const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n"
409: "\n"
410: " e_i = 0.0;\n"
411: " for (int q = 0; q < N_q; ++q) {\n"
412: " const int pidx = q*N_bt + bidx;\n"
413: " const int fidx = (cell*N_q + q)*N_comp + cidx;\n"
414: " %s%d realSpaceDer;\n\n",
415: &count, numeric_str, dim);STRING_ERROR_CHECK("Message to short");
417: if (useF0) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," e_i += phi_i[pidx]*f_0[fidx];\n", &count);STRING_ERROR_CHECK("Message to short");}
418: if (useF1) {
419: switch (dim) {
420: case 2:
421: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
422: " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
423: " e_i += realSpaceDer.x*f_1[fidx].x;\n"
424: " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
425: " e_i += realSpaceDer.y*f_1[fidx].y;\n",
426: &count);STRING_ERROR_CHECK("Message to short");break;
427: case 3:
428: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
429: " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
430: " e_i += realSpaceDer.x*f_1[fidx].x;\n"
431: " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
432: " e_i += realSpaceDer.y*f_1[fidx].y;\n"
433: " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
434: " e_i += realSpaceDer.z*f_1[fidx].z;\n",
435: &count);STRING_ERROR_CHECK("Message to short");break;
436: }
437: }
438: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
439: " }\n"
440: " /* Write element vector for N_{cbc} cells at a time */\n"
441: " elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n"
442: " }\n"
443: " /* ==== Could do one write per batch ==== */\n"
444: " }\n"
445: " return;\n"
446: "}\n",
447: &count);STRING_ERROR_CHECK("Message to short");
448: return(0);
449: }
451: static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel)
452: {
453: PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
454: PetscInt dim, N_bl;
455: PetscBool flg;
456: char *buffer;
457: size_t len;
458: char errMsg[8192];
459: cl_int ierr2;
460: PetscErrorCode ierr;
463: PetscFEGetSpatialDimension(fem, &dim);
464: PetscMalloc1(8192, &buffer);
465: PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL);
466: PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl);
467: PetscOptionsHasName(((PetscObject)fem)->options,((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg);
468: if (flg) {PetscPrintf(PetscObjectComm((PetscObject) fem), "OpenCL FE Integration Kernel:\n%s\n", buffer);}
469: len = strlen(buffer);
470: *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **) &buffer, &len, &ierr2);CHKERRQ(ierr2);
471: clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL);
472: if (ierr != CL_SUCCESS) {
473: clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192*sizeof(char), &errMsg, NULL);
474: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg);
475: }
476: PetscFree(buffer);
477: *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &ierr);
478: return(0);
479: }
481: static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z)
482: {
483: const PetscInt Nblocks = N/blockSize;
486: if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
487: *z = 1;
488: *y = 1;
489: for (*x = (size_t) (PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) {
490: *y = Nblocks / *x;
491: if (*x * *y == Nblocks) break;
492: }
493: if (*x * *y != Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %d with block size %d", N, blockSize);
494: return(0);
495: }
497: static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops)
498: {
499: PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
500: PetscStageLog stageLog;
501: PetscEventPerfLog eventLog = NULL;
502: int stage;
503: PetscErrorCode ierr;
506: PetscLogGetStageLog(&stageLog);
507: PetscStageLogGetCurrent(stageLog, &stage);
508: PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
509: /* Log performance info */
510: eventLog->eventInfo[ocl->residualEvent].count++;
511: eventLog->eventInfo[ocl->residualEvent].time += time;
512: eventLog->eventInfo[ocl->residualEvent].flops += flops;
513: return(0);
514: }
516: static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscInt field, PetscInt Ne, PetscFEGeom *cgeom,
517: const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[])
518: {
519: /* Nbc = batchSize */
520: PetscFE fem;
521: PetscFE_OpenCL *ocl;
522: PetscPointFunc f0_func;
523: PetscPointFunc f1_func;
524: PetscQuadrature q;
525: PetscInt dim, qNc;
526: PetscInt N_b; /* The number of basis functions */
527: PetscInt N_comp; /* The number of basis function components */
528: PetscInt N_bt; /* The total number of scalar basis functions */
529: PetscInt N_q; /* The number of quadrature points */
530: PetscInt N_bst; /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
531: PetscInt N_t; /* The number of threads, N_bst * N_bl */
532: PetscInt N_bl; /* The number of blocks */
533: PetscInt N_bc; /* The batch size, N_bl*N_q*N_b */
534: PetscInt N_cb; /* The number of batches */
535: PetscInt numFlops, f0Flops = 0, f1Flops = 0;
536: PetscBool useAux = probAux ? PETSC_TRUE : PETSC_FALSE;
537: PetscBool useField = PETSC_FALSE;
538: PetscBool useFieldDer = PETSC_TRUE;
539: PetscBool useF0 = PETSC_TRUE;
540: PetscBool useF1 = PETSC_TRUE;
541: /* OpenCL variables */
542: cl_program ocl_prog;
543: cl_kernel ocl_kernel;
544: cl_event ocl_ev; /* The event for tracking kernel execution */
545: cl_ulong ns_start; /* Nanoseconds counter on GPU at kernel start */
546: cl_ulong ns_end; /* Nanoseconds counter on GPU at kernel stop */
547: cl_mem o_jacobianInverses, o_jacobianDeterminants;
548: cl_mem o_coefficients, o_coefficientsAux, o_elemVec;
549: float *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL;
550: double *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL;
551: PetscReal *r_invJ = NULL, *r_detJ = NULL;
552: void *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ;
553: size_t local_work_size[3], global_work_size[3];
554: size_t realSize, x, y, z;
555: const PetscReal *points, *weights;
556: PetscErrorCode ierr;
559: PetscDSGetDiscretization(prob, field, (PetscObject *) &fem);
560: ocl = (PetscFE_OpenCL *) fem->data;
561: if (!Ne) {PetscFEOpenCLLogResidual(fem, 0.0, 0.0); return(0);}
562: PetscFEGetSpatialDimension(fem, &dim);
563: PetscFEGetQuadrature(fem, &q);
564: PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);
565: if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc);
566: PetscFEGetDimension(fem, &N_b);
567: PetscFEGetNumComponents(fem, &N_comp);
568: PetscDSGetResidual(prob, field, &f0_func, &f1_func);
569: PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb);
570: N_bt = N_b*N_comp;
571: N_bst = N_bt*N_q;
572: N_t = N_bst*N_bl;
573: if (N_bc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp);
574: /* Calculate layout */
575: if (Ne % (N_cb*N_bc)) { /* Remainder cells */
576: PetscFEIntegrateResidual_Basic(prob, field, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec);
577: return(0);
578: }
579: PetscFEOpenCLCalculateGrid(fem, Ne, N_cb*N_bc, &x, &y, &z);
580: local_work_size[0] = N_bc*N_comp;
581: local_work_size[1] = 1;
582: local_work_size[2] = 1;
583: global_work_size[0] = x * local_work_size[0];
584: global_work_size[1] = y * local_work_size[1];
585: global_work_size[2] = z * local_work_size[2];
586: PetscInfo7(fem, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb);
587: PetscInfo2(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb);
588: /* Generate code */
589: if (probAux) {
590: PetscSpace P;
591: PetscInt NfAux, order, f;
593: PetscDSGetNumFields(probAux, &NfAux);
594: for (f = 0; f < NfAux; ++f) {
595: PetscFE feAux;
597: PetscDSGetDiscretization(probAux, f, (PetscObject *) &feAux);
598: PetscFEGetBasisSpace(feAux, &P);
599: PetscSpaceGetDegree(P, &order, NULL);
600: if (order > 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields");
601: }
602: }
603: PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel);
604: /* Create buffers on the device and send data over */
605: PetscDataTypeGetSize(ocl->realType, &realSize);
606: if (cgeom->numPoints > 1) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only support affine geometry for OpenCL integration right now");
607: if (sizeof(PetscReal) != realSize) {
608: switch (ocl->realType) {
609: case PETSC_FLOAT:
610: {
611: PetscInt c, b, d;
613: PetscMalloc4(Ne*N_bt,&f_coeff,Ne,&f_coeffAux,Ne*dim*dim,&f_invJ,Ne,&f_detJ);
614: for (c = 0; c < Ne; ++c) {
615: f_detJ[c] = (float) cgeom->detJ[c];
616: for (d = 0; d < dim*dim; ++d) {
617: f_invJ[c*dim*dim+d] = (float) cgeom->invJ[c * dim * dim + d];
618: }
619: for (b = 0; b < N_bt; ++b) {
620: f_coeff[c*N_bt+b] = (float) coefficients[c*N_bt+b];
621: }
622: }
623: if (coefficientsAux) { /* Assume P0 */
624: for (c = 0; c < Ne; ++c) {
625: f_coeffAux[c] = (float) coefficientsAux[c];
626: }
627: }
628: oclCoeff = (void *) f_coeff;
629: if (coefficientsAux) {
630: oclCoeffAux = (void *) f_coeffAux;
631: } else {
632: oclCoeffAux = NULL;
633: }
634: oclInvJ = (void *) f_invJ;
635: oclDetJ = (void *) f_detJ;
636: }
637: break;
638: case PETSC_DOUBLE:
639: {
640: PetscInt c, b, d;
642: PetscMalloc4(Ne*N_bt,&d_coeff,Ne,&d_coeffAux,Ne*dim*dim,&d_invJ,Ne,&d_detJ);
643: for (c = 0; c < Ne; ++c) {
644: d_detJ[c] = (double) cgeom->detJ[c];
645: for (d = 0; d < dim*dim; ++d) {
646: d_invJ[c*dim*dim+d] = (double) cgeom->invJ[c * dim * dim + d];
647: }
648: for (b = 0; b < N_bt; ++b) {
649: d_coeff[c*N_bt+b] = (double) coefficients[c*N_bt+b];
650: }
651: }
652: if (coefficientsAux) { /* Assume P0 */
653: for (c = 0; c < Ne; ++c) {
654: d_coeffAux[c] = (double) coefficientsAux[c];
655: }
656: }
657: oclCoeff = (void *) d_coeff;
658: if (coefficientsAux) {
659: oclCoeffAux = (void *) d_coeffAux;
660: } else {
661: oclCoeffAux = NULL;
662: }
663: oclInvJ = (void *) d_invJ;
664: oclDetJ = (void *) d_detJ;
665: }
666: break;
667: default:
668: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
669: }
670: } else {
671: PetscInt c, d;
673: PetscMalloc2(Ne*dim*dim,&r_invJ,Ne,&r_detJ);
674: for (c = 0; c < Ne; ++c) {
675: r_detJ[c] = cgeom->detJ[c];
676: for (d = 0; d < dim*dim; ++d) {
677: r_invJ[c*dim*dim+d] = cgeom->invJ[c * dim * dim + d];
678: }
679: }
680: oclCoeff = (void *) coefficients;
681: oclCoeffAux = (void *) coefficientsAux;
682: oclInvJ = (void *) r_invJ;
683: oclDetJ = (void *) r_detJ;
684: }
685: o_coefficients = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*N_bt * realSize, oclCoeff, &ierr);
686: if (coefficientsAux) {
687: o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclCoeffAux, &ierr);
688: } else {
689: o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY, Ne * realSize, oclCoeffAux, &ierr);
690: }
691: o_jacobianInverses = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * realSize, oclInvJ, &ierr);
692: o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclDetJ, &ierr);
693: o_elemVec = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY, Ne*N_bt * realSize, NULL, &ierr);
694: /* Kernel launch */
695: clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*) &N_cb);
696: clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*) &o_coefficients);
697: clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*) &o_coefficientsAux);
698: clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*) &o_jacobianInverses);
699: clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*) &o_jacobianDeterminants);
700: clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void*) &o_elemVec);
701: clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);
702: /* Read data back from device */
703: if (sizeof(PetscReal) != realSize) {
704: switch (ocl->realType) {
705: case PETSC_FLOAT:
706: {
707: float *elem;
708: PetscInt c, b;
710: PetscFree4(f_coeff,f_coeffAux,f_invJ,f_detJ);
711: PetscMalloc1(Ne*N_bt, &elem);
712: clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);
713: for (c = 0; c < Ne; ++c) {
714: for (b = 0; b < N_bt; ++b) {
715: elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
716: }
717: }
718: PetscFree(elem);
719: }
720: break;
721: case PETSC_DOUBLE:
722: {
723: double *elem;
724: PetscInt c, b;
726: PetscFree4(d_coeff,d_coeffAux,d_invJ,d_detJ);
727: PetscMalloc1(Ne*N_bt, &elem);
728: clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);
729: for (c = 0; c < Ne; ++c) {
730: for (b = 0; b < N_bt; ++b) {
731: elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
732: }
733: }
734: PetscFree(elem);
735: }
736: break;
737: default:
738: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
739: }
740: } else {
741: PetscFree2(r_invJ,r_detJ);
742: clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elemVec, 0, NULL, NULL);
743: }
744: /* Log performance */
745: clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);
746: clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL);
747: f0Flops = 0;
748: switch (ocl->op) {
749: case LAPLACIAN:
750: f1Flops = useAux ? dim : 0;break;
751: case ELASTICITY:
752: f1Flops = 2*dim*dim;break;
753: }
754: numFlops = Ne*(
755: N_q*(
756: N_b*N_comp*((useField ? 2 : 0) + (useFieldDer ? 2*dim*(dim + 1) : 0))
757: /*+
758: N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/
759: +
760: N_comp*((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2*dim : 0)))
761: +
762: N_b*((useF0 ? 2 : 0) + (useF1 ? 2*dim*(dim + 1) : 0)));
763: PetscFEOpenCLLogResidual(fem, (ns_end - ns_start)*1.0e-9, numFlops);
764: /* Cleanup */
765: clReleaseMemObject(o_coefficients);
766: clReleaseMemObject(o_coefficientsAux);
767: clReleaseMemObject(o_jacobianInverses);
768: clReleaseMemObject(o_jacobianDeterminants);
769: clReleaseMemObject(o_elemVec);
770: clReleaseKernel(ocl_kernel);
771: clReleaseProgram(ocl_prog);
772: return(0);
773: }
775: PETSC_EXTERN PetscErrorCode PetscFESetUp_Basic(PetscFE);
776: PETSC_EXTERN PetscErrorCode PetscFECreateTabulation_Basic(PetscFE, PetscInt, const PetscReal [], PetscInt, PetscTabulation);
778: static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem)
779: {
781: fem->ops->setfromoptions = NULL;
782: fem->ops->setup = PetscFESetUp_Basic;
783: fem->ops->view = NULL;
784: fem->ops->destroy = PetscFEDestroy_OpenCL;
785: fem->ops->getdimension = PetscFEGetDimension_Basic;
786: fem->ops->createtabulation = PetscFECreateTabulation_Basic;
787: fem->ops->integrateresidual = PetscFEIntegrateResidual_OpenCL;
788: fem->ops->integratebdresidual = NULL/* PetscFEIntegrateBdResidual_OpenCL */;
789: fem->ops->integratejacobianaction = NULL/* PetscFEIntegrateJacobianAction_OpenCL */;
790: fem->ops->integratejacobian = PetscFEIntegrateJacobian_Basic;
791: return(0);
792: }
794: /*MC
795: PETSCFEOPENCL = "opencl" - A PetscFE object that integrates using a vectorized OpenCL implementation
797: Level: intermediate
799: .seealso: PetscFEType, PetscFECreate(), PetscFESetType()
800: M*/
802: PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem)
803: {
804: PetscFE_OpenCL *ocl;
805: cl_uint num_platforms;
806: cl_platform_id platform_ids[42];
807: cl_uint num_devices;
808: cl_device_id device_ids[42];
809: cl_int ierr2;
810: PetscErrorCode ierr;
814: PetscNewLog(fem,&ocl);
815: fem->data = ocl;
817: /* Init Platform */
818: clGetPlatformIDs(42, platform_ids, &num_platforms);
819: if (!num_platforms) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL platform found.");
820: ocl->pf_id = platform_ids[0];
821: /* Init Device */
822: clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices);
823: if (!num_devices) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL device found.");
824: ocl->dev_id = device_ids[0];
825: /* Create context with one command queue */
826: ocl->ctx_id = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &ierr2);CHKERRQ(ierr2);
827: ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &ierr2);CHKERRQ(ierr2);
828: /* Types */
829: ocl->realType = PETSC_FLOAT;
830: /* Register events */
831: PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent);
832: /* Equation handling */
833: ocl->op = LAPLACIAN;
835: PetscFEInitialize_OpenCL(fem);
836: return(0);
837: }
839: /*@
840: PetscFEOpenCLSetRealType - Set the scalar type for running on the accelerator
842: Input Parameters:
843: + fem - The PetscFE
844: - realType - The scalar type
846: Level: developer
848: .seealso: PetscFEOpenCLGetRealType()
849: @*/
850: PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType)
851: {
852: PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
856: ocl->realType = realType;
857: return(0);
858: }
860: /*@
861: PetscFEOpenCLGetRealType - Get the scalar type for running on the accelerator
863: Input Parameter:
864: . fem - The PetscFE
866: Output Parameter:
867: . realType - The scalar type
869: Level: developer
871: .seealso: PetscFEOpenCLSetRealType()
872: @*/
873: PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType)
874: {
875: PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
880: *realType = ocl->realType;
881: return(0);
882: }
884: #endif /* PETSC_HAVE_OPENCL */