Actual source code: feopencl.c
1: #include <petsc/private/petscfeimpl.h>
3: #if defined(PETSC_HAVE_OPENCL)
5: static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem)
6: {
7: PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
9: clReleaseCommandQueue(ocl->queue_id);
10: ocl->queue_id = 0;
11: clReleaseContext(ocl->ctx_id);
12: ocl->ctx_id = 0;
13: PetscFree(ocl);
14: return 0;
15: }
18: enum {LAPLACIAN = 0, ELASTICITY = 1};
20: /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */
21: /* dim Number of spatial dimensions: 2 */
22: /* N_b Number of basis functions: generated */
23: /* N_{bt} Number of total basis functions: N_b * N_{comp} */
24: /* N_q Number of quadrature points: generated */
25: /* N_{bs} Number of block cells LCM(N_b, N_q) */
26: /* N_{bst} Number of block cell components LCM(N_{bt}, N_q) */
27: /* N_{bl} Number of concurrent blocks generated */
28: /* N_t Number of threads: N_{bl} * N_{bs} */
29: /* N_{cbc} Number of concurrent basis cells: N_{bl} * N_q */
30: /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b */
31: /* N_{sbc} Number of serial basis cells: N_{bs} / N_q */
32: /* N_{sqc} Number of serial quadrature cells: N_{bs} / N_b */
33: /* N_{cb} Number of serial cell batches: input */
34: /* N_c Number of total cells: N_{cb}*N_{t}/N_{comp} */
35: static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl)
36: {
37: PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
38: PetscQuadrature q;
39: char *string_tail = *string_buffer;
40: char *end_of_buffer = *string_buffer + buffer_length;
41: char float_str[] = "float", double_str[] = "double";
42: char *numeric_str = &(float_str[0]);
43: PetscInt op = ocl->op;
44: PetscBool useField = PETSC_FALSE;
45: PetscBool useFieldDer = PETSC_TRUE;
46: PetscBool useFieldAux = useAux;
47: PetscBool useFieldDerAux= PETSC_FALSE;
48: PetscBool useF0 = PETSC_TRUE;
49: PetscBool useF1 = PETSC_TRUE;
50: const PetscReal *points, *weights;
51: PetscTabulation T;
52: PetscInt dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c;
53: size_t count;
55: PetscFEGetSpatialDimension(fem, &dim);
56: PetscFEGetDimension(fem, &N_b);
57: PetscFEGetNumComponents(fem, &N_c);
58: PetscFEGetQuadrature(fem, &q);
59: PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);
61: N_t = N_b * N_c * N_q * N_bl;
62: /* Enable device extension for double precision */
63: if (ocl->realType == PETSC_DOUBLE) {
64: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
65: "#if defined(cl_khr_fp64)\n"
66: "# pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
67: "#elif defined(cl_amd_fp64)\n"
68: "# pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
69: "#endif\n",
70: &count));
71: numeric_str = &(double_str[0]);
72: }
73: /* Kernel API */
74: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
75: "\n"
76: "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n"
77: "{\n",
78: &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str));
79: /* Quadrature */
80: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
81: " /* Quadrature points\n"
82: " - (x1,y1,x2,y2,...) */\n"
83: " const %s points[%d] = {\n",
84: &count, numeric_str, N_q*dim));
85: for (p = 0; p < N_q; ++p) {
86: for (d = 0; d < dim; ++d) {
87: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p*dim+d]);
88: }
89: }
90: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);
91: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
92: " /* Quadrature weights\n"
93: " - (v1,v2,...) */\n"
94: " const %s weights[%d] = {\n",
95: &count, numeric_str, N_q));
96: for (p = 0; p < N_q; ++p) {
97: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]);
98: }
99: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);
100: /* Basis Functions */
101: PetscFEGetCellTabulation(fem, 1, &T);
102: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
103: " /* Nodal basis function evaluations\n"
104: " - basis component is fastest varying, the basis function, then point */\n"
105: " const %s Basis[%d] = {\n",
106: &count, numeric_str, N_q*N_b*N_c));
107: for (p = 0; p < N_q; ++p) {
108: for (b = 0; b < N_b; ++b) {
109: for (c = 0; c < N_c; ++c) {
110: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p*N_b + b)*N_c + c]);
111: }
112: }
113: }
114: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);
115: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
116: "\n"
117: " /* Nodal basis function derivative evaluations,\n"
118: " - derivative direction is fastest varying, then basis component, then basis function, then point */\n"
119: " const %s%d BasisDerivatives[%d] = {\n",
120: &count, numeric_str, dim, N_q*N_b*N_c));
121: for (p = 0; p < N_q; ++p) {
122: for (b = 0; b < N_b; ++b) {
123: for (c = 0; c < N_c; ++c) {
124: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim);
125: for (d = 0; d < dim; ++d) {
126: if (d > 0) {
127: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);
128: } else {
129: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);
130: }
131: }
132: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count);
133: }
134: }
135: }
136: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);
137: /* Sizes */
138: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
139: " const int dim = %d; // The spatial dimension\n"
140: " const int N_bl = %d; // The number of concurrent blocks\n"
141: " const int N_b = %d; // The number of basis functions\n"
142: " const int N_comp = %d; // The number of basis function components\n"
143: " const int N_bt = N_b*N_comp; // The total number of scalar basis functions\n"
144: " const int N_q = %d; // The number of quadrature points\n"
145: " const int N_bst = N_bt*N_q; // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n"
146: " const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl\n"
147: " const int N_bc = N_t/N_comp; // The number of cells per batch (N_b*N_q*N_bl)\n"
148: " const int N_sbc = N_bst / (N_q * N_comp);\n"
149: " const int N_sqc = N_bst / N_bt;\n"
150: " /*const int N_c = N_cb * N_bc;*/\n"
151: "\n"
152: " /* Calculated indices */\n"
153: " /*const int tidx = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n"
154: " const int tidx = get_local_id(0);\n"
155: " const int blidx = tidx / N_bst; // Block number for this thread\n"
156: " const int bidx = tidx %% N_bt; // Basis function mapped to this thread\n"
157: " const int cidx = tidx %% N_comp; // Basis component mapped to this thread\n"
158: " const int qidx = tidx %% N_q; // Quadrature point mapped to this thread\n"
159: " const int blbidx = tidx %% N_q + blidx*N_q; // Cell mapped to this thread in the basis phase\n"
160: " const int blqidx = tidx %% N_b + blidx*N_b; // Cell mapped to this thread in the quadrature phase\n"
161: " const int gidx = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n"
162: " const int Goffset = gidx*N_cb*N_bc;\n",
163: &count, dim, N_bl, N_b, N_c, N_q));
164: /* Local memory */
165: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
166: "\n"
167: " /* Quadrature data */\n"
168: " %s w; // $w_q$, Quadrature weight at $x_q$\n"
169: " __local %s phi_i[%d]; //[N_bt*N_q]; // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n"
170: " __local %s%d phiDer_i[%d]; //[N_bt*N_q]; // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n"
171: " /* Geometric data */\n"
172: " __local %s detJ[%d]; //[N_t]; // $|J(x_q)|$, Jacobian determinant at $x_q$\n"
173: " __local %s invJ[%d];//[N_t*dim*dim]; // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n",
174: &count, numeric_str, numeric_str, N_b*N_c*N_q, numeric_str, dim, N_b*N_c*N_q, numeric_str, N_t,
175: numeric_str, N_t*dim*dim, numeric_str, N_t*N_b*N_c));
176: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
177: " /* FEM data */\n"
178: " __local %s u_i[%d]; //[N_t*N_bt]; // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n",
179: &count, numeric_str, N_t*N_b*N_c));
180: if (useAux) {
181: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
182: " __local %s a_i[%d]; //[N_t]; // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n",
183: &count, numeric_str, N_t));
184: }
185: if (useF0) {
186: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
187: " /* Intermediate calculations */\n"
188: " __local %s f_0[%d]; //[N_t*N_sqc]; // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
189: &count, numeric_str, N_t*N_q));
190: }
191: if (useF1) {
192: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
193: " __local %s%d f_1[%d]; //[N_t*N_sqc]; // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
194: &count, numeric_str, dim, N_t*N_q));
195: }
196: /* TODO: If using elasticity, put in mu/lambda coefficients */
197: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
198: " /* Output data */\n"
199: " %s e_i; // Coefficient $e_i$ of the residual\n\n",
200: &count, numeric_str));
201: /* One-time loads */
202: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
203: " /* These should be generated inline */\n"
204: " /* Load quadrature weights */\n"
205: " w = weights[qidx];\n"
206: " /* Load basis tabulation \\phi_i for this cell */\n"
207: " if (tidx < N_bt*N_q) {\n"
208: " phi_i[tidx] = Basis[tidx];\n"
209: " phiDer_i[tidx] = BasisDerivatives[tidx];\n"
210: " }\n\n",
211: &count));
212: /* Batch loads */
213: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
214: " for (int batch = 0; batch < N_cb; ++batch) {\n"
215: " /* Load geometry */\n"
216: " detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n"
217: " for (int n = 0; n < dim*dim; ++n) {\n"
218: " const int offset = n*N_t;\n"
219: " invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n"
220: " }\n"
221: " /* Load coefficients u_i for this cell */\n"
222: " for (int n = 0; n < N_bt; ++n) {\n"
223: " const int offset = n*N_t;\n"
224: " u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n"
225: " }\n",
226: &count));
227: if (useAux) {
228: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
229: " /* Load coefficients a_i for this cell */\n"
230: " /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n"
231: " a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n",
232: &count));
233: }
234: /* Quadrature phase */
235: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
236: " barrier(CLK_LOCAL_MEM_FENCE);\n"
237: "\n"
238: " /* Map coefficients to values at quadrature points */\n"
239: " for (int c = 0; c < N_sqc; ++c) {\n"
240: " const int cell = c*N_bl*N_b + blqidx;\n"
241: " const int fidx = (cell*N_q + qidx)*N_comp + cidx;\n",
242: &count));
243: if (useField) {
244: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
245: " %s u[%d]; //[N_comp]; // $u(x_q)$, Value of the field at $x_q$\n",
246: &count, numeric_str, N_c));
247: }
248: if (useFieldDer) {
249: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
250: " %s%d gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n",
251: &count, numeric_str, dim, N_c));
252: }
253: if (useFieldAux) {
254: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
255: " %s a[%d]; //[1]; // $a(x_q)$, Value of the auxiliary fields at $x_q$\n",
256: &count, numeric_str, 1));
257: }
258: if (useFieldDerAux) {
259: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
260: " %s%d gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n",
261: &count, numeric_str, dim, 1));
262: }
263: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
264: "\n"
265: " for (int comp = 0; comp < N_comp; ++comp) {\n",
266: &count));
267: if (useField) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] = 0.0;\n", &count);
268: if (useFieldDer) {
269: switch (dim) {
270: case 1:
271: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0;\n", &count);break;
272: case 2:
273: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count);break;
274: case 3:
275: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count);break;
276: }
277: }
278: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
279: " }\n",
280: &count));
281: if (useFieldAux) {
282: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] = 0.0;\n", &count);
283: }
284: if (useFieldDerAux) {
285: switch (dim) {
286: case 1:
287: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0;\n", &count);break;
288: case 2:
289: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count);break;
290: case 3:
291: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count);break;
292: }
293: }
294: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
295: " /* Get field and derivatives at this quadrature point */\n"
296: " for (int i = 0; i < N_b; ++i) {\n"
297: " for (int comp = 0; comp < N_comp; ++comp) {\n"
298: " const int b = i*N_comp+comp;\n"
299: " const int pidx = qidx*N_bt + b;\n"
300: " const int uidx = cell*N_bt + b;\n"
301: " %s%d realSpaceDer;\n\n",
302: &count, numeric_str, dim));
303: if (useField) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," u[comp] += u_i[uidx]*phi_i[pidx];\n", &count);
304: if (useFieldDer) {
305: switch (dim) {
306: case 2:
307: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
308: " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
309: " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
310: " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
311: " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n",
312: &count));break;
313: case 3:
314: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
315: " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
316: " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
317: " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
318: " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n"
319: " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
320: " gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n",
321: &count));break;
322: }
323: }
324: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
325: " }\n"
326: " }\n",
327: &count));
328: if (useFieldAux) {
329: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," a[0] += a_i[cell];\n", &count);
330: }
331: /* Calculate residual at quadrature points: Should be generated by an weak form egine */
332: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
333: " /* Process values at quadrature points */\n",
334: &count));
335: switch (op) {
336: case LAPLACIAN:
337: if (useF0) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count);}
338: if (useF1) {
339: if (useAux) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = a[0]*gradU[cidx];\n", &count);
340: else PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = gradU[cidx];\n", &count);
341: }
342: break;
343: case ELASTICITY:
344: if (useF0) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count);
345: if (useF1) {
346: switch (dim) {
347: case 2:
348: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
349: " switch (cidx) {\n"
350: " case 0:\n"
351: " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n"
352: " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n"
353: " break;\n"
354: " case 1:\n"
355: " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n"
356: " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n"
357: " }\n",
358: &count));break;
359: case 3:
360: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
361: " switch (cidx) {\n"
362: " case 0:\n"
363: " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n"
364: " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n"
365: " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n"
366: " break;\n"
367: " case 1:\n"
368: " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n"
369: " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n"
370: " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n"
371: " break;\n"
372: " case 2:\n"
373: " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n"
374: " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n"
375: " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n"
376: " }\n",
377: &count));break;
378: }}
379: break;
380: default:
381: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PDE operator %d is not supported", op);
382: }
383: if (useF0) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_0[fidx] *= detJ[cell]*w;\n", &count);
384: if (useF1) {
385: switch (dim) {
386: case 1:
387: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w;\n", &count);break;
388: case 2:
389: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count);break;
390: case 3:
391: PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count);break;
392: }
393: }
394: /* Thread transpose */
395: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
396: " }\n\n"
397: " /* ==== TRANSPOSE THREADS ==== */\n"
398: " barrier(CLK_LOCAL_MEM_FENCE);\n\n",
399: &count));
400: /* Basis phase */
401: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
402: " /* Map values at quadrature points to coefficients */\n"
403: " for (int c = 0; c < N_sbc; ++c) {\n"
404: " const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n"
405: "\n"
406: " e_i = 0.0;\n"
407: " for (int q = 0; q < N_q; ++q) {\n"
408: " const int pidx = q*N_bt + bidx;\n"
409: " const int fidx = (cell*N_q + q)*N_comp + cidx;\n"
410: " %s%d realSpaceDer;\n\n",
411: &count, numeric_str, dim));
413: if (useF0) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," e_i += phi_i[pidx]*f_0[fidx];\n", &count);
414: if (useF1) {
415: switch (dim) {
416: case 2:
417: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
418: " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
419: " e_i += realSpaceDer.x*f_1[fidx].x;\n"
420: " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
421: " e_i += realSpaceDer.y*f_1[fidx].y;\n",
422: &count));break;
423: case 3:
424: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
425: " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
426: " e_i += realSpaceDer.x*f_1[fidx].x;\n"
427: " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
428: " e_i += realSpaceDer.y*f_1[fidx].y;\n"
429: " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
430: " e_i += realSpaceDer.z*f_1[fidx].z;\n",
431: &count));break;
432: }
433: }
434: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
435: " }\n"
436: " /* Write element vector for N_{cbc} cells at a time */\n"
437: " elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n"
438: " }\n"
439: " /* ==== Could do one write per batch ==== */\n"
440: " }\n"
441: " return;\n"
442: "}\n",
443: &count));
444: return 0;
445: }
447: static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel)
448: {
449: PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
450: PetscInt dim, N_bl;
451: PetscBool flg;
452: char *buffer;
453: size_t len;
454: char errMsg[8192];
455: cl_int err;
456: PetscErrorCode ierr;
458: PetscFEGetSpatialDimension(fem, &dim);
459: PetscMalloc1(8192, &buffer);
460: PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL);
461: PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl);
462: PetscOptionsHasName(((PetscObject)fem)->options,((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg);
463: if (flg) PetscPrintf(PetscObjectComm((PetscObject) fem), "OpenCL FE Integration Kernel:\n%s\n", buffer);
464: PetscStrlen(buffer,&len);
465: *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **) &buffer, &len, &err);err;
466: err = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL);
467: if (err != CL_SUCCESS) {
468: err = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192*sizeof(char), &errMsg, NULL);
469: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg);
470: }
471: PetscFree(buffer);
472: *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &ierr);
473: return 0;
474: }
476: static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z)
477: {
478: const PetscInt Nblocks = N/blockSize;
481: *z = 1;
482: *y = 1;
483: for (*x = (size_t) (PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) {
484: *y = Nblocks / *x;
485: if (*x * *y == (size_t)Nblocks) break;
486: }
488: return 0;
489: }
491: static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops)
492: {
493: PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
494: PetscStageLog stageLog;
495: PetscEventPerfLog eventLog = NULL;
496: int stage;
498: PetscLogGetStageLog(&stageLog);
499: PetscStageLogGetCurrent(stageLog, &stage);
500: PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
501: /* Log performance info */
502: eventLog->eventInfo[ocl->residualEvent].count++;
503: eventLog->eventInfo[ocl->residualEvent].time += time;
504: eventLog->eventInfo[ocl->residualEvent].flops += flops;
505: return 0;
506: }
508: static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscFormKey key, PetscInt Ne, PetscFEGeom *cgeom,
509: const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[])
510: {
511: /* Nbc = batchSize */
512: PetscFE fem;
513: PetscFE_OpenCL *ocl;
514: PetscPointFunc f0_func;
515: PetscPointFunc f1_func;
516: PetscQuadrature q;
517: PetscInt dim, qNc;
518: PetscInt N_b; /* The number of basis functions */
519: PetscInt N_comp; /* The number of basis function components */
520: PetscInt N_bt; /* The total number of scalar basis functions */
521: PetscInt N_q; /* The number of quadrature points */
522: PetscInt N_bst; /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
523: PetscInt N_t; /* The number of threads, N_bst * N_bl */
524: PetscInt N_bl; /* The number of blocks */
525: PetscInt N_bc; /* The batch size, N_bl*N_q*N_b */
526: PetscInt N_cb; /* The number of batches */
527: const PetscInt field = key.field;
528: PetscInt numFlops, f0Flops = 0, f1Flops = 0;
529: PetscBool useAux = probAux ? PETSC_TRUE : PETSC_FALSE;
530: PetscBool useField = PETSC_FALSE;
531: PetscBool useFieldDer = PETSC_TRUE;
532: PetscBool useF0 = PETSC_TRUE;
533: PetscBool useF1 = PETSC_TRUE;
534: /* OpenCL variables */
535: cl_program ocl_prog;
536: cl_kernel ocl_kernel;
537: cl_event ocl_ev; /* The event for tracking kernel execution */
538: cl_ulong ns_start; /* Nanoseconds counter on GPU at kernel start */
539: cl_ulong ns_end; /* Nanoseconds counter on GPU at kernel stop */
540: cl_mem o_jacobianInverses, o_jacobianDeterminants;
541: cl_mem o_coefficients, o_coefficientsAux, o_elemVec;
542: float *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL;
543: double *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL;
544: PetscReal *r_invJ = NULL, *r_detJ = NULL;
545: void *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ;
546: size_t local_work_size[3], global_work_size[3];
547: size_t realSize, x, y, z;
548: const PetscReal *points, *weights;
549: PetscErrorCode ierr;
551: PetscDSGetDiscretization(prob, field, (PetscObject *) &fem);
552: ocl = (PetscFE_OpenCL *) fem->data;
553: if (!Ne) {PetscFEOpenCLLogResidual(fem, 0.0, 0.0)); PetscFunctionReturn(0;}
554: PetscFEGetSpatialDimension(fem, &dim);
555: PetscFEGetQuadrature(fem, &q);
556: PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);
558: PetscFEGetDimension(fem, &N_b);
559: PetscFEGetNumComponents(fem, &N_comp);
560: PetscDSGetResidual(prob, field, &f0_func, &f1_func);
561: PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb);
562: N_bt = N_b*N_comp;
563: N_bst = N_bt*N_q;
564: N_t = N_bst*N_bl;
566: /* Calculate layout */
567: if (Ne % (N_cb*N_bc)) { /* Remainder cells */
568: PetscFEIntegrateResidual_Basic(prob, key, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec);
569: return 0;
570: }
571: PetscFEOpenCLCalculateGrid(fem, Ne, N_cb*N_bc, &x, &y, &z);
572: local_work_size[0] = N_bc*N_comp;
573: local_work_size[1] = 1;
574: local_work_size[2] = 1;
575: global_work_size[0] = x * local_work_size[0];
576: global_work_size[1] = y * local_work_size[1];
577: global_work_size[2] = z * local_work_size[2];
578: PetscInfo(fem, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb);
579: PetscInfo(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb);
580: /* Generate code */
581: if (probAux) {
582: PetscSpace P;
583: PetscInt NfAux, order, f;
585: PetscDSGetNumFields(probAux, &NfAux);
586: for (f = 0; f < NfAux; ++f) {
587: PetscFE feAux;
589: PetscDSGetDiscretization(probAux, f, (PetscObject *) &feAux);
590: PetscFEGetBasisSpace(feAux, &P);
591: PetscSpaceGetDegree(P, &order, NULL);
593: }
594: }
595: PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel);
596: /* Create buffers on the device and send data over */
597: PetscDataTypeGetSize(ocl->realType, &realSize);
599: if (sizeof(PetscReal) != realSize) {
600: switch (ocl->realType) {
601: case PETSC_FLOAT:
602: {
603: PetscInt c, b, d;
605: PetscMalloc4(Ne*N_bt,&f_coeff,Ne,&f_coeffAux,Ne*dim*dim,&f_invJ,Ne,&f_detJ);
606: for (c = 0; c < Ne; ++c) {
607: f_detJ[c] = (float) cgeom->detJ[c];
608: for (d = 0; d < dim*dim; ++d) {
609: f_invJ[c*dim*dim+d] = (float) cgeom->invJ[c * dim * dim + d];
610: }
611: for (b = 0; b < N_bt; ++b) {
612: f_coeff[c*N_bt+b] = (float) coefficients[c*N_bt+b];
613: }
614: }
615: if (coefficientsAux) { /* Assume P0 */
616: for (c = 0; c < Ne; ++c) {
617: f_coeffAux[c] = (float) coefficientsAux[c];
618: }
619: }
620: oclCoeff = (void *) f_coeff;
621: if (coefficientsAux) {
622: oclCoeffAux = (void *) f_coeffAux;
623: } else {
624: oclCoeffAux = NULL;
625: }
626: oclInvJ = (void *) f_invJ;
627: oclDetJ = (void *) f_detJ;
628: }
629: break;
630: case PETSC_DOUBLE:
631: {
632: PetscInt c, b, d;
634: PetscMalloc4(Ne*N_bt,&d_coeff,Ne,&d_coeffAux,Ne*dim*dim,&d_invJ,Ne,&d_detJ);
635: for (c = 0; c < Ne; ++c) {
636: d_detJ[c] = (double) cgeom->detJ[c];
637: for (d = 0; d < dim*dim; ++d) {
638: d_invJ[c*dim*dim+d] = (double) cgeom->invJ[c * dim * dim + d];
639: }
640: for (b = 0; b < N_bt; ++b) {
641: d_coeff[c*N_bt+b] = (double) coefficients[c*N_bt+b];
642: }
643: }
644: if (coefficientsAux) { /* Assume P0 */
645: for (c = 0; c < Ne; ++c) {
646: d_coeffAux[c] = (double) coefficientsAux[c];
647: }
648: }
649: oclCoeff = (void *) d_coeff;
650: if (coefficientsAux) {
651: oclCoeffAux = (void *) d_coeffAux;
652: } else {
653: oclCoeffAux = NULL;
654: }
655: oclInvJ = (void *) d_invJ;
656: oclDetJ = (void *) d_detJ;
657: }
658: break;
659: default:
660: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
661: }
662: } else {
663: PetscInt c, d;
665: PetscMalloc2(Ne*dim*dim,&r_invJ,Ne,&r_detJ);
666: for (c = 0; c < Ne; ++c) {
667: r_detJ[c] = cgeom->detJ[c];
668: for (d = 0; d < dim*dim; ++d) {
669: r_invJ[c*dim*dim+d] = cgeom->invJ[c * dim * dim + d];
670: }
671: }
672: oclCoeff = (void *) coefficients;
673: oclCoeffAux = (void *) coefficientsAux;
674: oclInvJ = (void *) r_invJ;
675: oclDetJ = (void *) r_detJ;
676: }
677: o_coefficients = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*N_bt * realSize, oclCoeff, &ierr);
678: if (coefficientsAux) {
679: o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclCoeffAux, &ierr);
680: } else {
681: o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY, Ne * realSize, oclCoeffAux, &ierr);
682: }
683: o_jacobianInverses = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * realSize, oclInvJ, &ierr);
684: o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclDetJ, &ierr);
685: o_elemVec = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY, Ne*N_bt * realSize, NULL, &ierr);
686: /* Kernel launch */
687: clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*) &N_cb);
688: clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*) &o_coefficients);
689: clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*) &o_coefficientsAux);
690: clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*) &o_jacobianInverses);
691: clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*) &o_jacobianDeterminants);
692: clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void*) &o_elemVec);
693: clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);
694: /* Read data back from device */
695: if (sizeof(PetscReal) != realSize) {
696: switch (ocl->realType) {
697: case PETSC_FLOAT:
698: {
699: float *elem;
700: PetscInt c, b;
702: PetscFree4(f_coeff,f_coeffAux,f_invJ,f_detJ);
703: PetscMalloc1(Ne*N_bt, &elem);
704: clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);
705: for (c = 0; c < Ne; ++c) {
706: for (b = 0; b < N_bt; ++b) {
707: elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
708: }
709: }
710: PetscFree(elem);
711: }
712: break;
713: case PETSC_DOUBLE:
714: {
715: double *elem;
716: PetscInt c, b;
718: PetscFree4(d_coeff,d_coeffAux,d_invJ,d_detJ);
719: PetscMalloc1(Ne*N_bt, &elem);
720: clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);
721: for (c = 0; c < Ne; ++c) {
722: for (b = 0; b < N_bt; ++b) {
723: elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
724: }
725: }
726: PetscFree(elem);
727: }
728: break;
729: default:
730: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
731: }
732: } else {
733: PetscFree2(r_invJ,r_detJ);
734: clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elemVec, 0, NULL, NULL);
735: }
736: /* Log performance */
737: clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);
738: clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL);
739: f0Flops = 0;
740: switch (ocl->op) {
741: case LAPLACIAN:
742: f1Flops = useAux ? dim : 0;break;
743: case ELASTICITY:
744: f1Flops = 2*dim*dim;break;
745: }
746: numFlops = Ne*(
747: N_q*(
748: N_b*N_comp*((useField ? 2 : 0) + (useFieldDer ? 2*dim*(dim + 1) : 0))
749: /*+
750: N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/
751: +
752: N_comp*((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2*dim : 0)))
753: +
754: N_b*((useF0 ? 2 : 0) + (useF1 ? 2*dim*(dim + 1) : 0)));
755: PetscFEOpenCLLogResidual(fem, (ns_end - ns_start)*1.0e-9, numFlops);
756: /* Cleanup */
757: clReleaseMemObject(o_coefficients);
758: clReleaseMemObject(o_coefficientsAux);
759: clReleaseMemObject(o_jacobianInverses);
760: clReleaseMemObject(o_jacobianDeterminants);
761: clReleaseMemObject(o_elemVec);
762: clReleaseKernel(ocl_kernel);
763: clReleaseProgram(ocl_prog);
764: return 0;
765: }
767: PETSC_INTERN PetscErrorCode PetscFESetUp_Basic(PetscFE);
768: PETSC_INTERN PetscErrorCode PetscFECreateTabulation_Basic(PetscFE, PetscInt, const PetscReal [], PetscInt, PetscTabulation);
770: static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem)
771: {
772: fem->ops->setfromoptions = NULL;
773: fem->ops->setup = PetscFESetUp_Basic;
774: fem->ops->view = NULL;
775: fem->ops->destroy = PetscFEDestroy_OpenCL;
776: fem->ops->getdimension = PetscFEGetDimension_Basic;
777: fem->ops->createtabulation = PetscFECreateTabulation_Basic;
778: fem->ops->integrateresidual = PetscFEIntegrateResidual_OpenCL;
779: fem->ops->integratebdresidual = NULL/* PetscFEIntegrateBdResidual_OpenCL */;
780: fem->ops->integratejacobianaction = NULL/* PetscFEIntegrateJacobianAction_OpenCL */;
781: fem->ops->integratejacobian = PetscFEIntegrateJacobian_Basic;
782: return 0;
783: }
785: /*MC
786: PETSCFEOPENCL = "opencl" - A PetscFE object that integrates using a vectorized OpenCL implementation
788: Level: intermediate
790: .seealso: PetscFEType, PetscFECreate(), PetscFESetType()
791: M*/
793: PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem)
794: {
795: PetscFE_OpenCL *ocl;
796: cl_uint num_platforms;
797: cl_platform_id platform_ids[42];
798: cl_uint num_devices;
799: cl_device_id device_ids[42];
800: cl_int err;
803: PetscNewLog(fem,&ocl);
804: fem->data = ocl;
806: /* Init Platform */
807: clGetPlatformIDs(42, platform_ids, &num_platforms);
809: ocl->pf_id = platform_ids[0];
810: /* Init Device */
811: clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices);
813: ocl->dev_id = device_ids[0];
814: /* Create context with one command queue */
815: ocl->ctx_id = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &err);err;
816: ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &err);err;
817: /* Types */
818: ocl->realType = PETSC_FLOAT;
819: /* Register events */
820: PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent);
821: /* Equation handling */
822: ocl->op = LAPLACIAN;
824: PetscFEInitialize_OpenCL(fem);
825: return 0;
826: }
828: /*@
829: PetscFEOpenCLSetRealType - Set the scalar type for running on the accelerator
831: Input Parameters:
832: + fem - The PetscFE
833: - realType - The scalar type
835: Level: developer
837: .seealso: PetscFEOpenCLGetRealType()
838: @*/
839: PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType)
840: {
841: PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
844: ocl->realType = realType;
845: return 0;
846: }
848: /*@
849: PetscFEOpenCLGetRealType - Get the scalar type for running on the accelerator
851: Input Parameter:
852: . fem - The PetscFE
854: Output Parameter:
855: . realType - The scalar type
857: Level: developer
859: .seealso: PetscFEOpenCLSetRealType()
860: @*/
861: PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType)
862: {
863: PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
867: *realType = ocl->realType;
868: return 0;
869: }
871: #endif /* PETSC_HAVE_OPENCL */