Actual source code: dalocalizationletkf.kokkos.cxx
1: #include <petsc.h>
2: #include <petscmat.h>
3: #include <petsc_kokkos.hpp>
5: typedef struct {
6: PetscReal distance;
7: PetscInt obs_index;
8: } DistObsPair;
10: KOKKOS_INLINE_FUNCTION
11: static PetscReal GaspariCohn(PetscReal distance, PetscReal radius)
12: {
13: if (radius <= 0.0) return 0.0;
14: const PetscReal r = distance / radius;
16: if (r >= 2.0) return 0.0;
18: const PetscReal r2 = r * r;
19: const PetscReal r3 = r2 * r;
20: const PetscReal r4 = r3 * r;
21: const PetscReal r5 = r4 * r;
23: if (r <= 1.0) {
24: // Region [0, 1]
25: return -0.25 * r5 + 0.5 * r4 + 0.625 * r3 - (5.0 / 3.0) * r2 + 1.0;
26: } else {
27: // Region [1, 2]
28: return (1.0 / 12.0) * r5 - 0.5 * r4 + 0.625 * r3 + (5.0 / 3.0) * r2 - 5.0 * r + 4.0 - (2.0 / 3.0) / r;
29: }
30: }
32: #define RADIUS_FACTOR 1.1
34: template <class ViewType>
35: struct RadiusStatsFunctor {
36: ViewType best_dists;
37: PetscInt n_obs_vertex;
39: struct value_type {
40: double sum, sq_sum;
41: };
43: KOKKOS_INLINE_FUNCTION void operator()(const PetscInt i, value_type &update) const
44: {
45: PetscReal r2 = best_dists(i, n_obs_vertex - 1);
46: PetscReal r = std::sqrt(r2);
47: r *= RADIUS_FACTOR;
48: if (r == 0.0) r = 1.0;
49: update.sum += r;
50: update.sq_sum += r * r;
51: }
53: KOKKOS_INLINE_FUNCTION void init(value_type &update) const
54: {
55: update.sum = 0.0;
56: update.sq_sum = 0.0;
57: }
59: KOKKOS_INLINE_FUNCTION void join(value_type &dest, const value_type &src) const
60: {
61: dest.sum += src.sum;
62: dest.sq_sum += src.sq_sum;
63: }
64: };
66: /*@
67: PetscDALETKFGetLocalizationMatrix - Compute localization weight matrix for LETKF [move to ml/da/interface]
69: Collective
71: Input Parameters:
72: + n_obs_vertex - Number of observations to localize to per vertex
73: . n_dof - Number of degrees of freedom
74: . Vecxyz - Array of vectors containing the vertex coordinates
75: . bd - Array of boundary extents per dimension (used for periodicity)
76: - H - Observation operator matrix
78: Output Parameter:
79: . Q - Localization weight matrix (sparse, AIJ format)
81: Level: intermediate
83: Notes:
84: The output matrix Q has dimensions (n_vert_global x n_obs_global) where
85: n_vert_global is the number of vertices in the DMPlex. Each row contains
86: exactly n_obs_vertex non-zero entries corresponding to the nearest
87: observations, weighted by the Gaspari-Cohn fifth-order piecewise
88: rational function.
90: The observation locations are computed as H * V where V is the vector
91: of vertex coordinates. The localization weights ensure smooth tapering
92: of observation influence with distance.
94: Kokkos is required for this routine.
96: .seealso: [](ch_da), `PetscDALETKFSetLocalization()`
97: @*/
98: PetscErrorCode PetscDALETKFGetLocalizationMatrix(const PetscInt n_obs_vertex, const PetscInt n_dof, Vec Vecxyz[3], PetscReal bd[3], Mat H, Mat *Q)
99: {
100: PetscInt dim = 0, n_vert_local, d, n_obs_global, n_obs_local;
101: Vec *obs_vecs;
102: MPI_Comm comm;
104: PetscFunctionBegin;
106: PetscAssertPointer(Q, 6);
108: PetscCall(PetscKokkosInitializeCheck());
109: PetscCall(PetscObjectGetComm((PetscObject)H, &comm));
110: PetscCall(MatGetLocalSize(H, &n_obs_local, NULL));
111: PetscCall(MatGetSize(H, &n_obs_global, NULL));
112: /* Infer dim from the number of vectors in Vecxyz */
113: for (d = 0; d < 3; ++d) {
114: if (Vecxyz[d]) dim++;
115: else break;
116: }
117: PetscCall(VecGetLocalSize(Vecxyz[0], &n_vert_local));
119: /* Check H dimensions */
120: // If n_obs_global < n_obs_vertex, we will pad with -1 indices and 0.0 weights. ???
121: // This is not an error condition, but rather a case where we have fewer observations than requested neighbors.
122: PetscCheck(dim > 0, comm, PETSC_ERR_ARG_WRONG, "Dim must be > 0");
123: PetscCheck(n_obs_vertex > 0 && n_obs_vertex <= n_obs_global, comm, PETSC_ERR_ARG_WRONG, "n_obs_vertex must be > 0 and <= n_obs_global");
125: /* Allocate storage for observation locations */
126: PetscCall(PetscMalloc1(dim, &obs_vecs));
128: /* Compute observation locations per dimension */
129: for (d = 0; d < dim; ++d) {
130: PetscCall(MatCreateVecs(H, NULL, &obs_vecs[d]));
131: PetscCall(MatMult(H, Vecxyz[d], obs_vecs[d]));
132: }
134: /* Create output matrix Q in N/n_dof x P */
135: PetscCall(MatCreate(comm, Q));
136: PetscCall(MatSetSizes(*Q, n_vert_local, n_obs_local, PETSC_DETERMINE, n_obs_global));
137: PetscCall(MatSetType(*Q, MATAIJ));
138: PetscCall(MatSeqAIJSetPreallocation(*Q, n_obs_vertex, NULL));
139: PetscCall(MatMPIAIJSetPreallocation(*Q, n_obs_vertex, NULL, n_obs_vertex, NULL));
140: PetscCall(MatSetFromOptions(*Q));
141: PetscCall(MatSetUp(*Q));
143: PetscCall(PetscInfo((PetscObject)*Q, "Computing LETKF localization matrix: %" PetscInt_FMT " vertices, %" PetscInt_FMT " observations, %" PetscInt_FMT " observations per vertex\n", n_vert_local, n_obs_global, n_obs_vertex));
145: /* Prepare Kokkos Views */
146: using ExecSpace = Kokkos::DefaultExecutionSpace;
147: using MemSpace = ExecSpace::memory_space;
149: /* Vertex Coordinates */
150: // Use LayoutLeft for coalesced access on GPU (i is contiguous)
151: Kokkos::View<PetscScalar **, Kokkos::LayoutLeft, MemSpace> vertex_coords_dev("vertex_coords", n_vert_local, dim);
152: {
153: // Host view must match the data layout from VecGetArray (d-major, i-minor implies LayoutLeft for (i,d) view)
154: Kokkos::View<PetscScalar **, Kokkos::LayoutLeft, Kokkos::HostSpace> vertex_coords_host("vertex_coords_host", n_vert_local, dim);
155: for (d = 0; d < dim; ++d) {
156: const PetscScalar *local_coords_array;
157: PetscCall(VecGetArrayRead(Vecxyz[d], &local_coords_array));
158: // Copy data. Since vertex_coords_host is LayoutLeft, &vertex_coords_host(0, d) is the start of column d.
159: for (PetscInt i = 0; i < n_vert_local; ++i) vertex_coords_host(i, d) = local_coords_array[i];
160: PetscCall(VecRestoreArrayRead(Vecxyz[d], &local_coords_array));
161: }
162: Kokkos::deep_copy(vertex_coords_dev, vertex_coords_host);
163: }
165: /* Observation Coordinates */
166: Kokkos::View<PetscReal **, Kokkos::LayoutRight, MemSpace> obs_coords_dev("obs_coords", n_obs_global, dim);
167: {
168: Kokkos::View<PetscReal **, Kokkos::LayoutRight, Kokkos::HostSpace> obs_coords_host("obs_coords_host", n_obs_global, dim);
169: for (d = 0; d < dim; ++d) {
170: VecScatter ctx;
171: Vec seq_vec;
172: const PetscScalar *array;
174: PetscCall(VecScatterCreateToAll(obs_vecs[d], &ctx, &seq_vec));
175: PetscCall(VecScatterBegin(ctx, obs_vecs[d], seq_vec, INSERT_VALUES, SCATTER_FORWARD));
176: PetscCall(VecScatterEnd(ctx, obs_vecs[d], seq_vec, INSERT_VALUES, SCATTER_FORWARD));
178: PetscCall(VecGetArrayRead(seq_vec, &array));
179: for (PetscInt j = 0; j < n_obs_global; ++j) obs_coords_host(j, d) = PetscRealPart(array[j]);
180: PetscCall(VecRestoreArrayRead(seq_vec, &array));
181: PetscCall(VecScatterDestroy(&ctx));
182: PetscCall(VecDestroy(&seq_vec));
183: }
184: Kokkos::deep_copy(obs_coords_dev, obs_coords_host);
185: }
187: PetscInt rstart;
188: PetscCall(VecGetOwnershipRange(Vecxyz[0], &rstart, NULL));
190: /* Output Views */
191: // LayoutLeft for coalesced access on GPU
192: Kokkos::View<PetscInt **, Kokkos::LayoutLeft, MemSpace> indices_dev("indices", n_vert_local, n_obs_vertex);
193: Kokkos::View<PetscScalar **, Kokkos::LayoutLeft, MemSpace> values_dev("values", n_vert_local, n_obs_vertex);
195: /* Temporary storage for top-k per vertex */
196: // LayoutLeft for coalesced access on GPU.
197: // Note: For the insertion sort within a thread, LayoutRight would offer better cache locality for the thread's private list.
198: // However, LayoutLeft is preferred for coalesced access across threads during the final weight computation and initialization.
199: // Given the random access nature of the sort (divergence), we stick to the default GPU layout (Left).
200: Kokkos::View<PetscReal **, Kokkos::LayoutLeft, MemSpace> best_dists_dev("best_dists", n_vert_local, n_obs_vertex);
201: Kokkos::View<PetscInt **, Kokkos::LayoutLeft, MemSpace> best_idxs_dev("best_idxs", n_vert_local, n_obs_vertex);
203: /* Copy boundary data to device */
204: Kokkos::View<PetscReal *, MemSpace> bd_dev("bd_dev", dim);
205: {
206: Kokkos::View<PetscReal *, Kokkos::HostSpace> bd_host("bd_host", dim);
207: for (PetscInt d = 0; d < dim; ++d) bd_host(d) = bd[d];
208: Kokkos::deep_copy(bd_dev, bd_host);
209: }
211: /* Main Kernel */
212: Kokkos::parallel_for(
213: "ComputeLocalization", Kokkos::RangePolicy<ExecSpace>(0, n_vert_local), KOKKOS_LAMBDA(const PetscInt i) {
214: PetscReal current_max_dist = PETSC_MAX_REAL;
216: // Cache vertex coordinates in registers to avoid repeated global memory access
217: // dim is small (<= 3), so this fits easily in registers
218: PetscReal v_coords[3] = {0.0, 0.0, 0.0};
219: for (PetscInt d = 0; d < dim; ++d) v_coords[d] = PetscRealPart(vertex_coords_dev(i, d));
221: // Initialize with infinity
222: for (PetscInt k = 0; k < n_obs_vertex; ++k) {
223: best_dists_dev(i, k) = PETSC_MAX_REAL;
224: best_idxs_dev(i, k) = -1;
225: }
227: // Iterate over all observations
228: for (PetscInt j = 0; j < n_obs_global; ++j) {
229: PetscReal dist2 = 0.0;
230: for (PetscInt d = 0; d < dim; ++d) {
231: PetscReal diff = v_coords[d] - obs_coords_dev(j, d);
232: if (bd_dev(d) != 0) { // Periodic boundary
233: PetscReal domain_size = bd_dev(d);
234: if (diff > 0.5 * domain_size) diff -= domain_size;
235: else if (diff < -0.5 * domain_size) diff += domain_size;
236: }
237: dist2 += diff * diff;
238: }
240: // Check if this observation is closer than the furthest stored observation
241: if (dist2 < current_max_dist) {
242: // Insert sorted
243: PetscInt pos = n_obs_vertex - 1;
244: while (pos > 0 && best_dists_dev(i, pos - 1) > dist2) {
245: best_dists_dev(i, pos) = best_dists_dev(i, pos - 1);
246: best_idxs_dev(i, pos) = best_idxs_dev(i, pos - 1);
247: pos--;
248: }
249: best_dists_dev(i, pos) = dist2;
250: best_idxs_dev(i, pos) = j;
252: // Update current max distance
253: current_max_dist = best_dists_dev(i, n_obs_vertex - 1);
254: }
255: }
256: // Compute weights
257: PetscReal radius2 = best_dists_dev(i, n_obs_vertex - 1);
258: PetscReal radius = std::sqrt(radius2);
259: radius *= RADIUS_FACTOR;
260: if (radius == 0.0) radius = 1.0;
262: for (PetscInt k = 0; k < n_obs_vertex; ++k) {
263: if (best_idxs_dev(i, k) != -1) {
264: PetscReal dist = std::sqrt(best_dists_dev(i, k));
265: indices_dev(i, k) = best_idxs_dev(i, k);
266: values_dev(i, k) = GaspariCohn(dist, radius);
267: } else {
268: indices_dev(i, k) = -1; // Ignore this entry
269: values_dev(i, k) = 0.0;
270: }
271: }
272: });
274: /* Copy back to host and fill matrix */
275: // Host views must be LayoutRight for MatSetValues (row-major)
276: Kokkos::View<PetscInt **, Kokkos::LayoutRight, Kokkos::HostSpace> indices_host("indices_host", n_vert_local, n_obs_vertex);
277: Kokkos::View<PetscScalar **, Kokkos::LayoutRight, Kokkos::HostSpace> values_host("values_host", n_vert_local, n_obs_vertex);
279: // Deep copy will handle layout conversion (transpose) if device views are LayoutLeft
280: // Note: Kokkos::deep_copy cannot copy between different layouts if the memory spaces are different (e.g. GPU to Host).
281: // We need an intermediate mirror view on the host with the same layout as the device view.
282: Kokkos::View<PetscInt **, Kokkos::LayoutLeft, Kokkos::HostSpace> indices_host_left = Kokkos::create_mirror_view(indices_dev);
283: Kokkos::View<PetscScalar **, Kokkos::LayoutLeft, Kokkos::HostSpace> values_host_left = Kokkos::create_mirror_view(values_dev);
285: Kokkos::deep_copy(indices_host_left, indices_dev);
286: Kokkos::deep_copy(values_host_left, values_dev);
288: // Now copy from LayoutLeft host view to LayoutRight host view
289: Kokkos::deep_copy(indices_host, indices_host_left);
290: Kokkos::deep_copy(values_host, values_host_left);
292: for (PetscInt i = 0; i < n_vert_local; ++i) {
293: PetscInt globalRow = rstart + i;
294: PetscCall(MatSetValues(*Q, 1, &globalRow, n_obs_vertex, &indices_host(i, 0), &values_host(i, 0), INSERT_VALUES));
295: }
297: /* Compute mean and std dev of localization radius */
298: {
299: using FunctorType = RadiusStatsFunctor<decltype(best_dists_dev)>;
300: typename FunctorType::value_type result;
301: Kokkos::parallel_reduce("ComputeRadiusStats", Kokkos::RangePolicy<ExecSpace>(0, n_vert_local), FunctorType{best_dists_dev, n_obs_vertex}, result);
303: if (n_vert_local > 0) {
304: double mean = result.sum / n_vert_local;
305: double var = (result.sq_sum / n_vert_local) - (mean * mean);
306: double stddev = (var > 1e-1 * PETSC_SQRT_MACHINE_EPSILON) ? std::sqrt(var) : 0.0;
307: PetscCall(PetscInfo((PetscObject)obs_vecs[0], "LETKF localization radius: mean %g, std dev %g\n", mean, stddev));
308: }
309: }
311: /* Cleanup storage */
312: for (d = 0; d < dim; ++d) PetscCall(VecDestroy(&obs_vecs[d]));
313: PetscCall(PetscFree(obs_vecs));
315: /* Assemble matrix */
316: PetscCall(MatAssemblyBegin(*Q, MAT_FINAL_ASSEMBLY));
317: PetscCall(MatAssemblyEnd(*Q, MAT_FINAL_ASSEMBLY));
318: PetscFunctionReturn(PETSC_SUCCESS);
319: }