Actual source code: mpmpishm.c
#include <petscsys.h>
#include <petsc/private/petscimpl.h>
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared. They should have the same effect except that MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found out, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  #include <sys/mman.h>
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool          is_omp_master;   /* true if this rank is rank 0 (the master) of omp_comm */
  MPI_Win            omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;         /* pointer to the barrier */
  hwloc_topology_t   topology;
  hwloc_cpuset_t     cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t     omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
*/
static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  MPI_Aint              size;
  void                 *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  int  fd;
  char pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp(), since it is collective on the given communicator; using omp_comm would bcast the partially populated pathname to the slaves */
    PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
    PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd = mkstemp(pathname);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
    PetscCallExternal(ftruncate, fd, size);
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCallExternal(close, fd);
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
    PetscCallExternal(unlink, pathname);
  } else {
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    fd = open(pathname, O_RDWR);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCallExternal(close, fd);
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  }
#else
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
  PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
#endif
  ctrl->barrier = (pthread_barrier_t *)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
    PetscCallExternal(pthread_barrierattr_init, &attr);
    PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
    PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
    PetscCallExternal(pthread_barrierattr_destroy, &attr);
  }

  /* this MPI_Barrier makes sure the pthread barrier is initialized before slaves use it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* this MPI_Barrier makes sure slaves have finished using the pthread barrier before the master destroys it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
#else
  PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

  Input Parameters:
+ petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
- nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

  Output Parameter:
. pctrl - a PETSc OpenMP controller
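
  Example Usage:
  A sketch of the typical lifecycle; `ThirdPartyOmpCompute()` is a hypothetical placeholder for a routine from an OpenMP-enabled third party library:
.vb
  PetscOmpCtrl ctrl;
  MPI_Comm     omp_comm, omp_master_comm;
  PetscBool    is_omp_master;

  PetscCall(PetscOmpCtrlCreate(PETSC_COMM_WORLD, -1, &ctrl)); /* -1: let PETSc choose nthreads */
  PetscCall(PetscOmpCtrlGetOmpComms(ctrl, &omp_comm, &omp_master_comm, &is_omp_master));
  if (is_omp_master) {
    PetscCall(PetscOmpCtrlOmpRegionOnMasterBegin(ctrl)); /* bind the master to the union cpuset and raise the thread count */
    ThirdPartyOmpCompute(omp_master_comm);               /* placeholder for the third party OpenMP computation */
    PetscCall(PetscOmpCtrlOmpRegionOnMasterEnd(ctrl));   /* restore the binding and go back to 1 thread */
  }
  PetscCall(PetscOmpCtrlBarrier(ctrl)); /* slaves sleep on the pthread barrier while masters run OpenMP */
  PetscCall(PetscOmpCtrlDestroy(&ctrl));
.ve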

  Level: developer

  Developer Note:
  Possibly use the variable `PetscNumOMPThreads` to determine the number of threads to use

.seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
{
  PetscOmpCtrl   ctrl;
  unsigned long *cpu_ulongs = NULL;
  PetscInt       i, nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
  PetscInt       num_packages, num_cores;

  PetscFunctionBegin;
  PetscCall(PetscNew(&ctrl));

  /*=================================================================================
    Init hwloc
    ==================================================================================*/
  PetscCallExternal(hwloc_topology_init, &ctrl->topology);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have faster hwloc_topology_load */
  PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
  PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
#endif
  PetscCallExternal(hwloc_topology_load, ctrl->topology);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    others are called slaves. OMP Masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
    ==================================================================================*/

  /* fetch the stored shared memory communicator */
  PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
  PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

  PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
  PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

  /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
  if (nthreads == -1) {
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
    nthreads     = num_cores / num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
  if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));

  /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if
     shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as the key so that the rank ordering won't change in the new comm.
   */
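  /* With an uneven split, e.g., shm_comm_size=16 and nthreads=6, ranks 0~5 get color 0, 6~11 get color 1
     and 12~15 get color 2, i.e., omp_comms of sizes 6, 6 and 4; the master of the last omp_comm then runs
     fewer threads than the others, which is the load imbalance warned about above. */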
  color = shm_rank / nthreads;
  PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

  /* put rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
  PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE; /* master */
    color               = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE;  /* slave */
    color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
    and run them on the idle CPUs.
    ==================================================================================*/
  PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

  /*=================================================================================
    omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc();
  PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
  PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);

  /* The hwloc main developer said new APIs hwloc_bitmap_{nr,to,from}_ulongs will be added in 2.1 to help simplify the following bitmap pack/unpack code */
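  /* nr_cpu_ulongs is the number of unsigned longs needed to hold all bits up to the highest bit set in the
     topology cpuset, i.e., ceil((last bit index + 1)/(bits per unsigned long)); e.g., with 64-bit unsigned
     long and last bit index 255, nr_cpu_ulongs = (255 + 64)/64 = 4 */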
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
  PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
  }

  PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc();
    PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
#endif
    } else {
      for (i = 0; i < nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
#endif
      }
    }
  }
  PetscCall(PetscFree(cpu_ulongs));
  *pctrl = ctrl;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

  Input Parameter:
. pctrl - a PETSc OpenMP controller

  Level: developer

.seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscOmpCtrl ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
  PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
  }
  PetscCall(PetscFree(ctrl));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

  Input Parameter:
. ctrl - a PETSc OMP controller

  Output Parameters:
+ omp_comm        - a communicator that includes a master rank and slave ranks, where the master spawns threads
. omp_master_comm - on master ranks, a communicator that includes the master ranks of each omp_comm;
                    on slave ranks, `MPI_COMM_NULL` is returned
- is_omp_master   - true if the calling process is an OMP master rank

  Note:
  Any output parameter can be `NULL`; such a parameter is simply ignored.
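
  Example Usage:
  A minimal sketch of querying the controller; the variable names are local to the example, not part of the API:
.vb
  MPI_Comm  omp_comm, omp_master_comm;
  PetscBool is_omp_master;

  PetscCall(PetscOmpCtrlGetOmpComms(ctrl, &omp_comm, &omp_master_comm, &is_omp_master));
  if (is_omp_master) {
    /* omp_master_comm is a valid communicator here; hand it to the third party library */
  } else {
    /* on slave ranks omp_master_comm is MPI_COMM_NULL */
  }
.ve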

  Level: developer

.seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm) *omp_comm = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

  Input Parameter:
. ctrl - a PETSc OMP controller

  Notes:
  This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct, but the MPI standard does not
  require that processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minimize latency,
  MPI ranks stuck in `MPI_Barrier()` keep polling and do not free their CPUs. In contrast, pthread_barrier has this requirement.

  Code using `PetscOmpCtrlBarrier()` typically looks like this
.vb
  if (is_omp_master) {
    PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
    /* call the library using OpenMP */
    PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
  }
  PetscOmpCtrlBarrier(ctrl);
.ve

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  int err;

  PetscFunctionBegin;
  err = pthread_barrier_wait(ctrl->barrier);
  PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

  Input Parameter:
. ctrl - a PETSc OMP controller

  Note:
  Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
  This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime.

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

  Input Parameter:
. ctrl - a PETSc OMP controller

  Note:
  Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
  This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
  omp_set_num_threads(1);
  PetscFunctionReturn(PETSC_SUCCESS);
}

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY