Actual source code: mpishm.c
1: #include <petscsys.h>
2: #include <petsc/private/petscimpl.h>
4: struct _n_PetscShmComm {
5: PetscMPIInt *globranks; /* global ranks of each rank in the shared memory communicator */
6: PetscMPIInt shmsize; /* size of the shared memory communicator */
7: MPI_Comm globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
8: };
10: /*
11: Private routine to delete internal shared memory communicator when a communicator is freed.
13:   This is called by MPI, not by users; MPI_Comm_free() invokes it when the communicator that has this data as an attribute is freed.
15: Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
17: */
18: PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_DeleteFn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state)
19: {
20: PetscShmComm p = (PetscShmComm)val;
22: PetscFunctionBegin;
23: PetscCallMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in a MPI_Comm %ld\n", (long)comm));
24: PetscCallMPI(MPI_Comm_free(&p->shmcomm));
25: PetscCallMPI(PetscFree(p->globranks));
26: PetscCallMPI(PetscFree(val));
27: PetscFunctionReturn(MPI_SUCCESS);
28: }
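/* Usage sketch (editorial addition, not part of mpishm.c): PETSc creates Petsc_ShmComm_keyval
   during PetscInitialize(); the snippet below only illustrates the generic MPI pattern for
   registering a delete callback such as Petsc_ShmComm_Attr_DeleteFn. The keyval 'my_keyval'
   and the helper function are hypothetical. */
static PetscMPIInt my_keyval = MPI_KEYVAL_INVALID;

static PetscErrorCode RegisterShmAttrKeyvalSketch(void)
{
  PetscFunctionBegin;
  /* no copy callback: the attribute is not propagated when the communicator is duplicated */
  PetscCallMPI(MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN, Petsc_ShmComm_Attr_DeleteFn, &my_keyval, NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}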
30: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
31: /* Data structures to support freeing comms created in PetscShmCommGet().
32:    Since we expect communicators passed to PetscShmCommGet() to be either a petsc
33:    inner communicator or an MPI communicator with a linked petsc inner communicator,
34:    we use a simple static array to store the dupped communicators created in the
35:    rare remaining cases.
36: */
37: #define MAX_SHMCOMM_DUPPED_COMMS 16
38: static PetscInt num_dupped_comms = 0;
39: static MPI_Comm shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
40: static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
41: {
42: PetscInt i;
44: PetscFunctionBegin;
45: for (i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
46: num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
47: PetscFunctionReturn(PETSC_SUCCESS);
48: }
49: #endif
51: /*@C
52:   PetscShmCommGet - Returns a sub-communicator of all ranks that share common memory
54: Collective.
56: Input Parameter:
57: . globcomm - `MPI_Comm`, which can be a user `MPI_Comm` or a PETSc inner `MPI_Comm`
59: Output Parameter:
60: . pshmcomm - the PETSc shared memory communicator object
62: Level: developer
64: Note:
65: When used with MPICH, MPICH must be configured with `--download-mpich-device=ch3:nemesis`
67: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
68: @*/
69: PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
70: {
71: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
72: MPI_Group globgroup, shmgroup;
73: PetscMPIInt *shmranks, i, flg;
74: PetscCommCounter *counter;
76: PetscFunctionBegin;
77: PetscAssertPointer(pshmcomm, 2);
78: /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
79: PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
80: if (!flg) { /* globcomm is not a petsc comm */
81: union
82: {
83: MPI_Comm comm;
84: void *ptr;
85: } ucomm;
86: /* check if globcomm already has a linked petsc inner comm */
87: PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
88: if (!flg) {
89: /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
90: PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
91: PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
92:       /* the first time through, register a function to free the dupped petsc comms at PetscFinalize() */
93: if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
94: shmcomm_dupped_comms[num_dupped_comms] = globcomm;
95: num_dupped_comms++;
96: } else {
97: /* otherwise, we pull out the inner comm and use it as globcomm */
98: globcomm = ucomm.comm;
99: }
100: }
102:   /* Check if globcomm already has an attached pshmcomm. If not, create one */
103: PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
104: if (flg) PetscFunctionReturn(PETSC_SUCCESS);
106: PetscCall(PetscNew(pshmcomm));
107: (*pshmcomm)->globcomm = globcomm;
109: PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));
111: PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
112: PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
113: PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
114: PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
115: PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
116: for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
117: PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
118: PetscCall(PetscFree(shmranks));
119: PetscCallMPI(MPI_Group_free(&globgroup));
120: PetscCallMPI(MPI_Group_free(&shmgroup));
122: for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
123: PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
124: PetscFunctionReturn(PETSC_SUCCESS);
125: #else
126: SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
127: #endif
128: }
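/* Usage sketch (editorial addition, not part of mpishm.c): obtain the shared memory
   communicator associated with PETSC_COMM_WORLD and report how many ranks share memory
   with the calling rank. The returned MPI_Comm is owned by PETSc and must not be freed. */
static PetscErrorCode ShmCommGetSketch(void)
{
  PetscShmComm shmcomm;
  MPI_Comm     comm;
  PetscMPIInt  shmsize;

  PetscFunctionBegin;
  PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm));
  PetscCall(PetscShmCommGetMpiShmComm(shmcomm, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &shmsize));
  PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD, "I share memory with %d ranks (including myself)\n", shmsize));
  PetscCall(PetscSynchronizedFlush(PETSC_COMM_WORLD, PETSC_STDOUT));
  PetscFunctionReturn(PETSC_SUCCESS);
}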
130: /*@C
131:   PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator
133: Input Parameters:
134: + pshmcomm - the shared memory communicator object
135: - grank - the global rank
137: Output Parameter:
138: . lrank - the local rank, or `MPI_PROC_NULL` if it does not exist
140: Level: developer
142: Developer Notes:
143:   Assumes that pshmcomm->globranks[] is sorted
145: It may be better to rewrite this to map multiple global ranks to local in the same function call
147: .seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
148: @*/
149: PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
150: {
151: PetscMPIInt low, high, t, i;
152: PetscBool flg = PETSC_FALSE;
154: PetscFunctionBegin;
155: PetscAssertPointer(pshmcomm, 1);
156: PetscAssertPointer(lrank, 3);
157: *lrank = MPI_PROC_NULL;
158: if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(PETSC_SUCCESS);
159: if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(PETSC_SUCCESS);
160: PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
161: if (flg) PetscFunctionReturn(PETSC_SUCCESS);
162: low = 0;
163: high = pshmcomm->shmsize;
164: while (high - low > 5) {
165: t = (low + high) / 2;
166: if (pshmcomm->globranks[t] > grank) high = t;
167: else low = t;
168: }
169: for (i = low; i < high; i++) {
170: if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(PETSC_SUCCESS);
171: if (pshmcomm->globranks[i] == grank) {
172: *lrank = i;
173: PetscFunctionReturn(PETSC_SUCCESS);
174: }
175: }
176: PetscFunctionReturn(PETSC_SUCCESS);
177: }
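/* Usage sketch (editorial addition, not part of mpishm.c): test whether a given global rank
   lives on the same node as the calling rank; 'grank' is an arbitrary example input. */
static PetscErrorCode ShmCommGlobalToLocalSketch(PetscShmComm shmcomm, PetscMPIInt grank)
{
  PetscMPIInt lrank;

  PetscFunctionBegin;
  PetscCall(PetscShmCommGlobalToLocal(shmcomm, grank, &lrank));
  if (lrank == MPI_PROC_NULL) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Global rank %d does not share memory with me\n", grank));
  else PetscCall(PetscPrintf(PETSC_COMM_SELF, "Global rank %d is local rank %d on my node\n", grank, lrank));
  PetscFunctionReturn(PETSC_SUCCESS);
}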
179: /*@C
180:   PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank
182: Input Parameters:
183: + pshmcomm - the shared memory communicator object
184: - lrank - the local rank in the shared memory communicator
186: Output Parameter:
187: . grank - the global rank in the global communicator where the shared memory communicator is built
189: Level: developer
191: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
192: @*/
193: PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank)
194: {
195: PetscFunctionBegin;
196: PetscAssertPointer(pshmcomm, 1);
197: PetscAssertPointer(grank, 3);
198: PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
199: *grank = pshmcomm->globranks[lrank];
200: PetscFunctionReturn(PETSC_SUCCESS);
201: }
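/* Usage sketch (editorial addition, not part of mpishm.c): enumerate the global ranks of all
   ranks on the calling rank's node via the local-to-global map. */
static PetscErrorCode ShmCommListNodeRanksSketch(PetscShmComm shmcomm)
{
  MPI_Comm    comm;
  PetscMPIInt i, size, grank;

  PetscFunctionBegin;
  PetscCall(PetscShmCommGetMpiShmComm(shmcomm, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  for (i = 0; i < size; i++) {
    PetscCall(PetscShmCommLocalToGlobal(shmcomm, i, &grank));
    PetscCall(PetscPrintf(PETSC_COMM_SELF, "Local rank %d <-> global rank %d\n", i, grank));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}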
203: /*@C
204: PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
206: Input Parameter:
207: . pshmcomm - PetscShmComm object obtained with PetscShmCommGet()
209: Output Parameter:
210: . comm - the MPI communicator
212: Level: developer
214: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
215: @*/
216: PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm)
217: {
218: PetscFunctionBegin;
219: PetscAssertPointer(pshmcomm, 1);
220: PetscAssertPointer(comm, 2);
221: *comm = pshmcomm->shmcomm;
222: PetscFunctionReturn(PETSC_SUCCESS);
223: }
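/* Usage sketch (editorial addition, not part of mpishm.c): a typical use of the raw MPI shared
   memory communicator is to allocate a node-wide shared array with MPI-3 windows, guarded the
   same way as the MPI-3 code above. The element count 'n' is an arbitrary example value. */
#if defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY)
static PetscErrorCode ShmCommSharedArraySketch(PetscShmComm shmcomm)
{
  MPI_Comm    comm;
  MPI_Win     win;
  PetscMPIInt rank, disp_unit;
  MPI_Aint    n = 1024, sz;
  double     *base;

  PetscFunctionBegin;
  PetscCall(PetscShmCommGetMpiShmComm(shmcomm, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  /* local rank 0 allocates the whole segment; the other ranks allocate nothing and query rank 0's base address */
  PetscCallMPI(MPI_Win_allocate_shared(rank ? 0 : n * (MPI_Aint)sizeof(double), sizeof(double), MPI_INFO_NULL, comm, &base, &win));
  if (rank) PetscCallMPI(MPI_Win_shared_query(win, 0, &sz, &disp_unit, &base));
  /* ... every rank on the node may now read/write base[0..n-1], subject to its own synchronization ... */
  PetscCallMPI(MPI_Win_free(&win));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif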
225: #if defined(PETSC_HAVE_OPENMP_SUPPORT)
226: #include <pthread.h>
227: #include <hwloc.h>
228: #include <omp.h>
230: /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
231:    otherwise use MPI_Win_allocate_shared(). They should have the same effect, except that MPI-3 is much
232:    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
233:    by 50%. Until the reason is understood, we use mmap() instead.
234: */
235: #define USE_MMAP_ALLOCATE_SHARED_MEMORY
237: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
238: #include <sys/mman.h>
239: #include <sys/types.h>
240: #include <sys/stat.h>
241: #include <fcntl.h>
242: #endif
244: struct _n_PetscOmpCtrl {
245: MPI_Comm omp_comm; /* a shared memory communicator to spawn omp threads */
246: MPI_Comm omp_master_comm; /* a communicator to give to third party libraries */
247: PetscMPIInt omp_comm_size; /* size of omp_comm, a kind of OMP_NUM_THREADS */
248:   PetscBool is_omp_master; /* true on rank 0 of omp_comm (the master) */
249: MPI_Win omp_win; /* a shared memory window containing a barrier */
250: pthread_barrier_t *barrier; /* pointer to the barrier */
251: hwloc_topology_t topology;
252: hwloc_cpuset_t cpuset; /* cpu bindings of omp master */
253: hwloc_cpuset_t omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
254: };
256: /* Allocate and initialize a pthread_barrier_t object in memory shared by the processes
257:    in the controller's omp_comm.
259: PETSc OpenMP controller users do not call this function directly. This function exists
260: only because we want to separate shared memory allocation methods from other code.
261: */
262: static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
263: {
264: MPI_Aint size;
265: void *baseptr;
266: pthread_barrierattr_t attr;
268: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
269: int fd;
270: PetscChar pathname[PETSC_MAX_PATH_LEN];
271: #else
272: PetscMPIInt disp_unit;
273: #endif
275: PetscFunctionBegin;
276: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
277: size = sizeof(pthread_barrier_t);
278: if (ctrl->is_omp_master) {
279:     /* PetscGetTmp() is collective, so call it on PETSC_COMM_SELF; calling it on omp_comm would broadcast the not-yet-complete pathname to the slaves */
280: PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
281: PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
282: /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
283: fd = mkstemp(pathname);
284: PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
285: PetscCallExternal(ftruncate, fd, size);
286: baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
287: PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
288: PetscCallExternal(close, fd);
289: PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
290:     /* this MPI_Barrier is to wait for the slaves to open the file before the master unlinks it */
291: PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
292: PetscCallExternal(unlink, pathname);
293: } else {
294: PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
295: fd = open(pathname, O_RDWR);
296: PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
297: baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
298: PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
299: PetscCallExternal(close, fd);
300: PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
301: }
302: #else
303: size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
304: PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
305: PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
306: #endif
307: ctrl->barrier = (pthread_barrier_t *)baseptr;
309: /* omp master initializes the barrier */
310: if (ctrl->is_omp_master) {
311: PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
312: PetscCallExternal(pthread_barrierattr_init, &attr);
313: PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
314: PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
315: PetscCallExternal(pthread_barrierattr_destroy, &attr);
316: }
318: /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
319: PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
320: PetscFunctionReturn(PETSC_SUCCESS);
321: }
323: /* Destroy the pthread barrier in the PETSc OpenMP controller */
324: static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
325: {
326: PetscFunctionBegin;
327: /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
328: PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
329: if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);
331: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
332: PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
333: #else
334: PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
335: #endif
336: PetscFunctionReturn(PETSC_SUCCESS);
337: }
339: /*@C
340: PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP
342: Input Parameters:
343: + petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
344: - nthreads - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, PETSc decides a suitable value
346: Output Parameter:
347: . pctrl - a PETSc OpenMP controller
349: Level: developer
351: Developer Note:
352:   Possibly use the variable `PetscNumOMPThreads` to determine the number of threads to use
354: .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
355: @*/
356: PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
357: {
358: PetscOmpCtrl ctrl;
359: unsigned long *cpu_ulongs = NULL;
360: PetscInt i, nr_cpu_ulongs;
361: PetscShmComm pshmcomm;
362: MPI_Comm shm_comm;
363: PetscMPIInt shm_rank, shm_comm_size, omp_rank, color;
364: PetscInt num_packages, num_cores;
366: PetscFunctionBegin;
367: PetscCall(PetscNew(&ctrl));
369: /*=================================================================================
370: Init hwloc
371: ==================================================================================*/
372: PetscCallExternal(hwloc_topology_init, &ctrl->topology);
373: #if HWLOC_API_VERSION >= 0x00020000
374: /* to filter out unneeded info and have faster hwloc_topology_load */
375: PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
376: PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
377: #endif
378: PetscCallExternal(hwloc_topology_load, ctrl->topology);
380: /*=================================================================================
381: Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
382: physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
383: others are called slaves. OMP Masters make up a new comm called omp_master_comm,
384: which is usually passed to third party libraries.
385: ==================================================================================*/
387: /* fetch the stored shared memory communicator */
388: PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
389: PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));
391: PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
392: PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));
394:   /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
395: if (nthreads == -1) {
396: num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
397: num_cores = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
398: nthreads = num_cores / num_packages;
399: if (nthreads > shm_comm_size) nthreads = shm_comm_size;
400: }
402: PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
403: if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));
405:   /* split shm_comm into a set of omp_comms, each of size nthreads. For example, if
406:      shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
407:      color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
408:      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
409:      Use 0 as the key so that the rank ordering won't change in the new comm.
410:   */
411: color = shm_rank / nthreads;
412: PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));
414:   /* put the rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
415: PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
416: if (!omp_rank) {
417: ctrl->is_omp_master = PETSC_TRUE; /* master */
418: color = 0;
419: } else {
420: ctrl->is_omp_master = PETSC_FALSE; /* slave */
421: color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
422: }
423: PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));
425: /*=================================================================================
426: Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
427:     slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
428: and run them on the idle CPUs.
429: ==================================================================================*/
430: PetscCall(PetscOmpCtrlCreateBarrier(ctrl));
432: /*=================================================================================
433:     omp master records its cpu binding (i.e., cpu set) and computes a new binding that
434: is the union of the bindings of all ranks in the omp_comm
435: =================================================================================*/
437: ctrl->cpuset = hwloc_bitmap_alloc();
438: PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
439: PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
441:   /* the hwloc main developer said they would add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help simplify the following bitmap pack/unpack code */
442: nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
443: PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
444: if (nr_cpu_ulongs == 1) {
445: cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
446: } else {
447: for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
448: }
450: PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));
452: if (ctrl->is_omp_master) {
453: ctrl->omp_cpuset = hwloc_bitmap_alloc();
454: PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
455: if (nr_cpu_ulongs == 1) {
456: #if HWLOC_API_VERSION >= 0x00020000
457: PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
458: #else
459: hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
460: #endif
461: } else {
462: for (i = 0; i < nr_cpu_ulongs; i++) {
463: #if HWLOC_API_VERSION >= 0x00020000
464: PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
465: #else
466: hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
467: #endif
468: }
469: }
470: }
471: PetscCall(PetscFree(cpu_ulongs));
472: *pctrl = ctrl;
473: PetscFunctionReturn(PETSC_SUCCESS);
474: }
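/* Usage sketch (editorial addition, not part of mpishm.c): typical lifecycle of a PETSc OpenMP
   controller. 'ThirdPartyLibraryCompute()' is a hypothetical stand-in for a library routine
   that spawns OpenMP threads internally. */
static PetscErrorCode OmpCtrlLifecycleSketch(MPI_Comm petsc_comm)
{
  PetscOmpCtrl ctrl;
  PetscBool    is_master;

  PetscFunctionBegin;
  PetscCall(PetscOmpCtrlCreate(petsc_comm, -1 /* let PETSc pick nthreads */, &ctrl));
  PetscCall(PetscOmpCtrlGetOmpComms(ctrl, NULL, NULL, &is_master));
  if (is_master) {
    PetscCall(PetscOmpCtrlOmpRegionOnMasterBegin(ctrl)); /* widen CPU binding, raise the thread count */
    /* ThirdPartyLibraryCompute(...);  hypothetical OpenMP-parallel library call */
    PetscCall(PetscOmpCtrlOmpRegionOnMasterEnd(ctrl));   /* restore binding, thread count back to 1 */
  }
  PetscCall(PetscOmpCtrlBarrier(ctrl)); /* slaves sleep here while their master runs the OpenMP region */
  PetscCall(PetscOmpCtrlDestroy(&ctrl));
  PetscFunctionReturn(PETSC_SUCCESS);
}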
476: /*@C
477: PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
479: Input Parameter:
480: . pctrl - a PETSc OpenMP controller
482: Level: developer
484: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
485: @*/
486: PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
487: {
488: PetscOmpCtrl ctrl = *pctrl;
490: PetscFunctionBegin;
491: hwloc_bitmap_free(ctrl->cpuset);
492: hwloc_topology_destroy(ctrl->topology);
493: PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
494: PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
495: if (ctrl->is_omp_master) {
496: hwloc_bitmap_free(ctrl->omp_cpuset);
497: PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
498: }
499: PetscCall(PetscFree(ctrl));
500: PetscFunctionReturn(PETSC_SUCCESS);
501: }
503: /*@C
504: PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
506: Input Parameter:
507: . ctrl - a PETSc OMP controller
509: Output Parameters:
510: + omp_comm - a communicator that includes a master rank and the slave ranks on whose CPUs the master spawns OpenMP threads
511: . omp_master_comm - on master ranks, a communicator that includes the master ranks of each omp_comm;
512:                     on slave ranks, `MPI_COMM_NULL` is returned
513: - is_omp_master - true if the calling process is an OMP master rank.
515: Note:
516:   Any output parameter can be `NULL`; it is simply ignored.
518: Level: developer
520: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
521: @*/
522: PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
523: {
524: PetscFunctionBegin;
525: if (omp_comm) *omp_comm = ctrl->omp_comm;
526: if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
527: if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
528: PetscFunctionReturn(PETSC_SUCCESS);
529: }
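/* Usage sketch (editorial addition, not part of mpishm.c): master ranks hand omp_master_comm
   to an MPI-based third party library; slave ranks receive MPI_COMM_NULL and skip the call.
   'ThirdPartySetup()' is a hypothetical library routine. */
static PetscErrorCode OmpCtrlGetCommsSketch(PetscOmpCtrl ctrl)
{
  MPI_Comm  master_comm;
  PetscBool is_master;

  PetscFunctionBegin;
  PetscCall(PetscOmpCtrlGetOmpComms(ctrl, NULL, &master_comm, &is_master));
  if (is_master) {
    /* ThirdPartySetup(master_comm);  hypothetical: only the masters participate in the library's MPI layer */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}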
531: /*@C
532:   PetscOmpCtrlBarrier - Perform a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)
534: Input Parameter:
535: . ctrl - a PETSc OMP controller
537: Notes:
538:   This is a pthread barrier across MPI ranks. Using `MPI_Barrier()` instead would be conceptually correct, but the MPI standard does not
539:   require processes blocked in `MPI_Barrier()` to free their CPUs to let other processes progress. In practice, to minimize latency,
540:   MPI ranks stuck in `MPI_Barrier()` keep polling and do not free their CPUs. In contrast, a pthread barrier puts waiting processes to sleep, freeing their CPUs.
542:   Code using `PetscOmpCtrlBarrier()` would look like this:
543: .vb
544: if (is_omp_master) {
545: PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
546: Call the library using OpenMP
547: PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
548: }
549: PetscOmpCtrlBarrier(ctrl);
550: .ve
552: Level: developer
554: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
555: @*/
556: PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
557: {
558: int err;
560: PetscFunctionBegin;
561: err = pthread_barrier_wait(ctrl->barrier);
562: PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
563: PetscFunctionReturn(PETSC_SUCCESS);
564: }
566: /*@C
567: PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
569: Input Parameter:
570: . ctrl - a PETSc OMP controller
572: Note:
573:   Only master ranks may call this function. Call `PetscOmpCtrlGetOmpComms()` to find out whether this is a master rank.
574:   This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime.
576: Level: developer
578: .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
579: @*/
580: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
581: {
582: PetscFunctionBegin;
583: PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
584:   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
585: PetscFunctionReturn(PETSC_SUCCESS);
586: }
588: /*@C
589: PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
591: Input Parameter:
592: . ctrl - a PETSc OMP controller
594: Note:
595:   Only master ranks may call this function. Call `PetscOmpCtrlGetOmpComms()` to find out whether this is a master rank.
596:   This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.
598: Level: developer
600: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
601: @*/
602: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
603: {
604: PetscFunctionBegin;
605: PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
606: omp_set_num_threads(1);
607: PetscFunctionReturn(PETSC_SUCCESS);
608: }
610: #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
611: #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */