Actual source code: mpishm.c

  1: #include <petscsys.h>
  2: #include <petsc/private/petscimpl.h>

  4: struct _n_PetscShmComm {
  5:   PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator */
  6:   PetscMPIInt  shmsize;           /* size of the shared memory communicator */
  7:   MPI_Comm     globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
  8: };

 10: /*
 11:    Private routine to delete internal shared memory communicator when a communicator is freed.

 13:    This is called by MPI, not by users. It is invoked by MPI_Comm_free() when the communicator that has this data cached as an attribute is freed.

 15:    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

 17: */
 18: PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_DeleteFn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state)
 19: {
 20:   PetscShmComm p = (PetscShmComm)val;

 22:   PetscFunctionBegin;
 23:   PetscCallMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in a MPI_Comm %ld\n", (long)comm));
 24:   PetscCallMPI(MPI_Comm_free(&p->shmcomm));
 25:   PetscCallMPI(PetscFree(p->globranks));
 26:   PetscCallMPI(PetscFree(val));
 27:   PetscFunctionReturn(MPI_SUCCESS);
 28: }
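
/* Illustrative sketch, not compiled: roughly how a keyval is paired with the delete callback
   above via MPI's attribute-caching interface. PETSc registers Petsc_ShmComm_keyval elsewhere
   (during initialization); the static variable and helper below are hypothetical stand-ins. */
#if 0
static PetscMPIInt example_keyval = MPI_KEYVAL_INVALID;

static PetscErrorCode ExampleRegisterShmKeyval(void)
{
  PetscFunctionBegin;
  /* no copy callback is needed; the delete callback frees the cached PetscShmComm */
  PetscCallMPI(MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN, Petsc_ShmComm_Attr_DeleteFn, &example_keyval, NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif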

 30: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
 31:   /* Data structures to support freeing comms dupped in PetscShmCommGet().
 32:   Communicators passed to PetscShmCommGet() are very likely either a petsc
 33:   inner communicator or an MPI communicator with a linked petsc inner
 34:   communicator, so a simple static array suffices to store the dupped
 35:   communicators in the rare cases where neither holds.
 36:  */
 37:   #define MAX_SHMCOMM_DUPPED_COMMS 16
 38: static PetscInt       num_dupped_comms = 0;
 39: static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
 40: static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
 41: {
 42:   PetscInt i;

 44:   PetscFunctionBegin;
 45:   for (i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
 46:   num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
 47:   PetscFunctionReturn(PETSC_SUCCESS);
 48: }
 49: #endif

 51: /*@C
 52:   PetscShmCommGet - Returns a sub-communicator of all ranks that share common memory

 54:   Collective.

 56:   Input Parameter:
 57: . globcomm - `MPI_Comm`, which can be a user `MPI_Comm` or a PETSc inner `MPI_Comm`

 59:   Output Parameter:
 60: . pshmcomm - the PETSc shared memory communicator object

 62:   Level: developer

 64:   Note:
 65:   When used with MPICH, MPICH must be configured with `--download-mpich-device=ch3:nemesis`

 67: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
 68: @*/
 69: PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
 70: {
 71: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
 72:   MPI_Group         globgroup, shmgroup;
 73:   PetscMPIInt      *shmranks, i, flg;
 74:   PetscCommCounter *counter;

 76:   PetscFunctionBegin;
 77:   PetscAssertPointer(pshmcomm, 2);
 78:   /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
 79:   PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
 80:   if (!flg) { /* globcomm is not a petsc comm */
 81:     union
 82:     {
 83:       MPI_Comm comm;
 84:       void    *ptr;
 85:     } ucomm;
 86:     /* check if globcomm already has a linked petsc inner comm */
 87:     PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
 88:     if (!flg) {
 89:       /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
 90:       PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
 91:       PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
 92:       /* The first time we dup a comm, register a function to free the dupped petsc comms at PetscFinalize() */
 93:       if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
 94:       shmcomm_dupped_comms[num_dupped_comms] = globcomm;
 95:       num_dupped_comms++;
 96:     } else {
 97:       /* otherwise, we pull out the inner comm and use it as globcomm */
 98:       globcomm = ucomm.comm;
 99:     }
100:   }

102:   /* Check if globcomm already has an attached pshmcomm. If not, create one */
103:   PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
104:   if (flg) PetscFunctionReturn(PETSC_SUCCESS);

106:   PetscCall(PetscNew(pshmcomm));
107:   (*pshmcomm)->globcomm = globcomm;

109:   PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));

111:   PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
112:   PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
113:   PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
114:   PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
115:   PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
116:   for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
117:   PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
118:   PetscCall(PetscFree(shmranks));
119:   PetscCallMPI(MPI_Group_free(&globgroup));
120:   PetscCallMPI(MPI_Group_free(&shmgroup));

122:   for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
123:   PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
124:   PetscFunctionReturn(PETSC_SUCCESS);
125: #else
126:   SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
127: #endif
128: }
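
/* Illustrative sketch, not compiled: typical use of PetscShmCommGet(). The returned
   PetscShmComm is cached as an attribute on the (inner) communicator, so repeated calls
   are cheap and the caller must not free it. */
#if 0
static PetscErrorCode ExampleShmCommGet(void)
{
  PetscShmComm shmcomm;

  PetscFunctionBegin;
  PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm));
  /* shmcomm now records which ranks of PETSC_COMM_WORLD share memory with this process;
     see the translation routines and PetscShmCommGetMpiShmComm() below for how to use it */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif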

130: /*@C
131:   PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator

133:   Input Parameters:
134: + pshmcomm - the shared memory communicator object
135: - grank    - the global rank

137:   Output Parameter:
138: . lrank - the local rank, or `MPI_PROC_NULL` if it does not exist

140:   Level: developer

142:   Developer Notes:
143:   Assumes that pshmcomm->globranks[] is sorted

145:   It may be better to rewrite this to map multiple global ranks to local in the same function call

147: .seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
148: @*/
149: PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
150: {
151:   PetscMPIInt low, high, t, i;
152:   PetscBool   flg = PETSC_FALSE;

154:   PetscFunctionBegin;
155:   PetscAssertPointer(pshmcomm, 1);
156:   PetscAssertPointer(lrank, 3);
157:   *lrank = MPI_PROC_NULL;
158:   if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(PETSC_SUCCESS);
159:   if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(PETSC_SUCCESS);
160:   PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
161:   if (flg) PetscFunctionReturn(PETSC_SUCCESS);
162:   low  = 0;
163:   high = pshmcomm->shmsize;
164:   while (high - low > 5) {
165:     t = (low + high) / 2;
166:     if (pshmcomm->globranks[t] > grank) high = t;
167:     else low = t;
168:   }
169:   for (i = low; i < high; i++) {
170:     if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(PETSC_SUCCESS);
171:     if (pshmcomm->globranks[i] == grank) {
172:       *lrank = i;
173:       PetscFunctionReturn(PETSC_SUCCESS);
174:     }
175:   }
176:   PetscFunctionReturn(PETSC_SUCCESS);
177: }

179: /*@C
180:   PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank

182:   Input Parameters:
183: + pshmcomm - the shared memory communicator object
184: - lrank    - the local rank in the shared memory communicator

186:   Output Parameter:
187: . grank - the global rank in the global communicator where the shared memory communicator is built

189:   Level: developer

191: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
192: @*/
193: PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank)
194: {
195:   PetscFunctionBegin;
196:   PetscAssertPointer(pshmcomm, 1);
197:   PetscAssertPointer(grank, 3);
198:   PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
199:   *grank = pshmcomm->globranks[lrank];
200:   PetscFunctionReturn(PETSC_SUCCESS);
201: }
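
/* Illustrative sketch, not compiled: a round trip through the two translation routines above.
   For a global rank on the same node the translation is invertible (unless the -noshared
   option is given); for a global rank on another node PetscShmCommGlobalToLocal() returns
   MPI_PROC_NULL. */
#if 0
static PetscErrorCode ExampleRankTranslation(void)
{
  PetscShmComm shmcomm;
  PetscMPIInt  grank, lrank, back;

  PetscFunctionBegin;
  PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm));
  PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &grank));
  PetscCall(PetscShmCommGlobalToLocal(shmcomm, grank, &lrank));
  if (lrank != MPI_PROC_NULL) {
    PetscCall(PetscShmCommLocalToGlobal(shmcomm, lrank, &back)); /* back == grank */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif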

203: /*@C
204:   PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

206:   Input Parameter:
207: . pshmcomm - `PetscShmComm` object obtained with `PetscShmCommGet()`

209:   Output Parameter:
210: . comm - the MPI communicator

212:   Level: developer

214: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
215: @*/
216: PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm)
217: {
218:   PetscFunctionBegin;
219:   PetscAssertPointer(pshmcomm, 1);
220:   PetscAssertPointer(comm, 2);
221:   *comm = pshmcomm->shmcomm;
222:   PetscFunctionReturn(PETSC_SUCCESS);
223: }
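
/* Illustrative sketch, not compiled (and assuming MPI-3 shared memory support): one common use
   of the communicator returned above is to back it with an MPI shared memory window. The helper
   name and the byte count are the sketch's own. */
#if 0
static PetscErrorCode ExampleShmWindow(PetscShmComm shmcomm, MPI_Aint nbytes)
{
  MPI_Comm comm;
  MPI_Win  win;
  void    *base;

  PetscFunctionBegin;
  PetscCall(PetscShmCommGetMpiShmComm(shmcomm, &comm));
  PetscCallMPI(MPI_Win_allocate_shared(nbytes, 1 /*disp_unit*/, MPI_INFO_NULL, comm, &base, &win));
  /* every rank on the node can reach the other ranks' segments via MPI_Win_shared_query() */
  PetscCallMPI(MPI_Win_free(&win));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif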

225: #if defined(PETSC_HAVE_OPENMP_SUPPORT)
226:   #include <pthread.h>
227:   #include <hwloc.h>
228:   #include <omp.h>

230:   /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
231:    otherwise use MPI_Win_allocate_shared(). They should have the same effect, except that MPI-3 is much
232:    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
233:    by 50%. Until the reason is understood, we use mmap() instead.
234: */
235:   #define USE_MMAP_ALLOCATE_SHARED_MEMORY

237:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
238:     #include <sys/mman.h>
239:     #include <sys/types.h>
240:     #include <sys/stat.h>
241:     #include <fcntl.h>
242:   #endif

244: struct _n_PetscOmpCtrl {
245:   MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
246:   MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
247:   PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
248:   PetscBool          is_omp_master;   /* true if this rank is rank 0 of its omp_comm */
249:   MPI_Win            omp_win;         /* a shared memory window containing a barrier */
250:   pthread_barrier_t *barrier;         /* pointer to the barrier */
251:   hwloc_topology_t   topology;
252:   hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
253:   hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
254: };

256: /* Allocate and initialize a pthread_barrier_t object in memory shared by the processes in the
257:    omp_comm contained by the controller.

259:    PETSc OpenMP controller users do not call this function directly. This function exists
260:    only because we want to separate shared memory allocation methods from other code.
261:  */
262: static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
263: {
264:   MPI_Aint              size;
265:   void                 *baseptr;
266:   pthread_barrierattr_t attr;

268:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
269:   int       fd;
270:   PetscChar pathname[PETSC_MAX_PATH_LEN];
271:   #else
272:   PetscMPIInt disp_unit;
273:   #endif

275:   PetscFunctionBegin;
276:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
277:   size = sizeof(pthread_barrier_t);
278:   if (ctrl->is_omp_master) {
279:     /* use PETSC_COMM_SELF in PetscGetTmp() since it is a collective call; using omp_comm would bcast the pathname to the slaves before mkstemp() has made it unique */
280:     PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
281:     PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
282:     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
283:     fd = mkstemp(pathname);
284:     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
285:     PetscCallExternal(ftruncate, fd, size);
286:     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
287:     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
288:     PetscCallExternal(close, fd);
289:     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
290:     /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
291:     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
292:     PetscCallExternal(unlink, pathname);
293:   } else {
294:     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
295:     fd = open(pathname, O_RDWR);
296:     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
297:     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
298:     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
299:     PetscCallExternal(close, fd);
300:     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
301:   }
302:   #else
303:   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
304:   PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
305:   PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
306:   #endif
307:   ctrl->barrier = (pthread_barrier_t *)baseptr;

309:   /* omp master initializes the barrier */
310:   if (ctrl->is_omp_master) {
311:     PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
312:     PetscCallExternal(pthread_barrierattr_init, &attr);
313:     PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
314:     PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
315:     PetscCallExternal(pthread_barrierattr_destroy, &attr);
316:   }

318:   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
319:   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
320:   PetscFunctionReturn(PETSC_SUCCESS);
321: }

323: /* Destroy the pthread barrier in the PETSc OpenMP controller */
324: static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
325: {
326:   PetscFunctionBegin;
327:   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
328:   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
329:   if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);

331:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
332:   PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
333:   #else
334:   PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
335:   #endif
336:   PetscFunctionReturn(PETSC_SUCCESS);
337: }
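
/* Illustrative sketch, not compiled: the essential technique used by the two helpers above,
   stripped of the MPI machinery, is a pthread barrier with the PTHREAD_PROCESS_SHARED attribute
   placed in memory visible to several processes. The fork()-based demo below uses an anonymous
   shared mapping; the file-backed mmap() above exists only because unrelated MPI ranks cannot
   inherit an anonymous mapping the way a fork()ed child can. */
#if 0
  #include <sys/mman.h>
  #include <sys/wait.h>
  #include <unistd.h>

static void ExampleProcessSharedBarrier(void)
{
  pthread_barrier_t    *b = (pthread_barrier_t *)mmap(NULL, sizeof(*b), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
  pthread_barrierattr_t attr;

  pthread_barrierattr_init(&attr);
  pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); /* same attribute as in PetscOmpCtrlCreateBarrier() */
  pthread_barrier_init(b, &attr, 2);
  pthread_barrierattr_destroy(&attr);

  if (fork() == 0) {       /* child: blocks (without busy polling) until the parent arrives */
    pthread_barrier_wait(b);
    _exit(0);
  }
  pthread_barrier_wait(b); /* parent */
  wait(NULL);
  pthread_barrier_destroy(b);
  munmap(b, sizeof(*b));
}
#endif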

339: /*@C
340:     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

342:     Input Parameters:
343: +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
344: -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

346:     Output Parameter:
347: .   pctrl      - a PETSc OpenMP controller

349:     Level: developer

351:     Developer Note:
352:     Possibly use the variable `PetscNumOMPThreads` to determine the number of threads to use

354: .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
355: @*/
356: PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
357: {
358:   PetscOmpCtrl   ctrl;
359:   unsigned long *cpu_ulongs = NULL;
360:   PetscInt       i, nr_cpu_ulongs;
361:   PetscShmComm   pshmcomm;
362:   MPI_Comm       shm_comm;
363:   PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
364:   PetscInt       num_packages, num_cores;

366:   PetscFunctionBegin;
367:   PetscCall(PetscNew(&ctrl));

369:   /*=================================================================================
370:     Init hwloc
371:    ==================================================================================*/
372:   PetscCallExternal(hwloc_topology_init, &ctrl->topology);
373:   #if HWLOC_API_VERSION >= 0x00020000
374:   /* to filter out unneeded info and have faster hwloc_topology_load */
375:   PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
376:   PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
377:   #endif
378:   PetscCallExternal(hwloc_topology_load, ctrl->topology);

380:   /*=================================================================================
381:     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
382:     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
383:     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
384:     which is usually passed to third party libraries.
385:    ==================================================================================*/

387:   /* fetch the stored shared memory communicator */
388:   PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
389:   PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

391:   PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
392:   PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

394:   /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
395:   if (nthreads == -1) {
396:     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
397:     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
398:     nthreads     = num_cores / num_packages;
399:     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
400:   }

402:   PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
403:   if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));

405:   /* split shm_comm into a set of omp_comms, each of size nthreads. For example, if
406:      shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
407:      color 1, so they are put in two omp_comms. Note that petsc_ranks may or may not
408:      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
409:      Use 0 as the key so that the rank ordering won't change in the new comm.
410:    */
411:   color = shm_rank / nthreads;
412:   PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

414:   /* put rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
415:   PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
416:   if (!omp_rank) {
417:     ctrl->is_omp_master = PETSC_TRUE; /* master */
418:     color               = 0;
419:   } else {
420:     ctrl->is_omp_master = PETSC_FALSE;   /* slave */
421:     color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
422:   }
423:   PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

425:   /*=================================================================================
426:     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
427:     slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
428:     and run them on the idle CPUs.
429:    ==================================================================================*/
430:   PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

432:   /*=================================================================================
433:     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
434:     is the union of the bindings of all ranks in the omp_comm
435:     =================================================================================*/

437:   ctrl->cpuset = hwloc_bitmap_alloc();
438:   PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
439:   PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);

441:   /* the hwloc main developer said new APIs hwloc_bitmap_{nr,to,from}_ulongs would be added in 2.1 to help simplify the following bitmap pack/unpack code */
442:   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
443:   PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
444:   if (nr_cpu_ulongs == 1) {
445:     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
446:   } else {
447:     for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
448:   }

450:   PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));

452:   if (ctrl->is_omp_master) {
453:     ctrl->omp_cpuset = hwloc_bitmap_alloc();
454:     PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
455:     if (nr_cpu_ulongs == 1) {
456:   #if HWLOC_API_VERSION >= 0x00020000
457:       PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
458:   #else
459:       hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
460:   #endif
461:     } else {
462:       for (i = 0; i < nr_cpu_ulongs; i++) {
463:   #if HWLOC_API_VERSION >= 0x00020000
464:         PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
465:   #else
466:         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
467:   #endif
468:       }
469:     }
470:   }
471:   PetscCall(PetscFree(cpu_ulongs));
472:   *pctrl = ctrl;
473:   PetscFunctionReturn(PETSC_SUCCESS);
474: }
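
/* Illustrative sketch, not compiled: the intended call sequence around a controller, mirroring
   the example in the PetscOmpCtrlBarrier() manual page further below. ThirdPartyOmpSolve() is a
   hypothetical stand-in for a call into an OpenMP-threaded library. */
#if 0
static PetscErrorCode ExampleOmpCtrlLifecycle(MPI_Comm comm)
{
  PetscOmpCtrl ctrl;
  MPI_Comm     master_comm;
  PetscBool    is_master;

  PetscFunctionBegin;
  PetscCall(PetscOmpCtrlCreate(comm, -1, &ctrl)); /* -1: let PETSc choose the threads per master */
  PetscCall(PetscOmpCtrlGetOmpComms(ctrl, NULL, &master_comm, &is_master));
  if (is_master) {
    PetscCall(PetscOmpCtrlOmpRegionOnMasterBegin(ctrl)); /* widen CPU binding, raise num_threads */
    /* ThirdPartyOmpSolve(master_comm); */
    PetscCall(PetscOmpCtrlOmpRegionOnMasterEnd(ctrl)); /* restore binding, num_threads back to 1 */
  }
  PetscCall(PetscOmpCtrlBarrier(ctrl)); /* slaves sleep here while the masters run the OMP region */
  PetscCall(PetscOmpCtrlDestroy(&ctrl));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif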

476: /*@C
477:     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

479:     Input Parameter:
480: .   pctrl  - a PETSc OpenMP controller

482:     Level: developer

484: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
485: @*/
486: PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
487: {
488:   PetscOmpCtrl ctrl = *pctrl;

490:   PetscFunctionBegin;
491:   hwloc_bitmap_free(ctrl->cpuset);
492:   hwloc_topology_destroy(ctrl->topology);
493:   PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
494:   PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
495:   if (ctrl->is_omp_master) {
496:     hwloc_bitmap_free(ctrl->omp_cpuset);
497:     PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
498:   }
499:   PetscCall(PetscFree(ctrl));
500:   PetscFunctionReturn(PETSC_SUCCESS);
501: }

503: /*@C
504:     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

506:     Input Parameter:
507: .   ctrl - a PETSc OMP controller

509:     Output Parameters:
510: +   omp_comm         - a communicator that includes a master rank and slave ranks, on which the master spawns threads
511: .   omp_master_comm  - on master ranks, a communicator that includes the master ranks of each omp_comm;
512:                        on slave ranks, `MPI_COMM_NULL` is returned.
513: -   is_omp_master    - true if the calling process is an OMP master rank.

515:     Note:
516:     Any output parameter can be `NULL`; it is simply ignored.

518:     Level: developer

520: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
521: @*/
522: PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
523: {
524:   PetscFunctionBegin;
525:   if (omp_comm) *omp_comm = ctrl->omp_comm;
526:   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
527:   if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
528:   PetscFunctionReturn(PETSC_SUCCESS);
529: }

531: /*@C
532:     PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

534:     Input Parameter:
535: .   ctrl - a PETSc OMP controller

537:     Notes:
538:     This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead would be conceptually correct, but the MPI standard does not
539:     require processes blocked in `MPI_Barrier()` to free their CPUs so that other processes can progress. In practice, to minimize latency,
540:     MPI ranks stuck in `MPI_Barrier()` keep polling and do not free their CPUs. A pthread barrier, in contrast, does free them.

542:     Code using `PetscOmpCtrlBarrier()` looks like this:
543: .vb
544:     if (is_omp_master) {
545:       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
546:       Call the library using OpenMP
547:       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
548:     }
549:     PetscOmpCtrlBarrier(ctrl);
550: .ve

552:     Level: developer

554: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
555: @*/
556: PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
557: {
558:   int err;

560:   PetscFunctionBegin;
561:   err = pthread_barrier_wait(ctrl->barrier);
562:   PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
563:   PetscFunctionReturn(PETSC_SUCCESS);
564: }

566: /*@C
567:     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

569:     Input Parameter:
570: .   ctrl - a PETSc OMP controller

572:     Note:
573:     Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to determine whether this is a master rank.
574:     This function changes the CPU binding of the master rank and the nthreads-var of the OpenMP runtime.

576:     Level: developer

578: .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
579: @*/
580: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
581: {
582:   PetscFunctionBegin;
583:   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
584:   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
585:   PetscFunctionReturn(PETSC_SUCCESS);
586: }

588: /*@C
589:    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

591:    Input Parameter:
592: .  ctrl - a PETSc OMP controller

594:    Note:
595:    Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to determine whether this is a master rank.
596:    This function restores the CPU binding of the master rank and sets the nthreads-var of the OpenMP runtime to 1.

598:    Level: developer

600: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
601: @*/
602: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
603: {
604:   PetscFunctionBegin;
605:   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
606:   omp_set_num_threads(1);
607:   PetscFunctionReturn(PETSC_SUCCESS);
608: }

610:   #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
611: #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */