Actual source code: mpishm.c

  1: #include <petscsys.h>
  2: #include <petsc/private/petscimpl.h>

  4: struct _n_PetscShmComm {
  5:   PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  6:   PetscMPIInt shmsize;          /* size of the shared memory communicator */
  7:   MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
  8: };

 10: /*
 11:    Private routine to delete internal shared memory communicator when a communicator is freed.

 13:    This is called by MPI, not by users; it is invoked by MPI_Comm_free() when the communicator that has this data as an attribute is freed.

 15:    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

 17: */
 18: PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
 19: {
 20:   PetscShmComm p = (PetscShmComm)val;

 22:   PetscInfo(NULL,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm);
 23:   MPI_Comm_free(&p->shmcomm);
 24:   PetscFree(p->globranks);
 25:   PetscFree(val);
 26:   return MPI_SUCCESS;
 27: }
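
/*
   Illustrative sketch (not part of mpishm.c): a delete callback like the one above is given to
   MPI via MPI_Comm_create_keyval(), so that MPI itself frees the attached data when the
   communicator is freed. The actual registration happens elsewhere in PETSc during
   PetscInitialize(); the snippet below only shows the generic MPI pattern, with keyval
   standing in for the global Petsc_ShmComm_keyval used later in this file.

     PetscMPIInt keyval;
     MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN,        // do not copy the attribute on MPI_Comm_dup()
                            Petsc_ShmComm_Attr_Delete_Fn, // called by MPI_Comm_free()
                            &keyval,NULL);
     // later: MPI_Comm_set_attr(comm,keyval,ptr) attaches data that MPI will clean up via the callback
*/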

 29: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
 30: /* Data structures to support freeing comms created in PetscShmCommGet().
 31:   Since we predict communicators passed to PetscShmCommGet() are very likely
 32:   either a petsc inner communicator or an MPI communicator with a linked petsc
 33:   inner communicator, we use a simple static array to store the dupped communicators
 34:   created in the rare cases where neither holds.
 35:  */
 36: #define MAX_SHMCOMM_DUPPED_COMMS 16
 37: static PetscInt       num_dupped_comms=0;
 38: static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
 39: static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
 40: {
 41:   PetscInt         i;
 42:   for (i=0; i<num_dupped_comms; i++) PetscCommDestroy(&shmcomm_dupped_comms[i]);
 43:   num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
 44:   return 0;
 45: }
 46: #endif

 48: /*@C
 49:     PetscShmCommGet - Given a communicator, returns a sub-communicator of all ranks that share common memory

 51:     Collective.

 53:     Input Parameter:
 54: .   globcomm - MPI_Comm, which can be a user MPI_Comm or a PETSc inner MPI_Comm

 56:     Output Parameter:
 57: .   pshmcomm - the PETSc shared memory communicator object

 59:     Level: developer

 61:     Notes:
 62:        When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

 64: @*/
 65: PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
 66: {
 67: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
 68:   MPI_Group        globgroup,shmgroup;
 69:   PetscMPIInt      *shmranks,i,flg;
 70:   PetscCommCounter *counter;

 73:   /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
 74:   MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);
 75:   if (!flg) { /* globcomm is not a petsc comm */
 76:     union {MPI_Comm comm; void *ptr;} ucomm;
 77:     /* check if globcomm already has a linked petsc inner comm */
 78:     MPI_Comm_get_attr(globcomm,Petsc_InnerComm_keyval,&ucomm,&flg);
 79:     if (!flg) {
 80:       /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
 81:       if (num_dupped_comms >= MAX_SHMCOMM_DUPPED_COMMS) SETERRQ(globcomm,PETSC_ERR_PLIB,"Exceeded MAX_SHMCOMM_DUPPED_COMMS while duplicating communicators in PetscShmCommGet()");
 82:       PetscCommDuplicate(globcomm,&globcomm,NULL);
 83:       /* the first time we dup a comm, register a function to free the dupped petsc comms at PetscFinalize */
 84:       if (num_dupped_comms == 0) PetscRegisterFinalize(PetscShmCommDestroyDuppedComms);
 85:       shmcomm_dupped_comms[num_dupped_comms] = globcomm;
 86:       num_dupped_comms++;
 87:     } else {
 88:       /* otherwise, we pull out the inner comm and use it as globcomm */
 89:       globcomm = ucomm.comm;
 90:     }
 91:   }

 93:   /* Check if globcomm already has an attached pshmcomm. If not, create one */
 94:   MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);
 95:   if (flg) return 0;

 97:   PetscNew(pshmcomm);
 98:   (*pshmcomm)->globcomm = globcomm;

100:   MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED,0, MPI_INFO_NULL,&(*pshmcomm)->shmcomm);

102:   MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);
103:   MPI_Comm_group(globcomm, &globgroup);
104:   MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup);
105:   PetscMalloc1((*pshmcomm)->shmsize,&shmranks);
106:   PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);
107:   for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
108:   MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks);
109:   PetscFree(shmranks);
110:   MPI_Group_free(&globgroup);
111:   MPI_Group_free(&shmgroup);

113:   for (i=0; i<(*pshmcomm)->shmsize; i++) {
114:     PetscInfo(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);
115:   }
116:   MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);
117:   return 0;
118: #else
119:   SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
120: #endif
121: }
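
/*
   Usage sketch (illustrative, not part of mpishm.c): fetch the shared memory communicator object
   for PETSC_COMM_WORLD and report how many ranks share memory with this process. Error checking
   is omitted for brevity; the returned PetscShmComm is attached to (and owned by) the inner
   communicator, so the caller does not free it.

     PetscShmComm shm;
     MPI_Comm     shmcomm;
     PetscMPIInt  shmsize;

     PetscShmCommGet(PETSC_COMM_WORLD,&shm);
     PetscShmCommGetMpiShmComm(shm,&shmcomm);
     MPI_Comm_size(shmcomm,&shmsize);
     PetscPrintf(PETSC_COMM_SELF,"%d ranks share memory with this process\n",(int)shmsize);
*/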

123: /*@C
124:     PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator

126:     Input Parameters:
127: +   pshmcomm - the shared memory communicator object
128: -   grank    - the global rank

130:     Output Parameter:
131: .   lrank - the local rank, or MPI_PROC_NULL if it does not exist

133:     Level: developer

135:     Developer Notes:
136:     Assumes the pshmcomm->globranks[] is sorted

138:     It may be better to rewrite this to map multiple global ranks to local in the same function call

140: @*/
141: PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
142: {
143:   PetscMPIInt    low,high,t,i;
144:   PetscBool      flg = PETSC_FALSE;

148:   *lrank = MPI_PROC_NULL;
149:   if (grank < pshmcomm->globranks[0]) return 0;
150:   if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) return 0;
151:   PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);
152:   if (flg) return 0;
153:   low  = 0;
154:   high = pshmcomm->shmsize;
155:   while (high-low > 5) {
156:     t = (low+high)/2;
157:     if (pshmcomm->globranks[t] > grank) high = t;
158:     else low = t;
159:   }
160:   for (i=low; i<high; i++) {
161:     if (pshmcomm->globranks[i] > grank) return 0;
162:     if (pshmcomm->globranks[i] == grank) {
163:       *lrank = i;
164:       return 0;
165:     }
166:   }
167:   return 0;
168: }

170: /*@C
171:     PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank

173:     Input Parameters:
174: +   pshmcomm - the shared memory communicator object
175: -   lrank    - the local rank in the shared memory communicator

177:     Output Parameter:
178: .   grank - the global rank in the global communicator where the shared memory communicator is built

180:     Level: developer

182: @*/
183: PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
184: {
187:   if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"The local rank is out of range of the shared memory communicator");
188:   *grank = pshmcomm->globranks[lrank];
189:   return 0;
190: }
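
/*
   Illustrative sketch (not part of mpishm.c): translating between global and shared-memory (local)
   ranks with the two routines above; error checking omitted.

     PetscShmComm shm;
     PetscMPIInt  grank,lrank,back;

     PetscShmCommGet(PETSC_COMM_WORLD,&shm);
     MPI_Comm_rank(PETSC_COMM_WORLD,&grank);
     PetscShmCommGlobalToLocal(shm,grank,&lrank);   // finds our own rank unless -noshared is given
     if (lrank != MPI_PROC_NULL) {
       PetscShmCommLocalToGlobal(shm,lrank,&back);  // back == grank
     }
*/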

192: /*@C
193:     PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

195:     Input Parameter:
196: .   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

198:     Output Parameter:
199: .   comm     - the MPI communicator

201:     Level: developer

203: @*/
204: PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
205: {
208:   *comm = pshmcomm->shmcomm;
209:   return 0;
210: }
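
/*
   Illustrative sketch (not part of mpishm.c): a typical use of the communicator returned by
   PetscShmCommGetMpiShmComm() is to allocate MPI-3 shared memory visible to all ranks on the node.
   This assumes an MPI-3 implementation; the window and array names are made up for the example and
   error checking is omitted.

     PetscShmComm shm;
     MPI_Comm     shmcomm;
     MPI_Win      win;
     double       *base;

     PetscShmCommGet(PETSC_COMM_WORLD,&shm);
     PetscShmCommGetMpiShmComm(shm,&shmcomm);
     MPI_Win_allocate_shared(100*sizeof(double),sizeof(double),MPI_INFO_NULL,shmcomm,&base,&win);
     // ... each rank can reach the other ranks' segments on the node via MPI_Win_shared_query()
     MPI_Win_free(&win);
*/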

212: #if defined(PETSC_HAVE_OPENMP_SUPPORT)
213: #include <pthread.h>
214: #include <hwloc.h>
215: #include <omp.h>

217: /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
218:    otherwise use MPI_Win_allocate_shared. They should have the same effect, except that MPI-3 is much
219:    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
220:    by 50%. Until the reason is understood, we use mmap() instead.
221: */
222: #define USE_MMAP_ALLOCATE_SHARED_MEMORY

224: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
225: #include <sys/mman.h>
226: #include <sys/types.h>
227: #include <sys/stat.h>
228: #include <fcntl.h>
229: #endif

231: struct _n_PetscOmpCtrl {
232:   MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
233:   MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
234:   PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
235:   PetscBool         is_omp_master;   /* true on rank 0 of omp_comm, i.e., the OMP master */
236:   MPI_Win           omp_win;         /* a shared memory window containing a barrier */
237:   pthread_barrier_t *barrier;        /* pointer to the barrier */
238:   hwloc_topology_t  topology;
239:   hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
240:   hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
241: };

243: /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
244:    contained by the controller.

246:    PETSc OpenMP controller users do not call this function directly. This function exists
247:    only because we want to separate shared memory allocation methods from other code.
248:  */
249: static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
250: {
251:   MPI_Aint              size;
252:   void                  *baseptr;
253:   pthread_barrierattr_t  attr;

255: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
256:   int                   fd;
257:   PetscChar             pathname[PETSC_MAX_PATH_LEN];
258: #else
259:   PetscMPIInt           disp_unit;
260: #endif

262: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
263:   size = sizeof(pthread_barrier_t);
264:   if (ctrl->is_omp_master) {
265:     /* use PETSC_COMM_SELF in PetscGetTmp() since it is a collective call; using omp_comm would bcast the partially populated pathname to the slaves */
266:     PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);
267:     PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);
268:     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
269:     fd   = mkstemp(pathname);
270:     ftruncate(fd,size);
271:     baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0);
272:     close(fd);
273:     MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);
274:     /* this MPI_Barrier is to wait for the slaves to open the file before the master unlinks it */
275:     MPI_Barrier(ctrl->omp_comm);
276:     unlink(pathname);
277:   } else {
278:     MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);
279:     fd   = open(pathname,O_RDWR);
280:     baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0);
281:     close(fd);
282:     MPI_Barrier(ctrl->omp_comm);
283:   }
284: #else
285:   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
286:   MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);
287:   MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);
288: #endif
289:   ctrl->barrier = (pthread_barrier_t*)baseptr;

291:   /* omp master initializes the barrier */
292:   if (ctrl->is_omp_master) {
293:     MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);
294:     pthread_barrierattr_init(&attr);
295:     pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
296:     pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);
297:     pthread_barrierattr_destroy(&attr);
298:   }

300:   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
301:   MPI_Barrier(ctrl->omp_comm);
302:   return 0;
303: }

305: /* Destroy the pthread barrier in the PETSc OpenMP controller */
306: static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
307: {
308:   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
309:   MPI_Barrier(ctrl->omp_comm);
310:   if (ctrl->is_omp_master) pthread_barrier_destroy(ctrl->barrier);

312: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
313:   munmap(ctrl->barrier,sizeof(pthread_barrier_t));
314: #else
315:   MPI_Win_free(&ctrl->omp_win);
316: #endif
317:   return 0;
318: }

320: /*@C
321:     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third-party libraries that use OpenMP

323:     Input Parameters:
324: +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
325: -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

327:     Output Parameter:
328: .   pctrl      - a PETSc OpenMP controller

330:     Level: developer

332:     TODO: Possibly use the variable PetscNumOMPThreads to determine the number of threads to use

334: .seealso PetscOmpCtrlDestroy()
335: @*/
336: PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
337: {
338:   PetscOmpCtrl   ctrl;
339:   unsigned long *cpu_ulongs = NULL;
340:   PetscInt       i,nr_cpu_ulongs;
341:   PetscShmComm   pshmcomm;
342:   MPI_Comm       shm_comm;
343:   PetscMPIInt    shm_rank,shm_comm_size,omp_rank,color;
344:   PetscInt       num_packages,num_cores;

346:   PetscNew(&ctrl);

348:   /*=================================================================================
349:     Init hwloc
350:    ==================================================================================*/
351:   hwloc_topology_init(&ctrl->topology);
352: #if HWLOC_API_VERSION >= 0x00020000
353:   /* to filter out unneeded info and have faster hwloc_topology_load */
354:   hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);
355:   hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);
356: #endif
357:   hwloc_topology_load(ctrl->topology);

359:   /*=================================================================================
360:     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
361:     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
362:     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
363:     which is usually passed to third party libraries.
364:    ==================================================================================*/

366:   /* fetch the stored shared memory communicator */
367:   PetscShmCommGet(petsc_comm,&pshmcomm);
368:   PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);

370:   MPI_Comm_rank(shm_comm,&shm_rank);
371:   MPI_Comm_size(shm_comm,&shm_comm_size);

373:   /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
374:   if (nthreads == -1) {
375:     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE);
376:     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE) <= 0 ? 1 :  hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);
377:     nthreads     = num_cores/num_packages;
378:     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
379:   }

382:   if (shm_comm_size % nthreads) PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);

384:   /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if
385:      shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
386:      color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
387:      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
388:      Use 0 as key so that rank ordering wont change in new comm.
389:    */
390:   color = shm_rank / nthreads;
391:   MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);

393:   /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
394:   MPI_Comm_rank(ctrl->omp_comm,&omp_rank);
395:   if (!omp_rank) {
396:     ctrl->is_omp_master = PETSC_TRUE;  /* master */
397:     color = 0;
398:   } else {
399:     ctrl->is_omp_master = PETSC_FALSE; /* slave */
400:     color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
401:   }
402:   MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);

404:   /*=================================================================================
405:     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
406:     slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
407:     and run them on the idle CPUs.
408:    ==================================================================================*/
409:   PetscOmpCtrlCreateBarrier(ctrl);

411:   /*=================================================================================
412:     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
413:     is the union of the bindings of all ranks in the omp_comm
414:     =================================================================================*/

416:   ctrl->cpuset = hwloc_bitmap_alloc(); /* allocate the bitmap that will hold this rank's cpu binding */
417:   hwloc_get_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);

419:   /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
420:   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset (ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
421:   PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);
422:   if (nr_cpu_ulongs == 1) {
423:     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
424:   } else {
425:     for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
426:   }

428:   MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);

430:   if (ctrl->is_omp_master) {
431:     ctrl->omp_cpuset = hwloc_bitmap_alloc(); /* allocate the bitmap that will hold the union of bindings */
432:     if (nr_cpu_ulongs == 1) {
434:       hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
438:     } else {
439:       for (i=0; i<nr_cpu_ulongs; i++)  {
441:         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
445:       }
446:     }
447:   }

449:   PetscFree(cpu_ulongs);
450:   *pctrl = ctrl;
451:   return 0;
452: }

454: /*@C
455:     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

457:     Input Parameter:
458: .   pctrl  - a PETSc OpenMP controller

460:     Level: developer

462: .seealso PetscOmpCtrlCreate()
463: @*/
464: PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
465: {
466:   PetscOmpCtrl ctrl = *pctrl;

468:   hwloc_bitmap_free(ctrl->cpuset);
469:   hwloc_topology_destroy(ctrl->topology);
470:   PetscOmpCtrlDestroyBarrier(ctrl);
471:   MPI_Comm_free(&ctrl->omp_comm);
472:   if (ctrl->is_omp_master) {
473:     hwloc_bitmap_free(ctrl->omp_cpuset);
474:     MPI_Comm_free(&ctrl->omp_master_comm);
475:   }
476:   PetscFree(ctrl);
477:   return 0;
478: }

480: /*@C
481:     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

483:     Input Parameter:
484: .   ctrl - a PETSc OMP controller

486:     Output Parameters:
487: +   omp_comm         - a communicator that includes a master rank and the slave ranks on which the master spawns threads
488: .   omp_master_comm  - on master ranks, a communicator that includes the master ranks of all omp_comms;
489:                        on slave ranks, MPI_COMM_NULL is returned
490: -   is_omp_master    - true if the calling process is an OMP master rank.

492:     Notes: any output parameter can be NULL; such parameters are simply ignored.

494:     Level: developer
495: @*/
496: PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
497: {
498:   if (omp_comm)        *omp_comm        = ctrl->omp_comm;
499:   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
500:   if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
501:   return 0;
502: }

504: /*@C
505:     PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

507:     Input Parameter:
508: .   ctrl - a PETSc OMP controller

510:     Notes:
511:     This is a pthread barrier on MPI processes. Using MPI_Barrier instead would be conceptually correct, but the MPI standard
512:     does not require processes blocked in MPI_Barrier to free their CPUs so that other processes can progress; in practice, to
513:     minimize latency, MPI processes stuck in MPI_Barrier keep polling. In contrast, a pthread barrier does release the CPU.

515:     Code using PetscOmpCtrlBarrier() typically looks like this:

517:     if (is_omp_master) {
518:       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
519:       Call the library using OpenMP
520:       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
521:     }
522:     PetscOmpCtrlBarrier(ctrl);

524:     Level: developer

526: .seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
527: @*/
528: PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
529: {
530:   int err;

532:   err = pthread_barrier_wait(ctrl->barrier);
533:   if (err && err != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait() failed within PetscOmpCtrlBarrier()");
534:   return 0;
535: }

537: /*@C
538:     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

540:     Input Parameter:
541: .   ctrl - a PETSc OMP controller

543:     Notes:
544:     Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
545:     This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime

547:     Level: developer

549: .seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
550: @*/
551: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
552: {
553:   hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);
554:   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
555:   return 0;
556: }

558: /*@C
559:    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

561:    Input Parameter:
562: .  ctrl - a PETSc OMP controller

564:    Notes:
565:    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
566:    This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.

568:    Level: developer

570: .seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
571: @*/
572: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
573: {
574:   hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);
575:   omp_set_num_threads(1);
576:   return 0;
577: }
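
/*
   Illustrative sketch (not part of mpishm.c): the typical life cycle of a PETSc OpenMP controller,
   tying together the routines above. ThirdPartyLibCall() is a hypothetical placeholder for a library
   routine that spawns OpenMP threads; error checking is omitted.

     PetscOmpCtrl ctrl;
     MPI_Comm     omp_comm,omp_master_comm;
     PetscBool    is_master;

     PetscOmpCtrlCreate(PETSC_COMM_WORLD,-1,&ctrl);                        // let PETSc pick nthreads
     PetscOmpCtrlGetOmpComms(ctrl,&omp_comm,&omp_master_comm,&is_master);
     if (is_master) {
       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);   // bind to the union cpuset, raise nthreads-var
       ThirdPartyLibCall(omp_master_comm);         // the library runs OpenMP threads on the idle cores
       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);     // restore the binding, set nthreads-var back to 1
     }
     PetscOmpCtrlBarrier(ctrl);                    // slaves sleep here while the masters compute
     PetscOmpCtrlDestroy(&ctrl);
*/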

579: #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
580: #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */