Actual source code: mpishm.c
1: #include <petscsys.h>
2: #include <petsc/private/petscimpl.h>
4: struct _n_PetscShmComm {
5: PetscMPIInt *globranks; /* global ranks of each rank in the shared memory communicator */
6: PetscMPIInt shmsize; /* size of the shared memory communicator */
7: MPI_Comm globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
8: };
10: /*
11: Private routine to delete internal shared memory communicator when a communicator is freed.
13: This is called by MPI, not by users; it is invoked by MPI_Comm_free() when the communicator that has this data as an attribute is freed.
15: Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
17: */
18: PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
19: {
20: PetscErrorCode ierr;
21: PetscShmComm p = (PetscShmComm)val;
24: PetscInfo1(NULL,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm);
25: MPI_Comm_free(&p->shmcomm);
26: PetscFree(p->globranks);
27: PetscFree(val);
28: PetscFunctionReturn(MPI_SUCCESS);
29: }
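/*
   A minimal sketch of how a delete callback like the one above is attached to an MPI
   attribute keyval via MPI_Comm_create_keyval(); Petsc_ShmComm_keyval itself is created
   elsewhere in PETSc during initialization, so the variable names below are illustrative
   only and error checking is omitted:

     PetscMPIInt keyval;

     MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN,Petsc_ShmComm_Attr_Delete_Fn,&keyval,NULL);
     MPI_Comm_set_attr(comm,keyval,pshmcomm);

   With this setup, MPI_Comm_free(&comm) invokes Petsc_ShmComm_Attr_Delete_Fn() on the
   stored attribute, which is how the PetscShmComm created below gets cleaned up.
*/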
31: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
32: /* Data structures to support freeing comms created in PetscShmCommGet().
33:    Since communicators passed to PetscShmCommGet() are very likely either a petsc
34:    inner communicator or an MPI communicator with a linked petsc inner communicator,
35:    we use a simple static array to store dupped communicators in the rare cases
36:    otherwise.
37: */
38: #define MAX_SHMCOMM_DUPPED_COMMS 16
39: static PetscInt num_dupped_comms=0;
40: static MPI_Comm shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
41: static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
42: {
43: PetscErrorCode ierr;
44: PetscInt i;
46: for (i=0; i<num_dupped_comms; i++) {PetscCommDestroy(&shmcomm_dupped_comms[i]);}
47: num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
48: return(0);
49: }
50: #endif
52: /*@C
53: PetscShmCommGet - Given a communicator, returns a sub-communicator of all ranks that share common memory
55: Collective.
57: Input Parameter:
58: . globcomm - MPI_Comm, which can be a user MPI_Comm or a PETSc inner MPI_Comm
60: Output Parameter:
61: . pshmcomm - the PETSc shared memory communicator object
63: Level: developer
65: Notes:
66: When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis
68: @*/
69: PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
70: {
71: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
72: PetscErrorCode ierr;
73: MPI_Group globgroup,shmgroup;
74: PetscMPIInt *shmranks,i,flg;
75: PetscCommCounter *counter;
78: /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
79: MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);
80: if (!flg) { /* globcomm is not a petsc comm */
81: union {MPI_Comm comm; void *ptr;} ucomm;
82: /* check if globcomm already has a linked petsc inner comm */
83: MPI_Comm_get_attr(globcomm,Petsc_InnerComm_keyval,&ucomm,&flg);
84: if (!flg) {
85: /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
86: if (num_dupped_comms >= MAX_SHMCOMM_DUPPED_COMMS) SETERRQ1(globcomm,PETSC_ERR_PLIB,"PetscShmCommGet() is trying to dup more than %d MPI_Comms",MAX_SHMCOMM_DUPPED_COMMS);
87: PetscCommDuplicate(globcomm,&globcomm,NULL);
88: /* The first time through, register a function to free the dupped petsc comms at PetscFinalize() */
89: if (num_dupped_comms == 0) {PetscRegisterFinalize(PetscShmCommDestroyDuppedComms);}
90: shmcomm_dupped_comms[num_dupped_comms] = globcomm;
91: num_dupped_comms++;
92: } else {
93: /* otherwise, we pull out the inner comm and use it as globcomm */
94: globcomm = ucomm.comm;
95: }
96: }
98: /* Check if globcomm already has an attached pshmcomm. If not, create one */
99: MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);
100: if (flg) return(0);
102: PetscNew(pshmcomm);
103: (*pshmcomm)->globcomm = globcomm;
105: MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED,0, MPI_INFO_NULL,&(*pshmcomm)->shmcomm);
107: MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);
108: MPI_Comm_group(globcomm, &globgroup);
109: MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup);
110: PetscMalloc1((*pshmcomm)->shmsize,&shmranks);
111: PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);
112: for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
113: MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks);
114: PetscFree(shmranks);
115: MPI_Group_free(&globgroup);
116: MPI_Group_free(&shmgroup);
118: for (i=0; i<(*pshmcomm)->shmsize; i++) {
119: PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);
120: }
121: MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);
122: return(0);
123: #else
124: SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
125: #endif
126: }
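/*
   A minimal usage sketch of PetscShmCommGet() (variable names are illustrative; error
   checking is omitted, as in the rest of this listing):

     PetscShmComm pshmcomm;

     PetscShmCommGet(PETSC_COMM_WORLD,&pshmcomm);

   The returned object is stashed as an attribute of the (inner) communicator, so repeated
   calls are cheap, and the caller must not free it; it is destroyed automatically by
   Petsc_ShmComm_Attr_Delete_Fn() when the communicator is freed.
*/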
128: /*@C
129: PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator
131: Input Parameters:
132: + pshmcomm - the shared memory communicator object
133: - grank - the global rank
135: Output Parameter:
136: . lrank - the local rank, or MPI_PROC_NULL if it does not exist
138: Level: developer
140: Developer Notes:
141: Assumes that pshmcomm->globranks[] is sorted
143: It may be better to rewrite this to map multiple global ranks to local ranks in the same function call
145: @*/
146: PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
147: {
148: PetscMPIInt low,high,t,i;
149: PetscBool flg = PETSC_FALSE;
153: *lrank = MPI_PROC_NULL;
154: if (grank < pshmcomm->globranks[0]) return(0);
155: if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) return(0);
156: PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);
157: if (flg) return(0);
158: low = 0;
159: high = pshmcomm->shmsize;
160: while (high-low > 5) {
161: t = (low+high)/2;
162: if (pshmcomm->globranks[t] > grank) high = t;
163: else low = t;
164: }
165: for (i=low; i<high; i++) {
166: if (pshmcomm->globranks[i] > grank) return(0);
167: if (pshmcomm->globranks[i] == grank) {
168: *lrank = i;
169: return(0);
170: }
171: }
172: return(0);
173: }
175: /*@C
176: PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank
178: Input Parameters:
179: + pshmcomm - the shared memory communicator object
180: - lrank - the local rank in the shared memory communicator
182: Output Parameter:
183: . grank - the global rank in the global communicator where the shared memory communicator is built
185: Level: developer
187: @*/
188: PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
189: {
191: if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %D in the shared memory communicator",lrank);
192: *grank = pshmcomm->globranks[lrank];
193: return(0);
194: }
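/*
   A small sketch tying the two translation routines above together (illustrative names;
   error checking omitted, as in the rest of this listing):

     PetscShmComm pshmcomm;
     PetscMPIInt  grank,lrank,back;

     PetscShmCommGet(PETSC_COMM_WORLD,&pshmcomm);
     MPI_Comm_rank(PETSC_COMM_WORLD,&grank);
     PetscShmCommGlobalToLocal(pshmcomm,grank,&lrank);
     PetscShmCommLocalToGlobal(pshmcomm,lrank,&back);

   Unless -noshared is given, the calling rank maps to a valid lrank and back == grank;
   a global rank outside the node maps to lrank == MPI_PROC_NULL.
*/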
196: /*@C
197: PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
199: Input Parameter:
200: . pshmcomm - PetscShmComm object obtained with PetscShmCommGet()
202: Output Parameter:
203: . comm - the MPI communicator
205: Level: developer
207: @*/
208: PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
209: {
211: *comm = pshmcomm->shmcomm;
212: return(0);
213: }
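/*
   A sketch of one thing a caller might do with the returned communicator, namely allocate
   an MPI-3 shared memory window on it (illustrative sizes and names; error checking
   omitted, as in the rest of this listing):

     PetscShmComm pshmcomm;
     MPI_Comm     shmcomm;
     MPI_Win      win;
     double       *base;

     PetscShmCommGet(PETSC_COMM_WORLD,&pshmcomm);
     PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);
     MPI_Win_allocate_shared(100*sizeof(double),sizeof(double),MPI_INFO_NULL,shmcomm,&base,&win);
     ...
     MPI_Win_free(&win);

   Note that shmcomm is owned by the PetscShmComm object and must not be freed by the caller.
*/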
215: #if defined(PETSC_HAVE_OPENMP_SUPPORT)
216: #include <pthread.h>
217: #include <hwloc.h>
218: #include <omp.h>
220: /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
221:    otherwise use MPI_Win_allocate_shared. They should have the same effect, except that MPI-3 is
222:    much simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's
223:    performance by 50%. Until the reason is found, we use mmap() instead.
224: */
225: #define USE_MMAP_ALLOCATE_SHARED_MEMORY
227: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
228: #include <sys/mman.h>
229: #include <sys/types.h>
230: #include <sys/stat.h>
231: #include <fcntl.h>
232: #endif
234: struct _n_PetscOmpCtrl {
235: MPI_Comm omp_comm; /* a shared memory communicator to spawn omp threads */
236: MPI_Comm omp_master_comm; /* a communicator to give to third party libraries */
237: PetscMPIInt omp_comm_size; /* size of omp_comm, a kind of OMP_NUM_THREADS */
238: PetscBool is_omp_master; /* true on rank 0 of omp_comm, i.e., the OMP master */
239: MPI_Win omp_win; /* a shared memory window containing a barrier */
240: pthread_barrier_t *barrier; /* pointer to the barrier */
241: hwloc_topology_t topology;
242: hwloc_cpuset_t cpuset; /* cpu bindings of omp master */
243: hwloc_cpuset_t omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
244: };
246: /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
247: contained by the controller.
249: PETSc OpenMP controller users do not call this function directly. This function exists
250: only because we want to separate shared memory allocation methods from other code.
251: */
252: PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
253: {
254: PetscErrorCode ierr;
255: MPI_Aint size;
256: void *baseptr;
257: pthread_barrierattr_t attr;
259: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
260: PetscInt fd;
261: PetscChar pathname[PETSC_MAX_PATH_LEN];
262: #else
263: PetscMPIInt disp_unit;
264: #endif
267: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
268: size = sizeof(pthread_barrier_t);
269: if (ctrl->is_omp_master) {
270: /* use PETSC_COMM_SELF in PetscGetTmp() since it is a collective call; using omp_comm would bcast the partially populated pathname to the slaves */
271: PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);
272: PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);
273: /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
274: fd = mkstemp(pathname); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n", pathname);
275: ftruncate(fd,size);
276: baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
277: close(fd);
278: MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);
279: /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
280: MPI_Barrier(ctrl->omp_comm);
281: unlink(pathname);
282: } else {
283: MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);
284: fd = open(pathname,O_RDWR); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n", pathname);
285: baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
286: close(fd);
287: MPI_Barrier(ctrl->omp_comm);
288: }
289: #else
290: size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
291: MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);
292: MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);
293: #endif
294: ctrl->barrier = (pthread_barrier_t*)baseptr;
296: /* omp master initializes the barrier */
297: if (ctrl->is_omp_master) {
298: MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);
299: pthread_barrierattr_init(&attr);
300: pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
301: pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);
302: pthread_barrierattr_destroy(&attr);
303: }
305: /* this MPI_Barrier makes sure the omp barrier is initialized before the slaves use it */
306: MPI_Barrier(ctrl->omp_comm);
307: return(0);
308: }
310: /* Destroy the pthread barrier in the PETSc OpenMP controller */
311: PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
312: {
316: /* this MPI_Barrier makes sure the slaves have finished using the omp barrier before the master destroys it */
317: MPI_Barrier(ctrl->omp_comm);
318: if (ctrl->is_omp_master) { pthread_barrier_destroy(ctrl->barrier); }
320: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
321: munmap(ctrl->barrier,sizeof(pthread_barrier_t));
322: #else
323: MPI_Win_free(&ctrl->omp_win);
324: #endif
325: return(0);
326: }
328: /*@C
329: PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries using OpenMP
331: Input Parameters:
332: + petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
333: - nthreads - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
335: Output Parameter:
336: . pctrl - a PETSc OpenMP controller
338: Level: developer
340: TODO: Possibly use the variable PetscNumOMPThreads to determine the number of threads to use
342: .seealso PetscOmpCtrlDestroy()
343: @*/
344: PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
345: {
346: PetscErrorCode ierr;
347: PetscOmpCtrl ctrl;
348: unsigned long *cpu_ulongs=NULL;
349: PetscInt i,nr_cpu_ulongs;
350: PetscShmComm pshmcomm;
351: MPI_Comm shm_comm;
352: PetscMPIInt shm_rank,shm_comm_size,omp_rank,color;
353: PetscInt num_packages,num_cores;
356: PetscNew(&ctrl);
358: /*=================================================================================
359: Init hwloc
360: ==================================================================================*/
361: hwloc_topology_init(&ctrl->topology);
362: #if HWLOC_API_VERSION >= 0x00020000
363: /* to filter out unneeded info and have faster hwloc_topology_load */
364: hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);
365: hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);
366: #endif
367: hwloc_topology_load(ctrl->topology);
369: /*=================================================================================
370: Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
371: physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
372: others are called slaves. OMP Masters make up a new comm called omp_master_comm,
373: which is usually passed to third party libraries.
374: ==================================================================================*/
376: /* fetch the stored shared memory communicator */
377: PetscShmCommGet(petsc_comm,&pshmcomm);
378: PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);
380: MPI_Comm_rank(shm_comm,&shm_rank);
381: MPI_Comm_size(shm_comm,&shm_comm_size);
383: /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
384: if (nthreads == -1) {
385: num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE);
386: num_cores = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);
387: nthreads = num_cores/num_packages;
388: if (nthreads > shm_comm_size) nthreads = shm_comm_size;
389: }
391: if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %D can not be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
392: if (shm_comm_size % nthreads) { PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %D is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size); }
394: /* split shm_comm into a set of omp_comms, each of size nthreads. For example, if
395:    shm_comm_size=16 and nthreads=8, then ranks 0-7 get color 0 and ranks 8-15 get
396:    color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
397:    be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
398:    Use 0 as the key so that the rank ordering won't change in the new comm.
399: */
400: color = shm_rank / nthreads;
401: MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);
403: /* put rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
404: MPI_Comm_rank(ctrl->omp_comm,&omp_rank);
405: if (!omp_rank) {
406: ctrl->is_omp_master = PETSC_TRUE; /* master */
407: color = 0;
408: } else {
409: ctrl->is_omp_master = PETSC_FALSE; /* slave */
410: color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
411: }
412: MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);
414: /*=================================================================================
415: Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
416: slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
417: and run them on the idle CPUs.
418: ==================================================================================*/
419: PetscOmpCtrlCreateBarrier(ctrl);
421: /*=================================================================================
422: omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
423: is the union of the bindings of all ranks in the omp_comm
424: =================================================================================*/
426: ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
427: hwloc_get_cpubind(ctrl->topology,ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
429: /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
430: nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset (ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
431: PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);
432: if (nr_cpu_ulongs == 1) {
433: cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
434: } else {
435: for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
436: }
438: MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);
440: if (ctrl->is_omp_master) {
441: ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
442: if (nr_cpu_ulongs == 1) {
443: #if HWLOC_API_VERSION >= 0x00020000
444: hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
445: #else
446: hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
447: #endif
448: } else {
449: for (i=0; i<nr_cpu_ulongs; i++) {
450: #if HWLOC_API_VERSION >= 0x00020000
451: hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
452: #else
453: hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
454: #endif
455: }
456: }
457: }
459: PetscFree(cpu_ulongs);
460: *pctrl = ctrl;
461: return(0);
462: }
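/*
   A minimal creation/destruction sketch (illustrative; nthreads = -1 lets PETSc pick a
   value as described above; error checking omitted, as in the rest of this listing):

     PetscOmpCtrl ctrl;

     PetscOmpCtrlCreate(PETSC_COMM_WORLD,-1,&ctrl);
     ...                                             use the controller; see PetscOmpCtrlBarrier() below
     PetscOmpCtrlDestroy(&ctrl);
*/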
464: /*@C
465: PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
467: Input Parameter:
468: . pctrl - a PETSc OpenMP controller
470: Level: developer
472: .seealso PetscOmpCtrlCreate()
473: @*/
474: PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
475: {
476: PetscErrorCode ierr;
477: PetscOmpCtrl ctrl = *pctrl;
480: hwloc_bitmap_free(ctrl->cpuset);
481: hwloc_topology_destroy(ctrl->topology);
482: PetscOmpCtrlDestroyBarrier(ctrl);
483: MPI_Comm_free(&ctrl->omp_comm);
484: if (ctrl->is_omp_master) {
485: hwloc_bitmap_free(ctrl->omp_cpuset);
486: MPI_Comm_free(&ctrl->omp_master_comm);
487: }
488: PetscFree(ctrl);
489: return(0);
490: }
492: /*@C
493: PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
495: Input Parameter:
496: . ctrl - a PETSc OMP controller
498: Output Parameters:
499: + omp_comm - a communicator that includes a master rank and slave ranks, on which the master spawns threads
500: . omp_master_comm - on master ranks, a communicator that includes the master ranks of all omp_comms;
501: on slave ranks, MPI_COMM_NULL is returned
502: - is_omp_master - true if the calling process is an OMP master rank.
504: Notes: any output parameter can be NULL; such parameters are simply ignored.
506: Level: developer
507: @*/
508: PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
509: {
511: if (omp_comm) *omp_comm = ctrl->omp_comm;
512: if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
513: if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
514: return(0);
515: }
517: /*@C
518: PetscOmpCtrlBarrier - Perform a barrier on the MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)
520: Input Parameter:
521: . ctrl - a PETSc OMP controller
523: Notes:
524: This is a pthread barrier on MPI processes. Using MPI_Barrier instead would be conceptually correct, but the MPI standard does not
525: require processes blocked by MPI_Barrier to free their CPUs so that other processes can progress. In practice, to minimize latency,
526: MPI processes stuck in MPI_Barrier keep polling and do not free their CPUs. In contrast, pthread_barrier has this requirement.
528: Code using PetscOmpCtrlBarrier() typically looks like this,
530: if (is_omp_master) {
531: PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
532: Call the library using OpenMP
533: PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
534: }
535: PetscOmpCtrlBarrier(ctrl);
537: Level: developer
539: .seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
540: @*/
541: PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
542: {
543: int err;
546: err = pthread_barrier_wait(ctrl->barrier);
547: if (err && err != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d\n", err);
548: return(0);
549: }
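/*
   A fuller sketch of the pattern shown in the manual page above, using
   PetscOmpCtrlGetOmpComms() to find out whether the calling rank is a master
   (ThirdPartyLibraryCall() is a hypothetical routine that spawns OpenMP threads;
   error checking omitted, as in the rest of this listing):

     PetscBool is_master;

     PetscOmpCtrlGetOmpComms(ctrl,NULL,NULL,&is_master);
     if (is_master) {
       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
       ThirdPartyLibraryCall();
       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
     }
     PetscOmpCtrlBarrier(ctrl);

   The slave ranks call PetscOmpCtrlBarrier() right away and sleep on the pthread barrier,
   freeing their cores for the OpenMP threads forked by the master.
*/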
551: /*@C
552: PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
554: Input Parameter:
555: . ctrl - a PETSc OMP controller
557: Notes:
558: Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
559: This function changes the CPU binding of the master ranks and the nthreads-var of the OpenMP runtime
561: Level: developer
563: .seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
564: @*/
565: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
566: {
570: hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);
571: omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
572: return(0);
573: }
575: /*@C
576: PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
578: Input Parameter:
579: . ctrl - a PETSc OMP controller
581: Notes:
582: Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
583: This function restores the CPU binding of the master ranks and sets the nthreads-var of the OpenMP runtime to 1.
585: Level: developer
587: .seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
588: @*/
589: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
590: {
594: hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);
595: omp_set_num_threads(1);
596: return(0);
597: }
599: #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
600: #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */