Actual source code: sfwindow.c

  1: #include <petsc/private/sfimpl.h>

  3: typedef struct _n_PetscSFDataLink *PetscSFDataLink;
  4: typedef struct _n_PetscSFWinLink  *PetscSFWinLink;

  6: typedef struct {
  7:   PetscSFWindowSyncType   sync;   /* FENCE, LOCK, or ACTIVE synchronization */
  8:   PetscSFDataLink         link;   /* List of MPI data types, lazily constructed for each data type */
  9:   PetscSFWinLink          wins;   /* List of active windows */
 10:   PetscSFWindowFlavorType flavor; /* Current PETSCSF_WINDOW_FLAVOR_ */
 11:   PetscSF                 dynsf;
 12:   MPI_Info                info;
 13: } PetscSF_Window;

 15: struct _n_PetscSFDataLink {
 16:   MPI_Datatype    unit;
 17:   MPI_Datatype    *mine;
 18:   MPI_Datatype    *remote;
 19:   PetscSFDataLink next;
 20: };

 22: struct _n_PetscSFWinLink {
 23:   PetscBool               inuse;
 24:   size_t                  bytes;
 25:   void                    *addr;
 26:   void                    *paddr;
 27:   MPI_Win                 win;
 28:   MPI_Request             *reqs;
 29:   PetscSFWindowFlavorType flavor;
 30:   MPI_Aint                *dyn_target_addr;
 31:   PetscBool               epoch;
 32:   PetscSFWinLink          next;
 33: };

 35: const char *const PetscSFWindowSyncTypes[] = {"FENCE","LOCK","ACTIVE","PetscSFWindowSyncType","PETSCSF_WINDOW_SYNC_",NULL};
 36: const char *const PetscSFWindowFlavorTypes[] = {"CREATE","DYNAMIC","ALLOCATE","SHARED","PetscSFWindowFlavorType","PETSCSF_WINDOW_FLAVOR_",NULL};

 38: /* Built-in MPI_Ops act elementwise inside MPI_Accumulate, but cannot be used with composite types inside collectives (MPI_Allreduce) */
 39: static PetscErrorCode PetscSFWindowOpTranslate(MPI_Op *op)
 40: {
 41:   if (*op == MPIU_SUM) *op = MPI_SUM;
 42:   else if (*op == MPIU_MAX) *op = MPI_MAX;
 43:   else if (*op == MPIU_MIN) *op = MPI_MIN;
 44:   return 0;
 45: }

 47: /*@C
 48:    PetscSFWindowGetDataTypes - gets composite local and remote data types for each rank

 50:    Not Collective

 52:    Input Parameters:
 53: +  sf - star forest
 54: -  unit - data type for each node

 56:    Output Parameters:
 57: +  localtypes - types describing part of local leaf buffer referencing each remote rank
 58: -  remotetypes - types describing part of remote root buffer referenced for each remote rank

 60:    Level: developer

 62: .seealso: PetscSFSetGraph(), PetscSFView()
 63: @*/
 64: static PetscErrorCode PetscSFWindowGetDataTypes(PetscSF sf,MPI_Datatype unit,const MPI_Datatype **localtypes,const MPI_Datatype **remotetypes)
 65: {
 66:   PetscSF_Window    *w = (PetscSF_Window*)sf->data;
 67:   PetscSFDataLink   link;
 68:   PetscInt          i,nranks;
 69:   const PetscInt    *roffset,*rmine,*rremote;
 70:   const PetscMPIInt *ranks;

 72:   /* Look for types in cache */
 73:   for (link=w->link; link; link=link->next) {
 74:     PetscBool match;
 75:     MPIPetsc_Type_compare(unit,link->unit,&match);
 76:     if (match) {
 77:       *localtypes  = link->mine;
 78:       *remotetypes = link->remote;
 79:       return 0;
 80:     }
 81:   }

 83:   /* Create new composite types for each send rank */
 84:   PetscSFGetRootRanks(sf,&nranks,&ranks,&roffset,&rmine,&rremote);
 85:   PetscNew(&link);
 86:   MPI_Type_dup(unit,&link->unit);
 87:   PetscMalloc2(nranks,&link->mine,nranks,&link->remote);
 88:   for (i=0; i<nranks; i++) {
 89:     PetscInt    rcount = roffset[i+1] - roffset[i];
 90:     PetscMPIInt *rmine,*rremote;
 91: #if !defined(PETSC_USE_64BIT_INDICES)
 92:     rmine   = sf->rmine + sf->roffset[i];
 93:     rremote = sf->rremote + sf->roffset[i];
 94: #else
 95:     PetscInt j;
 96:     PetscMalloc2(rcount,&rmine,rcount,&rremote);
 97:     for (j=0; j<rcount; j++) {
 98:       PetscMPIIntCast(sf->rmine[sf->roffset[i]+j],rmine+j);
 99:       PetscMPIIntCast(sf->rremote[sf->roffset[i]+j],rremote+j);
100:     }
101: #endif

103:     MPI_Type_create_indexed_block(rcount,1,rmine,link->unit,&link->mine[i]);
104:     MPI_Type_create_indexed_block(rcount,1,rremote,link->unit,&link->remote[i]);
105: #if defined(PETSC_USE_64BIT_INDICES)
106:     PetscFree2(rmine,rremote);
107: #endif
108:     MPI_Type_commit(&link->mine[i]);
109:     MPI_Type_commit(&link->remote[i]);
110:   }
111:   link->next = w->link;
112:   w->link    = link;

114:   *localtypes  = link->mine;
115:   *remotetypes = link->remote;
116:   return 0;
117: }
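
For reference, a self-contained sketch (with hypothetical indices) of the indexed-block construction used above: it builds a datatype that selects scattered entries of a buffer so a single MPI_Get or MPI_Accumulate can move them all.

      int          idx[3] = {0,3,4};   /* hypothetical offsets into the leaf buffer */
      MPI_Datatype seltype;
      MPI_Type_create_indexed_block(3,1,idx,MPI_DOUBLE,&seltype); /* 3 blocks of length 1 */
      MPI_Type_commit(&seltype);
      /* MPI_Get(leafbuf,1,seltype,...) now transfers exactly entries 0, 3, and 4 */
      MPI_Type_free(&seltype);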

119: /*@C
120:    PetscSFWindowSetFlavorType - Set flavor type for MPI_Win creation

122:    Logically Collective

124:    Input Parameters:
125: +  sf - star forest for communication
126: -  flavor - flavor type

128:    Options Database Key:
129: .  -sf_window_flavor <flavor> - sets the flavor type CREATE, DYNAMIC, ALLOCATE or SHARED (see PetscSFWindowFlavorType)

131:    Level: advanced

133:    Notes: Window reuse follows these rules:

135:      PETSCSF_WINDOW_FLAVOR_CREATE: creates a new window every time, uses MPI_Win_create

137:      PETSCSF_WINDOW_FLAVOR_DYNAMIC: uses MPI_Win_create_dynamic/MPI_Win_attach and tries to reuse windows by comparing the root array. Intended to be used on repeated applications of the same SF, e.g.
138:        for i=1 to K
139:          PetscSFOperationBegin(rootdata1,leafdata_whatever);
140:          PetscSFOperationEnd(rootdata1,leafdata_whatever);
141:          ...
142:          PetscSFOperationBegin(rootdataN,leafdata_whatever);
143:          PetscSFOperationEnd(rootdataN,leafdata_whatever);
144:        endfor
145:        The following pattern will instead raise an error
146:          PetscSFOperationBegin(rootdata1,leafdata_whatever);
147:          PetscSFOperationEnd(rootdata1,leafdata_whatever);
148:          PetscSFOperationBegin(rank ? rootdata1 : rootdata2,leafdata_whatever);
149:          PetscSFOperationEnd(rank ? rootdata1 : rootdata2,leafdata_whatever);

151:      PETSCSF_WINDOW_FLAVOR_ALLOCATE: uses MPI_Win_allocate, reuses any pre-existing window that fits the data and is not in use

153:      PETSCSF_WINDOW_FLAVOR_SHARED: uses MPI_Win_allocate_shared, with the same reuse policy as PETSCSF_WINDOW_FLAVOR_ALLOCATE

155: .seealso: PetscSFSetFromOptions(), PetscSFWindowGetFlavorType()
156: @*/
157: PetscErrorCode PetscSFWindowSetFlavorType(PetscSF sf,PetscSFWindowFlavorType flavor)
158: {
161:   PetscTryMethod(sf,"PetscSFWindowSetFlavorType_C",(PetscSF,PetscSFWindowFlavorType),(sf,flavor));
162:   return 0;
163: }
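
Typical usage (sketch, assuming sf was created with PetscSFCreate() on some communicator):

      PetscSFSetType(sf,PETSCSFWINDOW);                             /* select this implementation */
      PetscSFWindowSetFlavorType(sf,PETSCSF_WINDOW_FLAVOR_DYNAMIC); /* then choose a flavor       */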

165: static PetscErrorCode PetscSFWindowSetFlavorType_Window(PetscSF sf,PetscSFWindowFlavorType flavor)
166: {
167:   PetscSF_Window *w = (PetscSF_Window*)sf->data;

169:   w->flavor = flavor;
170:   return 0;
171: }

173: /*@C
174:    PetscSFWindowGetFlavorType - Get flavor type for PetscSF communication

176:    Logically Collective

178:    Input Parameter:
179: .  sf - star forest for communication

181:    Output Parameter:
182: .  flavor - flavor type

184:    Level: advanced

186: .seealso: PetscSFSetFromOptions(), PetscSFWindowSetFlavorType()
187: @*/
188: PetscErrorCode PetscSFWindowGetFlavorType(PetscSF sf,PetscSFWindowFlavorType *flavor)
189: {
192:   PetscUseMethod(sf,"PetscSFWindowGetFlavorType_C",(PetscSF,PetscSFWindowFlavorType*),(sf,flavor));
193:   return 0;
194: }

196: static PetscErrorCode PetscSFWindowGetFlavorType_Window(PetscSF sf,PetscSFWindowFlavorType *flavor)
197: {
198:   PetscSF_Window *w = (PetscSF_Window*)sf->data;

200:   *flavor = w->flavor;
201:   return 0;
202: }

204: /*@C
205:    PetscSFWindowSetSyncType - Set synchronization type for PetscSF communication

207:    Logically Collective

209:    Input Parameters:
210: +  sf - star forest for communication
211: -  sync - synchronization type

213:    Options Database Key:
214: .  -sf_window_sync <sync> - sets the synchronization type FENCE, LOCK, or ACTIVE (see PetscSFWindowSyncType)

216:    Level: advanced

218: .seealso: PetscSFSetFromOptions(), PetscSFWindowGetSyncType()
219: @*/
220: PetscErrorCode PetscSFWindowSetSyncType(PetscSF sf,PetscSFWindowSyncType sync)
221: {
224:   PetscTryMethod(sf,"PetscSFWindowSetSyncType_C",(PetscSF,PetscSFWindowSyncType),(sf,sync));
225:   return 0;
226: }
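
Usage sketch, again assuming sf already has type PETSCSFWINDOW:

      PetscSFWindowSetSyncType(sf,PETSCSF_WINDOW_SYNC_ACTIVE);
      /* equivalently on the command line: -sf_window_sync active */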

228: static PetscErrorCode PetscSFWindowSetSyncType_Window(PetscSF sf,PetscSFWindowSyncType sync)
229: {
230:   PetscSF_Window *w = (PetscSF_Window*)sf->data;

232:   w->sync = sync;
233:   return 0;
234: }

236: /*@C
237:    PetscSFWindowGetSyncType - Get synchronization type for PetscSF communication

239:    Logically Collective

241:    Input Parameter:
242: .  sf - star forest for communication

244:    Output Parameter:
245: .  sync - synchronization type

247:    Level: advanced

249: .seealso: PetscSFSetFromOptions(), PetscSFWindowSetSyncType()
250: @*/
251: PetscErrorCode PetscSFWindowGetSyncType(PetscSF sf,PetscSFWindowSyncType *sync)
252: {
255:   PetscUseMethod(sf,"PetscSFWindowGetSyncType_C",(PetscSF,PetscSFWindowSyncType*),(sf,sync));
256:   return 0;
257: }

259: static PetscErrorCode PetscSFWindowGetSyncType_Window(PetscSF sf,PetscSFWindowSyncType *sync)
260: {
261:   PetscSF_Window *w = (PetscSF_Window*)sf->data;

263:   *sync = w->sync;
264:   return 0;
265: }

267: /*@C
268:    PetscSFWindowSetInfo - Set the MPI_Info handle that will be used for subsequent windows allocation

270:    Logically Collective

272:    Input Parameters:
273: +  sf - star forest for communication
274: -  info - MPI_Info handle

276:    Level: advanced

278:    Notes: The info handle is duplicated with a call to MPI_Info_dup() unless info = MPI_INFO_NULL.

280: .seealso: PetscSFSetFromOptions(), PetscSFWindowGetInfo()
281: @*/
282: PetscErrorCode PetscSFWindowSetInfo(PetscSF sf,MPI_Info info)
283: {
285:   PetscTryMethod(sf,"PetscSFWindowSetInfo_C",(PetscSF,MPI_Info),(sf,info));
286:   return 0;
287: }
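
A sketch of attaching window hints; because the handle is duplicated, the caller may free its copy right away. Note that "no_locks" is a valid promise only if the LOCK synchronization type is not used.

      MPI_Info info;
      MPI_Info_create(&info);
      MPI_Info_set(info,"no_locks","true"); /* assert no passive-target lock/unlock epochs */
      PetscSFWindowSetInfo(sf,info);
      MPI_Info_free(&info);                 /* sf keeps its own duplicate */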

289: static PetscErrorCode PetscSFWindowSetInfo_Window(PetscSF sf,MPI_Info info)
290: {
291:   PetscSF_Window *w = (PetscSF_Window*)sf->data;

293:   if (w->info != MPI_INFO_NULL) {
294:     MPI_Info_free(&w->info);
295:   }
296:   if (info != MPI_INFO_NULL) {
297:     MPI_Info_dup(info,&w->info);
298:   }
299:   return 0;
300: }

302: /*@C
303:    PetscSFWindowGetInfo - Get the MPI_Info handle used for windows allocation

305:    Logically Collective

307:    Input Parameter:
308: .  sf - star forest for communication

310:    Output Parameter:
311: .  info - MPI_Info handle

313:    Level: advanced

315:    Notes: If PetscSFWindowSetInfo() has not been called, this returns MPI_INFO_NULL.

317: .seealso: PetscSFSetFromOptions(), PetscSFWindowSetInfo()
318: @*/
319: PetscErrorCode PetscSFWindowGetInfo(PetscSF sf,MPI_Info *info)
320: {
323:   PetscUseMethod(sf,"PetscSFWindowGetInfo_C",(PetscSF,MPI_Info*),(sf,info));
324:   return 0;
325: }

327: static PetscErrorCode PetscSFWindowGetInfo_Window(PetscSF sf,MPI_Info *info)
328: {
329:   PetscSF_Window *w = (PetscSF_Window*)sf->data;

331:   *info = w->info;
332:   return 0;
333: }

335: /*
336:    PetscSFGetWindow - Get a window for use with a given data type

338:    Collective on PetscSF

340:    Input Parameters:
341: +  sf - star forest
342: .  unit - data type
343: .  array - array to be sent
344: .  sync - type of synchronization PetscSFWindowSyncType
345: .  epoch - PETSC_TRUE to acquire the window and start an epoch, PETSC_FALSE to just acquire the window
346: .  fenceassert - assert parameter for call to MPI_Win_fence(), if sync == PETSCSF_WINDOW_SYNC_FENCE
347: .  postassert - assert parameter for call to MPI_Win_post(), if sync == PETSCSF_WINDOW_SYNC_ACTIVE
348: -  startassert - assert parameter for call to MPI_Win_start(), if sync == PETSCSF_WINDOW_SYNC_ACTIVE

350:    Output Parameters:
351: +  target_disp - target_disp argument for RMA calls (significant for PETSCSF_WINDOW_FLAVOR_DYNAMIC only)
352: .  reqs - array of requests (significant for sync == PETSCSF_WINDOW_SYNC_LOCK only)
353: -  win - window

355:    Level: developer
356: .seealso: PetscSFGetRootRanks(), PetscSFWindowGetDataTypes()
357: */
358: static PetscErrorCode PetscSFGetWindow(PetscSF sf,MPI_Datatype unit,void *array,PetscSFWindowSyncType sync,PetscBool epoch,PetscMPIInt fenceassert,PetscMPIInt postassert,PetscMPIInt startassert,const MPI_Aint **target_disp, MPI_Request **reqs, MPI_Win *win)
359: {
360:   PetscSF_Window *w = (PetscSF_Window*)sf->data;
361:   MPI_Aint       lb,lb_true,bytes,bytes_true;
362:   PetscSFWinLink link;
363: #if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
364:   MPI_Aint       winaddr;
365:   PetscInt       nranks;
366: #endif
367:   PetscBool      reuse = PETSC_FALSE, update = PETSC_FALSE;
368:   PetscBool      dummy[2];
369:   MPI_Aint       wsize;

371:   MPI_Type_get_extent(unit,&lb,&bytes);
372:   MPI_Type_get_true_extent(unit,&lb_true,&bytes_true);
375:   if (w->flavor != PETSCSF_WINDOW_FLAVOR_CREATE) reuse = PETSC_TRUE;
376:   for (link=w->wins; reuse && link; link=link->next) {
377:     PetscBool winok = PETSC_FALSE;
378:     if (w->flavor != link->flavor) continue;
379:     switch (w->flavor) {
380:     case PETSCSF_WINDOW_FLAVOR_DYNAMIC: /* check available matching array, error if in use (we additionally check that the matching condition is the same across processes) */
381:       if (array == link->addr) {
382:         if (PetscDefined(USE_DEBUG)) {
383:           dummy[0] = PETSC_TRUE;
384:           dummy[1] = PETSC_TRUE;
385:           MPI_Allreduce(MPI_IN_PLACE,dummy,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)sf));
386:           MPI_Allreduce(MPI_IN_PLACE,dummy+1,1,MPIU_BOOL,MPI_LOR ,PetscObjectComm((PetscObject)sf));
388:         }
391:         winok = PETSC_TRUE;
392:         link->paddr = array;
393:       } else if (PetscDefined(USE_DEBUG)) {
394:         dummy[0] = PETSC_FALSE;
395:         dummy[1] = PETSC_FALSE;
396:         MPI_Allreduce(MPI_IN_PLACE,dummy  ,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)sf));
397:         MPI_Allreduce(MPI_IN_PLACE,dummy+1,1,MPIU_BOOL,MPI_LOR ,PetscObjectComm((PetscObject)sf));
399:       }
400:       break;
401:     case PETSCSF_WINDOW_FLAVOR_ALLOCATE: /* check available by matching size, allocate if in use */
402:     case PETSCSF_WINDOW_FLAVOR_SHARED:
403:       if (!link->inuse && bytes == (MPI_Aint)link->bytes) {
404:         update = PETSC_TRUE;
405:         link->paddr = array;
406:         winok = PETSC_TRUE;
407:       }
408:       break;
409:     default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for flavor %s",PetscSFWindowFlavorTypes[w->flavor]);
410:     }
411:     if (winok) {
412:       *win = link->win;
413:       PetscInfo(sf,"Reusing window %" PETSC_MPI_WIN_FMT " of flavor %d for comm %" PETSC_MPI_COMM_FMT "\n",link->win,link->flavor,PetscObjectComm((PetscObject)sf));
414:       goto found;
415:     }
416:   }

418:   wsize = (MPI_Aint)bytes*sf->nroots;
419:   PetscNew(&link);
420:   link->bytes           = bytes;
421:   link->next            = w->wins;
422:   link->flavor          = w->flavor;
423:   link->dyn_target_addr = NULL;
424:   link->reqs            = NULL;
425:   w->wins               = link;
426:   if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
427:     PetscInt i;

429:     PetscMalloc1(sf->nranks,&link->reqs);
430:     for (i = 0; i < sf->nranks; i++) link->reqs[i] = MPI_REQUEST_NULL;
431:   }
432:   switch (w->flavor) {
433:   case PETSCSF_WINDOW_FLAVOR_CREATE:
434:     MPI_Win_create(array,wsize,(PetscMPIInt)bytes,w->info,PetscObjectComm((PetscObject)sf),&link->win);
435:     link->addr  = array;
436:     link->paddr = array;
437:     break;
438: #if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
439:   case PETSCSF_WINDOW_FLAVOR_DYNAMIC:
440:     MPI_Win_create_dynamic(w->info,PetscObjectComm((PetscObject)sf),&link->win);
441: #if defined(PETSC_HAVE_OMPI_MAJOR_VERSION) /* some OpenMPI versions do not support MPI_Win_attach(win,NULL,0); */
442:     MPI_Win_attach(link->win,wsize ? array : (void*)dummy,wsize);
443: #else
444:     MPI_Win_attach(link->win,array,wsize);
445: #endif
446:     link->addr  = array;
447:     link->paddr = array;
449:     PetscSFSetUp(w->dynsf);
450:     PetscSFGetRootRanks(w->dynsf,&nranks,NULL,NULL,NULL,NULL);
451:     PetscMalloc1(nranks,&link->dyn_target_addr);
452:     MPI_Get_address(array,&winaddr);
453:     PetscSFBcastBegin(w->dynsf,MPI_AINT,&winaddr,link->dyn_target_addr,MPI_REPLACE);
454:     PetscSFBcastEnd(w->dynsf,MPI_AINT,&winaddr,link->dyn_target_addr,MPI_REPLACE);
455:     break;
456:   case PETSCSF_WINDOW_FLAVOR_ALLOCATE:
457:     MPI_Win_allocate(wsize,(PetscMPIInt)bytes,w->info,PetscObjectComm((PetscObject)sf),&link->addr,&link->win);
458:     update = PETSC_TRUE;
459:     link->paddr = array;
460:     break;
461: #endif
462: #if defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY)
463:   case PETSCSF_WINDOW_FLAVOR_SHARED:
464:     MPI_Win_allocate_shared(wsize,(PetscMPIInt)bytes,w->info,PetscObjectComm((PetscObject)sf),&link->addr,&link->win);
465:     update = PETSC_TRUE;
466:     link->paddr = array;
467:     break;
468: #endif
469:   default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for flavor %s",PetscSFWindowFlavorTypes[w->flavor]);
470:   }
471:   PetscInfo(sf,"New window %" PETSC_MPI_WIN_FMT " of flavor %d for comm %" PETSC_MPI_COMM_FMT "\n",link->win,link->flavor,PetscObjectComm((PetscObject)sf));
472:   *win = link->win;

474: found:

476:   if (target_disp) *target_disp = link->dyn_target_addr;
477:   if (reqs) *reqs = link->reqs;
478:   if (update) { /* locks are needed for the "separate" memory model only, the fence guarantees memory synchronization */
479:     PetscMPIInt rank;

481:     MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);
482:     if (sync == PETSCSF_WINDOW_SYNC_LOCK) MPI_Win_lock(MPI_LOCK_EXCLUSIVE,rank,MPI_MODE_NOCHECK,*win);
483:     PetscMemcpy(link->addr,array,sf->nroots*bytes);
484:     if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
485:       MPI_Win_unlock(rank,*win);
486:       MPI_Win_fence(0,*win);
487:     }
488:   }
489:   link->inuse = PETSC_TRUE;
490:   link->epoch = epoch;
491:   if (epoch) {
492:     switch (sync) {
493:     case PETSCSF_WINDOW_SYNC_FENCE:
494:       MPI_Win_fence(fenceassert,*win);
495:       break;
496:     case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
497:       break;
498:     case PETSCSF_WINDOW_SYNC_ACTIVE: {
499:       MPI_Group   ingroup,outgroup;
500:       PetscMPIInt isize,osize;

502:       /* OpenMPI 4.0.2 with btl=vader does not like calling
503:          - MPI_Win_complete when ogroup is empty
504:          - MPI_Win_wait when igroup is empty
505:          So, we do not even issue the corresponding start and post calls
506:          The MPI standard (Sec. 11.5.2 of MPI 3.1) only requires that
507:          start(outgroup) has a matching post(ingroup)
508:          and this is guaranteed by PetscSF
509:       */
510:       PetscSFGetGroups(sf,&ingroup,&outgroup);
511:       MPI_Group_size(ingroup,&isize);
512:       MPI_Group_size(outgroup,&osize);
513:       if (isize) MPI_Win_post(ingroup,postassert,*win);
514:       if (osize) MPI_Win_start(outgroup,startassert,*win);
515:     } break;
516:     default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_PLIB,"Unknown synchronization type");
517:     }
518:   }
519:   return 0;
520: }
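
Paired with PetscSFRestoreWindow() below, this follows the usual RMA epoch shape; a condensed FENCE-mode sketch using the assert flags that the broadcast path later in this file passes:

      PetscSFGetWindow(sf,unit,rootarray,PETSCSF_WINDOW_SYNC_FENCE,PETSC_TRUE,
                       MPI_MODE_NOPUT|MPI_MODE_NOPRECEDE,0,0,&disp,&reqs,&win); /* opening fence */
      /* ... MPI_Get()/MPI_Accumulate() calls targeting win ... */
      PetscSFRestoreWindow(sf,unit,rootarray,PETSCSF_WINDOW_SYNC_FENCE,PETSC_TRUE,
                           MPI_MODE_NOSTORE|MPI_MODE_NOSUCCEED,PETSC_FALSE,&win); /* closing fence */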

522: /*
523:    PetscSFFindWindow - Finds a window that is already in use

525:    Not Collective

527:    Input Parameters:
528: +  sf - star forest
529: .  unit - data type
530: -  array - array with which the window is associated

532:    Output Parameters:
533: +  win - window
534: -  reqs - outstanding requests associated with the window

536:    Level: developer

538: .seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
539: */
540: static PetscErrorCode PetscSFFindWindow(PetscSF sf,MPI_Datatype unit,const void *array,MPI_Win *win,MPI_Request **reqs)
541: {
542:   PetscSF_Window *w = (PetscSF_Window*)sf->data;
543:   PetscSFWinLink link;

545:   *win = MPI_WIN_NULL;
546:   for (link=w->wins; link; link=link->next) {
547:     if (array == link->paddr) {

549:       PetscInfo(sf,"Window %" PETSC_MPI_WIN_FMT " of flavor %d for comm %" PETSC_MPI_COMM_FMT "\n",link->win,link->flavor,PetscObjectComm((PetscObject)sf));
550:       *win = link->win;
551:       *reqs = link->reqs;
552:       return 0;
553:     }
554:   }
555:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Requested window not in use");
556: }

558: /*
559:    PetscSFRestoreWindow - Restores a window obtained with PetscSFGetWindow()

561:    Collective

563:    Input Parameters:
564: +  sf - star forest
565: .  unit - data type
566: .  array - array associated with window
567: .  sync - type of synchronization PetscSFWindowSyncType
568: .  epoch - close an epoch, must match argument to PetscSFGetWindow()
569: .  update - whether the local array must be updated from the window memory
570: -  win - window

572:    Level: developer

574: .seealso: PetscSFFindWindow()
575: */
576: static PetscErrorCode PetscSFRestoreWindow(PetscSF sf,MPI_Datatype unit,void *array,PetscSFWindowSyncType sync,PetscBool epoch,PetscMPIInt fenceassert,PetscBool update,MPI_Win *win)
577: {
578:   PetscSF_Window          *w = (PetscSF_Window*)sf->data;
579:   PetscSFWinLink          *p,link;
580:   PetscBool               reuse = PETSC_FALSE;
581:   PetscSFWindowFlavorType flavor;
582:   void*                   laddr;
583:   size_t                  bytes;

585:   for (p=&w->wins; *p; p=&(*p)->next) {
586:     link = *p;
587:     if (*win == link->win) {
589:       if (epoch != link->epoch) {
590:         if (epoch) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"No epoch to end");
591:         else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Restoring window without ending epoch");
592:       }
593:       laddr = link->addr;
594:       flavor = link->flavor;
595:       bytes = link->bytes;
596:       if (flavor != PETSCSF_WINDOW_FLAVOR_CREATE) reuse = PETSC_TRUE;
597:       else { *p = link->next; update = PETSC_FALSE; } /* remove from list */
598:       goto found;
599:     }
600:   }
601:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Requested window not in use");

603: found:
604:   PetscInfo(sf,"Window %" PETSC_MPI_WIN_FMT " of flavor %d for comm %" PETSC_MPI_COMM_FMT "\n",link->win,link->flavor,PetscObjectComm((PetscObject)sf));
605:   if (epoch) {
606:     switch (sync) {
607:     case PETSCSF_WINDOW_SYNC_FENCE:
608:       MPI_Win_fence(fenceassert,*win);
609:       break;
610:     case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
611:       break;
612:     case PETSCSF_WINDOW_SYNC_ACTIVE: {
613:       MPI_Group   ingroup,outgroup;
614:       PetscMPIInt isize,osize;

616:       /* OpenMPI 4.0.2 with btl=vader does not like calling
617:          - MPI_Win_complete when ogroup is empty
618:          - MPI_Win_wait when igroup is empty
619:          The MPI standard (Sec. 11.5.2 of MPI 3.1) only requires that
620:          - each process that issues a call to MPI_Win_start issues a call to MPI_Win_complete
621:          - each process that issues a call to MPI_Win_post issues a call to MPI_Win_wait
622:       */
623:       PetscSFGetGroups(sf,&ingroup,&outgroup);
624:       MPI_Group_size(ingroup,&isize);
625:       MPI_Group_size(outgroup,&osize);
626:       if (osize) MPI_Win_complete(*win);
627:       if (isize) MPI_Win_wait(*win);
628:     } break;
629:     default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_PLIB,"Unknown synchronization type");
630:     }
631:   }
632:   if (update) {
633:     if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
634:       MPI_Win_fence(MPI_MODE_NOPUT|MPI_MODE_NOSUCCEED,*win);
635:     }
636:     PetscMemcpy(array,laddr,sf->nroots*bytes);
637:   }
638:   link->epoch = PETSC_FALSE;
639:   link->inuse = PETSC_FALSE;
640:   link->paddr = NULL;
641:   if (!reuse) {
642:     PetscFree(link->dyn_target_addr);
643:     PetscFree(link->reqs);
644:     MPI_Win_free(&link->win);
645:     PetscFree(link);
646:     *win = MPI_WIN_NULL;
647:   }
648:   return 0;
649: }

651: static PetscErrorCode PetscSFSetUp_Window(PetscSF sf)
652: {
653:   PetscSF_Window *w = (PetscSF_Window*)sf->data;
654:   MPI_Group      ingroup,outgroup;

656:   PetscSFSetUpRanks(sf,MPI_GROUP_EMPTY);
657:   if (!w->dynsf) {
658:     PetscInt    i;
659:     PetscSFNode *remotes;

661:     PetscMalloc1(sf->nranks,&remotes);
662:     for (i=0;i<sf->nranks;i++) {
663:       remotes[i].rank  = sf->ranks[i];
664:       remotes[i].index = 0;
665:     }
666:     PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&w->dynsf);
667:     PetscSFWindowSetFlavorType(w->dynsf,PETSCSF_WINDOW_FLAVOR_CREATE); /* break recursion */
668:     PetscSFSetGraph(w->dynsf,1,sf->nranks,NULL,PETSC_OWN_POINTER,remotes,PETSC_OWN_POINTER);
669:     PetscLogObjectParent((PetscObject)sf,(PetscObject)w->dynsf);
670:   }
671:   switch (w->sync) {
672:   case PETSCSF_WINDOW_SYNC_ACTIVE:
673:     PetscSFGetGroups(sf,&ingroup,&outgroup);
674:   default:
675:     break;
676:   }
677:   return 0;
678: }

680: static PetscErrorCode PetscSFSetFromOptions_Window(PetscOptionItems *PetscOptionsObject,PetscSF sf)
681: {
682:   PetscSF_Window          *w = (PetscSF_Window*)sf->data;
683:   PetscSFWindowFlavorType flavor = w->flavor;

685:   PetscOptionsHead(PetscOptionsObject,"PetscSF Window options");
686:   PetscOptionsEnum("-sf_window_sync","synchronization type to use for PetscSF Window communication","PetscSFWindowSetSyncType",PetscSFWindowSyncTypes,(PetscEnum)w->sync,(PetscEnum*)&w->sync,NULL);
687:   PetscOptionsEnum("-sf_window_flavor","flavor to use for PetscSF Window creation","PetscSFWindowSetFlavorType",PetscSFWindowFlavorTypes,(PetscEnum)flavor,(PetscEnum*)&flavor,NULL);
688:   PetscSFWindowSetFlavorType(sf,flavor);
689:   PetscOptionsTail();
690:   return 0;
691: }

693: static PetscErrorCode PetscSFReset_Window(PetscSF sf)
694: {
695:   PetscSF_Window  *w = (PetscSF_Window*)sf->data;
696:   PetscSFDataLink link,next;
697:   PetscSFWinLink  wlink,wnext;
698:   PetscInt        i;

700:   for (link=w->link; link; link=next) {
701:     next = link->next;
702:     MPI_Type_free(&link->unit);
703:     for (i=0; i<sf->nranks; i++) {
704:       MPI_Type_free(&link->mine[i]);
705:       MPI_Type_free(&link->remote[i]);
706:     }
707:     PetscFree2(link->mine,link->remote);
708:     PetscFree(link);
709:   }
710:   w->link = NULL;
711:   for (wlink=w->wins; wlink; wlink=wnext) {
712:     wnext = wlink->next;
714:     PetscFree(wlink->dyn_target_addr);
715:     PetscFree(wlink->reqs);
716:     MPI_Win_free(&wlink->win);
717:     PetscFree(wlink);
718:   }
719:   w->wins = NULL;
720:   PetscSFDestroy(&w->dynsf);
721:   if (w->info != MPI_INFO_NULL) {
722:     MPI_Info_free(&w->info);
723:   }
724:   return 0;
725: }

727: static PetscErrorCode PetscSFDestroy_Window(PetscSF sf)
728: {
729:   PetscSFReset_Window(sf);
730:   PetscFree(sf->data);
731:   PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetSyncType_C",NULL);
732:   PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetSyncType_C",NULL);
733:   PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetFlavorType_C",NULL);
734:   PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetFlavorType_C",NULL);
735:   PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetInfo_C",NULL);
736:   PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetInfo_C",NULL);
737:   return 0;
738: }

740: static PetscErrorCode PetscSFView_Window(PetscSF sf,PetscViewer viewer)
741: {
742:   PetscSF_Window    *w = (PetscSF_Window*)sf->data;
743:   PetscBool         iascii;
744:   PetscViewerFormat format;

746:   PetscViewerGetFormat(viewer,&format);
747:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
748:   if (iascii) {
749:     PetscViewerASCIIPrintf(viewer,"  current flavor=%s synchronization=%s MultiSF sort=%s\n",PetscSFWindowFlavorTypes[w->flavor],PetscSFWindowSyncTypes[w->sync],sf->rankorder ? "rank-order" : "unordered");
750:     if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
751:       if (w->info != MPI_INFO_NULL) {
752:         PetscMPIInt k,nkeys;
753:         char        key[MPI_MAX_INFO_KEY], value[MPI_MAX_INFO_VAL];

755:         MPI_Info_get_nkeys(w->info,&nkeys);
756:         PetscViewerASCIIPrintf(viewer,"    current info with %d keys. Ordered key-value pairs follow:\n",nkeys);
757:         for (k = 0; k < nkeys; k++) {
758:           PetscMPIInt flag;

760:           MPI_Info_get_nthkey(w->info,k,key);
761:           MPI_Info_get(w->info,key,MPI_MAX_INFO_VAL,value,&flag);
763:           PetscViewerASCIIPrintf(viewer,"      %s = %s\n",key,value);
764:         }
765:       } else {
766:         PetscViewerASCIIPrintf(viewer,"    current info=MPI_INFO_NULL\n");
767:       }
768:     }
769:   }
770:   return 0;
771: }

773: static PetscErrorCode PetscSFDuplicate_Window(PetscSF sf,PetscSFDuplicateOption opt,PetscSF newsf)
774: {
775:   PetscSF_Window        *w = (PetscSF_Window*)sf->data;
776:   PetscSFWindowSyncType synctype;

778:   synctype = w->sync;
779:   /* HACK: Must use FENCE or LOCK when called from PetscSFGetGroups() because ACTIVE here would cause recursion. */
780:   if (!sf->setupcalled) synctype = PETSCSF_WINDOW_SYNC_LOCK;
781:   PetscSFWindowSetSyncType(newsf,synctype);
782:   PetscSFWindowSetFlavorType(newsf,w->flavor);
783:   PetscSFWindowSetInfo(newsf,w->info);
784:   return 0;
785: }

787: static PetscErrorCode PetscSFBcastBegin_Window(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
788: {
789:   PetscSF_Window     *w = (PetscSF_Window*)sf->data;
790:   PetscInt           i,nranks;
791:   const PetscMPIInt  *ranks;
792:   const MPI_Aint     *target_disp;
793:   const MPI_Datatype *mine,*remote;
794:   MPI_Request        *reqs;
795:   MPI_Win            win;

798:   PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL);
799:   PetscSFWindowGetDataTypes(sf,unit,&mine,&remote);
800:   PetscSFGetWindow(sf,unit,(void*)rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOPUT|MPI_MODE_NOPRECEDE,MPI_MODE_NOPUT,0,&target_disp,&reqs,&win);
801:   for (i=0; i<nranks; i++) {
802:     MPI_Aint tdp = target_disp ? target_disp[i] : 0;

804:     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {
805:       MPI_Win_lock(MPI_LOCK_SHARED,ranks[i],MPI_MODE_NOCHECK,win);
806: #if defined(PETSC_HAVE_MPI_RGET)
807:       MPI_Rget(leafdata,1,mine[i],ranks[i],tdp,1,remote[i],win,&reqs[i]);
808: #else
809:       MPI_Get(leafdata,1,mine[i],ranks[i],tdp,1,remote[i],win);
810: #endif
811:     } else {
812:       MPI_Get(leafdata,1,mine[i],ranks[i],tdp,1,remote[i],win);
813:     }
814:   }
815:   return 0;
816: }

818: PetscErrorCode PetscSFBcastEnd_Window(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
819: {
820:   PetscSF_Window *w = (PetscSF_Window*)sf->data;
821:   MPI_Win        win;
822:   MPI_Request    *reqs = NULL;

824:   PetscSFFindWindow(sf,unit,rootdata,&win,&reqs);
825:   if (reqs) MPI_Waitall(sf->nranks,reqs,MPI_STATUSES_IGNORE);
826:   if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {
827:     PetscInt           i,nranks;
828:     const PetscMPIInt  *ranks;

830:     PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL);
831:     for (i=0; i<nranks; i++) {
832:       MPI_Win_unlock(ranks[i],win);
833:     }
834:   }
835:   PetscSFRestoreWindow(sf,unit,(void*)rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOSTORE|MPI_MODE_NOSUCCEED,PETSC_FALSE,&win);
836:   return 0;
837: }
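
From user code the Begin/End pair above is reached through the generic PetscSF interface; a sketch assuming rootdata and leafdata are allocated to match the graph layout:

      PetscSFBcastBegin(sf,MPIU_SCALAR,rootdata,leafdata,MPI_REPLACE);
      /* unrelated work can overlap here while the RMA epoch is open */
      PetscSFBcastEnd(sf,MPIU_SCALAR,rootdata,leafdata,MPI_REPLACE);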

839: PetscErrorCode PetscSFReduceBegin_Window(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
840: {
841:   PetscSF_Window     *w = (PetscSF_Window*)sf->data;
842:   PetscInt           i,nranks;
843:   const PetscMPIInt  *ranks;
844:   const MPI_Aint     *target_disp;
845:   const MPI_Datatype *mine,*remote;
846:   MPI_Win            win;

848:   PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL);
849:   PetscSFWindowGetDataTypes(sf,unit,&mine,&remote);
850:   PetscSFWindowOpTranslate(&op);
851:   PetscSFGetWindow(sf,unit,rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOPRECEDE,0,0,&target_disp,NULL,&win);
852:   for (i=0; i<nranks; i++) {
853:     MPI_Aint tdp = target_disp ? target_disp[i] : 0;

855:     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) MPI_Win_lock(MPI_LOCK_SHARED,ranks[i],MPI_MODE_NOCHECK,win);
856:     MPI_Accumulate((void*)leafdata,1,mine[i],ranks[i],tdp,1,remote[i],op,win);
857:     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) MPI_Win_unlock(ranks[i],win);
858:   }
859:   return 0;
860: }

862: static PetscErrorCode PetscSFReduceEnd_Window(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
863: {
864:   PetscSF_Window *w = (PetscSF_Window*)sf->data;
865:   MPI_Win        win;
866:   MPI_Request    *reqs = NULL;

868:   PetscSFFindWindow(sf,unit,rootdata,&win,&reqs);
869:   if (reqs) MPI_Waitall(sf->nranks,reqs,MPI_STATUSES_IGNORE);
870:   PetscSFRestoreWindow(sf,unit,rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOSUCCEED,PETSC_TRUE,&win);
871:   return 0;
872: }
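
The matching user-level reduction (sketch); MPIU_SUM is translated to a built-in op internally, as shown earlier:

      PetscSFReduceBegin(sf,MPIU_SCALAR,leafdata,rootdata,MPIU_SUM);
      PetscSFReduceEnd(sf,MPIU_SCALAR,leafdata,rootdata,MPIU_SUM);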

874: static PetscErrorCode PetscSFFetchAndOpBegin_Window(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,void *rootdata,PetscMemType leafmtype,const void *leafdata,void *leafupdate,MPI_Op op)
875: {
876:   PetscInt           i,nranks;
877:   const PetscMPIInt  *ranks;
878:   const MPI_Datatype *mine,*remote;
879:   const MPI_Aint     *target_disp;
880:   MPI_Win            win;
881:   PetscSF_Window     *w = (PetscSF_Window*)sf->data;
882: #if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
883:   PetscSFWindowFlavorType oldf;
884: #endif

886:   PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL);
887:   PetscSFWindowGetDataTypes(sf,unit,&mine,&remote);
888:   PetscSFWindowOpTranslate(&op);
889: #if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
890:   /* FetchAndOp without MPI_Get_accumulate requires locking.
891:      We create a new window every time so as not to interfere with a user-supplied MPI_Info that may have set "no_locks"="true" */
892:   oldf = w->flavor;
893:   w->flavor = PETSCSF_WINDOW_FLAVOR_CREATE;
894:   PetscSFGetWindow(sf,unit,rootdata,PETSCSF_WINDOW_SYNC_LOCK,PETSC_FALSE,0,0,0,&target_disp,NULL,&win);
895: #else
896:   PetscSFGetWindow(sf,unit,rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOPRECEDE,0,0,&target_disp,NULL,&win);
897: #endif
898:   for (i=0; i<nranks; i++) {
899:     MPI_Aint tdp = target_disp ? target_disp[i] : 0;

901: #if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
902:     MPI_Win_lock(MPI_LOCK_EXCLUSIVE,ranks[i],0,win);
903:     MPI_Get(leafupdate,1,mine[i],ranks[i],tdp,1,remote[i],win);
904:     MPI_Accumulate((void*)leafdata,1,mine[i],ranks[i],tdp,1,remote[i],op,win);
905:     MPI_Win_unlock(ranks[i],win);
906: #else
907:     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) MPI_Win_lock(MPI_LOCK_SHARED,ranks[i],0,win);
908:     MPI_Get_accumulate((void*)leafdata,1,mine[i],leafupdate,1,mine[i],ranks[i],tdp,1,remote[i],op,win);
909:     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) MPI_Win_unlock(ranks[i],win);
910: #endif
911:   }
912: #if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
913:   w->flavor = oldf;
914: #endif
915:   return 0;
916: }

918: static PetscErrorCode PetscSFFetchAndOpEnd_Window(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
919: {
920:   MPI_Win        win;
921: #if defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
922:   PetscSF_Window *w = (PetscSF_Window*)sf->data;
923: #endif
924:   MPI_Request    *reqs = NULL;

926:   PetscSFFindWindow(sf,unit,rootdata,&win,&reqs);
927:   if (reqs) MPI_Waitall(sf->nranks,reqs,MPI_STATUSES_IGNORE);
928: #if defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
929:   PetscSFRestoreWindow(sf,unit,rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOSUCCEED,PETSC_TRUE,&win);
930: #else
931:   PetscSFRestoreWindow(sf,unit,rootdata,PETSCSF_WINDOW_SYNC_LOCK,PETSC_FALSE,0,PETSC_TRUE,&win);
932: #endif
933:   return 0;
934: }
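
User-level sketch of fetch-and-op: leafupdate receives the root values as they were before the accumulation, the usual idiom for handing out unique offsets.

      PetscSFFetchAndOpBegin(sf,MPIU_INT,rootdata,leafdata,leafupdate,MPIU_SUM);
      PetscSFFetchAndOpEnd(sf,MPIU_INT,rootdata,leafdata,leafupdate,MPIU_SUM);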

936: PETSC_INTERN PetscErrorCode PetscSFCreate_Window(PetscSF sf)
937: {
938:   PetscSF_Window *w = (PetscSF_Window*)sf->data;

940:   sf->ops->SetUp           = PetscSFSetUp_Window;
941:   sf->ops->SetFromOptions  = PetscSFSetFromOptions_Window;
942:   sf->ops->Reset           = PetscSFReset_Window;
943:   sf->ops->Destroy         = PetscSFDestroy_Window;
944:   sf->ops->View            = PetscSFView_Window;
945:   sf->ops->Duplicate       = PetscSFDuplicate_Window;
946:   sf->ops->BcastBegin      = PetscSFBcastBegin_Window;
947:   sf->ops->BcastEnd        = PetscSFBcastEnd_Window;
948:   sf->ops->ReduceBegin     = PetscSFReduceBegin_Window;
949:   sf->ops->ReduceEnd       = PetscSFReduceEnd_Window;
950:   sf->ops->FetchAndOpBegin = PetscSFFetchAndOpBegin_Window;
951:   sf->ops->FetchAndOpEnd   = PetscSFFetchAndOpEnd_Window;

953:   PetscNewLog(sf,&w);
954:   sf->data  = (void*)w;
955:   w->sync   = PETSCSF_WINDOW_SYNC_FENCE;
956:   w->flavor = PETSCSF_WINDOW_FLAVOR_CREATE;
957:   w->info   = MPI_INFO_NULL;

959:   PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetSyncType_C",PetscSFWindowSetSyncType_Window);
960:   PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetSyncType_C",PetscSFWindowGetSyncType_Window);
961:   PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetFlavorType_C",PetscSFWindowSetFlavorType_Window);
962:   PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetFlavorType_C",PetscSFWindowGetFlavorType_Window);
963:   PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetInfo_C",PetscSFWindowSetInfo_Window);
964:   PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetInfo_C",PetscSFWindowGetInfo_Window);

966: #if defined(OMPI_MAJOR_VERSION) && (OMPI_MAJOR_VERSION < 1 || (OMPI_MAJOR_VERSION == 1 && OMPI_MINOR_VERSION <= 6))
967:   {
968:     PetscBool ackbug = PETSC_FALSE;
969:     PetscOptionsGetBool(NULL,NULL,"-acknowledge_ompi_onesided_bug",&ackbug,NULL);
970:     if (ackbug) {
971:       PetscInfo(sf,"Acknowledged Open MPI bug, proceeding anyway. Expect memory corruption.\n");
972:     } else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_LIB,"Open MPI is known to be buggy (https://svn.open-mpi.org/trac/ompi/ticket/1905 and 2656), use -acknowledge_ompi_onesided_bug to proceed");
973:   }
974: #endif
975:   return 0;
976: }
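
Putting it together, a minimal driver (sketch; graph construction elided) that selects this implementation:

      PetscSF sf;
      PetscSFCreate(PETSC_COMM_WORLD,&sf);
      PetscSFSetType(sf,PETSCSFWINDOW);
      PetscSFSetFromOptions(sf);   /* honors -sf_window_sync and -sf_window_flavor */
      /* PetscSFSetGraph(sf,nroots,nleaves,ilocal,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER); */
      /* ... PetscSFBcastBegin/End, PetscSFReduceBegin/End ... */
      PetscSFDestroy(&sf);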