Actual source code: mpits.c

petsc-3.10.5 2019-03-28
  1:  #include <petscsys.h>
  2:  #include <petsc/private/petscimpl.h>

  4: PetscLogEvent PETSC_BuildTwoSided;
  5: PetscLogEvent PETSC_BuildTwoSidedF;

  7: const char *const PetscBuildTwoSidedTypes[] = {
  8:   "ALLREDUCE",
  9:   "IBARRIER",
 10:   "REDSCATTER",
 11:   "PetscBuildTwoSidedType",
 12:   "PETSC_BUILDTWOSIDED_",
 13:   0
 14: };

 16: static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;

 18: /*@
 19:    PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication

 21:    Logically Collective

 23:    Input Arguments:
 24: +  comm - communicator on which to set the algorithm (the setting is currently global, so PETSC_COMM_WORLD is typical)
 25: -  twosided - algorithm to use in subsequent calls to PetscCommBuildTwoSided()

 27:    Level: developer

 29:    Note:
 30:    This option is currently global, but could be made per-communicator.

 32: .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedGetType()
 33: @*/
 34: PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm,PetscBuildTwoSidedType twosided)
 35: {
 37: #if defined(PETSC_USE_DEBUG)
 38:   {
 39:     PetscMPIInt ierr;
 40:     PetscMPIInt b1[2],b2[2];
 41:     b1[0] = -(PetscMPIInt)twosided;
 42:     b1[1] = (PetscMPIInt)twosided;
 43:     MPIU_Allreduce(b1,b2,2,MPI_INT,MPI_MAX,comm);
 44:     if (-b2[0] != b2[1]) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"Enum value must be same on all processes");
 45:   }
 46: #endif
 47:   _twosided_type = twosided;
 48:   return(0);
 49: }
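
As a usage sketch (hypothetical helper, not part of mpits.c): forcing the Allreduce-based setup programmatically, which affects all subsequent PetscCommBuildTwoSided() calls and is equivalent to running with -build_twosided allreduce.

#include <petscsys.h>

static PetscErrorCode ForceAllreduceTwoSided(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* The setting is global; it applies to all later PetscCommBuildTwoSided() calls */
  ierr = PetscCommBuildTwoSidedSetType(PETSC_COMM_WORLD,PETSC_BUILDTWOSIDED_ALLREDUCE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}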

 51: /*@
 52:    PetscCommBuildTwoSidedGetType - get the algorithm to use when building two-sided communication

 54:    Logically Collective

 56:    Input Argument:
 57: .  comm - communicator on which to query the algorithm

 58:    Output Argument:
 59: .  twosided - algorithm that will be used by PetscCommBuildTwoSided()

 60:    Level: developer

 62: .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedSetType()
 63: @*/
 64: PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm,PetscBuildTwoSidedType *twosided)
 65: {

 69:   *twosided = PETSC_BUILDTWOSIDED_NOTSET;
 70:   if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
 71: #if defined(PETSC_HAVE_MPI_IBARRIER)
 72: #  if defined(PETSC_HAVE_MPICH_CH3_SOCK) && !defined(PETSC_HAVE_MPICH_CH3_SOCK_FIXED_NBC_PROGRESS)
 73:     /* Deadlock in Ibarrier: http://trac.mpich.org/projects/mpich/ticket/1785 */
 74:     _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE;
 75: #  else
 76:     _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
 77: #  endif
 78: #else
 79:     _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE;
 80: #endif
 81:     PetscOptionsGetEnum(NULL,NULL,"-build_twosided",PetscBuildTwoSidedTypes,(PetscEnum*)&_twosided_type,NULL);
 82:   }
 83:   *twosided = _twosided_type;
 84:   return(0);
 85: }
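
A companion sketch (hypothetical helper, not part of mpits.c): querying which algorithm will be used. The first call resolves the default and reads -build_twosided, so the returned value indexes the PetscBuildTwoSidedTypes[] name table above.

#include <petscsys.h>

static PetscErrorCode ReportTwoSidedType(MPI_Comm comm)
{
  PetscErrorCode         ierr;
  PetscBuildTwoSidedType t;

  PetscFunctionBegin;
  ierr = PetscCommBuildTwoSidedGetType(comm,&t);CHKERRQ(ierr);
  /* After GetType, t is one of ALLREDUCE, IBARRIER, or REDSCATTER */
  ierr = PetscInfo1(NULL,"Two-sided build type: %s\n",PetscBuildTwoSidedTypes[t]);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}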

 87: #if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)

 89: static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
 90: {
 92:   PetscMPIInt    nrecvs,tag,done,i;
 93:   MPI_Aint       lb,unitbytes;
 94:   char           *tdata;
 95:   MPI_Request    *sendreqs,barrier;
 96:   PetscSegBuffer segrank,segdata;
 97:   PetscBool      barrier_started;

100:   PetscCommDuplicate(comm,&comm,&tag);
101:   MPI_Type_get_extent(dtype,&lb,&unitbytes);
102:   if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
103:   tdata = (char*)todata;
104:   PetscMalloc1(nto,&sendreqs);
105:   for (i=0; i<nto; i++) {
106:     MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);
107:   }
108:   PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);
109:   PetscSegBufferCreate(unitbytes,4*count,&segdata);

111:   nrecvs  = 0;
112:   barrier = MPI_REQUEST_NULL;
113:   /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
114:    * but we need to work around it. */
115:   barrier_started = PETSC_FALSE;
116:   for (done=0; !done; ) {
117:     PetscMPIInt flag;
118:     MPI_Status  status;
119:     MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);
120:     if (flag) {                 /* incoming message */
121:       PetscMPIInt *recvrank;
122:       void        *buf;
123:       PetscSegBufferGet(segrank,1,&recvrank);
124:       PetscSegBufferGet(segdata,count,&buf);
125:       *recvrank = status.MPI_SOURCE;
126:       MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);
127:       nrecvs++;
128:     }
129:     if (!barrier_started) {
130:       PetscMPIInt sent,nsends;
131:       PetscMPIIntCast(nto,&nsends);
132:       MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);
133:       if (sent) {
134: #if defined(PETSC_HAVE_MPI_IBARRIER)
135:         MPI_Ibarrier(comm,&barrier);
136: #elif defined(PETSC_HAVE_MPIX_IBARRIER)
137:         MPIX_Ibarrier(comm,&barrier);
138: #endif
139:         barrier_started = PETSC_TRUE;
140:         PetscFree(sendreqs);
141:       }
142:     } else {
143:       MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);
144:     }
145:   }
146:   *nfrom = nrecvs;
147:   PetscSegBufferExtractAlloc(segrank,fromranks);
148:   PetscSegBufferDestroy(&segrank);
149:   PetscSegBufferExtractAlloc(segdata,fromdata);
150:   PetscSegBufferDestroy(&segdata);
151:   PetscCommDestroy(&comm);
152:   return(0);
153: }
154: #endif
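
The loop above implements the NBX-style consensus of Hoefler, Siebert and Lumsdaine: synchronous sends (MPI_Issend) complete only once matched, so a rank enters the nonblocking barrier only after all of its messages have been received, and the barrier completes only after every rank has done so; hence no probe can miss a message. A standalone sketch of the same pattern in plain MPI-3 (hypothetical demo, independent of PETSc), where each rank sends one integer to its right neighbor and discovers its sender dynamically:

#include <mpi.h>
#include <stdio.h>

int main(int argc,char **argv)
{
  int         rank,size,to,payload,done = 0,barrier_started = 0;
  MPI_Request sendreq,barrier = MPI_REQUEST_NULL;

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);
  to      = (rank+1)%size;
  payload = rank;
  /* Synchronous send: completes only after the receiver has matched it */
  MPI_Issend(&payload,1,MPI_INT,to,0,MPI_COMM_WORLD,&sendreq);
  while (!done) {
    int        flag;
    MPI_Status status;
    MPI_Iprobe(MPI_ANY_SOURCE,0,MPI_COMM_WORLD,&flag,&status);
    if (flag) {                 /* incoming rendezvous message: receive it */
      int recvbuf;
      MPI_Recv(&recvbuf,1,MPI_INT,status.MPI_SOURCE,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
      printf("[%d] received %d from %d\n",rank,recvbuf,status.MPI_SOURCE);
    }
    if (!barrier_started) {
      int sent;
      MPI_Test(&sendreq,&sent,MPI_STATUS_IGNORE);
      if (sent) {               /* local sends matched: enter the consensus barrier */
        MPI_Ibarrier(MPI_COMM_WORLD,&barrier);
        barrier_started = 1;
      }
    } else {
      MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);
    }
  }
  MPI_Finalize();
  return 0;
}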

156: static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
157: {
159:   PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i;
160:   MPI_Aint       lb,unitbytes;
161:   char           *tdata,*fdata;
162:   MPI_Request    *reqs,*sendreqs;
163:   MPI_Status     *statuses;

166:   MPI_Comm_size(comm,&size);
167:   PetscCalloc1(size,&iflags);
168:   for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
169:   PetscGatherNumberOfMessages(comm,iflags,NULL,&nrecvs);
170:   PetscFree(iflags);

172:   PetscCommDuplicate(comm,&comm,&tag);
173:   MPI_Type_get_extent(dtype,&lb,&unitbytes);
174:   if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
175:   PetscMalloc(nrecvs*count*unitbytes,&fdata);
176:   tdata    = (char*)todata;
177:   PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);
178:   sendreqs = reqs + nrecvs;
179:   for (i=0; i<nrecvs; i++) {
180:     MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);
181:   }
182:   for (i=0; i<nto; i++) {
183:     MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);
184:   }
185:   MPI_Waitall(nto+nrecvs,reqs,statuses);
186:   PetscMalloc1(nrecvs,&franks);
187:   for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
188:   PetscFree2(reqs,statuses);
189:   PetscCommDestroy(&comm);

191:   *nfrom            = nrecvs;
192:   *fromranks        = franks;
193:   *(void**)fromdata = fdata;
194:   return(0);
195: }

197: #if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
198: static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
199: {
201:   PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i;
202:   MPI_Aint       lb,unitbytes;
203:   char           *tdata,*fdata;
204:   MPI_Request    *reqs,*sendreqs;
205:   MPI_Status     *statuses;

208:   MPI_Comm_size(comm,&size);
209:   PetscMalloc1(size,&iflags);
210:   PetscMemzero(iflags,size*sizeof(*iflags));
211:   for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
212:   MPI_Reduce_scatter_block(iflags,&nrecvs,1,MPI_INT,MPI_SUM,comm);
213:   PetscFree(iflags);

215:   PetscCommDuplicate(comm,&comm,&tag);
216:   MPI_Type_get_extent(dtype,&lb,&unitbytes);
217:   if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
218:   PetscMalloc(nrecvs*count*unitbytes,&fdata);
219:   tdata    = (char*)todata;
220:   PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);
221:   sendreqs = reqs + nrecvs;
222:   for (i=0; i<nrecvs; i++) {
223:     MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);
224:   }
225:   for (i=0; i<nto; i++) {
226:     MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);
227:   }
228:   MPI_Waitall(nto+nrecvs,reqs,statuses);
229:   PetscMalloc1(nrecvs,&franks);
230:   for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
231:   PetscFree2(reqs,statuses);
232:   PetscCommDestroy(&comm);

234:   *nfrom            = nrecvs;
235:   *fromranks        = franks;
236:   *(void**)fromdata = fdata;
237:   return(0);
238: }
239: #endif
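
The counting step above can be shown in isolation: every rank contributes a 0/1 flag per destination, and MPI_Reduce_scatter_block() sums column r of that flag matrix and delivers the sum to rank r, i.e. the number of messages rank r will receive. A standalone plain-MPI sketch (hypothetical demo):

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc,char **argv)
{
  int rank,size,nrecvs,*iflags;

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);
  iflags = (int*)calloc(size,sizeof(int));
  iflags[(rank+1)%size] = 1;    /* this rank will send one message to its right neighbor */
  MPI_Reduce_scatter_block(iflags,&nrecvs,1,MPI_INT,MPI_SUM,MPI_COMM_WORLD);
  printf("[%d] expects %d incoming message(s)\n",rank,nrecvs);  /* 1 here on every rank */
  free(iflags);
  MPI_Finalize();
  return 0;
}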

241: /*@C
242:    PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)

244:    Collective on MPI_Comm

246:    Input Arguments:
247: +  comm - communicator
248: .  count - number of entries to send/receive (must match on all ranks)
249: .  dtype - datatype to send/receive from each rank (must match on all ranks)
250: .  nto - number of ranks to send data to
251: .  toranks - ranks to send to (array of length nto)
252: -  todata - data to send to each rank (packed)

254:    Output Arguments:
 255: +  nfrom - number of ranks from which messages will be received
 256: .  fromranks - ranks from which messages will be received (length nfrom; caller should PetscFree())
 257: -  fromdata - packed data received from each of those ranks, count entries of type dtype per rank (length nfrom; caller responsible for PetscFree())

259:    Level: developer

261:    Options Database Keys:
262: .  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication

264:    Notes:
265:    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
266:    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other constant-size data.

268:    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

270:    References:
 271: .  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
 272:    (The MPI_Ibarrier-based implementation uses the algorithm from this reference.)

274: .seealso: PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
275: @*/
276: PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
277: {
278:   PetscErrorCode         ierr;
279:   PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;

282:   PetscSysInitializePackage();
283:   PetscLogEventBegin(PETSC_BuildTwoSided,0,0,0,0);
284:   PetscCommBuildTwoSidedGetType(comm,&buildtype);
285:   switch (buildtype) {
286:   case PETSC_BUILDTWOSIDED_IBARRIER:
287: #if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
288:     PetscCommBuildTwoSided_Ibarrier(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);
289: #else
290:     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
291: #endif
292:     break;
293:   case PETSC_BUILDTWOSIDED_ALLREDUCE:
294:     PetscCommBuildTwoSided_Allreduce(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);
295:     break;
296:   case PETSC_BUILDTWOSIDED_REDSCATTER:
297: #if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
298:     PetscCommBuildTwoSided_RedScatter(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);
299: #else
300:     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
301: #endif
302:     break;
303:   default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
304:   }
305:   PetscLogEventEnd(PETSC_BuildTwoSided,0,0,0,0);
306:   return(0);
307: }
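
A typical use is the rendezvous step of a sparse exchange: each rank knows only which ranks it will send to and how much, and learns its receive pattern from the call. A minimal sketch (the helper name is hypothetical, not part of mpits.c):

#include <petscsys.h>

static PetscErrorCode ExchangeLengths(MPI_Comm comm,PetscMPIInt nto,const PetscMPIInt *toranks,const PetscMPIInt *tolengths)
{
  PetscErrorCode ierr;
  PetscMPIInt    nfrom,*fromranks,*fromlengths,i;

  PetscFunctionBegin;
  /* Send one MPI_INT (the future message length) to each rank in toranks[] */
  ierr = PetscCommBuildTwoSided(comm,1,MPI_INT,nto,toranks,tolengths,&nfrom,&fromranks,&fromlengths);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++) {
    ierr = PetscInfo2(NULL,"rank %d will send %d entries to me\n",fromranks[i],fromlengths[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  ierr = PetscFree(fromlengths);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}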

309: static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
310:                                                            PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
311:                                                            PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
312:                                                            PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
313: {
315:   PetscMPIInt i,*tag;
316:   MPI_Aint    lb,unitbytes;
317:   MPI_Request *sendreq,*recvreq;

320:   PetscMalloc1(ntags,&tag);
321:   if (ntags > 0) {
322:     PetscCommDuplicate(comm,&comm,&tag[0]);
323:   }
324:   for (i=1; i<ntags; i++) {
325:     PetscCommGetNewTag(comm,&tag[i]);
326:   }

328:   /* Perform complete initial rendezvous */
329:   PetscCommBuildTwoSided(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);

331:   PetscMalloc1(nto*ntags,&sendreq);
332:   PetscMalloc1(*nfrom*ntags,&recvreq);

334:   MPI_Type_get_extent(dtype,&lb,&unitbytes);
335:   if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
336:   for (i=0; i<nto; i++) {
337:     PetscMPIInt k;
338:     for (k=0; k<ntags; k++) sendreq[i*ntags+k] = MPI_REQUEST_NULL;
339:     (*send)(comm,tag,i,toranks[i],((char*)todata)+count*unitbytes*i,sendreq+i*ntags,ctx);
340:   }
341:   for (i=0; i<*nfrom; i++) {
342:     void *header = (*(char**)fromdata) + count*unitbytes*i;
343:     PetscMPIInt k;
344:     for (k=0; k<ntags; k++) recvreq[i*ntags+k] = MPI_REQUEST_NULL;
345:     (*recv)(comm,tag,(*fromranks)[i],header,recvreq+i*ntags,ctx);
346:   }
347:   PetscFree(tag);
348:   PetscCommDestroy(&comm);
349:   *toreqs = sendreq;
350:   *fromreqs = recvreq;
351:   return(0);
352: }

354: #if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)

356: static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
357:                                                           PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
358:                                                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
359:                                                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
360: {
362:   PetscMPIInt    nrecvs,tag,*tags,done,i;
363:   MPI_Aint       lb,unitbytes;
364:   char           *tdata;
365:   MPI_Request    *sendreqs,*usendreqs,*req,barrier;
366:   PetscSegBuffer segrank,segdata,segreq;
367:   PetscBool      barrier_started;

370:   PetscCommDuplicate(comm,&comm,&tag);
371:   PetscMalloc1(ntags,&tags);
372:   for (i=0; i<ntags; i++) {
373:     PetscCommGetNewTag(comm,&tags[i]);
374:   }
375:   MPI_Type_get_extent(dtype,&lb,&unitbytes);
376:   if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
377:   tdata = (char*)todata;
378:   PetscMalloc1(nto,&sendreqs);
379:   PetscMalloc1(nto*ntags,&usendreqs);
380:   /* Post synchronous sends */
381:   for (i=0; i<nto; i++) {
382:     MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);
383:   }
384:   /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
385:    * synchronous messages above. */
386:   for (i=0; i<nto; i++) {
387:     PetscMPIInt k;
388:     for (k=0; k<ntags; k++) usendreqs[i*ntags+k] = MPI_REQUEST_NULL;
389:     (*send)(comm,tags,i,toranks[i],tdata+count*unitbytes*i,usendreqs+i*ntags,ctx);
390:   }

392:   PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);
393:   PetscSegBufferCreate(unitbytes,4*count,&segdata);
394:   PetscSegBufferCreate(sizeof(MPI_Request),4,&segreq);

396:   nrecvs  = 0;
397:   barrier = MPI_REQUEST_NULL;
398:   /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
399:    * but we need to work around it. */
400:   barrier_started = PETSC_FALSE;
401:   for (done=0; !done; ) {
402:     PetscMPIInt flag;
403:     MPI_Status  status;
404:     MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);
405:     if (flag) {                 /* incoming message */
406:       PetscMPIInt *recvrank,k;
407:       void        *buf;
408:       PetscSegBufferGet(segrank,1,&recvrank);
409:       PetscSegBufferGet(segdata,count,&buf);
410:       *recvrank = status.MPI_SOURCE;
411:       MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);
412:       PetscSegBufferGet(segreq,ntags,&req);
413:       for (k=0; k<ntags; k++) req[k] = MPI_REQUEST_NULL;
414:       (*recv)(comm,tags,status.MPI_SOURCE,buf,req,ctx);
415:       nrecvs++;
416:     }
417:     if (!barrier_started) {
418:       PetscMPIInt sent,nsends;
419:       PetscMPIIntCast(nto,&nsends);
420:       MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);
421:       if (sent) {
422: #if defined(PETSC_HAVE_MPI_IBARRIER)
423:         MPI_Ibarrier(comm,&barrier);
424: #elif defined(PETSC_HAVE_MPIX_IBARRIER)
425:         MPIX_Ibarrier(comm,&barrier);
426: #endif
427:         barrier_started = PETSC_TRUE;
428:       }
429:     } else {
430:       MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);
431:     }
432:   }
433:   *nfrom = nrecvs;
434:   PetscSegBufferExtractAlloc(segrank,fromranks);
435:   PetscSegBufferDestroy(&segrank);
436:   PetscSegBufferExtractAlloc(segdata,fromdata);
437:   PetscSegBufferDestroy(&segdata);
438:   *toreqs = usendreqs;
439:   PetscSegBufferExtractAlloc(segreq,fromreqs);
440:   PetscSegBufferDestroy(&segreq);
441:   PetscFree(sendreqs);
442:   PetscFree(tags);
443:   PetscCommDestroy(&comm);
444:   return(0);
445: }
446: #endif

448: /*@C
449:    PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous

451:    Collective on MPI_Comm

453:    Input Arguments:
454: +  comm - communicator
455: .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
456: .  dtype - datatype to send/receive from each rank (must match on all ranks)
457: .  nto - number of ranks to send data to
458: .  toranks - ranks to send to (array of length nto)
459: .  todata - data to send to each rank (packed)
460: .  ntags - number of tags needed by send/recv callbacks
461: .  send - callback invoked on sending process when ready to send primary payload
462: .  recv - callback invoked on receiving process after delivery of rendezvous message
463: -  ctx - context for callbacks

465:    Output Arguments:
 466: +  nfrom - number of ranks from which messages will be received
 467: .  fromranks - ranks from which messages will be received (length nfrom; caller should PetscFree())
 468: -  fromdata - packed data received from each of those ranks, count entries of type dtype per rank (length nfrom; caller responsible for PetscFree())

470:    Level: developer

472:    Notes:
473:    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
474:    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

476:    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

478:    References:
 479: .  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
 480:    (The MPI_Ibarrier-based implementation uses the algorithm from this reference.)

482: .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedFReq(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
483: @*/
484: PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,
485:                                        PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
486:                                        PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
487: {
489:   MPI_Request    *toreqs,*fromreqs;

492:   PetscCommBuildTwoSidedFReq(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,&toreqs,&fromreqs,send,recv,ctx);
493:   MPI_Waitall(nto*ntags,toreqs,MPI_STATUSES_IGNORE);
494:   MPI_Waitall(*nfrom*ntags,fromreqs,MPI_STATUSES_IGNORE);
495:   PetscFree(toreqs);
496:   PetscFree(fromreqs);
497:   return(0);
498: }
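
A sketch of callbacks for PetscCommBuildTwoSidedF() (hypothetical, not part of mpits.c): the rendezvous message carries the length of a variable-sized integer payload, and each callback posts a single request with tag tags[0], so ntags = 1. The Ctx layout, callback names, and the size-indexed recvbufs table are illustrative simplifications; a memory-scalable code would use a different lookup.

#include <petscsys.h>

typedef struct {
  PetscMPIInt **sendbufs;   /* sendbufs[i]: payload for toranks[i], tolengths[i] entries */
  PetscMPIInt  *tolengths;  /* rendezvous data passed as todata, indexed like toranks */
  PetscMPIInt **recvbufs;   /* recvbufs[source]: payload received from that rank */
} Ctx;

/* Called once per destination; i indexes toranks[], header points at todata entry i */
static PetscErrorCode SendPayload(MPI_Comm comm,const PetscMPIInt tags[],PetscMPIInt i,PetscMPIInt rank,void *header,MPI_Request req[],void *vctx)
{
  Ctx            *ctx = (Ctx*)vctx;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MPI_Isend(ctx->sendbufs[i],ctx->tolengths[i],MPI_INT,rank,tags[0],comm,&req[0]);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Called once per discovered sender; header holds the length that rank announced */
static PetscErrorCode RecvPayload(MPI_Comm comm,const PetscMPIInt tags[],PetscMPIInt rank,void *header,MPI_Request req[],void *vctx)
{
  Ctx            *ctx = (Ctx*)vctx;
  PetscMPIInt     len = *(PetscMPIInt*)header;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = PetscMalloc1(len,&ctx->recvbufs[rank]);CHKERRQ(ierr);
  ierr = MPI_Irecv(ctx->recvbufs[rank],len,MPI_INT,rank,tags[0],comm,&req[0]);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

With these callbacks, a call such as PetscCommBuildTwoSidedF(comm,1,MPI_INT,nto,toranks,tolengths,&nfrom,&fromranks,&fromlengths,1,SendPayload,RecvPayload,&ctx) returns only after all payload requests have completed, so the recvbufs entries for the ranks in fromranks[] are ready to use.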

500: /*@C
501:    PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests

503:    Collective on MPI_Comm

505:    Input Arguments:
506: +  comm - communicator
507: .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
508: .  dtype - datatype to send/receive from each rank (must match on all ranks)
509: .  nto - number of ranks to send data to
510: .  toranks - ranks to send to (array of length nto)
511: .  todata - data to send to each rank (packed)
512: .  ntags - number of tags needed by send/recv callbacks
513: .  send - callback invoked on sending process when ready to send primary payload
514: .  recv - callback invoked on receiving process after delivery of rendezvous message
515: -  ctx - context for callbacks

517:    Output Arguments:
 518: +  nfrom - number of ranks from which messages will be received
 519: .  fromranks - ranks from which messages will be received (length nfrom; caller should PetscFree())
 520: .  fromdata - packed data received from each of those ranks, count entries of type dtype per rank (length nfrom; caller responsible for PetscFree())
521: .  toreqs - array of nto*ntags sender requests (caller must wait on these, then PetscFree())
522: -  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then PetscFree())

524:    Level: developer

526:    Notes:
527:    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
528:    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

530:    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

532:    References:
 533: .  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
 534:    (The MPI_Ibarrier-based implementation uses the algorithm from this reference.)

536: .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedF(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
537: @*/
538: PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
539:                                           PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
540:                                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
541:                                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
542: {
543:   PetscErrorCode         ierr,(*f)(MPI_Comm,PetscMPIInt,MPI_Datatype,PetscMPIInt,const PetscMPIInt[],const void*,
544:                                    PetscMPIInt*,PetscMPIInt**,void*,PetscMPIInt,MPI_Request**,MPI_Request**,
545:                                    PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
546:                                    PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx);
547:   PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
548:   PetscMPIInt i,size;

551:   PetscSysInitializePackage();
552:   MPI_Comm_size(comm,&size);
553:   for (i=0; i<nto; i++) {
554:     if (toranks[i] < 0 || size <= toranks[i]) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"toranks[%d] %d not in comm size %d",i,toranks[i],size);
555:   }
556:   PetscLogEventBegin(PETSC_BuildTwoSidedF,0,0,0,0);
557:   PetscCommBuildTwoSidedGetType(comm,&buildtype);
558:   switch (buildtype) {
559:   case PETSC_BUILDTWOSIDED_IBARRIER:
560: #if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
561:     f = PetscCommBuildTwoSidedFReq_Ibarrier;
562: #else
563:     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
564: #endif
565:     break;
566:   case PETSC_BUILDTWOSIDED_ALLREDUCE:
567:   case PETSC_BUILDTWOSIDED_REDSCATTER:
568:     f = PetscCommBuildTwoSidedFReq_Reference;
569:     break;
570:   default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
571:   }
572:   (*f)(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,toreqs,fromreqs,send,recv,ctx);
573:   PetscLogEventEnd(PETSC_BuildTwoSidedF,0,0,0,0);
574:   return(0);
575: }
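
A sketch of the request-returning variant (hypothetical, reusing the SendPayload/RecvPayload callbacks and Ctx sketched after PetscCommBuildTwoSidedF() above): the caller can overlap independent work with the payload transfers before waiting on and freeing the returned requests.

static PetscErrorCode ExchangePayloadsOverlapped(MPI_Comm comm,PetscMPIInt nto,const PetscMPIInt *toranks,const PetscMPIInt *tolengths,Ctx *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt    nfrom,*fromranks,*fromlengths;
  MPI_Request    *toreqs,*fromreqs;

  PetscFunctionBegin;
  ierr = PetscCommBuildTwoSidedFReq(comm,1,MPI_INT,nto,toranks,tolengths,&nfrom,&fromranks,&fromlengths,1,&toreqs,&fromreqs,SendPayload,RecvPayload,ctx);CHKERRQ(ierr);
  /* ... do work that does not depend on the incoming payloads ... */
  ierr = MPI_Waitall(nto,toreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);     /* nto*ntags requests with ntags = 1 */
  ierr = MPI_Waitall(nfrom,fromreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr); /* nfrom*ntags requests with ntags = 1 */
  ierr = PetscFree(toreqs);CHKERRQ(ierr);
  ierr = PetscFree(fromreqs);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  ierr = PetscFree(fromlengths);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}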