Actual source code: vscat.c

  1: #include "petscsf.h"
  2: #include "petscsystypes.h"
  3: #include "petscvec.h"
  4: #include <petsc/private/sfimpl.h>
  5: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
  6: #include <../src/vec/is/sf/impls/basic/sfpack.h>
  7: #include <petsc/private/vecimpl.h>

  9: typedef enum {IS_INVALID, IS_GENERAL, IS_BLOCK, IS_STRIDE} ISTypeID;

 11: PETSC_STATIC_INLINE PetscErrorCode ISGetTypeID_Private(IS is,ISTypeID *id)
 12: {
 14:   PetscBool      same;

 17:   *id  = IS_INVALID;
 18:   PetscObjectTypeCompare((PetscObject)is,ISGENERAL,&same);
 19:   if (same) {*id = IS_GENERAL; goto functionend;}
 20:   PetscObjectTypeCompare((PetscObject)is,ISBLOCK,&same);
 21:   if (same) {*id = IS_BLOCK; goto functionend;}
 22:   PetscObjectTypeCompare((PetscObject)is,ISSTRIDE,&same);
 23:   if (same) {*id = IS_STRIDE; goto functionend;}
 24: functionend:
 25:   return(0);
 26: }

 28: static PetscErrorCode VecScatterBegin_Internal(VecScatter sf,Vec x,Vec y,InsertMode addv,ScatterMode mode)
 29: {
 31:   PetscSF        wsf=NULL; /* either sf or its local part */
 32:   MPI_Op         mop=MPI_OP_NULL;
 33:   PetscMPIInt    size;
 34:   PetscMemType   xmtype=PETSC_MEMTYPE_HOST,ymtype=PETSC_MEMTYPE_HOST;

 37:   if (x != y) {VecLockReadPush(x);}
 38:   if (sf->use_gpu_aware_mpi || sf->vscat.packongpu) {
 39:     VecGetArrayReadAndMemType(x,&sf->vscat.xdata,&xmtype);
 40:   } else {
 41:     VecGetArrayRead(x,&sf->vscat.xdata);
 42:   }

 44:   if (x != y) {
 45:     if (sf->use_gpu_aware_mpi || sf->vscat.packongpu) {VecGetArrayAndMemType(y,&sf->vscat.ydata,&ymtype);}
 46:     else {VecGetArray(y,&sf->vscat.ydata);}
 47:   } else {
 48:     sf->vscat.ydata = (PetscScalar *)sf->vscat.xdata;
 49:     ymtype          = xmtype;
 50:   }
 51:   VecLockWriteSet_Private(y,PETSC_TRUE);

 53:   /* SCATTER_LOCAL indicates ignoring inter-process communication */
 54:   MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);
 55:   if ((mode & SCATTER_LOCAL) && size > 1) { /* Lazy creation of sf->vscat.lsf since SCATTER_LOCAL is uncommon */
 56:     if (!sf->vscat.lsf) {PetscSFCreateLocalSF_Private(sf,&sf->vscat.lsf);}
 57:     wsf = sf->vscat.lsf;
 58:   } else {
 59:     wsf = sf;
 60:   }

 62:   /* Note xdata/ydata is always recorded on sf (not lsf) above */
 63:   if (addv == INSERT_VALUES)   mop = MPI_REPLACE;
 64:   else if (addv == ADD_VALUES) mop = MPIU_SUM; /* Petsc defines its own MPI datatype and SUM operation for __float128 etc. */
 65:   else if (addv == MAX_VALUES) mop = MPIU_MAX;
 66:   else if (addv == MIN_VALUES) mop = MPIU_MIN;
 67:   else SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"Unsupported InsertMode %D in VecScatterBegin/End",addv);

 69:   if (mode & SCATTER_REVERSE) { /* REVERSE indicates a leaf-to-root scatter. Note that x and y are swapped in the input */
 70:     PetscSFReduceWithMemTypeBegin(wsf,sf->vscat.unit,xmtype,sf->vscat.xdata,ymtype,sf->vscat.ydata,mop);
 71:   } else { /* FORWARD indicates x to y scatter, where x is root and y is leaf */
 72:     PetscSFBcastWithMemTypeBegin(wsf,sf->vscat.unit,xmtype,sf->vscat.xdata,ymtype,sf->vscat.ydata,mop);
 73:   }
 74:   return(0);
 75: }

 77: static PetscErrorCode VecScatterEnd_Internal(VecScatter sf,Vec x,Vec y,InsertMode addv,ScatterMode mode)
 78: {
 80:   PetscSF        wsf=NULL;
 81:   MPI_Op         mop=MPI_OP_NULL;
 82:   PetscMPIInt    size;

 85:   /* SCATTER_LOCAL indicates ignoring inter-process communication */
 86:   MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);
 87:   wsf  = ((mode & SCATTER_LOCAL) && size > 1) ? sf->vscat.lsf : sf;

 89:   if (addv == INSERT_VALUES)   mop = MPI_REPLACE;
 90:   else if (addv == ADD_VALUES) mop = MPIU_SUM;
 91:   else if (addv == MAX_VALUES) mop = MPIU_MAX;
 92:   else if (addv == MIN_VALUES) mop = MPIU_MIN;
 93:   else SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"Unsupported InsertMode %D in VecScatterBegin/End",addv);

 95:   if (mode & SCATTER_REVERSE) { /* reverse scatter sends leaves to roots. Note that x and y are swapped in input */
 96:     PetscSFReduceEnd(wsf,sf->vscat.unit,sf->vscat.xdata,sf->vscat.ydata,mop);
 97:   } else { /* forward scatter sends roots to leaves, i.e., x to y */
 98:     PetscSFBcastEnd(wsf,sf->vscat.unit,sf->vscat.xdata,sf->vscat.ydata,mop);
 99:   }

101:   if (x != y) {
102:     if (sf->use_gpu_aware_mpi || sf->vscat.packongpu) {VecRestoreArrayReadAndMemType(x,&sf->vscat.xdata);}
103:     else {VecRestoreArrayRead(x,&sf->vscat.xdata);}
104:     VecLockReadPop(x);
105:   }

107:   if (sf->use_gpu_aware_mpi || sf->vscat.packongpu) {VecRestoreArrayAndMemType(y,&sf->vscat.ydata);}
108:   else {VecRestoreArray(y,&sf->vscat.ydata);}
109:   VecLockWriteSet_Private(y,PETSC_FALSE);

111:   return(0);
112: }

114: /* VecScatterRemap provides a lightweight way to slightly modify a VecScatter. Suppose the input sf scatters
115:    x[i] to y[j]; tomap gives a plan to change vscat so that it scatters x[tomap[i]] to y[j]. Note that in SF,
116:    x is the roots. That means we need to change incoming data such as bas->irootloc[].
117:  */
118: static PetscErrorCode VecScatterRemap_Internal(VecScatter sf,const PetscInt *tomap,const PetscInt *frommap)
119: {
120:   PetscInt       i,bs = sf->vscat.bs;
121:   PetscMPIInt    size;
122:   PetscBool      ident = PETSC_TRUE,isbasic,isneighbor;
123:   PetscSFType    type;
124:   PetscSF_Basic  *bas = NULL;

128:   /* check if it is an identity map. If it is, do nothing */
129:   if (tomap) {
130:     for (i=0; i<sf->nroots*bs; i++) {if (i != tomap[i]) {ident = PETSC_FALSE; break; } }
131:     if (ident) return(0);
132:   }
133:   if (frommap) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unable to remap the FROM in scatters yet");
134:   if (!tomap) return(0);

136:   MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);

138:   /* Since the indices changed, we must also update the local SF. But we do not do it since
139:      lsf is rarely used. We just destroy lsf and rebuild it on demand from updated sf.
140:   */
141:   if (sf->vscat.lsf) {PetscSFDestroy(&sf->vscat.lsf);}

143:   PetscSFGetType(sf,&type);
144:   PetscObjectTypeCompare((PetscObject)sf,PETSCSFBASIC,&isbasic);
145:   PetscObjectTypeCompare((PetscObject)sf,PETSCSFNEIGHBOR,&isneighbor);
146:   if (!isbasic && !isneighbor) SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"VecScatterRemap on SF type %s is not supported",type);

148:   PetscSFSetUp(sf); /* to build sf->irootloc in case SetUp has not been called yet */

150:   /* Root indices are going to be remapped. This is tricky for SF. Root indices are used in sf->rremote,
151:     sf->remote and bas->irootloc. The latter one is cheap to remap, but the former two are not.
152:     To remap them, we have to do a bcast from roots to leaves, to let leaves know their updated roots.
153:     Since VecScatterRemap is supposed to be a cheap routine to adapt a vecscatter by only changing where
154:     x[] data is taken, we do not remap sf->rremote, sf->remote. The consequence is that operations
155:     accessing them (such as PetscSFCompose) may get stale info. Considering VecScatter does not need
156:     such complicated SF operations, we do not remap sf->rremote and sf->remote; instead we destroy them
157:     so that code accessing them (if any) will crash (instead of getting silent errors). Note that BcastAndOp/Reduce,
158:     which are used by VecScatter and only rely on bas->irootloc, are updated and correct.
159:   */
160:   sf->remote = NULL;
161:   PetscFree(sf->remote_alloc);
162:   /* Not easy to free sf->rremote since it was allocated with PetscMalloc4(), so just give it crazy values */
163:   for (i=0; i<sf->roffset[sf->nranks]; i++) sf->rremote[i] = PETSC_MIN_INT;

165:   /* Indices in tomap[] are for each individual vector entry. But indices in sf are for each
166:      block in the vector. So before the remapping, we have to expand indices in sf by bs, and
167:      after the remapping, we have to shrink them back.
168:    */
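  /* A small worked example of the remapping below (hypothetical values, for illustration only):
     with bs=2, a block index irootloc[i]=3 covers vector entries {6,7}; if tomap[6]=10, the
     remapped block index becomes tomap[3*2]/2 = 10/2 = 5. */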
169:   bas = (PetscSF_Basic*)sf->data;
170:   for (i=0; i<bas->ioffset[bas->niranks]; i++) bas->irootloc[i] = tomap[bas->irootloc[i]*bs]/bs;
171: #if defined(PETSC_HAVE_DEVICE)
172:   /* Free the irootloc copy on device. We allocate a new copy and get the updated value on demand. See PetscSFLinkGetRootPackOptAndIndices() */
173:   for (i=0; i<2; i++) {PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,bas->irootloc_d[i]);}
174: #endif
175:   /* Destroy and then rebuild root packing optimizations since indices are changed */
176:   PetscSFResetPackFields(sf);
177:   PetscSFSetUpPackFields(sf);
178:   return(0);
179: }

181: /* Given a parallel VecScatter context, return number of procs and vector entries involved in remote (i.e., off-process) communication

183:   Input Parameters:
184: + sf   - the context (must be a parallel vecscatter)
185: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

187:   Output parameters:
188: + num_procs   - number of remote processors
189: - num_entries - number of vector entries to send or recv

191:   .seealso: VecScatterGetRemote_Private(), VecScatterGetRemoteOrdered_Private()

193:   Notes:
194:   Sometimes PETSc internally needs to use the matrix-vector-multiply vecscatter context for other purposes. The client code
195:   in that case usually only uses plain MPI_Send/Recv. This group of subroutines provides the information needed for such uses.
196:  */
197: PetscErrorCode VecScatterGetRemoteCount_Private(VecScatter sf,PetscBool send,PetscInt *num_procs,PetscInt *num_entries)
198: {
199:   PetscErrorCode    ierr;
200:   PetscInt          nranks,remote_start;
201:   PetscMPIInt       rank;
202:   const PetscInt    *offset;
203:   const PetscMPIInt *ranks;

206:   PetscSFSetUp(sf);
207:   MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);

209:   /* This routine is mainly used for MatMult's Mvctx. In Mvctx, we scatter an MPI vector x to a sequential vector lvec.
210:      Remember x is roots and lvec is leaves. 'send' means roots to leaves communication. If 'send' is true, we need to
211:      get info about which ranks this processor needs to send to. In other words, we need to call PetscSFGetLeafRanks().
212:      If send is false, we do the opposite, calling PetscSFGetRootRanks().
213:   */
214:   if (send) {PetscSFGetLeafRanks(sf,&nranks,&ranks,&offset,NULL);}
215:   else {PetscSFGetRootRanks(sf,&nranks,&ranks,&offset,NULL,NULL);}
216:   if (nranks) {
217:     remote_start = (rank == ranks[0])? 1 : 0;
218:     if (num_procs)   *num_procs   = nranks - remote_start;
219:     if (num_entries) *num_entries = offset[nranks] - offset[remote_start];
220:   } else {
221:     if (num_procs)   *num_procs   = 0;
222:     if (num_entries) *num_entries = 0;
223:   }
224:   return(0);
225: }

227: /* Given a parallel VecScatter context, return a plan that represents the remote communication.
228:    Any output parameter can be NULL.

230:   Input Parameters:
231: + sf   - the context
232: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

234:   Output parameters:
235: + n        - number of remote processors
236: . starts   - starting point in indices for each proc. ATTENTION: starts[0] is not necessarily zero.
237:              Therefore, expressions like starts[i+1]-starts[i] and indices[starts[i]+j] work as
238:              expected for a CSR structure but buf[starts[i]+j] may be out of range if buf was allocated
239:              with length starts[n]-starts[0]. One should use buf[starts[i]-starts[0]+j] instead.
240: . indices  - indices of entries to send/recv
241: . procs    - ranks of remote processors
242: - bs       - block size

244:   .seealso: VecScatterRestoreRemote_Private(), VecScatterGetRemoteOrdered_Private()
245:  */
246: PetscErrorCode VecScatterGetRemote_Private(VecScatter sf,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
247: {
248:   PetscErrorCode    ierr;
249:   PetscInt          nranks,remote_start;
250:   PetscMPIInt       rank;
251:   const PetscInt    *offset,*location;
252:   const PetscMPIInt *ranks;

255:   PetscSFSetUp(sf);
256:   MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);

258:   if (send) {PetscSFGetLeafRanks(sf,&nranks,&ranks,&offset,&location);}
259:   else {PetscSFGetRootRanks(sf,&nranks,&ranks,&offset,&location,NULL);}

261:   if (nranks) {
262:     remote_start = (rank == ranks[0])? 1 : 0;
263:     if (n)       *n       = nranks - remote_start;
264:     if (starts)  *starts  = &offset[remote_start];
265:     if (indices) *indices = location; /* not &location[offset[remote_start]]. Starts[0] may point to the middle of indices[] */
266:     if (procs)   *procs   = &ranks[remote_start];
267:   } else {
268:     if (n)       *n       = 0;
269:     if (starts)  *starts  = NULL;
270:     if (indices) *indices = NULL;
271:     if (procs)   *procs   = NULL;
272:   }

274:   if (bs) *bs = 1;
275:   return(0);
276: }
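/* A minimal usage sketch of VecScatterGetRemoteCount_Private()/VecScatterGetRemote_Private() (illustrative
   only; the MPI_Send() line is a placeholder and not code from this file). It demonstrates the
   buf[starts[i]-starts[0]+j] indexing required by the ATTENTION note above, since starts[0] need not be zero:

     PetscInt          nsend,nentries,bs,i;
     const PetscInt    *starts,*indices;
     const PetscMPIInt *procs;
     PetscScalar       *buf;

     VecScatterGetRemoteCount_Private(sf,PETSC_TRUE,&nsend,&nentries);
     VecScatterGetRemote_Private(sf,PETSC_TRUE,&nsend,&starts,&indices,&procs,&bs);
     PetscMalloc1(nentries,&buf);                      // nentries == starts[nsend]-starts[0]
     for (i=0; i<nsend; i++) {
       // pack entries indices[starts[i]..starts[i+1]-1] into buf+starts[i]-starts[0], then e.g.
       // MPI_Send(buf+starts[i]-starts[0],starts[i+1]-starts[i],MPIU_SCALAR,procs[i],tag,comm);
     }
     PetscFree(buf);
     VecScatterRestoreRemote_Private(sf,PETSC_TRUE,&nsend,&starts,&indices,&procs,&bs);
*/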

278: /* Given a parallel VecScatter context, return a plan that represents the remote communication. Ranks of remote
279:    processors returned in procs must be sorted in ascending order. Any output parameter can be NULL.

281:   Input Parameters:
282: + sf   - the context
283: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

285:   Output parameters:
286: + n        - number of remote processors
287: . starts   - starting point in indices for each proc. ATTENTION: starts[0] is not necessarily zero.
288:              Therefore, expressions like starts[i+1]-starts[i] and indices[starts[i]+j] work as
289:              expected for a CSR structure but buf[starts[i]+j] may be out of range if buf was allocated
290:              with length starts[n]-starts[0]. One should use buf[starts[i]-starts[0]+j] instead.
291: . indices  - indices of entries to send/recv
292: . procs    - ranks of remote processors
293: - bs       - block size

295:   .seealso: VecScatterRestoreRemoteOrdered_Private(), VecScatterGetRemote_Private()

297:   Notes:
298:   Output parameters like starts, indices must also be adapted according to the sorted ranks.
299:  */
300: PetscErrorCode VecScatterGetRemoteOrdered_Private(VecScatter sf,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
301: {

305:   VecScatterGetRemote_Private(sf,send,n,starts,indices,procs,bs);
306:   if (PetscUnlikelyDebug(n && procs)) {
307:     PetscInt i;
308:     /* loop from back to front so that the case *n = 0 is also handled */
309:     for (i=*n-1; i>0; i--) { if ((*procs)[i-1] > (*procs)[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"procs[] are not ordered"); }
310:   }
311:   return(0);
312: }

314: /* Given a parallel VecScatter context, restore the plan returned by VecScatterGetRemote_Private. This gives a chance for
315:    an implementation to free memory allocated in the VecScatterGetRemote_Private call.

317:   Input Parameters:
318: + sf   - the context
319: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

321:   Output parameters:
322: + n        - number of remote processors
323: . starts   - starting point in indices for each proc
324: . indices  - indices of entries to send/recv
325: . procs    - ranks of remote processors
326: - bs       - block size

328:   .seealso: VecScatterGetRemote_Private()
329:  */
330: PetscErrorCode VecScatterRestoreRemote_Private(VecScatter sf,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
331: {
333:   if (starts)   *starts  = NULL;
334:   if (indices)  *indices = NULL;
335:   if (procs)    *procs   = NULL;
336:   return(0);
337: }

339: /* Given a parallel VecScatter context, restore the plan returned by VecScatterGetRemoteOrdered_Private. This gives a chance for
340:    an implementation to free memory allocated in the VecScatterGetRemoteOrdered_Private call.

342:   Input Parameters:
343: + sf   - the context
344: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

346:   Output parameters:
347: + n        - number of remote processors
348: . starts   - starting point in indices for each proc
349: . indices  - indices of entries to send/recv
350: . procs    - ranks of remote processors
351: - bs       - block size

353:   .seealso: VecScatterGetRemoteOrdered_Private()
354:  */
355: PetscErrorCode VecScatterRestoreRemoteOrdered_Private(VecScatter sf,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
356: {
359:   VecScatterRestoreRemote_Private(sf,send,n,starts,indices,procs,bs);
360:   return(0);
361: }

363: /*@
364:    VecScatterSetUp - Sets up the VecScatter to be able to actually scatter information between vectors

366:    Collective on VecScatter

368:    Input Parameter:
369: .  sf - the scatter context

371:    Level: intermediate

373: .seealso: VecScatterCreate(), VecScatterCopy()
374: @*/
375: PetscErrorCode VecScatterSetUp(VecScatter sf)
376: {
379:   PetscSFSetUp(sf);
380:   return(0);
381: }

383: /*@C
384:   VecScatterSetType - Builds a vector scatter, for a particular vector scatter implementation.

386:   Collective on VecScatter

388:   Input Parameters:
389: + sf - The VecScatter (SF) object
390: - type - The name of the vector scatter type

392:   Options Database Key:
393: . -sf_type <type> - Sets the VecScatter (SF) type

395:   Notes:
396:   Use VecScatterDuplicate() to form additional vector scatters of the same type as an existing vector scatter.

398:   Level: intermediate

400: .seealso: VecScatterGetType(), VecScatterCreate()
401: @*/
402: PetscErrorCode VecScatterSetType(VecScatter sf, VecScatterType type)
403: {

407:   PetscSFSetType(sf,type);
408:   return(0);
409: }
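/* A brief usage sketch (assumptions: 'sf' is an existing VecScatter, and we pick one of the two SF
   types referenced elsewhere in this file, PETSCSFBASIC or PETSCSFNEIGHBOR; alternatively the type can
   be selected at run time with -sf_type <type> as documented above):

     VecScatterSetType(sf,PETSCSFBASIC);
     VecScatterSetUp(sf);
*/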

411: /*@C
412:   VecScatterGetType - Gets the vector scatter type name (as a string) from the VecScatter.

414:   Not Collective

416:   Input Parameter:
417: . sf  - The vector scatter (SF)

419:   Output Parameter:
420: . type - The vector scatter type name

422:   Level: intermediate

424: .seealso: VecScatterSetType(), VecScatterCreate()
425: @*/
426: PetscErrorCode VecScatterGetType(VecScatter sf, VecScatterType *type)
427: {
430:   PetscSFGetType(sf,type);
431:   return(0);
432: }

434: /*@C
435:   VecScatterRegister -  Adds a new vector scatter component implementation

437:   Not Collective

439:   Input Parameters:
440: + name        - The name of a new user-defined creation routine
441: - create_func - The creation routine itself

443:   Level: advanced

445: .seealso: VecRegister()
446: @*/
447: PetscErrorCode VecScatterRegister(const char sname[], PetscErrorCode (*function)(VecScatter))
448: {
451:   PetscSFRegister(sname,function);
452:   return(0);
453: }

455: /* ------------------------------------------------------------------*/
456: /*@
457:    VecScatterGetMerged - Returns true if the scatter is completed in the VecScatterBegin()
458:       and the VecScatterEnd() does nothing

460:    Not Collective

462:    Input Parameter:
463: .   sf - scatter context created with VecScatterCreate()

465:    Output Parameter:
466: .   flg - PETSC_TRUE if the VecScatterBegin/End() are all done during the VecScatterBegin()

468:    Level: developer

470: .seealso: VecScatterCreate(), VecScatterEnd(), VecScatterBegin()
471: @*/
472: PetscErrorCode  VecScatterGetMerged(VecScatter sf,PetscBool *flg)
473: {
476:   if (flg) *flg = sf->vscat.beginandendtogether;
477:   return(0);
478: }
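/* A short sketch of how a caller might use the merged flag to decide whether overlapping local work
   with the scatter is worthwhile (hypothetical; do_local_work() is a placeholder, not a PETSc routine):

     PetscBool merged;

     VecScatterGetMerged(ctx,&merged);
     VecScatterBegin(ctx,x,y,INSERT_VALUES,SCATTER_FORWARD);
     if (!merged) {
       // do_local_work();  communication is still in flight only when Begin/End are not merged
     }
     VecScatterEnd(ctx,x,y,INSERT_VALUES,SCATTER_FORWARD);
*/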
479: /*@C
480:    VecScatterDestroy - Destroys a scatter context created by VecScatterCreate()

482:    Collective on VecScatter

484:    Input Parameter:
485: .  sf - the scatter context

487:    Level: intermediate

489: .seealso: VecScatterCreate(), VecScatterCopy()
490: @*/
491: PetscErrorCode VecScatterDestroy(VecScatter *sf)
492: {

496:   PetscSFDestroy(sf);
497:   return(0);
498: }

500: /*@
501:    VecScatterCopy - Makes a copy of a scatter context.

503:    Collective on VecScatter

505:    Input Parameter:
506: .  sf - the scatter context

508:    Output Parameter:
509: .  newsf - the context copy

511:    Level: advanced

513: .seealso: VecScatterCreate(), VecScatterDestroy()
514: @*/
515: PetscErrorCode  VecScatterCopy(VecScatter sf,VecScatter *newsf)
516: {

521:   PetscSFDuplicate(sf,PETSCSF_DUPLICATE_GRAPH,newsf);
522:   PetscSFSetUp(*newsf);
523:   return(0);
524: }

526: /*@C
527:    VecScatterViewFromOptions - Views a VecScatter object based on values in the options database

529:    Collective on VecScatter

531:    Input Parameters:
532: +  sf - the scatter context
533: .  obj - Optional object that provides the prefix for the option name
534: -  name - command line option

536:    Level: intermediate
537: .seealso:  VecScatter, VecScatterView, PetscObjectViewFromOptions(), VecScatterCreate()
538: @*/
539: PetscErrorCode  VecScatterViewFromOptions(VecScatter sf,PetscObject obj,const char name[])
540: {

545:   PetscObjectViewFromOptions((PetscObject)sf,obj,name);
546:   return(0);
547: }

549: /* ------------------------------------------------------------------*/
550: /*@C
551:    VecScatterView - Views a vector scatter context.

553:    Collective on VecScatter

555:    Input Parameters:
556: +  sf - the scatter context
557: -  viewer - the viewer for displaying the context

559:    Level: intermediate

561: @*/
562: PetscErrorCode  VecScatterView(VecScatter sf,PetscViewer viewer)
563: {

567:   PetscSFView(sf,viewer);
568:   return(0);
569: }

571: /*@C
572:    VecScatterRemap - Remaps the "from" and "to" indices in a
573:    vector scatter context. FOR EXPERTS ONLY!

575:    Collective on VecScatter

577:    Input Parameters:
578: +  sf    - vector scatter context
579: .  tomap   - remapping plan for "to" indices (may be NULL).
580: -  frommap - remapping plan for "from" indices (may be NULL)

582:    Level: developer

584:    Notes:
585:      In the parallel case the todata contains indices from where the data is taken
586:      (and then sent to others)! The fromdata contains indices from where the received
587:      data is finally put locally.

589:      In the sequential case the todata contains indices where the data is put
590:      and the fromdata contains indices from where the data is taken.
591:      This is backwards from the parallel case!

593: @*/
594: PetscErrorCode  VecScatterRemap(VecScatter sf,PetscInt tomap[],PetscInt frommap[])
595: {
596:   PetscErrorCode         ierr;

601:   VecScatterRemap_Internal(sf,tomap,frommap);
602:   if (frommap) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unable to remap the FROM in scatters yet");
603:   /* Mark the vector lengths as unknown because we do not know the lengths of the remapped vectors */
604:   sf->vscat.from_n = -1;
605:   sf->vscat.to_n   = -1;
606:   return(0);
607: }
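/* A minimal sketch of a VecScatterRemap() call (hypothetical example, assuming bs=1; n is the local
   size of the x used in VecScatterCreate(), and wherever the scatter used to read x[i] it will now
   read x[tomap[i]]):

     PetscInt i,n,*tomap;

     VecGetLocalSize(x,&n);
     PetscMalloc1(n,&tomap);
     for (i=0; i<n; i++) tomap[i] = n-1-i;   // e.g., read the local part of x in reverse order
     VecScatterRemap(sf,tomap,NULL);         // frommap must be NULL; remapping FROM is unsupported
     PetscFree(tomap);
*/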

609: /*@
610:   VecScatterSetFromOptions - Configures the vector scatter from the options database.

612:   Collective on VecScatter

614:   Input Parameter:
615: . sf - The vector scatter

617:   Notes:
618:     To see all options, run your program with the -help option, or consult the users manual.
619:     Must be called before VecScatterSetUp() and before the vector scatter is used.

621:   Level: beginner

623: .seealso: VecScatterCreate(), VecScatterDestroy(), VecScatterSetUp()
624: @*/
625: PetscErrorCode VecScatterSetFromOptions(VecScatter sf)
626: {

631:   PetscObjectOptionsBegin((PetscObject)sf);

633:   sf->vscat.beginandendtogether = PETSC_FALSE;
634:   PetscOptionsBool("-vecscatter_merge","Use combined (merged) vector scatter begin and end","VecScatterCreate",sf->vscat.beginandendtogether,&sf->vscat.beginandendtogether,NULL);
635:   if (sf->vscat.beginandendtogether) {PetscInfo(sf,"Using combined (merged) vector scatter begin and end\n");}

637:   sf->vscat.packongpu = PETSC_TRUE;
638:   PetscOptionsBool("-vecscatter_packongpu","For GPU vectors, pack needed entries on GPU, then copy packed data to CPU, then do MPI","VecScatterCreate",sf->vscat.packongpu,&sf->vscat.packongpu,NULL);
639:   if (sf->vscat.packongpu) {PetscInfo(sf,"For GPU vectors, pack needed entries on GPU, then copy packed data to CPU, then do MPI\n");}
640:   PetscOptionsEnd();
641:   return(0);
642: }
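/* A short sketch of setting the two options above programmatically instead of on the command line
   (a hypothetical use; normally one just passes -vecscatter_merge / -vecscatter_packongpu at run time):

     PetscOptionsSetValue(NULL,"-vecscatter_merge","1");
     PetscOptionsSetValue(NULL,"-vecscatter_packongpu","0");
     VecScatterSetFromOptions(sf);
*/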

644: /* ---------------------------------------------------------------- */
645: /*@
646:    VecScatterCreate - Creates a vector scatter context.

648:    Collective on Vec

650:    Input Parameters:
651: +  xin - a vector that defines the shape (parallel data layout of the vector)
652:          of vectors from which we scatter
653: .  yin - a vector that defines the shape (parallel data layout of the vector)
654:          of vectors to which we scatter
655: .  ix - the indices of xin to scatter (if NULL scatters all values)
656: -  iy - the indices of yin to hold results (if NULL fills entire vector yin)

658:    Output Parameter:
659: .  newsf - location to store the new scatter (SF) context

661:    Options Database Keys:
662: +  -vecscatter_view         - Prints details of the communication
663: .  -vecscatter_view ::ascii_info    - Prints fewer details about the communication
664: .  -vecscatter_merge        - VecScatterBegin() handles all of the communication and VecScatterEnd() is a no-op;
665:                               this eliminates the chance to overlap computation and communication
666: -  -vecscatter_packongpu    - For GPU vectors, pack needed entries on GPU, then copy packed data to CPU, then do MPI.
667:                               Otherwise, we might copy a segment encompassing needed entries. Default is TRUE.

669:     Level: intermediate

671:   Notes:
672:    If both xin and yin are parallel, their communicator must be on the same
673:    set of processes, but their process order can be different.
674:    In calls to VecScatterBegin()/VecScatterEnd() you can use different vectors than the xin and
675:    yin you used above; BUT they must have the same parallel data layout, for example,
676:    they could be obtained from VecDuplicate().
677:    A VecScatter context CANNOT be used in two or more simultaneous scatters;
678:    that is you cannot call a second VecScatterBegin() with the same scatter
679:    context until the VecScatterEnd() has been called on the first VecScatterBegin().
680:    In this case a separate VecScatter is needed for each concurrent scatter.

682:    Currently the MPI_Send() calls use PERSISTENT versions.
683:    (This unfortunately requires that the same in and out arrays be used for each use; this
684:     is why we always need to pack the input into the work array before sending
685:     and unpack upon receiving, instead of using MPI datatypes to avoid the packing/unpacking.)

687:    ix and iy cannot both be NULL.

689:    Use VecScatterCreateToAll() to create a vecscatter that copies an MPI vector to sequential vectors on all MPI ranks.
690:    Use VecScatterCreateToZero() to create a vecscatter that copies an MPI vector to a sequential vector on MPI rank 0.
691:    These special vecscatters have better performance than general ones.

693: .seealso: VecScatterDestroy(), VecScatterCreateToAll(), VecScatterCreateToZero(), PetscSFCreate()
694: @*/
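/* A minimal usage sketch (hypothetical sizes and indices; x is an MPI vector, y a sequential vector,
   and the stride IS below simply selects the first ny entries of x):

     Vec        x,y;
     IS         ix;
     VecScatter ctx;
     PetscInt   ny = 5;

     VecCreateMPI(PETSC_COMM_WORLD,PETSC_DECIDE,100,&x);
     VecCreateSeq(PETSC_COMM_SELF,ny,&y);
     ISCreateStride(PETSC_COMM_SELF,ny,0,1,&ix);
     VecScatterCreate(x,ix,y,NULL,&ctx);                       // NULL iy: fill all of y
     VecScatterBegin(ctx,x,y,INSERT_VALUES,SCATTER_FORWARD);
     VecScatterEnd(ctx,x,y,INSERT_VALUES,SCATTER_FORWARD);
     VecScatterDestroy(&ctx);
     ISDestroy(&ix);
     VecDestroy(&x);
     VecDestroy(&y);
*/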
695: PetscErrorCode VecScatterCreate(Vec x,IS ix,Vec y,IS iy,VecScatter *newsf)
696: {
698:   MPI_Comm       xcomm,ycomm,bigcomm;
699:   Vec            xx,yy;
700:   IS             ix_old=ix,iy_old=iy,ixx,iyy;
701:   PetscMPIInt    xcommsize,ycommsize,rank,result;
702:   PetscInt       i,n,N,nroots,nleaves,*ilocal,xstart,ystart,ixsize,iysize,xlen,ylen;
703:   const PetscInt *xindices,*yindices;
704:   PetscSFNode    *iremote;
705:   PetscLayout    xlayout,ylayout;
706:   ISTypeID       ixid,iyid;
707:   PetscInt       bs,bsx,bsy,min,max,m[2],ixfirst,ixstep,iyfirst,iystep;
708:   PetscBool      can_do_block_opt=PETSC_FALSE;
709:   PetscSF        sf;

713:   if (!ix && !iy) SETERRQ(PetscObjectComm((PetscObject)x),PETSC_ERR_SUP,"Cannot pass default in for both input and output indices");

715:   /* Get comm from x and y */
716:   PetscObjectGetComm((PetscObject)x,&xcomm);
717:   MPI_Comm_size(xcomm,&xcommsize);
718:   PetscObjectGetComm((PetscObject)y,&ycomm);
719:   MPI_Comm_size(ycomm,&ycommsize);
720:   if (xcommsize > 1 && ycommsize > 1) {
721:     MPI_Comm_compare(xcomm,ycomm,&result);
722:     if (result == MPI_UNEQUAL) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NOTSAMECOMM,"VecScatterCreate: parallel vectors x and y must have identical/congruent/similar communicators");
723:   }
724:   bs = 1; /* default, no blocking */

726:   /*
727:    Let P and S stand for parallel and sequential vectors respectively. There are four combinations of vecscatters: PtoP, PtoS,
728:    StoP and StoS. The assumption of VecScatterCreate(Vec x,IS ix,Vec y,IS iy,VecScatter *newctx) is: if x is parallel, then ix
729:    contains global indices of x. If x is sequential, ix contains local indices of x. Similarly for y and iy.

731:    SF is built around the concepts of local leaves and remote roots. We treat the source vector x as roots and the destination vector y as
732:    leaves. A PtoS scatter can be naturally mapped to SF. We transform PtoP and StoP to PtoS, and treat StoS as trivial PtoS.
733:   */

735:   /* NULL ix or iy in VecScatterCreate(x,ix,y,iy,newctx) has special meaning. Recover them for these cases */
736:   if (!ix) {
737:     if (xcommsize > 1 && ycommsize == 1) { /* PtoS: null ix means the whole x will be scattered to each seq y */
738:       VecGetSize(x,&N);
739:       ISCreateStride(PETSC_COMM_SELF,N,0,1,&ix);
740:     } else { /* PtoP, StoP or StoS: null ix means the whole local part of x will be scattered */
741:       VecGetLocalSize(x,&n);
742:       VecGetOwnershipRange(x,&xstart,NULL);
743:       ISCreateStride(PETSC_COMM_SELF,n,xstart,1,&ix);
744:     }
745:   }

747:   if (!iy) {
748:     if (xcommsize == 1 && ycommsize > 1) { /* StoP: null iy means the whole y will be scattered to, from each seq x */
749:       VecGetSize(y,&N);
750:       ISCreateStride(PETSC_COMM_SELF,N,0,1,&iy);
751:     } else { /* PtoP, StoP or StoS: null iy means the whole local part of y will be scattered to */
752:       VecGetLocalSize(y,&n);
753:       VecGetOwnershipRange(y,&ystart,NULL);
754:       ISCreateStride(PETSC_COMM_SELF,n,ystart,1,&iy);
755:     }
756:   }

758:   /* Do error checking immediately after we have non-empty ix, iy */
759:   ISGetLocalSize(ix,&ixsize);
760:   ISGetLocalSize(iy,&iysize);
761:   VecGetSize(x,&xlen);
762:   VecGetSize(y,&ylen);
763:   if (ixsize != iysize) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Scatter sizes of ix and iy don't match locally ix=%D iy=%D",ixsize,iysize);
764:   ISGetMinMax(ix,&min,&max);
765:   if (min < 0 || max >= xlen) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scatter indices in ix are out of range");
766:   ISGetMinMax(iy,&min,&max);
767:   if (min < 0 || max >= ylen) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scatter indices in iy are out of range");

769:   /* Extract info about ix, iy for further test */
770:   ISGetTypeID_Private(ix,&ixid);
771:   ISGetTypeID_Private(iy,&iyid);
772:   if (ixid == IS_BLOCK)       {ISGetBlockSize(ix,&bsx);}
773:   else if (ixid == IS_STRIDE) {ISStrideGetInfo(ix,&ixfirst,&ixstep);}

775:   if (iyid == IS_BLOCK)      {ISGetBlockSize(iy,&bsy);}
776:   else if (iyid == IS_STRIDE) {ISStrideGetInfo(iy,&iyfirst,&iystep);}

778:   /* Check if a PtoS scatter is a special ToAll/ToZero scatter, which can be the result of VecScatterCreateToAll/Zero.
779:      ToAll means a whole MPI vector is copied to a seq vector on every process. ToZero means a whole MPI
780:      vector is copied to a seq vector on rank 0 and other processes do nothing (i.e., they input empty ix, iy).

782:      We can optimize these scatters with MPI collectives. We can also avoid costly analysis used for general scatters.
783:   */
784:   if (xcommsize > 1 && ycommsize == 1) { /* Ranks do not diverge at this if-test */
785:     PetscInt    pattern[2] = {0, 0}; /* A boolean array with pattern[0] for allgather-like (ToAll) and pattern[1] for gather-like (ToZero) */
786:     PetscLayout map;

788:     MPI_Comm_rank(xcomm,&rank);
789:     VecGetLayout(x,&map);
790:     if (rank == 0) {
791:       if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
792:         /* Rank 0 scatters the whole mpi x to seq y, so it is either a ToAll or a ToZero candidate in its view */
793:         pattern[0] = pattern[1] = 1;
794:       }
795:     } else {
796:       if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
797:         /* Other ranks also scatter the whole mpi x to seq y, so it is a ToAll candidate in their view */
798:         pattern[0] = 1;
799:       } else if (ixsize == 0) {
800:         /* Other ranks do nothing, so it is a ToZero candidate */
801:         pattern[1] = 1;
802:       }
803:     }

805:     /* One stone (the expensive allreduce), two birds: pattern[] tells whether it is ToAll or ToZero */
806:     MPIU_Allreduce(MPI_IN_PLACE,pattern,2,MPIU_INT,MPI_LAND,xcomm);

808:     if (pattern[0] || pattern[1]) {
809:       PetscSFCreate(xcomm,&sf);
810:       PetscSFSetFromOptions(sf);
811:       PetscSFSetGraphWithPattern(sf,map,pattern[0] ? PETSCSF_PATTERN_ALLGATHER : PETSCSF_PATTERN_GATHER);
812:       goto functionend; /* No further analysis needed. What a big win! */
813:     }
814:   }

816:   /* Continue ...
817:      Do block optimization by taking advantage of high level info available in ix, iy.
818:      The block optimization is valid when all of the following conditions are met:
819:      1) ix, iy are blocked or can be blocked (i.e., strided with step=1);
820:      2) ix, iy have the same block size;
821:      3) all processors agree on one block size;
822:      4) no blocks span more than one process;
823:    */
824:   bigcomm = (xcommsize == 1) ? ycomm : xcomm;

826:   /* Processors could go through different paths in this if-else test */
827:   m[0] = m[1] = PETSC_MPI_INT_MIN;
828:   if (ixid == IS_BLOCK && iyid == IS_BLOCK) {
829:     m[0] = PetscMax(bsx,bsy);
830:     m[1] = -PetscMin(bsx,bsy);
831:   } else if (ixid == IS_BLOCK  && iyid == IS_STRIDE && iystep==1 && iyfirst%bsx==0) {
832:     m[0] = bsx;
833:     m[1] = -bsx;
834:   } else if (ixid == IS_STRIDE && iyid == IS_BLOCK  && ixstep==1 && ixfirst%bsy==0) {
835:     m[0] = bsy;
836:     m[1] = -bsy;
837:   }
838:   /* Get max and min of bsx,bsy over all processes in one allreduce */
839:   MPIU_Allreduce(MPI_IN_PLACE,m,2,MPIU_INT,MPI_MAX,bigcomm);
840:   max = m[0]; min = -m[1];

842:   /* Since we used allreduce above, all ranks will have the same min and max. min==max
843:      implies all ranks have the same bs. Do a further test to see if local vectors are divisible
844:      by bs on ALL ranks. If they are, we are assured that no blocks span more than one processor.
845:    */
846:   if (min == max && min > 1) {
847:     VecGetLocalSize(x,&xlen);
848:     VecGetLocalSize(y,&ylen);
849:     m[0] = xlen%min;
850:     m[1] = ylen%min;
851:     MPIU_Allreduce(MPI_IN_PLACE,m,2,MPIU_INT,MPI_LOR,bigcomm);
852:     if (!m[0] && !m[1]) can_do_block_opt = PETSC_TRUE;
853:   }

855:   /* If can_do_block_opt, then shrink x, y, ix and iy by bs to get xx, yy, ixx and iyy, whose indices
856:      and layout are actually used in building SF. Suppose blocked ix representing {0,1,2,6,7,8} has
857:      indices {0,2} and bs=3, then ixx = {0,2}; suppose strided iy={3,4,5,6,7,8}, then iyy={1,2}.

859:      xx is a little special. If x is seq, then xx is the concatenation of seq x's on ycomm. In this way,
860:      we can treat PtoP and StoP uniformly as PtoS.
861:    */
862:   if (can_do_block_opt) {
863:     const PetscInt *indices;

865:     /* Shrink x and ix */
866:     bs   = min;
867:     VecCreateMPIWithArray(bigcomm,1,xlen/bs,PETSC_DECIDE,NULL,&xx); /* We only care about xx's layout */
868:     if (ixid == IS_BLOCK) {
869:       ISBlockGetIndices(ix,&indices);
870:       ISBlockGetLocalSize(ix,&ixsize);
871:       ISCreateGeneral(PETSC_COMM_SELF,ixsize,indices,PETSC_COPY_VALUES,&ixx);
872:       ISBlockRestoreIndices(ix,&indices);
873:     } else { /* ixid == IS_STRIDE */
874:       ISGetLocalSize(ix,&ixsize);
875:       ISCreateStride(PETSC_COMM_SELF,ixsize/bs,ixfirst/bs,1,&ixx);
876:     }

878:     /* Shrink y and iy */
879:     VecCreateMPIWithArray(ycomm,1,ylen/bs,PETSC_DECIDE,NULL,&yy);
880:     if (iyid == IS_BLOCK) {
881:       ISBlockGetIndices(iy,&indices);
882:       ISBlockGetLocalSize(iy,&iysize);
883:       ISCreateGeneral(PETSC_COMM_SELF,iysize,indices,PETSC_COPY_VALUES,&iyy);
884:       ISBlockRestoreIndices(iy,&indices);
885:     } else { /* iyid == IS_STRIDE */
886:       ISGetLocalSize(iy,&iysize);
887:       ISCreateStride(PETSC_COMM_SELF,iysize/bs,iyfirst/bs,1,&iyy);
888:     }
889:   } else {
890:     ixx = ix;
891:     iyy = iy;
892:     yy  = y;
893:     if (xcommsize == 1) {VecCreateMPIWithArray(bigcomm,1,xlen,PETSC_DECIDE,NULL,&xx);} else xx = x;
894:   }

896:   /* Now we are ready to build the SF with the preprocessed (xx, yy) and (ixx, iyy) */
897:   ISGetIndices(ixx,&xindices);
898:   ISGetIndices(iyy,&yindices);
899:   VecGetLayout(xx,&xlayout);

901:   if (ycommsize > 1) {
902:     /* PtoP or StoP */

904:     /* Below is a piece of complex code with a very simple goal: move the global index pairs (xindices[i], yindices[i])
905:        to the owner process of yindices[i] according to ylayout, for i = 0..n.

907:        I originally did it through a temp sf, but later realized the old design was inefficient and also distorted the log view.
908:        We want to map one VecScatterCreate() call to one PetscSFCreate() call. The old design mapped to three
909:        PetscSFCreate() calls. This code is on the critical path of VecScatterSetUp and is used by every VecScatterCreate.
910:        So I commented it out and did another optimized implementation. The commented code is left here for reference.
911:      */
912: #if 0
913:     const PetscInt *degree;
914:     PetscSF        tmpsf;
915:     PetscInt       inedges=0,*leafdata,*rootdata;

917:     VecGetOwnershipRange(xx,&xstart,NULL);
918:     VecGetLayout(yy,&ylayout);
919:     VecGetOwnershipRange(yy,&ystart,NULL);

921:     VecGetLocalSize(yy,&nroots);
922:     ISGetLocalSize(iyy,&nleaves);
923:     PetscMalloc2(nleaves,&iremote,nleaves*2,&leafdata);

925:     for (i=0; i<nleaves; i++) {
926:       PetscLayoutFindOwnerIndex(ylayout,yindices[i],&iremote[i].rank,&iremote[i].index);
927:       leafdata[2*i]   = yindices[i];
928:       leafdata[2*i+1] = (xcommsize > 1)? xindices[i] : xindices[i] + xstart;
929:     }

931:     PetscSFCreate(ycomm,&tmpsf);
932:     PetscSFSetGraph(tmpsf,nroots,nleaves,NULL,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);

934:     PetscSFComputeDegreeBegin(tmpsf,&degree);
935:     PetscSFComputeDegreeEnd(tmpsf,&degree);

937:     for (i=0; i<nroots; i++) inedges += degree[i];
938:     PetscMalloc1(inedges*2,&rootdata);
939:     PetscSFGatherBegin(tmpsf,MPIU_2INT,leafdata,rootdata);
940:     PetscSFGatherEnd(tmpsf,MPIU_2INT,leafdata,rootdata);

942:     PetscFree2(iremote,leafdata);
943:     PetscSFDestroy(&tmpsf);

945:     /* rootdata contains global index pairs (i, j). The j's are owned by the current process, but the i's can point anywhere.
946:        We convert j to local, and convert i to (rank, index). In the end, we get a PtoS scatter suitable for building the SF.
947:      */
948:     nleaves = inedges;
949:     VecGetLocalSize(xx,&nroots);
950:     PetscMalloc1(nleaves,&ilocal);
951:     PetscMalloc1(nleaves,&iremote);

953:     for (i=0; i<inedges; i++) {
954:       ilocal[i] = rootdata[2*i] - ystart; /* convert y's global index to local index */
955:       PetscLayoutFindOwnerIndex(xlayout,rootdata[2*i+1],&iremote[i].rank,&iremote[i].index); /* convert x's global index to (rank, index) */
956:     }
957:     PetscFree(rootdata);
958: #else
959:     PetscInt       j,k,n,disp,rlentotal,*sstart,*xindices_sorted,*yindices_sorted;
960:     const PetscInt *yrange;
961:     PetscMPIInt    nsend,nrecv,nreq,count,yrank,*slens,*rlens,*sendto,*recvfrom,tag1,tag2;
962:     PetscInt       *rxindices,*ryindices;
963:     MPI_Request    *reqs,*sreqs,*rreqs;

965:     /* Sorting makes the code simpler and faster, and also helps get rid of many O(P) arrays, which hurt scalability at large scale.
966:        yindices_sorted - sorted yindices
967:        xindices_sorted - xindices sorted along with yindices
968:      */
969:     ISGetLocalSize(ixx,&n); /* ixx, iyy have the same local size */
970:     PetscMalloc2(n,&xindices_sorted,n,&yindices_sorted);
971:     PetscArraycpy(xindices_sorted,xindices,n);
972:     PetscArraycpy(yindices_sorted,yindices,n);
973:     PetscSortIntWithArray(n,yindices_sorted,xindices_sorted);
974:     VecGetOwnershipRange(xx,&xstart,NULL);
975:     if (xcommsize == 1) {for (i=0; i<n; i++) xindices_sorted[i] += xstart;} /* Convert to global indices */

977:     /*=============================================================================
978:              Calculate info about messages I need to send
979:       =============================================================================*/
980:     /* nsend    - number of non-empty messages to send
981:        sendto   - [nsend] ranks I will send messages to
982:        sstart   - [nsend+1] sstart[i] is the start index in xindices_sorted[] I send to rank sendto[i]
983:        slens    - [ycommsize] I want to send slens[i] entries to rank i.
984:      */
985:     VecGetLayout(yy,&ylayout);
986:     PetscLayoutGetRanges(ylayout,&yrange);
987:     PetscCalloc1(ycommsize,&slens); /* The only O(P) array in this algorithm */

989:     i = j = nsend = 0;
990:     while (i < n) {
991:       if (yindices_sorted[i] >= yrange[j+1]) { /* If i-th index is out of rank j's bound */
992:         do {j++;} while (j < ycommsize && yindices_sorted[i] >= yrange[j+1]); /* Increase j until i-th index falls in rank j's bound; check j first to avoid reading past yrange[] */
993:         if (j == ycommsize) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Index %D not owned by any process, upper bound %D",yindices_sorted[i],yrange[ycommsize]);
994:       }
995:       i++;
996:       if (!slens[j]++) nsend++;
997:     }

999:     PetscMalloc2(nsend+1,&sstart,nsend,&sendto);

1001:     sstart[0] = 0;
1002:     for (i=j=0; i<ycommsize; i++) {
1003:       if (slens[i]) {
1004:         sendto[j]   = (PetscMPIInt)i;
1005:         sstart[j+1] = sstart[j] + slens[i];
1006:         j++;
1007:       }
1008:     }

1010:     /*=============================================================================
1011:       Calculate the reverse info about messages I will recv
1012:       =============================================================================*/
1013:     /* nrecv     - number of messages I will recv
1014:        recvfrom  - [nrecv] ranks I recv from
1015:        rlens     - [nrecv] I will recv rlens[i] entries from rank recvfrom[i]
1016:        rlentotal - sum of rlens[]
1017:        rxindices - [rlentotal] recv buffer for xindices_sorted
1018:        ryindices - [rlentotal] recv buffer for yindices_sorted
1019:      */
1020:     PetscGatherNumberOfMessages(ycomm,NULL,slens,&nrecv);
1021:     PetscGatherMessageLengths(ycomm,nsend,nrecv,slens,&recvfrom,&rlens);
1022:     PetscFree(slens); /* Free the O(P) array ASAP */
1023:     rlentotal = 0; for (i=0; i<nrecv; i++) rlentotal += rlens[i];

1025:     /*=============================================================================
1026:       Communicate with processors in recvfrom[] to populate rxindices and ryindices
1027:       ============================================================================*/
1028:     PetscCommGetNewTag(ycomm,&tag1);
1029:     PetscCommGetNewTag(ycomm,&tag2);
1030:     PetscMalloc2(rlentotal,&rxindices,rlentotal,&ryindices);
1031:     PetscMPIIntCast((nsend+nrecv)*2,&nreq);
1032:     PetscMalloc1(nreq,&reqs);
1033:     sreqs = reqs;
1034:     rreqs = reqs + nsend*2;

1036:     for (i=disp=0; i<nrecv; i++) {
1037:       count = rlens[i];
1038:       MPI_Irecv(rxindices+disp,count,MPIU_INT,recvfrom[i],tag1,ycomm,rreqs+i);
1039:       MPI_Irecv(ryindices+disp,count,MPIU_INT,recvfrom[i],tag2,ycomm,rreqs+nrecv+i);
1040:       disp += rlens[i];
1041:     }

1043:     for (i=0; i<nsend; i++) {
1044:       PetscMPIIntCast(sstart[i+1]-sstart[i],&count);
1045:       MPI_Isend(xindices_sorted+sstart[i],count,MPIU_INT,sendto[i],tag1,ycomm,sreqs+i);
1046:       MPI_Isend(yindices_sorted+sstart[i],count,MPIU_INT,sendto[i],tag2,ycomm,sreqs+nsend+i);
1047:     }
1048:     MPI_Waitall(nreq,reqs,MPI_STATUSES_IGNORE);

1050:     /* Transform VecScatter into SF */
1051:     nleaves = rlentotal;
1052:     PetscMalloc1(nleaves,&ilocal);
1053:     PetscMalloc1(nleaves,&iremote);
1054:     MPI_Comm_rank(ycomm,&yrank);
1055:     for (i=disp=0; i<nrecv; i++) {
1056:       for (j=0; j<rlens[i]; j++) {
1057:         k               = disp + j; /* k-th index pair */
1058:         ilocal[k]       = ryindices[k] - yrange[yrank]; /* Convert y's global index to local index */
1059:         PetscLayoutFindOwnerIndex(xlayout,rxindices[k],&rank,&iremote[k].index); /* Convert x's global index to (rank, index) */
1060:         iremote[k].rank = rank;
1061:       }
1062:       disp += rlens[i];
1063:     }

1065:     PetscFree2(sstart,sendto);
1066:     PetscFree(slens);
1067:     PetscFree(rlens);
1068:     PetscFree(recvfrom);
1069:     PetscFree(reqs);
1070:     PetscFree2(rxindices,ryindices);
1071:     PetscFree2(xindices_sorted,yindices_sorted);
1072: #endif
1073:   } else {
1074:     /* PtoS or StoS */
1075:     ISGetLocalSize(iyy,&nleaves);
1076:     PetscMalloc1(nleaves,&ilocal);
1077:     PetscMalloc1(nleaves,&iremote);
1078:     PetscArraycpy(ilocal,yindices,nleaves);
1079:     for (i=0; i<nleaves; i++) {
1080:       PetscLayoutFindOwnerIndex(xlayout,xindices[i],&rank,&iremote[i].index);
1081:       iremote[i].rank = rank;
1082:     }
1083:   }

1085:   /* MUST build SF on xx's comm, which is not necessarily identical to yy's comm.
1086:      In SF's view, xx contains the roots (i.e., the remote) and iremote[].rank are ranks in xx's comm.
1087:      yy contains the leaves, which are local and can be thought of as part of PETSC_COMM_SELF. */
1088:   PetscSFCreate(PetscObjectComm((PetscObject)xx),&sf);
1089:   PetscSFSetFromOptions(sf);
1090:   VecGetLocalSize(xx,&nroots);
1091:   PetscSFSetGraph(sf,nroots,nleaves,ilocal,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER); /* Give ilocal/iremote to petsc and no need to free them here */

1093:   /* Free memory no longer needed */
1094:   ISRestoreIndices(ixx,&xindices);
1095:   ISRestoreIndices(iyy,&yindices);
1096:   if (can_do_block_opt) {
1097:     VecDestroy(&xx);
1098:     VecDestroy(&yy);
1099:     ISDestroy(&ixx);
1100:     ISDestroy(&iyy);
1101:   } else if (xcommsize == 1) {
1102:     VecDestroy(&xx);
1103:   }

1105: functionend:
1106:   sf->vscat.bs = bs;
1107:   if (sf->vscat.bs > 1) {
1108:     MPI_Type_contiguous(sf->vscat.bs,MPIU_SCALAR,&sf->vscat.unit);
1109:     MPI_Type_commit(&sf->vscat.unit);
1110:   } else {
1111:     sf->vscat.unit = MPIU_SCALAR;
1112:   }
1113:   VecGetLocalSize(x,&sf->vscat.from_n);
1114:   VecGetLocalSize(y,&sf->vscat.to_n);
1115:   if (!ix_old) {ISDestroy(&ix);} /* We created helper ix, iy. Free them */
1116:   if (!iy_old) {ISDestroy(&iy);}

1118:   /* Set default */
1119:   VecScatterSetFromOptions(sf);

1121:   *newsf = sf;
1122:   return(0);
1123: }

1125: /*@C
1126:       VecScatterCreateToAll - Creates a vector and a scatter context that copies all
1127:           vector values to each processor

1129:   Collective on Vec

1131:   Input Parameter:
1132: .  vin  - input MPIVEC

1134:   Output Parameters:
1135: +  ctx - scatter context
1136: -  vout - output SEQVEC that is large enough to scatter into

1138:   Level: intermediate

1140:    Note: vout may be NULL [PETSC_NULL_VEC from fortran] if you do not
1141:    need to have it created

1143:    Usage:
1144: $        VecScatterCreateToAll(vin,&ctx,&vout);
1145: $
1146: $        // scatter as many times as you need
1147: $        VecScatterBegin(ctx,vin,vout,INSERT_VALUES,SCATTER_FORWARD);
1148: $        VecScatterEnd(ctx,vin,vout,INSERT_VALUES,SCATTER_FORWARD);
1149: $
1150: $        // destroy scatter context and local vector when no longer needed
1151: $        VecScatterDestroy(&ctx);
1152: $        VecDestroy(&vout);

1154:     Do NOT create a vector and then pass it in as the final argument vout! vout is created by this routine
1155:   automatically (unless you pass NULL for that argument because you do not need it).

1157: .seealso VecScatterCreate(), VecScatterCreateToZero(), VecScatterBegin(), VecScatterEnd()

1159: @*/
1160: PetscErrorCode  VecScatterCreateToAll(Vec vin,VecScatter *ctx,Vec *vout)
1161: {

1164:   PetscInt       N;
1165:   IS             is;
1166:   Vec            tmp;
1167:   Vec            *tmpv;
1168:   PetscBool      tmpvout = PETSC_FALSE;

1174:   if (vout) {
1176:     tmpv = vout;
1177:   } else {
1178:     tmpvout = PETSC_TRUE;
1179:     tmpv    = &tmp;
1180:   }

1182:   /* Create a seq vec on each proc, with the same size as the original mpi vec */
1183:   VecGetSize(vin,&N);
1184:   VecCreateSeq(PETSC_COMM_SELF,N,tmpv);
1185:   VecSetFromOptions(*tmpv);
1186:   /* Create the VecScatter ctx with the communication info */
1187:   ISCreateStride(PETSC_COMM_SELF,N,0,1,&is);
1188:   VecScatterCreate(vin,is,*tmpv,is,ctx);
1189:   ISDestroy(&is);
1190:   if (tmpvout) {VecDestroy(tmpv);}
1191:   return(0);
1192: }

1194: /*@C
1195:       VecScatterCreateToZero - Creates an output vector and a scatter context used to
1196:               copy all vector values into the output vector on the zeroth processor

1198:   Collective on Vec

1200:   Input Parameter:
1201: .  vin  - input MPIVEC

1203:   Output Parameters:
1204: +  ctx - scatter context
1205: -  vout - output SEQVEC that is large enough to scatter into on processor 0 and
1206:           of length zero on all other processors

1208:   Level: intermediate

1210:    Note: vout may be NULL [PETSC_NULL_VEC from fortran] if you do not
1211:    need to have it created

1213:    Usage:
1214: $        VecScatterCreateToZero(vin,&ctx,&vout);
1215: $
1216: $        // scatter as many times as you need
1217: $        VecScatterBegin(ctx,vin,vout,INSERT_VALUES,SCATTER_FORWARD);
1218: $        VecScatterEnd(ctx,vin,vout,INSERT_VALUES,SCATTER_FORWARD);
1219: $
1220: $        // destroy scatter context and local vector when no longer needed
1221: $        VecScatterDestroy(&ctx);
1222: $        VecDestroy(&vout);

1224: .seealso VecScatterCreate(), VecScatterCreateToAll(), VecScatterBegin(), VecScatterEnd()

1226:     Do NOT create a vector and then pass it in as the final argument vout! vout is created by this routine
1227:   automatically (unless you pass NULL for that argument because you do not need it).

1229: @*/
1230: PetscErrorCode  VecScatterCreateToZero(Vec vin,VecScatter *ctx,Vec *vout)
1231: {

1234:   PetscInt       N;
1235:   PetscMPIInt    rank;
1236:   IS             is;
1237:   Vec            tmp;
1238:   Vec            *tmpv;
1239:   PetscBool      tmpvout = PETSC_FALSE;

1245:   if (vout) {
1247:     tmpv = vout;
1248:   } else {
1249:     tmpvout = PETSC_TRUE;
1250:     tmpv    = &tmp;
1251:   }

1253:   /* Create a seq vec on each proc: the full size of the original mpi vec on process 0, size 0 elsewhere */
1254:   VecGetSize(vin,&N);
1255:   MPI_Comm_rank(PetscObjectComm((PetscObject)vin),&rank);
1256:   if (rank) N = 0;
1257:   VecCreateSeq(PETSC_COMM_SELF,N,tmpv);
1258:   VecSetFromOptions(*tmpv);
1259:   /* Create the VecScatter ctx with the communication info */
1260:   ISCreateStride(PETSC_COMM_SELF,N,0,1,&is);
1261:   VecScatterCreate(vin,is,*tmpv,is,ctx);
1262:   ISDestroy(&is);
1263:   if (tmpvout) {VecDestroy(tmpv);}
1264:   return(0);
1265: }

1267: /*@
1268:    VecScatterBegin - Begins a generalized scatter from one vector to
1269:    another. Complete the scattering phase with VecScatterEnd().

1271:    Neighbor-wise Collective on VecScatter

1273:    Input Parameters:
1274: +  sf - scatter context generated by VecScatterCreate()
1275: .  x - the vector from which we scatter
1276: .  y - the vector to which we scatter
1277: .  addv - either ADD_VALUES, MAX_VALUES, MIN_VALUES or INSERT_VALUES, with INSERT_VALUES mode any location
1278:           not scattered to retains its old value; i.e. the vector is NOT first zeroed.
1279: -  mode - the scattering mode, usually SCATTER_FORWARD.  The available modes are:
1280:     SCATTER_FORWARD or SCATTER_REVERSE

1282:    Level: intermediate

1284:    Options Database: See VecScatterCreate()

1286:    Notes:
1287:    The vectors x and y need not be the same vectors used in the call
1288:    to VecScatterCreate(), but x must have the same parallel data layout
1289:    as that passed in as the x to VecScatterCreate(), similarly for the y.
1290:    Most likely they have been obtained from VecDuplicate().

1292:    You cannot change the values in the input vector between the calls to VecScatterBegin()
1293:    and VecScatterEnd().

1295:    If you use SCATTER_REVERSE the two arguments x and y should be reversed relative to
1296:    the SCATTER_FORWARD case.

1298:    y[iy[i]] = x[ix[i]], for i=0,...,ni-1

1300:    This scatter is far more general than the conventional
1301:    scatter, since it can be a gather or a scatter or a combination,
1302:    depending on the indices ix and iy.  If x is a parallel vector and y
1303:    is sequential, VecScatterBegin() can serve to gather values to a
1304:    single processor.  Similarly, if y is parallel and x sequential, the
1305:    routine can scatter from one processor to many processors.

1307: .seealso: VecScatterCreate(), VecScatterEnd()
1308: @*/
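/* A short sketch of the SCATTER_REVERSE usage described in the Notes above (hypothetical: ctx was
   created as VecScatterCreate(x,ix,y,iy,&ctx); in the reverse direction the vector arguments are
   swapped and data flows from y back into x):

     VecScatterBegin(ctx,y,x,ADD_VALUES,SCATTER_REVERSE);
     VecScatterEnd(ctx,y,x,ADD_VALUES,SCATTER_REVERSE);
*/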
1309: PetscErrorCode  VecScatterBegin(VecScatter sf,Vec x,Vec y,InsertMode addv,ScatterMode mode)
1310: {
1312:   PetscInt       to_n,from_n;

1318:   if (PetscDefined(USE_DEBUG)) {
1319:     /*
1320:      Error checking to make sure these vectors match the vectors used
1321:      to create the vector scatter context. -1 in the from_n and to_n indicate the
1322:      vector lengths are unknown (for example with mapped scatters) and thus
1323:      no error checking is performed.
1324:      */
1325:     if (sf->vscat.from_n >= 0 && sf->vscat.to_n >= 0) {
1326:       VecGetLocalSize(x,&from_n);
1327:       VecGetLocalSize(y,&to_n);
1328:       if (mode & SCATTER_REVERSE) {
1329:         if (to_n != sf->vscat.from_n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Vector wrong size %D for scatter %D (scatter reverse and vector to != sf from size)",to_n,sf->vscat.from_n);
1330:         if (from_n != sf->vscat.to_n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Vector wrong size %D for scatter %D (scatter reverse and vector from != sf to size)",from_n,sf->vscat.to_n);
1331:       } else {
1332:         if (to_n != sf->vscat.to_n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Vector wrong size %D for scatter %D (scatter forward and vector to != sf to size)",to_n,sf->vscat.to_n);
1333:         if (from_n != sf->vscat.from_n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Vector wrong size %D for scatter %D (scatter forward and vector from != sf from size)",from_n,sf->vscat.from_n);
1334:       }
1335:     }
1336:   }

1338:   sf->vscat.logging = PETSC_TRUE;
1339:   PetscLogEventBegin(VEC_ScatterBegin,sf,x,y,0);
1340:   VecScatterBegin_Internal(sf,x,y,addv,mode);
1341:   if (sf->vscat.beginandendtogether) {
1342:     VecScatterEnd_Internal(sf,x,y,addv,mode);
1343:   }
1344:   PetscLogEventEnd(VEC_ScatterBegin,sf,x,y,0);
1345:   sf->vscat.logging = PETSC_FALSE;
1346:   return(0);
1347: }

1349: /*@
1350:    VecScatterEnd - Ends a generalized scatter from one vector to another.  Call
1351:    after first calling VecScatterBegin().

1353:    Neighbor-wise Collective on VecScatter

1355:    Input Parameters:
1356: +  sf - scatter context generated by VecScatterCreate()
1357: .  x - the vector from which we scatter
1358: .  y - the vector to which we scatter
1359: .  addv - one of ADD_VALUES, MAX_VALUES, MIN_VALUES or INSERT_VALUES
1360: -  mode - the scattering mode, usually SCATTER_FORWARD.  The available modes are:
1361:      SCATTER_FORWARD, SCATTER_REVERSE

1363:    Level: intermediate

1365:    Notes:
1366:    If you use SCATTER_REVERSE the arguments x and y should be reversed relative to the SCATTER_FORWARD case.

1368:    y[iy[i]] = x[ix[i]], for i=0,...,ni-1

1370: .seealso: VecScatterBegin(), VecScatterCreate()
1371: @*/
1372: PetscErrorCode  VecScatterEnd(VecScatter sf,Vec x,Vec y,InsertMode addv,ScatterMode mode)
1373: {

1380:   if (!sf->vscat.beginandendtogether) {
1381:     sf->vscat.logging = PETSC_TRUE;
1382:     PetscLogEventBegin(VEC_ScatterEnd,sf,x,y,0);
1383:     VecScatterEnd_Internal(sf,x,y,addv,mode);
1384:     PetscLogEventEnd(VEC_ScatterEnd,sf,x,y,0);
1385:     sf->vscat.logging = PETSC_FALSE;
1386:   }
1387:   return(0);
1388: }