Actual source code: vscatsf.c
petsc-3.14.6 2021-03-30
1: #include <petsc/private/vecscatterimpl.h>
2: #include <petsc/private/sfimpl.h>
3: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4: #include <../src/vec/is/sf/impls/basic/sfpack.h>
6: #if defined(PETSC_HAVE_CUDA)
7: #include <petsc/private/cudavecimpl.h>
8: #endif
10: typedef struct {
11: PetscSF sf; /* the whole scatter, including local and remote */
12: PetscSF lsf; /* the local part of the scatter, used for SCATTER_LOCAL */
13: PetscInt bs; /* block size */
14: MPI_Datatype unit; /* one unit = bs PetscScalars */
15: } VecScatter_SF;
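/* ---------------------------------------------------------------------------
   Illustrative sketch (not part of vscatsf.c): how the bs/unit pair above is
   meant to be used. For bs > 1 the scatter builds a contiguous MPI datatype of
   bs scalars and commits it, as VecScatterCopy_SF and VecScatterSetUp_SF below
   do with MPIU_SCALAR. This sketch uses plain MPI, with MPI_DOUBLE standing in
   for MPIU_SCALAR; the helper name is hypothetical.
   --------------------------------------------------------------------------- */
#include <mpi.h>

static int build_block_unit(int bs, MPI_Datatype *unit)
{
  if (bs > 1) {
    int err = MPI_Type_contiguous(bs, MPI_DOUBLE, unit); /* one unit = bs scalars */
    if (err) return err;
    return MPI_Type_commit(unit);                        /* committed, so usable in communication */
  }
  *unit = MPI_DOUBLE; /* bs == 1: reuse the predefined type; nothing to free later */
  return MPI_SUCCESS;
}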
17: static PetscErrorCode VecScatterBegin_SF(VecScatter vscat,Vec x,Vec y,InsertMode addv,ScatterMode mode)
18: {
20: VecScatter_SF *data=(VecScatter_SF*)vscat->data;
21: PetscSF sf=data->sf;
22: MPI_Op mop=MPI_OP_NULL;
23: PetscMPIInt size;
24: PetscMemType xmtype=PETSC_MEMTYPE_HOST,ymtype=PETSC_MEMTYPE_HOST;
27: if (x != y) {VecLockReadPush(x);}
28: if (sf->use_gpu_aware_mpi || vscat->packongpu) {
29: VecGetArrayReadInPlace_Internal(x,&vscat->xdata,&xmtype);
30: } else {
31: #if defined(PETSC_HAVE_CUDA)
32: PetscBool is_cudatype = PETSC_FALSE;
33: PetscObjectTypeCompareAny((PetscObject)x,&is_cudatype,VECSEQCUDA,VECMPICUDA,VECCUDA,"");
34: if (is_cudatype) {
35: VecCUDAAllocateCheckHost(x);
36: if (x->offloadmask == PETSC_OFFLOAD_GPU) {
37: if (x->spptr && vscat->spptr) {VecCUDACopyFromGPUSome_Public(x,(PetscCUDAIndices)vscat->spptr,mode);}
38: else {VecCUDACopyFromGPU(x);}
39: }
40: vscat->xdata = *((PetscScalar**)x->data);
41: } else
42: #endif
43: {VecGetArrayRead(x,&vscat->xdata);}
44: }
46: if (x != y) {
47: if (sf->use_gpu_aware_mpi || vscat->packongpu) {VecGetArrayInPlace_Internal(y,&vscat->ydata,&ymtype);}
48: else {VecGetArray(y,&vscat->ydata);}
49: } else {
50: vscat->ydata = (PetscScalar *)vscat->xdata;
51: ymtype = xmtype;
52: }
53: VecLockWriteSet_Private(y,PETSC_TRUE);
55: /* SCATTER_LOCAL indicates ignoring inter-process communication */
56: MPI_Comm_size(PetscObjectComm((PetscObject)data->sf),&size);
57: if ((mode & SCATTER_LOCAL) && size > 1) { /* Lazy creation of data->lsf since SCATTER_LOCAL is uncommon */
58: if (!data->lsf) {PetscSFCreateLocalSF_Private(data->sf,&data->lsf);}
59: sf = data->lsf;
60: } else {
61: sf = data->sf;
62: }
64: if (addv == INSERT_VALUES) mop = MPIU_REPLACE;
65: else if (addv == ADD_VALUES) mop = MPIU_SUM; /* Petsc defines its own MPI datatype and SUM operation for __float128 etc. */
66: else if (addv == MAX_VALUES) mop = MPIU_MAX;
67: else if (addv == MIN_VALUES) mop = MPIU_MIN;
68: else SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"Unsupported InsertMode %D in VecScatterBegin/End",addv);
70: if (mode & SCATTER_REVERSE) { /* REVERSE indicates a leaves-to-roots scatter. Note that x and y are swapped in the input */
71: PetscSFReduceWithMemTypeBegin(sf,data->unit,xmtype,vscat->xdata,ymtype,vscat->ydata,mop);
72: } else { /* FORWARD indicates an x-to-y scatter, where x is the roots and y is the leaves */
73: PetscSFBcastAndOpWithMemTypeBegin(sf,data->unit,xmtype,vscat->xdata,ymtype,vscat->ydata,mop);
74: }
75: return(0);
76: }
78: static PetscErrorCode VecScatterEnd_SF(VecScatter vscat,Vec x,Vec y,InsertMode addv,ScatterMode mode)
79: {
80: VecScatter_SF *data=(VecScatter_SF*)vscat->data;
81: PetscSF sf=data->sf;
82: MPI_Op mop=MPI_OP_NULL;
83: PetscMPIInt size;
87: /* SCATTER_LOCAL indicates ignoring inter-process communication */
88: MPI_Comm_size(PetscObjectComm((PetscObject)data->sf),&size);
89: sf = ((mode & SCATTER_LOCAL) && size > 1) ? data->lsf : data->sf;
91: if (addv == INSERT_VALUES) mop = MPIU_REPLACE;
92: else if (addv == ADD_VALUES) mop = MPIU_SUM;
93: else if (addv == MAX_VALUES) mop = MPIU_MAX;
94: else if (addv == MIN_VALUES) mop = MPIU_MIN;
95: else SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"Unsupported InsertMode %D in VecScatterBegin/End",addv);
97: if (mode & SCATTER_REVERSE) { /* reverse scatter sends leaves to roots. Note that x and y are swapped in input */
98: PetscSFReduceEnd(sf,data->unit,vscat->xdata,vscat->ydata,mop);
99: } else { /* forward scatter sends roots to leaves, i.e., x to y */
100: PetscSFBcastAndOpEnd(sf,data->unit,vscat->xdata,vscat->ydata,mop);
101: }
103: if (x != y) {
104: if (sf->use_gpu_aware_mpi || vscat->packongpu) {VecRestoreArrayReadInPlace(x,&vscat->xdata);}
105: else {VecRestoreArrayRead(x,&vscat->xdata);}
106: VecLockReadPop(x);
107: }
109: if (sf->use_gpu_aware_mpi || vscat->packongpu) {VecRestoreArrayInPlace(y,&vscat->ydata);}
110: else {VecRestoreArray(y,&vscat->ydata);}
111: VecLockWriteSet_Private(y,PETSC_FALSE);
113: return(0);
114: }
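/* ---------------------------------------------------------------------------
   Usage sketch (not part of vscatsf.c): how a caller typically drives the
   Begin/End pair above through the public API. InsertMode picks the MPI_Op
   (MPIU_REPLACE/SUM/MAX/MIN) and ScatterMode picks the direction: forward is a
   bcast from the roots x to the leaves y, reverse is a reduce from y back to x,
   with x and y swapped in the argument list. Assumes ctx, x and y were already
   created (e.g., with VecScatterCreate()); the function name is hypothetical.
   --------------------------------------------------------------------------- */
#include <petscvec.h>

static PetscErrorCode scatter_forward_then_reverse(VecScatter ctx, Vec x, Vec y)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Forward: overwrite y with the selected entries of x */
  ierr = VecScatterBegin(ctx,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(ctx,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* Reverse: accumulate y's entries back into x; note the swapped x and y */
  ierr = VecScatterBegin(ctx,y,x,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(ctx,y,x,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}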
116: static PetscErrorCode VecScatterCopy_SF(VecScatter vscat,VecScatter ctx)
117: {
118: VecScatter_SF *data=(VecScatter_SF*)vscat->data,*out;
122: PetscMemcpy(ctx->ops,vscat->ops,sizeof(vscat->ops));
123: PetscNewLog(ctx,&out);
124: PetscSFDuplicate(data->sf,PETSCSF_DUPLICATE_GRAPH,&out->sf);
125: PetscSFSetUp(out->sf);
126: /* Do not copy lsf. Build it on demand since it is rarely used */
128: out->bs = data->bs;
129: if (out->bs > 1) {
130: MPI_Type_dup(data->unit,&out->unit); /* Since oldtype is committed, so is newtype, according to MPI */
131: } else {
132: out->unit = MPIU_SCALAR;
133: }
134: ctx->data = (void*)out;
135: return(0);
136: }
138: static PetscErrorCode VecScatterDestroy_SF(VecScatter vscat)
139: {
140: VecScatter_SF *data = (VecScatter_SF *)vscat->data;
144: PetscSFDestroy(&data->sf);
145: PetscSFDestroy(&data->lsf);
146: if (data->bs > 1) {MPI_Type_free(&data->unit);}
147: PetscFree(vscat->data);
148: return(0);
149: }
151: static PetscErrorCode VecScatterView_SF(VecScatter vscat,PetscViewer viewer)
152: {
153: VecScatter_SF *data = (VecScatter_SF *)vscat->data;
157: PetscSFView(data->sf,viewer);
158: return(0);
159: }
161: /* VecScatterRemap provides a lightweight way to slightly modify a VecScatter. Suppose the input vscat scatters
162: x[i] to y[j]; tomap gives a plan to change vscat so that it scatters x[tomap[i]] to y[j]. Note that in SF,
163: x is the roots. That means we need to change incoming data such as bas->irootloc[].
164: */
165: static PetscErrorCode VecScatterRemap_SF(VecScatter vscat,const PetscInt *tomap,const PetscInt *frommap)
166: {
167: VecScatter_SF *data = (VecScatter_SF *)vscat->data;
168: PetscSF sf = data->sf;
169: PetscInt i,bs = data->bs;
170: PetscMPIInt size;
171: PetscBool ident = PETSC_TRUE,isbasic,isneighbor;
172: PetscSFType type;
173: PetscSF_Basic *bas = NULL;
177: /* check if it is an identity map. If it is, do nothing */
178: if (tomap) {
179: for (i=0; i<sf->nroots*bs; i++) {if (i != tomap[i]) {ident = PETSC_FALSE; break; } }
180: if (ident) return(0);
181: }
182: if (frommap) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unable to remap the FROM in scatters yet");
183: if (!tomap) return(0);
185: MPI_Comm_size(PetscObjectComm((PetscObject)data->sf),&size);
187: /* Since the indices changed, we must also update the local SF. But we do not do that here, since
188: lsf is rarely used. We just destroy lsf and rebuild it on demand from the updated data->sf.
189: */
190: if (data->lsf) {PetscSFDestroy(&data->lsf);}
192: PetscSFGetType(sf,&type);
193: PetscObjectTypeCompare((PetscObject)sf,PETSCSFBASIC,&isbasic);
194: PetscObjectTypeCompare((PetscObject)sf,PETSCSFNEIGHBOR,&isneighbor);
195: if (!isbasic && !isneighbor) SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"VecScatterRemap on SF type %s is not supported",type);
197: PetscSFSetUp(sf); /* to build sf->irootloc if SetUp has not yet been called */
199: /* Root indices are going to be remapped. This is tricky for SF. Root indices are used in sf->rremote,
200: sf->remote and bas->irootloc. The last one is cheap to remap, but the former two are not.
201: To remap them, we would have to do a bcast from roots to leaves, to let leaves know their updated roots.
202: Since VecScatterRemap is supposed to be a cheap routine that adapts a vecscatter by only changing where
203: x[] data is taken from, we do not remap sf->rremote and sf->remote. The consequence is that operations
204: accessing them (such as PetscSFCompose) may get stale info. Considering VecScatter does not need
205: such complicated SF operations, instead of remapping sf->rremote and sf->remote we destroy them,
206: so that code accessing them (if any) will crash (instead of getting silent errors). Note that BcastAndOp/Reduce,
207: which are used by VecScatter and rely only on bas->irootloc, are updated and correct.
208: */
209: sf->remote = NULL;
210: PetscFree(sf->remote_alloc);
211: /* Not easy to free sf->rremote since it was allocated with PetscMalloc4(), so just give it crazy values */
212: for (i=0; i<sf->roffset[sf->nranks]; i++) sf->rremote[i] = PETSC_MIN_INT;
214: /* Indices in tomap[] are for each individual vector entry. But indices in sf are for each
215: block in the vector. So before the remapping, we have to expand indices in sf by bs, and
216: after the remapping, we have to shrink them back.
217: */
218: bas = (PetscSF_Basic*)sf->data;
219: for (i=0; i<bas->ioffset[bas->niranks]; i++) bas->irootloc[i] = tomap[bas->irootloc[i]*bs]/bs;
220: #if defined(PETSC_HAVE_DEVICE)
221: /* Free the irootloc copy on device. We allocate a new copy and get the updated value on demand. See PetscSFLinkGetRootPackOptAndIndices() */
222: for (i=0; i<2; i++) {PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,bas->irootloc_d[i]);}
223: #endif
224: /* Destroy and then rebuild root packing optimizations since indices are changed */
225: PetscSFResetPackFields(sf);
226: PetscSFSetUpPackFields(sf);
227: return(0);
228: }
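/* ---------------------------------------------------------------------------
   Illustrative sketch (not part of vscatsf.c): the index arithmetic used on
   bas->irootloc[] above. irootloc[] stores one index per block of size bs,
   while tomap[] is given per vector entry, so each stored index is expanded by
   bs before the lookup and shrunk back afterwards. Assumes tomap[] maps whole
   blocks to whole blocks, which keeps tomap[loc*bs] divisible by bs; the helper
   name is hypothetical.
   --------------------------------------------------------------------------- */
#include <petscsys.h>

static void remap_block_indices(PetscInt n, PetscInt bs, const PetscInt tomap[], PetscInt irootloc[])
{
  PetscInt i;
  for (i=0; i<n; i++) irootloc[i] = tomap[irootloc[i]*bs]/bs;
}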
230: static PetscErrorCode VecScatterGetRemoteCount_SF(VecScatter vscat,PetscBool send,PetscInt *num_procs,PetscInt *num_entries)
231: {
232: PetscErrorCode ierr;
233: VecScatter_SF *data = (VecScatter_SF *)vscat->data;
234: PetscSF sf = data->sf;
235: PetscInt nranks,remote_start;
236: PetscMPIInt rank;
237: const PetscInt *offset;
238: const PetscMPIInt *ranks;
241: MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);
243: /* This routine is mainly used for MatMult's Mvctx. In Mvctx, we scatter an MPI vector x to a sequential vector lvec.
244: Remember x is roots and lvec is leaves. 'send' means roots to leaves communication. If 'send' is true, we need to
245: get info about which ranks this processor needs to send to. In other words, we need to call PetscSFGetLeafRanks().
246: If send is false, we do the opposite, calling PetscSFGetRootRanks().
247: */
248: if (send) {PetscSFGetLeafRanks(sf,&nranks,&ranks,&offset,NULL);}
249: else {PetscSFGetRootRanks(sf,&nranks,&ranks,&offset,NULL,NULL);}
250: if (nranks) {
251: remote_start = (rank == ranks[0])? 1 : 0;
252: if (num_procs) *num_procs = nranks - remote_start;
253: if (num_entries) *num_entries = offset[nranks] - offset[remote_start];
254: } else {
255: if (num_procs) *num_procs = 0;
256: if (num_entries) *num_entries = 0;
257: }
258: return(0);
259: }
261: static PetscErrorCode VecScatterGetRemote_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
262: {
263: PetscErrorCode ierr;
264: VecScatter_SF *data = (VecScatter_SF *)vscat->data;
265: PetscSF sf = data->sf;
266: PetscInt nranks,remote_start;
267: PetscMPIInt rank;
268: const PetscInt *offset,*location;
269: const PetscMPIInt *ranks;
272: MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);
274: if (send) {PetscSFGetLeafRanks(sf,&nranks,&ranks,&offset,&location);}
275: else {PetscSFGetRootRanks(sf,&nranks,&ranks,&offset,&location,NULL);}
277: if (nranks) {
278: remote_start = (rank == ranks[0])? 1 : 0;
279: if (n) *n = nranks - remote_start;
280: if (starts) *starts = &offset[remote_start];
281: if (indices) *indices = location; /* not &location[offset[remote_start]]. Starts[0] may point to the middle of indices[] */
282: if (procs) *procs = &ranks[remote_start];
283: } else {
284: if (n) *n = 0;
285: if (starts) *starts = NULL;
286: if (indices) *indices = NULL;
287: if (procs) *procs = NULL;
288: }
290: if (bs) *bs = 1;
291: return(0);
292: }
294: static PetscErrorCode VecScatterGetRemoteOrdered_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
295: {
299: VecScatterGetRemote_SF(vscat,send,n,starts,indices,procs,bs);
300: return(0);
301: }
303: static PetscErrorCode VecScatterRestoreRemote_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
304: {
306: if (starts) *starts = NULL;
307: if (indices) *indices = NULL;
308: if (procs) *procs = NULL;
309: return(0);
310: }
312: static PetscErrorCode VecScatterRestoreRemoteOrdered_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
313: {
316: VecScatterRestoreRemote_SF(vscat,send,n,starts,indices,procs,bs);
317: return(0);
318: }
320: typedef enum {IS_INVALID, IS_GENERAL, IS_BLOCK, IS_STRIDE} ISTypeID;
322: PETSC_STATIC_INLINE PetscErrorCode ISGetTypeID_Private(IS is,ISTypeID *id)
323: {
325: PetscBool same;
328: *id = IS_INVALID;
329: PetscObjectTypeCompare((PetscObject)is,ISGENERAL,&same);
330: if (same) {*id = IS_GENERAL; goto functionend;}
331: PetscObjectTypeCompare((PetscObject)is,ISBLOCK,&same);
332: if (same) {*id = IS_BLOCK; goto functionend;}
333: PetscObjectTypeCompare((PetscObject)is,ISSTRIDE,&same);
334: if (same) {*id = IS_STRIDE; goto functionend;}
335: functionend:
336: return(0);
337: }
339: static PetscErrorCode VecScatterSetUp_SF(VecScatter vscat)
340: {
342: VecScatter_SF *data;
343: MPI_Comm xcomm,ycomm,bigcomm;
344: Vec x=vscat->from_v,y=vscat->to_v,xx,yy;
345: IS ix=vscat->from_is,iy=vscat->to_is,ixx,iyy;
346: PetscMPIInt xcommsize,ycommsize,rank;
347: PetscInt i,n,N,nroots,nleaves,*ilocal,xstart,ystart,ixsize,iysize,xlen,ylen;
348: const PetscInt *xindices,*yindices;
349: PetscSFNode *iremote;
350: PetscLayout xlayout,ylayout;
351: ISTypeID ixid,iyid;
352: PetscInt bs,bsx,bsy,min,max,m[2],ixfirst,ixstep,iyfirst,iystep;
353: PetscBool can_do_block_opt=PETSC_FALSE;
356: PetscNewLog(vscat,&data);
357: data->bs = 1; /* default, no blocking */
358: data->unit = MPIU_SCALAR;
360: /*
361: Let P and S stand for parallel and sequential vectors respectively. There are four combinations of vecscatters: PtoP, PtoS,
362: StoP and StoS. The assumption of VecScatterCreate(Vec x,IS ix,Vec y,IS iy,VecScatter *newctx) is: if x is parallel, then ix
363: contains global indices of x. If x is sequential, ix contains local indices of x. Similarly for y and iy.
365: SF is built around the concepts of local leaves and remote roots. We treat the source vector x as roots and the destination vector y as
366: leaves. A PtoS scatter can be naturally mapped to SF. We transform PtoP and StoP to PtoS, and treat StoS as a trivial PtoS.
367: */
368: PetscObjectGetComm((PetscObject)x,&xcomm);
369: PetscObjectGetComm((PetscObject)y,&ycomm);
370: MPI_Comm_size(xcomm,&xcommsize);
371: MPI_Comm_size(ycomm,&ycommsize);
373: /* NULL ix or iy in VecScatterCreate(x,ix,y,iy,newctx) has special meaning. Recover them for these cases */
374: if (!ix) {
375: if (xcommsize > 1 && ycommsize == 1) { /* PtoS: null ix means the whole x will be scattered to each seq y */
376: VecGetSize(x,&N);
377: ISCreateStride(PETSC_COMM_SELF,N,0,1,&ix);
378: } else { /* PtoP, StoP or StoS: null ix means the whole local part of x will be scattered */
379: VecGetLocalSize(x,&n);
380: VecGetOwnershipRange(x,&xstart,NULL);
381: ISCreateStride(PETSC_COMM_SELF,n,xstart,1,&ix);
382: }
383: }
385: if (!iy) {
386: if (xcommsize == 1 && ycommsize > 1) { /* StoP: null iy means the whole y will be scattered to (from each seq x) */
387: VecGetSize(y,&N);
388: ISCreateStride(PETSC_COMM_SELF,N,0,1,&iy);
389: } else { /* PtoP, StoP or StoS: null iy means the whole local part of y will be scattered to */
390: VecGetLocalSize(y,&n);
391: VecGetOwnershipRange(y,&ystart,NULL);
392: ISCreateStride(PETSC_COMM_SELF,n,ystart,1,&iy);
393: }
394: }
396: /* Do error checking immediately after we have non-empty ix, iy */
397: ISGetLocalSize(ix,&ixsize);
398: ISGetLocalSize(iy,&iysize);
399: VecGetSize(x,&xlen);
400: VecGetSize(y,&ylen);
401: if (ixsize != iysize) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Scatter sizes of ix and iy don't match locally");
402: ISGetMinMax(ix,&min,&max);
403: if (min < 0 || max >= xlen) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scatter indices in ix are out of range");
404: ISGetMinMax(iy,&min,&max);
405: if (min < 0 || max >= ylen) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scatter indices in iy are out of range");
407: /* Extract info about ix, iy for further test */
408: ISGetTypeID_Private(ix,&ixid);
409: ISGetTypeID_Private(iy,&iyid);
410: if (ixid == IS_BLOCK) {ISGetBlockSize(ix,&bsx);}
411: else if (ixid == IS_STRIDE) {ISStrideGetInfo(ix,&ixfirst,&ixstep);}
413: if (iyid == IS_BLOCK) {ISGetBlockSize(iy,&bsy);}
414: else if (iyid == IS_STRIDE) {ISStrideGetInfo(iy,&iyfirst,&iystep);}
416: /* Check if this PtoS is a special ToAll/ToZero scatter, which can result from VecScatterCreateToAll/Zero.
417: ToAll means the whole MPI vector is copied to a seq vector on every process. ToZero means the whole MPI
418: vector is copied to a seq vector on rank 0, and the other processes do nothing (i.e., they pass in empty ix, iy).
420: We can optimize these scatters with MPI collectives. We can also avoid the costly analysis used for general scatters.
421: */
422: if (xcommsize > 1 && ycommsize == 1) { /* Ranks do not diverge at this if-test */
423: PetscInt pattern[2] = {0, 0}; /* A boolean array with pattern[0] for allgather-like (ToAll) and pattern[1] for gather-like (ToZero) */
424: PetscLayout map;
426: MPI_Comm_rank(xcomm,&rank);
427: VecGetLayout(x,&map);
428: if (!rank) {
429: if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
430: /* Rank 0 scatters the whole mpi x to seq y, so it is either a ToAll or a ToZero candidate in its view */
431: pattern[0] = pattern[1] = 1;
432: }
433: } else {
434: if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
435: /* Other ranks also scatter the whole mpi x to seq y, so it is a ToAll candidate in their view */
436: pattern[0] = 1;
437: } else if (ixsize == 0) {
438: /* Other ranks do nothing, so it is a ToZero candidate */
439: pattern[1] = 1;
440: }
441: }
443: /* Kill two birds with one stone (the expensive allreduce): pattern[] tells whether it is ToAll or ToZero */
444: MPIU_Allreduce(MPI_IN_PLACE,pattern,2,MPIU_INT,MPI_LAND,xcomm);
446: if (pattern[0] || pattern[1]) {
447: PetscSFCreate(xcomm,&data->sf);
448: PetscSFSetFromOptions(data->sf);
449: PetscSFSetGraphWithPattern(data->sf,map,pattern[0] ? PETSCSF_PATTERN_ALLGATHER : PETSCSF_PATTERN_GATHER);
450: goto functionend; /* No further analysis needed. What a big win! */
451: }
452: }
454: /* Continue ...
455: Do block optimization by taking advantage of high level info available in ix, iy.
456: The block optimization is valid when all of the following conditions are met:
457: 1) ix, iy are blocked or can be blocked (i.e., strided with step=1);
458: 2) ix, iy have the same block size;
459: 3) all processors agree on one block size;
460: 4) no blocks span more than one process;
461: */
462: bigcomm = (xcommsize == 1) ? ycomm : xcomm;
464: /* Processors could go through different paths in this if-else test */
465: m[0] = m[1] = PETSC_MPI_INT_MIN;
466: if (ixid == IS_BLOCK && iyid == IS_BLOCK) {
467: m[0] = PetscMax(bsx,bsy);
468: m[1] = -PetscMin(bsx,bsy);
469: } else if (ixid == IS_BLOCK && iyid == IS_STRIDE && iystep==1 && iyfirst%bsx==0) {
470: m[0] = bsx;
471: m[1] = -bsx;
472: } else if (ixid == IS_STRIDE && iyid == IS_BLOCK && ixstep==1 && ixfirst%bsy==0) {
473: m[0] = bsy;
474: m[1] = -bsy;
475: }
476: /* Get max and min of bsx,bsy over all processes in one allreduce */
477: MPIU_Allreduce(MPI_IN_PLACE,m,2,MPIU_INT,MPI_MAX,bigcomm);
478: max = m[0]; min = -m[1];
480: /* Since we used allreduce above, all ranks will have the same min and max. min==max
481: implies all ranks have the same bs. Do a further test to see if local vectors are divisible
482: by bs on ALL ranks. If they are, we are assured that no blocks span more than one processor.
483: */
484: if (min == max && min > 1) {
485: VecGetLocalSize(x,&xlen);
486: VecGetLocalSize(y,&ylen);
487: m[0] = xlen%min;
488: m[1] = ylen%min;
489: MPIU_Allreduce(MPI_IN_PLACE,m,2,MPIU_INT,MPI_LOR,bigcomm);
490: if (!m[0] && !m[1]) can_do_block_opt = PETSC_TRUE;
491: }
493: /* If can_do_block_opt, then shrink x, y, ix and iy by bs to get xx, yy, ixx and iyy, whose indices
494: and layout are actually used in building SF. Suppose blocked ix representing {0,1,2,6,7,8} has
495: indices {0,2} and bs=3, then ixx = {0,2}; suppose strided iy={3,4,5,6,7,8}, then iyy={1,2}.
497: xx is a little special. If x is seq, then xx is the concatenation of seq x's on ycomm. In this way,
498: we can treat PtoP and StoP uniformly as PtoS.
499: */
500: if (can_do_block_opt) {
501: const PetscInt *indices;
503: data->bs = bs = min;
504: MPI_Type_contiguous(bs,MPIU_SCALAR,&data->unit);
505: MPI_Type_commit(&data->unit);
507: /* Shrink x and ix */
508: VecCreateMPIWithArray(bigcomm,1,xlen/bs,PETSC_DECIDE,NULL,&xx); /* We only care about xx's layout */
509: if (ixid == IS_BLOCK) {
510: ISBlockGetIndices(ix,&indices);
511: ISBlockGetLocalSize(ix,&ixsize);
512: ISCreateGeneral(PETSC_COMM_SELF,ixsize,indices,PETSC_COPY_VALUES,&ixx);
513: ISBlockRestoreIndices(ix,&indices);
514: } else { /* ixid == IS_STRIDE */
515: ISGetLocalSize(ix,&ixsize);
516: ISCreateStride(PETSC_COMM_SELF,ixsize/bs,ixfirst/bs,1,&ixx);
517: }
519: /* Shrink y and iy */
520: VecCreateMPIWithArray(ycomm,1,ylen/bs,PETSC_DECIDE,NULL,&yy);
521: if (iyid == IS_BLOCK) {
522: ISBlockGetIndices(iy,&indices);
523: ISBlockGetLocalSize(iy,&iysize);
524: ISCreateGeneral(PETSC_COMM_SELF,iysize,indices,PETSC_COPY_VALUES,&iyy);
525: ISBlockRestoreIndices(iy,&indices);
526: } else { /* iyid == IS_STRIDE */
527: ISGetLocalSize(iy,&iysize);
528: ISCreateStride(PETSC_COMM_SELF,iysize/bs,iyfirst/bs,1,&iyy);
529: }
530: } else {
531: ixx = ix;
532: iyy = iy;
533: yy = y;
534: if (xcommsize == 1) {VecCreateMPIWithArray(bigcomm,1,xlen,PETSC_DECIDE,NULL,&xx);} else xx = x;
535: }
537: /* Now we are ready to build the SF with the preprocessed (xx, yy) and (ixx, iyy) */
538: ISGetIndices(ixx,&xindices);
539: ISGetIndices(iyy,&yindices);
540: VecGetLayout(xx,&xlayout);
542: if (ycommsize > 1) {
543: /* PtoP or StoP */
545: /* Below is a piece of complex code with a very simple goal: move each global index pair (xindices[i], yindices[i]),
546: i = 0..n-1, to the owner process of yindices[i] according to ylayout.
548: I first did it through a temp sf, but later decided that design was inefficient and also distorted the log view.
549: We want to map one VecScatterCreate() call to one PetscSFCreate() call. The old design mapped to three
550: PetscSFCreate() calls. This code is on the critical path of VecScatterSetUp and is used by every VecScatterCreate.
551: So I commented it out and wrote another, optimized implementation. The commented code is left here for reference.
552: */
553: #if 0
554: const PetscInt *degree;
555: PetscSF tmpsf;
556: PetscInt inedges=0,*leafdata,*rootdata;
558: VecGetOwnershipRange(xx,&xstart,NULL);
559: VecGetLayout(yy,&ylayout);
560: VecGetOwnershipRange(yy,&ystart,NULL);
562: VecGetLocalSize(yy,&nroots);
563: ISGetLocalSize(iyy,&nleaves);
564: PetscMalloc2(nleaves,&iremote,nleaves*2,&leafdata);
566: for (i=0; i<nleaves; i++) {
567: PetscLayoutFindOwnerIndex(ylayout,yindices[i],&iremote[i].rank,&iremote[i].index);
568: leafdata[2*i] = yindices[i];
569: leafdata[2*i+1] = (xcommsize > 1)? xindices[i] : xindices[i] + xstart;
570: }
572: PetscSFCreate(ycomm,&tmpsf);
573: PetscSFSetGraph(tmpsf,nroots,nleaves,NULL,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);
575: PetscSFComputeDegreeBegin(tmpsf,&degree);
576: PetscSFComputeDegreeEnd(tmpsf,&degree);
578: for (i=0; i<nroots; i++) inedges += degree[i];
579: PetscMalloc1(inedges*2,&rootdata);
580: PetscSFGatherBegin(tmpsf,MPIU_2INT,leafdata,rootdata);
581: PetscSFGatherEnd(tmpsf,MPIU_2INT,leafdata,rootdata);
583: PetscFree2(iremote,leafdata);
584: PetscSFDestroy(&tmpsf);
586: /* rootdata contains global index pairs (i, j). The j's are owned by the current process, but the i's can point anywhere.
587: We convert j to a local index, and convert i to (rank, index). In the end, we get a PtoS suitable for building an SF.
588: */
589: nleaves = inedges;
590: VecGetLocalSize(xx,&nroots);
591: PetscMalloc1(nleaves,&ilocal);
592: PetscMalloc1(nleaves,&iremote);
594: for (i=0; i<inedges; i++) {
595: ilocal[i] = rootdata[2*i] - ystart; /* convert y's global index to local index */
596: PetscLayoutFindOwnerIndex(xlayout,rootdata[2*i+1],&iremote[i].rank,&iremote[i].index); /* convert x's global index to (rank, index) */
597: }
598: PetscFree(rootdata);
599: #else
600: PetscInt j,k,n,disp,rlentotal,*sstart,*xindices_sorted,*yindices_sorted;
601: const PetscInt *yrange;
602: PetscMPIInt nsend,nrecv,nreq,count,yrank,*slens,*rlens,*sendto,*recvfrom,tag1,tag2;
603: PetscInt *rxindices,*ryindices;
604: MPI_Request *reqs,*sreqs,*rreqs;
606: /* Sorting makes the code simpler and faster, and also helps get rid of many O(P) arrays, which hurt scalability at large scale.
607: yindices_sorted - sorted yindices
608: xindices_sorted - xindices sorted along with yindices
609: */
610: ISGetLocalSize(ixx,&n); /* ixx and iyy have the same local size */
611: PetscMalloc2(n,&xindices_sorted,n,&yindices_sorted);
612: PetscArraycpy(xindices_sorted,xindices,n);
613: PetscArraycpy(yindices_sorted,yindices,n);
614: PetscSortIntWithArray(n,yindices_sorted,xindices_sorted);
615: VecGetOwnershipRange(xx,&xstart,NULL);
616: if (xcommsize == 1) {for (i=0; i<n; i++) xindices_sorted[i] += xstart;} /* Convert to global indices */
618: /*=============================================================================
619: Calculate info about messages I need to send
620: =============================================================================*/
621: /* nsend - number of non-empty messages to send
622: sendto - [nsend] ranks I will send messages to
623: sstart - [nsend+1] sstart[i] is the start index in xindices_sorted[] I send to rank sendto[i]
624: slens - [ycommsize] I want to send slens[i] entries to rank i.
625: */
626: VecGetLayout(yy,&ylayout);
627: PetscLayoutGetRanges(ylayout,&yrange);
628: PetscCalloc1(ycommsize,&slens); /* The only O(P) array in this algorithm */
630: i = j = nsend = 0;
631: while (i < n) {
632: if (yindices_sorted[i] >= yrange[j+1]) { /* If i-th index is out of rank j's bound */
633: do {j++;} while (yindices_sorted[i] >= yrange[j+1] && j < ycommsize); /* Increase j until i-th index falls in rank j's bound */
634: if (j == ycommsize) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Index %D not owned by any process, upper bound %D",yindices_sorted[i],yrange[ycommsize]);
635: }
636: i++;
637: if (!slens[j]++) nsend++;
638: }
640: PetscMalloc2(nsend+1,&sstart,nsend,&sendto);
642: sstart[0] = 0;
643: for (i=j=0; i<ycommsize; i++) {
644: if (slens[i]) {
645: sendto[j] = (PetscMPIInt)i;
646: sstart[j+1] = sstart[j] + slens[i];
647: j++;
648: }
649: }
651: /*=============================================================================
652: Calculate the reverse info about messages I will recv
653: =============================================================================*/
654: /* nrecv - number of messages I will recv
655: recvfrom - [nrecv] ranks I recv from
656: rlens - [nrecv] I will recv rlens[i] entries from rank recvfrom[i]
657: rlentotal - sum of rlens[]
658: rxindices - [rlentotal] recv buffer for xindices_sorted
659: ryindices - [rlentotal] recv buffer for yindices_sorted
660: */
661: PetscGatherNumberOfMessages(ycomm,NULL,slens,&nrecv);
662: PetscGatherMessageLengths(ycomm,nsend,nrecv,slens,&recvfrom,&rlens);
663: PetscFree(slens); /* Free the O(P) array ASAP */
664: rlentotal = 0; for (i=0; i<nrecv; i++) rlentotal += rlens[i];
666: /*=============================================================================
667: Communicate with processors in recvfrom[] to populate rxindices and ryindices
668: ============================================================================*/
669: PetscCommGetNewTag(ycomm,&tag1);
670: PetscCommGetNewTag(ycomm,&tag2);
671: PetscMalloc2(rlentotal,&rxindices,rlentotal,&ryindices);
672: PetscMPIIntCast((nsend+nrecv)*2,&nreq);
673: PetscMalloc1(nreq,&reqs);
674: sreqs = reqs;
675: rreqs = reqs + nsend*2;
677: for (i=disp=0; i<nrecv; i++) {
678: count = rlens[i];
679: MPI_Irecv(rxindices+disp,count,MPIU_INT,recvfrom[i],tag1,ycomm,rreqs+i);
680: MPI_Irecv(ryindices+disp,count,MPIU_INT,recvfrom[i],tag2,ycomm,rreqs+nrecv+i);
681: disp += rlens[i];
682: }
684: for (i=0; i<nsend; i++) {
685: PetscMPIIntCast(sstart[i+1]-sstart[i],&count);
686: MPI_Isend(xindices_sorted+sstart[i],count,MPIU_INT,sendto[i],tag1,ycomm,sreqs+i);
687: MPI_Isend(yindices_sorted+sstart[i],count,MPIU_INT,sendto[i],tag2,ycomm,sreqs+nsend+i);
688: }
689: MPI_Waitall(nreq,reqs,MPI_STATUSES_IGNORE);
691: /* Transform VecScatter into SF */
692: nleaves = rlentotal;
693: PetscMalloc1(nleaves,&ilocal);
694: PetscMalloc1(nleaves,&iremote);
695: MPI_Comm_rank(ycomm,&yrank);
696: for (i=disp=0; i<nrecv; i++) {
697: for (j=0; j<rlens[i]; j++) {
698: k = disp + j; /* k-th index pair */
699: ilocal[k] = ryindices[k] - yrange[yrank]; /* Convert y's global index to local index */
700: PetscLayoutFindOwnerIndex(xlayout,rxindices[k],&rank,&iremote[k].index); /* Convert x's global index to (rank, index) */
701: iremote[k].rank = rank;
702: }
703: disp += rlens[i];
704: }
706: PetscFree2(sstart,sendto);
707: PetscFree(slens);
708: PetscFree(rlens);
709: PetscFree(recvfrom);
710: PetscFree(reqs);
711: PetscFree2(rxindices,ryindices);
712: PetscFree2(xindices_sorted,yindices_sorted);
713: #endif
714: } else {
715: /* PtoS or StoS */
716: ISGetLocalSize(iyy,&nleaves);
717: PetscMalloc1(nleaves,&ilocal);
718: PetscMalloc1(nleaves,&iremote);
719: PetscArraycpy(ilocal,yindices,nleaves);
720: for (i=0; i<nleaves; i++) {
721: PetscLayoutFindOwnerIndex(xlayout,xindices[i],&rank,&iremote[i].index);
722: iremote[i].rank = rank;
723: }
724: }
726: /* MUST build SF on xx's comm, which is not necessarily identical to yy's comm.
727: In SF's view, xx contains the roots (i.e., the remote) and iremote[].rank are ranks in xx's comm.
728: yy contains the leaves, which are local and can be thought of as part of PETSC_COMM_SELF. */
729: PetscSFCreate(PetscObjectComm((PetscObject)xx),&data->sf);
730: PetscSFSetFromOptions(data->sf);
731: VecGetLocalSize(xx,&nroots);
732: PetscSFSetGraph(data->sf,nroots,nleaves,ilocal,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER); /* Give ilocal/iremote to PETSc; no need to free them here */
734: /* Free memory no longer needed */
735: ISRestoreIndices(ixx,&xindices);
736: ISRestoreIndices(iyy,&yindices);
737: if (can_do_block_opt) {
738: VecDestroy(&xx);
739: VecDestroy(&yy);
740: ISDestroy(&ixx);
741: ISDestroy(&iyy);
742: } else if (xcommsize == 1) {
743: VecDestroy(&xx);
744: }
746: functionend:
747: if (!vscat->from_is) {ISDestroy(&ix);}
748: if (!vscat->to_is) {ISDestroy(&iy);}
750: /* vecscatter uses eager setup */
751: PetscSFSetUp(data->sf);
753: vscat->data = (void*)data;
754: vscat->ops->begin = VecScatterBegin_SF;
755: vscat->ops->end = VecScatterEnd_SF;
756: vscat->ops->remap = VecScatterRemap_SF;
757: vscat->ops->copy = VecScatterCopy_SF;
758: vscat->ops->destroy = VecScatterDestroy_SF;
759: vscat->ops->view = VecScatterView_SF;
760: vscat->ops->getremotecount = VecScatterGetRemoteCount_SF;
761: vscat->ops->getremote = VecScatterGetRemote_SF;
762: vscat->ops->getremoteordered = VecScatterGetRemoteOrdered_SF;
763: vscat->ops->restoreremote = VecScatterRestoreRemote_SF;
764: vscat->ops->restoreremoteordered = VecScatterRestoreRemoteOrdered_SF;
765: return(0);
766: }
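/* ---------------------------------------------------------------------------
   Illustrative sketch (not part of vscatsf.c): the single-allreduce max/min
   trick used above for the block sizes bsx/bsy. Sending {val, -val} and
   reducing with MPI_MAX yields the global max in slot 0 and minus the global
   min in slot 1, so one collective replaces two. Plain MPI; the helper name is
   hypothetical.
   --------------------------------------------------------------------------- */
#include <mpi.h>

static int global_max_min(MPI_Comm comm, long long val, long long *max, long long *min)
{
  long long m[2] = {val, -val};
  int err = MPI_Allreduce(MPI_IN_PLACE, m, 2, MPI_LONG_LONG, MPI_MAX, comm);
  if (!err) { *max = m[0]; *min = -m[1]; }
  return err;
}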
768: PetscErrorCode VecScatterCreate_SF(VecScatter ctx)
769: {
773: ctx->ops->setup = VecScatterSetUp_SF;
774: PetscObjectChangeTypeName((PetscObject)ctx,VECSCATTERSF);
775: PetscInfo(ctx,"Using StarForest for vector scatter\n");
776: return(0);
777: }