Actual source code: vpscat.c
petsc-3.9.4 2018-09-11
2: /*
3: Defines parallel vector scatters.
4: */
6: #include <../src/vec/vec/impls/dvecimpl.h>
7: #include <../src/vec/vec/impls/mpi/pvecimpl.h>
8: #include <petscsf.h>
10: #if defined(PETSC_HAVE_MPI_WIN_CREATE_FEATURE)
12: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
13: {
14: VecScatter_MPI_General *to =(VecScatter_MPI_General*)ctx->todata;
15: VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
16: PetscErrorCode ierr;
17: PetscInt i;
18: PetscMPIInt rank;
19: PetscViewerFormat format;
20: PetscBool iascii;
23: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
24: if (iascii) {
25: MPI_Comm_rank(PetscObjectComm((PetscObject)ctx),&rank);
26: PetscViewerGetFormat(viewer,&format);
27: if (format == PETSC_VIEWER_ASCII_INFO) {
28: PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;
30: MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
31: MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
32: itmp = to->starts[to->n];
33: MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
34: itmp = from->starts[from->n];
35: MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
36: MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,PetscObjectComm((PetscObject)ctx));
38: PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
39: PetscViewerASCIIPrintf(viewer," Blocksize %D\n",to->bs);
40: PetscViewerASCIIPrintf(viewer," Maximum number sends %D\n",nsend_max);
41: PetscViewerASCIIPrintf(viewer," Maximum number receives %D\n",nrecv_max);
42: PetscViewerASCIIPrintf(viewer," Maximum data sent %D\n",(PetscInt)(lensend_max*to->bs*sizeof(PetscScalar)));
43: PetscViewerASCIIPrintf(viewer," Maximum data received %D\n",(PetscInt)(lenrecv_max*to->bs*sizeof(PetscScalar)));
44: PetscViewerASCIIPrintf(viewer," Total data sent %D\n",(PetscInt)(alldata*to->bs*sizeof(PetscScalar)));
45: } else {
46: PetscViewerASCIIPrintf(viewer," VecScatter Blocksize %D\n",to->bs);
47: PetscViewerASCIIPushSynchronized(viewer);
48: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
49: if (to->n) {
50: for (i=0; i<to->n; i++) {
51: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length = %D to whom %d\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
52: }
53: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Now the indices for all remote sends (in order by process sent to)\n",rank);
54: for (i=0; i<to->starts[to->n]; i++) {
55: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D \n",rank,to->indices[i]);
56: }
57: }
59: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
60: if (from->n) {
61: for (i=0; i<from->n; i++) {
62: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %d\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
63: }
65: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Now the indices for all remote receives (in order by process received from)\n",rank);
66: for (i=0; i<from->starts[from->n]; i++) {
67: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D \n",rank,from->indices[i]);
68: }
69: }
70: if (to->local.n) {
71: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Indices for local part of scatter\n",rank);
72: for (i=0; i<to->local.n; i++) { /* the to and from have the opposite meaning from what you would expect */
73: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] From %D to %D \n",rank,to->local.vslots[i],from->local.vslots[i]);
74: }
75: }
77: for (i=0; i<to->msize; i++) {
78: if (to->sharedspacestarts[i+1] > to->sharedspacestarts[i]) {
79: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Via shared memory to local memory partner %d count %d\n",rank,i,to->sharedspacestarts[i+1]-to->sharedspacestarts[i]);
80: }
81: }
82: for (i=0; i<from->msize; i++) {
83: if (from->sharedspacestarts[i+1] > from->sharedspacestarts[i]) {
84: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Via shared memory from local memory partner %d count %d\n",rank,i,from->sharedspacestarts[i+1]-from->sharedspacestarts[i]);
85: }
86: }
88: PetscViewerFlush(viewer);
89: PetscViewerASCIIPopSynchronized(viewer);
91: PetscViewerASCIIPrintf(viewer,"Method used to implement the VecScatter: ");
92: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
93: if (to->use_alltoallw) {
94: PetscViewerASCIIPrintf(viewer,"Uses MPI_alltoallw if INSERT_MODE\n");
95: } else
96: #endif
97: if (ctx->packtogether || to->use_alltoallv || to->use_window) {
98: if (to->use_alltoallv) {
99: PetscViewerASCIIPrintf(viewer,"Uses MPI_alltoallv\n");
100: } else if (to->use_window) {
101: PetscViewerASCIIPrintf(viewer,"Uses MPI window\n");
102: } else {
103: PetscViewerASCIIPrintf(viewer,"Packs all messages and then sends them\n");
104: }
105: } else {
106: PetscViewerASCIIPrintf(viewer,"Packs and sends messages one at a time\n");
107: }
108: }
109: }
110: return(0);
111: }
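/*
   Editorial sketch, not part of the original vpscat.c: a minimal example of how an
   application could reach the PETSC_VIEWER_ASCII_INFO branch of VecScatterView_MPI()
   above. The function name and the scatter argument "sct" are placeholders.
*/
PETSC_STATIC_INLINE PetscErrorCode VecScatterViewStatsSketch(VecScatter sct)
{
  PetscErrorCode ierr;
  ierr = PetscViewerPushFormat(PETSC_VIEWER_STDOUT_WORLD,PETSC_VIEWER_ASCII_INFO);CHKERRQ(ierr);
  ierr = VecScatterView(sct,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);   /* prints the summary statistics block */
  ierr = PetscViewerPopFormat(PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);
  return(0);
}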
113: /* -----------------------------------------------------------------------------------*/
114: /*
115: The next routine determines what portion of the local part of the scatter is an
116: exact copy of values into their current locations. We check this here and
117: then know that we need not perform that portion of the scatter when the vector is
118: scattering to itself with INSERT_VALUES.
120: This is currently not used, but it would speed up, for example, DMLocalToLocalBegin/End().
122: */
123: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from)
124: {
125: PetscInt n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
127: PetscInt *nto_slots,*nfrom_slots,j = 0;
130: for (i=0; i<n; i++) {
131: if (to_slots[i] != from_slots[i]) n_nonmatching++;
132: }
134: if (!n_nonmatching) {
135: to->nonmatching_computed = PETSC_TRUE;
136: to->n_nonmatching = from->n_nonmatching = 0;
137: PetscInfo1(scatter,"Reduced %D to 0\n", n);
138: } else if (n_nonmatching == n) {
139: to->nonmatching_computed = PETSC_FALSE;
140: PetscInfo(scatter,"All values non-matching\n");
141: } else {
142: to->nonmatching_computed= PETSC_TRUE;
143: to->n_nonmatching = from->n_nonmatching = n_nonmatching;
145: PetscMalloc1(n_nonmatching,&nto_slots);
146: PetscMalloc1(n_nonmatching,&nfrom_slots);
148: to->slots_nonmatching = nto_slots;
149: from->slots_nonmatching = nfrom_slots;
150: for (i=0; i<n; i++) {
151: if (to_slots[i] != from_slots[i]) {
152: nto_slots[j] = to_slots[i];
153: nfrom_slots[j] = from_slots[i];
154: j++;
155: }
156: }
157: PetscInfo2(scatter,"Reduced %D to %D\n",n,n_nonmatching);
158: }
159: return(0);
160: }
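/*
   Editorial sketch, not part of the original vpscat.c: one hypothetical way the
   nonmatching slots computed above could be consumed. For the local part of a forward
   scatter, "to" slots index the source array and "from" slots index the destination
   (see the note in VecScatterView_MPI() that to/from have the opposite of the expected
   meaning), so only the entries whose slots differ need to be moved for INSERT_VALUES.
*/
PETSC_STATIC_INLINE void VecScatterLocalNonMatchingSketch(VecScatter_Seq_General *to,VecScatter_Seq_General *from,const PetscScalar *xv,PetscScalar *yv)
{
  PetscInt i;
  if (!to->nonmatching_computed) return;                       /* optimization information not available */
  for (i=0; i<to->n_nonmatching; i++) {
    yv[from->slots_nonmatching[i]] = xv[to->slots_nonmatching[i]];
  }
}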
162: /* --------------------------------------------------------------------------------------*/
164: /* -------------------------------------------------------------------------------------*/
165: PetscErrorCode VecScatterDestroy_PtoP_MPI3(VecScatter ctx)
166: {
167: VecScatter_MPI_General *to = (VecScatter_MPI_General*)ctx->todata;
168: VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
169: PetscErrorCode ierr;
170: PetscInt i;
173: if (to->use_readyreceiver) {
174: /*
175: Since we have already posted the receives we must cancel them before freeing
176: the requests
177: */
178: for (i=0; i<from->n; i++) {
179: MPI_Cancel(from->requests+i);
180: }
181: for (i=0; i<to->n; i++) {
182: MPI_Cancel(to->rev_requests+i);
183: }
184: MPI_Waitall(from->n,from->requests,to->rstatus);
185: MPI_Waitall(to->n,to->rev_requests,to->rstatus);
186: }
188: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
189: if (to->use_alltoallw) {
190: for (i=0; i<to->n; i++) {
191: MPI_Type_free(to->types+to->procs[i]);
192: }
193: PetscFree3(to->wcounts,to->wdispls,to->types);
194: if (!from->contiq) {
195: for (i=0; i<from->n; i++) {
196: MPI_Type_free(from->types+from->procs[i]);
197: }
198: }
199: PetscFree3(from->wcounts,from->wdispls,from->types);
200: }
201: #endif
203: if (to->use_window) {
204: MPI_Win_free(&from->window);
205: MPI_Win_free(&to->window);
206: PetscFree(from->winstarts);
207: PetscFree(to->winstarts);
208: }
210: if (to->use_alltoallv) {
211: PetscFree2(to->counts,to->displs);
212: PetscFree2(from->counts,from->displs);
213: }
215: /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
216: /*
217: IBM's PE version of MPI has a bug where freeing these requests will corrupt later
218: message passing.
219: */
220: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
221: if (!to->use_alltoallv && !to->use_window) { /* currently the to->requests etc are ALWAYS allocated even if not used */
222: if (to->requests) {
223: for (i=0; i<to->n; i++) {
224: MPI_Request_free(to->requests + i);
225: }
226: }
227: if (to->rev_requests) {
228: for (i=0; i<to->n; i++) {
229: MPI_Request_free(to->rev_requests + i);
230: }
231: }
232: }
233: /*
234: MPICH could not properly cancel requests, thus with ready-receiver mode we
235: cannot free the requests. This may be fixed now; if not, put the following
236: code inside an if (!to->use_readyreceiver) { block.
237: */
238: if (!to->use_alltoallv && !to->use_window) { /* currently the from->requests etc are ALWAYS allocated even if not used */
239: if (from->requests) {
240: for (i=0; i<from->n; i++) {
241: MPI_Request_free(from->requests + i);
242: }
243: }
245: if (from->rev_requests) {
246: for (i=0; i<from->n; i++) {
247: MPI_Request_free(from->rev_requests + i);
248: }
249: }
250: }
251: #endif
252: if (to->sharedwin != MPI_WIN_NULL) {MPI_Win_free(&to->sharedwin);}
253: if (from->sharedwin != MPI_WIN_NULL) {MPI_Win_free(&from->sharedwin);}
254: PetscFree(to->sharedspaces);
255: PetscFree(to->sharedspacesoffset);
256: PetscFree(to->sharedspaceindices);
257: PetscFree(to->sharedspacestarts);
259: PetscFree(from->sharedspaceindices);
260: PetscFree(from->sharedspaces);
261: PetscFree(from->sharedspacesoffset);
262: PetscFree(from->sharedspacestarts);
264: PetscFree(to->local.vslots);
265: PetscFree(from->local.vslots);
266: PetscFree2(to->counts,to->displs);
267: PetscFree2(from->counts,from->displs);
268: PetscFree(to->local.slots_nonmatching);
269: PetscFree(from->local.slots_nonmatching);
270: PetscFree(to->rev_requests);
271: PetscFree(from->rev_requests);
272: PetscFree(to->requests);
273: PetscFree(from->requests);
274: PetscFree4(to->values,to->indices,to->starts,to->procs);
275: PetscFree2(to->sstatus,to->rstatus);
276: PetscFree4(from->values,from->indices,from->starts,from->procs);
277: PetscFree(from);
278: PetscFree(to);
279: return(0);
280: }
282: /* --------------------------------------------------------------------------------------*/
283: /*
284: Special optimization to check whether the local part of the scatter is actually
285: a contiguous copy; if so, the scatter routines call PetscMemcpy() instead.
287: */
288: PetscErrorCode VecScatterLocalOptimizeCopy_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from,PetscInt bs)
289: {
290: PetscInt n = to->n,i,*to_slots = to->vslots,*from_slots = from->vslots;
291: PetscInt to_start,from_start;
295: to_start = to_slots[0];
296: from_start = from_slots[0];
298: for (i=1; i<n; i++) {
299: to_start += bs;
300: from_start += bs;
301: if (to_slots[i] != to_start) return(0);
302: if (from_slots[i] != from_start) return(0);
303: }
304: to->is_copy = PETSC_TRUE;
305: to->copy_start = to_slots[0];
306: to->copy_length = bs*sizeof(PetscScalar)*n;
307: from->is_copy = PETSC_TRUE;
308: from->copy_start = from_slots[0];
309: from->copy_length = bs*sizeof(PetscScalar)*n;
310: PetscInfo(scatter,"Local scatter is a copy, optimizing for it\n");
311: return(0);
312: }
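/*
   Editorial sketch, not part of the original vpscat.c: the fast path that the copy
   optimization above enables. Once is_copy is set, the whole local part of the scatter
   reduces to a single PetscMemcpy() of copy_length bytes; xv/yv are the raw source and
   destination arrays, and the to/from offsets follow the reversed naming noted earlier.
*/
PETSC_STATIC_INLINE PetscErrorCode VecScatterLocalCopySketch(VecScatter_Seq_General *to,VecScatter_Seq_General *from,const PetscScalar *xv,PetscScalar *yv)
{
  PetscErrorCode ierr;
  if (!to->is_copy) return(0);                                   /* the local scatter is not a plain copy */
  ierr = PetscMemcpy(yv+from->copy_start,xv+to->copy_start,to->copy_length);CHKERRQ(ierr);
  return(0);
}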
314: /* --------------------------------------------------------------------------------------*/
316: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
317: {
318: VecScatter_MPI_General *in_to = (VecScatter_MPI_General*)in->todata;
319: VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
320: PetscErrorCode ierr;
321: PetscInt ny,bs = in_from->bs,jj;
322: PetscCommShared scomm;
323: MPI_Comm mscomm;
324: MPI_Info info;
327: out->ops->begin = in->ops->begin;
328: out->ops->end = in->ops->end;
329: out->ops->copy = in->ops->copy;
330: out->ops->destroy = in->ops->destroy;
331: out->ops->view = in->ops->view;
333: /* allocate entire send scatter context */
334: PetscNewLog(out,&out_to);
335: out_to->sharedwin = MPI_WIN_NULL;
336: PetscNewLog(out,&out_from);
337: out_from->sharedwin = MPI_WIN_NULL;
339: ny = in_to->starts[in_to->n];
340: out_to->n = in_to->n;
341: out_to->format = in_to->format;
342: out_to->sendfirst = in_to->sendfirst;
344: PetscMalloc1(out_to->n,&out_to->requests);
345: PetscMalloc4(bs*ny,&out_to->values,ny,&out_to->indices,out_to->n+1,&out_to->starts,out_to->n,&out_to->procs);
346: PetscMalloc2(PetscMax(in_to->n,in_from->n),&out_to->sstatus,PetscMax(in_to->n,in_from->n),&out_to->rstatus);
347: PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
348: PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
349: PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
351: out->todata = (void*)out_to;
352: out_to->local.n = in_to->local.n;
353: out_to->local.nonmatching_computed = PETSC_FALSE;
354: out_to->local.n_nonmatching = 0;
355: out_to->local.slots_nonmatching = 0;
356: if (in_to->local.n) {
357: PetscMalloc1(in_to->local.n,&out_to->local.vslots);
358: PetscMalloc1(in_from->local.n,&out_from->local.vslots);
359: PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
360: PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
361: } else {
362: out_to->local.vslots = 0;
363: out_from->local.vslots = 0;
364: }
366: /* allocate entire receive context */
367: out_from->format = in_from->format;
368: ny = in_from->starts[in_from->n];
369: out_from->n = in_from->n;
370: out_from->sendfirst = in_from->sendfirst;
372: PetscMalloc1(out_from->n,&out_from->requests);
373: PetscMalloc4(ny*bs,&out_from->values,ny,&out_from->indices,out_from->n+1,&out_from->starts,out_from->n,&out_from->procs);
374: PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
375: PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
376: PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
378: out->fromdata = (void*)out_from;
379: out_from->local.n = in_from->local.n;
380: out_from->local.nonmatching_computed = PETSC_FALSE;
381: out_from->local.n_nonmatching = 0;
382: out_from->local.slots_nonmatching = 0;
384: /*
385: set up the request arrays for use with MPI_Send_init() and MPI_Recv_init()
386: */
387: {
388: PetscMPIInt tag;
389: MPI_Comm comm;
390: PetscInt *sstarts = out_to->starts, *rstarts = out_from->starts;
391: PetscMPIInt *sprocs = out_to->procs, *rprocs = out_from->procs;
392: PetscInt i;
393: PetscBool flg;
394: MPI_Request *swaits = out_to->requests,*rwaits = out_from->requests;
395: MPI_Request *rev_swaits,*rev_rwaits;
396: PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;
398: PetscMalloc1(in_to->n,&out_to->rev_requests);
399: PetscMalloc1(in_from->n,&out_from->rev_requests);
401: rev_rwaits = out_to->rev_requests;
402: rev_swaits = out_from->rev_requests;
404: out_from->bs = out_to->bs = bs;
405: tag = ((PetscObject)out)->tag;
406: PetscObjectGetComm((PetscObject)out,&comm);
408: /* Register the receives that you will use later (sends for scatter reverse) */
409: for (i=0; i<out_from->n; i++) {
410: MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
411: MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
412: }
414: flg = PETSC_FALSE;
415: PetscOptionsGetBool(NULL,NULL,"-vecscatter_rsend",&flg,NULL);
416: if (flg) {
417: out_to->use_readyreceiver = PETSC_TRUE;
418: out_from->use_readyreceiver = PETSC_TRUE;
419: for (i=0; i<out_to->n; i++) {
420: MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
421: }
422: if (out_from->n) {MPI_Startall_irecv(out_from->starts[out_from->n]*out_from->bs,out_from->n,out_from->requests);}
423: MPI_Barrier(comm);
424: PetscInfo(in,"Using VecScatter ready receiver mode\n");
425: } else {
426: out_to->use_readyreceiver = PETSC_FALSE;
427: out_from->use_readyreceiver = PETSC_FALSE;
428: flg = PETSC_FALSE;
429: PetscOptionsGetBool(NULL,NULL,"-vecscatter_ssend",&flg,NULL);
430: if (flg) {
431: PetscInfo(in,"Using VecScatter Ssend mode\n");
432: }
433: for (i=0; i<out_to->n; i++) {
434: if (!flg) {
435: MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
436: } else {
437: MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
438: }
439: }
440: }
441: /* Register receives for scatter reverse */
442: for (i=0; i<out_to->n; i++) {
443: MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
444: }
445: }
447: /* since the to and from data structures are not symmetric for shared memory copies we ensure they are always listed in "standard" form */
448: if (!in_to->sharedwin) {
449: VecScatter_MPI_General *totmp = in_to,*out_totmp = out_to;
450: in_to = in_from;
451: in_from = totmp;
452: out_to = out_from;
453: out_from = out_totmp;
454: }
456: /* copy the to parts for the shared memory copies between processes */
457: out_to->sharedcnt = in_to->sharedcnt;
458: out_to->msize = in_to->msize;
459: PetscMalloc1(out_to->msize+1,&out_to->sharedspacestarts);
460: PetscMemcpy(out_to->sharedspacestarts,in_to->sharedspacestarts,(out_to->msize+1)*sizeof(PetscInt));
461: PetscMalloc1(out_to->sharedcnt,&out_to->sharedspaceindices);
462: PetscMemcpy(out_to->sharedspaceindices,in_to->sharedspaceindices,out_to->sharedcnt*sizeof(PetscInt));
464: PetscCommSharedGet(PetscObjectComm((PetscObject)in),&scomm);
465: PetscCommSharedGetComm(scomm,&mscomm);
466: MPI_Info_create(&info);
467: MPI_Info_set(info, "alloc_shared_noncontig", "true");
468: MPIU_Win_allocate_shared(bs*out_to->sharedcnt*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&out_to->sharedspace,&out_to->sharedwin);
469: MPI_Info_free(&info);
471: /* copy the from parts for the shared memory copies between processes */
472: out_from->sharedcnt = in_from->sharedcnt;
473: out_from->msize = in_from->msize;
474: PetscMalloc1(out_from->msize+1,&out_from->sharedspacestarts);
475: PetscMemcpy(out_from->sharedspacestarts,in_from->sharedspacestarts,(out_from->msize+1)*sizeof(PetscInt));
476: PetscMalloc1(out_from->sharedcnt,&out_from->sharedspaceindices);
477: PetscMemcpy(out_from->sharedspaceindices,in_from->sharedspaceindices,out_from->sharedcnt*sizeof(PetscInt));
478: PetscMalloc1(out_from->msize,&out_from->sharedspacesoffset);
479: PetscMemcpy(out_from->sharedspacesoffset,in_from->sharedspacesoffset,out_from->msize*sizeof(PetscInt));
480: PetscCalloc1(out_from->msize,&out_from->sharedspaces);
481: for (jj=0; jj<out_from->msize; jj++) {
482: MPI_Aint isize;
483: PetscMPIInt disp_unit;
484: MPIU_Win_shared_query(out_to->sharedwin,jj,&isize,&disp_unit,&out_from->sharedspaces[jj]);
485: }
486: return(0);
487: }
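/*
   Editorial sketch, not part of the original vpscat.c: how the copy kernel above is
   typically reached from application code. VecScatterCopy() dispatches through
   ctx->ops->copy, which for this scatter type is VecScatterCopy_PtoP_X(); the vectors
   x and y and the function name are placeholders.
*/
PETSC_STATIC_INLINE PetscErrorCode VecScatterDuplicateSketch(VecScatter vin,Vec x,Vec y)
{
  PetscErrorCode ierr;
  VecScatter     dup;
  ierr = VecScatterCopy(vin,&dup);CHKERRQ(ierr);                                 /* builds an independent context */
  ierr = VecScatterBegin(dup,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(dup,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&dup);CHKERRQ(ierr);
  return(0);
}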
489: PetscErrorCode VecScatterCopy_PtoP_AllToAll(VecScatter in,VecScatter out)
490: {
491: VecScatter_MPI_General *in_to = (VecScatter_MPI_General*)in->todata;
492: VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
493: PetscErrorCode ierr;
494: PetscInt ny,bs = in_from->bs;
495: PetscMPIInt size;
498: MPI_Comm_size(PetscObjectComm((PetscObject)in),&size);
500: out->ops->begin = in->ops->begin;
501: out->ops->end = in->ops->end;
502: out->ops->copy = in->ops->copy;
503: out->ops->destroy = in->ops->destroy;
504: out->ops->view = in->ops->view;
506: /* allocate entire send scatter context */
507: PetscNewLog(out,&out_to);
508: out_to->sharedwin = MPI_WIN_NULL;
509: PetscNewLog(out,&out_from);
510: out_from->sharedwin = MPI_WIN_NULL;
512: ny = in_to->starts[in_to->n];
513: out_to->n = in_to->n;
514: out_to->format = in_to->format;
515: out_to->sendfirst = in_to->sendfirst;
517: PetscMalloc1(out_to->n,&out_to->requests);
518: PetscMalloc4(bs*ny,&out_to->values,ny,&out_to->indices,out_to->n+1,&out_to->starts,out_to->n,&out_to->procs);
519: PetscMalloc2(PetscMax(in_to->n,in_from->n),&out_to->sstatus,PetscMax(in_to->n,in_from->n),&out_to->rstatus);
520: PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
521: PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
522: PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
524: out->todata = (void*)out_to;
525: out_to->local.n = in_to->local.n;
526: out_to->local.nonmatching_computed = PETSC_FALSE;
527: out_to->local.n_nonmatching = 0;
528: out_to->local.slots_nonmatching = 0;
529: if (in_to->local.n) {
530: PetscMalloc1(in_to->local.n,&out_to->local.vslots);
531: PetscMalloc1(in_from->local.n,&out_from->local.vslots);
532: PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
533: PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
534: } else {
535: out_to->local.vslots = 0;
536: out_from->local.vslots = 0;
537: }
539: /* allocate entire receive context */
540: out_from->format = in_from->format;
541: ny = in_from->starts[in_from->n];
542: out_from->n = in_from->n;
543: out_from->sendfirst = in_from->sendfirst;
545: PetscMalloc1(out_from->n,&out_from->requests);
546: PetscMalloc4(ny*bs,&out_from->values,ny,&out_from->indices,out_from->n+1,&out_from->starts,out_from->n,&out_from->procs);
547: PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
548: PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
549: PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
551: out->fromdata = (void*)out_from;
552: out_from->local.n = in_from->local.n;
553: out_from->local.nonmatching_computed = PETSC_FALSE;
554: out_from->local.n_nonmatching = 0;
555: out_from->local.slots_nonmatching = 0;
557: out_to->use_alltoallv = out_from->use_alltoallv = PETSC_TRUE;
559: PetscMalloc2(size,&out_to->counts,size,&out_to->displs);
560: PetscMemcpy(out_to->counts,in_to->counts,size*sizeof(PetscMPIInt));
561: PetscMemcpy(out_to->displs,in_to->displs,size*sizeof(PetscMPIInt));
563: PetscMalloc2(size,&out_from->counts,size,&out_from->displs);
564: PetscMemcpy(out_from->counts,in_from->counts,size*sizeof(PetscMPIInt));
565: PetscMemcpy(out_from->displs,in_from->displs,size*sizeof(PetscMPIInt));
566: return(0);
567: }
568: /* --------------------------------------------------------------------------------------------------
569: Packs the message data into send buffers and unpacks it from receive buffers.
571: These could be generated automatically.
573: Fortran kernels etc. could be used.
574: */
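/*
   Editorial sketch, not part of the original vpscat.c: the generic pattern that the
   hand-unrolled Pack_1 ... Pack_12 routines below specialize. Each routine gathers bs
   consecutive scalars starting at every listed index into a contiguous send buffer;
   the fixed-size versions simply let the compiler unroll the inner loop.
*/
PETSC_STATIC_INLINE void Pack_general_sketch(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
{
  PetscInt i,j,idx;
  for (i=0; i<n; i++) {
    idx = *indicesx++;                     /* start of the i-th block in the source vector */
    for (j=0; j<bs; j++) y[j] = x[idx+j];  /* copy one block of bs scalars                 */
    y += bs;                               /* advance in the contiguous send buffer        */
  }
}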
575: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
576: {
577: PetscInt i;
578: for (i=0; i<n; i++) y[i] = x[indicesx[i]];
579: }
581: PETSC_STATIC_INLINE PetscErrorCode UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
582: {
583: PetscInt i;
586: switch (addv) {
587: case INSERT_VALUES:
588: case INSERT_ALL_VALUES:
589: for (i=0; i<n; i++) y[indicesy[i]] = x[i];
590: break;
591: case ADD_VALUES:
592: case ADD_ALL_VALUES:
593: for (i=0; i<n; i++) y[indicesy[i]] += x[i];
594: break;
595: #if !defined(PETSC_USE_COMPLEX)
596: case MAX_VALUES:
597: for (i=0; i<n; i++) y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
598: #else
599: case MAX_VALUES:
600: #endif
601: case NOT_SET_VALUES:
602: break;
603: default:
604: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
605: }
606: return(0);
607: }
609: PETSC_STATIC_INLINE PetscErrorCode Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
610: {
611: PetscInt i;
614: switch (addv) {
615: case INSERT_VALUES:
616: case INSERT_ALL_VALUES:
617: for (i=0; i<n; i++) y[indicesy[i]] = x[indicesx[i]];
618: break;
619: case ADD_VALUES:
620: case ADD_ALL_VALUES:
621: for (i=0; i<n; i++) y[indicesy[i]] += x[indicesx[i]];
622: break;
623: #if !defined(PETSC_USE_COMPLEX)
624: case MAX_VALUES:
625: for (i=0; i<n; i++) y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
626: #else
627: case MAX_VALUES:
628: #endif
629: case NOT_SET_VALUES:
630: break;
631: default:
632: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
633: }
634: return(0);
635: }
637: /* ----------------------------------------------------------------------------------------------- */
638: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
639: {
640: PetscInt i,idx;
642: for (i=0; i<n; i++) {
643: idx = *indicesx++;
644: y[0] = x[idx];
645: y[1] = x[idx+1];
646: y += 2;
647: }
648: }
650: PETSC_STATIC_INLINE PetscErrorCode UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
651: {
652: PetscInt i,idy;
655: switch (addv) {
656: case INSERT_VALUES:
657: case INSERT_ALL_VALUES:
658: for (i=0; i<n; i++) {
659: idy = *indicesy++;
660: y[idy] = x[0];
661: y[idy+1] = x[1];
662: x += 2;
663: }
664: break;
665: case ADD_VALUES:
666: case ADD_ALL_VALUES:
667: for (i=0; i<n; i++) {
668: idy = *indicesy++;
669: y[idy] += x[0];
670: y[idy+1] += x[1];
671: x += 2;
672: }
673: break;
674: #if !defined(PETSC_USE_COMPLEX)
675: case MAX_VALUES:
676: for (i=0; i<n; i++) {
677: idy = *indicesy++;
678: y[idy] = PetscMax(y[idy],x[0]);
679: y[idy+1] = PetscMax(y[idy+1],x[1]);
680: x += 2;
681: }
682: #else
683: case MAX_VALUES:
684: #endif
685: case NOT_SET_VALUES:
686: break;
687: default:
688: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
689: }
690: return(0);
691: }
693: PETSC_STATIC_INLINE PetscErrorCode Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
694: {
695: PetscInt i,idx,idy;
698: switch (addv) {
699: case INSERT_VALUES:
700: case INSERT_ALL_VALUES:
701: for (i=0; i<n; i++) {
702: idx = *indicesx++;
703: idy = *indicesy++;
704: y[idy] = x[idx];
705: y[idy+1] = x[idx+1];
706: }
707: break;
708: case ADD_VALUES:
709: case ADD_ALL_VALUES:
710: for (i=0; i<n; i++) {
711: idx = *indicesx++;
712: idy = *indicesy++;
713: y[idy] += x[idx];
714: y[idy+1] += x[idx+1];
715: }
716: break;
717: #if !defined(PETSC_USE_COMPLEX)
718: case MAX_VALUES:
719: for (i=0; i<n; i++) {
720: idx = *indicesx++;
721: idy = *indicesy++;
722: y[idy] = PetscMax(y[idy],x[idx]);
723: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
724: }
725: #else
726: case MAX_VALUES:
727: #endif
728: case NOT_SET_VALUES:
729: break;
730: default:
731: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
732: }
733: return(0);
734: }
735: /* ----------------------------------------------------------------------------------------------- */
736: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
737: {
738: PetscInt i,idx;
740: for (i=0; i<n; i++) {
741: idx = *indicesx++;
742: y[0] = x[idx];
743: y[1] = x[idx+1];
744: y[2] = x[idx+2];
745: y += 3;
746: }
747: }
748: PETSC_STATIC_INLINE PetscErrorCode UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
749: {
750: PetscInt i,idy;
753: switch (addv) {
754: case INSERT_VALUES:
755: case INSERT_ALL_VALUES:
756: for (i=0; i<n; i++) {
757: idy = *indicesy++;
758: y[idy] = x[0];
759: y[idy+1] = x[1];
760: y[idy+2] = x[2];
761: x += 3;
762: }
763: break;
764: case ADD_VALUES:
765: case ADD_ALL_VALUES:
766: for (i=0; i<n; i++) {
767: idy = *indicesy++;
768: y[idy] += x[0];
769: y[idy+1] += x[1];
770: y[idy+2] += x[2];
771: x += 3;
772: }
773: break;
774: #if !defined(PETSC_USE_COMPLEX)
775: case MAX_VALUES:
776: for (i=0; i<n; i++) {
777: idy = *indicesy++;
778: y[idy] = PetscMax(y[idy],x[0]);
779: y[idy+1] = PetscMax(y[idy+1],x[1]);
780: y[idy+2] = PetscMax(y[idy+2],x[2]);
781: x += 3;
782: }
783: #else
784: case MAX_VALUES:
785: #endif
786: case NOT_SET_VALUES:
787: break;
788: default:
789: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
790: }
791: return(0);
792: }
794: PETSC_STATIC_INLINE PetscErrorCode Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
795: {
796: PetscInt i,idx,idy;
799: switch (addv) {
800: case INSERT_VALUES:
801: case INSERT_ALL_VALUES:
802: for (i=0; i<n; i++) {
803: idx = *indicesx++;
804: idy = *indicesy++;
805: y[idy] = x[idx];
806: y[idy+1] = x[idx+1];
807: y[idy+2] = x[idx+2];
808: }
809: break;
810: case ADD_VALUES:
811: case ADD_ALL_VALUES:
812: for (i=0; i<n; i++) {
813: idx = *indicesx++;
814: idy = *indicesy++;
815: y[idy] += x[idx];
816: y[idy+1] += x[idx+1];
817: y[idy+2] += x[idx+2];
818: }
819: break;
820: #if !defined(PETSC_USE_COMPLEX)
821: case MAX_VALUES:
822: for (i=0; i<n; i++) {
823: idx = *indicesx++;
824: idy = *indicesy++;
825: y[idy] = PetscMax(y[idy],x[idx]);
826: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
827: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
828: }
829: #else
830: case MAX_VALUES:
831: #endif
832: case NOT_SET_VALUES:
833: break;
834: default:
835: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
836: }
837: return(0);
838: }
839: /* ----------------------------------------------------------------------------------------------- */
840: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
841: {
842: PetscInt i,idx;
844: for (i=0; i<n; i++) {
845: idx = *indicesx++;
846: y[0] = x[idx];
847: y[1] = x[idx+1];
848: y[2] = x[idx+2];
849: y[3] = x[idx+3];
850: y += 4;
851: }
852: }
853: PETSC_STATIC_INLINE PetscErrorCode UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
854: {
855: PetscInt i,idy;
858: switch (addv) {
859: case INSERT_VALUES:
860: case INSERT_ALL_VALUES:
861: for (i=0; i<n; i++) {
862: idy = *indicesy++;
863: y[idy] = x[0];
864: y[idy+1] = x[1];
865: y[idy+2] = x[2];
866: y[idy+3] = x[3];
867: x += 4;
868: }
869: break;
870: case ADD_VALUES:
871: case ADD_ALL_VALUES:
872: for (i=0; i<n; i++) {
873: idy = *indicesy++;
874: y[idy] += x[0];
875: y[idy+1] += x[1];
876: y[idy+2] += x[2];
877: y[idy+3] += x[3];
878: x += 4;
879: }
880: break;
881: #if !defined(PETSC_USE_COMPLEX)
882: case MAX_VALUES:
883: for (i=0; i<n; i++) {
884: idy = *indicesy++;
885: y[idy] = PetscMax(y[idy],x[0]);
886: y[idy+1] = PetscMax(y[idy+1],x[1]);
887: y[idy+2] = PetscMax(y[idy+2],x[2]);
888: y[idy+3] = PetscMax(y[idy+3],x[3]);
889: x += 4;
890: }
891: #else
892: case MAX_VALUES:
893: #endif
894: case NOT_SET_VALUES:
895: break;
896: default:
897: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
898: }
899: return(0);
900: }
902: PETSC_STATIC_INLINE PetscErrorCode Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
903: {
904: PetscInt i,idx,idy;
907: switch (addv) {
908: case INSERT_VALUES:
909: case INSERT_ALL_VALUES:
910: for (i=0; i<n; i++) {
911: idx = *indicesx++;
912: idy = *indicesy++;
913: y[idy] = x[idx];
914: y[idy+1] = x[idx+1];
915: y[idy+2] = x[idx+2];
916: y[idy+3] = x[idx+3];
917: }
918: break;
919: case ADD_VALUES:
920: case ADD_ALL_VALUES:
921: for (i=0; i<n; i++) {
922: idx = *indicesx++;
923: idy = *indicesy++;
924: y[idy] += x[idx];
925: y[idy+1] += x[idx+1];
926: y[idy+2] += x[idx+2];
927: y[idy+3] += x[idx+3];
928: }
929: break;
930: #if !defined(PETSC_USE_COMPLEX)
931: case MAX_VALUES:
932: for (i=0; i<n; i++) {
933: idx = *indicesx++;
934: idy = *indicesy++;
935: y[idy] = PetscMax(y[idy],x[idx]);
936: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
937: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
938: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
939: }
940: #else
941: case MAX_VALUES:
942: #endif
943: case NOT_SET_VALUES:
944: break;
945: default:
946: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
947: }
948: return(0);
949: }
950: /* ----------------------------------------------------------------------------------------------- */
951: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
952: {
953: PetscInt i,idx;
955: for (i=0; i<n; i++) {
956: idx = *indicesx++;
957: y[0] = x[idx];
958: y[1] = x[idx+1];
959: y[2] = x[idx+2];
960: y[3] = x[idx+3];
961: y[4] = x[idx+4];
962: y += 5;
963: }
964: }
966: PETSC_STATIC_INLINE PetscErrorCode UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
967: {
968: PetscInt i,idy;
971: switch (addv) {
972: case INSERT_VALUES:
973: case INSERT_ALL_VALUES:
974: for (i=0; i<n; i++) {
975: idy = *indicesy++;
976: y[idy] = x[0];
977: y[idy+1] = x[1];
978: y[idy+2] = x[2];
979: y[idy+3] = x[3];
980: y[idy+4] = x[4];
981: x += 5;
982: }
983: break;
984: case ADD_VALUES:
985: case ADD_ALL_VALUES:
986: for (i=0; i<n; i++) {
987: idy = *indicesy++;
988: y[idy] += x[0];
989: y[idy+1] += x[1];
990: y[idy+2] += x[2];
991: y[idy+3] += x[3];
992: y[idy+4] += x[4];
993: x += 5;
994: }
995: break;
996: #if !defined(PETSC_USE_COMPLEX)
997: case MAX_VALUES:
998: for (i=0; i<n; i++) {
999: idy = *indicesy++;
1000: y[idy] = PetscMax(y[idy],x[0]);
1001: y[idy+1] = PetscMax(y[idy+1],x[1]);
1002: y[idy+2] = PetscMax(y[idy+2],x[2]);
1003: y[idy+3] = PetscMax(y[idy+3],x[3]);
1004: y[idy+4] = PetscMax(y[idy+4],x[4]);
1005: x += 5;
1006: }
1007: #else
1008: case MAX_VALUES:
1009: #endif
1010: case NOT_SET_VALUES:
1011: break;
1012: default:
1013: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1014: }
1015: return(0);
1016: }
1018: PETSC_STATIC_INLINE PetscErrorCode Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1019: {
1020: PetscInt i,idx,idy;
1023: switch (addv) {
1024: case INSERT_VALUES:
1025: case INSERT_ALL_VALUES:
1026: for (i=0; i<n; i++) {
1027: idx = *indicesx++;
1028: idy = *indicesy++;
1029: y[idy] = x[idx];
1030: y[idy+1] = x[idx+1];
1031: y[idy+2] = x[idx+2];
1032: y[idy+3] = x[idx+3];
1033: y[idy+4] = x[idx+4];
1034: }
1035: break;
1036: case ADD_VALUES:
1037: case ADD_ALL_VALUES:
1038: for (i=0; i<n; i++) {
1039: idx = *indicesx++;
1040: idy = *indicesy++;
1041: y[idy] += x[idx];
1042: y[idy+1] += x[idx+1];
1043: y[idy+2] += x[idx+2];
1044: y[idy+3] += x[idx+3];
1045: y[idy+4] += x[idx+4];
1046: }
1047: break;
1048: #if !defined(PETSC_USE_COMPLEX)
1049: case MAX_VALUES:
1050: for (i=0; i<n; i++) {
1051: idx = *indicesx++;
1052: idy = *indicesy++;
1053: y[idy] = PetscMax(y[idy],x[idx]);
1054: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1055: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1056: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1057: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1058: }
1059: #else
1060: case MAX_VALUES:
1061: #endif
1062: case NOT_SET_VALUES:
1063: break;
1064: default:
1065: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1066: }
1067: return(0);
1068: }
1069: /* ----------------------------------------------------------------------------------------------- */
1070: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1071: {
1072: PetscInt i,idx;
1074: for (i=0; i<n; i++) {
1075: idx = *indicesx++;
1076: y[0] = x[idx];
1077: y[1] = x[idx+1];
1078: y[2] = x[idx+2];
1079: y[3] = x[idx+3];
1080: y[4] = x[idx+4];
1081: y[5] = x[idx+5];
1082: y += 6;
1083: }
1084: }
1086: PETSC_STATIC_INLINE PetscErrorCode UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1087: {
1088: PetscInt i,idy;
1091: switch (addv) {
1092: case INSERT_VALUES:
1093: case INSERT_ALL_VALUES:
1094: for (i=0; i<n; i++) {
1095: idy = *indicesy++;
1096: y[idy] = x[0];
1097: y[idy+1] = x[1];
1098: y[idy+2] = x[2];
1099: y[idy+3] = x[3];
1100: y[idy+4] = x[4];
1101: y[idy+5] = x[5];
1102: x += 6;
1103: }
1104: break;
1105: case ADD_VALUES:
1106: case ADD_ALL_VALUES:
1107: for (i=0; i<n; i++) {
1108: idy = *indicesy++;
1109: y[idy] += x[0];
1110: y[idy+1] += x[1];
1111: y[idy+2] += x[2];
1112: y[idy+3] += x[3];
1113: y[idy+4] += x[4];
1114: y[idy+5] += x[5];
1115: x += 6;
1116: }
1117: break;
1118: #if !defined(PETSC_USE_COMPLEX)
1119: case MAX_VALUES:
1120: for (i=0; i<n; i++) {
1121: idy = *indicesy++;
1122: y[idy] = PetscMax(y[idy],x[0]);
1123: y[idy+1] = PetscMax(y[idy+1],x[1]);
1124: y[idy+2] = PetscMax(y[idy+2],x[2]);
1125: y[idy+3] = PetscMax(y[idy+3],x[3]);
1126: y[idy+4] = PetscMax(y[idy+4],x[4]);
1127: y[idy+5] = PetscMax(y[idy+5],x[5]);
1128: x += 6;
1129: }
1130: #else
1131: case MAX_VALUES:
1132: #endif
1133: case NOT_SET_VALUES:
1134: break;
1135: default:
1136: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1137: }
1138: return(0);
1139: }
1141: PETSC_STATIC_INLINE PetscErrorCode Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1142: {
1143: PetscInt i,idx,idy;
1146: switch (addv) {
1147: case INSERT_VALUES:
1148: case INSERT_ALL_VALUES:
1149: for (i=0; i<n; i++) {
1150: idx = *indicesx++;
1151: idy = *indicesy++;
1152: y[idy] = x[idx];
1153: y[idy+1] = x[idx+1];
1154: y[idy+2] = x[idx+2];
1155: y[idy+3] = x[idx+3];
1156: y[idy+4] = x[idx+4];
1157: y[idy+5] = x[idx+5];
1158: }
1159: break;
1160: case ADD_VALUES:
1161: case ADD_ALL_VALUES:
1162: for (i=0; i<n; i++) {
1163: idx = *indicesx++;
1164: idy = *indicesy++;
1165: y[idy] += x[idx];
1166: y[idy+1] += x[idx+1];
1167: y[idy+2] += x[idx+2];
1168: y[idy+3] += x[idx+3];
1169: y[idy+4] += x[idx+4];
1170: y[idy+5] += x[idx+5];
1171: }
1172: break;
1173: #if !defined(PETSC_USE_COMPLEX)
1174: case MAX_VALUES:
1175: for (i=0; i<n; i++) {
1176: idx = *indicesx++;
1177: idy = *indicesy++;
1178: y[idy] = PetscMax(y[idy],x[idx]);
1179: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1180: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1181: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1182: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1183: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1184: }
1185: #else
1186: case MAX_VALUES:
1187: #endif
1188: case NOT_SET_VALUES:
1189: break;
1190: default:
1191: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1192: }
1193: return(0);
1194: }
1195: /* ----------------------------------------------------------------------------------------------- */
1196: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1197: {
1198: PetscInt i,idx;
1200: for (i=0; i<n; i++) {
1201: idx = *indicesx++;
1202: y[0] = x[idx];
1203: y[1] = x[idx+1];
1204: y[2] = x[idx+2];
1205: y[3] = x[idx+3];
1206: y[4] = x[idx+4];
1207: y[5] = x[idx+5];
1208: y[6] = x[idx+6];
1209: y += 7;
1210: }
1211: }
1213: PETSC_STATIC_INLINE PetscErrorCode UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1214: {
1215: PetscInt i,idy;
1218: switch (addv) {
1219: case INSERT_VALUES:
1220: case INSERT_ALL_VALUES:
1221: for (i=0; i<n; i++) {
1222: idy = *indicesy++;
1223: y[idy] = x[0];
1224: y[idy+1] = x[1];
1225: y[idy+2] = x[2];
1226: y[idy+3] = x[3];
1227: y[idy+4] = x[4];
1228: y[idy+5] = x[5];
1229: y[idy+6] = x[6];
1230: x += 7;
1231: }
1232: break;
1233: case ADD_VALUES:
1234: case ADD_ALL_VALUES:
1235: for (i=0; i<n; i++) {
1236: idy = *indicesy++;
1237: y[idy] += x[0];
1238: y[idy+1] += x[1];
1239: y[idy+2] += x[2];
1240: y[idy+3] += x[3];
1241: y[idy+4] += x[4];
1242: y[idy+5] += x[5];
1243: y[idy+6] += x[6];
1244: x += 7;
1245: }
1246: break;
1247: #if !defined(PETSC_USE_COMPLEX)
1248: case MAX_VALUES:
1249: for (i=0; i<n; i++) {
1250: idy = *indicesy++;
1251: y[idy] = PetscMax(y[idy],x[0]);
1252: y[idy+1] = PetscMax(y[idy+1],x[1]);
1253: y[idy+2] = PetscMax(y[idy+2],x[2]);
1254: y[idy+3] = PetscMax(y[idy+3],x[3]);
1255: y[idy+4] = PetscMax(y[idy+4],x[4]);
1256: y[idy+5] = PetscMax(y[idy+5],x[5]);
1257: y[idy+6] = PetscMax(y[idy+6],x[6]);
1258: x += 7;
1259: }
1260: #else
1261: case MAX_VALUES:
1262: #endif
1263: case NOT_SET_VALUES:
1264: break;
1265: default:
1266: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1267: }
1268: return(0);
1269: }
1271: PETSC_STATIC_INLINE PetscErrorCode Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1272: {
1273: PetscInt i,idx,idy;
1276: switch (addv) {
1277: case INSERT_VALUES:
1278: case INSERT_ALL_VALUES:
1279: for (i=0; i<n; i++) {
1280: idx = *indicesx++;
1281: idy = *indicesy++;
1282: y[idy] = x[idx];
1283: y[idy+1] = x[idx+1];
1284: y[idy+2] = x[idx+2];
1285: y[idy+3] = x[idx+3];
1286: y[idy+4] = x[idx+4];
1287: y[idy+5] = x[idx+5];
1288: y[idy+6] = x[idx+6];
1289: }
1290: break;
1291: case ADD_VALUES:
1292: case ADD_ALL_VALUES:
1293: for (i=0; i<n; i++) {
1294: idx = *indicesx++;
1295: idy = *indicesy++;
1296: y[idy] += x[idx];
1297: y[idy+1] += x[idx+1];
1298: y[idy+2] += x[idx+2];
1299: y[idy+3] += x[idx+3];
1300: y[idy+4] += x[idx+4];
1301: y[idy+5] += x[idx+5];
1302: y[idy+6] += x[idx+6];
1303: }
1304: break;
1305: #if !defined(PETSC_USE_COMPLEX)
1306: case MAX_VALUES:
1307: for (i=0; i<n; i++) {
1308: idx = *indicesx++;
1309: idy = *indicesy++;
1310: y[idy] = PetscMax(y[idy],x[idx]);
1311: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1312: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1313: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1314: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1315: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1316: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1317: }
1318: #else
1319: case MAX_VALUES:
1320: #endif
1321: case NOT_SET_VALUES:
1322: break;
1323: default:
1324: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1325: }
1326: return(0);
1327: }
1328: /* ----------------------------------------------------------------------------------------------- */
1329: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1330: {
1331: PetscInt i,idx;
1333: for (i=0; i<n; i++) {
1334: idx = *indicesx++;
1335: y[0] = x[idx];
1336: y[1] = x[idx+1];
1337: y[2] = x[idx+2];
1338: y[3] = x[idx+3];
1339: y[4] = x[idx+4];
1340: y[5] = x[idx+5];
1341: y[6] = x[idx+6];
1342: y[7] = x[idx+7];
1343: y += 8;
1344: }
1345: }
1347: PETSC_STATIC_INLINE PetscErrorCode UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1348: {
1349: PetscInt i,idy;
1352: switch (addv) {
1353: case INSERT_VALUES:
1354: case INSERT_ALL_VALUES:
1355: for (i=0; i<n; i++) {
1356: idy = *indicesy++;
1357: y[idy] = x[0];
1358: y[idy+1] = x[1];
1359: y[idy+2] = x[2];
1360: y[idy+3] = x[3];
1361: y[idy+4] = x[4];
1362: y[idy+5] = x[5];
1363: y[idy+6] = x[6];
1364: y[idy+7] = x[7];
1365: x += 8;
1366: }
1367: break;
1368: case ADD_VALUES:
1369: case ADD_ALL_VALUES:
1370: for (i=0; i<n; i++) {
1371: idy = *indicesy++;
1372: y[idy] += x[0];
1373: y[idy+1] += x[1];
1374: y[idy+2] += x[2];
1375: y[idy+3] += x[3];
1376: y[idy+4] += x[4];
1377: y[idy+5] += x[5];
1378: y[idy+6] += x[6];
1379: y[idy+7] += x[7];
1380: x += 8;
1381: }
1382: break;
1383: #if !defined(PETSC_USE_COMPLEX)
1384: case MAX_VALUES:
1385: for (i=0; i<n; i++) {
1386: idy = *indicesy++;
1387: y[idy] = PetscMax(y[idy],x[0]);
1388: y[idy+1] = PetscMax(y[idy+1],x[1]);
1389: y[idy+2] = PetscMax(y[idy+2],x[2]);
1390: y[idy+3] = PetscMax(y[idy+3],x[3]);
1391: y[idy+4] = PetscMax(y[idy+4],x[4]);
1392: y[idy+5] = PetscMax(y[idy+5],x[5]);
1393: y[idy+6] = PetscMax(y[idy+6],x[6]);
1394: y[idy+7] = PetscMax(y[idy+7],x[7]);
1395: x += 8;
1396: }
1397: #else
1398: case MAX_VALUES:
1399: #endif
1400: case NOT_SET_VALUES:
1401: break;
1402: default:
1403: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1404: }
1405: return(0);
1406: }
1408: PETSC_STATIC_INLINE PetscErrorCode Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1409: {
1410: PetscInt i,idx,idy;
1413: switch (addv) {
1414: case INSERT_VALUES:
1415: case INSERT_ALL_VALUES:
1416: for (i=0; i<n; i++) {
1417: idx = *indicesx++;
1418: idy = *indicesy++;
1419: y[idy] = x[idx];
1420: y[idy+1] = x[idx+1];
1421: y[idy+2] = x[idx+2];
1422: y[idy+3] = x[idx+3];
1423: y[idy+4] = x[idx+4];
1424: y[idy+5] = x[idx+5];
1425: y[idy+6] = x[idx+6];
1426: y[idy+7] = x[idx+7];
1427: }
1428: break;
1429: case ADD_VALUES:
1430: case ADD_ALL_VALUES:
1431: for (i=0; i<n; i++) {
1432: idx = *indicesx++;
1433: idy = *indicesy++;
1434: y[idy] += x[idx];
1435: y[idy+1] += x[idx+1];
1436: y[idy+2] += x[idx+2];
1437: y[idy+3] += x[idx+3];
1438: y[idy+4] += x[idx+4];
1439: y[idy+5] += x[idx+5];
1440: y[idy+6] += x[idx+6];
1441: y[idy+7] += x[idx+7];
1442: }
1443: break;
1444: #if !defined(PETSC_USE_COMPLEX)
1445: case MAX_VALUES:
1446: for (i=0; i<n; i++) {
1447: idx = *indicesx++;
1448: idy = *indicesy++;
1449: y[idy] = PetscMax(y[idy],x[idx]);
1450: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1451: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1452: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1453: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1454: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1455: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1456: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1457: }
1458: #else
1459: case MAX_VALUES:
1460: #endif
1461: case NOT_SET_VALUES:
1462: break;
1463: default:
1464: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1465: }
1466: return(0);
1467: }
1469: PETSC_STATIC_INLINE void Pack_9(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1470: {
1471: PetscInt i,idx;
1473: for (i=0; i<n; i++) {
1474: idx = *indicesx++;
1475: y[0] = x[idx];
1476: y[1] = x[idx+1];
1477: y[2] = x[idx+2];
1478: y[3] = x[idx+3];
1479: y[4] = x[idx+4];
1480: y[5] = x[idx+5];
1481: y[6] = x[idx+6];
1482: y[7] = x[idx+7];
1483: y[8] = x[idx+8];
1484: y += 9;
1485: }
1486: }
1488: PETSC_STATIC_INLINE PetscErrorCode UnPack_9(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1489: {
1490: PetscInt i,idy;
1493: switch (addv) {
1494: case INSERT_VALUES:
1495: case INSERT_ALL_VALUES:
1496: for (i=0; i<n; i++) {
1497: idy = *indicesy++;
1498: y[idy] = x[0];
1499: y[idy+1] = x[1];
1500: y[idy+2] = x[2];
1501: y[idy+3] = x[3];
1502: y[idy+4] = x[4];
1503: y[idy+5] = x[5];
1504: y[idy+6] = x[6];
1505: y[idy+7] = x[7];
1506: y[idy+8] = x[8];
1507: x += 9;
1508: }
1509: break;
1510: case ADD_VALUES:
1511: case ADD_ALL_VALUES:
1512: for (i=0; i<n; i++) {
1513: idy = *indicesy++;
1514: y[idy] += x[0];
1515: y[idy+1] += x[1];
1516: y[idy+2] += x[2];
1517: y[idy+3] += x[3];
1518: y[idy+4] += x[4];
1519: y[idy+5] += x[5];
1520: y[idy+6] += x[6];
1521: y[idy+7] += x[7];
1522: y[idy+8] += x[8];
1523: x += 9;
1524: }
1525: break;
1526: #if !defined(PETSC_USE_COMPLEX)
1527: case MAX_VALUES:
1528: for (i=0; i<n; i++) {
1529: idy = *indicesy++;
1530: y[idy] = PetscMax(y[idy],x[0]);
1531: y[idy+1] = PetscMax(y[idy+1],x[1]);
1532: y[idy+2] = PetscMax(y[idy+2],x[2]);
1533: y[idy+3] = PetscMax(y[idy+3],x[3]);
1534: y[idy+4] = PetscMax(y[idy+4],x[4]);
1535: y[idy+5] = PetscMax(y[idy+5],x[5]);
1536: y[idy+6] = PetscMax(y[idy+6],x[6]);
1537: y[idy+7] = PetscMax(y[idy+7],x[7]);
1538: y[idy+8] = PetscMax(y[idy+8],x[8]);
1539: x += 9;
1540: }
1541: #else
1542: case MAX_VALUES:
1543: #endif
1544: case NOT_SET_VALUES:
1545: break;
1546: default:
1547: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1548: }
1549: return(0);
1550: }
1552: PETSC_STATIC_INLINE PetscErrorCode Scatter_9(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1553: {
1554: PetscInt i,idx,idy;
1557: switch (addv) {
1558: case INSERT_VALUES:
1559: case INSERT_ALL_VALUES:
1560: for (i=0; i<n; i++) {
1561: idx = *indicesx++;
1562: idy = *indicesy++;
1563: y[idy] = x[idx];
1564: y[idy+1] = x[idx+1];
1565: y[idy+2] = x[idx+2];
1566: y[idy+3] = x[idx+3];
1567: y[idy+4] = x[idx+4];
1568: y[idy+5] = x[idx+5];
1569: y[idy+6] = x[idx+6];
1570: y[idy+7] = x[idx+7];
1571: y[idy+8] = x[idx+8];
1572: }
1573: break;
1574: case ADD_VALUES:
1575: case ADD_ALL_VALUES:
1576: for (i=0; i<n; i++) {
1577: idx = *indicesx++;
1578: idy = *indicesy++;
1579: y[idy] += x[idx];
1580: y[idy+1] += x[idx+1];
1581: y[idy+2] += x[idx+2];
1582: y[idy+3] += x[idx+3];
1583: y[idy+4] += x[idx+4];
1584: y[idy+5] += x[idx+5];
1585: y[idy+6] += x[idx+6];
1586: y[idy+7] += x[idx+7];
1587: y[idy+8] += x[idx+8];
1588: }
1589: break;
1590: #if !defined(PETSC_USE_COMPLEX)
1591: case MAX_VALUES:
1592: for (i=0; i<n; i++) {
1593: idx = *indicesx++;
1594: idy = *indicesy++;
1595: y[idy] = PetscMax(y[idy],x[idx]);
1596: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1597: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1598: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1599: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1600: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1601: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1602: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1603: y[idy+8] = PetscMax(y[idy+8],x[idx+8]);
1604: }
1605: #else
1606: case MAX_VALUES:
1607: #endif
1608: case NOT_SET_VALUES:
1609: break;
1610: default:
1611: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1612: }
1613: return(0);
1614: }
1616: PETSC_STATIC_INLINE void Pack_10(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1617: {
1618: PetscInt i,idx;
1620: for (i=0; i<n; i++) {
1621: idx = *indicesx++;
1622: y[0] = x[idx];
1623: y[1] = x[idx+1];
1624: y[2] = x[idx+2];
1625: y[3] = x[idx+3];
1626: y[4] = x[idx+4];
1627: y[5] = x[idx+5];
1628: y[6] = x[idx+6];
1629: y[7] = x[idx+7];
1630: y[8] = x[idx+8];
1631: y[9] = x[idx+9];
1632: y += 10;
1633: }
1634: }
1636: PETSC_STATIC_INLINE PetscErrorCode UnPack_10(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1637: {
1638: PetscInt i,idy;
1641: switch (addv) {
1642: case INSERT_VALUES:
1643: case INSERT_ALL_VALUES:
1644: for (i=0; i<n; i++) {
1645: idy = *indicesy++;
1646: y[idy] = x[0];
1647: y[idy+1] = x[1];
1648: y[idy+2] = x[2];
1649: y[idy+3] = x[3];
1650: y[idy+4] = x[4];
1651: y[idy+5] = x[5];
1652: y[idy+6] = x[6];
1653: y[idy+7] = x[7];
1654: y[idy+8] = x[8];
1655: y[idy+9] = x[9];
1656: x += 10;
1657: }
1658: break;
1659: case ADD_VALUES:
1660: case ADD_ALL_VALUES:
1661: for (i=0; i<n; i++) {
1662: idy = *indicesy++;
1663: y[idy] += x[0];
1664: y[idy+1] += x[1];
1665: y[idy+2] += x[2];
1666: y[idy+3] += x[3];
1667: y[idy+4] += x[4];
1668: y[idy+5] += x[5];
1669: y[idy+6] += x[6];
1670: y[idy+7] += x[7];
1671: y[idy+8] += x[8];
1672: y[idy+9] += x[9];
1673: x += 10;
1674: }
1675: break;
1676: #if !defined(PETSC_USE_COMPLEX)
1677: case MAX_VALUES:
1678: for (i=0; i<n; i++) {
1679: idy = *indicesy++;
1680: y[idy] = PetscMax(y[idy],x[0]);
1681: y[idy+1] = PetscMax(y[idy+1],x[1]);
1682: y[idy+2] = PetscMax(y[idy+2],x[2]);
1683: y[idy+3] = PetscMax(y[idy+3],x[3]);
1684: y[idy+4] = PetscMax(y[idy+4],x[4]);
1685: y[idy+5] = PetscMax(y[idy+5],x[5]);
1686: y[idy+6] = PetscMax(y[idy+6],x[6]);
1687: y[idy+7] = PetscMax(y[idy+7],x[7]);
1688: y[idy+8] = PetscMax(y[idy+8],x[8]);
1689: y[idy+9] = PetscMax(y[idy+9],x[9]);
1690: x += 10;
1691: }
1692: #else
1693: case MAX_VALUES:
1694: #endif
1695: case NOT_SET_VALUES:
1696: break;
1697: default:
1698: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1699: }
1700: return(0);
1701: }
1703: PETSC_STATIC_INLINE PetscErrorCode Scatter_10(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1704: {
1705: PetscInt i,idx,idy;
1708: switch (addv) {
1709: case INSERT_VALUES:
1710: case INSERT_ALL_VALUES:
1711: for (i=0; i<n; i++) {
1712: idx = *indicesx++;
1713: idy = *indicesy++;
1714: y[idy] = x[idx];
1715: y[idy+1] = x[idx+1];
1716: y[idy+2] = x[idx+2];
1717: y[idy+3] = x[idx+3];
1718: y[idy+4] = x[idx+4];
1719: y[idy+5] = x[idx+5];
1720: y[idy+6] = x[idx+6];
1721: y[idy+7] = x[idx+7];
1722: y[idy+8] = x[idx+8];
1723: y[idy+9] = x[idx+9];
1724: }
1725: break;
1726: case ADD_VALUES:
1727: case ADD_ALL_VALUES:
1728: for (i=0; i<n; i++) {
1729: idx = *indicesx++;
1730: idy = *indicesy++;
1731: y[idy] += x[idx];
1732: y[idy+1] += x[idx+1];
1733: y[idy+2] += x[idx+2];
1734: y[idy+3] += x[idx+3];
1735: y[idy+4] += x[idx+4];
1736: y[idy+5] += x[idx+5];
1737: y[idy+6] += x[idx+6];
1738: y[idy+7] += x[idx+7];
1739: y[idy+8] += x[idx+8];
1740: y[idy+9] += x[idx+9];
1741: }
1742: break;
1743: #if !defined(PETSC_USE_COMPLEX)
1744: case MAX_VALUES:
1745: for (i=0; i<n; i++) {
1746: idx = *indicesx++;
1747: idy = *indicesy++;
1748: y[idy] = PetscMax(y[idy],x[idx]);
1749: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1750: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1751: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1752: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1753: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1754: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1755: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1756: y[idy+8] = PetscMax(y[idy+8],x[idx+8]);
1757: y[idy+9] = PetscMax(y[idy+9],x[idx+9]);
1758: }
1759: #else
1760: case MAX_VALUES:
1761: #endif
1762: case NOT_SET_VALUES:
1763: break;
1764: default:
1765: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1766: }
1767: return(0);
1768: }
1770: PETSC_STATIC_INLINE void Pack_11(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1771: {
1772: PetscInt i,idx;
1774: for (i=0; i<n; i++) {
1775: idx = *indicesx++;
1776: y[0] = x[idx];
1777: y[1] = x[idx+1];
1778: y[2] = x[idx+2];
1779: y[3] = x[idx+3];
1780: y[4] = x[idx+4];
1781: y[5] = x[idx+5];
1782: y[6] = x[idx+6];
1783: y[7] = x[idx+7];
1784: y[8] = x[idx+8];
1785: y[9] = x[idx+9];
1786: y[10] = x[idx+10];
1787: y += 11;
1788: }
1789: }
1791: PETSC_STATIC_INLINE PetscErrorCode UnPack_11(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1792: {
1793: PetscInt i,idy;
1796: switch (addv) {
1797: case INSERT_VALUES:
1798: case INSERT_ALL_VALUES:
1799: for (i=0; i<n; i++) {
1800: idy = *indicesy++;
1801: y[idy] = x[0];
1802: y[idy+1] = x[1];
1803: y[idy+2] = x[2];
1804: y[idy+3] = x[3];
1805: y[idy+4] = x[4];
1806: y[idy+5] = x[5];
1807: y[idy+6] = x[6];
1808: y[idy+7] = x[7];
1809: y[idy+8] = x[8];
1810: y[idy+9] = x[9];
1811: y[idy+10] = x[10];
1812: x += 11;
1813: }
1814: break;
1815: case ADD_VALUES:
1816: case ADD_ALL_VALUES:
1817: for (i=0; i<n; i++) {
1818: idy = *indicesy++;
1819: y[idy] += x[0];
1820: y[idy+1] += x[1];
1821: y[idy+2] += x[2];
1822: y[idy+3] += x[3];
1823: y[idy+4] += x[4];
1824: y[idy+5] += x[5];
1825: y[idy+6] += x[6];
1826: y[idy+7] += x[7];
1827: y[idy+8] += x[8];
1828: y[idy+9] += x[9];
1829: y[idy+10] += x[10];
1830: x += 11;
1831: }
1832: break;
1833: #if !defined(PETSC_USE_COMPLEX)
1834: case MAX_VALUES:
1835: for (i=0; i<n; i++) {
1836: idy = *indicesy++;
1837: y[idy] = PetscMax(y[idy],x[0]);
1838: y[idy+1] = PetscMax(y[idy+1],x[1]);
1839: y[idy+2] = PetscMax(y[idy+2],x[2]);
1840: y[idy+3] = PetscMax(y[idy+3],x[3]);
1841: y[idy+4] = PetscMax(y[idy+4],x[4]);
1842: y[idy+5] = PetscMax(y[idy+5],x[5]);
1843: y[idy+6] = PetscMax(y[idy+6],x[6]);
1844: y[idy+7] = PetscMax(y[idy+7],x[7]);
1845: y[idy+8] = PetscMax(y[idy+8],x[8]);
1846: y[idy+9] = PetscMax(y[idy+9],x[9]);
1847: y[idy+10] = PetscMax(y[idy+10],x[10]);
1848: x += 11;
1849: }
1850: #else
1851: case MAX_VALUES:
1852: #endif
1853: case NOT_SET_VALUES:
1854: break;
1855: default:
1856: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1857: }
1858: return(0);
1859: }
1861: PETSC_STATIC_INLINE PetscErrorCode Scatter_11(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1862: {
1863: PetscInt i,idx,idy;
1866: switch (addv) {
1867: case INSERT_VALUES:
1868: case INSERT_ALL_VALUES:
1869: for (i=0; i<n; i++) {
1870: idx = *indicesx++;
1871: idy = *indicesy++;
1872: y[idy] = x[idx];
1873: y[idy+1] = x[idx+1];
1874: y[idy+2] = x[idx+2];
1875: y[idy+3] = x[idx+3];
1876: y[idy+4] = x[idx+4];
1877: y[idy+5] = x[idx+5];
1878: y[idy+6] = x[idx+6];
1879: y[idy+7] = x[idx+7];
1880: y[idy+8] = x[idx+8];
1881: y[idy+9] = x[idx+9];
1882: y[idy+10] = x[idx+10];
1883: }
1884: break;
1885: case ADD_VALUES:
1886: case ADD_ALL_VALUES:
1887: for (i=0; i<n; i++) {
1888: idx = *indicesx++;
1889: idy = *indicesy++;
1890: y[idy] += x[idx];
1891: y[idy+1] += x[idx+1];
1892: y[idy+2] += x[idx+2];
1893: y[idy+3] += x[idx+3];
1894: y[idy+4] += x[idx+4];
1895: y[idy+5] += x[idx+5];
1896: y[idy+6] += x[idx+6];
1897: y[idy+7] += x[idx+7];
1898: y[idy+8] += x[idx+8];
1899: y[idy+9] += x[idx+9];
1900: y[idy+10] += x[idx+10];
1901: }
1902: break;
1903: #if !defined(PETSC_USE_COMPLEX)
1904: case MAX_VALUES:
1905: for (i=0; i<n; i++) {
1906: idx = *indicesx++;
1907: idy = *indicesy++;
1908: y[idy] = PetscMax(y[idy],x[idx]);
1909: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1910: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1911: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1912: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1913: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1914: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1915: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1916: y[idy+8] = PetscMax(y[idy+8],x[idx+8]);
1917: y[idy+9] = PetscMax(y[idy+9],x[idx+9]);
1918: y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1919: }
1920: #else
1921: case MAX_VALUES:
1922: #endif
1923: case NOT_SET_VALUES:
1924: break;
1925: default:
1926: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1927: }
1928: return(0);
1929: }
1931: /* ----------------------------------------------------------------------------------------------- */
1932: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1933: {
1934: PetscInt i,idx;
1936: for (i=0; i<n; i++) {
1937: idx = *indicesx++;
1938: y[0] = x[idx];
1939: y[1] = x[idx+1];
1940: y[2] = x[idx+2];
1941: y[3] = x[idx+3];
1942: y[4] = x[idx+4];
1943: y[5] = x[idx+5];
1944: y[6] = x[idx+6];
1945: y[7] = x[idx+7];
1946: y[8] = x[idx+8];
1947: y[9] = x[idx+9];
1948: y[10] = x[idx+10];
1949: y[11] = x[idx+11];
1950: y += 12;
1951: }
1952: }
1954: PETSC_STATIC_INLINE PetscErrorCode UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1955: {
1956: PetscInt i,idy;
1959: switch (addv) {
1960: case INSERT_VALUES:
1961: case INSERT_ALL_VALUES:
1962: for (i=0; i<n; i++) {
1963: idy = *indicesy++;
1964: y[idy] = x[0];
1965: y[idy+1] = x[1];
1966: y[idy+2] = x[2];
1967: y[idy+3] = x[3];
1968: y[idy+4] = x[4];
1969: y[idy+5] = x[5];
1970: y[idy+6] = x[6];
1971: y[idy+7] = x[7];
1972: y[idy+8] = x[8];
1973: y[idy+9] = x[9];
1974: y[idy+10] = x[10];
1975: y[idy+11] = x[11];
1976: x += 12;
1977: }
1978: break;
1979: case ADD_VALUES:
1980: case ADD_ALL_VALUES:
1981: for (i=0; i<n; i++) {
1982: idy = *indicesy++;
1983: y[idy] += x[0];
1984: y[idy+1] += x[1];
1985: y[idy+2] += x[2];
1986: y[idy+3] += x[3];
1987: y[idy+4] += x[4];
1988: y[idy+5] += x[5];
1989: y[idy+6] += x[6];
1990: y[idy+7] += x[7];
1991: y[idy+8] += x[8];
1992: y[idy+9] += x[9];
1993: y[idy+10] += x[10];
1994: y[idy+11] += x[11];
1995: x += 12;
1996: }
1997: break;
1998: #if !defined(PETSC_USE_COMPLEX)
1999: case MAX_VALUES:
2000: for (i=0; i<n; i++) {
2001: idy = *indicesy++;
2002: y[idy] = PetscMax(y[idy],x[0]);
2003: y[idy+1] = PetscMax(y[idy+1],x[1]);
2004: y[idy+2] = PetscMax(y[idy+2],x[2]);
2005: y[idy+3] = PetscMax(y[idy+3],x[3]);
2006: y[idy+4] = PetscMax(y[idy+4],x[4]);
2007: y[idy+5] = PetscMax(y[idy+5],x[5]);
2008: y[idy+6] = PetscMax(y[idy+6],x[6]);
2009: y[idy+7] = PetscMax(y[idy+7],x[7]);
2010: y[idy+8] = PetscMax(y[idy+8],x[8]);
2011: y[idy+9] = PetscMax(y[idy+9],x[9]);
2012: y[idy+10] = PetscMax(y[idy+10],x[10]);
2013: y[idy+11] = PetscMax(y[idy+11],x[11]);
2014: x += 12;
2015: }
2016: #else
2017: case MAX_VALUES:
2018: #endif
2019: case NOT_SET_VALUES:
2020: break;
2021: default:
2022: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
2023: }
2024: return(0);
2025: }
2027: PETSC_STATIC_INLINE PetscErrorCode Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
2028: {
2029: PetscInt i,idx,idy;
2032: switch (addv) {
2033: case INSERT_VALUES:
2034: case INSERT_ALL_VALUES:
2035: for (i=0; i<n; i++) {
2036: idx = *indicesx++;
2037: idy = *indicesy++;
2038: y[idy] = x[idx];
2039: y[idy+1] = x[idx+1];
2040: y[idy+2] = x[idx+2];
2041: y[idy+3] = x[idx+3];
2042: y[idy+4] = x[idx+4];
2043: y[idy+5] = x[idx+5];
2044: y[idy+6] = x[idx+6];
2045: y[idy+7] = x[idx+7];
2046: y[idy+8] = x[idx+8];
2047: y[idy+9] = x[idx+9];
2048: y[idy+10] = x[idx+10];
2049: y[idy+11] = x[idx+11];
2050: }
2051: break;
2052: case ADD_VALUES:
2053: case ADD_ALL_VALUES:
2054: for (i=0; i<n; i++) {
2055: idx = *indicesx++;
2056: idy = *indicesy++;
2057: y[idy] += x[idx];
2058: y[idy+1] += x[idx+1];
2059: y[idy+2] += x[idx+2];
2060: y[idy+3] += x[idx+3];
2061: y[idy+4] += x[idx+4];
2062: y[idy+5] += x[idx+5];
2063: y[idy+6] += x[idx+6];
2064: y[idy+7] += x[idx+7];
2065: y[idy+8] += x[idx+8];
2066: y[idy+9] += x[idx+9];
2067: y[idy+10] += x[idx+10];
2068: y[idy+11] += x[idx+11];
2069: }
2070: break;
2071: #if !defined(PETSC_USE_COMPLEX)
2072: case MAX_VALUES:
2073: for (i=0; i<n; i++) {
2074: idx = *indicesx++;
2075: idy = *indicesy++;
2076: y[idy] = PetscMax(y[idy],x[idx]);
2077: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
2078: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
2079: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
2080: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
2081: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
2082: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
2083: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
2084: y[idy+8] = PetscMax(y[idy+8],x[idx+8]);
2085: y[idy+9] = PetscMax(y[idy+9],x[idx+9]);
2086: y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
2087: y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
2088: }
2089: #else
2090: case MAX_VALUES:
2091: #endif
2092: case NOT_SET_VALUES:
2093: break;
2094: default:
2095: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
2096: }
2097: return(0);
2098: }
2100: /* ----------------------------------------------------------------------------------------------- */
2101: PETSC_STATIC_INLINE void Pack_bs(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
2102: {
2103: PetscInt i,idx;
2105: for (i=0; i<n; i++) {
2106: idx = *indicesx++;
2107: PetscMemcpy(y,x + idx,bs*sizeof(PetscScalar));
2108: y += bs;
2109: }
2110: }
2112: PETSC_STATIC_INLINE PetscErrorCode UnPack_bs(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
2113: {
2114: PetscInt i,idy,j;
2117: switch (addv) {
2118: case INSERT_VALUES:
2119: case INSERT_ALL_VALUES:
2120: for (i=0; i<n; i++) {
2121: idy = *indicesy++;
2122: PetscMemcpy(y + idy,x,bs*sizeof(PetscScalar));
2123: x += bs;
2124: }
2125: break;
2126: case ADD_VALUES:
2127: case ADD_ALL_VALUES:
2128: for (i=0; i<n; i++) {
2129: idy = *indicesy++;
2130: for (j=0; j<bs; j++) y[idy+j] += x[j];
2131: x += bs;
2132: }
2133: break;
2134: #if !defined(PETSC_USE_COMPLEX)
2135: case MAX_VALUES:
2136: for (i=0; i<n; i++) {
2137: idy = *indicesy++;
2138: for (j=0; j<bs; j++) y[idy+j] = PetscMax(y[idy+j],x[j]);
2139: x += bs;
2140: }
2141: #else
2142: case MAX_VALUES:
2143: #endif
2144: case NOT_SET_VALUES:
2145: break;
2146: default:
2147: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
2148: }
2149: return(0);
2150: }
2152: PETSC_STATIC_INLINE PetscErrorCode Scatter_bs(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
2153: {
2154: PetscInt i,idx,idy,j;
2157: switch (addv) {
2158: case INSERT_VALUES:
2159: case INSERT_ALL_VALUES:
2160: for (i=0; i<n; i++) {
2161: idx = *indicesx++;
2162: idy = *indicesy++;
2163: PetscMemcpy(y + idy, x + idx,bs*sizeof(PetscScalar));
2164: }
2165: break;
2166: case ADD_VALUES:
2167: case ADD_ALL_VALUES:
2168: for (i=0; i<n; i++) {
2169: idx = *indicesx++;
2170: idy = *indicesy++;
2171: for (j=0; j<bs; j++ ) y[idy+j] += x[idx+j];
2172: }
2173: break;
2174: #if !defined(PETSC_USE_COMPLEX)
2175: case MAX_VALUES:
2176: for (i=0; i<n; i++) {
2177: idx = *indicesx++;
2178: idy = *indicesy++;
2179: for (j=0; j<bs; j++ ) y[idy+j] = PetscMax(y[idy+j],x[idx+j]);
2180: }
2181: #else
2182: case MAX_VALUES:
2183: #endif
2184: case NOT_SET_VALUES:
2185: break;
2186: default:
2187: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
2188: }
2189: return(0);
2190: }
2192: /* Create the VecScatterBegin/End_P for our chosen block sizes */
2193: #define BS 1
2194: #include <../src/vec/vscat/impls/vpscat.h>
2195: #define BS 2
2196: #include <../src/vec/vscat/impls/vpscat.h>
2197: #define BS 3
2198: #include <../src/vec/vscat/impls/vpscat.h>
2199: #define BS 4
2200: #include <../src/vec/vscat/impls/vpscat.h>
2201: #define BS 5
2202: #include <../src/vec/vscat/impls/vpscat.h>
2203: #define BS 6
2204: #include <../src/vec/vscat/impls/vpscat.h>
2205: #define BS 7
2206: #include <../src/vec/vscat/impls/vpscat.h>
2207: #define BS 8
2208: #include <../src/vec/vscat/impls/vpscat.h>
2209: #define BS 9
2210: #include <../src/vec/vscat/impls/vpscat.h>
2211: #define BS 10
2212: #include <../src/vec/vscat/impls/vpscat.h>
2213: #define BS 11
2214: #include <../src/vec/vscat/impls/vpscat.h>
2215: #define BS 12
2216: #include <../src/vec/vscat/impls/vpscat.h>
2217: #define BS bs
2218: #include <../src/vec/vscat/impls/vpscat.h>
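/*
   The define/include pairs above instantiate one VecScatterBegin/End (and MPI3Node) pair per block
   size by re-reading the same header with a different compile-time BS each time. A minimal sketch
   of that include-with-macro pattern follows; the header name kernel_impl.h and the macro KERNEL
   are hypothetical illustrations, not the actual contents of vpscat.h.

       In kernel_impl.h:
           static void KERNEL(PetscInt n,const PetscScalar *x,PetscScalar *y)
           {
             PetscInt i,k;
             for (i=0; i<n; i++) for (k=0; k<BS; k++) y[i*BS+k] = x[i*BS+k];
           }
           #undef BS
           #undef KERNEL

       In the consumer:
           #define BS     2
           #define KERNEL Copy_2
           #include "kernel_impl.h"
           #define BS     3
           #define KERNEL Copy_3
           #include "kernel_impl.h"

   Because BS is a compile-time constant in each instantiation, the compiler can unroll the inner
   loop, which is why VecScatterCreateCommon_PtoS_MPI3() below dispatches to a dedicated routine
   for each common block size and falls back to the runtime-bs variants otherwise.
*/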
2220: /* ==========================================================================================*/
2221: /* create parallel to sequential scatter context */
2223: extern PetscErrorCode VecScatterCreateCommon_PtoS_MPI3(VecScatter_MPI_General*,VecScatter_MPI_General*,VecScatter);
2225: /*
2226: bs indicates how many elements there are in each block; normally this is 1.
2228: This routine also checks that PetscMPIInt can handle the sizes needed.
2229: */
2230: #include <petscbt.h>
2231: PetscErrorCode VecScatterCreateLocal_PtoS_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2232: {
2233: VecScatter_MPI_General *from,*to;
2234: PetscMPIInt size,rank,mrank,imdex,tag,n;
2235: PetscInt *source = NULL,*owners = NULL,nxr;
2236: PetscInt *lowner = NULL,*start = NULL,*lsharedowner = NULL,*sharedstart = NULL,lengthy,lengthx;
2237: PetscMPIInt *nprocs = NULL,nrecvs,nrecvshared;
2238: PetscInt i,j,idx = 0,nsends;
2239: PetscMPIInt *owner = NULL;
2240: PetscInt *starts = NULL,count,slen,slenshared;
2241: PetscInt *rvalues,*svalues,base,*values,nprocslocal,recvtotal,*rsvalues;
2242: PetscMPIInt *onodes1,*olengths1;
2243: MPI_Comm comm;
2244: MPI_Request *send_waits = NULL,*recv_waits = NULL;
2245: MPI_Status recv_status,*send_status;
2246: PetscErrorCode ierr;
2247: PetscCommShared scomm;
2248: PetscMPIInt jj;
2249: MPI_Info info;
2250: MPI_Comm mscomm;
2251: MPI_Win sharedoffsetwin; /* Window that owns sharedspaceoffset */
2252: PetscInt *sharedspaceoffset;
2253: VecScatterType type;
2254: PetscBool mpi3node;
2256: PetscObjectGetNewTag((PetscObject)ctx,&tag);
2257: PetscObjectGetComm((PetscObject)xin,&comm);
2258: MPI_Comm_rank(comm,&rank);
2259: MPI_Comm_size(comm,&size);
2261: VecScatterGetType(ctx,&type);
2262: PetscStrcmp(type,"mpi3node",&mpi3node);
2264: PetscCommSharedGet(comm,&scomm);
2265: PetscCommSharedGetComm(scomm,&mscomm);
2267: MPI_Info_create(&info);
2268: MPI_Info_set(info, "alloc_shared_noncontig", "true");
2270: if (mpi3node) {
2271: /* Check if (parallel) inidx has duplicate indices.
2272: Current VecScatterEndMPI3Node() (case StoP) writes the sequential vector to the parallel vector.
2273: Writing to the same shared location without variable locking
2274: leads to incorrect scattering. See src/vec/vscat/examples/runex2_5 and runex3_5 */
2276: PetscInt *mem,**optr;
2277: MPI_Win swin;
2278: PetscMPIInt msize;
2280: MPI_Comm_size(mscomm,&msize);
2281: MPI_Comm_rank(mscomm,&mrank);
2283: PetscMalloc1(msize,&optr);
2284: MPIU_Win_allocate_shared((nx+1)*sizeof(PetscInt),sizeof(PetscInt),MPI_INFO_NULL,mscomm,&mem,&swin);
2286: /* Write local nx and inidx into mem */
2287: mem[0] = nx;
2288: for (i=1; i<=nx; i++) mem[i] = inidx[i-1];
2289: MPI_Barrier(mscomm); /* sync shared memory */
2291: if (!mrank) {
2292: PetscBT table;
2293: /* sz and dsp_unit are not used here, but replacing them with NULL would cause MPI_Win_shared_query() to crash */
2294: MPI_Aint sz;
2295: PetscMPIInt dsp_unit;
2296: PetscInt N = xin->map->N;
2298: PetscBTCreate(N,&table); /* may not be scalable */
2299: PetscBTMemzero(N,table);
2301: jj = 0;
2302: while (jj<msize && !ctx->is_duplicate) {
2303: if (jj == mrank) {jj++; continue;}
2304: MPIU_Win_shared_query(swin,jj,&sz,&dsp_unit,&optr[jj]);
2305: for (j=1; j<=optr[jj][0]; j++) { /* optr[jj][0] = nx */
2306: if (PetscBTLookupSet(table,optr[jj][j])) {
2307: ctx->is_duplicate = PETSC_TRUE; /* only mrank=0 has this information; it is checked in VecScatterEndMPI3Node() */
2308: break;
2309: }
2310: }
2311: jj++;
2312: }
2313: PetscBTDestroy(&table);
2314: }
2316: if (swin != MPI_WIN_NULL) {MPI_Win_free(&swin);}
2317: PetscFree(optr);
2318: }
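/*
   The duplicate check above uses PetscBT, PETSc's packed bit array: PetscBTLookupSet(table,k)
   returns the previous value of bit k and sets it, so the first occurrence of an index returns 0
   and any repeat returns nonzero. A minimal serial sketch of the same idea (hypothetical usage;
   the real check above additionally walks the inidx arrays of the node partners through the
   shared window):

       PetscBT   seen;
       PetscBool dup = PETSC_FALSE;
       PetscInt  k;

       PetscBTCreate(N,&seen);
       PetscBTMemzero(N,seen);
       for (k=0; k<nx; k++) {
         if (PetscBTLookupSet(seen,inidx[k])) {dup = PETSC_TRUE; break;}
       }
       PetscBTDestroy(&seen);
*/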
2320: owners = xin->map->range;
2321: VecGetSize(yin,&lengthy);
2322: VecGetSize(xin,&lengthx);
2324: /* first count number of contributors to each processor */
2325: PetscMalloc2(size,&nprocs,nx,&owner);
2326: PetscMemzero(nprocs,size*sizeof(PetscMPIInt));
2328: j = 0;
2329: nsends = 0;
2330: for (i=0; i<nx; i++) {
2331: PetscIntMultError(bs,inidx[i],&idx);
2332: if (idx < owners[j]) j = 0;
2333: for (; j<size; j++) {
2334: if (idx < owners[j+1]) {
2335: if (!nprocs[j]++) nsends++;
2336: owner[i] = j;
2337: break;
2338: }
2339: }
2340: if (j == size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"ith %D block entry %D not owned by any process, upper bound %D",i,idx,owners[size]);
2341: }
2343: nprocslocal = nprocs[rank];
2344: nprocs[rank] = 0;
2345: if (nprocslocal) nsends--;
2346: /* inform other processors of number of messages and max length*/
2347: PetscGatherNumberOfMessages(comm,NULL,nprocs,&nrecvs);
2348: PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
2349: PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
2350: recvtotal = 0;
2351: for (i=0; i<nrecvs; i++) {
2352: PetscIntSumError(recvtotal,olengths1[i],&recvtotal);
2353: }
2355: /* post receives: */
2356: PetscMalloc3(recvtotal,&rvalues,nrecvs,&source,nrecvs,&recv_waits);
2357: count = 0;
2358: for (i=0; i<nrecvs; i++) {
2359: MPI_Irecv((rvalues+count),olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
2360: count += olengths1[i];
2361: }
2363: /* do sends:
2364: 1) starts[i] gives the starting index in svalues for stuff going to
2365: the ith processor
2366: */
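/*
   Worked example of the starts[] construction that follows: with size = 3 and nprocs = {2,0,3}
   the exclusive prefix sum gives starts = {0,2,2} (and starts[3] = 5 once the second pass adds
   the starts[size] entry), so the two indices owned by rank 0 land in svalues[0..1] and the three
   owned by rank 2 land in svalues[2..4]. The first pass uses starts[owner[i]]++ as a moving
   cursor, which is why starts[] has to be rebuilt before the sends are posted.
*/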
2367: nxr = 0;
2368: for (i=0; i<nx; i++) {
2369: if (owner[i] != rank) nxr++;
2370: }
2371: PetscMalloc3(nxr,&svalues,nsends,&send_waits,size+1,&starts);
2373: starts[0] = 0;
2374: for (i=1; i<size; i++) starts[i] = starts[i-1] + nprocs[i-1];
2375: for (i=0; i<nx; i++) {
2376: if (owner[i] != rank) svalues[starts[owner[i]]++] = bs*inidx[i];
2377: }
2378: starts[0] = 0;
2379: for (i=1; i<size+1; i++) starts[i] = starts[i-1] + nprocs[i-1];
2380: count = 0;
2381: for (i=0; i<size; i++) {
2382: if (nprocs[i]) {
2383: MPI_Isend(svalues+starts[i],nprocs[i],MPIU_INT,i,tag,comm,send_waits+count++);
2384: }
2385: }
2387: /* wait on receives; this is everyone who we need to deliver data to */
2388: count = nrecvs;
2389: slen = 0;
2390: nrecvshared = 0;
2391: slenshared = 0;
2392: while (count) {
2393: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2394: /* unpack receives into our local space */
2395: MPI_Get_count(&recv_status,MPIU_INT,&n);
2396: PetscCommSharedGlobalToLocal(scomm,onodes1[imdex],&jj);
2397: if (jj > -1) {
2398: PetscInfo3(NULL,"[%d] Sending values to shared memory partner %d,global rank %d\n",rank,jj,onodes1[imdex]);
2399: nrecvshared++;
2400: slenshared += n;
2401: }
2402: slen += n;
2403: count--;
2404: }
2406: if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not as expected %D",slen,recvtotal);
2408: /* allocate entire send scatter context */
2409: PetscNewLog(ctx,&to);
2410: to->sharedwin = MPI_WIN_NULL;
2411: to->n = nrecvs-nrecvshared;
2413: PetscMalloc1(nrecvs-nrecvshared,&to->requests);
2414: PetscMalloc4(bs*(slen-slenshared),&to->values,slen-slenshared,&to->indices,nrecvs-nrecvshared+1,&to->starts,nrecvs-nrecvshared,&to->procs);
2415: PetscMalloc2(PetscMax(to->n,nsends),&to->sstatus,PetscMax(to->n,nsends),&to->rstatus);
2417: MPI_Comm_size(mscomm,&to->msize);
2418: PetscMalloc1(slenshared,&to->sharedspaceindices);
2419: PetscCalloc1(to->msize+1,&to->sharedspacestarts);
2421: ctx->todata = (void*)to;
2422: to->starts[0] = 0;
2423: to->sharedspacestarts[0] = 0;
2425: /* Allocate shared memory space for shared memory partner communication */
2426: MPIU_Win_allocate_shared(to->msize*sizeof(PetscInt),sizeof(PetscInt),info,mscomm,&sharedspaceoffset,&sharedoffsetwin);
2427: for (i=0; i<to->msize; i++) sharedspaceoffset[i] = -1; /* mark with -1 for later error checking */
2428: if (nrecvs) {
2429: /* move the data into the send scatter */
2430: base = owners[rank];
2431: rsvalues = rvalues;
2432: to->n = 0;
2433: for (i=0; i<nrecvs; i++) {
2434: values = rsvalues;
2435: PetscCommSharedGlobalToLocal(scomm,(PetscMPIInt)onodes1[i],&jj);
2436: if (jj > -1) {
2437: to->sharedspacestarts[jj] = to->sharedcnt;
2438: to->sharedspacestarts[jj+1] = to->sharedcnt + olengths1[i];
2439: for (j=0; j<olengths1[i]; j++) to->sharedspaceindices[to->sharedcnt + j] = values[j] - base;
2440: sharedspaceoffset[jj] = to->sharedcnt;
2441: to->sharedcnt += olengths1[i];
2442: PetscInfo4(NULL,"[%d] Sending %d values to shared memory partner %d,global rank %d\n",rank,olengths1[i],jj,onodes1[i]);
2443: } else {
2444: to->starts[to->n+1] = to->starts[to->n] + olengths1[i];
2445: to->procs[to->n] = onodes1[i];
2446: for (j=0; j<olengths1[i]; j++) to->indices[to->starts[to->n] + j] = values[j] - base;
2447: to->n++;
2448: }
2449: rsvalues += olengths1[i];
2450: }
2451: }
2452: PetscFree(olengths1);
2453: PetscFree(onodes1);
2454: PetscFree3(rvalues,source,recv_waits);
2456: if (mpi3node) {
2457: /* to->sharedspace is used only as a flag */
2458: i = 0;
2459: if (to->sharedcnt) i = 1;
2460: MPIU_Win_allocate_shared(i*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&to->sharedspace,&to->sharedwin);
2461: } else { /* mpi3 */
2462: MPIU_Win_allocate_shared(bs*to->sharedcnt*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&to->sharedspace,&to->sharedwin);
2463: }
2464: if (to->sharedwin == MPI_WIN_NULL) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"MPI_Win_allocate_shared() did not return a valid window");
2465: MPI_Info_free(&info);
2467: /* allocate entire receive scatter context */
2468: PetscNewLog(ctx,&from);
2469: from->sharedwin = MPI_WIN_NULL;
2470: from->msize = to->msize;
2471: /* we don't actually know the correct value for from->n at this point so we use the upper bound */
2472: from->n = nsends;
2474: PetscMalloc1(nsends,&from->requests);
2475: PetscMalloc4((ny-nprocslocal)*bs,&from->values,ny-nprocslocal,&from->indices,nsends+1,&from->starts,from->n,&from->procs);
2476: ctx->fromdata = (void*)from;
2478: PetscCalloc1(to->msize+1,&from->sharedspacestarts);
2479: /* move data into receive scatter */
2480: PetscMalloc2(size,&lowner,nsends+1,&start);
2481: PetscMalloc2(size,&lsharedowner,to->msize+1,&sharedstart);
2482: count = 0; from->starts[0] = start[0] = 0;
2483: from->sharedcnt = 0;
2484: for (i=0; i<size; i++) {
2485: lsharedowner[i] = -1;
2486: lowner[i] = -1;
2487: if (nprocs[i]) {
2488: PetscCommSharedGlobalToLocal(scomm,i,&jj);
2489: if (jj > -1) {
2490: from->sharedspacestarts[jj] = sharedstart[jj] = from->sharedcnt;
2491: from->sharedspacestarts[jj+1] = sharedstart[jj+1] = from->sharedspacestarts[jj] + nprocs[i];
2492: from->sharedcnt += nprocs[i];
2493: lsharedowner[i] = jj;
2494: } else {
2495: lowner[i] = count++;
2496: from->procs[count-1] = i;
2497: from->starts[count] = start[count] = start[count-1] + nprocs[i];
2498: }
2499: }
2500: }
2501: from->n = count;
2503: for (i=0; i<nx; i++) {
2504: if (owner[i] != rank && lowner[owner[i]] > -1) {
2505: from->indices[start[lowner[owner[i]]]++] = bs*inidy[i];
2506: if (bs*inidy[i] >= lengthy) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2507: }
2508: }
2510: /* copy over appropriate parts of inidy[] into from->sharedspaceindices */
2511: if (mpi3node) {
2512: /* in addition, copy over appropriate parts of inidx[] into 2nd part of from->sharedspaceindices */
2513: PetscInt *sharedspaceindicesx;
2514: PetscMalloc1(2*from->sharedcnt,&from->sharedspaceindices);
2515: sharedspaceindicesx = from->sharedspaceindices + from->sharedcnt;
2516: for (i=0; i<nx; i++) {
2517: if (owner[i] != rank && lsharedowner[owner[i]] > -1) {
2518: if (sharedstart[lsharedowner[owner[i]]] > from->sharedcnt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Shared space index exceeds the shared space count");
2520: from->sharedspaceindices[sharedstart[lsharedowner[owner[i]]]] = bs*inidy[i];
2521: sharedspaceindicesx[sharedstart[lsharedowner[owner[i]]]] = bs*inidx[i]-owners[owner[i]]; /* local inidx */
2522: sharedstart[lsharedowner[owner[i]]]++;
2523: }
2524: }
2525: } else { /* mpi3 */
2526: PetscMalloc1(from->sharedcnt,&from->sharedspaceindices);
2527: for (i=0; i<nx; i++) {
2528: if (owner[i] != rank && lsharedowner[owner[i]] > -1) {
2529: if (sharedstart[lsharedowner[owner[i]]] > from->sharedcnt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Shared space index exceeds the shared space count");
2530: from->sharedspaceindices[sharedstart[lsharedowner[owner[i]]]++] = bs*inidy[i];
2531: }
2532: }
2533: }
2535: PetscFree2(lowner,start);
2536: PetscFree2(lsharedowner,sharedstart);
2537: PetscFree2(nprocs,owner);
2539: /* wait on sends */
2540: if (nsends) {
2541: PetscMalloc1(nsends,&send_status);
2542: MPI_Waitall(nsends,send_waits,send_status);
2543: PetscFree(send_status);
2544: }
2545: PetscFree3(svalues,send_waits,starts);
2547: if (nprocslocal) {
2548: PetscInt nt = from->local.n = to->local.n = nprocslocal;
2549: /* we have a scatter to ourselves */
2550: PetscMalloc1(nt,&to->local.vslots);
2551: PetscMalloc1(nt,&from->local.vslots);
2552: nt = 0;
2553: for (i=0; i<nx; i++) {
2554: idx = bs*inidx[i];
2555: if (idx >= owners[rank] && idx < owners[rank+1]) {
2556: to->local.vslots[nt] = idx - owners[rank];
2557: from->local.vslots[nt++] = bs*inidy[i];
2558: if (bs*inidy[i] >= lengthy) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2559: }
2560: }
2561: PetscLogObjectMemory((PetscObject)ctx,2*nt*sizeof(PetscInt));
2562: } else {
2563: from->local.n = 0;
2564: from->local.vslots = 0;
2565: to->local.n = 0;
2566: to->local.vslots = 0;
2567: }
2569: /* Get the shared memory address for all processes we will be copying data from */
2570: PetscCalloc1(to->msize,&from->sharedspaces);
2571: PetscCalloc1(to->msize,&from->sharedspacesoffset);
2572: MPI_Comm_rank(mscomm,&mrank);
2573: for (jj=0; jj<to->msize; jj++) {
2574: MPI_Aint isize;
2575: PetscMPIInt disp_unit;
2576: PetscInt *ptr;
2577: MPIU_Win_shared_query(to->sharedwin,jj,&isize,&disp_unit,&from->sharedspaces[jj]);
2578: MPIU_Win_shared_query(sharedoffsetwin,jj,&isize,&disp_unit,&ptr);
2579: from->sharedspacesoffset[jj] = ptr[mrank];
2580: }
2581: MPI_Win_free(&sharedoffsetwin);
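/*
   The loop above retrieves, for every shared-memory partner, the base address of its node-shared
   allocation via MPI_Win_shared_query(). A minimal standalone sketch of the underlying MPI-3
   shared-memory pattern, independent of the PETSc wrappers (all names and sizes here are
   illustrative): split off a node-local communicator, allocate a shared segment, then query
   another rank's base pointer.

       MPI_Comm     nodecomm;
       MPI_Win      win;
       PetscScalar *mine,*rank0s;
       MPI_Aint     sz;
       PetscMPIInt  du;

       MPI_Comm_split_type(MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&nodecomm);
       MPI_Win_allocate_shared(100*sizeof(PetscScalar),sizeof(PetscScalar),MPI_INFO_NULL,nodecomm,&mine,&win);
       MPI_Win_shared_query(win,0,&sz,&du,&rank0s);
       MPI_Win_free(&win);
       MPI_Comm_free(&nodecomm);
*/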
2583: if (mpi3node && !from->sharedspace) {
2584: /* compute from->notdone, to be used by VecScatterEndMPI3Node() */
2585: PetscInt notdone = 0;
2586: for (i=0; i<from->msize; i++) {
2587: if (from->sharedspacesoffset && from->sharedspacesoffset[i] > -1) {
2588: notdone++;
2589: }
2590: }
2591: from->notdone = notdone;
2592: }
2594: from->local.nonmatching_computed = PETSC_FALSE;
2595: from->local.n_nonmatching = 0;
2596: from->local.slots_nonmatching = 0;
2597: to->local.nonmatching_computed = PETSC_FALSE;
2598: to->local.n_nonmatching = 0;
2599: to->local.slots_nonmatching = 0;
2601: from->format = VEC_SCATTER_MPI_GENERAL;
2602: to->format = VEC_SCATTER_MPI_GENERAL;
2603: from->bs = bs;
2604: to->bs = bs;
2606: VecScatterCreateCommon_PtoS_MPI3(from,to,ctx);
2607: return(0);
2608: }
2610: /*
2611: bs indicates how many elements there are in each block. Normally this would be 1.
2612: */
2613: PetscErrorCode VecScatterCreateCommon_PtoS_MPI3(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
2614: {
2615: MPI_Comm comm;
2616: PetscMPIInt tag = ((PetscObject)ctx)->tag, tagr;
2617: PetscInt bs = to->bs;
2618: PetscMPIInt size;
2619: PetscInt i, n;
2621: VecScatterType type;
2622: PetscBool mpi3node;
2625: PetscObjectGetComm((PetscObject)ctx,&comm);
2626: PetscObjectGetNewTag((PetscObject)ctx,&tagr);
2627: ctx->ops->destroy = VecScatterDestroy_PtoP_MPI3;
2629: ctx->reproduce = PETSC_FALSE;
2630: to->sendfirst = PETSC_FALSE;
2631: PetscOptionsGetBool(NULL,NULL,"-vecscatter_reproduce",&ctx->reproduce,NULL);
2632: PetscOptionsGetBool(NULL,NULL,"-vecscatter_sendfirst",&to->sendfirst,NULL);
2633: from->sendfirst = to->sendfirst;
2635: MPI_Comm_size(comm,&size);
2636: /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
2637: to->contiq = PETSC_FALSE;
2638: n = from->starts[from->n];
2639: from->contiq = PETSC_TRUE;
2640: for (i=1; i<n; i++) {
2641: if (from->indices[i] != from->indices[i-1] + bs) {
2642: from->contiq = PETSC_FALSE;
2643: break;
2644: }
2645: }
2647: to->use_alltoallv = PETSC_FALSE;
2648: PetscOptionsGetBool(NULL,NULL,"-vecscatter_alltoall",&to->use_alltoallv,NULL);
2649: from->use_alltoallv = to->use_alltoallv;
2650: if (from->use_alltoallv) PetscInfo(ctx,"Using MPI_Alltoallv() for scatter\n");
2651: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
2652: if (to->use_alltoallv) {
2653: to->use_alltoallw = PETSC_FALSE;
2654: PetscOptionsGetBool(NULL,NULL,"-vecscatter_nopack",&to->use_alltoallw,NULL);
2655: }
2656: from->use_alltoallw = to->use_alltoallw;
2657: if (from->use_alltoallw) PetscInfo(ctx,"Using MPI_Alltoallw() for scatter\n");
2658: #endif
2660: to->use_window = PETSC_FALSE;
2661: PetscOptionsGetBool(NULL,NULL,"-vecscatter_window",&to->use_window,NULL);
2662: from->use_window = to->use_window;
2664: if (to->use_alltoallv) {
2665: PetscMalloc2(size,&to->counts,size,&to->displs);
2666: PetscMemzero(to->counts,size*sizeof(PetscMPIInt));
2667: for (i=0; i<to->n; i++) to->counts[to->procs[i]] = bs*(to->starts[i+1] - to->starts[i]);
2669: to->displs[0] = 0;
2670: for (i=1; i<size; i++) to->displs[i] = to->displs[i-1] + to->counts[i-1];
2672: PetscMalloc2(size,&from->counts,size,&from->displs);
2673: PetscMemzero(from->counts,size*sizeof(PetscMPIInt));
2674: for (i=0; i<from->n; i++) from->counts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
2675: from->displs[0] = 0;
2676: for (i=1; i<size; i++) from->displs[i] = from->displs[i-1] + from->counts[i-1];
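/*
   The counts/displs arrays built above are what MPI_Alltoallv() consumes when -vecscatter_alltoall
   is selected. A minimal standalone sketch of an MPI_Alltoallv() exchange (illustrative only: every
   rank sends exactly one scalar to every rank, so all counts are 1 and the displacement for rank r
   is simply r):

       PetscMPIInt  size,r;
       PetscScalar *sbuf,*rbuf;
       PetscMPIInt *cnts,*displs;

       MPI_Comm_size(comm,&size);
       PetscMalloc4(size,&sbuf,size,&rbuf,size,&cnts,size,&displs);
       for (r=0; r<size; r++) {sbuf[r] = (PetscScalar)r; cnts[r] = 1; displs[r] = r;}
       MPI_Alltoallv(sbuf,cnts,displs,MPIU_SCALAR,rbuf,cnts,displs,MPIU_SCALAR,comm);
       PetscFree4(sbuf,rbuf,cnts,displs);
*/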
2678: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
2679: if (to->use_alltoallw) {
2680: PetscMPIInt mpibs, mpilen;
2682: ctx->packtogether = PETSC_FALSE;
2683: PetscMPIIntCast(bs,&mpibs);
2684: PetscMalloc3(size,&to->wcounts,size,&to->wdispls,size,&to->types);
2685: PetscMemzero(to->wcounts,size*sizeof(PetscMPIInt));
2686: PetscMemzero(to->wdispls,size*sizeof(PetscMPIInt));
2687: for (i=0; i<size; i++) to->types[i] = MPIU_SCALAR;
2689: for (i=0; i<to->n; i++) {
2690: to->wcounts[to->procs[i]] = 1;
2691: PetscMPIIntCast(to->starts[i+1]-to->starts[i],&mpilen);
2692: MPI_Type_create_indexed_block(mpilen,mpibs,to->indices+to->starts[i],MPIU_SCALAR,to->types+to->procs[i]);
2693: MPI_Type_commit(to->types+to->procs[i]);
2694: }
2695: PetscMalloc3(size,&from->wcounts,size,&from->wdispls,size,&from->types);
2696: PetscMemzero(from->wcounts,size*sizeof(PetscMPIInt));
2697: PetscMemzero(from->wdispls,size*sizeof(PetscMPIInt));
2698: for (i=0; i<size; i++) from->types[i] = MPIU_SCALAR;
2700: if (from->contiq) {
2701: PetscInfo(ctx,"Scattered vector entries are stored contiguously, taking advantage of this with -vecscatter_alltoall\n");
2702: for (i=0; i<from->n; i++) from->wcounts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
2704: if (from->n) from->wdispls[from->procs[0]] = sizeof(PetscScalar)*from->indices[0];
2705: for (i=1; i<from->n; i++) from->wdispls[from->procs[i]] = from->wdispls[from->procs[i-1]] + sizeof(PetscScalar)*from->wcounts[from->procs[i-1]];
2707: } else {
2708: for (i=0; i<from->n; i++) {
2709: from->wcounts[from->procs[i]] = 1;
2710: PetscMPIIntCast(from->starts[i+1]-from->starts[i],&mpilen);
2711: MPI_Type_create_indexed_block(mpilen,mpibs,from->indices+from->starts[i],MPIU_SCALAR,from->types+from->procs[i]);
2712: MPI_Type_commit(from->types+from->procs[i]);
2713: }
2714: }
2715: } else ctx->ops->copy = VecScatterCopy_PtoP_AllToAll;
2717: #else
2718: to->use_alltoallw = PETSC_FALSE;
2719: from->use_alltoallw = PETSC_FALSE;
2720: ctx->ops->copy = VecScatterCopy_PtoP_AllToAll;
2721: #endif
2722: } else if (to->use_window) {
2723: PetscMPIInt temptag,winsize;
2724: PetscInt iwinsize = 0;
2725: MPI_Request *request;
2726: MPI_Status *status;
2728: PetscObjectGetNewTag((PetscObject)ctx,&temptag);
2729: PetscIntMultError((to->n ? to->starts[to->n] : 0),bs*sizeof(PetscScalar),&iwinsize);
2730: PetscMPIIntCast(iwinsize,&winsize);
2731: MPI_Win_create(to->values ? to->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&to->window);
2732: PetscMalloc1(to->n,&to->winstarts);
2733: PetscMalloc2(to->n,&request,to->n,&status);
2734: for (i=0; i<to->n; i++) {
2735: MPI_Irecv(to->winstarts+i,1,MPIU_INT,to->procs[i],temptag,comm,request+i);
2736: }
2737: for (i=0; i<from->n; i++) {
2738: MPI_Send(from->starts+i,1,MPIU_INT,from->procs[i],temptag,comm);
2739: }
2740: MPI_Waitall(to->n,request,status);
2741: PetscFree2(request,status);
2743: PetscIntMultError((from->n ? from->starts[from->n] : 0),bs*sizeof(PetscScalar),&iwinsize);
2744: PetscMPIIntCast(iwinsize,&winsize);
2745: MPI_Win_create(from->values ? from->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&from->window);
2746: PetscMalloc1(from->n,&from->winstarts);
2747: PetscMalloc2(from->n,&request,from->n,&status);
2748: for (i=0; i<from->n; i++) {
2749: MPI_Irecv(from->winstarts+i,1,MPIU_INT,from->procs[i],temptag,comm,request+i);
2750: }
2751: for (i=0; i<to->n; i++) {
2752: MPI_Send(to->starts+i,1,MPIU_INT,to->procs[i],temptag,comm);
2753: }
2754: MPI_Waitall(from->n,request,status);
2755: PetscFree2(request,status);
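/*
   With -vecscatter_window each side exposes its packed buffer as an RMA window above, and the
   target offsets (winstarts) are exchanged up front so data can later be deposited with one-sided
   puts. A minimal sketch of that style of communication, not the code VecScatterBegin/End actually
   executes (names and sizes are illustrative): every rank puts four scalars into the window of the
   next rank, and the second fence completes the transfers.

       PetscScalar sendbuf[4] = {1,2,3,4},recvbuf[4];
       MPI_Win     win;
       PetscMPIInt rank,size,target;

       MPI_Comm_rank(comm,&rank);
       MPI_Comm_size(comm,&size);
       target = (rank+1)%size;
       MPI_Win_create(recvbuf,4*sizeof(PetscScalar),sizeof(PetscScalar),MPI_INFO_NULL,comm,&win);
       MPI_Win_fence(0,win);
       MPI_Put(sendbuf,4,MPIU_SCALAR,target,0,4,MPIU_SCALAR,win);
       MPI_Win_fence(0,win);
       MPI_Win_free(&win);
*/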
2756: } else {
2757: PetscBool use_rsend = PETSC_FALSE, use_ssend = PETSC_FALSE;
2758: PetscInt *sstarts = to->starts, *rstarts = from->starts;
2759: PetscMPIInt *sprocs = to->procs, *rprocs = from->procs;
2760: MPI_Request *swaits = to->requests,*rwaits = from->requests;
2761: MPI_Request *rev_swaits,*rev_rwaits;
2762: PetscScalar *Ssvalues = to->values, *Srvalues = from->values;
2764: /* allocate additional wait variables for the "reverse" scatter */
2765: PetscMalloc1(to->n,&rev_rwaits);
2766: PetscMalloc1(from->n,&rev_swaits);
2767: to->rev_requests = rev_rwaits;
2768: from->rev_requests = rev_swaits;
2770: /* Register the receives that you will use later (sends for scatter reverse) */
2771: PetscOptionsGetBool(NULL,NULL,"-vecscatter_rsend",&use_rsend,NULL);
2772: PetscOptionsGetBool(NULL,NULL,"-vecscatter_ssend",&use_ssend,NULL);
2773: if (use_rsend) {
2774: PetscInfo(ctx,"Using VecScatter ready receiver mode\n");
2775: to->use_readyreceiver = PETSC_TRUE;
2776: from->use_readyreceiver = PETSC_TRUE;
2777: } else {
2778: to->use_readyreceiver = PETSC_FALSE;
2779: from->use_readyreceiver = PETSC_FALSE;
2780: }
2781: if (use_ssend) {
2782: PetscInfo(ctx,"Using VecScatter Ssend mode\n");
2783: }
2785: for (i=0; i<from->n; i++) {
2786: if (use_rsend) {
2787: MPI_Rsend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
2788: } else if (use_ssend) {
2789: MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
2790: } else {
2791: MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
2792: }
2793: }
2795: for (i=0; i<to->n; i++) {
2796: if (use_rsend) {
2797: MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
2798: } else if (use_ssend) {
2799: MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
2800: } else {
2801: MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
2802: }
2803: }
2804: /* Register receives for scatter and reverse */
2805: for (i=0; i<from->n; i++) {
2806: MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
2807: }
2808: for (i=0; i<to->n; i++) {
2809: MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tagr,comm,rev_rwaits+i);
2810: }
2811: if (use_rsend) {
2812: if (to->n) {MPI_Startall_irecv(to->starts[to->n]*to->bs,to->n,to->rev_requests);}
2813: if (from->n) {MPI_Startall_irecv(from->starts[from->n]*from->bs,from->n,from->requests);}
2814: MPI_Barrier(comm);
2815: }
2816: ctx->ops->copy = VecScatterCopy_PtoP_X;
2817: }
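/*
   The default path above sets up persistent requests once with MPI_Send_init()/MPI_Recv_init() so
   that each scatter only has to start and wait on them. A minimal standalone sketch of that
   persistent-request life cycle (a hedged illustration; the communicator, buffer sizes and
   neighbour choice are arbitrary and not taken from this file): the requests are created once,
   started and waited on inside the loop, and freed at the end.

       MPI_Request req[2];
       PetscScalar sbuf[8] = {0},rbuf[8];
       PetscMPIInt rank,size,right,left;
       PetscInt    it;

       MPI_Comm_rank(comm,&rank);
       MPI_Comm_size(comm,&size);
       right = (rank+1)%size;
       left  = (rank+size-1)%size;
       MPI_Send_init(sbuf,8,MPIU_SCALAR,right,0,comm,&req[0]);
       MPI_Recv_init(rbuf,8,MPIU_SCALAR,left,0,comm,&req[1]);
       for (it=0; it<10; it++) {
         MPI_Startall(2,req);
         MPI_Waitall(2,req,MPI_STATUSES_IGNORE);
       }
       MPI_Request_free(&req[0]);
       MPI_Request_free(&req[1]);
*/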
2818: PetscInfo1(ctx,"Using blocksize %D scatter\n",bs);
2820: #if defined(PETSC_USE_DEBUG)
2821: MPIU_Allreduce(&bs,&i,1,MPIU_INT,MPI_MIN,PetscObjectComm((PetscObject)ctx));
2822: MPIU_Allreduce(&bs,&n,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)ctx));
2823: if (bs!=i || bs!=n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Blocks size %D != %D or %D",bs,i,n);
2824: #endif
2826: VecScatterGetType(ctx,&type);
2827: PetscStrcmp(type,"mpi3node",&mpi3node);
2829: if (mpi3node) {
2830: switch (bs) {
2831: case 12:
2832: ctx->ops->begin = VecScatterBeginMPI3Node_12;
2833: ctx->ops->end = VecScatterEndMPI3Node_12;
2834: break;
2835: case 11:
2836: ctx->ops->begin = VecScatterBeginMPI3Node_11;
2837: ctx->ops->end = VecScatterEndMPI3Node_11;
2838: break;
2839: case 10:
2840: ctx->ops->begin = VecScatterBeginMPI3Node_10;
2841: ctx->ops->end = VecScatterEndMPI3Node_10;
2842: break;
2843: case 9:
2844: ctx->ops->begin = VecScatterBeginMPI3Node_9;
2845: ctx->ops->end = VecScatterEndMPI3Node_9;
2846: break;
2847: case 8:
2848: ctx->ops->begin = VecScatterBeginMPI3Node_8;
2849: ctx->ops->end = VecScatterEndMPI3Node_8;
2850: break;
2851: case 7:
2852: ctx->ops->begin = VecScatterBeginMPI3Node_7;
2853: ctx->ops->end = VecScatterEndMPI3Node_7;
2854: break;
2855: case 6:
2856: ctx->ops->begin = VecScatterBeginMPI3Node_6;
2857: ctx->ops->end = VecScatterEndMPI3Node_6;
2858: break;
2859: case 5:
2860: ctx->ops->begin = VecScatterBeginMPI3Node_5;
2861: ctx->ops->end = VecScatterEndMPI3Node_5;
2862: break;
2863: case 4:
2864: ctx->ops->begin = VecScatterBeginMPI3Node_4;
2865: ctx->ops->end = VecScatterEndMPI3Node_4;
2866: break;
2867: case 3:
2868: ctx->ops->begin = VecScatterBeginMPI3Node_3;
2869: ctx->ops->end = VecScatterEndMPI3Node_3;
2870: break;
2871: case 2:
2872: ctx->ops->begin = VecScatterBeginMPI3Node_2;
2873: ctx->ops->end = VecScatterEndMPI3Node_2;
2874: break;
2875: case 1:
2876: ctx->ops->begin = VecScatterBeginMPI3Node_1;
2877: ctx->ops->end = VecScatterEndMPI3Node_1;
2878: break;
2879: default:
2880: ctx->ops->begin = VecScatterBegin_bs;
2881: ctx->ops->end = VecScatterEnd_bs;
2882: }
2883: } else { /* !mpi3node */
2885: switch (bs) {
2886: case 12:
2887: ctx->ops->begin = VecScatterBegin_12;
2888: ctx->ops->end = VecScatterEnd_12;
2889: break;
2890: case 11:
2891: ctx->ops->begin = VecScatterBegin_11;
2892: ctx->ops->end = VecScatterEnd_11;
2893: break;
2894: case 10:
2895: ctx->ops->begin = VecScatterBegin_10;
2896: ctx->ops->end = VecScatterEnd_10;
2897: break;
2898: case 9:
2899: ctx->ops->begin = VecScatterBegin_9;
2900: ctx->ops->end = VecScatterEnd_9;
2901: break;
2902: case 8:
2903: ctx->ops->begin = VecScatterBegin_8;
2904: ctx->ops->end = VecScatterEnd_8;
2905: break;
2906: case 7:
2907: ctx->ops->begin = VecScatterBegin_7;
2908: ctx->ops->end = VecScatterEnd_7;
2909: break;
2910: case 6:
2911: ctx->ops->begin = VecScatterBegin_6;
2912: ctx->ops->end = VecScatterEnd_6;
2913: break;
2914: case 5:
2915: ctx->ops->begin = VecScatterBegin_5;
2916: ctx->ops->end = VecScatterEnd_5;
2917: break;
2918: case 4:
2919: ctx->ops->begin = VecScatterBegin_4;
2920: ctx->ops->end = VecScatterEnd_4;
2921: break;
2922: case 3:
2923: ctx->ops->begin = VecScatterBegin_3;
2924: ctx->ops->end = VecScatterEnd_3;
2925: break;
2926: case 2:
2927: ctx->ops->begin = VecScatterBegin_2;
2928: ctx->ops->end = VecScatterEnd_2;
2929: break;
2930: case 1:
2935: ctx->ops->begin = VecScatterBegin_1;
2936: ctx->ops->end = VecScatterEnd_1;
2938: break;
2939: default:
2940: ctx->ops->begin = VecScatterBegin_bs;
2941: ctx->ops->end = VecScatterEnd_bs;
2942: }
2943: }
2945: ctx->ops->view = VecScatterView_MPI;
2947: /* Check if the local scatter is actually a copy; important special case */
2948: if (to->local.n) {
2949: VecScatterLocalOptimizeCopy_Private(ctx,&to->local,&from->local,bs);
2950: }
2951: return(0);
2952: }
2954: /* ------------------------------------------------------------------------------------*/
2955: /*
2956: Scatter from local Seq vectors to a parallel vector.
2957: Reverses the order of the arguments, calls VecScatterCreateLocal_PtoS_MPI3(), and then
2958: reverses the resulting to/from data.
2959: */
2960: PetscErrorCode VecScatterCreateLocal_StoP_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2961: {
2962: PetscErrorCode ierr;
2963: MPI_Request *waits;
2964: VecScatter_MPI_General *to,*from;
2967: VecScatterCreateLocal_PtoS_MPI3(ny,inidy,nx,inidx,yin,xin,bs,ctx);
2968: to = (VecScatter_MPI_General*)ctx->fromdata;
2969: from = (VecScatter_MPI_General*)ctx->todata;
2970: ctx->todata = (void*)to;
2971: ctx->fromdata = (void*)from;
2972: /* these two are special; they are ALWAYS stored in the 'to' struct */
2973: to->sstatus = from->sstatus;
2974: to->rstatus = from->rstatus;
2976: from->sstatus = 0;
2977: from->rstatus = 0;
2978: waits = from->rev_requests;
2979: from->rev_requests = from->requests;
2980: from->requests = waits;
2981: waits = to->rev_requests;
2982: to->rev_requests = to->requests;
2983: to->requests = waits;
2984: return(0);
2985: }
2987: /* ---------------------------------------------------------------------------------*/
2988: PetscErrorCode VecScatterCreateLocal_PtoP_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2989: {
2991: PetscMPIInt size,rank,tag,imdex,n;
2992: PetscInt *owners = xin->map->range;
2993: PetscMPIInt *nprocs = NULL;
2994: PetscInt i,j,idx,nsends,*local_inidx = NULL,*local_inidy = NULL;
2995: PetscMPIInt *owner = NULL;
2996: PetscInt *starts = NULL,count,slen;
2997: PetscInt *rvalues = NULL,*svalues = NULL,base,*values = NULL,*rsvalues,recvtotal,lastidx;
2998: PetscMPIInt *onodes1,*olengths1,nrecvs;
2999: MPI_Comm comm;
3000: MPI_Request *send_waits = NULL,*recv_waits = NULL;
3001: MPI_Status recv_status,*send_status = NULL;
3002: PetscBool duplicate = PETSC_FALSE;
3003: #if defined(PETSC_USE_DEBUG)
3004: PetscBool found = PETSC_FALSE;
3005: #endif
3008: PetscObjectGetNewTag((PetscObject)ctx,&tag);
3009: PetscObjectGetComm((PetscObject)xin,&comm);
3010: MPI_Comm_size(comm,&size);
3011: MPI_Comm_rank(comm,&rank);
3012: if (size == 1) {
3013: VecScatterCreateLocal_StoP_MPI3(nx,inidx,ny,inidy,xin,yin,bs,ctx);
3014: return(0);
3015: }
3017: /*
3018: Each processor ships off its inidx[j] and inidy[j] to the appropriate processor;
3019: VecScatterCreateLocal_StoP_MPI3() is then called on the redistributed indices.
3020: */
3021: /* first count number of contributors to each processor */
3022: PetscMalloc3(size,&nprocs,nx,&owner,(size+1),&starts);
3023: PetscMemzero(nprocs,size*sizeof(PetscMPIInt));
3025: lastidx = -1;
3026: j = 0;
3027: for (i=0; i<nx; i++) {
3028: /* if indices are NOT locally sorted, need to start search at the beginning */
3029: if (lastidx > (idx = bs*inidx[i])) j = 0;
3030: lastidx = idx;
3031: for (; j<size; j++) {
3032: if (idx >= owners[j] && idx < owners[j+1]) {
3033: nprocs[j]++;
3034: owner[i] = j;
3035: #if defined(PETSC_USE_DEBUG)
3036: found = PETSC_TRUE;
3037: #endif
3038: break;
3039: }
3040: }
3041: #if defined(PETSC_USE_DEBUG)
3042: if (!found) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
3043: found = PETSC_FALSE;
3044: #endif
3045: }
3046: nsends = 0;
3047: for (i=0; i<size; i++) nsends += (nprocs[i] > 0);
3049: /* inform other processors of number of messages and max length*/
3050: PetscGatherNumberOfMessages(comm,NULL,nprocs,&nrecvs);
3051: PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
3052: PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
3053: recvtotal = 0;
3054: for (i=0; i<nrecvs; i++) {
3055: PetscIntSumError(recvtotal,olengths1[i],&recvtotal);
3056: }
3058: /* post receives: */
3059: PetscMalloc5(2*recvtotal,&rvalues,2*nx,&svalues,nrecvs,&recv_waits,nsends,&send_waits,nsends,&send_status);
3061: count = 0;
3062: for (i=0; i<nrecvs; i++) {
3063: MPI_Irecv((rvalues+2*count),2*olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
3064: PetscIntSumError(count,olengths1[i],&count);
3065: }
3066: PetscFree(onodes1);
3068: /* do sends:
3069: 1) starts[i] gives the starting index in svalues for stuff going to
3070: the ith processor
3071: */
3072: starts[0]= 0;
3073: for (i=1; i<size; i++) starts[i] = starts[i-1] + nprocs[i-1];
3074: for (i=0; i<nx; i++) {
3075: svalues[2*starts[owner[i]]] = bs*inidx[i];
3076: svalues[1 + 2*starts[owner[i]]++] = bs*inidy[i];
3077: }
3079: starts[0] = 0;
3080: for (i=1; i<size+1; i++) starts[i] = starts[i-1] + nprocs[i-1];
3081: count = 0;
3082: for (i=0; i<size; i++) {
3083: if (nprocs[i]) {
3084: MPI_Isend(svalues+2*starts[i],2*nprocs[i],MPIU_INT,i,tag,comm,send_waits+count);
3085: count++;
3086: }
3087: }
3088: PetscFree3(nprocs,owner,starts);
3090: /* wait on receives */
3091: count = nrecvs;
3092: slen = 0;
3093: while (count) {
3094: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
3095: /* unpack receives into our local space */
3096: MPI_Get_count(&recv_status,MPIU_INT,&n);
3097: slen += n/2;
3098: count--;
3099: }
3100: if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not as expected %D",slen,recvtotal);
3102: PetscMalloc2(slen,&local_inidx,slen,&local_inidy);
3103: base = owners[rank];
3104: count = 0;
3105: rsvalues = rvalues;
3106: for (i=0; i<nrecvs; i++) {
3107: values = rsvalues;
3108: rsvalues += 2*olengths1[i];
3109: for (j=0; j<olengths1[i]; j++) {
3110: local_inidx[count] = values[2*j] - base;
3111: local_inidy[count++] = values[2*j+1];
3112: }
3113: }
3114: PetscFree(olengths1);
3116: /* wait on sends */
3117: if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
3118: PetscFree5(rvalues,svalues,recv_waits,send_waits,send_status);
3120: /*
3121: should sort and remove duplicates from local_inidx,local_inidy
3122: */
3123: #if defined(do_it_slow)
3124: /* sort on the from index */
3125: PetscSortIntWithArray(slen,local_inidx,local_inidy);
3126: start = 0;
3127: while (start < slen) {
3128: count = start+1;
3129: last = local_inidx[start];
3130: while (count < slen && last == local_inidx[count]) count++;
3131: if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
3132: /* sort on to index */
3133: PetscSortInt(count-start,local_inidy+start);
3134: }
3135: /* remove duplicates; not most efficient way, but probably good enough */
3136: i = start;
3137: while (i < count-1) {
3138: if (local_inidy[i] != local_inidy[i+1]) i++;
3139: else { /* found a duplicate */
3140: duplicate = PETSC_TRUE;
3141: for (j=i; j<slen-1; j++) {
3142: local_inidx[j] = local_inidx[j+1];
3143: local_inidy[j] = local_inidy[j+1];
3144: }
3145: slen--;
3146: count--;
3147: }
3148: }
3149: start = count;
3150: }
3151: #endif
3152: if (duplicate) {
3153: PetscInfo(ctx,"Duplicate from to indices passed in VecScatterCreate(), they are ignored\n");
3154: }
3155: VecScatterCreateLocal_StoP_MPI3(slen,local_inidx,slen,local_inidy,xin,yin,bs,ctx);
3156: PetscFree2(local_inidx,local_inidy);
3157: return(0);
3158: }
3160: #endif