Actual source code: vpscat.c

petsc-3.14.6 2021-03-30
Report Typos and Errors
  1: /*
  2:     Defines parallel vector scatters.
  3: */

  5: #include <petsc/private/vecscatterimpl.h>


  8: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
  9: {
 10:   VecScatter_MPI_General *to  =(VecScatter_MPI_General*)ctx->todata;
 11:   VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
 12:   PetscErrorCode         ierr;
 13:   PetscInt               i;
 14:   PetscMPIInt            rank;
 15:   PetscViewerFormat      format;
 16:   PetscBool              iascii;

 19:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
 20:   if (iascii) {
 21:     MPI_Comm_rank(PetscObjectComm((PetscObject)ctx),&rank);
 22:     PetscViewerGetFormat(viewer,&format);
 23:     if (format ==  PETSC_VIEWER_ASCII_INFO) {
 24:       PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;

 26:       MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 27:       MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 28:       itmp = to->starts[to->n+1];
 29:       MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 30:       itmp = from->starts[from->n+1];
 31:       MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 32:       MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,PetscObjectComm((PetscObject)ctx));

 34:       PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
 35:       PetscViewerASCIIPrintf(viewer,"  Blocksize %D\n",to->bs);
 36:       PetscViewerASCIIPrintf(viewer,"  Maximum number sends %D\n",nsend_max);
 37:       PetscViewerASCIIPrintf(viewer,"  Maximum number receives %D\n",nrecv_max);
 38:       PetscViewerASCIIPrintf(viewer,"  Maximum data sent %D\n",(int)(lensend_max*to->bs*sizeof(PetscScalar)));
 39:       PetscViewerASCIIPrintf(viewer,"  Maximum data received %D\n",(int)(lenrecv_max*to->bs*sizeof(PetscScalar)));
 40:       PetscViewerASCIIPrintf(viewer,"  Total data sent %D\n",(int)(alldata*to->bs*sizeof(PetscScalar)));
 41:     } else {
 42:       PetscViewerASCIIPrintf(viewer,"  VecScatter Blocksize %D\n",to->bs);
 43:       PetscViewerASCIIPushSynchronized(viewer);
 44:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
 45:       if (to->n) {
 46:         for (i=0; i<to->n; i++) {
 47:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]   %D length = %D to whom %d\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
 48:         }
 49:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Now the indices for all remote sends (in order by process sent to)\n",rank);
 50:         for (i=0; i<to->starts[to->n]; i++) {
 51:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D \n",rank,to->indices[i]);
 52:         }
 53:       }

 55:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
 56:       if (from->n) {
 57:         for (i=0; i<from->n; i++) {
 58:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %d\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
 59:         }

 61:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Now the indices for all remote receives (in order by process received from)\n",rank);
 62:         for (i=0; i<from->starts[from->n]; i++) {
 63:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D \n",rank,from->indices[i]);
 64:         }
 65:       }
 66:       if (to->local.n) {
 67:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Indices for local part of scatter\n",rank);
 68:         for (i=0; i<to->local.n; i++) {  /* the to and from have the opposite meaning from what you would expect */
 69:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] From %D to %D \n",rank,to->local.vslots[i],from->local.vslots[i]);
 70:         }
 71:       }

 73:      for (i=0; i<to->msize; i++) {
 74:        if (to->sharedspacestarts[i+1] > to->sharedspacestarts[i]) {
 75:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Via shared memory to local memory partner %d count %d\n",rank,i,to->sharedspacestarts[i+1]-to->sharedspacestarts[i]);
 76:         }
 77:       }
 78:      for (i=0; i<from->msize; i++) {
 79:        if (from->sharedspacestarts[i+1] > from->sharedspacestarts[i]) {
 80:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Via shared memory from local memory partner %d count %d\n",rank,i,from->sharedspacestarts[i+1]-from->sharedspacestarts[i]);
 81:         }
 82:       }

 84:       PetscViewerFlush(viewer);
 85:       PetscViewerASCIIPopSynchronized(viewer);

 87:       PetscViewerASCIIPrintf(viewer,"Method used to implement the VecScatter: ");
 88:     }
 89:   }
 90:   return(0);
 91: }

 93: /* -----------------------------------------------------------------------------------*/
 94: /*
 95:       The next routine determines what part of  the local part of the scatter is an
 96:   exact copy of values into their current location. We check this here and
 97:   then know that we need not perform that portion of the scatter when the vector is
 98:   scattering to itself with INSERT_VALUES.

100:      This is currently not used but would speed up, for example DMLocalToLocalBegin/End()

102: */
103: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from)
104: {
105:   PetscInt       n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
107:   PetscInt       *nto_slots,*nfrom_slots,j = 0;

110:   for (i=0; i<n; i++) {
111:     if (to_slots[i] != from_slots[i]) n_nonmatching++;
112:   }

114:   if (!n_nonmatching) {
115:     to->nonmatching_computed = PETSC_TRUE;
116:     to->n_nonmatching        = from->n_nonmatching = 0;
117:     PetscInfo1(scatter,"Reduced %D to 0\n", n);
118:   } else if (n_nonmatching == n) {
119:     to->nonmatching_computed = PETSC_FALSE;
120:     PetscInfo(scatter,"All values non-matching\n");
121:   } else {
122:     to->nonmatching_computed= PETSC_TRUE;
123:     to->n_nonmatching       = from->n_nonmatching = n_nonmatching;

125:     PetscMalloc1(n_nonmatching,&nto_slots);
126:     PetscMalloc1(n_nonmatching,&nfrom_slots);

128:     to->slots_nonmatching   = nto_slots;
129:     from->slots_nonmatching = nfrom_slots;
130:     for (i=0; i<n; i++) {
131:       if (to_slots[i] != from_slots[i]) {
132:         nto_slots[j]   = to_slots[i];
133:         nfrom_slots[j] = from_slots[i];
134:         j++;
135:       }
136:     }
137:     PetscInfo2(scatter,"Reduced %D to %D\n",n,n_nonmatching);
138:   }
139:   return(0);
140: }

142: /* --------------------------------------------------------------------------------------*/

144: /* -------------------------------------------------------------------------------------*/
145: PetscErrorCode VecScatterDestroy_PtoP_MPI3(VecScatter ctx)
146: {
147:   VecScatter_MPI_General *to   = (VecScatter_MPI_General*)ctx->todata;
148:   VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
149:   PetscErrorCode         ierr;
150:   PetscInt               i;

153:   /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
154:   if (to->requests) {
155:     for (i=0; i<to->n; i++) {
156:       MPI_Request_free(to->requests + i);
157:     }
158:   }
159:   if (to->rev_requests) {
160:     for (i=0; i<to->n; i++) {
161:       MPI_Request_free(to->rev_requests + i);
162:     }
163:   }
164:   if (from->requests) {
165:     for (i=0; i<from->n; i++) {
166:       MPI_Request_free(from->requests + i);
167:     }
168:   }
169:   if (from->rev_requests) {
170:     for (i=0; i<from->n; i++) {
171:       MPI_Request_free(from->rev_requests + i);
172:     }
173:   }
174:   if (to->sharedwin != MPI_WIN_NULL) {MPI_Win_free(&to->sharedwin);}
175:   if (from->sharedwin != MPI_WIN_NULL) {MPI_Win_free(&from->sharedwin);}
176:   PetscFree(to->sharedspaces);
177:   PetscFree(to->sharedspacesoffset);
178:   PetscFree(to->sharedspaceindices);
179:   PetscFree(to->sharedspacestarts);

181:   PetscFree(from->sharedspaceindices);
182:   PetscFree(from->sharedspaces);
183:   PetscFree(from->sharedspacesoffset);
184:   PetscFree(from->sharedspacestarts);

186:   PetscFree(to->local.vslots);
187:   PetscFree(from->local.vslots);
188:   PetscFree(to->local.slots_nonmatching);
189:   PetscFree(from->local.slots_nonmatching);
190:   PetscFree(to->rev_requests);
191:   PetscFree(from->rev_requests);
192:   PetscFree(to->requests);
193:   PetscFree(from->requests);
194:   PetscFree4(to->values,to->indices,to->starts,to->procs);
195:   PetscFree2(to->sstatus,to->rstatus);
196:   PetscFree4(from->values,from->indices,from->starts,from->procs);
197:   VecScatterMemcpyPlanDestroy_PtoP(to,from);
198:   PetscFree(from);
199:   PetscFree(to);
200:   return(0);
201: }

203: /* --------------------------------------------------------------------------------------*/

205: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
206: {
207:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
208:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
209:   PetscErrorCode         ierr;
210:   PetscInt               ny,bs = in_from->bs,jj;
211:   PetscShmComm           scomm;
212:   MPI_Comm               mscomm;
213:   MPI_Info               info;

216:   out->ops->begin   = in->ops->begin;
217:   out->ops->end     = in->ops->end;
218:   out->ops->copy    = in->ops->copy;
219:   out->ops->destroy = in->ops->destroy;
220:   out->ops->view    = in->ops->view;

222:   /* allocate entire send scatter context */
223:   PetscNewLog(out,&out_to);
224:   out_to->sharedwin       = MPI_WIN_NULL;
225:   PetscNewLog(out,&out_from);
226:   out_from->sharedwin       = MPI_WIN_NULL;

228:   ny                = in_to->starts[in_to->n];
229:   out_to->n         = in_to->n;
230:   out_to->format    = in_to->format;

232:   PetscMalloc1(out_to->n,&out_to->requests);
233:   PetscMalloc4(bs*ny,&out_to->values,ny,&out_to->indices,out_to->n+1,&out_to->starts,out_to->n,&out_to->procs);
234:   PetscMalloc2(PetscMax(in_to->n,in_from->n),&out_to->sstatus,PetscMax(in_to->n,in_from->n),&out_to->rstatus);
235:   PetscArraycpy(out_to->indices,in_to->indices,ny);
236:   PetscArraycpy(out_to->starts,in_to->starts,out_to->n+1);
237:   PetscArraycpy(out_to->procs,in_to->procs,out_to->n);

239:   out->todata                        = (void*)out_to;
240:   out_to->local.n                    = in_to->local.n;
241:   out_to->local.nonmatching_computed = PETSC_FALSE;
242:   out_to->local.n_nonmatching        = 0;
243:   out_to->local.slots_nonmatching    = NULL;
244:   if (in_to->local.n) {
245:     PetscMalloc1(in_to->local.n,&out_to->local.vslots);
246:     PetscMalloc1(in_from->local.n,&out_from->local.vslots);
247:     PetscArraycpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n);
248:     PetscArraycpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n);
249:   } else {
250:     out_to->local.vslots   = NULL;
251:     out_from->local.vslots = NULL;
252:   }

254:   /* allocate entire receive context */
255:   VecScatterMemcpyPlanCopy(&in_to->local.memcpy_plan,&out_to->local.memcpy_plan);
256:   VecScatterMemcpyPlanCopy(&in_from->local.memcpy_plan,&out_from->local.memcpy_plan);
257:   VecScatterMemcpyPlanCopy(&in_to->memcpy_plan,&out_to->memcpy_plan);
258:   VecScatterMemcpyPlanCopy(&in_from->memcpy_plan,&out_from->memcpy_plan);

260:   out_from->format    = in_from->format;
261:   ny                  = in_from->starts[in_from->n];
262:   out_from->n         = in_from->n;

264:   PetscMalloc1(out_from->n,&out_from->requests);
265:   PetscMalloc4(ny*bs,&out_from->values,ny,&out_from->indices,out_from->n+1,&out_from->starts,out_from->n,&out_from->procs);
266:   PetscArraycpy(out_from->indices,in_from->indices,ny);
267:   PetscArraycpy(out_from->starts,in_from->starts,out_from->n+1);
268:   PetscArraycpy(out_from->procs,in_from->procs,out_from->n);

270:   out->fromdata                        = (void*)out_from;
271:   out_from->local.n                    = in_from->local.n;
272:   out_from->local.nonmatching_computed = PETSC_FALSE;
273:   out_from->local.n_nonmatching        = 0;
274:   out_from->local.slots_nonmatching    = NULL;

276:   /*
277:       set up the request arrays for use with isend_init() and irecv_init()
278:   */
279:   {
280:     PetscMPIInt tag;
281:     MPI_Comm    comm;
282:     PetscInt    *sstarts = out_to->starts,  *rstarts = out_from->starts;
283:     PetscMPIInt *sprocs  = out_to->procs,   *rprocs  = out_from->procs;
284:     PetscInt    i;
285:     MPI_Request *swaits   = out_to->requests,*rwaits  = out_from->requests;
286:     MPI_Request *rev_swaits,*rev_rwaits;
287:     PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;

289:     PetscMalloc1(in_to->n,&out_to->rev_requests);
290:     PetscMalloc1(in_from->n,&out_from->rev_requests);

292:     rev_rwaits = out_to->rev_requests;
293:     rev_swaits = out_from->rev_requests;

295:     out_from->bs = out_to->bs = bs;
296:     tag          = ((PetscObject)out)->tag;
297:     PetscObjectGetComm((PetscObject)out,&comm);

299:     /* Register the receives that you will use later (sends for scatter reverse) */
300:     for (i=0; i<out_from->n; i++) {
301:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
302:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
303:     }

305:     for (i=0; i<out_to->n; i++) {
306:       MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
307:     }

309:     /* Register receives for scatter reverse */
310:     for (i=0; i<out_to->n; i++) {
311:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
312:     }
313:   }

315:     /* since the to and from data structures are not symmetric for shared memory copies we insure they always listed in "standard" form */
316:   if (!in_to->sharedwin) {
317:     VecScatter_MPI_General *totmp = in_to,*out_totmp = out_to;
318:     in_to       = in_from;
319:     in_from     = totmp;
320:     out_to      = out_from;
321:     out_from    = out_totmp;
322:   }

324:   /* copy the to parts for the shared memory copies between processes */
325:   out_to->sharedcnt = in_to->sharedcnt;
326:   out_to->msize     = in_to->msize;
327:   PetscMalloc1(out_to->msize+1,&out_to->sharedspacestarts);
328:   PetscArraycpy(out_to->sharedspacestarts,in_to->sharedspacestarts,out_to->msize+1);
329:   PetscMalloc1(out_to->sharedcnt,&out_to->sharedspaceindices);
330:   PetscArraycpy(out_to->sharedspaceindices,in_to->sharedspaceindices,out_to->sharedcnt);

332:   PetscShmCommGet(PetscObjectComm((PetscObject)in),&scomm);
333:   PetscShmCommGetMpiShmComm(scomm,&mscomm);
334:   MPI_Info_create(&info);
335:   MPI_Info_set(info, "alloc_shared_noncontig", "true");
336:   MPIU_Win_allocate_shared(bs*out_to->sharedcnt*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&out_to->sharedspace,&out_to->sharedwin);
337:   MPI_Info_free(&info);

339:   /* copy the to parts for the shared memory copies between processes */
340:   out_from->sharedcnt = in_from->sharedcnt;
341:   out_from->msize     = in_from->msize;
342:   PetscMalloc1(out_from->msize+1,&out_from->sharedspacestarts);
343:   PetscArraycpy(out_from->sharedspacestarts,in_from->sharedspacestarts,out_from->msize+1);
344:   PetscMalloc1(out_from->sharedcnt,&out_from->sharedspaceindices);
345:   PetscArraycpy(out_from->sharedspaceindices,in_from->sharedspaceindices,out_from->sharedcnt);
346:   PetscMalloc1(out_from->msize,&out_from->sharedspacesoffset);
347:   PetscArraycpy(out_from->sharedspacesoffset,in_from->sharedspacesoffset,out_from->msize);
348:   PetscCalloc1(out_from->msize,&out_from->sharedspaces);
349:   for (jj=0; jj<out_from->msize; jj++) {
350:     MPI_Aint    isize;
351:     PetscMPIInt disp_unit;
352:     MPIU_Win_shared_query(out_to->sharedwin,jj,&isize,&disp_unit,&out_from->sharedspaces[jj]);
353:   }
354:   return(0);
355: }

357: PetscErrorCode VecScatterCopy_PtoP_AllToAll(VecScatter in,VecScatter out)
358: {
359:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
360:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
361:   PetscErrorCode         ierr;
362:   PetscInt               ny,bs = in_from->bs;
363:   PetscMPIInt            size;

366:   MPI_Comm_size(PetscObjectComm((PetscObject)in),&size);

368:   out->ops->begin     = in->ops->begin;
369:   out->ops->end       = in->ops->end;
370:   out->ops->copy      = in->ops->copy;
371:   out->ops->destroy   = in->ops->destroy;
372:   out->ops->view      = in->ops->view;

374:   /* allocate entire send scatter context */
375:   PetscNewLog(out,&out_to);
376:   out_to->sharedwin       = MPI_WIN_NULL;
377:   PetscNewLog(out,&out_from);
378:   out_from->sharedwin       = MPI_WIN_NULL;

380:   ny                = in_to->starts[in_to->n];
381:   out_to->n         = in_to->n;
382:   out_to->format    = in_to->format;

384:   PetscMalloc1(out_to->n,&out_to->requests);
385:   PetscMalloc4(bs*ny,&out_to->values,ny,&out_to->indices,out_to->n+1,&out_to->starts,out_to->n,&out_to->procs);
386:   PetscMalloc2(PetscMax(in_to->n,in_from->n),&out_to->sstatus,PetscMax(in_to->n,in_from->n),&out_to->rstatus);
387:   PetscArraycpy(out_to->indices,in_to->indices,ny);
388:   PetscArraycpy(out_to->starts,in_to->starts,out_to->n+1);
389:   PetscArraycpy(out_to->procs,in_to->procs,out_to->n);

391:   out->todata                        = (void*)out_to;
392:   out_to->local.n                    = in_to->local.n;
393:   out_to->local.nonmatching_computed = PETSC_FALSE;
394:   out_to->local.n_nonmatching        = 0;
395:   out_to->local.slots_nonmatching    = NULL;
396:   if (in_to->local.n) {
397:     PetscMalloc1(in_to->local.n,&out_to->local.vslots);
398:     PetscMalloc1(in_from->local.n,&out_from->local.vslots);
399:     PetscArraycpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n);
400:     PetscArraycpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n);
401:   } else {
402:     out_to->local.vslots   = NULL;
403:     out_from->local.vslots = NULL;
404:   }

406:   /* allocate entire receive context */
407:   VecScatterMemcpyPlanCopy_PtoP(in_to,in_from,out_to,out_from);

409:   out_from->format    = in_from->format;
410:   ny                  = in_from->starts[in_from->n];
411:   out_from->n         = in_from->n;

413:   PetscMalloc1(out_from->n,&out_from->requests);
414:   PetscMalloc4(ny*bs,&out_from->values,ny,&out_from->indices,out_from->n+1,&out_from->starts,out_from->n,&out_from->procs);
415:   PetscArraycpy(out_from->indices,in_from->indices,ny);
416:   PetscArraycpy(out_from->starts,in_from->starts,out_from->n+1);
417:   PetscArraycpy(out_from->procs,in_from->procs,out_from->n);

419:   out->fromdata                        = (void*)out_from;
420:   out_from->local.n                    = in_from->local.n;
421:   out_from->local.nonmatching_computed = PETSC_FALSE;
422:   out_from->local.n_nonmatching        = 0;
423:   out_from->local.slots_nonmatching    = NULL;

425:   return(0);
426: }
427: /* --------------------------------------------------------------------------------------------------
428:     Packs and unpacks the message data into send or from receive buffers.

430:     These could be generated automatically.

432:     Fortran kernels etc. could be used.
433: */
434: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
435: {
436:   PetscInt i;
437:   for (i=0; i<n; i++) y[i] = x[indicesx[i]];
438: }

440: PETSC_STATIC_INLINE PetscErrorCode UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
441: {
442:   PetscInt i;

445:   switch (addv) {
446:   case INSERT_VALUES:
447:   case INSERT_ALL_VALUES:
448:     for (i=0; i<n; i++) y[indicesy[i]] = x[i];
449:     break;
450:   case ADD_VALUES:
451:   case ADD_ALL_VALUES:
452:     for (i=0; i<n; i++) y[indicesy[i]] += x[i];
453:     break;
454: #if !defined(PETSC_USE_COMPLEX)
455:   case MAX_VALUES:
456:     for (i=0; i<n; i++) y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
457: #else
458:   case MAX_VALUES:
459: #endif
460:   case NOT_SET_VALUES:
461:     break;
462:   default:
463:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
464:   }
465:   return(0);
466: }

468: PETSC_STATIC_INLINE PetscErrorCode Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
469: {
470:   PetscInt i;

473:   switch (addv) {
474:   case INSERT_VALUES:
475:   case INSERT_ALL_VALUES:
476:     for (i=0; i<n; i++) y[indicesy[i]] = x[indicesx[i]];
477:     break;
478:   case ADD_VALUES:
479:   case ADD_ALL_VALUES:
480:     for (i=0; i<n; i++) y[indicesy[i]] += x[indicesx[i]];
481:     break;
482: #if !defined(PETSC_USE_COMPLEX)
483:   case MAX_VALUES:
484:     for (i=0; i<n; i++) y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
485: #else
486:   case MAX_VALUES:
487: #endif
488:   case NOT_SET_VALUES:
489:     break;
490:   default:
491:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
492:   }
493:   return(0);
494: }

496: /* ----------------------------------------------------------------------------------------------- */
497: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
498: {
499:   PetscInt i,idx;

501:   for (i=0; i<n; i++) {
502:     idx  = *indicesx++;
503:     y[0] = x[idx];
504:     y[1] = x[idx+1];
505:     y   += 2;
506:   }
507: }

509: PETSC_STATIC_INLINE PetscErrorCode UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
510: {
511:   PetscInt i,idy;

514:   switch (addv) {
515:   case INSERT_VALUES:
516:   case INSERT_ALL_VALUES:
517:     for (i=0; i<n; i++) {
518:       idy      = *indicesy++;
519:       y[idy]   = x[0];
520:       y[idy+1] = x[1];
521:       x       += 2;
522:     }
523:     break;
524:   case ADD_VALUES:
525:   case ADD_ALL_VALUES:
526:     for (i=0; i<n; i++) {
527:       idy       = *indicesy++;
528:       y[idy]   += x[0];
529:       y[idy+1] += x[1];
530:       x        += 2;
531:     }
532:     break;
533: #if !defined(PETSC_USE_COMPLEX)
534:   case MAX_VALUES:
535:     for (i=0; i<n; i++) {
536:       idy      = *indicesy++;
537:       y[idy]   = PetscMax(y[idy],x[0]);
538:       y[idy+1] = PetscMax(y[idy+1],x[1]);
539:       x       += 2;
540:     }
541: #else
542:   case MAX_VALUES:
543: #endif
544:   case NOT_SET_VALUES:
545:     break;
546:   default:
547:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
548:   }
549:   return(0);
550: }

552: PETSC_STATIC_INLINE PetscErrorCode Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
553: {
554:   PetscInt i,idx,idy;

557:   switch (addv) {
558:   case INSERT_VALUES:
559:   case INSERT_ALL_VALUES:
560:     for (i=0; i<n; i++) {
561:       idx      = *indicesx++;
562:       idy      = *indicesy++;
563:       y[idy]   = x[idx];
564:       y[idy+1] = x[idx+1];
565:     }
566:     break;
567:   case ADD_VALUES:
568:   case ADD_ALL_VALUES:
569:     for (i=0; i<n; i++) {
570:       idx       = *indicesx++;
571:       idy       = *indicesy++;
572:       y[idy]   += x[idx];
573:       y[idy+1] += x[idx+1];
574:     }
575:     break;
576: #if !defined(PETSC_USE_COMPLEX)
577:   case MAX_VALUES:
578:     for (i=0; i<n; i++) {
579:       idx      = *indicesx++;
580:       idy      = *indicesy++;
581:       y[idy]   = PetscMax(y[idy],x[idx]);
582:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
583:     }
584: #else
585:   case MAX_VALUES:
586: #endif
587:   case NOT_SET_VALUES:
588:     break;
589:   default:
590:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
591:   }
592:   return(0);
593: }
594: /* ----------------------------------------------------------------------------------------------- */
595: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
596: {
597:   PetscInt i,idx;

599:   for (i=0; i<n; i++) {
600:     idx  = *indicesx++;
601:     y[0] = x[idx];
602:     y[1] = x[idx+1];
603:     y[2] = x[idx+2];
604:     y   += 3;
605:   }
606: }
607: PETSC_STATIC_INLINE PetscErrorCode UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
608: {
609:   PetscInt i,idy;

612:   switch (addv) {
613:   case INSERT_VALUES:
614:   case INSERT_ALL_VALUES:
615:     for (i=0; i<n; i++) {
616:       idy      = *indicesy++;
617:       y[idy]   = x[0];
618:       y[idy+1] = x[1];
619:       y[idy+2] = x[2];
620:       x       += 3;
621:     }
622:     break;
623:   case ADD_VALUES:
624:   case ADD_ALL_VALUES:
625:     for (i=0; i<n; i++) {
626:       idy       = *indicesy++;
627:       y[idy]   += x[0];
628:       y[idy+1] += x[1];
629:       y[idy+2] += x[2];
630:       x        += 3;
631:     }
632:     break;
633: #if !defined(PETSC_USE_COMPLEX)
634:   case MAX_VALUES:
635:     for (i=0; i<n; i++) {
636:       idy      = *indicesy++;
637:       y[idy]   = PetscMax(y[idy],x[0]);
638:       y[idy+1] = PetscMax(y[idy+1],x[1]);
639:       y[idy+2] = PetscMax(y[idy+2],x[2]);
640:       x       += 3;
641:     }
642: #else
643:   case MAX_VALUES:
644: #endif
645:   case NOT_SET_VALUES:
646:     break;
647:   default:
648:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
649:   }
650:   return(0);
651: }

653: PETSC_STATIC_INLINE PetscErrorCode Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
654: {
655:   PetscInt i,idx,idy;

658:   switch (addv) {
659:   case INSERT_VALUES:
660:   case INSERT_ALL_VALUES:
661:     for (i=0; i<n; i++) {
662:       idx      = *indicesx++;
663:       idy      = *indicesy++;
664:       y[idy]   = x[idx];
665:       y[idy+1] = x[idx+1];
666:       y[idy+2] = x[idx+2];
667:     }
668:     break;
669:   case ADD_VALUES:
670:   case ADD_ALL_VALUES:
671:     for (i=0; i<n; i++) {
672:       idx       = *indicesx++;
673:       idy       = *indicesy++;
674:       y[idy]   += x[idx];
675:       y[idy+1] += x[idx+1];
676:       y[idy+2] += x[idx+2];
677:     }
678:     break;
679: #if !defined(PETSC_USE_COMPLEX)
680:   case MAX_VALUES:
681:     for (i=0; i<n; i++) {
682:       idx      = *indicesx++;
683:       idy      = *indicesy++;
684:       y[idy]   = PetscMax(y[idy],x[idx]);
685:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
686:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
687:     }
688: #else
689:   case MAX_VALUES:
690: #endif
691:   case NOT_SET_VALUES:
692:     break;
693:   default:
694:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
695:   }
696:   return(0);
697: }
698: /* ----------------------------------------------------------------------------------------------- */
699: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
700: {
701:   PetscInt i,idx;

703:   for (i=0; i<n; i++) {
704:     idx  = *indicesx++;
705:     y[0] = x[idx];
706:     y[1] = x[idx+1];
707:     y[2] = x[idx+2];
708:     y[3] = x[idx+3];
709:     y   += 4;
710:   }
711: }
712: PETSC_STATIC_INLINE PetscErrorCode UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
713: {
714:   PetscInt i,idy;

717:   switch (addv) {
718:   case INSERT_VALUES:
719:   case INSERT_ALL_VALUES:
720:     for (i=0; i<n; i++) {
721:       idy      = *indicesy++;
722:       y[idy]   = x[0];
723:       y[idy+1] = x[1];
724:       y[idy+2] = x[2];
725:       y[idy+3] = x[3];
726:       x       += 4;
727:     }
728:     break;
729:   case ADD_VALUES:
730:   case ADD_ALL_VALUES:
731:     for (i=0; i<n; i++) {
732:       idy       = *indicesy++;
733:       y[idy]   += x[0];
734:       y[idy+1] += x[1];
735:       y[idy+2] += x[2];
736:       y[idy+3] += x[3];
737:       x        += 4;
738:     }
739:     break;
740: #if !defined(PETSC_USE_COMPLEX)
741:   case MAX_VALUES:
742:     for (i=0; i<n; i++) {
743:       idy      = *indicesy++;
744:       y[idy]   = PetscMax(y[idy],x[0]);
745:       y[idy+1] = PetscMax(y[idy+1],x[1]);
746:       y[idy+2] = PetscMax(y[idy+2],x[2]);
747:       y[idy+3] = PetscMax(y[idy+3],x[3]);
748:       x       += 4;
749:     }
750: #else
751:   case MAX_VALUES:
752: #endif
753:   case NOT_SET_VALUES:
754:     break;
755:   default:
756:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
757:   }
758:   return(0);
759: }

761: PETSC_STATIC_INLINE PetscErrorCode Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
762: {
763:   PetscInt i,idx,idy;

766:   switch (addv) {
767:   case INSERT_VALUES:
768:   case INSERT_ALL_VALUES:
769:     for (i=0; i<n; i++) {
770:       idx      = *indicesx++;
771:       idy      = *indicesy++;
772:       y[idy]   = x[idx];
773:       y[idy+1] = x[idx+1];
774:       y[idy+2] = x[idx+2];
775:       y[idy+3] = x[idx+3];
776:     }
777:     break;
778:   case ADD_VALUES:
779:   case ADD_ALL_VALUES:
780:     for (i=0; i<n; i++) {
781:       idx       = *indicesx++;
782:       idy       = *indicesy++;
783:       y[idy]   += x[idx];
784:       y[idy+1] += x[idx+1];
785:       y[idy+2] += x[idx+2];
786:       y[idy+3] += x[idx+3];
787:     }
788:     break;
789: #if !defined(PETSC_USE_COMPLEX)
790:   case MAX_VALUES:
791:     for (i=0; i<n; i++) {
792:       idx      = *indicesx++;
793:       idy      = *indicesy++;
794:       y[idy]   = PetscMax(y[idy],x[idx]);
795:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
796:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
797:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
798:     }
799: #else
800:   case MAX_VALUES:
801: #endif
802:   case NOT_SET_VALUES:
803:     break;
804:   default:
805:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
806:   }
807:   return(0);
808: }
809: /* ----------------------------------------------------------------------------------------------- */
810: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
811: {
812:   PetscInt i,idx;

814:   for (i=0; i<n; i++) {
815:     idx  = *indicesx++;
816:     y[0] = x[idx];
817:     y[1] = x[idx+1];
818:     y[2] = x[idx+2];
819:     y[3] = x[idx+3];
820:     y[4] = x[idx+4];
821:     y   += 5;
822:   }
823: }

825: PETSC_STATIC_INLINE PetscErrorCode UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
826: {
827:   PetscInt i,idy;

830:   switch (addv) {
831:   case INSERT_VALUES:
832:   case INSERT_ALL_VALUES:
833:     for (i=0; i<n; i++) {
834:       idy      = *indicesy++;
835:       y[idy]   = x[0];
836:       y[idy+1] = x[1];
837:       y[idy+2] = x[2];
838:       y[idy+3] = x[3];
839:       y[idy+4] = x[4];
840:       x       += 5;
841:     }
842:     break;
843:   case ADD_VALUES:
844:   case ADD_ALL_VALUES:
845:     for (i=0; i<n; i++) {
846:       idy       = *indicesy++;
847:       y[idy]   += x[0];
848:       y[idy+1] += x[1];
849:       y[idy+2] += x[2];
850:       y[idy+3] += x[3];
851:       y[idy+4] += x[4];
852:       x        += 5;
853:     }
854:     break;
855: #if !defined(PETSC_USE_COMPLEX)
856:   case MAX_VALUES:
857:     for (i=0; i<n; i++) {
858:       idy      = *indicesy++;
859:       y[idy]   = PetscMax(y[idy],x[0]);
860:       y[idy+1] = PetscMax(y[idy+1],x[1]);
861:       y[idy+2] = PetscMax(y[idy+2],x[2]);
862:       y[idy+3] = PetscMax(y[idy+3],x[3]);
863:       y[idy+4] = PetscMax(y[idy+4],x[4]);
864:       x       += 5;
865:     }
866: #else
867:   case MAX_VALUES:
868: #endif
869:   case NOT_SET_VALUES:
870:     break;
871:   default:
872:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
873:   }
874:   return(0);
875: }

877: PETSC_STATIC_INLINE PetscErrorCode Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
878: {
879:   PetscInt i,idx,idy;

882:   switch (addv) {
883:   case INSERT_VALUES:
884:   case INSERT_ALL_VALUES:
885:     for (i=0; i<n; i++) {
886:       idx      = *indicesx++;
887:       idy      = *indicesy++;
888:       y[idy]   = x[idx];
889:       y[idy+1] = x[idx+1];
890:       y[idy+2] = x[idx+2];
891:       y[idy+3] = x[idx+3];
892:       y[idy+4] = x[idx+4];
893:     }
894:     break;
895:   case ADD_VALUES:
896:   case ADD_ALL_VALUES:
897:     for (i=0; i<n; i++) {
898:       idx       = *indicesx++;
899:       idy       = *indicesy++;
900:       y[idy]   += x[idx];
901:       y[idy+1] += x[idx+1];
902:       y[idy+2] += x[idx+2];
903:       y[idy+3] += x[idx+3];
904:       y[idy+4] += x[idx+4];
905:     }
906:     break;
907: #if !defined(PETSC_USE_COMPLEX)
908:   case MAX_VALUES:
909:     for (i=0; i<n; i++) {
910:       idx      = *indicesx++;
911:       idy      = *indicesy++;
912:       y[idy]   = PetscMax(y[idy],x[idx]);
913:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
914:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
915:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
916:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
917:     }
918: #else
919:   case MAX_VALUES:
920: #endif
921:   case NOT_SET_VALUES:
922:     break;
923:   default:
924:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
925:   }
926:   return(0);
927: }
928: /* ----------------------------------------------------------------------------------------------- */
929: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
930: {
931:   PetscInt i,idx;

933:   for (i=0; i<n; i++) {
934:     idx  = *indicesx++;
935:     y[0] = x[idx];
936:     y[1] = x[idx+1];
937:     y[2] = x[idx+2];
938:     y[3] = x[idx+3];
939:     y[4] = x[idx+4];
940:     y[5] = x[idx+5];
941:     y   += 6;
942:   }
943: }

945: PETSC_STATIC_INLINE PetscErrorCode UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
946: {
947:   PetscInt i,idy;

950:   switch (addv) {
951:   case INSERT_VALUES:
952:   case INSERT_ALL_VALUES:
953:     for (i=0; i<n; i++) {
954:       idy      = *indicesy++;
955:       y[idy]   = x[0];
956:       y[idy+1] = x[1];
957:       y[idy+2] = x[2];
958:       y[idy+3] = x[3];
959:       y[idy+4] = x[4];
960:       y[idy+5] = x[5];
961:       x       += 6;
962:     }
963:     break;
964:   case ADD_VALUES:
965:   case ADD_ALL_VALUES:
966:     for (i=0; i<n; i++) {
967:       idy       = *indicesy++;
968:       y[idy]   += x[0];
969:       y[idy+1] += x[1];
970:       y[idy+2] += x[2];
971:       y[idy+3] += x[3];
972:       y[idy+4] += x[4];
973:       y[idy+5] += x[5];
974:       x        += 6;
975:     }
976:     break;
977: #if !defined(PETSC_USE_COMPLEX)
978:   case MAX_VALUES:
979:     for (i=0; i<n; i++) {
980:       idy      = *indicesy++;
981:       y[idy]   = PetscMax(y[idy],x[0]);
982:       y[idy+1] = PetscMax(y[idy+1],x[1]);
983:       y[idy+2] = PetscMax(y[idy+2],x[2]);
984:       y[idy+3] = PetscMax(y[idy+3],x[3]);
985:       y[idy+4] = PetscMax(y[idy+4],x[4]);
986:       y[idy+5] = PetscMax(y[idy+5],x[5]);
987:       x       += 6;
988:     }
989: #else
990:   case MAX_VALUES:
991: #endif
992:   case NOT_SET_VALUES:
993:     break;
994:   default:
995:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
996:   }
997:   return(0);
998: }

1000: PETSC_STATIC_INLINE PetscErrorCode Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1001: {
1002:   PetscInt i,idx,idy;

1005:   switch (addv) {
1006:   case INSERT_VALUES:
1007:   case INSERT_ALL_VALUES:
1008:     for (i=0; i<n; i++) {
1009:       idx      = *indicesx++;
1010:       idy      = *indicesy++;
1011:       y[idy]   = x[idx];
1012:       y[idy+1] = x[idx+1];
1013:       y[idy+2] = x[idx+2];
1014:       y[idy+3] = x[idx+3];
1015:       y[idy+4] = x[idx+4];
1016:       y[idy+5] = x[idx+5];
1017:     }
1018:     break;
1019:   case ADD_VALUES:
1020:   case ADD_ALL_VALUES:
1021:     for (i=0; i<n; i++) {
1022:       idx       = *indicesx++;
1023:       idy       = *indicesy++;
1024:       y[idy]   += x[idx];
1025:       y[idy+1] += x[idx+1];
1026:       y[idy+2] += x[idx+2];
1027:       y[idy+3] += x[idx+3];
1028:       y[idy+4] += x[idx+4];
1029:       y[idy+5] += x[idx+5];
1030:     }
1031:     break;
1032: #if !defined(PETSC_USE_COMPLEX)
1033:   case MAX_VALUES:
1034:     for (i=0; i<n; i++) {
1035:       idx      = *indicesx++;
1036:       idy      = *indicesy++;
1037:       y[idy]   = PetscMax(y[idy],x[idx]);
1038:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1039:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1040:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1041:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1042:       y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1043:     }
1044: #else
1045:   case MAX_VALUES:
1046: #endif
1047:   case NOT_SET_VALUES:
1048:     break;
1049:   default:
1050:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1051:   }
1052:   return(0);
1053: }
1054: /* ----------------------------------------------------------------------------------------------- */
1055: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1056: {
1057:   PetscInt i,idx;

1059:   for (i=0; i<n; i++) {
1060:     idx  = *indicesx++;
1061:     y[0] = x[idx];
1062:     y[1] = x[idx+1];
1063:     y[2] = x[idx+2];
1064:     y[3] = x[idx+3];
1065:     y[4] = x[idx+4];
1066:     y[5] = x[idx+5];
1067:     y[6] = x[idx+6];
1068:     y   += 7;
1069:   }
1070: }

1072: PETSC_STATIC_INLINE PetscErrorCode UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1073: {
1074:   PetscInt i,idy;

1077:   switch (addv) {
1078:   case INSERT_VALUES:
1079:   case INSERT_ALL_VALUES:
1080:     for (i=0; i<n; i++) {
1081:       idy      = *indicesy++;
1082:       y[idy]   = x[0];
1083:       y[idy+1] = x[1];
1084:       y[idy+2] = x[2];
1085:       y[idy+3] = x[3];
1086:       y[idy+4] = x[4];
1087:       y[idy+5] = x[5];
1088:       y[idy+6] = x[6];
1089:       x       += 7;
1090:     }
1091:     break;
1092:   case ADD_VALUES:
1093:   case ADD_ALL_VALUES:
1094:     for (i=0; i<n; i++) {
1095:       idy       = *indicesy++;
1096:       y[idy]   += x[0];
1097:       y[idy+1] += x[1];
1098:       y[idy+2] += x[2];
1099:       y[idy+3] += x[3];
1100:       y[idy+4] += x[4];
1101:       y[idy+5] += x[5];
1102:       y[idy+6] += x[6];
1103:       x        += 7;
1104:     }
1105:     break;
1106: #if !defined(PETSC_USE_COMPLEX)
1107:   case MAX_VALUES:
1108:     for (i=0; i<n; i++) {
1109:       idy      = *indicesy++;
1110:       y[idy]   = PetscMax(y[idy],x[0]);
1111:       y[idy+1] = PetscMax(y[idy+1],x[1]);
1112:       y[idy+2] = PetscMax(y[idy+2],x[2]);
1113:       y[idy+3] = PetscMax(y[idy+3],x[3]);
1114:       y[idy+4] = PetscMax(y[idy+4],x[4]);
1115:       y[idy+5] = PetscMax(y[idy+5],x[5]);
1116:       y[idy+6] = PetscMax(y[idy+6],x[6]);
1117:       x       += 7;
1118:     }
1119: #else
1120:   case MAX_VALUES:
1121: #endif
1122:   case NOT_SET_VALUES:
1123:     break;
1124:   default:
1125:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1126:   }
1127:   return(0);
1128: }

1130: PETSC_STATIC_INLINE PetscErrorCode Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1131: {
1132:   PetscInt i,idx,idy;

1135:   switch (addv) {
1136:   case INSERT_VALUES:
1137:   case INSERT_ALL_VALUES:
1138:     for (i=0; i<n; i++) {
1139:       idx      = *indicesx++;
1140:       idy      = *indicesy++;
1141:       y[idy]   = x[idx];
1142:       y[idy+1] = x[idx+1];
1143:       y[idy+2] = x[idx+2];
1144:       y[idy+3] = x[idx+3];
1145:       y[idy+4] = x[idx+4];
1146:       y[idy+5] = x[idx+5];
1147:       y[idy+6] = x[idx+6];
1148:     }
1149:     break;
1150:   case ADD_VALUES:
1151:   case ADD_ALL_VALUES:
1152:     for (i=0; i<n; i++) {
1153:       idx       = *indicesx++;
1154:       idy       = *indicesy++;
1155:       y[idy]   += x[idx];
1156:       y[idy+1] += x[idx+1];
1157:       y[idy+2] += x[idx+2];
1158:       y[idy+3] += x[idx+3];
1159:       y[idy+4] += x[idx+4];
1160:       y[idy+5] += x[idx+5];
1161:       y[idy+6] += x[idx+6];
1162:     }
1163:     break;
1164: #if !defined(PETSC_USE_COMPLEX)
1165:   case MAX_VALUES:
1166:     for (i=0; i<n; i++) {
1167:       idx      = *indicesx++;
1168:       idy      = *indicesy++;
1169:       y[idy]   = PetscMax(y[idy],x[idx]);
1170:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1171:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1172:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1173:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1174:       y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1175:       y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1176:     }
1177: #else
1178:   case MAX_VALUES:
1179: #endif
1180:   case NOT_SET_VALUES:
1181:     break;
1182:   default:
1183:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1184:   }
1185:   return(0);
1186: }
1187: /* ----------------------------------------------------------------------------------------------- */
1188: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1189: {
1190:   PetscInt i,idx;

1192:   for (i=0; i<n; i++) {
1193:     idx  = *indicesx++;
1194:     y[0] = x[idx];
1195:     y[1] = x[idx+1];
1196:     y[2] = x[idx+2];
1197:     y[3] = x[idx+3];
1198:     y[4] = x[idx+4];
1199:     y[5] = x[idx+5];
1200:     y[6] = x[idx+6];
1201:     y[7] = x[idx+7];
1202:     y   += 8;
1203:   }
1204: }

1206: PETSC_STATIC_INLINE PetscErrorCode UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1207: {
1208:   PetscInt i,idy;

1211:   switch (addv) {
1212:   case INSERT_VALUES:
1213:   case INSERT_ALL_VALUES:
1214:     for (i=0; i<n; i++) {
1215:       idy      = *indicesy++;
1216:       y[idy]   = x[0];
1217:       y[idy+1] = x[1];
1218:       y[idy+2] = x[2];
1219:       y[idy+3] = x[3];
1220:       y[idy+4] = x[4];
1221:       y[idy+5] = x[5];
1222:       y[idy+6] = x[6];
1223:       y[idy+7] = x[7];
1224:       x       += 8;
1225:     }
1226:     break;
1227:   case ADD_VALUES:
1228:   case ADD_ALL_VALUES:
1229:     for (i=0; i<n; i++) {
1230:       idy       = *indicesy++;
1231:       y[idy]   += x[0];
1232:       y[idy+1] += x[1];
1233:       y[idy+2] += x[2];
1234:       y[idy+3] += x[3];
1235:       y[idy+4] += x[4];
1236:       y[idy+5] += x[5];
1237:       y[idy+6] += x[6];
1238:       y[idy+7] += x[7];
1239:       x        += 8;
1240:     }
1241:     break;
1242: #if !defined(PETSC_USE_COMPLEX)
1243:   case MAX_VALUES:
1244:     for (i=0; i<n; i++) {
1245:       idy      = *indicesy++;
1246:       y[idy]   = PetscMax(y[idy],x[0]);
1247:       y[idy+1] = PetscMax(y[idy+1],x[1]);
1248:       y[idy+2] = PetscMax(y[idy+2],x[2]);
1249:       y[idy+3] = PetscMax(y[idy+3],x[3]);
1250:       y[idy+4] = PetscMax(y[idy+4],x[4]);
1251:       y[idy+5] = PetscMax(y[idy+5],x[5]);
1252:       y[idy+6] = PetscMax(y[idy+6],x[6]);
1253:       y[idy+7] = PetscMax(y[idy+7],x[7]);
1254:       x       += 8;
1255:     }
1256: #else
1257:   case MAX_VALUES:
1258: #endif
1259:   case NOT_SET_VALUES:
1260:     break;
1261:   default:
1262:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1263:   }
1264:   return(0);
1265: }

1267: PETSC_STATIC_INLINE PetscErrorCode Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1268: {
1269:   PetscInt i,idx,idy;

1272:   switch (addv) {
1273:   case INSERT_VALUES:
1274:   case INSERT_ALL_VALUES:
1275:     for (i=0; i<n; i++) {
1276:       idx      = *indicesx++;
1277:       idy      = *indicesy++;
1278:       y[idy]   = x[idx];
1279:       y[idy+1] = x[idx+1];
1280:       y[idy+2] = x[idx+2];
1281:       y[idy+3] = x[idx+3];
1282:       y[idy+4] = x[idx+4];
1283:       y[idy+5] = x[idx+5];
1284:       y[idy+6] = x[idx+6];
1285:       y[idy+7] = x[idx+7];
1286:     }
1287:     break;
1288:   case ADD_VALUES:
1289:   case ADD_ALL_VALUES:
1290:     for (i=0; i<n; i++) {
1291:       idx       = *indicesx++;
1292:       idy       = *indicesy++;
1293:       y[idy]   += x[idx];
1294:       y[idy+1] += x[idx+1];
1295:       y[idy+2] += x[idx+2];
1296:       y[idy+3] += x[idx+3];
1297:       y[idy+4] += x[idx+4];
1298:       y[idy+5] += x[idx+5];
1299:       y[idy+6] += x[idx+6];
1300:       y[idy+7] += x[idx+7];
1301:     }
1302:     break;
1303: #if !defined(PETSC_USE_COMPLEX)
1304:   case MAX_VALUES:
1305:     for (i=0; i<n; i++) {
1306:       idx      = *indicesx++;
1307:       idy      = *indicesy++;
1308:       y[idy]   = PetscMax(y[idy],x[idx]);
1309:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1310:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1311:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1312:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1313:       y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1314:       y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1315:       y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1316:     }
1317: #else
1318:   case MAX_VALUES:
1319: #endif
1320:   case NOT_SET_VALUES:
1321:     break;
1322:   default:
1323:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1324:   }
1325:   return(0);
1326: }

1328: PETSC_STATIC_INLINE void Pack_9(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1329: {
1330:   PetscInt i,idx;

1332:   for (i=0; i<n; i++) {
1333:     idx   = *indicesx++;
1334:     y[0]  = x[idx];
1335:     y[1]  = x[idx+1];
1336:     y[2]  = x[idx+2];
1337:     y[3]  = x[idx+3];
1338:     y[4]  = x[idx+4];
1339:     y[5]  = x[idx+5];
1340:     y[6]  = x[idx+6];
1341:     y[7]  = x[idx+7];
1342:     y[8]  = x[idx+8];
1343:     y    += 9;
1344:   }
1345: }

1347: PETSC_STATIC_INLINE PetscErrorCode UnPack_9(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1348: {
1349:   PetscInt i,idy;

1352:   switch (addv) {
1353:   case INSERT_VALUES:
1354:   case INSERT_ALL_VALUES:
1355:     for (i=0; i<n; i++) {
1356:       idy       = *indicesy++;
1357:       y[idy]    = x[0];
1358:       y[idy+1]  = x[1];
1359:       y[idy+2]  = x[2];
1360:       y[idy+3]  = x[3];
1361:       y[idy+4]  = x[4];
1362:       y[idy+5]  = x[5];
1363:       y[idy+6]  = x[6];
1364:       y[idy+7]  = x[7];
1365:       y[idy+8]  = x[8];
1366:       x        += 9;
1367:     }
1368:     break;
1369:   case ADD_VALUES:
1370:   case ADD_ALL_VALUES:
1371:     for (i=0; i<n; i++) {
1372:       idy        = *indicesy++;
1373:       y[idy]    += x[0];
1374:       y[idy+1]  += x[1];
1375:       y[idy+2]  += x[2];
1376:       y[idy+3]  += x[3];
1377:       y[idy+4]  += x[4];
1378:       y[idy+5]  += x[5];
1379:       y[idy+6]  += x[6];
1380:       y[idy+7]  += x[7];
1381:       y[idy+8]  += x[8];
1382:       x         += 9;
1383:     }
1384:     break;
1385: #if !defined(PETSC_USE_COMPLEX)
1386:   case MAX_VALUES:
1387:     for (i=0; i<n; i++) {
1388:       idy       = *indicesy++;
1389:       y[idy]    = PetscMax(y[idy],x[0]);
1390:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1391:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1392:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1393:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1394:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1395:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1396:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1397:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1398:       x        += 9;
1399:     }
1400: #else
1401:   case MAX_VALUES:
1402: #endif
1403:   case NOT_SET_VALUES:
1404:     break;
1405:   default:
1406:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1407:   }
1408:   return(0);
1409: }

1411: PETSC_STATIC_INLINE PetscErrorCode Scatter_9(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1412: {
1413:   PetscInt i,idx,idy;

1416:   switch (addv) {
1417:   case INSERT_VALUES:
1418:   case INSERT_ALL_VALUES:
1419:     for (i=0; i<n; i++) {
1420:       idx       = *indicesx++;
1421:       idy       = *indicesy++;
1422:       y[idy]    = x[idx];
1423:       y[idy+1]  = x[idx+1];
1424:       y[idy+2]  = x[idx+2];
1425:       y[idy+3]  = x[idx+3];
1426:       y[idy+4]  = x[idx+4];
1427:       y[idy+5]  = x[idx+5];
1428:       y[idy+6]  = x[idx+6];
1429:       y[idy+7]  = x[idx+7];
1430:       y[idy+8]  = x[idx+8];
1431:     }
1432:     break;
1433:   case ADD_VALUES:
1434:   case ADD_ALL_VALUES:
1435:     for (i=0; i<n; i++) {
1436:       idx        = *indicesx++;
1437:       idy        = *indicesy++;
1438:       y[idy]    += x[idx];
1439:       y[idy+1]  += x[idx+1];
1440:       y[idy+2]  += x[idx+2];
1441:       y[idy+3]  += x[idx+3];
1442:       y[idy+4]  += x[idx+4];
1443:       y[idy+5]  += x[idx+5];
1444:       y[idy+6]  += x[idx+6];
1445:       y[idy+7]  += x[idx+7];
1446:       y[idy+8]  += x[idx+8];
1447:     }
1448:     break;
1449: #if !defined(PETSC_USE_COMPLEX)
1450:   case MAX_VALUES:
1451:     for (i=0; i<n; i++) {
1452:       idx       = *indicesx++;
1453:       idy       = *indicesy++;
1454:       y[idy]    = PetscMax(y[idy],x[idx]);
1455:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1456:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1457:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1458:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1459:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1460:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1461:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1462:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1463:     }
1464: #else
1465:   case MAX_VALUES:
1466: #endif
1467:   case NOT_SET_VALUES:
1468:     break;
1469:   default:
1470:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1471:   }
1472:   return(0);
1473: }

1475: PETSC_STATIC_INLINE void Pack_10(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1476: {
1477:   PetscInt i,idx;

1479:   for (i=0; i<n; i++) {
1480:     idx   = *indicesx++;
1481:     y[0]  = x[idx];
1482:     y[1]  = x[idx+1];
1483:     y[2]  = x[idx+2];
1484:     y[3]  = x[idx+3];
1485:     y[4]  = x[idx+4];
1486:     y[5]  = x[idx+5];
1487:     y[6]  = x[idx+6];
1488:     y[7]  = x[idx+7];
1489:     y[8]  = x[idx+8];
1490:     y[9]  = x[idx+9];
1491:     y    += 10;
1492:   }
1493: }

1495: PETSC_STATIC_INLINE PetscErrorCode UnPack_10(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1496: {
1497:   PetscInt i,idy;

1500:   switch (addv) {
1501:   case INSERT_VALUES:
1502:   case INSERT_ALL_VALUES:
1503:     for (i=0; i<n; i++) {
1504:       idy       = *indicesy++;
1505:       y[idy]    = x[0];
1506:       y[idy+1]  = x[1];
1507:       y[idy+2]  = x[2];
1508:       y[idy+3]  = x[3];
1509:       y[idy+4]  = x[4];
1510:       y[idy+5]  = x[5];
1511:       y[idy+6]  = x[6];
1512:       y[idy+7]  = x[7];
1513:       y[idy+8]  = x[8];
1514:       y[idy+9]  = x[9];
1515:       x        += 10;
1516:     }
1517:     break;
1518:   case ADD_VALUES:
1519:   case ADD_ALL_VALUES:
1520:     for (i=0; i<n; i++) {
1521:       idy        = *indicesy++;
1522:       y[idy]    += x[0];
1523:       y[idy+1]  += x[1];
1524:       y[idy+2]  += x[2];
1525:       y[idy+3]  += x[3];
1526:       y[idy+4]  += x[4];
1527:       y[idy+5]  += x[5];
1528:       y[idy+6]  += x[6];
1529:       y[idy+7]  += x[7];
1530:       y[idy+8]  += x[8];
1531:       y[idy+9]  += x[9];
1532:       x         += 10;
1533:     }
1534:     break;
1535: #if !defined(PETSC_USE_COMPLEX)
1536:   case MAX_VALUES:
1537:     for (i=0; i<n; i++) {
1538:       idy       = *indicesy++;
1539:       y[idy]    = PetscMax(y[idy],x[0]);
1540:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1541:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1542:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1543:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1544:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1545:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1546:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1547:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1548:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1549:       x        += 10;
1550:     }
1551: #else
1552:   case MAX_VALUES:
1553: #endif
1554:   case NOT_SET_VALUES:
1555:     break;
1556:   default:
1557:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1558:   }
1559:   return(0);
1560: }

1562: PETSC_STATIC_INLINE PetscErrorCode Scatter_10(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1563: {
1564:   PetscInt i,idx,idy;

1567:   switch (addv) {
1568:   case INSERT_VALUES:
1569:   case INSERT_ALL_VALUES:
1570:     for (i=0; i<n; i++) {
1571:       idx       = *indicesx++;
1572:       idy       = *indicesy++;
1573:       y[idy]    = x[idx];
1574:       y[idy+1]  = x[idx+1];
1575:       y[idy+2]  = x[idx+2];
1576:       y[idy+3]  = x[idx+3];
1577:       y[idy+4]  = x[idx+4];
1578:       y[idy+5]  = x[idx+5];
1579:       y[idy+6]  = x[idx+6];
1580:       y[idy+7]  = x[idx+7];
1581:       y[idy+8]  = x[idx+8];
1582:       y[idy+9]  = x[idx+9];
1583:     }
1584:     break;
1585:   case ADD_VALUES:
1586:   case ADD_ALL_VALUES:
1587:     for (i=0; i<n; i++) {
1588:       idx        = *indicesx++;
1589:       idy        = *indicesy++;
1590:       y[idy]    += x[idx];
1591:       y[idy+1]  += x[idx+1];
1592:       y[idy+2]  += x[idx+2];
1593:       y[idy+3]  += x[idx+3];
1594:       y[idy+4]  += x[idx+4];
1595:       y[idy+5]  += x[idx+5];
1596:       y[idy+6]  += x[idx+6];
1597:       y[idy+7]  += x[idx+7];
1598:       y[idy+8]  += x[idx+8];
1599:       y[idy+9]  += x[idx+9];
1600:     }
1601:     break;
1602: #if !defined(PETSC_USE_COMPLEX)
1603:   case MAX_VALUES:
1604:     for (i=0; i<n; i++) {
1605:       idx       = *indicesx++;
1606:       idy       = *indicesy++;
1607:       y[idy]    = PetscMax(y[idy],x[idx]);
1608:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1609:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1610:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1611:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1612:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1613:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1614:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1615:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1616:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1617:     }
1618: #else
1619:   case MAX_VALUES:
1620: #endif
1621:   case NOT_SET_VALUES:
1622:     break;
1623:   default:
1624:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1625:   }
1626:   return(0);
1627: }

1629: PETSC_STATIC_INLINE void Pack_11(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1630: {
1631:   PetscInt i,idx;

1633:   for (i=0; i<n; i++) {
1634:     idx   = *indicesx++;
1635:     y[0]  = x[idx];
1636:     y[1]  = x[idx+1];
1637:     y[2]  = x[idx+2];
1638:     y[3]  = x[idx+3];
1639:     y[4]  = x[idx+4];
1640:     y[5]  = x[idx+5];
1641:     y[6]  = x[idx+6];
1642:     y[7]  = x[idx+7];
1643:     y[8]  = x[idx+8];
1644:     y[9]  = x[idx+9];
1645:     y[10] = x[idx+10];
1646:     y    += 11;
1647:   }
1648: }

1650: PETSC_STATIC_INLINE PetscErrorCode UnPack_11(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1651: {
1652:   PetscInt i,idy;

1655:   switch (addv) {
1656:   case INSERT_VALUES:
1657:   case INSERT_ALL_VALUES:
1658:     for (i=0; i<n; i++) {
1659:       idy       = *indicesy++;
1660:       y[idy]    = x[0];
1661:       y[idy+1]  = x[1];
1662:       y[idy+2]  = x[2];
1663:       y[idy+3]  = x[3];
1664:       y[idy+4]  = x[4];
1665:       y[idy+5]  = x[5];
1666:       y[idy+6]  = x[6];
1667:       y[idy+7]  = x[7];
1668:       y[idy+8]  = x[8];
1669:       y[idy+9]  = x[9];
1670:       y[idy+10] = x[10];
1671:       x        += 11;
1672:     }
1673:     break;
1674:   case ADD_VALUES:
1675:   case ADD_ALL_VALUES:
1676:     for (i=0; i<n; i++) {
1677:       idy        = *indicesy++;
1678:       y[idy]    += x[0];
1679:       y[idy+1]  += x[1];
1680:       y[idy+2]  += x[2];
1681:       y[idy+3]  += x[3];
1682:       y[idy+4]  += x[4];
1683:       y[idy+5]  += x[5];
1684:       y[idy+6]  += x[6];
1685:       y[idy+7]  += x[7];
1686:       y[idy+8]  += x[8];
1687:       y[idy+9]  += x[9];
1688:       y[idy+10] += x[10];
1689:       x         += 11;
1690:     }
1691:     break;
1692: #if !defined(PETSC_USE_COMPLEX)
1693:   case MAX_VALUES:
1694:     for (i=0; i<n; i++) {
1695:       idy       = *indicesy++;
1696:       y[idy]    = PetscMax(y[idy],x[0]);
1697:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1698:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1699:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1700:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1701:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1702:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1703:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1704:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1705:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1706:       y[idy+10] = PetscMax(y[idy+10],x[10]);
1707:       x        += 11;
1708:     }
1709: #else
1710:   case MAX_VALUES:
1711: #endif
1712:   case NOT_SET_VALUES:
1713:     break;
1714:   default:
1715:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1716:   }
1717:   return(0);
1718: }

1720: PETSC_STATIC_INLINE PetscErrorCode Scatter_11(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1721: {
1722:   PetscInt i,idx,idy;

1725:   switch (addv) {
1726:   case INSERT_VALUES:
1727:   case INSERT_ALL_VALUES:
1728:     for (i=0; i<n; i++) {
1729:       idx       = *indicesx++;
1730:       idy       = *indicesy++;
1731:       y[idy]    = x[idx];
1732:       y[idy+1]  = x[idx+1];
1733:       y[idy+2]  = x[idx+2];
1734:       y[idy+3]  = x[idx+3];
1735:       y[idy+4]  = x[idx+4];
1736:       y[idy+5]  = x[idx+5];
1737:       y[idy+6]  = x[idx+6];
1738:       y[idy+7]  = x[idx+7];
1739:       y[idy+8]  = x[idx+8];
1740:       y[idy+9]  = x[idx+9];
1741:       y[idy+10] = x[idx+10];
1742:     }
1743:     break;
1744:   case ADD_VALUES:
1745:   case ADD_ALL_VALUES:
1746:     for (i=0; i<n; i++) {
1747:       idx        = *indicesx++;
1748:       idy        = *indicesy++;
1749:       y[idy]    += x[idx];
1750:       y[idy+1]  += x[idx+1];
1751:       y[idy+2]  += x[idx+2];
1752:       y[idy+3]  += x[idx+3];
1753:       y[idy+4]  += x[idx+4];
1754:       y[idy+5]  += x[idx+5];
1755:       y[idy+6]  += x[idx+6];
1756:       y[idy+7]  += x[idx+7];
1757:       y[idy+8]  += x[idx+8];
1758:       y[idy+9]  += x[idx+9];
1759:       y[idy+10] += x[idx+10];
1760:     }
1761:     break;
1762: #if !defined(PETSC_USE_COMPLEX)
1763:   case MAX_VALUES:
1764:     for (i=0; i<n; i++) {
1765:       idx       = *indicesx++;
1766:       idy       = *indicesy++;
1767:       y[idy]    = PetscMax(y[idy],x[idx]);
1768:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1769:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1770:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1771:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1772:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1773:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1774:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1775:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1776:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1777:       y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1778:     }
1779: #else
1780:   case MAX_VALUES:
1781: #endif
1782:   case NOT_SET_VALUES:
1783:     break;
1784:   default:
1785:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1786:   }
1787:   return(0);
1788: }

1790: /* ----------------------------------------------------------------------------------------------- */
1791: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1792: {
1793:   PetscInt i,idx;

1795:   for (i=0; i<n; i++) {
1796:     idx   = *indicesx++;
1797:     y[0]  = x[idx];
1798:     y[1]  = x[idx+1];
1799:     y[2]  = x[idx+2];
1800:     y[3]  = x[idx+3];
1801:     y[4]  = x[idx+4];
1802:     y[5]  = x[idx+5];
1803:     y[6]  = x[idx+6];
1804:     y[7]  = x[idx+7];
1805:     y[8]  = x[idx+8];
1806:     y[9]  = x[idx+9];
1807:     y[10] = x[idx+10];
1808:     y[11] = x[idx+11];
1809:     y    += 12;
1810:   }
1811: }

1813: PETSC_STATIC_INLINE PetscErrorCode UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1814: {
1815:   PetscInt i,idy;

1818:   switch (addv) {
1819:   case INSERT_VALUES:
1820:   case INSERT_ALL_VALUES:
1821:     for (i=0; i<n; i++) {
1822:       idy       = *indicesy++;
1823:       y[idy]    = x[0];
1824:       y[idy+1]  = x[1];
1825:       y[idy+2]  = x[2];
1826:       y[idy+3]  = x[3];
1827:       y[idy+4]  = x[4];
1828:       y[idy+5]  = x[5];
1829:       y[idy+6]  = x[6];
1830:       y[idy+7]  = x[7];
1831:       y[idy+8]  = x[8];
1832:       y[idy+9]  = x[9];
1833:       y[idy+10] = x[10];
1834:       y[idy+11] = x[11];
1835:       x        += 12;
1836:     }
1837:     break;
1838:   case ADD_VALUES:
1839:   case ADD_ALL_VALUES:
1840:     for (i=0; i<n; i++) {
1841:       idy        = *indicesy++;
1842:       y[idy]    += x[0];
1843:       y[idy+1]  += x[1];
1844:       y[idy+2]  += x[2];
1845:       y[idy+3]  += x[3];
1846:       y[idy+4]  += x[4];
1847:       y[idy+5]  += x[5];
1848:       y[idy+6]  += x[6];
1849:       y[idy+7]  += x[7];
1850:       y[idy+8]  += x[8];
1851:       y[idy+9]  += x[9];
1852:       y[idy+10] += x[10];
1853:       y[idy+11] += x[11];
1854:       x         += 12;
1855:     }
1856:     break;
1857: #if !defined(PETSC_USE_COMPLEX)
1858:   case MAX_VALUES:
1859:     for (i=0; i<n; i++) {
1860:       idy       = *indicesy++;
1861:       y[idy]    = PetscMax(y[idy],x[0]);
1862:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1863:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1864:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1865:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1866:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1867:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1868:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1869:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1870:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1871:       y[idy+10] = PetscMax(y[idy+10],x[10]);
1872:       y[idy+11] = PetscMax(y[idy+11],x[11]);
1873:       x        += 12;
1874:     }
1875: #else
1876:   case MAX_VALUES:
1877: #endif
1878:   case NOT_SET_VALUES:
1879:     break;
1880:   default:
1881:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1882:   }
1883:   return(0);
1884: }

1886: PETSC_STATIC_INLINE PetscErrorCode Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1887: {
1888:   PetscInt i,idx,idy;

1891:   switch (addv) {
1892:   case INSERT_VALUES:
1893:   case INSERT_ALL_VALUES:
1894:     for (i=0; i<n; i++) {
1895:       idx       = *indicesx++;
1896:       idy       = *indicesy++;
1897:       y[idy]    = x[idx];
1898:       y[idy+1]  = x[idx+1];
1899:       y[idy+2]  = x[idx+2];
1900:       y[idy+3]  = x[idx+3];
1901:       y[idy+4]  = x[idx+4];
1902:       y[idy+5]  = x[idx+5];
1903:       y[idy+6]  = x[idx+6];
1904:       y[idy+7]  = x[idx+7];
1905:       y[idy+8]  = x[idx+8];
1906:       y[idy+9]  = x[idx+9];
1907:       y[idy+10] = x[idx+10];
1908:       y[idy+11] = x[idx+11];
1909:     }
1910:     break;
1911:   case ADD_VALUES:
1912:   case ADD_ALL_VALUES:
1913:     for (i=0; i<n; i++) {
1914:       idx        = *indicesx++;
1915:       idy        = *indicesy++;
1916:       y[idy]    += x[idx];
1917:       y[idy+1]  += x[idx+1];
1918:       y[idy+2]  += x[idx+2];
1919:       y[idy+3]  += x[idx+3];
1920:       y[idy+4]  += x[idx+4];
1921:       y[idy+5]  += x[idx+5];
1922:       y[idy+6]  += x[idx+6];
1923:       y[idy+7]  += x[idx+7];
1924:       y[idy+8]  += x[idx+8];
1925:       y[idy+9]  += x[idx+9];
1926:       y[idy+10] += x[idx+10];
1927:       y[idy+11] += x[idx+11];
1928:     }
1929:     break;
1930: #if !defined(PETSC_USE_COMPLEX)
1931:   case MAX_VALUES:
1932:     for (i=0; i<n; i++) {
1933:       idx       = *indicesx++;
1934:       idy       = *indicesy++;
1935:       y[idy]    = PetscMax(y[idy],x[idx]);
1936:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1937:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1938:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1939:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1940:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1941:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1942:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1943:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1944:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1945:       y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1946:       y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
1947:     }
1948: #else
1949:   case MAX_VALUES:
1950: #endif
1951:   case NOT_SET_VALUES:
1952:     break;
1953:   default:
1954:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1955:   }
1956:   return(0);
1957: }

1959: /* ----------------------------------------------------------------------------------------------- */
1960: PETSC_STATIC_INLINE void Pack_bs(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1961: {
1962:   PetscInt       i,idx;

1965:   for (i=0; i<n; i++) {
1966:     idx   = *indicesx++;
1967:     PetscArraycpy(y,x + idx,bs);CHKERRV(ierr);
1968:     y    += bs;
1969:   }
1970: }

1972: PETSC_STATIC_INLINE PetscErrorCode UnPack_bs(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1973: {
1974:   PetscInt i,idy,j;

1978:   switch (addv) {
1979:   case INSERT_VALUES:
1980:   case INSERT_ALL_VALUES:
1981:     for (i=0; i<n; i++) {
1982:       idy       = *indicesy++;
1983:       PetscArraycpy(y + idy,x,bs);
1984:       x        += bs;
1985:     }
1986:     break;
1987:   case ADD_VALUES:
1988:   case ADD_ALL_VALUES:
1989:     for (i=0; i<n; i++) {
1990:       idy        = *indicesy++;
1991:       for (j=0; j<bs; j++) y[idy+j] += x[j];
1992:       x         += bs;
1993:     }
1994:     break;
1995: #if !defined(PETSC_USE_COMPLEX)
1996:   case MAX_VALUES:
1997:     for (i=0; i<n; i++) {
1998:       idy = *indicesy++;
1999:       for (j=0; j<bs; j++) y[idy+j] = PetscMax(y[idy+j],x[j]);
2000:       x  += bs;
2001:     }
2002: #else
2003:   case MAX_VALUES:
2004: #endif
2005:   case NOT_SET_VALUES:
2006:     break;
2007:   default:
2008:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
2009:   }
2010:   return(0);
2011: }

2013: PETSC_STATIC_INLINE PetscErrorCode Scatter_bs(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
2014: {
2015:   PetscInt i,idx,idy,j;

2019:   switch (addv) {
2020:   case INSERT_VALUES:
2021:   case INSERT_ALL_VALUES:
2022:     for (i=0; i<n; i++) {
2023:       idx       = *indicesx++;
2024:       idy       = *indicesy++;
2025:       PetscArraycpy(y + idy, x + idx,bs);
2026:     }
2027:     break;
2028:   case ADD_VALUES:
2029:   case ADD_ALL_VALUES:
2030:     for (i=0; i<n; i++) {
2031:       idx        = *indicesx++;
2032:       idy        = *indicesy++;
2033:       for (j=0; j<bs; j++)  y[idy+j] += x[idx+j];
2034:     }
2035:     break;
2036: #if !defined(PETSC_USE_COMPLEX)
2037:   case MAX_VALUES:
2038:     for (i=0; i<n; i++) {
2039:       idx       = *indicesx++;
2040:       idy       = *indicesy++;
2041:       for (j=0; j<bs; j++)  y[idy+j] = PetscMax(y[idy+j],x[idx+j]);
2042:     }
2043: #else
2044:   case MAX_VALUES:
2045: #endif
2046:   case NOT_SET_VALUES:
2047:     break;
2048:   default:
2049:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
2050:   }
2051:   return(0);
2052: }

2054: /* Create the VecScatterBegin/End_P for our chosen block sizes */
2055: #define BS 1
2056: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2057: #define BS 2
2058: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2059: #define BS 3
2060: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2061: #define BS 4
2062: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2063: #define BS 5
2064: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2065: #define BS 6
2066: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2067: #define BS 7
2068: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2069: #define BS 8
2070: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2071: #define BS 9
2072: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2073: #define BS 10
2074: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2075: #define BS 11
2076: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2077: #define BS 12
2078: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2079: #define BS bs
2080: #include <../src/vec/vscat/impls/mpi3/vpscat.h>

2082: /* ==========================================================================================*/
2083: /*              create parallel to sequential scatter context                                */

2085: extern PetscErrorCode VecScatterCreateCommon_PtoS_MPI3(VecScatter_MPI_General*,VecScatter_MPI_General*,VecScatter);

2087: /*
2088:    bs indicates how many elements there are in each block. Normally this would be 1.

2090:    contains check that PetscMPIInt can handle the sizes needed
2091: */
2092: #include <petscbt.h>
2093: PetscErrorCode VecScatterCreateLocal_PtoS_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2094: {
2095:   VecScatter_MPI_General *from,*to;
2096:   PetscMPIInt            size,rank,mrank,imdex,tag,n;
2097:   PetscInt               *source = NULL,*owners = NULL,nxr;
2098:   PetscInt               *lowner = NULL,*start = NULL,*lsharedowner = NULL,*sharedstart = NULL,lengthy,lengthx;
2099:   PetscMPIInt            *nprocs = NULL,nrecvs,nrecvshared;
2100:   PetscInt               i,j,idx = 0,nsends;
2101:   PetscMPIInt            *owner = NULL;
2102:   PetscInt               *starts = NULL,count,slen,slenshared;
2103:   PetscInt               *rvalues,*svalues,base,*values,nprocslocal,recvtotal,*rsvalues;
2104:   PetscMPIInt            *onodes1,*olengths1;
2105:   MPI_Comm               comm;
2106:   MPI_Request            *send_waits = NULL,*recv_waits = NULL;
2107:   MPI_Status             recv_status,*send_status;
2108:   PetscErrorCode         ierr;
2109:   PetscShmComm           scomm;
2110:   PetscMPIInt            jj;
2111:   MPI_Info               info;
2112:   MPI_Comm               mscomm;
2113:   MPI_Win                sharedoffsetwin;  /* Window that owns sharedspaceoffset */
2114:   PetscInt               *sharedspaceoffset;
2115:   VecScatterType         type;
2116:   PetscBool              mpi3node;
2118:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2119:   PetscObjectGetComm((PetscObject)xin,&comm);
2120:   MPI_Comm_rank(comm,&rank);
2121:   MPI_Comm_size(comm,&size);

2123:   VecScatterGetType(ctx,&type);
2124:   PetscStrcmp(type,"mpi3node",&mpi3node);

2126:   PetscShmCommGet(comm,&scomm);
2127:   PetscShmCommGetMpiShmComm(scomm,&mscomm);

2129:   MPI_Info_create(&info);
2130:   MPI_Info_set(info, "alloc_shared_noncontig", "true");

2132:   if (mpi3node) {
2133:     /* Check if (parallel) inidx has duplicate indices.
2134:      Current VecScatterEndMPI3Node() (case StoP) writes the sequential vector to the parallel vector.
2135:      Writing to the same shared location without variable locking
2136:      leads to incorrect scattering. See src/vec/vscat/examples/runex2_5 and runex3_5 */

2138:     PetscInt    *mem,**optr;
2139:     MPI_Win     swin;
2140:     PetscMPIInt msize;

2142:     MPI_Comm_size(mscomm,&msize);
2143:     MPI_Comm_rank(mscomm,&mrank);

2145:     PetscMalloc1(msize,&optr);
2146:     MPIU_Win_allocate_shared((nx+1)*sizeof(PetscInt),sizeof(PetscInt),MPI_INFO_NULL,mscomm,&mem,&swin);

2148:     /* Write local nx and inidx into mem */
2149:     mem[0] = nx;
2150:     for (i=1; i<=nx; i++) mem[i] = inidx[i-1];
2151:     MPI_Barrier(mscomm); /* sync shared memory */

2153:     if (!mrank) {
2154:       PetscBT        table;
2155:       /* sz and dsp_unit are not used. Replacing them with NULL would cause MPI_Win_shared_query() crash */
2156:       MPI_Aint       sz;
2157:       PetscMPIInt    dsp_unit;
2158:       PetscInt       N = xin->map->N;

2160:       PetscBTCreate(N,&table); /* may not be scalable */
2161:       PetscBTMemzero(N,table);

2163:       jj = 0;
2164:       while (jj<msize && !ctx->is_duplicate) {
2165:         if (jj == mrank) {jj++; continue;}
2166:         MPIU_Win_shared_query(swin,jj,&sz,&dsp_unit,&optr[jj]);
2167:         for (j=1; j<=optr[jj][0]; j++) { /* optr[jj][0] = nx */
2168:           if (PetscBTLookupSet(table,optr[jj][j])) {
2169:             ctx->is_duplicate = PETSC_TRUE; /* only mrank=0 has this info., will be checked in VecScatterEndMPI3Node() */
2170:             break;
2171:           }
2172:         }
2173:         jj++;
2174:       }
2175:       PetscBTDestroy(&table);
2176:     }

2178:     if (swin != MPI_WIN_NULL) {MPI_Win_free(&swin);}
2179:     PetscFree(optr);
2180:   }

2182:   owners = xin->map->range;
2183:   VecGetSize(yin,&lengthy);
2184:   VecGetSize(xin,&lengthx);

2186:   /*  first count number of contributors to each processor */
2187:   PetscMalloc2(size,&nprocs,nx,&owner);
2188:   PetscArrayzero(nprocs,size);

2190:   j      = 0;
2191:   nsends = 0;
2192:   for (i=0; i<nx; i++) {
2193:     PetscIntMultError(bs,inidx[i],&idx);
2194:     if (idx < owners[j]) j = 0;
2195:     for (; j<size; j++) {
2196:       if (idx < owners[j+1]) {
2197:         if (!nprocs[j]++) nsends++;
2198:         owner[i] = j;
2199:         break;
2200:       }
2201:     }
2202:     if (j == size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"ith %D block entry %D not owned by any process, upper bound %D",i,idx,owners[size]);
2203:   }

2205:   nprocslocal  = nprocs[rank];
2206:   nprocs[rank] = 0;
2207:   if (nprocslocal) nsends--;
2208:   /* inform other processors of number of messages and max length*/
2209:   PetscGatherNumberOfMessages(comm,NULL,nprocs,&nrecvs);
2210:   PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
2211:   PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
2212:   recvtotal = 0;
2213:   for (i=0; i<nrecvs; i++) {
2214:     PetscIntSumError(recvtotal,olengths1[i],&recvtotal);
2215:   }

2217:   /* post receives:   */
2218:   PetscMalloc3(recvtotal,&rvalues,nrecvs,&source,nrecvs,&recv_waits);
2219:   count = 0;
2220:   for (i=0; i<nrecvs; i++) {
2221:     MPI_Irecv((rvalues+count),olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
2222:     count += olengths1[i];
2223:   }

2225:   /* do sends:
2226:      1) starts[i] gives the starting index in svalues for stuff going to
2227:      the ith processor
2228:   */
2229:   nxr = 0;
2230:   for (i=0; i<nx; i++) {
2231:     if (owner[i] != rank) nxr++;
2232:   }
2233:   PetscMalloc3(nxr,&svalues,nsends,&send_waits,size+1,&starts);

2235:   starts[0]  = 0;
2236:   for (i=1; i<size; i++) starts[i] = starts[i-1] + nprocs[i-1];
2237:   for (i=0; i<nx; i++) {
2238:     if (owner[i] != rank) svalues[starts[owner[i]]++] = bs*inidx[i];
2239:   }
2240:   starts[0] = 0;
2241:   for (i=1; i<size+1; i++) starts[i] = starts[i-1] + nprocs[i-1];
2242:   count = 0;
2243:   for (i=0; i<size; i++) {
2244:     if (nprocs[i]) {
2245:       MPI_Isend(svalues+starts[i],nprocs[i],MPIU_INT,i,tag,comm,send_waits+count++);
2246:     }
2247:   }

2249:   /*  wait on receives; this is everyone who we need to deliver data to */
2250:   count = nrecvs;
2251:   slen  = 0;
2252:   nrecvshared = 0;
2253:   slenshared  = 0;
2254:   while (count) {
2255:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2256:     /* unpack receives into our local space */
2257:     MPI_Get_count(&recv_status,MPIU_INT,&n);
2258:     PetscShmCommGlobalToLocal(scomm,onodes1[imdex],&jj);
2259:     if (jj > -1) {
2260:       PetscInfo3(NULL,"[%d] Sending values to shared memory partner %d,global rank %d\n",rank,jj,onodes1[imdex]);
2261:       nrecvshared++;
2262:       slenshared += n;
2263:     }
2264:     slen += n;
2265:     count--;
2266:   }

2268:   if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not expected %D",slen,recvtotal);

2270:   /* allocate entire send scatter context */
2271:   PetscNewLog(ctx,&to);
2272:   to->sharedwin = MPI_WIN_NULL;
2273:   to->n         = nrecvs-nrecvshared;

2275:   PetscMalloc1(nrecvs-nrecvshared,&to->requests);
2276:   PetscMalloc4(bs*(slen-slenshared),&to->values,slen-slenshared,&to->indices,nrecvs-nrecvshared+1,&to->starts,nrecvs-nrecvshared,&to->procs);
2277:   PetscMalloc2(PetscMax(to->n,nsends),&to->sstatus,PetscMax(to->n,nsends),&to->rstatus);

2279:   MPI_Comm_size(mscomm,&to->msize);
2280:   PetscMalloc1(slenshared,&to->sharedspaceindices);
2281:   PetscCalloc1(to->msize+1,&to->sharedspacestarts);

2283:   ctx->todata              = (void*)to;
2284:   to->starts[0]            = 0;
2285:   to->sharedspacestarts[0] = 0;

2287:   /* Allocate shared memory space for shared memory partner communication */
2288:   MPIU_Win_allocate_shared(to->msize*sizeof(PetscInt),sizeof(PetscInt),info,mscomm,&sharedspaceoffset,&sharedoffsetwin);
2289:   for (i=0; i<to->msize; i++) sharedspaceoffset[i] = -1; /* mark with -1 for later error checking */
2290:   if (nrecvs) {
2291:     /* move the data into the send scatter */
2292:     base     = owners[rank];
2293:     rsvalues = rvalues;
2294:     to->n    = 0;
2295:     for (i=0; i<nrecvs; i++) {
2296:       values = rsvalues;
2297:       PetscShmCommGlobalToLocal(scomm,(PetscMPIInt)onodes1[i],&jj);
2298:       if (jj > -1) {
2299:         to->sharedspacestarts[jj]   = to->sharedcnt;
2300:         to->sharedspacestarts[jj+1] = to->sharedcnt + olengths1[i];
2301:         for (j=0; j<olengths1[i]; j++) to->sharedspaceindices[to->sharedcnt + j] = values[j] - base;
2302:         sharedspaceoffset[jj]      = to->sharedcnt;
2303:         to->sharedcnt             += olengths1[i];
2304:         PetscInfo4(NULL,"[%d] Sending %d values to shared memory partner %d,global rank %d\n",rank,olengths1[i],jj,onodes1[i]);
2305:       } else {
2306:         to->starts[to->n+1] = to->starts[to->n] + olengths1[i];
2307:         to->procs[to->n]    = onodes1[i];
2308:         for (j=0; j<olengths1[i]; j++) to->indices[to->starts[to->n] + j] = values[j] - base;
2309:         to->n++;
2310:       }
2311:       rsvalues += olengths1[i];
2312:     }
2313:   }
2314:   PetscFree(olengths1);
2315:   PetscFree(onodes1);
2316:   PetscFree3(rvalues,source,recv_waits);

2318:   if (mpi3node) {
2319:     /* to->sharedspace is used only as a flag */
2320:     i = 0;
2321:     if (to->sharedcnt) i = 1;
2322:     MPIU_Win_allocate_shared(i*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&to->sharedspace,&to->sharedwin);
2323:   } else { /* mpi3 */
2324:     MPIU_Win_allocate_shared(bs*to->sharedcnt*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&to->sharedspace,&to->sharedwin);
2325:   }
2326:   if (to->sharedwin == MPI_WIN_NULL) SETERRQ(PETSC_COMM_SELF,100,"what the");
2327:   MPI_Info_free(&info);

2329:   /* allocate entire receive scatter context */
2330:   PetscNewLog(ctx,&from);
2331:   from->sharedwin       = MPI_WIN_NULL;
2332:   from->msize           = to->msize;
2333:   /* we don't actually know the correct value for from->n at this point so we use the upper bound */
2334:   from->n = nsends;

2336:   PetscMalloc1(nsends,&from->requests);
2337:   PetscMalloc4((ny-nprocslocal)*bs,&from->values,ny-nprocslocal,&from->indices,nsends+1,&from->starts,from->n,&from->procs);
2338:   ctx->fromdata = (void*)from;

2340:   PetscCalloc1(to->msize+1,&from->sharedspacestarts);
2341:   /* move data into receive scatter */
2342:   PetscMalloc2(size,&lowner,nsends+1,&start);
2343:   PetscMalloc2(size,&lsharedowner,to->msize+1,&sharedstart);
2344:   count = 0; from->starts[0] = start[0] = 0;
2345:   from->sharedcnt = 0;
2346:   for (i=0; i<size; i++) {
2347:     lsharedowner[i] = -1;
2348:     lowner[i]       = -1;
2349:     if (nprocs[i]) {
2350:       PetscShmCommGlobalToLocal(scomm,i,&jj);
2351:       if (jj > -1) {
2352:         from->sharedspacestarts[jj] = sharedstart[jj] = from->sharedcnt;
2353:         from->sharedspacestarts[jj+1] = sharedstart[jj+1] = from->sharedspacestarts[jj] + nprocs[i];
2354:         from->sharedcnt += nprocs[i];
2355:         lsharedowner[i] = jj;
2356:       } else {
2357:         lowner[i]            = count++;
2358:         from->procs[count-1]  = i;
2359:         from->starts[count] = start[count] = start[count-1] + nprocs[i];
2360:       }
2361:     }
2362:   }
2363:   from->n = count;

2365:   for (i=0; i<nx; i++) {
2366:     if (owner[i] != rank && lowner[owner[i]] > -1) {
2367:       from->indices[start[lowner[owner[i]]]++] = bs*inidy[i];
2368:       if (bs*inidy[i] >= lengthy) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2369:     }
2370:   }

2372:   /* copy over appropriate parts of inidy[] into from->sharedspaceindices */
2373:   if (mpi3node) {
2374:     /* in addition, copy over appropriate parts of inidx[] into 2nd part of from->sharedspaceindices */
2375:     PetscInt *sharedspaceindicesx;
2376:     PetscMalloc1(2*from->sharedcnt,&from->sharedspaceindices);
2377:     sharedspaceindicesx = from->sharedspaceindices + from->sharedcnt;
2378:     for (i=0; i<nx; i++) {
2379:       if (owner[i] != rank && lsharedowner[owner[i]] > -1) {
2380:         if (sharedstart[lsharedowner[owner[i]]] > from->sharedcnt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"won");

2382:         from->sharedspaceindices[sharedstart[lsharedowner[owner[i]]]] = bs*inidy[i];
2383:         sharedspaceindicesx[sharedstart[lsharedowner[owner[i]]]] = bs*inidx[i]-owners[owner[i]]; /* local inidx */
2384:         sharedstart[lsharedowner[owner[i]]]++;
2385:       }
2386:     }
2387:   } else { /* mpi3 */
2388:     PetscMalloc1(from->sharedcnt,&from->sharedspaceindices);
2389:     for (i=0; i<nx; i++) {
2390:       if (owner[i] != rank && lsharedowner[owner[i]] > -1) {
2391:         if (sharedstart[lsharedowner[owner[i]]] > from->sharedcnt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"won");
2392:         from->sharedspaceindices[sharedstart[lsharedowner[owner[i]]]++] = bs*inidy[i];
2393:       }
2394:     }
2395:   }

2397:   PetscFree2(lowner,start);
2398:   PetscFree2(lsharedowner,sharedstart);
2399:   PetscFree2(nprocs,owner);

2401:   /* wait on sends */
2402:   if (nsends) {
2403:     PetscMalloc1(nsends,&send_status);
2404:     MPI_Waitall(nsends,send_waits,send_status);
2405:     PetscFree(send_status);
2406:   }
2407:   PetscFree3(svalues,send_waits,starts);

2409:   if (nprocslocal) {
2410:     PetscInt nt = from->local.n = to->local.n = nprocslocal;
2411:     /* we have a scatter to ourselves */
2412:     PetscMalloc1(nt,&to->local.vslots);
2413:     PetscMalloc1(nt,&from->local.vslots);
2414:     nt   = 0;
2415:     for (i=0; i<nx; i++) {
2416:       idx = bs*inidx[i];
2417:       if (idx >= owners[rank] && idx < owners[rank+1]) {
2418:         to->local.vslots[nt]     = idx - owners[rank];
2419:         from->local.vslots[nt++] = bs*inidy[i];
2420:         if (bs*inidy[i] >= lengthy) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2421:       }
2422:     }
2423:     PetscLogObjectMemory((PetscObject)ctx,2*nt*sizeof(PetscInt));
2424:   } else {
2425:     from->local.n      = 0;
2426:     from->local.vslots = NULL;
2427:     to->local.n        = 0;
2428:     to->local.vslots   = NULL;
2429:   }

2431:   /* Get the shared memory address for all processes we will be copying data from */
2432:   PetscCalloc1(to->msize,&from->sharedspaces);
2433:   PetscCalloc1(to->msize,&from->sharedspacesoffset);
2434:   MPI_Comm_rank(mscomm,&mrank);
2435:   for (jj=0; jj<to->msize; jj++) {
2436:     MPI_Aint    isize;
2437:     PetscMPIInt disp_unit;
2438:     PetscInt    *ptr;
2439:     MPIU_Win_shared_query(to->sharedwin,jj,&isize,&disp_unit,&from->sharedspaces[jj]);
2440:     MPIU_Win_shared_query(sharedoffsetwin,jj,&isize,&disp_unit,&ptr);
2441:     from->sharedspacesoffset[jj] = ptr[mrank];
2442:   }
2443:   MPI_Win_free(&sharedoffsetwin);

2445:   if (mpi3node && !from->sharedspace) {
2446:     /* comput from->notdone to be used by VecScatterEndMPI3Node() */
2447:     PetscInt notdone = 0;
2448:     for (i=0; i<from->msize; i++) {
2449:       if (from->sharedspacesoffset && from->sharedspacesoffset[i] > -1) {
2450:         notdone++;
2451:       }
2452:     }
2453:     from->notdone = notdone;
2454:   }

2456:   from->local.nonmatching_computed = PETSC_FALSE;
2457:   from->local.n_nonmatching        = 0;
2458:   from->local.slots_nonmatching    = NULL;
2459:   to->local.nonmatching_computed   = PETSC_FALSE;
2460:   to->local.n_nonmatching          = 0;
2461:   to->local.slots_nonmatching      = NULL;

2463:   from->format = VEC_SCATTER_MPI_GENERAL;
2464:   to->format   = VEC_SCATTER_MPI_GENERAL;
2465:   from->bs     = bs;
2466:   to->bs       = bs;

2468:   VecScatterCreateCommon_PtoS_MPI3(from,to,ctx);
2469:   return(0);
2470: }

2472: /*
2473:    bs indicates how many elements there are in each block. Normally this would be 1.
2474: */
2475: PetscErrorCode VecScatterCreateCommon_PtoS_MPI3(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
2476: {
2477:   MPI_Comm       comm;
2478:   PetscMPIInt    tag  = ((PetscObject)ctx)->tag, tagr;
2479:   PetscInt       bs   = to->bs;
2480:   PetscMPIInt    size;
2481:   PetscInt       i, n;
2483:   VecScatterType type;
2484:   PetscBool      mpi3node;

2487:   PetscObjectGetComm((PetscObject)ctx,&comm);
2488:   PetscObjectGetNewTag((PetscObject)ctx,&tagr);
2489:   ctx->ops->destroy = VecScatterDestroy_PtoP_MPI3;

2491:   MPI_Comm_size(comm,&size);
2492:   /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
2493:   to->contiq = PETSC_FALSE;
2494:   n = from->starts[from->n];
2495:   from->contiq = PETSC_TRUE;
2496:   for (i=1; i<n; i++) {
2497:     if (from->indices[i] != from->indices[i-1] + bs) {
2498:       from->contiq = PETSC_FALSE;
2499:       break;
2500:     }
2501:   }

2503:   ctx->ops->copy      = VecScatterCopy_PtoP_AllToAll;
2504:   {
2505:     PetscInt    *sstarts  = to->starts,  *rstarts = from->starts;
2506:     PetscMPIInt *sprocs   = to->procs,   *rprocs  = from->procs;
2507:     MPI_Request *swaits   = to->requests,*rwaits  = from->requests;
2508:     MPI_Request *rev_swaits,*rev_rwaits;
2509:     PetscScalar *Ssvalues = to->values, *Srvalues = from->values;

2511:     /* allocate additional wait variables for the "reverse" scatter */
2512:     PetscMalloc1(to->n,&rev_rwaits);
2513:     PetscMalloc1(from->n,&rev_swaits);
2514:     to->rev_requests   = rev_rwaits;
2515:     from->rev_requests = rev_swaits;

2517:     for (i=0; i<from->n; i++) {
2518:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
2519:     }

2521:     for (i=0; i<to->n; i++) {
2522:       MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
2523:     }
2524:     /* Register receives for scatter and reverse */
2525:     for (i=0; i<from->n; i++) {
2526:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
2527:     }
2528:     for (i=0; i<to->n; i++) {
2529:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tagr,comm,rev_rwaits+i);
2530:     }
2531:     ctx->ops->copy = VecScatterCopy_PtoP_X;
2532:   }
2533:   PetscInfo1(ctx,"Using blocksize %D scatter\n",bs);

2535:   if (PetscDefined(USE_DEBUG)) {
2536:     MPIU_Allreduce(&bs,&i,1,MPIU_INT,MPI_MIN,PetscObjectComm((PetscObject)ctx));
2537:     MPIU_Allreduce(&bs,&n,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)ctx));
2538:     if (bs!=i || bs!=n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Blocks size %D != %D or %D",bs,i,n);
2539:   }

2541:   VecScatterGetType(ctx,&type);
2542:   PetscStrcmp(type,"mpi3node",&mpi3node);

2544:   if (mpi3node) {
2545:     switch (bs) {
2546:     case 12:
2547:       ctx->ops->begin = VecScatterBeginMPI3Node_12;
2548:       ctx->ops->end   = VecScatterEndMPI3Node_12;
2549:       break;
2550:     case 11:
2551:       ctx->ops->begin = VecScatterBeginMPI3Node_11;
2552:       ctx->ops->end   = VecScatterEndMPI3Node_11;
2553:       break;
2554:     case 10:
2555:       ctx->ops->begin = VecScatterBeginMPI3Node_10;
2556:       ctx->ops->end   = VecScatterEndMPI3Node_10;
2557:       break;
2558:     case 9:
2559:       ctx->ops->begin = VecScatterBeginMPI3Node_9;
2560:       ctx->ops->end   = VecScatterEndMPI3Node_9;
2561:       break;
2562:     case 8:
2563:       ctx->ops->begin = VecScatterBeginMPI3Node_8;
2564:       ctx->ops->end   = VecScatterEndMPI3Node_8;
2565:       break;
2566:     case 7:
2567:       ctx->ops->begin = VecScatterBeginMPI3Node_7;
2568:       ctx->ops->end   = VecScatterEndMPI3Node_7;
2569:       break;
2570:     case 6:
2571:       ctx->ops->begin = VecScatterBeginMPI3Node_6;
2572:       ctx->ops->end   = VecScatterEndMPI3Node_6;
2573:       break;
2574:     case 5:
2575:       ctx->ops->begin = VecScatterBeginMPI3Node_5;
2576:       ctx->ops->end   = VecScatterEndMPI3Node_5;
2577:       break;
2578:     case 4:
2579:       ctx->ops->begin = VecScatterBeginMPI3Node_4;
2580:       ctx->ops->end   = VecScatterEndMPI3Node_4;
2581:       break;
2582:     case 3:
2583:       ctx->ops->begin = VecScatterBeginMPI3Node_3;
2584:       ctx->ops->end   = VecScatterEndMPI3Node_3;
2585:       break;
2586:     case 2:
2587:       ctx->ops->begin = VecScatterBeginMPI3Node_2;
2588:       ctx->ops->end   = VecScatterEndMPI3Node_2;
2589:       break;
2590:     case 1:
2591:       ctx->ops->begin = VecScatterBeginMPI3Node_1;
2592:       ctx->ops->end   = VecScatterEndMPI3Node_1;
2593:       break;
2594:     default:
2595:       ctx->ops->begin = VecScatterBegin_bs;
2596:       ctx->ops->end   = VecScatterEnd_bs;
2597:     }
2598:   } else { /* !mpi3node */

2600:     switch (bs) {
2601:     case 12:
2602:       ctx->ops->begin = VecScatterBegin_12;
2603:       ctx->ops->end   = VecScatterEnd_12;
2604:       break;
2605:     case 11:
2606:       ctx->ops->begin = VecScatterBegin_11;
2607:       ctx->ops->end   = VecScatterEnd_11;
2608:       break;
2609:     case 10:
2610:       ctx->ops->begin = VecScatterBegin_10;
2611:       ctx->ops->end   = VecScatterEnd_10;
2612:       break;
2613:     case 9:
2614:       ctx->ops->begin = VecScatterBegin_9;
2615:       ctx->ops->end   = VecScatterEnd_9;
2616:       break;
2617:     case 8:
2618:       ctx->ops->begin = VecScatterBegin_8;
2619:       ctx->ops->end   = VecScatterEnd_8;
2620:       break;
2621:     case 7:
2622:       ctx->ops->begin = VecScatterBegin_7;
2623:       ctx->ops->end   = VecScatterEnd_7;
2624:       break;
2625:     case 6:
2626:       ctx->ops->begin = VecScatterBegin_6;
2627:       ctx->ops->end   = VecScatterEnd_6;
2628:       break;
2629:     case 5:
2630:       ctx->ops->begin = VecScatterBegin_5;
2631:       ctx->ops->end   = VecScatterEnd_5;
2632:       break;
2633:     case 4:
2634:       ctx->ops->begin = VecScatterBegin_4;
2635:       ctx->ops->end   = VecScatterEnd_4;
2636:       break;
2637:     case 3:
2638:       ctx->ops->begin = VecScatterBegin_3;
2639:       ctx->ops->end   = VecScatterEnd_3;
2640:       break;
2641:     case 2:
2642:       ctx->ops->begin = VecScatterBegin_2;
2643:       ctx->ops->end   = VecScatterEnd_2;
2644:       break;
2645:     case 1:
2646:       if (mpi3node) {
2647:         ctx->ops->begin = VecScatterBeginMPI3Node_1;
2648:         ctx->ops->end   = VecScatterEndMPI3Node_1;
2649:       } else {
2650:         ctx->ops->begin = VecScatterBegin_1;
2651:         ctx->ops->end   = VecScatterEnd_1;
2652:       }
2653:       break;
2654:     default:
2655:       ctx->ops->begin = VecScatterBegin_bs;
2656:       ctx->ops->end   = VecScatterEnd_bs;
2657:     }
2658:   }

2660:   ctx->ops->view = VecScatterView_MPI;

2662:   /* try to optimize PtoP vecscatter with memcpy's */
2663:   VecScatterMemcpyPlanCreate_PtoP(to,from);
2664:   return(0);
2665: }

2667: /* ------------------------------------------------------------------------------------*/
2668: /*
2669:          Scatter from local Seq vectors to a parallel vector.
2670:          Reverses the order of the arguments, calls VecScatterCreateLocal_PtoS() then
2671:          reverses the result.
2672: */
2673: PetscErrorCode VecScatterCreateLocal_StoP_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2674: {
2675:   PetscErrorCode         ierr;
2676:   MPI_Request            *waits;
2677:   VecScatter_MPI_General *to,*from;

2680:   VecScatterCreateLocal_PtoS_MPI3(ny,inidy,nx,inidx,yin,xin,bs,ctx);
2681:   to            = (VecScatter_MPI_General*)ctx->fromdata;
2682:   from          = (VecScatter_MPI_General*)ctx->todata;
2683:   ctx->todata   = (void*)to;
2684:   ctx->fromdata = (void*)from;
2685:   /* these two are special, they are ALWAYS stored in to struct */
2686:   to->sstatus   = from->sstatus;
2687:   to->rstatus   = from->rstatus;

2689:   from->sstatus = NULL;
2690:   from->rstatus = NULL;
2691:   waits              = from->rev_requests;
2692:   from->rev_requests = from->requests;
2693:   from->requests     = waits;
2694:   waits              = to->rev_requests;
2695:   to->rev_requests   = to->requests;
2696:   to->requests       = waits;
2697:   return(0);
2698: }

2700: /* ---------------------------------------------------------------------------------*/
2701: PetscErrorCode VecScatterCreateLocal_PtoP_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2702: {
2704:   PetscMPIInt    size,rank,tag,imdex,n;
2705:   PetscInt       *owners = xin->map->range;
2706:   PetscMPIInt    *nprocs = NULL;
2707:   PetscInt       i,j,idx,nsends,*local_inidx = NULL,*local_inidy = NULL;
2708:   PetscMPIInt    *owner   = NULL;
2709:   PetscInt       *starts  = NULL,count,slen;
2710:   PetscInt       *rvalues = NULL,*svalues = NULL,base,*values = NULL,*rsvalues,recvtotal,lastidx;
2711:   PetscMPIInt    *onodes1,*olengths1,nrecvs;
2712:   MPI_Comm       comm;
2713:   MPI_Request    *send_waits = NULL,*recv_waits = NULL;
2714:   MPI_Status     recv_status,*send_status = NULL;
2715:   PetscBool      duplicate = PETSC_FALSE;
2716:   PetscBool      found = PETSC_FALSE;

2719:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2720:   PetscObjectGetComm((PetscObject)xin,&comm);
2721:   MPI_Comm_size(comm,&size);
2722:   MPI_Comm_rank(comm,&rank);
2723:   if (size == 1) {
2724:     VecScatterCreateLocal_StoP_MPI3(nx,inidx,ny,inidy,xin,yin,bs,ctx);
2725:     return(0);
2726:   }

2728:   /*
2729:      Each processor ships off its inidx[j] and inidy[j] to the appropriate processor
2730:      They then call the StoPScatterCreate()
2731:   */
2732:   /*  first count number of contributors to each processor */
2733:   PetscMalloc3(size,&nprocs,nx,&owner,(size+1),&starts);
2734:   PetscArrayzero(nprocs,size);

2736:   lastidx = -1;
2737:   j       = 0;
2738:   for (i=0; i<nx; i++) {
2739:     /* if indices are NOT locally sorted, need to start search at the beginning */
2740:     if (lastidx > (idx = bs*inidx[i])) j = 0;
2741:     lastidx = idx;
2742:     for (; j<size; j++) {
2743:       if (idx >= owners[j] && idx < owners[j+1]) {
2744:         nprocs[j]++;
2745:         owner[i] = j;
2746:         found = PETSC_TRUE;
2747:         break;
2748:       }
2749:     }
2750:     if (PetscUnlikelyDebug(!found)) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
2751:     found = PETSC_FALSE;
2752:   }
2753:   nsends = 0;
2754:   for (i=0; i<size; i++) nsends += (nprocs[i] > 0);

2756:   /* inform other processors of number of messages and max length*/
2757:   PetscGatherNumberOfMessages(comm,NULL,nprocs,&nrecvs);
2758:   PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
2759:   PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
2760:   recvtotal = 0;
2761:   for (i=0; i<nrecvs; i++) {
2762:     PetscIntSumError(recvtotal,olengths1[i],&recvtotal);
2763:   }

2765:   /* post receives:   */
2766:   PetscMalloc5(2*recvtotal,&rvalues,2*nx,&svalues,nrecvs,&recv_waits,nsends,&send_waits,nsends,&send_status);

2768:   count = 0;
2769:   for (i=0; i<nrecvs; i++) {
2770:     MPI_Irecv((rvalues+2*count),2*olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
2771:     PetscIntSumError(count,olengths1[i],&count);
2772:   }
2773:   PetscFree(onodes1);

2775:   /* do sends:
2776:       1) starts[i] gives the starting index in svalues for stuff going to
2777:          the ith processor
2778:   */
2779:   starts[0]= 0;
2780:   for (i=1; i<size; i++) starts[i] = starts[i-1] + nprocs[i-1];
2781:   for (i=0; i<nx; i++) {
2782:     svalues[2*starts[owner[i]]]       = bs*inidx[i];
2783:     svalues[1 + 2*starts[owner[i]]++] = bs*inidy[i];
2784:   }

2786:   starts[0] = 0;
2787:   for (i=1; i<size+1; i++) starts[i] = starts[i-1] + nprocs[i-1];
2788:   count = 0;
2789:   for (i=0; i<size; i++) {
2790:     if (nprocs[i]) {
2791:       MPI_Isend(svalues+2*starts[i],2*nprocs[i],MPIU_INT,i,tag,comm,send_waits+count);
2792:       count++;
2793:     }
2794:   }
2795:   PetscFree3(nprocs,owner,starts);

2797:   /*  wait on receives */
2798:   count = nrecvs;
2799:   slen  = 0;
2800:   while (count) {
2801:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2802:     /* unpack receives into our local space */
2803:     MPI_Get_count(&recv_status,MPIU_INT,&n);
2804:     slen += n/2;
2805:     count--;
2806:   }
2807:   if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not as expected %D",slen,recvtotal);

2809:   PetscMalloc2(slen,&local_inidx,slen,&local_inidy);
2810:   base     = owners[rank];
2811:   count    = 0;
2812:   rsvalues = rvalues;
2813:   for (i=0; i<nrecvs; i++) {
2814:     values    = rsvalues;
2815:     rsvalues += 2*olengths1[i];
2816:     for (j=0; j<olengths1[i]; j++) {
2817:       local_inidx[count]   = values[2*j] - base;
2818:       local_inidy[count++] = values[2*j+1];
2819:     }
2820:   }
2821:   PetscFree(olengths1);

2823:   /* wait on sends */
2824:   if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
2825:   PetscFree5(rvalues,svalues,recv_waits,send_waits,send_status);

2827:   /*
2828:      should sort and remove duplicates from local_inidx,local_inidy
2829:   */
2830: #if defined(do_it_slow)
2831:   /* sort on the from index */
2832:   PetscSortIntWithArray(slen,local_inidx,local_inidy);
2833:   start = 0;
2834:   while (start < slen) {
2835:     count = start+1;
2836:     last  = local_inidx[start];
2837:     while (count < slen && last == local_inidx[count]) count++;
2838:     if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2839:       /* sort on to index */
2840:       PetscSortInt(count-start,local_inidy+start);
2841:     }
2842:     /* remove duplicates; not most efficient way, but probably good enough */
2843:     i = start;
2844:     while (i < count-1) {
2845:       if (local_inidy[i] != local_inidy[i+1]) i++;
2846:       else { /* found a duplicate */
2847:         duplicate = PETSC_TRUE;
2848:         for (j=i; j<slen-1; j++) {
2849:           local_inidx[j] = local_inidx[j+1];
2850:           local_inidy[j] = local_inidy[j+1];
2851:         }
2852:         slen--;
2853:         count--;
2854:       }
2855:     }
2856:     start = count;
2857:   }
2858: #endif
2859:   if (duplicate) {
2860:     PetscInfo(ctx,"Duplicate from to indices passed in VecScatterCreate(), they are ignored\n");
2861:   }
2862:   VecScatterCreateLocal_StoP_MPI3(slen,local_inidx,slen,local_inidy,xin,yin,bs,ctx);
2863:   PetscFree2(local_inidx,local_inidy);
2864:   return(0);
2865: }

2867: PetscErrorCode VecScatterSetUp_MPI3(VecScatter ctx)
2868: {

2872:   VecScatterSetUp_vectype_private(ctx,VecScatterCreateLocal_PtoS_MPI3,VecScatterCreateLocal_StoP_MPI3,VecScatterCreateLocal_PtoP_MPI3);
2873:   return(0);
2874: }

2876: PetscErrorCode VecScatterCreate_MPI3(VecScatter ctx)
2877: {
2878:   PetscErrorCode    ierr;

2881:   ctx->ops->setup = VecScatterSetUp_MPI3;
2882:   PetscObjectChangeTypeName((PetscObject)ctx,VECSCATTERMPI3);
2883:   PetscInfo(ctx,"Using MPI3 for vector scatter\n");
2884:   return(0);
2885: }

2887: PetscErrorCode VecScatterCreate_MPI3Node(VecScatter ctx)
2888: {
2889:   PetscErrorCode    ierr;

2892:   ctx->ops->setup = VecScatterSetUp_MPI3;
2893:   PetscObjectChangeTypeName((PetscObject)ctx,VECSCATTERMPI3NODE);
2894:   PetscInfo(ctx,"Using MPI3NODE for vector scatter\n");
2895:   return(0);
2896: }