Actual source code: vpscat.c

petsc-3.10.5 2019-03-28
Report Typos and Errors
  1: /*
  2:     Defines parallel vector scatters.
  3: */

  5:  #include <petsc/private/vecscatterimpl.h>

  7: #if defined(PETSC_HAVE_MPI_WIN_CREATE_FEATURE)

  9: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
 10: {
 11:   VecScatter_MPI_General *to  =(VecScatter_MPI_General*)ctx->todata;
 12:   VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
 13:   PetscErrorCode         ierr;
 14:   PetscInt               i;
 15:   PetscMPIInt            rank;
 16:   PetscViewerFormat      format;
 17:   PetscBool              iascii;

 20:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
 21:   if (iascii) {
 22:     MPI_Comm_rank(PetscObjectComm((PetscObject)ctx),&rank);
 23:     PetscViewerGetFormat(viewer,&format);
 24:     if (format ==  PETSC_VIEWER_ASCII_INFO) {
 25:       PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;

 27:       MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 28:       MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 29:       itmp = to->starts[to->n+1];
 30:       MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 31:       itmp = from->starts[from->n+1];
 32:       MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 33:       MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,PetscObjectComm((PetscObject)ctx));

 35:       PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
 36:       PetscViewerASCIIPrintf(viewer,"  Blocksize %D\n",to->bs);
 37:       PetscViewerASCIIPrintf(viewer,"  Maximum number sends %D\n",nsend_max);
 38:       PetscViewerASCIIPrintf(viewer,"  Maximum number receives %D\n",nrecv_max);
 39:       PetscViewerASCIIPrintf(viewer,"  Maximum data sent %D\n",(int)(lensend_max*to->bs*sizeof(PetscScalar)));
 40:       PetscViewerASCIIPrintf(viewer,"  Maximum data received %D\n",(int)(lenrecv_max*to->bs*sizeof(PetscScalar)));
 41:       PetscViewerASCIIPrintf(viewer,"  Total data sent %D\n",(int)(alldata*to->bs*sizeof(PetscScalar)));
 42:     } else {
 43:       PetscViewerASCIIPrintf(viewer,"  VecScatter Blocksize %D\n",to->bs);
 44:       PetscViewerASCIIPushSynchronized(viewer);
 45:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
 46:       if (to->n) {
 47:         for (i=0; i<to->n; i++) {
 48:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]   %D length = %D to whom %d\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
 49:         }
 50:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Now the indices for all remote sends (in order by process sent to)\n",rank);
 51:         for (i=0; i<to->starts[to->n]; i++) {
 52:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D \n",rank,to->indices[i]);
 53:         }
 54:       }

 56:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
 57:       if (from->n) {
 58:         for (i=0; i<from->n; i++) {
 59:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %d\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
 60:         }

 62:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Now the indices for all remote receives (in order by process received from)\n",rank);
 63:         for (i=0; i<from->starts[from->n]; i++) {
 64:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D \n",rank,from->indices[i]);
 65:         }
 66:       }
 67:       if (to->local.n) {
 68:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Indices for local part of scatter\n",rank);
 69:         for (i=0; i<to->local.n; i++) {  /* the to and from have the opposite meaning from what you would expect */
 70:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] From %D to %D \n",rank,to->local.vslots[i],from->local.vslots[i]);
 71:         }
 72:       }

 74:      for (i=0; i<to->msize; i++) {
 75:        if (to->sharedspacestarts[i+1] > to->sharedspacestarts[i]) {
 76:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Via shared memory to local memory partner %d count %d\n",rank,i,to->sharedspacestarts[i+1]-to->sharedspacestarts[i]);
 77:         }
 78:       }
 79:      for (i=0; i<from->msize; i++) {
 80:        if (from->sharedspacestarts[i+1] > from->sharedspacestarts[i]) {
 81:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Via shared memory from local memory partner %d count %d\n",rank,i,from->sharedspacestarts[i+1]-from->sharedspacestarts[i]);
 82:         }
 83:       }

 85:       PetscViewerFlush(viewer);
 86:       PetscViewerASCIIPopSynchronized(viewer);

 88:       PetscViewerASCIIPrintf(viewer,"Method used to implement the VecScatter: ");
 89:     }
 90:   }
 91:   return(0);
 92: }

 94: /* -----------------------------------------------------------------------------------*/
 95: /*
 96:       The next routine determines what part of  the local part of the scatter is an
 97:   exact copy of values into their current location. We check this here and
 98:   then know that we need not perform that portion of the scatter when the vector is
 99:   scattering to itself with INSERT_VALUES.

101:      This is currently not used but would speed up, for example DMLocalToLocalBegin/End()

103: */
104: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from)
105: {
106:   PetscInt       n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
108:   PetscInt       *nto_slots,*nfrom_slots,j = 0;

111:   for (i=0; i<n; i++) {
112:     if (to_slots[i] != from_slots[i]) n_nonmatching++;
113:   }

115:   if (!n_nonmatching) {
116:     to->nonmatching_computed = PETSC_TRUE;
117:     to->n_nonmatching        = from->n_nonmatching = 0;
118:     PetscInfo1(scatter,"Reduced %D to 0\n", n);
119:   } else if (n_nonmatching == n) {
120:     to->nonmatching_computed = PETSC_FALSE;
121:     PetscInfo(scatter,"All values non-matching\n");
122:   } else {
123:     to->nonmatching_computed= PETSC_TRUE;
124:     to->n_nonmatching       = from->n_nonmatching = n_nonmatching;

126:     PetscMalloc1(n_nonmatching,&nto_slots);
127:     PetscMalloc1(n_nonmatching,&nfrom_slots);

129:     to->slots_nonmatching   = nto_slots;
130:     from->slots_nonmatching = nfrom_slots;
131:     for (i=0; i<n; i++) {
132:       if (to_slots[i] != from_slots[i]) {
133:         nto_slots[j]   = to_slots[i];
134:         nfrom_slots[j] = from_slots[i];
135:         j++;
136:       }
137:     }
138:     PetscInfo2(scatter,"Reduced %D to %D\n",n,n_nonmatching);
139:   }
140:   return(0);
141: }

143: /* --------------------------------------------------------------------------------------*/

145: /* -------------------------------------------------------------------------------------*/
146: PetscErrorCode VecScatterDestroy_PtoP_MPI3(VecScatter ctx)
147: {
148:   VecScatter_MPI_General *to   = (VecScatter_MPI_General*)ctx->todata;
149:   VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
150:   PetscErrorCode         ierr;
151:   PetscInt               i;

154:   /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
155:   if (to->requests) {
156:     for (i=0; i<to->n; i++) {
157:       MPI_Request_free(to->requests + i);
158:     }
159:   }
160:   if (to->rev_requests) {
161:     for (i=0; i<to->n; i++) {
162:       MPI_Request_free(to->rev_requests + i);
163:     }
164:   }
165:   if (from->requests) {
166:     for (i=0; i<from->n; i++) {
167:       MPI_Request_free(from->requests + i);
168:     }
169:   }
170:   if (from->rev_requests) {
171:     for (i=0; i<from->n; i++) {
172:       MPI_Request_free(from->rev_requests + i);
173:     }
174:   }
175:   if (to->sharedwin != MPI_WIN_NULL) {MPI_Win_free(&to->sharedwin);}
176:   if (from->sharedwin != MPI_WIN_NULL) {MPI_Win_free(&from->sharedwin);}
177:   PetscFree(to->sharedspaces);
178:   PetscFree(to->sharedspacesoffset);
179:   PetscFree(to->sharedspaceindices);
180:   PetscFree(to->sharedspacestarts);

182:   PetscFree(from->sharedspaceindices);
183:   PetscFree(from->sharedspaces);
184:   PetscFree(from->sharedspacesoffset);
185:   PetscFree(from->sharedspacestarts);

187:   PetscFree(to->local.vslots);
188:   PetscFree(from->local.vslots);
189:   PetscFree(to->local.slots_nonmatching);
190:   PetscFree(from->local.slots_nonmatching);
191:   PetscFree(to->rev_requests);
192:   PetscFree(from->rev_requests);
193:   PetscFree(to->requests);
194:   PetscFree(from->requests);
195:   PetscFree4(to->values,to->indices,to->starts,to->procs);
196:   PetscFree2(to->sstatus,to->rstatus);
197:   PetscFree4(from->values,from->indices,from->starts,from->procs);
198:   VecScatterMemcpyPlanDestroy_PtoP(to,from);
199:   PetscFree(from);
200:   PetscFree(to);
201:   return(0);
202: }

204: /* --------------------------------------------------------------------------------------*/

206: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
207: {
208:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
209:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
210:   PetscErrorCode         ierr;
211:   PetscInt               ny,bs = in_from->bs,jj;
212:   PetscShmComm           scomm;
213:   MPI_Comm               mscomm;
214:   MPI_Info               info;

217:   out->ops->begin   = in->ops->begin;
218:   out->ops->end     = in->ops->end;
219:   out->ops->copy    = in->ops->copy;
220:   out->ops->destroy = in->ops->destroy;
221:   out->ops->view    = in->ops->view;

223:   /* allocate entire send scatter context */
224:   PetscNewLog(out,&out_to);
225:   out_to->sharedwin       = MPI_WIN_NULL;
226:   PetscNewLog(out,&out_from);
227:   out_from->sharedwin       = MPI_WIN_NULL;

229:   ny                = in_to->starts[in_to->n];
230:   out_to->n         = in_to->n;
231:   out_to->format    = in_to->format;

233:   PetscMalloc1(out_to->n,&out_to->requests);
234:   PetscMalloc4(bs*ny,&out_to->values,ny,&out_to->indices,out_to->n+1,&out_to->starts,out_to->n,&out_to->procs);
235:   PetscMalloc2(PetscMax(in_to->n,in_from->n),&out_to->sstatus,PetscMax(in_to->n,in_from->n),&out_to->rstatus);
236:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
237:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
238:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));

240:   out->todata                        = (void*)out_to;
241:   out_to->local.n                    = in_to->local.n;
242:   out_to->local.nonmatching_computed = PETSC_FALSE;
243:   out_to->local.n_nonmatching        = 0;
244:   out_to->local.slots_nonmatching    = 0;
245:   if (in_to->local.n) {
246:     PetscMalloc1(in_to->local.n,&out_to->local.vslots);
247:     PetscMalloc1(in_from->local.n,&out_from->local.vslots);
248:     PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
249:     PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
250:   } else {
251:     out_to->local.vslots   = 0;
252:     out_from->local.vslots = 0;
253:   }

255:   /* allocate entire receive context */
256:   VecScatterMemcpyPlanCopy(&in_to->local.memcpy_plan,&out_to->local.memcpy_plan);
257:   VecScatterMemcpyPlanCopy(&in_from->local.memcpy_plan,&out_from->local.memcpy_plan);
258:   VecScatterMemcpyPlanCopy(&in_to->memcpy_plan,&out_to->memcpy_plan);
259:   VecScatterMemcpyPlanCopy(&in_from->memcpy_plan,&out_from->memcpy_plan);

261:   out_from->format    = in_from->format;
262:   ny                  = in_from->starts[in_from->n];
263:   out_from->n         = in_from->n;

265:   PetscMalloc1(out_from->n,&out_from->requests);
266:   PetscMalloc4(ny*bs,&out_from->values,ny,&out_from->indices,out_from->n+1,&out_from->starts,out_from->n,&out_from->procs);
267:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
268:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
269:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));

271:   out->fromdata                        = (void*)out_from;
272:   out_from->local.n                    = in_from->local.n;
273:   out_from->local.nonmatching_computed = PETSC_FALSE;
274:   out_from->local.n_nonmatching        = 0;
275:   out_from->local.slots_nonmatching    = 0;

277:   /*
278:       set up the request arrays for use with isend_init() and irecv_init()
279:   */
280:   {
281:     PetscMPIInt tag;
282:     MPI_Comm    comm;
283:     PetscInt    *sstarts = out_to->starts,  *rstarts = out_from->starts;
284:     PetscMPIInt *sprocs  = out_to->procs,   *rprocs  = out_from->procs;
285:     PetscInt    i;
286:     MPI_Request *swaits   = out_to->requests,*rwaits  = out_from->requests;
287:     MPI_Request *rev_swaits,*rev_rwaits;
288:     PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;

290:     PetscMalloc1(in_to->n,&out_to->rev_requests);
291:     PetscMalloc1(in_from->n,&out_from->rev_requests);

293:     rev_rwaits = out_to->rev_requests;
294:     rev_swaits = out_from->rev_requests;

296:     out_from->bs = out_to->bs = bs;
297:     tag          = ((PetscObject)out)->tag;
298:     PetscObjectGetComm((PetscObject)out,&comm);

300:     /* Register the receives that you will use later (sends for scatter reverse) */
301:     for (i=0; i<out_from->n; i++) {
302:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
303:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
304:     }

306:     for (i=0; i<out_to->n; i++) {
307:       MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
308:     }

310:     /* Register receives for scatter reverse */
311:     for (i=0; i<out_to->n; i++) {
312:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
313:     }
314:   }

316:     /* since the to and from data structures are not symmetric for shared memory copies we insure they always listed in "standard" form */
317:   if (!in_to->sharedwin) {
318:     VecScatter_MPI_General *totmp = in_to,*out_totmp = out_to;
319:     in_to       = in_from;
320:     in_from     = totmp;
321:     out_to      = out_from;
322:     out_from    = out_totmp;
323:   }

325:   /* copy the to parts for the shared memory copies between processes */
326:   out_to->sharedcnt = in_to->sharedcnt;
327:   out_to->msize     = in_to->msize;
328:   PetscMalloc1(out_to->msize+1,&out_to->sharedspacestarts);
329:   PetscMemcpy(out_to->sharedspacestarts,in_to->sharedspacestarts,(out_to->msize+1)*sizeof(PetscInt));
330:   PetscMalloc1(out_to->sharedcnt,&out_to->sharedspaceindices);
331:   PetscMemcpy(out_to->sharedspaceindices,in_to->sharedspaceindices,out_to->sharedcnt*sizeof(PetscInt));

333:   PetscShmCommGet(PetscObjectComm((PetscObject)in),&scomm);
334:   PetscShmCommGetMpiShmComm(scomm,&mscomm);
335:   MPI_Info_create(&info);
336:   MPI_Info_set(info, "alloc_shared_noncontig", "true");
337:   MPIU_Win_allocate_shared(bs*out_to->sharedcnt*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&out_to->sharedspace,&out_to->sharedwin);
338:   MPI_Info_free(&info);

340:   /* copy the to parts for the shared memory copies between processes */
341:   out_from->sharedcnt = in_from->sharedcnt;
342:   out_from->msize     = in_from->msize;
343:   PetscMalloc1(out_from->msize+1,&out_from->sharedspacestarts);
344:   PetscMemcpy(out_from->sharedspacestarts,in_from->sharedspacestarts,(out_from->msize+1)*sizeof(PetscInt));
345:   PetscMalloc1(out_from->sharedcnt,&out_from->sharedspaceindices);
346:   PetscMemcpy(out_from->sharedspaceindices,in_from->sharedspaceindices,out_from->sharedcnt*sizeof(PetscInt));
347:   PetscMalloc1(out_from->msize,&out_from->sharedspacesoffset);
348:   PetscMemcpy(out_from->sharedspacesoffset,in_from->sharedspacesoffset,out_from->msize*sizeof(PetscInt));
349:   PetscCalloc1(out_from->msize,&out_from->sharedspaces);
350:   for (jj=0; jj<out_from->msize; jj++) {
351:     MPI_Aint    isize;
352:     PetscMPIInt disp_unit;
353:     MPIU_Win_shared_query(out_to->sharedwin,jj,&isize,&disp_unit,&out_from->sharedspaces[jj]);
354:   }
355:   return(0);
356: }

358: PetscErrorCode VecScatterCopy_PtoP_AllToAll(VecScatter in,VecScatter out)
359: {
360:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
361:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
362:   PetscErrorCode         ierr;
363:   PetscInt               ny,bs = in_from->bs;
364:   PetscMPIInt            size;

367:   MPI_Comm_size(PetscObjectComm((PetscObject)in),&size);

369:   out->ops->begin     = in->ops->begin;
370:   out->ops->end       = in->ops->end;
371:   out->ops->copy      = in->ops->copy;
372:   out->ops->destroy   = in->ops->destroy;
373:   out->ops->view      = in->ops->view;

375:   /* allocate entire send scatter context */
376:   PetscNewLog(out,&out_to);
377:   out_to->sharedwin       = MPI_WIN_NULL;
378:   PetscNewLog(out,&out_from);
379:   out_from->sharedwin       = MPI_WIN_NULL;

381:   ny                = in_to->starts[in_to->n];
382:   out_to->n         = in_to->n;
383:   out_to->format    = in_to->format;

385:   PetscMalloc1(out_to->n,&out_to->requests);
386:   PetscMalloc4(bs*ny,&out_to->values,ny,&out_to->indices,out_to->n+1,&out_to->starts,out_to->n,&out_to->procs);
387:   PetscMalloc2(PetscMax(in_to->n,in_from->n),&out_to->sstatus,PetscMax(in_to->n,in_from->n),&out_to->rstatus);
388:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
389:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
390:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));

392:   out->todata                        = (void*)out_to;
393:   out_to->local.n                    = in_to->local.n;
394:   out_to->local.nonmatching_computed = PETSC_FALSE;
395:   out_to->local.n_nonmatching        = 0;
396:   out_to->local.slots_nonmatching    = 0;
397:   if (in_to->local.n) {
398:     PetscMalloc1(in_to->local.n,&out_to->local.vslots);
399:     PetscMalloc1(in_from->local.n,&out_from->local.vslots);
400:     PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
401:     PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
402:   } else {
403:     out_to->local.vslots   = 0;
404:     out_from->local.vslots = 0;
405:   }

407:   /* allocate entire receive context */
408:   VecScatterMemcpyPlanCopy_PtoP(in_to,in_from,out_to,out_from);

410:   out_from->format    = in_from->format;
411:   ny                  = in_from->starts[in_from->n];
412:   out_from->n         = in_from->n;

414:   PetscMalloc1(out_from->n,&out_from->requests);
415:   PetscMalloc4(ny*bs,&out_from->values,ny,&out_from->indices,out_from->n+1,&out_from->starts,out_from->n,&out_from->procs);
416:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
417:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
418:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));

420:   out->fromdata                        = (void*)out_from;
421:   out_from->local.n                    = in_from->local.n;
422:   out_from->local.nonmatching_computed = PETSC_FALSE;
423:   out_from->local.n_nonmatching        = 0;
424:   out_from->local.slots_nonmatching    = 0;

426:   return(0);
427: }
428: /* --------------------------------------------------------------------------------------------------
429:     Packs and unpacks the message data into send or from receive buffers.

431:     These could be generated automatically.

433:     Fortran kernels etc. could be used.
434: */
435: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
436: {
437:   PetscInt i;
438:   for (i=0; i<n; i++) y[i] = x[indicesx[i]];
439: }

441: PETSC_STATIC_INLINE PetscErrorCode UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
442: {
443:   PetscInt i;

446:   switch (addv) {
447:   case INSERT_VALUES:
448:   case INSERT_ALL_VALUES:
449:     for (i=0; i<n; i++) y[indicesy[i]] = x[i];
450:     break;
451:   case ADD_VALUES:
452:   case ADD_ALL_VALUES:
453:     for (i=0; i<n; i++) y[indicesy[i]] += x[i];
454:     break;
455: #if !defined(PETSC_USE_COMPLEX)
456:   case MAX_VALUES:
457:     for (i=0; i<n; i++) y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
458: #else
459:   case MAX_VALUES:
460: #endif
461:   case NOT_SET_VALUES:
462:     break;
463:   default:
464:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
465:   }
466:   return(0);
467: }

469: PETSC_STATIC_INLINE PetscErrorCode Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
470: {
471:   PetscInt i;

474:   switch (addv) {
475:   case INSERT_VALUES:
476:   case INSERT_ALL_VALUES:
477:     for (i=0; i<n; i++) y[indicesy[i]] = x[indicesx[i]];
478:     break;
479:   case ADD_VALUES:
480:   case ADD_ALL_VALUES:
481:     for (i=0; i<n; i++) y[indicesy[i]] += x[indicesx[i]];
482:     break;
483: #if !defined(PETSC_USE_COMPLEX)
484:   case MAX_VALUES:
485:     for (i=0; i<n; i++) y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
486: #else
487:   case MAX_VALUES:
488: #endif
489:   case NOT_SET_VALUES:
490:     break;
491:   default:
492:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
493:   }
494:   return(0);
495: }

497: /* ----------------------------------------------------------------------------------------------- */
498: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
499: {
500:   PetscInt i,idx;

502:   for (i=0; i<n; i++) {
503:     idx  = *indicesx++;
504:     y[0] = x[idx];
505:     y[1] = x[idx+1];
506:     y   += 2;
507:   }
508: }

510: PETSC_STATIC_INLINE PetscErrorCode UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
511: {
512:   PetscInt i,idy;

515:   switch (addv) {
516:   case INSERT_VALUES:
517:   case INSERT_ALL_VALUES:
518:     for (i=0; i<n; i++) {
519:       idy      = *indicesy++;
520:       y[idy]   = x[0];
521:       y[idy+1] = x[1];
522:       x       += 2;
523:     }
524:     break;
525:   case ADD_VALUES:
526:   case ADD_ALL_VALUES:
527:     for (i=0; i<n; i++) {
528:       idy       = *indicesy++;
529:       y[idy]   += x[0];
530:       y[idy+1] += x[1];
531:       x        += 2;
532:     }
533:     break;
534: #if !defined(PETSC_USE_COMPLEX)
535:   case MAX_VALUES:
536:     for (i=0; i<n; i++) {
537:       idy      = *indicesy++;
538:       y[idy]   = PetscMax(y[idy],x[0]);
539:       y[idy+1] = PetscMax(y[idy+1],x[1]);
540:       x       += 2;
541:     }
542: #else
543:   case MAX_VALUES:
544: #endif
545:   case NOT_SET_VALUES:
546:     break;
547:   default:
548:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
549:   }
550:   return(0);
551: }

553: PETSC_STATIC_INLINE PetscErrorCode Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
554: {
555:   PetscInt i,idx,idy;

558:   switch (addv) {
559:   case INSERT_VALUES:
560:   case INSERT_ALL_VALUES:
561:     for (i=0; i<n; i++) {
562:       idx      = *indicesx++;
563:       idy      = *indicesy++;
564:       y[idy]   = x[idx];
565:       y[idy+1] = x[idx+1];
566:     }
567:     break;
568:   case ADD_VALUES:
569:   case ADD_ALL_VALUES:
570:     for (i=0; i<n; i++) {
571:       idx       = *indicesx++;
572:       idy       = *indicesy++;
573:       y[idy]   += x[idx];
574:       y[idy+1] += x[idx+1];
575:     }
576:     break;
577: #if !defined(PETSC_USE_COMPLEX)
578:   case MAX_VALUES:
579:     for (i=0; i<n; i++) {
580:       idx      = *indicesx++;
581:       idy      = *indicesy++;
582:       y[idy]   = PetscMax(y[idy],x[idx]);
583:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
584:     }
585: #else
586:   case MAX_VALUES:
587: #endif
588:   case NOT_SET_VALUES:
589:     break;
590:   default:
591:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
592:   }
593:   return(0);
594: }
595: /* ----------------------------------------------------------------------------------------------- */
596: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
597: {
598:   PetscInt i,idx;

600:   for (i=0; i<n; i++) {
601:     idx  = *indicesx++;
602:     y[0] = x[idx];
603:     y[1] = x[idx+1];
604:     y[2] = x[idx+2];
605:     y   += 3;
606:   }
607: }
608: PETSC_STATIC_INLINE PetscErrorCode UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
609: {
610:   PetscInt i,idy;

613:   switch (addv) {
614:   case INSERT_VALUES:
615:   case INSERT_ALL_VALUES:
616:     for (i=0; i<n; i++) {
617:       idy      = *indicesy++;
618:       y[idy]   = x[0];
619:       y[idy+1] = x[1];
620:       y[idy+2] = x[2];
621:       x       += 3;
622:     }
623:     break;
624:   case ADD_VALUES:
625:   case ADD_ALL_VALUES:
626:     for (i=0; i<n; i++) {
627:       idy       = *indicesy++;
628:       y[idy]   += x[0];
629:       y[idy+1] += x[1];
630:       y[idy+2] += x[2];
631:       x        += 3;
632:     }
633:     break;
634: #if !defined(PETSC_USE_COMPLEX)
635:   case MAX_VALUES:
636:     for (i=0; i<n; i++) {
637:       idy      = *indicesy++;
638:       y[idy]   = PetscMax(y[idy],x[0]);
639:       y[idy+1] = PetscMax(y[idy+1],x[1]);
640:       y[idy+2] = PetscMax(y[idy+2],x[2]);
641:       x       += 3;
642:     }
643: #else
644:   case MAX_VALUES:
645: #endif
646:   case NOT_SET_VALUES:
647:     break;
648:   default:
649:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
650:   }
651:   return(0);
652: }

654: PETSC_STATIC_INLINE PetscErrorCode Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
655: {
656:   PetscInt i,idx,idy;

659:   switch (addv) {
660:   case INSERT_VALUES:
661:   case INSERT_ALL_VALUES:
662:     for (i=0; i<n; i++) {
663:       idx      = *indicesx++;
664:       idy      = *indicesy++;
665:       y[idy]   = x[idx];
666:       y[idy+1] = x[idx+1];
667:       y[idy+2] = x[idx+2];
668:     }
669:     break;
670:   case ADD_VALUES:
671:   case ADD_ALL_VALUES:
672:     for (i=0; i<n; i++) {
673:       idx       = *indicesx++;
674:       idy       = *indicesy++;
675:       y[idy]   += x[idx];
676:       y[idy+1] += x[idx+1];
677:       y[idy+2] += x[idx+2];
678:     }
679:     break;
680: #if !defined(PETSC_USE_COMPLEX)
681:   case MAX_VALUES:
682:     for (i=0; i<n; i++) {
683:       idx      = *indicesx++;
684:       idy      = *indicesy++;
685:       y[idy]   = PetscMax(y[idy],x[idx]);
686:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
687:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
688:     }
689: #else
690:   case MAX_VALUES:
691: #endif
692:   case NOT_SET_VALUES:
693:     break;
694:   default:
695:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
696:   }
697:   return(0);
698: }
699: /* ----------------------------------------------------------------------------------------------- */
700: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
701: {
702:   PetscInt i,idx;

704:   for (i=0; i<n; i++) {
705:     idx  = *indicesx++;
706:     y[0] = x[idx];
707:     y[1] = x[idx+1];
708:     y[2] = x[idx+2];
709:     y[3] = x[idx+3];
710:     y   += 4;
711:   }
712: }
713: PETSC_STATIC_INLINE PetscErrorCode UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
714: {
715:   PetscInt i,idy;

718:   switch (addv) {
719:   case INSERT_VALUES:
720:   case INSERT_ALL_VALUES:
721:     for (i=0; i<n; i++) {
722:       idy      = *indicesy++;
723:       y[idy]   = x[0];
724:       y[idy+1] = x[1];
725:       y[idy+2] = x[2];
726:       y[idy+3] = x[3];
727:       x       += 4;
728:     }
729:     break;
730:   case ADD_VALUES:
731:   case ADD_ALL_VALUES:
732:     for (i=0; i<n; i++) {
733:       idy       = *indicesy++;
734:       y[idy]   += x[0];
735:       y[idy+1] += x[1];
736:       y[idy+2] += x[2];
737:       y[idy+3] += x[3];
738:       x        += 4;
739:     }
740:     break;
741: #if !defined(PETSC_USE_COMPLEX)
742:   case MAX_VALUES:
743:     for (i=0; i<n; i++) {
744:       idy      = *indicesy++;
745:       y[idy]   = PetscMax(y[idy],x[0]);
746:       y[idy+1] = PetscMax(y[idy+1],x[1]);
747:       y[idy+2] = PetscMax(y[idy+2],x[2]);
748:       y[idy+3] = PetscMax(y[idy+3],x[3]);
749:       x       += 4;
750:     }
751: #else
752:   case MAX_VALUES:
753: #endif
754:   case NOT_SET_VALUES:
755:     break;
756:   default:
757:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
758:   }
759:   return(0);
760: }

762: PETSC_STATIC_INLINE PetscErrorCode Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
763: {
764:   PetscInt i,idx,idy;

767:   switch (addv) {
768:   case INSERT_VALUES:
769:   case INSERT_ALL_VALUES:
770:     for (i=0; i<n; i++) {
771:       idx      = *indicesx++;
772:       idy      = *indicesy++;
773:       y[idy]   = x[idx];
774:       y[idy+1] = x[idx+1];
775:       y[idy+2] = x[idx+2];
776:       y[idy+3] = x[idx+3];
777:     }
778:     break;
779:   case ADD_VALUES:
780:   case ADD_ALL_VALUES:
781:     for (i=0; i<n; i++) {
782:       idx       = *indicesx++;
783:       idy       = *indicesy++;
784:       y[idy]   += x[idx];
785:       y[idy+1] += x[idx+1];
786:       y[idy+2] += x[idx+2];
787:       y[idy+3] += x[idx+3];
788:     }
789:     break;
790: #if !defined(PETSC_USE_COMPLEX)
791:   case MAX_VALUES:
792:     for (i=0; i<n; i++) {
793:       idx      = *indicesx++;
794:       idy      = *indicesy++;
795:       y[idy]   = PetscMax(y[idy],x[idx]);
796:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
797:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
798:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
799:     }
800: #else
801:   case MAX_VALUES:
802: #endif
803:   case NOT_SET_VALUES:
804:     break;
805:   default:
806:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
807:   }
808:   return(0);
809: }
810: /* ----------------------------------------------------------------------------------------------- */
811: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
812: {
813:   PetscInt i,idx;

815:   for (i=0; i<n; i++) {
816:     idx  = *indicesx++;
817:     y[0] = x[idx];
818:     y[1] = x[idx+1];
819:     y[2] = x[idx+2];
820:     y[3] = x[idx+3];
821:     y[4] = x[idx+4];
822:     y   += 5;
823:   }
824: }

826: PETSC_STATIC_INLINE PetscErrorCode UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
827: {
828:   PetscInt i,idy;

831:   switch (addv) {
832:   case INSERT_VALUES:
833:   case INSERT_ALL_VALUES:
834:     for (i=0; i<n; i++) {
835:       idy      = *indicesy++;
836:       y[idy]   = x[0];
837:       y[idy+1] = x[1];
838:       y[idy+2] = x[2];
839:       y[idy+3] = x[3];
840:       y[idy+4] = x[4];
841:       x       += 5;
842:     }
843:     break;
844:   case ADD_VALUES:
845:   case ADD_ALL_VALUES:
846:     for (i=0; i<n; i++) {
847:       idy       = *indicesy++;
848:       y[idy]   += x[0];
849:       y[idy+1] += x[1];
850:       y[idy+2] += x[2];
851:       y[idy+3] += x[3];
852:       y[idy+4] += x[4];
853:       x        += 5;
854:     }
855:     break;
856: #if !defined(PETSC_USE_COMPLEX)
857:   case MAX_VALUES:
858:     for (i=0; i<n; i++) {
859:       idy      = *indicesy++;
860:       y[idy]   = PetscMax(y[idy],x[0]);
861:       y[idy+1] = PetscMax(y[idy+1],x[1]);
862:       y[idy+2] = PetscMax(y[idy+2],x[2]);
863:       y[idy+3] = PetscMax(y[idy+3],x[3]);
864:       y[idy+4] = PetscMax(y[idy+4],x[4]);
865:       x       += 5;
866:     }
867: #else
868:   case MAX_VALUES:
869: #endif
870:   case NOT_SET_VALUES:
871:     break;
872:   default:
873:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
874:   }
875:   return(0);
876: }

878: PETSC_STATIC_INLINE PetscErrorCode Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
879: {
880:   PetscInt i,idx,idy;

883:   switch (addv) {
884:   case INSERT_VALUES:
885:   case INSERT_ALL_VALUES:
886:     for (i=0; i<n; i++) {
887:       idx      = *indicesx++;
888:       idy      = *indicesy++;
889:       y[idy]   = x[idx];
890:       y[idy+1] = x[idx+1];
891:       y[idy+2] = x[idx+2];
892:       y[idy+3] = x[idx+3];
893:       y[idy+4] = x[idx+4];
894:     }
895:     break;
896:   case ADD_VALUES:
897:   case ADD_ALL_VALUES:
898:     for (i=0; i<n; i++) {
899:       idx       = *indicesx++;
900:       idy       = *indicesy++;
901:       y[idy]   += x[idx];
902:       y[idy+1] += x[idx+1];
903:       y[idy+2] += x[idx+2];
904:       y[idy+3] += x[idx+3];
905:       y[idy+4] += x[idx+4];
906:     }
907:     break;
908: #if !defined(PETSC_USE_COMPLEX)
909:   case MAX_VALUES:
910:     for (i=0; i<n; i++) {
911:       idx      = *indicesx++;
912:       idy      = *indicesy++;
913:       y[idy]   = PetscMax(y[idy],x[idx]);
914:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
915:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
916:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
917:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
918:     }
919: #else
920:   case MAX_VALUES:
921: #endif
922:   case NOT_SET_VALUES:
923:     break;
924:   default:
925:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
926:   }
927:   return(0);
928: }
929: /* ----------------------------------------------------------------------------------------------- */
930: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
931: {
932:   PetscInt i,idx;

934:   for (i=0; i<n; i++) {
935:     idx  = *indicesx++;
936:     y[0] = x[idx];
937:     y[1] = x[idx+1];
938:     y[2] = x[idx+2];
939:     y[3] = x[idx+3];
940:     y[4] = x[idx+4];
941:     y[5] = x[idx+5];
942:     y   += 6;
943:   }
944: }

946: PETSC_STATIC_INLINE PetscErrorCode UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
947: {
948:   PetscInt i,idy;

951:   switch (addv) {
952:   case INSERT_VALUES:
953:   case INSERT_ALL_VALUES:
954:     for (i=0; i<n; i++) {
955:       idy      = *indicesy++;
956:       y[idy]   = x[0];
957:       y[idy+1] = x[1];
958:       y[idy+2] = x[2];
959:       y[idy+3] = x[3];
960:       y[idy+4] = x[4];
961:       y[idy+5] = x[5];
962:       x       += 6;
963:     }
964:     break;
965:   case ADD_VALUES:
966:   case ADD_ALL_VALUES:
967:     for (i=0; i<n; i++) {
968:       idy       = *indicesy++;
969:       y[idy]   += x[0];
970:       y[idy+1] += x[1];
971:       y[idy+2] += x[2];
972:       y[idy+3] += x[3];
973:       y[idy+4] += x[4];
974:       y[idy+5] += x[5];
975:       x        += 6;
976:     }
977:     break;
978: #if !defined(PETSC_USE_COMPLEX)
979:   case MAX_VALUES:
980:     for (i=0; i<n; i++) {
981:       idy      = *indicesy++;
982:       y[idy]   = PetscMax(y[idy],x[0]);
983:       y[idy+1] = PetscMax(y[idy+1],x[1]);
984:       y[idy+2] = PetscMax(y[idy+2],x[2]);
985:       y[idy+3] = PetscMax(y[idy+3],x[3]);
986:       y[idy+4] = PetscMax(y[idy+4],x[4]);
987:       y[idy+5] = PetscMax(y[idy+5],x[5]);
988:       x       += 6;
989:     }
990: #else
991:   case MAX_VALUES:
992: #endif
993:   case NOT_SET_VALUES:
994:     break;
995:   default:
996:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
997:   }
998:   return(0);
999: }

1001: PETSC_STATIC_INLINE PetscErrorCode Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1002: {
1003:   PetscInt i,idx,idy;

1006:   switch (addv) {
1007:   case INSERT_VALUES:
1008:   case INSERT_ALL_VALUES:
1009:     for (i=0; i<n; i++) {
1010:       idx      = *indicesx++;
1011:       idy      = *indicesy++;
1012:       y[idy]   = x[idx];
1013:       y[idy+1] = x[idx+1];
1014:       y[idy+2] = x[idx+2];
1015:       y[idy+3] = x[idx+3];
1016:       y[idy+4] = x[idx+4];
1017:       y[idy+5] = x[idx+5];
1018:     }
1019:     break;
1020:   case ADD_VALUES:
1021:   case ADD_ALL_VALUES:
1022:     for (i=0; i<n; i++) {
1023:       idx       = *indicesx++;
1024:       idy       = *indicesy++;
1025:       y[idy]   += x[idx];
1026:       y[idy+1] += x[idx+1];
1027:       y[idy+2] += x[idx+2];
1028:       y[idy+3] += x[idx+3];
1029:       y[idy+4] += x[idx+4];
1030:       y[idy+5] += x[idx+5];
1031:     }
1032:     break;
1033: #if !defined(PETSC_USE_COMPLEX)
1034:   case MAX_VALUES:
1035:     for (i=0; i<n; i++) {
1036:       idx      = *indicesx++;
1037:       idy      = *indicesy++;
1038:       y[idy]   = PetscMax(y[idy],x[idx]);
1039:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1040:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1041:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1042:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1043:       y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1044:     }
1045: #else
1046:   case MAX_VALUES:
1047: #endif
1048:   case NOT_SET_VALUES:
1049:     break;
1050:   default:
1051:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1052:   }
1053:   return(0);
1054: }
1055: /* ----------------------------------------------------------------------------------------------- */
1056: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1057: {
1058:   PetscInt i,idx;

1060:   for (i=0; i<n; i++) {
1061:     idx  = *indicesx++;
1062:     y[0] = x[idx];
1063:     y[1] = x[idx+1];
1064:     y[2] = x[idx+2];
1065:     y[3] = x[idx+3];
1066:     y[4] = x[idx+4];
1067:     y[5] = x[idx+5];
1068:     y[6] = x[idx+6];
1069:     y   += 7;
1070:   }
1071: }

1073: PETSC_STATIC_INLINE PetscErrorCode UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1074: {
1075:   PetscInt i,idy;

1078:   switch (addv) {
1079:   case INSERT_VALUES:
1080:   case INSERT_ALL_VALUES:
1081:     for (i=0; i<n; i++) {
1082:       idy      = *indicesy++;
1083:       y[idy]   = x[0];
1084:       y[idy+1] = x[1];
1085:       y[idy+2] = x[2];
1086:       y[idy+3] = x[3];
1087:       y[idy+4] = x[4];
1088:       y[idy+5] = x[5];
1089:       y[idy+6] = x[6];
1090:       x       += 7;
1091:     }
1092:     break;
1093:   case ADD_VALUES:
1094:   case ADD_ALL_VALUES:
1095:     for (i=0; i<n; i++) {
1096:       idy       = *indicesy++;
1097:       y[idy]   += x[0];
1098:       y[idy+1] += x[1];
1099:       y[idy+2] += x[2];
1100:       y[idy+3] += x[3];
1101:       y[idy+4] += x[4];
1102:       y[idy+5] += x[5];
1103:       y[idy+6] += x[6];
1104:       x        += 7;
1105:     }
1106:     break;
1107: #if !defined(PETSC_USE_COMPLEX)
1108:   case MAX_VALUES:
1109:     for (i=0; i<n; i++) {
1110:       idy      = *indicesy++;
1111:       y[idy]   = PetscMax(y[idy],x[0]);
1112:       y[idy+1] = PetscMax(y[idy+1],x[1]);
1113:       y[idy+2] = PetscMax(y[idy+2],x[2]);
1114:       y[idy+3] = PetscMax(y[idy+3],x[3]);
1115:       y[idy+4] = PetscMax(y[idy+4],x[4]);
1116:       y[idy+5] = PetscMax(y[idy+5],x[5]);
1117:       y[idy+6] = PetscMax(y[idy+6],x[6]);
1118:       x       += 7;
1119:     }
1120: #else
1121:   case MAX_VALUES:
1122: #endif
1123:   case NOT_SET_VALUES:
1124:     break;
1125:   default:
1126:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1127:   }
1128:   return(0);
1129: }

1131: PETSC_STATIC_INLINE PetscErrorCode Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1132: {
1133:   PetscInt i,idx,idy;

1136:   switch (addv) {
1137:   case INSERT_VALUES:
1138:   case INSERT_ALL_VALUES:
1139:     for (i=0; i<n; i++) {
1140:       idx      = *indicesx++;
1141:       idy      = *indicesy++;
1142:       y[idy]   = x[idx];
1143:       y[idy+1] = x[idx+1];
1144:       y[idy+2] = x[idx+2];
1145:       y[idy+3] = x[idx+3];
1146:       y[idy+4] = x[idx+4];
1147:       y[idy+5] = x[idx+5];
1148:       y[idy+6] = x[idx+6];
1149:     }
1150:     break;
1151:   case ADD_VALUES:
1152:   case ADD_ALL_VALUES:
1153:     for (i=0; i<n; i++) {
1154:       idx       = *indicesx++;
1155:       idy       = *indicesy++;
1156:       y[idy]   += x[idx];
1157:       y[idy+1] += x[idx+1];
1158:       y[idy+2] += x[idx+2];
1159:       y[idy+3] += x[idx+3];
1160:       y[idy+4] += x[idx+4];
1161:       y[idy+5] += x[idx+5];
1162:       y[idy+6] += x[idx+6];
1163:     }
1164:     break;
1165: #if !defined(PETSC_USE_COMPLEX)
1166:   case MAX_VALUES:
1167:     for (i=0; i<n; i++) {
1168:       idx      = *indicesx++;
1169:       idy      = *indicesy++;
1170:       y[idy]   = PetscMax(y[idy],x[idx]);
1171:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1172:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1173:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1174:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1175:       y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1176:       y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1177:     }
1178: #else
1179:   case MAX_VALUES:
1180: #endif
1181:   case NOT_SET_VALUES:
1182:     break;
1183:   default:
1184:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1185:   }
1186:   return(0);
1187: }
1188: /* ----------------------------------------------------------------------------------------------- */
1189: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1190: {
1191:   PetscInt i,idx;

1193:   for (i=0; i<n; i++) {
1194:     idx  = *indicesx++;
1195:     y[0] = x[idx];
1196:     y[1] = x[idx+1];
1197:     y[2] = x[idx+2];
1198:     y[3] = x[idx+3];
1199:     y[4] = x[idx+4];
1200:     y[5] = x[idx+5];
1201:     y[6] = x[idx+6];
1202:     y[7] = x[idx+7];
1203:     y   += 8;
1204:   }
1205: }

1207: PETSC_STATIC_INLINE PetscErrorCode UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1208: {
1209:   PetscInt i,idy;

1212:   switch (addv) {
1213:   case INSERT_VALUES:
1214:   case INSERT_ALL_VALUES:
1215:     for (i=0; i<n; i++) {
1216:       idy      = *indicesy++;
1217:       y[idy]   = x[0];
1218:       y[idy+1] = x[1];
1219:       y[idy+2] = x[2];
1220:       y[idy+3] = x[3];
1221:       y[idy+4] = x[4];
1222:       y[idy+5] = x[5];
1223:       y[idy+6] = x[6];
1224:       y[idy+7] = x[7];
1225:       x       += 8;
1226:     }
1227:     break;
1228:   case ADD_VALUES:
1229:   case ADD_ALL_VALUES:
1230:     for (i=0; i<n; i++) {
1231:       idy       = *indicesy++;
1232:       y[idy]   += x[0];
1233:       y[idy+1] += x[1];
1234:       y[idy+2] += x[2];
1235:       y[idy+3] += x[3];
1236:       y[idy+4] += x[4];
1237:       y[idy+5] += x[5];
1238:       y[idy+6] += x[6];
1239:       y[idy+7] += x[7];
1240:       x        += 8;
1241:     }
1242:     break;
1243: #if !defined(PETSC_USE_COMPLEX)
1244:   case MAX_VALUES:
1245:     for (i=0; i<n; i++) {
1246:       idy      = *indicesy++;
1247:       y[idy]   = PetscMax(y[idy],x[0]);
1248:       y[idy+1] = PetscMax(y[idy+1],x[1]);
1249:       y[idy+2] = PetscMax(y[idy+2],x[2]);
1250:       y[idy+3] = PetscMax(y[idy+3],x[3]);
1251:       y[idy+4] = PetscMax(y[idy+4],x[4]);
1252:       y[idy+5] = PetscMax(y[idy+5],x[5]);
1253:       y[idy+6] = PetscMax(y[idy+6],x[6]);
1254:       y[idy+7] = PetscMax(y[idy+7],x[7]);
1255:       x       += 8;
1256:     }
1257: #else
1258:   case MAX_VALUES:
1259: #endif
1260:   case NOT_SET_VALUES:
1261:     break;
1262:   default:
1263:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1264:   }
1265:   return(0);
1266: }

1268: PETSC_STATIC_INLINE PetscErrorCode Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1269: {
1270:   PetscInt i,idx,idy;

1273:   switch (addv) {
1274:   case INSERT_VALUES:
1275:   case INSERT_ALL_VALUES:
1276:     for (i=0; i<n; i++) {
1277:       idx      = *indicesx++;
1278:       idy      = *indicesy++;
1279:       y[idy]   = x[idx];
1280:       y[idy+1] = x[idx+1];
1281:       y[idy+2] = x[idx+2];
1282:       y[idy+3] = x[idx+3];
1283:       y[idy+4] = x[idx+4];
1284:       y[idy+5] = x[idx+5];
1285:       y[idy+6] = x[idx+6];
1286:       y[idy+7] = x[idx+7];
1287:     }
1288:     break;
1289:   case ADD_VALUES:
1290:   case ADD_ALL_VALUES:
1291:     for (i=0; i<n; i++) {
1292:       idx       = *indicesx++;
1293:       idy       = *indicesy++;
1294:       y[idy]   += x[idx];
1295:       y[idy+1] += x[idx+1];
1296:       y[idy+2] += x[idx+2];
1297:       y[idy+3] += x[idx+3];
1298:       y[idy+4] += x[idx+4];
1299:       y[idy+5] += x[idx+5];
1300:       y[idy+6] += x[idx+6];
1301:       y[idy+7] += x[idx+7];
1302:     }
1303:     break;
1304: #if !defined(PETSC_USE_COMPLEX)
1305:   case MAX_VALUES:
1306:     for (i=0; i<n; i++) {
1307:       idx      = *indicesx++;
1308:       idy      = *indicesy++;
1309:       y[idy]   = PetscMax(y[idy],x[idx]);
1310:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1311:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1312:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1313:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1314:       y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1315:       y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1316:       y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1317:     }
1318: #else
1319:   case MAX_VALUES:
1320: #endif
1321:   case NOT_SET_VALUES:
1322:     break;
1323:   default:
1324:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1325:   }
1326:   return(0);
1327: }

1329: PETSC_STATIC_INLINE void Pack_9(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1330: {
1331:   PetscInt i,idx;

1333:   for (i=0; i<n; i++) {
1334:     idx   = *indicesx++;
1335:     y[0]  = x[idx];
1336:     y[1]  = x[idx+1];
1337:     y[2]  = x[idx+2];
1338:     y[3]  = x[idx+3];
1339:     y[4]  = x[idx+4];
1340:     y[5]  = x[idx+5];
1341:     y[6]  = x[idx+6];
1342:     y[7]  = x[idx+7];
1343:     y[8]  = x[idx+8];
1344:     y    += 9;
1345:   }
1346: }

1348: PETSC_STATIC_INLINE PetscErrorCode UnPack_9(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1349: {
1350:   PetscInt i,idy;

1353:   switch (addv) {
1354:   case INSERT_VALUES:
1355:   case INSERT_ALL_VALUES:
1356:     for (i=0; i<n; i++) {
1357:       idy       = *indicesy++;
1358:       y[idy]    = x[0];
1359:       y[idy+1]  = x[1];
1360:       y[idy+2]  = x[2];
1361:       y[idy+3]  = x[3];
1362:       y[idy+4]  = x[4];
1363:       y[idy+5]  = x[5];
1364:       y[idy+6]  = x[6];
1365:       y[idy+7]  = x[7];
1366:       y[idy+8]  = x[8];
1367:       x        += 9;
1368:     }
1369:     break;
1370:   case ADD_VALUES:
1371:   case ADD_ALL_VALUES:
1372:     for (i=0; i<n; i++) {
1373:       idy        = *indicesy++;
1374:       y[idy]    += x[0];
1375:       y[idy+1]  += x[1];
1376:       y[idy+2]  += x[2];
1377:       y[idy+3]  += x[3];
1378:       y[idy+4]  += x[4];
1379:       y[idy+5]  += x[5];
1380:       y[idy+6]  += x[6];
1381:       y[idy+7]  += x[7];
1382:       y[idy+8]  += x[8];
1383:       x         += 9;
1384:     }
1385:     break;
1386: #if !defined(PETSC_USE_COMPLEX)
1387:   case MAX_VALUES:
1388:     for (i=0; i<n; i++) {
1389:       idy       = *indicesy++;
1390:       y[idy]    = PetscMax(y[idy],x[0]);
1391:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1392:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1393:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1394:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1395:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1396:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1397:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1398:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1399:       x        += 9;
1400:     }
1401: #else
1402:   case MAX_VALUES:
1403: #endif
1404:   case NOT_SET_VALUES:
1405:     break;
1406:   default:
1407:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1408:   }
1409:   return(0);
1410: }

1412: PETSC_STATIC_INLINE PetscErrorCode Scatter_9(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1413: {
1414:   PetscInt i,idx,idy;

1417:   switch (addv) {
1418:   case INSERT_VALUES:
1419:   case INSERT_ALL_VALUES:
1420:     for (i=0; i<n; i++) {
1421:       idx       = *indicesx++;
1422:       idy       = *indicesy++;
1423:       y[idy]    = x[idx];
1424:       y[idy+1]  = x[idx+1];
1425:       y[idy+2]  = x[idx+2];
1426:       y[idy+3]  = x[idx+3];
1427:       y[idy+4]  = x[idx+4];
1428:       y[idy+5]  = x[idx+5];
1429:       y[idy+6]  = x[idx+6];
1430:       y[idy+7]  = x[idx+7];
1431:       y[idy+8]  = x[idx+8];
1432:     }
1433:     break;
1434:   case ADD_VALUES:
1435:   case ADD_ALL_VALUES:
1436:     for (i=0; i<n; i++) {
1437:       idx        = *indicesx++;
1438:       idy        = *indicesy++;
1439:       y[idy]    += x[idx];
1440:       y[idy+1]  += x[idx+1];
1441:       y[idy+2]  += x[idx+2];
1442:       y[idy+3]  += x[idx+3];
1443:       y[idy+4]  += x[idx+4];
1444:       y[idy+5]  += x[idx+5];
1445:       y[idy+6]  += x[idx+6];
1446:       y[idy+7]  += x[idx+7];
1447:       y[idy+8]  += x[idx+8];
1448:     }
1449:     break;
1450: #if !defined(PETSC_USE_COMPLEX)
1451:   case MAX_VALUES:
1452:     for (i=0; i<n; i++) {
1453:       idx       = *indicesx++;
1454:       idy       = *indicesy++;
1455:       y[idy]    = PetscMax(y[idy],x[idx]);
1456:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1457:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1458:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1459:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1460:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1461:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1462:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1463:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1464:     }
1465: #else
1466:   case MAX_VALUES:
1467: #endif
1468:   case NOT_SET_VALUES:
1469:     break;
1470:   default:
1471:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1472:   }
1473:   return(0);
1474: }

1476: PETSC_STATIC_INLINE void Pack_10(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1477: {
1478:   PetscInt i,idx;

1480:   for (i=0; i<n; i++) {
1481:     idx   = *indicesx++;
1482:     y[0]  = x[idx];
1483:     y[1]  = x[idx+1];
1484:     y[2]  = x[idx+2];
1485:     y[3]  = x[idx+3];
1486:     y[4]  = x[idx+4];
1487:     y[5]  = x[idx+5];
1488:     y[6]  = x[idx+6];
1489:     y[7]  = x[idx+7];
1490:     y[8]  = x[idx+8];
1491:     y[9]  = x[idx+9];
1492:     y    += 10;
1493:   }
1494: }

1496: PETSC_STATIC_INLINE PetscErrorCode UnPack_10(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1497: {
1498:   PetscInt i,idy;

1501:   switch (addv) {
1502:   case INSERT_VALUES:
1503:   case INSERT_ALL_VALUES:
1504:     for (i=0; i<n; i++) {
1505:       idy       = *indicesy++;
1506:       y[idy]    = x[0];
1507:       y[idy+1]  = x[1];
1508:       y[idy+2]  = x[2];
1509:       y[idy+3]  = x[3];
1510:       y[idy+4]  = x[4];
1511:       y[idy+5]  = x[5];
1512:       y[idy+6]  = x[6];
1513:       y[idy+7]  = x[7];
1514:       y[idy+8]  = x[8];
1515:       y[idy+9]  = x[9];
1516:       x        += 10;
1517:     }
1518:     break;
1519:   case ADD_VALUES:
1520:   case ADD_ALL_VALUES:
1521:     for (i=0; i<n; i++) {
1522:       idy        = *indicesy++;
1523:       y[idy]    += x[0];
1524:       y[idy+1]  += x[1];
1525:       y[idy+2]  += x[2];
1526:       y[idy+3]  += x[3];
1527:       y[idy+4]  += x[4];
1528:       y[idy+5]  += x[5];
1529:       y[idy+6]  += x[6];
1530:       y[idy+7]  += x[7];
1531:       y[idy+8]  += x[8];
1532:       y[idy+9]  += x[9];
1533:       x         += 10;
1534:     }
1535:     break;
1536: #if !defined(PETSC_USE_COMPLEX)
1537:   case MAX_VALUES:
1538:     for (i=0; i<n; i++) {
1539:       idy       = *indicesy++;
1540:       y[idy]    = PetscMax(y[idy],x[0]);
1541:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1542:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1543:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1544:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1545:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1546:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1547:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1548:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1549:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1550:       x        += 10;
1551:     }
1552: #else
1553:   case MAX_VALUES:
1554: #endif
1555:   case NOT_SET_VALUES:
1556:     break;
1557:   default:
1558:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1559:   }
1560:   return(0);
1561: }

1563: PETSC_STATIC_INLINE PetscErrorCode Scatter_10(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1564: {
1565:   PetscInt i,idx,idy;

1568:   switch (addv) {
1569:   case INSERT_VALUES:
1570:   case INSERT_ALL_VALUES:
1571:     for (i=0; i<n; i++) {
1572:       idx       = *indicesx++;
1573:       idy       = *indicesy++;
1574:       y[idy]    = x[idx];
1575:       y[idy+1]  = x[idx+1];
1576:       y[idy+2]  = x[idx+2];
1577:       y[idy+3]  = x[idx+3];
1578:       y[idy+4]  = x[idx+4];
1579:       y[idy+5]  = x[idx+5];
1580:       y[idy+6]  = x[idx+6];
1581:       y[idy+7]  = x[idx+7];
1582:       y[idy+8]  = x[idx+8];
1583:       y[idy+9]  = x[idx+9];
1584:     }
1585:     break;
1586:   case ADD_VALUES:
1587:   case ADD_ALL_VALUES:
1588:     for (i=0; i<n; i++) {
1589:       idx        = *indicesx++;
1590:       idy        = *indicesy++;
1591:       y[idy]    += x[idx];
1592:       y[idy+1]  += x[idx+1];
1593:       y[idy+2]  += x[idx+2];
1594:       y[idy+3]  += x[idx+3];
1595:       y[idy+4]  += x[idx+4];
1596:       y[idy+5]  += x[idx+5];
1597:       y[idy+6]  += x[idx+6];
1598:       y[idy+7]  += x[idx+7];
1599:       y[idy+8]  += x[idx+8];
1600:       y[idy+9]  += x[idx+9];
1601:     }
1602:     break;
1603: #if !defined(PETSC_USE_COMPLEX)
1604:   case MAX_VALUES:
1605:     for (i=0; i<n; i++) {
1606:       idx       = *indicesx++;
1607:       idy       = *indicesy++;
1608:       y[idy]    = PetscMax(y[idy],x[idx]);
1609:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1610:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1611:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1612:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1613:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1614:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1615:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1616:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1617:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1618:     }
1619: #else
1620:   case MAX_VALUES:
1621: #endif
1622:   case NOT_SET_VALUES:
1623:     break;
1624:   default:
1625:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1626:   }
1627:   return(0);
1628: }

1630: PETSC_STATIC_INLINE void Pack_11(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1631: {
1632:   PetscInt i,idx;

1634:   for (i=0; i<n; i++) {
1635:     idx   = *indicesx++;
1636:     y[0]  = x[idx];
1637:     y[1]  = x[idx+1];
1638:     y[2]  = x[idx+2];
1639:     y[3]  = x[idx+3];
1640:     y[4]  = x[idx+4];
1641:     y[5]  = x[idx+5];
1642:     y[6]  = x[idx+6];
1643:     y[7]  = x[idx+7];
1644:     y[8]  = x[idx+8];
1645:     y[9]  = x[idx+9];
1646:     y[10] = x[idx+10];
1647:     y    += 11;
1648:   }
1649: }

1651: PETSC_STATIC_INLINE PetscErrorCode UnPack_11(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1652: {
1653:   PetscInt i,idy;

1656:   switch (addv) {
1657:   case INSERT_VALUES:
1658:   case INSERT_ALL_VALUES:
1659:     for (i=0; i<n; i++) {
1660:       idy       = *indicesy++;
1661:       y[idy]    = x[0];
1662:       y[idy+1]  = x[1];
1663:       y[idy+2]  = x[2];
1664:       y[idy+3]  = x[3];
1665:       y[idy+4]  = x[4];
1666:       y[idy+5]  = x[5];
1667:       y[idy+6]  = x[6];
1668:       y[idy+7]  = x[7];
1669:       y[idy+8]  = x[8];
1670:       y[idy+9]  = x[9];
1671:       y[idy+10] = x[10];
1672:       x        += 11;
1673:     }
1674:     break;
1675:   case ADD_VALUES:
1676:   case ADD_ALL_VALUES:
1677:     for (i=0; i<n; i++) {
1678:       idy        = *indicesy++;
1679:       y[idy]    += x[0];
1680:       y[idy+1]  += x[1];
1681:       y[idy+2]  += x[2];
1682:       y[idy+3]  += x[3];
1683:       y[idy+4]  += x[4];
1684:       y[idy+5]  += x[5];
1685:       y[idy+6]  += x[6];
1686:       y[idy+7]  += x[7];
1687:       y[idy+8]  += x[8];
1688:       y[idy+9]  += x[9];
1689:       y[idy+10] += x[10];
1690:       x         += 11;
1691:     }
1692:     break;
1693: #if !defined(PETSC_USE_COMPLEX)
1694:   case MAX_VALUES:
1695:     for (i=0; i<n; i++) {
1696:       idy       = *indicesy++;
1697:       y[idy]    = PetscMax(y[idy],x[0]);
1698:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1699:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1700:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1701:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1702:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1703:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1704:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1705:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1706:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1707:       y[idy+10] = PetscMax(y[idy+10],x[10]);
1708:       x        += 11;
1709:     }
1710: #else
1711:   case MAX_VALUES:
1712: #endif
1713:   case NOT_SET_VALUES:
1714:     break;
1715:   default:
1716:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1717:   }
1718:   return(0);
1719: }

1721: PETSC_STATIC_INLINE PetscErrorCode Scatter_11(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1722: {
1723:   PetscInt i,idx,idy;

1726:   switch (addv) {
1727:   case INSERT_VALUES:
1728:   case INSERT_ALL_VALUES:
1729:     for (i=0; i<n; i++) {
1730:       idx       = *indicesx++;
1731:       idy       = *indicesy++;
1732:       y[idy]    = x[idx];
1733:       y[idy+1]  = x[idx+1];
1734:       y[idy+2]  = x[idx+2];
1735:       y[idy+3]  = x[idx+3];
1736:       y[idy+4]  = x[idx+4];
1737:       y[idy+5]  = x[idx+5];
1738:       y[idy+6]  = x[idx+6];
1739:       y[idy+7]  = x[idx+7];
1740:       y[idy+8]  = x[idx+8];
1741:       y[idy+9]  = x[idx+9];
1742:       y[idy+10] = x[idx+10];
1743:     }
1744:     break;
1745:   case ADD_VALUES:
1746:   case ADD_ALL_VALUES:
1747:     for (i=0; i<n; i++) {
1748:       idx        = *indicesx++;
1749:       idy        = *indicesy++;
1750:       y[idy]    += x[idx];
1751:       y[idy+1]  += x[idx+1];
1752:       y[idy+2]  += x[idx+2];
1753:       y[idy+3]  += x[idx+3];
1754:       y[idy+4]  += x[idx+4];
1755:       y[idy+5]  += x[idx+5];
1756:       y[idy+6]  += x[idx+6];
1757:       y[idy+7]  += x[idx+7];
1758:       y[idy+8]  += x[idx+8];
1759:       y[idy+9]  += x[idx+9];
1760:       y[idy+10] += x[idx+10];
1761:     }
1762:     break;
1763: #if !defined(PETSC_USE_COMPLEX)
1764:   case MAX_VALUES:
1765:     for (i=0; i<n; i++) {
1766:       idx       = *indicesx++;
1767:       idy       = *indicesy++;
1768:       y[idy]    = PetscMax(y[idy],x[idx]);
1769:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1770:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1771:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1772:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1773:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1774:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1775:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1776:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1777:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1778:       y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1779:     }
1780: #else
1781:   case MAX_VALUES:
1782: #endif
1783:   case NOT_SET_VALUES:
1784:     break;
1785:   default:
1786:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1787:   }
1788:   return(0);
1789: }

1791: /* ----------------------------------------------------------------------------------------------- */
1792: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1793: {
1794:   PetscInt i,idx;

1796:   for (i=0; i<n; i++) {
1797:     idx   = *indicesx++;
1798:     y[0]  = x[idx];
1799:     y[1]  = x[idx+1];
1800:     y[2]  = x[idx+2];
1801:     y[3]  = x[idx+3];
1802:     y[4]  = x[idx+4];
1803:     y[5]  = x[idx+5];
1804:     y[6]  = x[idx+6];
1805:     y[7]  = x[idx+7];
1806:     y[8]  = x[idx+8];
1807:     y[9]  = x[idx+9];
1808:     y[10] = x[idx+10];
1809:     y[11] = x[idx+11];
1810:     y    += 12;
1811:   }
1812: }

1814: PETSC_STATIC_INLINE PetscErrorCode UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1815: {
1816:   PetscInt i,idy;

1819:   switch (addv) {
1820:   case INSERT_VALUES:
1821:   case INSERT_ALL_VALUES:
1822:     for (i=0; i<n; i++) {
1823:       idy       = *indicesy++;
1824:       y[idy]    = x[0];
1825:       y[idy+1]  = x[1];
1826:       y[idy+2]  = x[2];
1827:       y[idy+3]  = x[3];
1828:       y[idy+4]  = x[4];
1829:       y[idy+5]  = x[5];
1830:       y[idy+6]  = x[6];
1831:       y[idy+7]  = x[7];
1832:       y[idy+8]  = x[8];
1833:       y[idy+9]  = x[9];
1834:       y[idy+10] = x[10];
1835:       y[idy+11] = x[11];
1836:       x        += 12;
1837:     }
1838:     break;
1839:   case ADD_VALUES:
1840:   case ADD_ALL_VALUES:
1841:     for (i=0; i<n; i++) {
1842:       idy        = *indicesy++;
1843:       y[idy]    += x[0];
1844:       y[idy+1]  += x[1];
1845:       y[idy+2]  += x[2];
1846:       y[idy+3]  += x[3];
1847:       y[idy+4]  += x[4];
1848:       y[idy+5]  += x[5];
1849:       y[idy+6]  += x[6];
1850:       y[idy+7]  += x[7];
1851:       y[idy+8]  += x[8];
1852:       y[idy+9]  += x[9];
1853:       y[idy+10] += x[10];
1854:       y[idy+11] += x[11];
1855:       x         += 12;
1856:     }
1857:     break;
1858: #if !defined(PETSC_USE_COMPLEX)
1859:   case MAX_VALUES:
1860:     for (i=0; i<n; i++) {
1861:       idy       = *indicesy++;
1862:       y[idy]    = PetscMax(y[idy],x[0]);
1863:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1864:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1865:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1866:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1867:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1868:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1869:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1870:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1871:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1872:       y[idy+10] = PetscMax(y[idy+10],x[10]);
1873:       y[idy+11] = PetscMax(y[idy+11],x[11]);
1874:       x        += 12;
1875:     }
1876: #else
1877:   case MAX_VALUES:
1878: #endif
1879:   case NOT_SET_VALUES:
1880:     break;
1881:   default:
1882:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1883:   }
1884:   return(0);
1885: }

1887: PETSC_STATIC_INLINE PetscErrorCode Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1888: {
1889:   PetscInt i,idx,idy;

1892:   switch (addv) {
1893:   case INSERT_VALUES:
1894:   case INSERT_ALL_VALUES:
1895:     for (i=0; i<n; i++) {
1896:       idx       = *indicesx++;
1897:       idy       = *indicesy++;
1898:       y[idy]    = x[idx];
1899:       y[idy+1]  = x[idx+1];
1900:       y[idy+2]  = x[idx+2];
1901:       y[idy+3]  = x[idx+3];
1902:       y[idy+4]  = x[idx+4];
1903:       y[idy+5]  = x[idx+5];
1904:       y[idy+6]  = x[idx+6];
1905:       y[idy+7]  = x[idx+7];
1906:       y[idy+8]  = x[idx+8];
1907:       y[idy+9]  = x[idx+9];
1908:       y[idy+10] = x[idx+10];
1909:       y[idy+11] = x[idx+11];
1910:     }
1911:     break;
1912:   case ADD_VALUES:
1913:   case ADD_ALL_VALUES:
1914:     for (i=0; i<n; i++) {
1915:       idx        = *indicesx++;
1916:       idy        = *indicesy++;
1917:       y[idy]    += x[idx];
1918:       y[idy+1]  += x[idx+1];
1919:       y[idy+2]  += x[idx+2];
1920:       y[idy+3]  += x[idx+3];
1921:       y[idy+4]  += x[idx+4];
1922:       y[idy+5]  += x[idx+5];
1923:       y[idy+6]  += x[idx+6];
1924:       y[idy+7]  += x[idx+7];
1925:       y[idy+8]  += x[idx+8];
1926:       y[idy+9]  += x[idx+9];
1927:       y[idy+10] += x[idx+10];
1928:       y[idy+11] += x[idx+11];
1929:     }
1930:     break;
1931: #if !defined(PETSC_USE_COMPLEX)
1932:   case MAX_VALUES:
1933:     for (i=0; i<n; i++) {
1934:       idx       = *indicesx++;
1935:       idy       = *indicesy++;
1936:       y[idy]    = PetscMax(y[idy],x[idx]);
1937:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1938:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1939:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1940:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1941:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1942:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1943:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1944:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1945:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1946:       y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1947:       y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
1948:     }
1949: #else
1950:   case MAX_VALUES:
1951: #endif
1952:   case NOT_SET_VALUES:
1953:     break;
1954:   default:
1955:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1956:   }
1957:   return(0);
1958: }

1960: /* ----------------------------------------------------------------------------------------------- */
1961: PETSC_STATIC_INLINE void Pack_bs(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1962: {
1963:   PetscInt       i,idx;

1965:   for (i=0; i<n; i++) {
1966:     idx   = *indicesx++;
1967:     PetscMemcpy(y,x + idx,bs*sizeof(PetscScalar));
1968:     y    += bs;
1969:   }
1970: }

1972: PETSC_STATIC_INLINE PetscErrorCode UnPack_bs(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1973: {
1974:   PetscInt i,idy,j;

1977:   switch (addv) {
1978:   case INSERT_VALUES:
1979:   case INSERT_ALL_VALUES:
1980:     for (i=0; i<n; i++) {
1981:       idy       = *indicesy++;
1982:       PetscMemcpy(y + idy,x,bs*sizeof(PetscScalar));
1983:       x        += bs;
1984:     }
1985:     break;
1986:   case ADD_VALUES:
1987:   case ADD_ALL_VALUES:
1988:     for (i=0; i<n; i++) {
1989:       idy        = *indicesy++;
1990:       for (j=0; j<bs; j++) y[idy+j] += x[j];
1991:       x         += bs;
1992:     }
1993:     break;
1994: #if !defined(PETSC_USE_COMPLEX)
1995:   case MAX_VALUES:
1996:     for (i=0; i<n; i++) {
1997:       idy = *indicesy++;
1998:       for (j=0; j<bs; j++) y[idy+j] = PetscMax(y[idy+j],x[j]);
1999:       x  += bs;
2000:     }
2001: #else
2002:   case MAX_VALUES:
2003: #endif
2004:   case NOT_SET_VALUES:
2005:     break;
2006:   default:
2007:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
2008:   }
2009:   return(0);
2010: }

2012: PETSC_STATIC_INLINE PetscErrorCode Scatter_bs(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
2013: {
2014:   PetscInt i,idx,idy,j;

2017:   switch (addv) {
2018:   case INSERT_VALUES:
2019:   case INSERT_ALL_VALUES:
2020:     for (i=0; i<n; i++) {
2021:       idx       = *indicesx++;
2022:       idy       = *indicesy++;
2023:       PetscMemcpy(y + idy, x + idx,bs*sizeof(PetscScalar));
2024:     }
2025:     break;
2026:   case ADD_VALUES:
2027:   case ADD_ALL_VALUES:
2028:     for (i=0; i<n; i++) {
2029:       idx        = *indicesx++;
2030:       idy        = *indicesy++;
2031:       for (j=0; j<bs; j++ )  y[idy+j] += x[idx+j];
2032:     }
2033:     break;
2034: #if !defined(PETSC_USE_COMPLEX)
2035:   case MAX_VALUES:
2036:     for (i=0; i<n; i++) {
2037:       idx       = *indicesx++;
2038:       idy       = *indicesy++;
2039:       for (j=0; j<bs; j++ )  y[idy+j] = PetscMax(y[idy+j],x[idx+j]);
2040:     }
2041: #else
2042:   case MAX_VALUES:
2043: #endif
2044:   case NOT_SET_VALUES:
2045:     break;
2046:   default:
2047:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
2048:   }
2049:   return(0);
2050: }

2052: /* Create the VecScatterBegin/End_P for our chosen block sizes */
2053: #define BS 1
2054:  #include <../src/vec/vscat/impls/vpscat.h>
2055: #define BS 2
2056:  #include <../src/vec/vscat/impls/vpscat.h>
2057: #define BS 3
2058:  #include <../src/vec/vscat/impls/vpscat.h>
2059: #define BS 4
2060:  #include <../src/vec/vscat/impls/vpscat.h>
2061: #define BS 5
2062:  #include <../src/vec/vscat/impls/vpscat.h>
2063: #define BS 6
2064:  #include <../src/vec/vscat/impls/vpscat.h>
2065: #define BS 7
2066:  #include <../src/vec/vscat/impls/vpscat.h>
2067: #define BS 8
2068:  #include <../src/vec/vscat/impls/vpscat.h>
2069: #define BS 9
2070:  #include <../src/vec/vscat/impls/vpscat.h>
2071: #define BS 10
2072:  #include <../src/vec/vscat/impls/vpscat.h>
2073: #define BS 11
2074:  #include <../src/vec/vscat/impls/vpscat.h>
2075: #define BS 12
2076:  #include <../src/vec/vscat/impls/vpscat.h>
2077: #define BS bs
2078:  #include <../src/vec/vscat/impls/vpscat.h>

2080: /* ==========================================================================================*/
2081: /*              create parallel to sequential scatter context                                */

2083: extern PetscErrorCode VecScatterCreateCommon_PtoS_MPI3(VecScatter_MPI_General*,VecScatter_MPI_General*,VecScatter);

2085: /*
2086:    bs indicates how many elements there are in each block. Normally this would be 1.

2088:    contains check that PetscMPIInt can handle the sizes needed
2089: */
2090:  #include <petscbt.h>
2091: PetscErrorCode VecScatterCreateLocal_PtoS_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2092: {
2093:   VecScatter_MPI_General *from,*to;
2094:   PetscMPIInt            size,rank,mrank,imdex,tag,n;
2095:   PetscInt               *source = NULL,*owners = NULL,nxr;
2096:   PetscInt               *lowner = NULL,*start = NULL,*lsharedowner = NULL,*sharedstart = NULL,lengthy,lengthx;
2097:   PetscMPIInt            *nprocs = NULL,nrecvs,nrecvshared;
2098:   PetscInt               i,j,idx = 0,nsends;
2099:   PetscMPIInt            *owner = NULL;
2100:   PetscInt               *starts = NULL,count,slen,slenshared;
2101:   PetscInt               *rvalues,*svalues,base,*values,nprocslocal,recvtotal,*rsvalues;
2102:   PetscMPIInt            *onodes1,*olengths1;
2103:   MPI_Comm               comm;
2104:   MPI_Request            *send_waits = NULL,*recv_waits = NULL;
2105:   MPI_Status             recv_status,*send_status;
2106:   PetscErrorCode         ierr;
2107:   PetscShmComm           scomm;
2108:   PetscMPIInt            jj;
2109:   MPI_Info               info;
2110:   MPI_Comm               mscomm;
2111:   MPI_Win                sharedoffsetwin;  /* Window that owns sharedspaceoffset */
2112:   PetscInt               *sharedspaceoffset;
2113:   VecScatterType         type;
2114:   PetscBool              mpi3node;
2116:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2117:   PetscObjectGetComm((PetscObject)xin,&comm);
2118:   MPI_Comm_rank(comm,&rank);
2119:   MPI_Comm_size(comm,&size);

2121:   VecScatterGetType(ctx,&type);
2122:   PetscStrcmp(type,"mpi3node",&mpi3node);

2124:   PetscShmCommGet(comm,&scomm);
2125:   PetscShmCommGetMpiShmComm(scomm,&mscomm);

2127:   MPI_Info_create(&info);
2128:   MPI_Info_set(info, "alloc_shared_noncontig", "true");

2130:   if (mpi3node) {
2131:     /* Check if (parallel) inidx has duplicate indices.
2132:      Current VecScatterEndMPI3Node() (case StoP) writes the sequential vector to the parallel vector.
2133:      Writing to the same shared location without variable locking
2134:      leads to incorrect scattering. See src/vec/vscat/examples/runex2_5 and runex3_5 */

2136:     PetscInt    *mem,**optr;
2137:     MPI_Win     swin;
2138:     PetscMPIInt msize;

2140:     MPI_Comm_size(mscomm,&msize);
2141:     MPI_Comm_rank(mscomm,&mrank);

2143:     PetscMalloc1(msize,&optr);
2144:     MPIU_Win_allocate_shared((nx+1)*sizeof(PetscInt),sizeof(PetscInt),MPI_INFO_NULL,mscomm,&mem,&swin);

2146:     /* Write local nx and inidx into mem */
2147:     mem[0] = nx;
2148:     for (i=1; i<=nx; i++) mem[i] = inidx[i-1];
2149:     MPI_Barrier(mscomm); /* sync shared memory */

2151:     if (!mrank) {
2152:       PetscBT        table;
2153:       /* sz and dsp_unit are not used. Replacing them with NULL would cause MPI_Win_shared_query() crash */
2154:       MPI_Aint       sz;
2155:       PetscMPIInt    dsp_unit;
2156:       PetscInt       N = xin->map->N;

2158:       PetscBTCreate(N,&table); /* may not be scalable */
2159:       PetscBTMemzero(N,table);

2161:       jj = 0;
2162:       while (jj<msize && !ctx->is_duplicate) {
2163:         if (jj == mrank) {jj++; continue;}
2164:         MPIU_Win_shared_query(swin,jj,&sz,&dsp_unit,&optr[jj]);
2165:         for (j=1; j<=optr[jj][0]; j++) { /* optr[jj][0] = nx */
2166:           if (PetscBTLookupSet(table,optr[jj][j])) {
2167:             ctx->is_duplicate = PETSC_TRUE; /* only mrank=0 has this info., will be checked in VecScatterEndMPI3Node() */
2168:             break;
2169:           }
2170:         }
2171:         jj++;
2172:       }
2173:       PetscBTDestroy(&table);
2174:     }

2176:     if (swin != MPI_WIN_NULL) {MPI_Win_free(&swin);}
2177:     PetscFree(optr);
2178:   }

2180:   owners = xin->map->range;
2181:   VecGetSize(yin,&lengthy);
2182:   VecGetSize(xin,&lengthx);

2184:   /*  first count number of contributors to each processor */
2185:   PetscMalloc2(size,&nprocs,nx,&owner);
2186:   PetscMemzero(nprocs,size*sizeof(PetscMPIInt));

2188:   j      = 0;
2189:   nsends = 0;
2190:   for (i=0; i<nx; i++) {
2191:     PetscIntMultError(bs,inidx[i],&idx);
2192:     if (idx < owners[j]) j = 0;
2193:     for (; j<size; j++) {
2194:       if (idx < owners[j+1]) {
2195:         if (!nprocs[j]++) nsends++;
2196:         owner[i] = j;
2197:         break;
2198:       }
2199:     }
2200:     if (j == size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"ith %D block entry %D not owned by any process, upper bound %D",i,idx,owners[size]);
2201:   }

2203:   nprocslocal  = nprocs[rank];
2204:   nprocs[rank] = 0;
2205:   if (nprocslocal) nsends--;
2206:   /* inform other processors of number of messages and max length*/
2207:   PetscGatherNumberOfMessages(comm,NULL,nprocs,&nrecvs);
2208:   PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
2209:   PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
2210:   recvtotal = 0;
2211:   for (i=0; i<nrecvs; i++) {
2212:     PetscIntSumError(recvtotal,olengths1[i],&recvtotal);
2213:   }

2215:   /* post receives:   */
2216:   PetscMalloc3(recvtotal,&rvalues,nrecvs,&source,nrecvs,&recv_waits);
2217:   count = 0;
2218:   for (i=0; i<nrecvs; i++) {
2219:     MPI_Irecv((rvalues+count),olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
2220:     count += olengths1[i];
2221:   }

2223:   /* do sends:
2224:      1) starts[i] gives the starting index in svalues for stuff going to
2225:      the ith processor
2226:   */
2227:   nxr = 0;
2228:   for (i=0; i<nx; i++) {
2229:     if (owner[i] != rank) nxr++;
2230:   }
2231:   PetscMalloc3(nxr,&svalues,nsends,&send_waits,size+1,&starts);

2233:   starts[0]  = 0;
2234:   for (i=1; i<size; i++) starts[i] = starts[i-1] + nprocs[i-1];
2235:   for (i=0; i<nx; i++) {
2236:     if (owner[i] != rank) svalues[starts[owner[i]]++] = bs*inidx[i];
2237:   }
2238:   starts[0] = 0;
2239:   for (i=1; i<size+1; i++) starts[i] = starts[i-1] + nprocs[i-1];
2240:   count = 0;
2241:   for (i=0; i<size; i++) {
2242:     if (nprocs[i]) {
2243:       MPI_Isend(svalues+starts[i],nprocs[i],MPIU_INT,i,tag,comm,send_waits+count++);
2244:     }
2245:   }

2247:   /*  wait on receives; this is everyone who we need to deliver data to */
2248:   count = nrecvs;
2249:   slen  = 0;
2250:   nrecvshared = 0;
2251:   slenshared  = 0;
2252:   while (count) {
2253:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2254:     /* unpack receives into our local space */
2255:     MPI_Get_count(&recv_status,MPIU_INT,&n);
2256:     PetscShmCommGlobalToLocal(scomm,onodes1[imdex],&jj);
2257:     if (jj > -1) {
2258:       PetscInfo3(NULL,"[%d] Sending values to shared memory partner %d,global rank %d\n",rank,jj,onodes1[imdex]);
2259:       nrecvshared++;
2260:       slenshared += n;
2261:     }
2262:     slen += n;
2263:     count--;
2264:   }

2266:   if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not expected %D",slen,recvtotal);

2268:   /* allocate entire send scatter context */
2269:   PetscNewLog(ctx,&to);
2270:   to->sharedwin = MPI_WIN_NULL;
2271:   to->n         = nrecvs-nrecvshared;

2273:   PetscMalloc1(nrecvs-nrecvshared,&to->requests);
2274:   PetscMalloc4(bs*(slen-slenshared),&to->values,slen-slenshared,&to->indices,nrecvs-nrecvshared+1,&to->starts,nrecvs-nrecvshared,&to->procs);
2275:   PetscMalloc2(PetscMax(to->n,nsends),&to->sstatus,PetscMax(to->n,nsends),&to->rstatus);

2277:   MPI_Comm_size(mscomm,&to->msize);
2278:   PetscMalloc1(slenshared,&to->sharedspaceindices);
2279:   PetscCalloc1(to->msize+1,&to->sharedspacestarts);

2281:   ctx->todata              = (void*)to;
2282:   to->starts[0]            = 0;
2283:   to->sharedspacestarts[0] = 0;

2285:   /* Allocate shared memory space for shared memory partner communication */
2286:   MPIU_Win_allocate_shared(to->msize*sizeof(PetscInt),sizeof(PetscInt),info,mscomm,&sharedspaceoffset,&sharedoffsetwin);
2287:   for (i=0; i<to->msize; i++) sharedspaceoffset[i] = -1; /* mark with -1 for later error checking */
2288:   if (nrecvs) {
2289:     /* move the data into the send scatter */
2290:     base     = owners[rank];
2291:     rsvalues = rvalues;
2292:     to->n    = 0;
2293:     for (i=0; i<nrecvs; i++) {
2294:       values = rsvalues;
2295:       PetscShmCommGlobalToLocal(scomm,(PetscMPIInt)onodes1[i],&jj);
2296:       if (jj > -1) {
2297:         to->sharedspacestarts[jj]   = to->sharedcnt;
2298:         to->sharedspacestarts[jj+1] = to->sharedcnt + olengths1[i];
2299:         for (j=0; j<olengths1[i]; j++) to->sharedspaceindices[to->sharedcnt + j] = values[j] - base;
2300:         sharedspaceoffset[jj]      = to->sharedcnt;
2301:         to->sharedcnt             += olengths1[i];
2302:         PetscInfo4(NULL,"[%d] Sending %d values to shared memory partner %d,global rank %d\n",rank,olengths1[i],jj,onodes1[i]);
2303:       } else {
2304:         to->starts[to->n+1] = to->starts[to->n] + olengths1[i];
2305:         to->procs[to->n]    = onodes1[i];
2306:         for (j=0; j<olengths1[i]; j++) to->indices[to->starts[to->n] + j] = values[j] - base;
2307:         to->n++;
2308:       }
2309:       rsvalues += olengths1[i];
2310:     }
2311:   }
2312:   PetscFree(olengths1);
2313:   PetscFree(onodes1);
2314:   PetscFree3(rvalues,source,recv_waits);

2316:   if (mpi3node) {
2317:     /* to->sharedspace is used only as a flag */
2318:     i = 0;
2319:     if (to->sharedcnt) i = 1;
2320:     MPIU_Win_allocate_shared(i*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&to->sharedspace,&to->sharedwin);
2321:   } else { /* mpi3 */
2322:     MPIU_Win_allocate_shared(bs*to->sharedcnt*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&to->sharedspace,&to->sharedwin);
2323:   }
2324:   if (to->sharedwin == MPI_WIN_NULL) SETERRQ(PETSC_COMM_SELF,100,"what the");
2325:   MPI_Info_free(&info);

2327:   /* allocate entire receive scatter context */
2328:   PetscNewLog(ctx,&from);
2329:   from->sharedwin       = MPI_WIN_NULL;
2330:   from->msize           = to->msize;
2331:   /* we don't actually know the correct value for from->n at this point so we use the upper bound */
2332:   from->n = nsends;

2334:   PetscMalloc1(nsends,&from->requests);
2335:   PetscMalloc4((ny-nprocslocal)*bs,&from->values,ny-nprocslocal,&from->indices,nsends+1,&from->starts,from->n,&from->procs);
2336:   ctx->fromdata = (void*)from;

2338:   PetscCalloc1(to->msize+1,&from->sharedspacestarts);
2339:   /* move data into receive scatter */
2340:   PetscMalloc2(size,&lowner,nsends+1,&start);
2341:   PetscMalloc2(size,&lsharedowner,to->msize+1,&sharedstart);
2342:   count = 0; from->starts[0] = start[0] = 0;
2343:   from->sharedcnt = 0;
2344:   for (i=0; i<size; i++) {
2345:     lsharedowner[i] = -1;
2346:     lowner[i]       = -1;
2347:     if (nprocs[i]) {
2348:       PetscShmCommGlobalToLocal(scomm,i,&jj);
2349:       if (jj > -1) {
2350:         from->sharedspacestarts[jj] = sharedstart[jj] = from->sharedcnt;
2351:         from->sharedspacestarts[jj+1] = sharedstart[jj+1] = from->sharedspacestarts[jj] + nprocs[i];
2352:         from->sharedcnt += nprocs[i];
2353:         lsharedowner[i] = jj;
2354:       } else {
2355:         lowner[i]            = count++;
2356:         from->procs[count-1]  = i;
2357:         from->starts[count] = start[count] = start[count-1] + nprocs[i];
2358:       }
2359:     }
2360:   }
2361:   from->n = count;

2363:   for (i=0; i<nx; i++) {
2364:     if (owner[i] != rank && lowner[owner[i]] > -1) {
2365:       from->indices[start[lowner[owner[i]]]++] = bs*inidy[i];
2366:       if (bs*inidy[i] >= lengthy) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2367:     }
2368:   }

2370:   /* copy over appropriate parts of inidy[] into from->sharedspaceindices */
2371:   if (mpi3node) {
2372:     /* in addition, copy over appropriate parts of inidx[] into 2nd part of from->sharedspaceindices */
2373:     PetscInt *sharedspaceindicesx;
2374:     PetscMalloc1(2*from->sharedcnt,&from->sharedspaceindices);
2375:     sharedspaceindicesx = from->sharedspaceindices + from->sharedcnt;
2376:     for (i=0; i<nx; i++) {
2377:       if (owner[i] != rank && lsharedowner[owner[i]] > -1) {
2378:         if (sharedstart[lsharedowner[owner[i]]] > from->sharedcnt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"won");

2380:         from->sharedspaceindices[sharedstart[lsharedowner[owner[i]]]] = bs*inidy[i];
2381:         sharedspaceindicesx[sharedstart[lsharedowner[owner[i]]]] = bs*inidx[i]-owners[owner[i]]; /* local inidx */
2382:         sharedstart[lsharedowner[owner[i]]]++;
2383:       }
2384:     }
2385:   } else { /* mpi3 */
2386:     PetscMalloc1(from->sharedcnt,&from->sharedspaceindices);
2387:     for (i=0; i<nx; i++) {
2388:       if (owner[i] != rank && lsharedowner[owner[i]] > -1) {
2389:         if (sharedstart[lsharedowner[owner[i]]] > from->sharedcnt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"won");
2390:         from->sharedspaceindices[sharedstart[lsharedowner[owner[i]]]++] = bs*inidy[i];
2391:       }
2392:     }
2393:   }

2395:   PetscFree2(lowner,start);
2396:   PetscFree2(lsharedowner,sharedstart);
2397:   PetscFree2(nprocs,owner);

2399:   /* wait on sends */
2400:   if (nsends) {
2401:     PetscMalloc1(nsends,&send_status);
2402:     MPI_Waitall(nsends,send_waits,send_status);
2403:     PetscFree(send_status);
2404:   }
2405:   PetscFree3(svalues,send_waits,starts);

2407:   if (nprocslocal) {
2408:     PetscInt nt = from->local.n = to->local.n = nprocslocal;
2409:     /* we have a scatter to ourselves */
2410:     PetscMalloc1(nt,&to->local.vslots);
2411:     PetscMalloc1(nt,&from->local.vslots);
2412:     nt   = 0;
2413:     for (i=0; i<nx; i++) {
2414:       idx = bs*inidx[i];
2415:       if (idx >= owners[rank] && idx < owners[rank+1]) {
2416:         to->local.vslots[nt]     = idx - owners[rank];
2417:         from->local.vslots[nt++] = bs*inidy[i];
2418:         if (bs*inidy[i] >= lengthy) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2419:       }
2420:     }
2421:     PetscLogObjectMemory((PetscObject)ctx,2*nt*sizeof(PetscInt));
2422:   } else {
2423:     from->local.n      = 0;
2424:     from->local.vslots = 0;
2425:     to->local.n        = 0;
2426:     to->local.vslots   = 0;
2427:   }

2429:   /* Get the shared memory address for all processes we will be copying data from */
2430:   PetscCalloc1(to->msize,&from->sharedspaces);
2431:   PetscCalloc1(to->msize,&from->sharedspacesoffset);
2432:   MPI_Comm_rank(mscomm,&mrank);
2433:   for (jj=0; jj<to->msize; jj++) {
2434:     MPI_Aint    isize;
2435:     PetscMPIInt disp_unit;
2436:     PetscInt    *ptr;
2437:     MPIU_Win_shared_query(to->sharedwin,jj,&isize,&disp_unit,&from->sharedspaces[jj]);
2438:     MPIU_Win_shared_query(sharedoffsetwin,jj,&isize,&disp_unit,&ptr);
2439:     from->sharedspacesoffset[jj] = ptr[mrank];
2440:   }
2441:   MPI_Win_free(&sharedoffsetwin);

2443:   if (mpi3node && !from->sharedspace) {
2444:     /* comput from->notdone to be used by VecScatterEndMPI3Node() */
2445:     PetscInt notdone = 0;
2446:     for (i=0; i<from->msize; i++) {
2447:       if (from->sharedspacesoffset && from->sharedspacesoffset[i] > -1) {
2448:         notdone++;
2449:       }
2450:     }
2451:     from->notdone = notdone;
2452:   }

2454:   from->local.nonmatching_computed = PETSC_FALSE;
2455:   from->local.n_nonmatching        = 0;
2456:   from->local.slots_nonmatching    = 0;
2457:   to->local.nonmatching_computed   = PETSC_FALSE;
2458:   to->local.n_nonmatching          = 0;
2459:   to->local.slots_nonmatching      = 0;

2461:   from->format = VEC_SCATTER_MPI_GENERAL;
2462:   to->format   = VEC_SCATTER_MPI_GENERAL;
2463:   from->bs     = bs;
2464:   to->bs       = bs;

2466:   VecScatterCreateCommon_PtoS_MPI3(from,to,ctx);
2467:   return(0);
2468: }

2470: /*
2471:    bs indicates how many elements there are in each block. Normally this would be 1.
2472: */
2473: PetscErrorCode VecScatterCreateCommon_PtoS_MPI3(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
2474: {
2475:   MPI_Comm       comm;
2476:   PetscMPIInt    tag  = ((PetscObject)ctx)->tag, tagr;
2477:   PetscInt       bs   = to->bs;
2478:   PetscMPIInt    size;
2479:   PetscInt       i, n;
2481:   VecScatterType type;
2482:   PetscBool      mpi3node;

2485:   PetscObjectGetComm((PetscObject)ctx,&comm);
2486:   PetscObjectGetNewTag((PetscObject)ctx,&tagr);
2487:   ctx->ops->destroy = VecScatterDestroy_PtoP_MPI3;

2489:   MPI_Comm_size(comm,&size);
2490:   /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
2491:   to->contiq = PETSC_FALSE;
2492:   n = from->starts[from->n];
2493:   from->contiq = PETSC_TRUE;
2494:   for (i=1; i<n; i++) {
2495:     if (from->indices[i] != from->indices[i-1] + bs) {
2496:       from->contiq = PETSC_FALSE;
2497:       break;
2498:     }
2499:   }

2501:   ctx->ops->copy      = VecScatterCopy_PtoP_AllToAll;
2502:   {
2503:     PetscInt    *sstarts  = to->starts,  *rstarts = from->starts;
2504:     PetscMPIInt *sprocs   = to->procs,   *rprocs  = from->procs;
2505:     MPI_Request *swaits   = to->requests,*rwaits  = from->requests;
2506:     MPI_Request *rev_swaits,*rev_rwaits;
2507:     PetscScalar *Ssvalues = to->values, *Srvalues = from->values;

2509:     /* allocate additional wait variables for the "reverse" scatter */
2510:     PetscMalloc1(to->n,&rev_rwaits);
2511:     PetscMalloc1(from->n,&rev_swaits);
2512:     to->rev_requests   = rev_rwaits;
2513:     from->rev_requests = rev_swaits;

2515:     for (i=0; i<from->n; i++) {
2516:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
2517:     }

2519:     for (i=0; i<to->n; i++) {
2520:       MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
2521:     }
2522:     /* Register receives for scatter and reverse */
2523:     for (i=0; i<from->n; i++) {
2524:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
2525:     }
2526:     for (i=0; i<to->n; i++) {
2527:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tagr,comm,rev_rwaits+i);
2528:     }
2529:     ctx->ops->copy = VecScatterCopy_PtoP_X;
2530:   }
2531:   PetscInfo1(ctx,"Using blocksize %D scatter\n",bs);

2533: #if defined(PETSC_USE_DEBUG)
2534:   MPIU_Allreduce(&bs,&i,1,MPIU_INT,MPI_MIN,PetscObjectComm((PetscObject)ctx));
2535:   MPIU_Allreduce(&bs,&n,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)ctx));
2536:   if (bs!=i || bs!=n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Blocks size %D != %D or %D",bs,i,n);
2537: #endif

2539:   VecScatterGetType(ctx,&type);
2540:   PetscStrcmp(type,"mpi3node",&mpi3node);

2542:   if (mpi3node) {
2543:     switch (bs) {
2544:     case 12:
2545:       ctx->ops->begin = VecScatterBeginMPI3Node_12;
2546:       ctx->ops->end   = VecScatterEndMPI3Node_12;
2547:       break;
2548:     case 11:
2549:       ctx->ops->begin = VecScatterBeginMPI3Node_11;
2550:       ctx->ops->end   = VecScatterEndMPI3Node_11;
2551:       break;
2552:     case 10:
2553:       ctx->ops->begin = VecScatterBeginMPI3Node_10;
2554:       ctx->ops->end   = VecScatterEndMPI3Node_10;
2555:       break;
2556:     case 9:
2557:       ctx->ops->begin = VecScatterBeginMPI3Node_9;
2558:       ctx->ops->end   = VecScatterEndMPI3Node_9;
2559:       break;
2560:     case 8:
2561:       ctx->ops->begin = VecScatterBeginMPI3Node_8;
2562:       ctx->ops->end   = VecScatterEndMPI3Node_8;
2563:       break;
2564:     case 7:
2565:       ctx->ops->begin = VecScatterBeginMPI3Node_7;
2566:       ctx->ops->end   = VecScatterEndMPI3Node_7;
2567:       break;
2568:     case 6:
2569:       ctx->ops->begin = VecScatterBeginMPI3Node_6;
2570:       ctx->ops->end   = VecScatterEndMPI3Node_6;
2571:       break;
2572:     case 5:
2573:       ctx->ops->begin = VecScatterBeginMPI3Node_5;
2574:       ctx->ops->end   = VecScatterEndMPI3Node_5;
2575:       break;
2576:     case 4:
2577:       ctx->ops->begin = VecScatterBeginMPI3Node_4;
2578:       ctx->ops->end   = VecScatterEndMPI3Node_4;
2579:       break;
2580:     case 3:
2581:       ctx->ops->begin = VecScatterBeginMPI3Node_3;
2582:       ctx->ops->end   = VecScatterEndMPI3Node_3;
2583:       break;
2584:     case 2:
2585:       ctx->ops->begin = VecScatterBeginMPI3Node_2;
2586:       ctx->ops->end   = VecScatterEndMPI3Node_2;
2587:       break;
2588:     case 1:
2589:       ctx->ops->begin = VecScatterBeginMPI3Node_1;
2590:       ctx->ops->end   = VecScatterEndMPI3Node_1;
2591:       break;
2592:     default:
2593:       ctx->ops->begin = VecScatterBegin_bs;
2594:       ctx->ops->end   = VecScatterEnd_bs;
2595:     }
2596:   } else { /* !mpi3node */

2598:     switch (bs) {
2599:     case 12:
2600:       ctx->ops->begin = VecScatterBegin_12;
2601:       ctx->ops->end   = VecScatterEnd_12;
2602:       break;
2603:     case 11:
2604:       ctx->ops->begin = VecScatterBegin_11;
2605:       ctx->ops->end   = VecScatterEnd_11;
2606:       break;
2607:     case 10:
2608:       ctx->ops->begin = VecScatterBegin_10;
2609:       ctx->ops->end   = VecScatterEnd_10;
2610:       break;
2611:     case 9:
2612:       ctx->ops->begin = VecScatterBegin_9;
2613:       ctx->ops->end   = VecScatterEnd_9;
2614:       break;
2615:     case 8:
2616:       ctx->ops->begin = VecScatterBegin_8;
2617:       ctx->ops->end   = VecScatterEnd_8;
2618:       break;
2619:     case 7:
2620:       ctx->ops->begin = VecScatterBegin_7;
2621:       ctx->ops->end   = VecScatterEnd_7;
2622:       break;
2623:     case 6:
2624:       ctx->ops->begin = VecScatterBegin_6;
2625:       ctx->ops->end   = VecScatterEnd_6;
2626:       break;
2627:     case 5:
2628:       ctx->ops->begin = VecScatterBegin_5;
2629:       ctx->ops->end   = VecScatterEnd_5;
2630:       break;
2631:     case 4:
2632:       ctx->ops->begin = VecScatterBegin_4;
2633:       ctx->ops->end   = VecScatterEnd_4;
2634:       break;
2635:     case 3:
2636:       ctx->ops->begin = VecScatterBegin_3;
2637:       ctx->ops->end   = VecScatterEnd_3;
2638:       break;
2639:     case 2:
2640:       ctx->ops->begin = VecScatterBegin_2;
2641:       ctx->ops->end   = VecScatterEnd_2;
2642:       break;
2643:     case 1:
2644:       if (mpi3node) {
2645:         ctx->ops->begin = VecScatterBeginMPI3Node_1;
2646:         ctx->ops->end   = VecScatterEndMPI3Node_1;
2647:       } else {
2648:         ctx->ops->begin = VecScatterBegin_1;
2649:         ctx->ops->end   = VecScatterEnd_1;
2650:       }
2651:       break;
2652:     default:
2653:       ctx->ops->begin = VecScatterBegin_bs;
2654:       ctx->ops->end   = VecScatterEnd_bs;
2655:     }
2656:   }

2658:   ctx->ops->view = VecScatterView_MPI;

2660:   /* try to optimize PtoP vecscatter with memcpy's */
2661:   VecScatterMemcpyPlanCreate_PtoP(to,from);
2662:   return(0);
2663: }

2665: /* ------------------------------------------------------------------------------------*/
2666: /*
2667:          Scatter from local Seq vectors to a parallel vector.
2668:          Reverses the order of the arguments, calls VecScatterCreateLocal_PtoS() then
2669:          reverses the result.
2670: */
2671: PetscErrorCode VecScatterCreateLocal_StoP_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2672: {
2673:   PetscErrorCode         ierr;
2674:   MPI_Request            *waits;
2675:   VecScatter_MPI_General *to,*from;

2678:   VecScatterCreateLocal_PtoS_MPI3(ny,inidy,nx,inidx,yin,xin,bs,ctx);
2679:   to            = (VecScatter_MPI_General*)ctx->fromdata;
2680:   from          = (VecScatter_MPI_General*)ctx->todata;
2681:   ctx->todata   = (void*)to;
2682:   ctx->fromdata = (void*)from;
2683:   /* these two are special, they are ALWAYS stored in to struct */
2684:   to->sstatus   = from->sstatus;
2685:   to->rstatus   = from->rstatus;

2687:   from->sstatus = 0;
2688:   from->rstatus = 0;
2689:   waits              = from->rev_requests;
2690:   from->rev_requests = from->requests;
2691:   from->requests     = waits;
2692:   waits              = to->rev_requests;
2693:   to->rev_requests   = to->requests;
2694:   to->requests       = waits;
2695:   return(0);
2696: }

2698: /* ---------------------------------------------------------------------------------*/
2699: PetscErrorCode VecScatterCreateLocal_PtoP_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2700: {
2702:   PetscMPIInt    size,rank,tag,imdex,n;
2703:   PetscInt       *owners = xin->map->range;
2704:   PetscMPIInt    *nprocs = NULL;
2705:   PetscInt       i,j,idx,nsends,*local_inidx = NULL,*local_inidy = NULL;
2706:   PetscMPIInt    *owner   = NULL;
2707:   PetscInt       *starts  = NULL,count,slen;
2708:   PetscInt       *rvalues = NULL,*svalues = NULL,base,*values = NULL,*rsvalues,recvtotal,lastidx;
2709:   PetscMPIInt    *onodes1,*olengths1,nrecvs;
2710:   MPI_Comm       comm;
2711:   MPI_Request    *send_waits = NULL,*recv_waits = NULL;
2712:   MPI_Status     recv_status,*send_status = NULL;
2713:   PetscBool      duplicate = PETSC_FALSE;
2714: #if defined(PETSC_USE_DEBUG)
2715:   PetscBool      found = PETSC_FALSE;
2716: #endif

2719:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2720:   PetscObjectGetComm((PetscObject)xin,&comm);
2721:   MPI_Comm_size(comm,&size);
2722:   MPI_Comm_rank(comm,&rank);
2723:   if (size == 1) {
2724:     VecScatterCreateLocal_StoP_MPI3(nx,inidx,ny,inidy,xin,yin,bs,ctx);
2725:     return(0);
2726:   }

2728:   /*
2729:      Each processor ships off its inidx[j] and inidy[j] to the appropriate processor
2730:      They then call the StoPScatterCreate()
2731:   */
2732:   /*  first count number of contributors to each processor */
2733:   PetscMalloc3(size,&nprocs,nx,&owner,(size+1),&starts);
2734:   PetscMemzero(nprocs,size*sizeof(PetscMPIInt));

2736:   lastidx = -1;
2737:   j       = 0;
2738:   for (i=0; i<nx; i++) {
2739:     /* if indices are NOT locally sorted, need to start search at the beginning */
2740:     if (lastidx > (idx = bs*inidx[i])) j = 0;
2741:     lastidx = idx;
2742:     for (; j<size; j++) {
2743:       if (idx >= owners[j] && idx < owners[j+1]) {
2744:         nprocs[j]++;
2745:         owner[i] = j;
2746: #if defined(PETSC_USE_DEBUG)
2747:         found = PETSC_TRUE;
2748: #endif
2749:         break;
2750:       }
2751:     }
2752: #if defined(PETSC_USE_DEBUG)
2753:     if (!found) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
2754:     found = PETSC_FALSE;
2755: #endif
2756:   }
2757:   nsends = 0;
2758:   for (i=0; i<size; i++) nsends += (nprocs[i] > 0);

2760:   /* inform other processors of number of messages and max length*/
2761:   PetscGatherNumberOfMessages(comm,NULL,nprocs,&nrecvs);
2762:   PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
2763:   PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
2764:   recvtotal = 0;
2765:   for (i=0; i<nrecvs; i++) {
2766:     PetscIntSumError(recvtotal,olengths1[i],&recvtotal);
2767:   }

2769:   /* post receives:   */
2770:   PetscMalloc5(2*recvtotal,&rvalues,2*nx,&svalues,nrecvs,&recv_waits,nsends,&send_waits,nsends,&send_status);

2772:   count = 0;
2773:   for (i=0; i<nrecvs; i++) {
2774:     MPI_Irecv((rvalues+2*count),2*olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
2775:     PetscIntSumError(count,olengths1[i],&count);
2776:   }
2777:   PetscFree(onodes1);

2779:   /* do sends:
2780:       1) starts[i] gives the starting index in svalues for stuff going to
2781:          the ith processor
2782:   */
2783:   starts[0]= 0;
2784:   for (i=1; i<size; i++) starts[i] = starts[i-1] + nprocs[i-1];
2785:   for (i=0; i<nx; i++) {
2786:     svalues[2*starts[owner[i]]]       = bs*inidx[i];
2787:     svalues[1 + 2*starts[owner[i]]++] = bs*inidy[i];
2788:   }

2790:   starts[0] = 0;
2791:   for (i=1; i<size+1; i++) starts[i] = starts[i-1] + nprocs[i-1];
2792:   count = 0;
2793:   for (i=0; i<size; i++) {
2794:     if (nprocs[i]) {
2795:       MPI_Isend(svalues+2*starts[i],2*nprocs[i],MPIU_INT,i,tag,comm,send_waits+count);
2796:       count++;
2797:     }
2798:   }
2799:   PetscFree3(nprocs,owner,starts);

2801:   /*  wait on receives */
2802:   count = nrecvs;
2803:   slen  = 0;
2804:   while (count) {
2805:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2806:     /* unpack receives into our local space */
2807:     MPI_Get_count(&recv_status,MPIU_INT,&n);
2808:     slen += n/2;
2809:     count--;
2810:   }
2811:   if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not as expected %D",slen,recvtotal);

2813:   PetscMalloc2(slen,&local_inidx,slen,&local_inidy);
2814:   base     = owners[rank];
2815:   count    = 0;
2816:   rsvalues = rvalues;
2817:   for (i=0; i<nrecvs; i++) {
2818:     values    = rsvalues;
2819:     rsvalues += 2*olengths1[i];
2820:     for (j=0; j<olengths1[i]; j++) {
2821:       local_inidx[count]   = values[2*j] - base;
2822:       local_inidy[count++] = values[2*j+1];
2823:     }
2824:   }
2825:   PetscFree(olengths1);

2827:   /* wait on sends */
2828:   if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
2829:   PetscFree5(rvalues,svalues,recv_waits,send_waits,send_status);

2831:   /*
2832:      should sort and remove duplicates from local_inidx,local_inidy
2833:   */
2834: #if defined(do_it_slow)
2835:   /* sort on the from index */
2836:   PetscSortIntWithArray(slen,local_inidx,local_inidy);
2837:   start = 0;
2838:   while (start < slen) {
2839:     count = start+1;
2840:     last  = local_inidx[start];
2841:     while (count < slen && last == local_inidx[count]) count++;
2842:     if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2843:       /* sort on to index */
2844:       PetscSortInt(count-start,local_inidy+start);
2845:     }
2846:     /* remove duplicates; not most efficient way, but probably good enough */
2847:     i = start;
2848:     while (i < count-1) {
2849:       if (local_inidy[i] != local_inidy[i+1]) i++;
2850:       else { /* found a duplicate */
2851:         duplicate = PETSC_TRUE;
2852:         for (j=i; j<slen-1; j++) {
2853:           local_inidx[j] = local_inidx[j+1];
2854:           local_inidy[j] = local_inidy[j+1];
2855:         }
2856:         slen--;
2857:         count--;
2858:       }
2859:     }
2860:     start = count;
2861:   }
2862: #endif
2863:   if (duplicate) {
2864:     PetscInfo(ctx,"Duplicate from to indices passed in VecScatterCreate(), they are ignored\n");
2865:   }
2866:   VecScatterCreateLocal_StoP_MPI3(slen,local_inidx,slen,local_inidy,xin,yin,bs,ctx);
2867:   PetscFree2(local_inidx,local_inidy);
2868:   return(0);
2869: }

2871: #endif