Actual source code: vpscat.c

petsc-3.11.4 2019-09-28
Report Typos and Errors
  1: /*
  2:     Defines parallel vector scatters.
  3: */

  5: #include <petsc/private/vecscatterimpl.h>


  8: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
  9: {
 10:   VecScatter_MPI_General *to  =(VecScatter_MPI_General*)ctx->todata;
 11:   VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
 12:   PetscErrorCode         ierr;
 13:   PetscInt               i;
 14:   PetscMPIInt            rank;
 15:   PetscViewerFormat      format;
 16:   PetscBool              iascii;

 19:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
 20:   if (iascii) {
 21:     MPI_Comm_rank(PetscObjectComm((PetscObject)ctx),&rank);
 22:     PetscViewerGetFormat(viewer,&format);
 23:     if (format ==  PETSC_VIEWER_ASCII_INFO) {
 24:       PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;

 26:       MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 27:       MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 28:       itmp = to->starts[to->n+1];
 29:       MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 30:       itmp = from->starts[from->n+1];
 31:       MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 32:       MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,PetscObjectComm((PetscObject)ctx));

 34:       PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
 35:       PetscViewerASCIIPrintf(viewer,"  Blocksize %D\n",to->bs);
 36:       PetscViewerASCIIPrintf(viewer,"  Maximum number sends %D\n",nsend_max);
 37:       PetscViewerASCIIPrintf(viewer,"  Maximum number receives %D\n",nrecv_max);
 38:       PetscViewerASCIIPrintf(viewer,"  Maximum data sent %D\n",(int)(lensend_max*to->bs*sizeof(PetscScalar)));
 39:       PetscViewerASCIIPrintf(viewer,"  Maximum data received %D\n",(int)(lenrecv_max*to->bs*sizeof(PetscScalar)));
 40:       PetscViewerASCIIPrintf(viewer,"  Total data sent %D\n",(int)(alldata*to->bs*sizeof(PetscScalar)));
 41:     } else {
 42:       PetscViewerASCIIPrintf(viewer,"  VecScatter Blocksize %D\n",to->bs);
 43:       PetscViewerASCIIPushSynchronized(viewer);
 44:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
 45:       if (to->n) {
 46:         for (i=0; i<to->n; i++) {
 47:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]   %D length = %D to whom %d\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
 48:         }
 49:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Now the indices for all remote sends (in order by process sent to)\n",rank);
 50:         for (i=0; i<to->starts[to->n]; i++) {
 51:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D \n",rank,to->indices[i]);
 52:         }
 53:       }

 55:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
 56:       if (from->n) {
 57:         for (i=0; i<from->n; i++) {
 58:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %d\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
 59:         }

 61:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Now the indices for all remote receives (in order by process received from)\n",rank);
 62:         for (i=0; i<from->starts[from->n]; i++) {
 63:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D \n",rank,from->indices[i]);
 64:         }
 65:       }
 66:       if (to->local.n) {
 67:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Indices for local part of scatter\n",rank);
 68:         for (i=0; i<to->local.n; i++) {  /* the to and from have the opposite meaning from what you would expect */
 69:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] From %D to %D \n",rank,to->local.vslots[i],from->local.vslots[i]);
 70:         }
 71:       }

 73:      for (i=0; i<to->msize; i++) {
 74:        if (to->sharedspacestarts[i+1] > to->sharedspacestarts[i]) {
 75:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Via shared memory to local memory partner %d count %d\n",rank,i,to->sharedspacestarts[i+1]-to->sharedspacestarts[i]);
 76:         }
 77:       }
 78:      for (i=0; i<from->msize; i++) {
 79:        if (from->sharedspacestarts[i+1] > from->sharedspacestarts[i]) {
 80:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Via shared memory from local memory partner %d count %d\n",rank,i,from->sharedspacestarts[i+1]-from->sharedspacestarts[i]);
 81:         }
 82:       }

 84:       PetscViewerFlush(viewer);
 85:       PetscViewerASCIIPopSynchronized(viewer);

 87:       PetscViewerASCIIPrintf(viewer,"Method used to implement the VecScatter: ");
 88:     }
 89:   }
 90:   return(0);
 91: }

 93: /* -----------------------------------------------------------------------------------*/
 94: /*
 95:       The next routine determines what part of  the local part of the scatter is an
 96:   exact copy of values into their current location. We check this here and
 97:   then know that we need not perform that portion of the scatter when the vector is
 98:   scattering to itself with INSERT_VALUES.

100:      This is currently not used but would speed up, for example DMLocalToLocalBegin/End()

102: */
103: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from)
104: {
105:   PetscInt       n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
107:   PetscInt       *nto_slots,*nfrom_slots,j = 0;

110:   for (i=0; i<n; i++) {
111:     if (to_slots[i] != from_slots[i]) n_nonmatching++;
112:   }

114:   if (!n_nonmatching) {
115:     to->nonmatching_computed = PETSC_TRUE;
116:     to->n_nonmatching        = from->n_nonmatching = 0;
117:     PetscInfo1(scatter,"Reduced %D to 0\n", n);
118:   } else if (n_nonmatching == n) {
119:     to->nonmatching_computed = PETSC_FALSE;
120:     PetscInfo(scatter,"All values non-matching\n");
121:   } else {
122:     to->nonmatching_computed= PETSC_TRUE;
123:     to->n_nonmatching       = from->n_nonmatching = n_nonmatching;

125:     PetscMalloc1(n_nonmatching,&nto_slots);
126:     PetscMalloc1(n_nonmatching,&nfrom_slots);

128:     to->slots_nonmatching   = nto_slots;
129:     from->slots_nonmatching = nfrom_slots;
130:     for (i=0; i<n; i++) {
131:       if (to_slots[i] != from_slots[i]) {
132:         nto_slots[j]   = to_slots[i];
133:         nfrom_slots[j] = from_slots[i];
134:         j++;
135:       }
136:     }
137:     PetscInfo2(scatter,"Reduced %D to %D\n",n,n_nonmatching);
138:   }
139:   return(0);
140: }

142: /* --------------------------------------------------------------------------------------*/

144: /* -------------------------------------------------------------------------------------*/
145: PetscErrorCode VecScatterDestroy_PtoP_MPI3(VecScatter ctx)
146: {
147:   VecScatter_MPI_General *to   = (VecScatter_MPI_General*)ctx->todata;
148:   VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
149:   PetscErrorCode         ierr;
150:   PetscInt               i;

153:   /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
154:   if (to->requests) {
155:     for (i=0; i<to->n; i++) {
156:       MPI_Request_free(to->requests + i);
157:     }
158:   }
159:   if (to->rev_requests) {
160:     for (i=0; i<to->n; i++) {
161:       MPI_Request_free(to->rev_requests + i);
162:     }
163:   }
164:   if (from->requests) {
165:     for (i=0; i<from->n; i++) {
166:       MPI_Request_free(from->requests + i);
167:     }
168:   }
169:   if (from->rev_requests) {
170:     for (i=0; i<from->n; i++) {
171:       MPI_Request_free(from->rev_requests + i);
172:     }
173:   }
174:   if (to->sharedwin != MPI_WIN_NULL) {MPI_Win_free(&to->sharedwin);}
175:   if (from->sharedwin != MPI_WIN_NULL) {MPI_Win_free(&from->sharedwin);}
176:   PetscFree(to->sharedspaces);
177:   PetscFree(to->sharedspacesoffset);
178:   PetscFree(to->sharedspaceindices);
179:   PetscFree(to->sharedspacestarts);

181:   PetscFree(from->sharedspaceindices);
182:   PetscFree(from->sharedspaces);
183:   PetscFree(from->sharedspacesoffset);
184:   PetscFree(from->sharedspacestarts);

186:   PetscFree(to->local.vslots);
187:   PetscFree(from->local.vslots);
188:   PetscFree(to->local.slots_nonmatching);
189:   PetscFree(from->local.slots_nonmatching);
190:   PetscFree(to->rev_requests);
191:   PetscFree(from->rev_requests);
192:   PetscFree(to->requests);
193:   PetscFree(from->requests);
194:   PetscFree4(to->values,to->indices,to->starts,to->procs);
195:   PetscFree2(to->sstatus,to->rstatus);
196:   PetscFree4(from->values,from->indices,from->starts,from->procs);
197:   VecScatterMemcpyPlanDestroy_PtoP(to,from);
198:   PetscFree(from);
199:   PetscFree(to);
200:   return(0);
201: }

203: /* --------------------------------------------------------------------------------------*/

205: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
206: {
207:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
208:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
209:   PetscErrorCode         ierr;
210:   PetscInt               ny,bs = in_from->bs,jj;
211:   PetscShmComm           scomm;
212:   MPI_Comm               mscomm;
213:   MPI_Info               info;

216:   out->ops->begin   = in->ops->begin;
217:   out->ops->end     = in->ops->end;
218:   out->ops->copy    = in->ops->copy;
219:   out->ops->destroy = in->ops->destroy;
220:   out->ops->view    = in->ops->view;

222:   /* allocate entire send scatter context */
223:   PetscNewLog(out,&out_to);
224:   out_to->sharedwin       = MPI_WIN_NULL;
225:   PetscNewLog(out,&out_from);
226:   out_from->sharedwin       = MPI_WIN_NULL;

228:   ny                = in_to->starts[in_to->n];
229:   out_to->n         = in_to->n;
230:   out_to->format    = in_to->format;

232:   PetscMalloc1(out_to->n,&out_to->requests);
233:   PetscMalloc4(bs*ny,&out_to->values,ny,&out_to->indices,out_to->n+1,&out_to->starts,out_to->n,&out_to->procs);
234:   PetscMalloc2(PetscMax(in_to->n,in_from->n),&out_to->sstatus,PetscMax(in_to->n,in_from->n),&out_to->rstatus);
235:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
236:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
237:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));

239:   out->todata                        = (void*)out_to;
240:   out_to->local.n                    = in_to->local.n;
241:   out_to->local.nonmatching_computed = PETSC_FALSE;
242:   out_to->local.n_nonmatching        = 0;
243:   out_to->local.slots_nonmatching    = 0;
244:   if (in_to->local.n) {
245:     PetscMalloc1(in_to->local.n,&out_to->local.vslots);
246:     PetscMalloc1(in_from->local.n,&out_from->local.vslots);
247:     PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
248:     PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
249:   } else {
250:     out_to->local.vslots   = 0;
251:     out_from->local.vslots = 0;
252:   }

254:   /* allocate entire receive context */
255:   VecScatterMemcpyPlanCopy(&in_to->local.memcpy_plan,&out_to->local.memcpy_plan);
256:   VecScatterMemcpyPlanCopy(&in_from->local.memcpy_plan,&out_from->local.memcpy_plan);
257:   VecScatterMemcpyPlanCopy(&in_to->memcpy_plan,&out_to->memcpy_plan);
258:   VecScatterMemcpyPlanCopy(&in_from->memcpy_plan,&out_from->memcpy_plan);

260:   out_from->format    = in_from->format;
261:   ny                  = in_from->starts[in_from->n];
262:   out_from->n         = in_from->n;

264:   PetscMalloc1(out_from->n,&out_from->requests);
265:   PetscMalloc4(ny*bs,&out_from->values,ny,&out_from->indices,out_from->n+1,&out_from->starts,out_from->n,&out_from->procs);
266:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
267:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
268:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));

270:   out->fromdata                        = (void*)out_from;
271:   out_from->local.n                    = in_from->local.n;
272:   out_from->local.nonmatching_computed = PETSC_FALSE;
273:   out_from->local.n_nonmatching        = 0;
274:   out_from->local.slots_nonmatching    = 0;

276:   /*
277:       set up the request arrays for use with isend_init() and irecv_init()
278:   */
279:   {
280:     PetscMPIInt tag;
281:     MPI_Comm    comm;
282:     PetscInt    *sstarts = out_to->starts,  *rstarts = out_from->starts;
283:     PetscMPIInt *sprocs  = out_to->procs,   *rprocs  = out_from->procs;
284:     PetscInt    i;
285:     MPI_Request *swaits   = out_to->requests,*rwaits  = out_from->requests;
286:     MPI_Request *rev_swaits,*rev_rwaits;
287:     PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;

289:     PetscMalloc1(in_to->n,&out_to->rev_requests);
290:     PetscMalloc1(in_from->n,&out_from->rev_requests);

292:     rev_rwaits = out_to->rev_requests;
293:     rev_swaits = out_from->rev_requests;

295:     out_from->bs = out_to->bs = bs;
296:     tag          = ((PetscObject)out)->tag;
297:     PetscObjectGetComm((PetscObject)out,&comm);

299:     /* Register the receives that you will use later (sends for scatter reverse) */
300:     for (i=0; i<out_from->n; i++) {
301:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
302:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
303:     }

305:     for (i=0; i<out_to->n; i++) {
306:       MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
307:     }

309:     /* Register receives for scatter reverse */
310:     for (i=0; i<out_to->n; i++) {
311:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
312:     }
313:   }

315:     /* since the to and from data structures are not symmetric for shared memory copies we insure they always listed in "standard" form */
316:   if (!in_to->sharedwin) {
317:     VecScatter_MPI_General *totmp = in_to,*out_totmp = out_to;
318:     in_to       = in_from;
319:     in_from     = totmp;
320:     out_to      = out_from;
321:     out_from    = out_totmp;
322:   }

324:   /* copy the to parts for the shared memory copies between processes */
325:   out_to->sharedcnt = in_to->sharedcnt;
326:   out_to->msize     = in_to->msize;
327:   PetscMalloc1(out_to->msize+1,&out_to->sharedspacestarts);
328:   PetscMemcpy(out_to->sharedspacestarts,in_to->sharedspacestarts,(out_to->msize+1)*sizeof(PetscInt));
329:   PetscMalloc1(out_to->sharedcnt,&out_to->sharedspaceindices);
330:   PetscMemcpy(out_to->sharedspaceindices,in_to->sharedspaceindices,out_to->sharedcnt*sizeof(PetscInt));

332:   PetscShmCommGet(PetscObjectComm((PetscObject)in),&scomm);
333:   PetscShmCommGetMpiShmComm(scomm,&mscomm);
334:   MPI_Info_create(&info);
335:   MPI_Info_set(info, "alloc_shared_noncontig", "true");
336:   MPIU_Win_allocate_shared(bs*out_to->sharedcnt*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&out_to->sharedspace,&out_to->sharedwin);
337:   MPI_Info_free(&info);

339:   /* copy the to parts for the shared memory copies between processes */
340:   out_from->sharedcnt = in_from->sharedcnt;
341:   out_from->msize     = in_from->msize;
342:   PetscMalloc1(out_from->msize+1,&out_from->sharedspacestarts);
343:   PetscMemcpy(out_from->sharedspacestarts,in_from->sharedspacestarts,(out_from->msize+1)*sizeof(PetscInt));
344:   PetscMalloc1(out_from->sharedcnt,&out_from->sharedspaceindices);
345:   PetscMemcpy(out_from->sharedspaceindices,in_from->sharedspaceindices,out_from->sharedcnt*sizeof(PetscInt));
346:   PetscMalloc1(out_from->msize,&out_from->sharedspacesoffset);
347:   PetscMemcpy(out_from->sharedspacesoffset,in_from->sharedspacesoffset,out_from->msize*sizeof(PetscInt));
348:   PetscCalloc1(out_from->msize,&out_from->sharedspaces);
349:   for (jj=0; jj<out_from->msize; jj++) {
350:     MPI_Aint    isize;
351:     PetscMPIInt disp_unit;
352:     MPIU_Win_shared_query(out_to->sharedwin,jj,&isize,&disp_unit,&out_from->sharedspaces[jj]);
353:   }
354:   return(0);
355: }

357: PetscErrorCode VecScatterCopy_PtoP_AllToAll(VecScatter in,VecScatter out)
358: {
359:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
360:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
361:   PetscErrorCode         ierr;
362:   PetscInt               ny,bs = in_from->bs;
363:   PetscMPIInt            size;

366:   MPI_Comm_size(PetscObjectComm((PetscObject)in),&size);

368:   out->ops->begin     = in->ops->begin;
369:   out->ops->end       = in->ops->end;
370:   out->ops->copy      = in->ops->copy;
371:   out->ops->destroy   = in->ops->destroy;
372:   out->ops->view      = in->ops->view;

374:   /* allocate entire send scatter context */
375:   PetscNewLog(out,&out_to);
376:   out_to->sharedwin       = MPI_WIN_NULL;
377:   PetscNewLog(out,&out_from);
378:   out_from->sharedwin       = MPI_WIN_NULL;

380:   ny                = in_to->starts[in_to->n];
381:   out_to->n         = in_to->n;
382:   out_to->format    = in_to->format;

384:   PetscMalloc1(out_to->n,&out_to->requests);
385:   PetscMalloc4(bs*ny,&out_to->values,ny,&out_to->indices,out_to->n+1,&out_to->starts,out_to->n,&out_to->procs);
386:   PetscMalloc2(PetscMax(in_to->n,in_from->n),&out_to->sstatus,PetscMax(in_to->n,in_from->n),&out_to->rstatus);
387:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
388:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
389:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));

391:   out->todata                        = (void*)out_to;
392:   out_to->local.n                    = in_to->local.n;
393:   out_to->local.nonmatching_computed = PETSC_FALSE;
394:   out_to->local.n_nonmatching        = 0;
395:   out_to->local.slots_nonmatching    = 0;
396:   if (in_to->local.n) {
397:     PetscMalloc1(in_to->local.n,&out_to->local.vslots);
398:     PetscMalloc1(in_from->local.n,&out_from->local.vslots);
399:     PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
400:     PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
401:   } else {
402:     out_to->local.vslots   = 0;
403:     out_from->local.vslots = 0;
404:   }

406:   /* allocate entire receive context */
407:   VecScatterMemcpyPlanCopy_PtoP(in_to,in_from,out_to,out_from);

409:   out_from->format    = in_from->format;
410:   ny                  = in_from->starts[in_from->n];
411:   out_from->n         = in_from->n;

413:   PetscMalloc1(out_from->n,&out_from->requests);
414:   PetscMalloc4(ny*bs,&out_from->values,ny,&out_from->indices,out_from->n+1,&out_from->starts,out_from->n,&out_from->procs);
415:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
416:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
417:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));

419:   out->fromdata                        = (void*)out_from;
420:   out_from->local.n                    = in_from->local.n;
421:   out_from->local.nonmatching_computed = PETSC_FALSE;
422:   out_from->local.n_nonmatching        = 0;
423:   out_from->local.slots_nonmatching    = 0;

425:   return(0);
426: }
427: /* --------------------------------------------------------------------------------------------------
428:     Packs and unpacks the message data into send or from receive buffers.

430:     These could be generated automatically.

432:     Fortran kernels etc. could be used.
433: */
434: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
435: {
436:   PetscInt i;
437:   for (i=0; i<n; i++) y[i] = x[indicesx[i]];
438: }

440: PETSC_STATIC_INLINE PetscErrorCode UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
441: {
442:   PetscInt i;

445:   switch (addv) {
446:   case INSERT_VALUES:
447:   case INSERT_ALL_VALUES:
448:     for (i=0; i<n; i++) y[indicesy[i]] = x[i];
449:     break;
450:   case ADD_VALUES:
451:   case ADD_ALL_VALUES:
452:     for (i=0; i<n; i++) y[indicesy[i]] += x[i];
453:     break;
454: #if !defined(PETSC_USE_COMPLEX)
455:   case MAX_VALUES:
456:     for (i=0; i<n; i++) y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
457: #else
458:   case MAX_VALUES:
459: #endif
460:   case NOT_SET_VALUES:
461:     break;
462:   default:
463:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
464:   }
465:   return(0);
466: }

468: PETSC_STATIC_INLINE PetscErrorCode Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
469: {
470:   PetscInt i;

473:   switch (addv) {
474:   case INSERT_VALUES:
475:   case INSERT_ALL_VALUES:
476:     for (i=0; i<n; i++) y[indicesy[i]] = x[indicesx[i]];
477:     break;
478:   case ADD_VALUES:
479:   case ADD_ALL_VALUES:
480:     for (i=0; i<n; i++) y[indicesy[i]] += x[indicesx[i]];
481:     break;
482: #if !defined(PETSC_USE_COMPLEX)
483:   case MAX_VALUES:
484:     for (i=0; i<n; i++) y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
485: #else
486:   case MAX_VALUES:
487: #endif
488:   case NOT_SET_VALUES:
489:     break;
490:   default:
491:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
492:   }
493:   return(0);
494: }

496: /* ----------------------------------------------------------------------------------------------- */
497: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
498: {
499:   PetscInt i,idx;

501:   for (i=0; i<n; i++) {
502:     idx  = *indicesx++;
503:     y[0] = x[idx];
504:     y[1] = x[idx+1];
505:     y   += 2;
506:   }
507: }

509: PETSC_STATIC_INLINE PetscErrorCode UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
510: {
511:   PetscInt i,idy;

514:   switch (addv) {
515:   case INSERT_VALUES:
516:   case INSERT_ALL_VALUES:
517:     for (i=0; i<n; i++) {
518:       idy      = *indicesy++;
519:       y[idy]   = x[0];
520:       y[idy+1] = x[1];
521:       x       += 2;
522:     }
523:     break;
524:   case ADD_VALUES:
525:   case ADD_ALL_VALUES:
526:     for (i=0; i<n; i++) {
527:       idy       = *indicesy++;
528:       y[idy]   += x[0];
529:       y[idy+1] += x[1];
530:       x        += 2;
531:     }
532:     break;
533: #if !defined(PETSC_USE_COMPLEX)
534:   case MAX_VALUES:
535:     for (i=0; i<n; i++) {
536:       idy      = *indicesy++;
537:       y[idy]   = PetscMax(y[idy],x[0]);
538:       y[idy+1] = PetscMax(y[idy+1],x[1]);
539:       x       += 2;
540:     }
541: #else
542:   case MAX_VALUES:
543: #endif
544:   case NOT_SET_VALUES:
545:     break;
546:   default:
547:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
548:   }
549:   return(0);
550: }

552: PETSC_STATIC_INLINE PetscErrorCode Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
553: {
554:   PetscInt i,idx,idy;

557:   switch (addv) {
558:   case INSERT_VALUES:
559:   case INSERT_ALL_VALUES:
560:     for (i=0; i<n; i++) {
561:       idx      = *indicesx++;
562:       idy      = *indicesy++;
563:       y[idy]   = x[idx];
564:       y[idy+1] = x[idx+1];
565:     }
566:     break;
567:   case ADD_VALUES:
568:   case ADD_ALL_VALUES:
569:     for (i=0; i<n; i++) {
570:       idx       = *indicesx++;
571:       idy       = *indicesy++;
572:       y[idy]   += x[idx];
573:       y[idy+1] += x[idx+1];
574:     }
575:     break;
576: #if !defined(PETSC_USE_COMPLEX)
577:   case MAX_VALUES:
578:     for (i=0; i<n; i++) {
579:       idx      = *indicesx++;
580:       idy      = *indicesy++;
581:       y[idy]   = PetscMax(y[idy],x[idx]);
582:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
583:     }
584: #else
585:   case MAX_VALUES:
586: #endif
587:   case NOT_SET_VALUES:
588:     break;
589:   default:
590:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
591:   }
592:   return(0);
593: }
594: /* ----------------------------------------------------------------------------------------------- */
595: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
596: {
597:   PetscInt i,idx;

599:   for (i=0; i<n; i++) {
600:     idx  = *indicesx++;
601:     y[0] = x[idx];
602:     y[1] = x[idx+1];
603:     y[2] = x[idx+2];
604:     y   += 3;
605:   }
606: }
607: PETSC_STATIC_INLINE PetscErrorCode UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
608: {
609:   PetscInt i,idy;

612:   switch (addv) {
613:   case INSERT_VALUES:
614:   case INSERT_ALL_VALUES:
615:     for (i=0; i<n; i++) {
616:       idy      = *indicesy++;
617:       y[idy]   = x[0];
618:       y[idy+1] = x[1];
619:       y[idy+2] = x[2];
620:       x       += 3;
621:     }
622:     break;
623:   case ADD_VALUES:
624:   case ADD_ALL_VALUES:
625:     for (i=0; i<n; i++) {
626:       idy       = *indicesy++;
627:       y[idy]   += x[0];
628:       y[idy+1] += x[1];
629:       y[idy+2] += x[2];
630:       x        += 3;
631:     }
632:     break;
633: #if !defined(PETSC_USE_COMPLEX)
634:   case MAX_VALUES:
635:     for (i=0; i<n; i++) {
636:       idy      = *indicesy++;
637:       y[idy]   = PetscMax(y[idy],x[0]);
638:       y[idy+1] = PetscMax(y[idy+1],x[1]);
639:       y[idy+2] = PetscMax(y[idy+2],x[2]);
640:       x       += 3;
641:     }
642: #else
643:   case MAX_VALUES:
644: #endif
645:   case NOT_SET_VALUES:
646:     break;
647:   default:
648:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
649:   }
650:   return(0);
651: }

653: PETSC_STATIC_INLINE PetscErrorCode Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
654: {
655:   PetscInt i,idx,idy;

658:   switch (addv) {
659:   case INSERT_VALUES:
660:   case INSERT_ALL_VALUES:
661:     for (i=0; i<n; i++) {
662:       idx      = *indicesx++;
663:       idy      = *indicesy++;
664:       y[idy]   = x[idx];
665:       y[idy+1] = x[idx+1];
666:       y[idy+2] = x[idx+2];
667:     }
668:     break;
669:   case ADD_VALUES:
670:   case ADD_ALL_VALUES:
671:     for (i=0; i<n; i++) {
672:       idx       = *indicesx++;
673:       idy       = *indicesy++;
674:       y[idy]   += x[idx];
675:       y[idy+1] += x[idx+1];
676:       y[idy+2] += x[idx+2];
677:     }
678:     break;
679: #if !defined(PETSC_USE_COMPLEX)
680:   case MAX_VALUES:
681:     for (i=0; i<n; i++) {
682:       idx      = *indicesx++;
683:       idy      = *indicesy++;
684:       y[idy]   = PetscMax(y[idy],x[idx]);
685:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
686:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
687:     }
688: #else
689:   case MAX_VALUES:
690: #endif
691:   case NOT_SET_VALUES:
692:     break;
693:   default:
694:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
695:   }
696:   return(0);
697: }
698: /* ----------------------------------------------------------------------------------------------- */
699: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
700: {
701:   PetscInt i,idx;

703:   for (i=0; i<n; i++) {
704:     idx  = *indicesx++;
705:     y[0] = x[idx];
706:     y[1] = x[idx+1];
707:     y[2] = x[idx+2];
708:     y[3] = x[idx+3];
709:     y   += 4;
710:   }
711: }
712: PETSC_STATIC_INLINE PetscErrorCode UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
713: {
714:   PetscInt i,idy;

717:   switch (addv) {
718:   case INSERT_VALUES:
719:   case INSERT_ALL_VALUES:
720:     for (i=0; i<n; i++) {
721:       idy      = *indicesy++;
722:       y[idy]   = x[0];
723:       y[idy+1] = x[1];
724:       y[idy+2] = x[2];
725:       y[idy+3] = x[3];
726:       x       += 4;
727:     }
728:     break;
729:   case ADD_VALUES:
730:   case ADD_ALL_VALUES:
731:     for (i=0; i<n; i++) {
732:       idy       = *indicesy++;
733:       y[idy]   += x[0];
734:       y[idy+1] += x[1];
735:       y[idy+2] += x[2];
736:       y[idy+3] += x[3];
737:       x        += 4;
738:     }
739:     break;
740: #if !defined(PETSC_USE_COMPLEX)
741:   case MAX_VALUES:
742:     for (i=0; i<n; i++) {
743:       idy      = *indicesy++;
744:       y[idy]   = PetscMax(y[idy],x[0]);
745:       y[idy+1] = PetscMax(y[idy+1],x[1]);
746:       y[idy+2] = PetscMax(y[idy+2],x[2]);
747:       y[idy+3] = PetscMax(y[idy+3],x[3]);
748:       x       += 4;
749:     }
750: #else
751:   case MAX_VALUES:
752: #endif
753:   case NOT_SET_VALUES:
754:     break;
755:   default:
756:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
757:   }
758:   return(0);
759: }

761: PETSC_STATIC_INLINE PetscErrorCode Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
762: {
763:   PetscInt i,idx,idy;

766:   switch (addv) {
767:   case INSERT_VALUES:
768:   case INSERT_ALL_VALUES:
769:     for (i=0; i<n; i++) {
770:       idx      = *indicesx++;
771:       idy      = *indicesy++;
772:       y[idy]   = x[idx];
773:       y[idy+1] = x[idx+1];
774:       y[idy+2] = x[idx+2];
775:       y[idy+3] = x[idx+3];
776:     }
777:     break;
778:   case ADD_VALUES:
779:   case ADD_ALL_VALUES:
780:     for (i=0; i<n; i++) {
781:       idx       = *indicesx++;
782:       idy       = *indicesy++;
783:       y[idy]   += x[idx];
784:       y[idy+1] += x[idx+1];
785:       y[idy+2] += x[idx+2];
786:       y[idy+3] += x[idx+3];
787:     }
788:     break;
789: #if !defined(PETSC_USE_COMPLEX)
790:   case MAX_VALUES:
791:     for (i=0; i<n; i++) {
792:       idx      = *indicesx++;
793:       idy      = *indicesy++;
794:       y[idy]   = PetscMax(y[idy],x[idx]);
795:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
796:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
797:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
798:     }
799: #else
800:   case MAX_VALUES:
801: #endif
802:   case NOT_SET_VALUES:
803:     break;
804:   default:
805:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
806:   }
807:   return(0);
808: }
809: /* ----------------------------------------------------------------------------------------------- */
810: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
811: {
812:   PetscInt i,idx;

814:   for (i=0; i<n; i++) {
815:     idx  = *indicesx++;
816:     y[0] = x[idx];
817:     y[1] = x[idx+1];
818:     y[2] = x[idx+2];
819:     y[3] = x[idx+3];
820:     y[4] = x[idx+4];
821:     y   += 5;
822:   }
823: }

825: PETSC_STATIC_INLINE PetscErrorCode UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
826: {
827:   PetscInt i,idy;

830:   switch (addv) {
831:   case INSERT_VALUES:
832:   case INSERT_ALL_VALUES:
833:     for (i=0; i<n; i++) {
834:       idy      = *indicesy++;
835:       y[idy]   = x[0];
836:       y[idy+1] = x[1];
837:       y[idy+2] = x[2];
838:       y[idy+3] = x[3];
839:       y[idy+4] = x[4];
840:       x       += 5;
841:     }
842:     break;
843:   case ADD_VALUES:
844:   case ADD_ALL_VALUES:
845:     for (i=0; i<n; i++) {
846:       idy       = *indicesy++;
847:       y[idy]   += x[0];
848:       y[idy+1] += x[1];
849:       y[idy+2] += x[2];
850:       y[idy+3] += x[3];
851:       y[idy+4] += x[4];
852:       x        += 5;
853:     }
854:     break;
855: #if !defined(PETSC_USE_COMPLEX)
856:   case MAX_VALUES:
857:     for (i=0; i<n; i++) {
858:       idy      = *indicesy++;
859:       y[idy]   = PetscMax(y[idy],x[0]);
860:       y[idy+1] = PetscMax(y[idy+1],x[1]);
861:       y[idy+2] = PetscMax(y[idy+2],x[2]);
862:       y[idy+3] = PetscMax(y[idy+3],x[3]);
863:       y[idy+4] = PetscMax(y[idy+4],x[4]);
864:       x       += 5;
865:     }
866: #else
867:   case MAX_VALUES:
868: #endif
869:   case NOT_SET_VALUES:
870:     break;
871:   default:
872:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
873:   }
874:   return(0);
875: }

877: PETSC_STATIC_INLINE PetscErrorCode Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
878: {
879:   PetscInt i,idx,idy;

882:   switch (addv) {
883:   case INSERT_VALUES:
884:   case INSERT_ALL_VALUES:
885:     for (i=0; i<n; i++) {
886:       idx      = *indicesx++;
887:       idy      = *indicesy++;
888:       y[idy]   = x[idx];
889:       y[idy+1] = x[idx+1];
890:       y[idy+2] = x[idx+2];
891:       y[idy+3] = x[idx+3];
892:       y[idy+4] = x[idx+4];
893:     }
894:     break;
895:   case ADD_VALUES:
896:   case ADD_ALL_VALUES:
897:     for (i=0; i<n; i++) {
898:       idx       = *indicesx++;
899:       idy       = *indicesy++;
900:       y[idy]   += x[idx];
901:       y[idy+1] += x[idx+1];
902:       y[idy+2] += x[idx+2];
903:       y[idy+3] += x[idx+3];
904:       y[idy+4] += x[idx+4];
905:     }
906:     break;
907: #if !defined(PETSC_USE_COMPLEX)
908:   case MAX_VALUES:
909:     for (i=0; i<n; i++) {
910:       idx      = *indicesx++;
911:       idy      = *indicesy++;
912:       y[idy]   = PetscMax(y[idy],x[idx]);
913:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
914:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
915:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
916:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
917:     }
918: #else
919:   case MAX_VALUES:
920: #endif
921:   case NOT_SET_VALUES:
922:     break;
923:   default:
924:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
925:   }
926:   return(0);
927: }
928: /* ----------------------------------------------------------------------------------------------- */
929: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
930: {
931:   PetscInt i,idx;

933:   for (i=0; i<n; i++) {
934:     idx  = *indicesx++;
935:     y[0] = x[idx];
936:     y[1] = x[idx+1];
937:     y[2] = x[idx+2];
938:     y[3] = x[idx+3];
939:     y[4] = x[idx+4];
940:     y[5] = x[idx+5];
941:     y   += 6;
942:   }
943: }

945: PETSC_STATIC_INLINE PetscErrorCode UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
946: {
947:   PetscInt i,idy;

950:   switch (addv) {
951:   case INSERT_VALUES:
952:   case INSERT_ALL_VALUES:
953:     for (i=0; i<n; i++) {
954:       idy      = *indicesy++;
955:       y[idy]   = x[0];
956:       y[idy+1] = x[1];
957:       y[idy+2] = x[2];
958:       y[idy+3] = x[3];
959:       y[idy+4] = x[4];
960:       y[idy+5] = x[5];
961:       x       += 6;
962:     }
963:     break;
964:   case ADD_VALUES:
965:   case ADD_ALL_VALUES:
966:     for (i=0; i<n; i++) {
967:       idy       = *indicesy++;
968:       y[idy]   += x[0];
969:       y[idy+1] += x[1];
970:       y[idy+2] += x[2];
971:       y[idy+3] += x[3];
972:       y[idy+4] += x[4];
973:       y[idy+5] += x[5];
974:       x        += 6;
975:     }
976:     break;
977: #if !defined(PETSC_USE_COMPLEX)
978:   case MAX_VALUES:
979:     for (i=0; i<n; i++) {
980:       idy      = *indicesy++;
981:       y[idy]   = PetscMax(y[idy],x[0]);
982:       y[idy+1] = PetscMax(y[idy+1],x[1]);
983:       y[idy+2] = PetscMax(y[idy+2],x[2]);
984:       y[idy+3] = PetscMax(y[idy+3],x[3]);
985:       y[idy+4] = PetscMax(y[idy+4],x[4]);
986:       y[idy+5] = PetscMax(y[idy+5],x[5]);
987:       x       += 6;
988:     }
989: #else
990:   case MAX_VALUES:
991: #endif
992:   case NOT_SET_VALUES:
993:     break;
994:   default:
995:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
996:   }
997:   return(0);
998: }

1000: PETSC_STATIC_INLINE PetscErrorCode Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1001: {
1002:   PetscInt i,idx,idy;

1005:   switch (addv) {
1006:   case INSERT_VALUES:
1007:   case INSERT_ALL_VALUES:
1008:     for (i=0; i<n; i++) {
1009:       idx      = *indicesx++;
1010:       idy      = *indicesy++;
1011:       y[idy]   = x[idx];
1012:       y[idy+1] = x[idx+1];
1013:       y[idy+2] = x[idx+2];
1014:       y[idy+3] = x[idx+3];
1015:       y[idy+4] = x[idx+4];
1016:       y[idy+5] = x[idx+5];
1017:     }
1018:     break;
1019:   case ADD_VALUES:
1020:   case ADD_ALL_VALUES:
1021:     for (i=0; i<n; i++) {
1022:       idx       = *indicesx++;
1023:       idy       = *indicesy++;
1024:       y[idy]   += x[idx];
1025:       y[idy+1] += x[idx+1];
1026:       y[idy+2] += x[idx+2];
1027:       y[idy+3] += x[idx+3];
1028:       y[idy+4] += x[idx+4];
1029:       y[idy+5] += x[idx+5];
1030:     }
1031:     break;
1032: #if !defined(PETSC_USE_COMPLEX)
1033:   case MAX_VALUES:
1034:     for (i=0; i<n; i++) {
1035:       idx      = *indicesx++;
1036:       idy      = *indicesy++;
1037:       y[idy]   = PetscMax(y[idy],x[idx]);
1038:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1039:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1040:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1041:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1042:       y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1043:     }
1044: #else
1045:   case MAX_VALUES:
1046: #endif
1047:   case NOT_SET_VALUES:
1048:     break;
1049:   default:
1050:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1051:   }
1052:   return(0);
1053: }
1054: /* ----------------------------------------------------------------------------------------------- */
1055: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1056: {
1057:   PetscInt i,idx;

1059:   for (i=0; i<n; i++) {
1060:     idx  = *indicesx++;
1061:     y[0] = x[idx];
1062:     y[1] = x[idx+1];
1063:     y[2] = x[idx+2];
1064:     y[3] = x[idx+3];
1065:     y[4] = x[idx+4];
1066:     y[5] = x[idx+5];
1067:     y[6] = x[idx+6];
1068:     y   += 7;
1069:   }
1070: }

1072: PETSC_STATIC_INLINE PetscErrorCode UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1073: {
1074:   PetscInt i,idy;

1077:   switch (addv) {
1078:   case INSERT_VALUES:
1079:   case INSERT_ALL_VALUES:
1080:     for (i=0; i<n; i++) {
1081:       idy      = *indicesy++;
1082:       y[idy]   = x[0];
1083:       y[idy+1] = x[1];
1084:       y[idy+2] = x[2];
1085:       y[idy+3] = x[3];
1086:       y[idy+4] = x[4];
1087:       y[idy+5] = x[5];
1088:       y[idy+6] = x[6];
1089:       x       += 7;
1090:     }
1091:     break;
1092:   case ADD_VALUES:
1093:   case ADD_ALL_VALUES:
1094:     for (i=0; i<n; i++) {
1095:       idy       = *indicesy++;
1096:       y[idy]   += x[0];
1097:       y[idy+1] += x[1];
1098:       y[idy+2] += x[2];
1099:       y[idy+3] += x[3];
1100:       y[idy+4] += x[4];
1101:       y[idy+5] += x[5];
1102:       y[idy+6] += x[6];
1103:       x        += 7;
1104:     }
1105:     break;
1106: #if !defined(PETSC_USE_COMPLEX)
1107:   case MAX_VALUES:
1108:     for (i=0; i<n; i++) {
1109:       idy      = *indicesy++;
1110:       y[idy]   = PetscMax(y[idy],x[0]);
1111:       y[idy+1] = PetscMax(y[idy+1],x[1]);
1112:       y[idy+2] = PetscMax(y[idy+2],x[2]);
1113:       y[idy+3] = PetscMax(y[idy+3],x[3]);
1114:       y[idy+4] = PetscMax(y[idy+4],x[4]);
1115:       y[idy+5] = PetscMax(y[idy+5],x[5]);
1116:       y[idy+6] = PetscMax(y[idy+6],x[6]);
1117:       x       += 7;
1118:     }
1119: #else
1120:   case MAX_VALUES:
1121: #endif
1122:   case NOT_SET_VALUES:
1123:     break;
1124:   default:
1125:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1126:   }
1127:   return(0);
1128: }

1130: PETSC_STATIC_INLINE PetscErrorCode Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1131: {
1132:   PetscInt i,idx,idy;

1135:   switch (addv) {
1136:   case INSERT_VALUES:
1137:   case INSERT_ALL_VALUES:
1138:     for (i=0; i<n; i++) {
1139:       idx      = *indicesx++;
1140:       idy      = *indicesy++;
1141:       y[idy]   = x[idx];
1142:       y[idy+1] = x[idx+1];
1143:       y[idy+2] = x[idx+2];
1144:       y[idy+3] = x[idx+3];
1145:       y[idy+4] = x[idx+4];
1146:       y[idy+5] = x[idx+5];
1147:       y[idy+6] = x[idx+6];
1148:     }
1149:     break;
1150:   case ADD_VALUES:
1151:   case ADD_ALL_VALUES:
1152:     for (i=0; i<n; i++) {
1153:       idx       = *indicesx++;
1154:       idy       = *indicesy++;
1155:       y[idy]   += x[idx];
1156:       y[idy+1] += x[idx+1];
1157:       y[idy+2] += x[idx+2];
1158:       y[idy+3] += x[idx+3];
1159:       y[idy+4] += x[idx+4];
1160:       y[idy+5] += x[idx+5];
1161:       y[idy+6] += x[idx+6];
1162:     }
1163:     break;
1164: #if !defined(PETSC_USE_COMPLEX)
1165:   case MAX_VALUES:
1166:     for (i=0; i<n; i++) {
1167:       idx      = *indicesx++;
1168:       idy      = *indicesy++;
1169:       y[idy]   = PetscMax(y[idy],x[idx]);
1170:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1171:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1172:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1173:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1174:       y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1175:       y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1176:     }
1177: #else
1178:   case MAX_VALUES:
1179: #endif
1180:   case NOT_SET_VALUES:
1181:     break;
1182:   default:
1183:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1184:   }
1185:   return(0);
1186: }
1187: /* ----------------------------------------------------------------------------------------------- */
1188: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1189: {
1190:   PetscInt i,idx;

1192:   for (i=0; i<n; i++) {
1193:     idx  = *indicesx++;
1194:     y[0] = x[idx];
1195:     y[1] = x[idx+1];
1196:     y[2] = x[idx+2];
1197:     y[3] = x[idx+3];
1198:     y[4] = x[idx+4];
1199:     y[5] = x[idx+5];
1200:     y[6] = x[idx+6];
1201:     y[7] = x[idx+7];
1202:     y   += 8;
1203:   }
1204: }

1206: PETSC_STATIC_INLINE PetscErrorCode UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1207: {
1208:   PetscInt i,idy;

1211:   switch (addv) {
1212:   case INSERT_VALUES:
1213:   case INSERT_ALL_VALUES:
1214:     for (i=0; i<n; i++) {
1215:       idy      = *indicesy++;
1216:       y[idy]   = x[0];
1217:       y[idy+1] = x[1];
1218:       y[idy+2] = x[2];
1219:       y[idy+3] = x[3];
1220:       y[idy+4] = x[4];
1221:       y[idy+5] = x[5];
1222:       y[idy+6] = x[6];
1223:       y[idy+7] = x[7];
1224:       x       += 8;
1225:     }
1226:     break;
1227:   case ADD_VALUES:
1228:   case ADD_ALL_VALUES:
1229:     for (i=0; i<n; i++) {
1230:       idy       = *indicesy++;
1231:       y[idy]   += x[0];
1232:       y[idy+1] += x[1];
1233:       y[idy+2] += x[2];
1234:       y[idy+3] += x[3];
1235:       y[idy+4] += x[4];
1236:       y[idy+5] += x[5];
1237:       y[idy+6] += x[6];
1238:       y[idy+7] += x[7];
1239:       x        += 8;
1240:     }
1241:     break;
1242: #if !defined(PETSC_USE_COMPLEX)
1243:   case MAX_VALUES:
1244:     for (i=0; i<n; i++) {
1245:       idy      = *indicesy++;
1246:       y[idy]   = PetscMax(y[idy],x[0]);
1247:       y[idy+1] = PetscMax(y[idy+1],x[1]);
1248:       y[idy+2] = PetscMax(y[idy+2],x[2]);
1249:       y[idy+3] = PetscMax(y[idy+3],x[3]);
1250:       y[idy+4] = PetscMax(y[idy+4],x[4]);
1251:       y[idy+5] = PetscMax(y[idy+5],x[5]);
1252:       y[idy+6] = PetscMax(y[idy+6],x[6]);
1253:       y[idy+7] = PetscMax(y[idy+7],x[7]);
1254:       x       += 8;
1255:     }
1256: #else
1257:   case MAX_VALUES:
1258: #endif
1259:   case NOT_SET_VALUES:
1260:     break;
1261:   default:
1262:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1263:   }
1264:   return(0);
1265: }

1267: PETSC_STATIC_INLINE PetscErrorCode Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1268: {
1269:   PetscInt i,idx,idy;

1272:   switch (addv) {
1273:   case INSERT_VALUES:
1274:   case INSERT_ALL_VALUES:
1275:     for (i=0; i<n; i++) {
1276:       idx      = *indicesx++;
1277:       idy      = *indicesy++;
1278:       y[idy]   = x[idx];
1279:       y[idy+1] = x[idx+1];
1280:       y[idy+2] = x[idx+2];
1281:       y[idy+3] = x[idx+3];
1282:       y[idy+4] = x[idx+4];
1283:       y[idy+5] = x[idx+5];
1284:       y[idy+6] = x[idx+6];
1285:       y[idy+7] = x[idx+7];
1286:     }
1287:     break;
1288:   case ADD_VALUES:
1289:   case ADD_ALL_VALUES:
1290:     for (i=0; i<n; i++) {
1291:       idx       = *indicesx++;
1292:       idy       = *indicesy++;
1293:       y[idy]   += x[idx];
1294:       y[idy+1] += x[idx+1];
1295:       y[idy+2] += x[idx+2];
1296:       y[idy+3] += x[idx+3];
1297:       y[idy+4] += x[idx+4];
1298:       y[idy+5] += x[idx+5];
1299:       y[idy+6] += x[idx+6];
1300:       y[idy+7] += x[idx+7];
1301:     }
1302:     break;
1303: #if !defined(PETSC_USE_COMPLEX)
1304:   case MAX_VALUES:
1305:     for (i=0; i<n; i++) {
1306:       idx      = *indicesx++;
1307:       idy      = *indicesy++;
1308:       y[idy]   = PetscMax(y[idy],x[idx]);
1309:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1310:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1311:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1312:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1313:       y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1314:       y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1315:       y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1316:     }
1317: #else
1318:   case MAX_VALUES:
1319: #endif
1320:   case NOT_SET_VALUES:
1321:     break;
1322:   default:
1323:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1324:   }
1325:   return(0);
1326: }

1328: PETSC_STATIC_INLINE void Pack_9(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1329: {
1330:   PetscInt i,idx;

1332:   for (i=0; i<n; i++) {
1333:     idx   = *indicesx++;
1334:     y[0]  = x[idx];
1335:     y[1]  = x[idx+1];
1336:     y[2]  = x[idx+2];
1337:     y[3]  = x[idx+3];
1338:     y[4]  = x[idx+4];
1339:     y[5]  = x[idx+5];
1340:     y[6]  = x[idx+6];
1341:     y[7]  = x[idx+7];
1342:     y[8]  = x[idx+8];
1343:     y    += 9;
1344:   }
1345: }

1347: PETSC_STATIC_INLINE PetscErrorCode UnPack_9(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1348: {
1349:   PetscInt i,idy;

1352:   switch (addv) {
1353:   case INSERT_VALUES:
1354:   case INSERT_ALL_VALUES:
1355:     for (i=0; i<n; i++) {
1356:       idy       = *indicesy++;
1357:       y[idy]    = x[0];
1358:       y[idy+1]  = x[1];
1359:       y[idy+2]  = x[2];
1360:       y[idy+3]  = x[3];
1361:       y[idy+4]  = x[4];
1362:       y[idy+5]  = x[5];
1363:       y[idy+6]  = x[6];
1364:       y[idy+7]  = x[7];
1365:       y[idy+8]  = x[8];
1366:       x        += 9;
1367:     }
1368:     break;
1369:   case ADD_VALUES:
1370:   case ADD_ALL_VALUES:
1371:     for (i=0; i<n; i++) {
1372:       idy        = *indicesy++;
1373:       y[idy]    += x[0];
1374:       y[idy+1]  += x[1];
1375:       y[idy+2]  += x[2];
1376:       y[idy+3]  += x[3];
1377:       y[idy+4]  += x[4];
1378:       y[idy+5]  += x[5];
1379:       y[idy+6]  += x[6];
1380:       y[idy+7]  += x[7];
1381:       y[idy+8]  += x[8];
1382:       x         += 9;
1383:     }
1384:     break;
1385: #if !defined(PETSC_USE_COMPLEX)
1386:   case MAX_VALUES:
1387:     for (i=0; i<n; i++) {
1388:       idy       = *indicesy++;
1389:       y[idy]    = PetscMax(y[idy],x[0]);
1390:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1391:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1392:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1393:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1394:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1395:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1396:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1397:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1398:       x        += 9;
1399:     }
1400: #else
1401:   case MAX_VALUES:
1402: #endif
1403:   case NOT_SET_VALUES:
1404:     break;
1405:   default:
1406:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1407:   }
1408:   return(0);
1409: }

1411: PETSC_STATIC_INLINE PetscErrorCode Scatter_9(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1412: {
1413:   PetscInt i,idx,idy;

1416:   switch (addv) {
1417:   case INSERT_VALUES:
1418:   case INSERT_ALL_VALUES:
1419:     for (i=0; i<n; i++) {
1420:       idx       = *indicesx++;
1421:       idy       = *indicesy++;
1422:       y[idy]    = x[idx];
1423:       y[idy+1]  = x[idx+1];
1424:       y[idy+2]  = x[idx+2];
1425:       y[idy+3]  = x[idx+3];
1426:       y[idy+4]  = x[idx+4];
1427:       y[idy+5]  = x[idx+5];
1428:       y[idy+6]  = x[idx+6];
1429:       y[idy+7]  = x[idx+7];
1430:       y[idy+8]  = x[idx+8];
1431:     }
1432:     break;
1433:   case ADD_VALUES:
1434:   case ADD_ALL_VALUES:
1435:     for (i=0; i<n; i++) {
1436:       idx        = *indicesx++;
1437:       idy        = *indicesy++;
1438:       y[idy]    += x[idx];
1439:       y[idy+1]  += x[idx+1];
1440:       y[idy+2]  += x[idx+2];
1441:       y[idy+3]  += x[idx+3];
1442:       y[idy+4]  += x[idx+4];
1443:       y[idy+5]  += x[idx+5];
1444:       y[idy+6]  += x[idx+6];
1445:       y[idy+7]  += x[idx+7];
1446:       y[idy+8]  += x[idx+8];
1447:     }
1448:     break;
1449: #if !defined(PETSC_USE_COMPLEX)
1450:   case MAX_VALUES:
1451:     for (i=0; i<n; i++) {
1452:       idx       = *indicesx++;
1453:       idy       = *indicesy++;
1454:       y[idy]    = PetscMax(y[idy],x[idx]);
1455:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1456:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1457:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1458:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1459:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1460:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1461:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1462:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1463:     }
1464: #else
1465:   case MAX_VALUES:
1466: #endif
1467:   case NOT_SET_VALUES:
1468:     break;
1469:   default:
1470:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1471:   }
1472:   return(0);
1473: }

1475: PETSC_STATIC_INLINE void Pack_10(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1476: {
1477:   PetscInt i,idx;

1479:   for (i=0; i<n; i++) {
1480:     idx   = *indicesx++;
1481:     y[0]  = x[idx];
1482:     y[1]  = x[idx+1];
1483:     y[2]  = x[idx+2];
1484:     y[3]  = x[idx+3];
1485:     y[4]  = x[idx+4];
1486:     y[5]  = x[idx+5];
1487:     y[6]  = x[idx+6];
1488:     y[7]  = x[idx+7];
1489:     y[8]  = x[idx+8];
1490:     y[9]  = x[idx+9];
1491:     y    += 10;
1492:   }
1493: }

1495: PETSC_STATIC_INLINE PetscErrorCode UnPack_10(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1496: {
1497:   PetscInt i,idy;

1500:   switch (addv) {
1501:   case INSERT_VALUES:
1502:   case INSERT_ALL_VALUES:
1503:     for (i=0; i<n; i++) {
1504:       idy       = *indicesy++;
1505:       y[idy]    = x[0];
1506:       y[idy+1]  = x[1];
1507:       y[idy+2]  = x[2];
1508:       y[idy+3]  = x[3];
1509:       y[idy+4]  = x[4];
1510:       y[idy+5]  = x[5];
1511:       y[idy+6]  = x[6];
1512:       y[idy+7]  = x[7];
1513:       y[idy+8]  = x[8];
1514:       y[idy+9]  = x[9];
1515:       x        += 10;
1516:     }
1517:     break;
1518:   case ADD_VALUES:
1519:   case ADD_ALL_VALUES:
1520:     for (i=0; i<n; i++) {
1521:       idy        = *indicesy++;
1522:       y[idy]    += x[0];
1523:       y[idy+1]  += x[1];
1524:       y[idy+2]  += x[2];
1525:       y[idy+3]  += x[3];
1526:       y[idy+4]  += x[4];
1527:       y[idy+5]  += x[5];
1528:       y[idy+6]  += x[6];
1529:       y[idy+7]  += x[7];
1530:       y[idy+8]  += x[8];
1531:       y[idy+9]  += x[9];
1532:       x         += 10;
1533:     }
1534:     break;
1535: #if !defined(PETSC_USE_COMPLEX)
1536:   case MAX_VALUES:
1537:     for (i=0; i<n; i++) {
1538:       idy       = *indicesy++;
1539:       y[idy]    = PetscMax(y[idy],x[0]);
1540:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1541:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1542:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1543:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1544:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1545:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1546:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1547:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1548:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1549:       x        += 10;
1550:     }
1551: #else
1552:   case MAX_VALUES:
1553: #endif
1554:   case NOT_SET_VALUES:
1555:     break;
1556:   default:
1557:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1558:   }
1559:   return(0);
1560: }

1562: PETSC_STATIC_INLINE PetscErrorCode Scatter_10(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1563: {
1564:   PetscInt i,idx,idy;

1567:   switch (addv) {
1568:   case INSERT_VALUES:
1569:   case INSERT_ALL_VALUES:
1570:     for (i=0; i<n; i++) {
1571:       idx       = *indicesx++;
1572:       idy       = *indicesy++;
1573:       y[idy]    = x[idx];
1574:       y[idy+1]  = x[idx+1];
1575:       y[idy+2]  = x[idx+2];
1576:       y[idy+3]  = x[idx+3];
1577:       y[idy+4]  = x[idx+4];
1578:       y[idy+5]  = x[idx+5];
1579:       y[idy+6]  = x[idx+6];
1580:       y[idy+7]  = x[idx+7];
1581:       y[idy+8]  = x[idx+8];
1582:       y[idy+9]  = x[idx+9];
1583:     }
1584:     break;
1585:   case ADD_VALUES:
1586:   case ADD_ALL_VALUES:
1587:     for (i=0; i<n; i++) {
1588:       idx        = *indicesx++;
1589:       idy        = *indicesy++;
1590:       y[idy]    += x[idx];
1591:       y[idy+1]  += x[idx+1];
1592:       y[idy+2]  += x[idx+2];
1593:       y[idy+3]  += x[idx+3];
1594:       y[idy+4]  += x[idx+4];
1595:       y[idy+5]  += x[idx+5];
1596:       y[idy+6]  += x[idx+6];
1597:       y[idy+7]  += x[idx+7];
1598:       y[idy+8]  += x[idx+8];
1599:       y[idy+9]  += x[idx+9];
1600:     }
1601:     break;
1602: #if !defined(PETSC_USE_COMPLEX)
1603:   case MAX_VALUES:
1604:     for (i=0; i<n; i++) {
1605:       idx       = *indicesx++;
1606:       idy       = *indicesy++;
1607:       y[idy]    = PetscMax(y[idy],x[idx]);
1608:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1609:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1610:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1611:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1612:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1613:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1614:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1615:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1616:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1617:     }
1618: #else
1619:   case MAX_VALUES:
1620: #endif
1621:   case NOT_SET_VALUES:
1622:     break;
1623:   default:
1624:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1625:   }
1626:   return(0);
1627: }

1629: PETSC_STATIC_INLINE void Pack_11(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1630: {
1631:   PetscInt i,idx;

1633:   for (i=0; i<n; i++) {
1634:     idx   = *indicesx++;
1635:     y[0]  = x[idx];
1636:     y[1]  = x[idx+1];
1637:     y[2]  = x[idx+2];
1638:     y[3]  = x[idx+3];
1639:     y[4]  = x[idx+4];
1640:     y[5]  = x[idx+5];
1641:     y[6]  = x[idx+6];
1642:     y[7]  = x[idx+7];
1643:     y[8]  = x[idx+8];
1644:     y[9]  = x[idx+9];
1645:     y[10] = x[idx+10];
1646:     y    += 11;
1647:   }
1648: }

1650: PETSC_STATIC_INLINE PetscErrorCode UnPack_11(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1651: {
1652:   PetscInt i,idy;

1655:   switch (addv) {
1656:   case INSERT_VALUES:
1657:   case INSERT_ALL_VALUES:
1658:     for (i=0; i<n; i++) {
1659:       idy       = *indicesy++;
1660:       y[idy]    = x[0];
1661:       y[idy+1]  = x[1];
1662:       y[idy+2]  = x[2];
1663:       y[idy+3]  = x[3];
1664:       y[idy+4]  = x[4];
1665:       y[idy+5]  = x[5];
1666:       y[idy+6]  = x[6];
1667:       y[idy+7]  = x[7];
1668:       y[idy+8]  = x[8];
1669:       y[idy+9]  = x[9];
1670:       y[idy+10] = x[10];
1671:       x        += 11;
1672:     }
1673:     break;
1674:   case ADD_VALUES:
1675:   case ADD_ALL_VALUES:
1676:     for (i=0; i<n; i++) {
1677:       idy        = *indicesy++;
1678:       y[idy]    += x[0];
1679:       y[idy+1]  += x[1];
1680:       y[idy+2]  += x[2];
1681:       y[idy+3]  += x[3];
1682:       y[idy+4]  += x[4];
1683:       y[idy+5]  += x[5];
1684:       y[idy+6]  += x[6];
1685:       y[idy+7]  += x[7];
1686:       y[idy+8]  += x[8];
1687:       y[idy+9]  += x[9];
1688:       y[idy+10] += x[10];
1689:       x         += 11;
1690:     }
1691:     break;
1692: #if !defined(PETSC_USE_COMPLEX)
1693:   case MAX_VALUES:
1694:     for (i=0; i<n; i++) {
1695:       idy       = *indicesy++;
1696:       y[idy]    = PetscMax(y[idy],x[0]);
1697:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1698:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1699:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1700:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1701:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1702:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1703:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1704:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1705:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1706:       y[idy+10] = PetscMax(y[idy+10],x[10]);
1707:       x        += 11;
1708:     }
1709: #else
1710:   case MAX_VALUES:
1711: #endif
1712:   case NOT_SET_VALUES:
1713:     break;
1714:   default:
1715:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1716:   }
1717:   return(0);
1718: }

1720: PETSC_STATIC_INLINE PetscErrorCode Scatter_11(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1721: {
1722:   PetscInt i,idx,idy;

1725:   switch (addv) {
1726:   case INSERT_VALUES:
1727:   case INSERT_ALL_VALUES:
1728:     for (i=0; i<n; i++) {
1729:       idx       = *indicesx++;
1730:       idy       = *indicesy++;
1731:       y[idy]    = x[idx];
1732:       y[idy+1]  = x[idx+1];
1733:       y[idy+2]  = x[idx+2];
1734:       y[idy+3]  = x[idx+3];
1735:       y[idy+4]  = x[idx+4];
1736:       y[idy+5]  = x[idx+5];
1737:       y[idy+6]  = x[idx+6];
1738:       y[idy+7]  = x[idx+7];
1739:       y[idy+8]  = x[idx+8];
1740:       y[idy+9]  = x[idx+9];
1741:       y[idy+10] = x[idx+10];
1742:     }
1743:     break;
1744:   case ADD_VALUES:
1745:   case ADD_ALL_VALUES:
1746:     for (i=0; i<n; i++) {
1747:       idx        = *indicesx++;
1748:       idy        = *indicesy++;
1749:       y[idy]    += x[idx];
1750:       y[idy+1]  += x[idx+1];
1751:       y[idy+2]  += x[idx+2];
1752:       y[idy+3]  += x[idx+3];
1753:       y[idy+4]  += x[idx+4];
1754:       y[idy+5]  += x[idx+5];
1755:       y[idy+6]  += x[idx+6];
1756:       y[idy+7]  += x[idx+7];
1757:       y[idy+8]  += x[idx+8];
1758:       y[idy+9]  += x[idx+9];
1759:       y[idy+10] += x[idx+10];
1760:     }
1761:     break;
1762: #if !defined(PETSC_USE_COMPLEX)
1763:   case MAX_VALUES:
1764:     for (i=0; i<n; i++) {
1765:       idx       = *indicesx++;
1766:       idy       = *indicesy++;
1767:       y[idy]    = PetscMax(y[idy],x[idx]);
1768:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1769:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1770:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1771:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1772:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1773:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1774:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1775:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1776:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1777:       y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1778:     }
1779: #else
1780:   case MAX_VALUES:
1781: #endif
1782:   case NOT_SET_VALUES:
1783:     break;
1784:   default:
1785:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1786:   }
1787:   return(0);
1788: }

1790: /* ----------------------------------------------------------------------------------------------- */
1791: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1792: {
1793:   PetscInt i,idx;

1795:   for (i=0; i<n; i++) {
1796:     idx   = *indicesx++;
1797:     y[0]  = x[idx];
1798:     y[1]  = x[idx+1];
1799:     y[2]  = x[idx+2];
1800:     y[3]  = x[idx+3];
1801:     y[4]  = x[idx+4];
1802:     y[5]  = x[idx+5];
1803:     y[6]  = x[idx+6];
1804:     y[7]  = x[idx+7];
1805:     y[8]  = x[idx+8];
1806:     y[9]  = x[idx+9];
1807:     y[10] = x[idx+10];
1808:     y[11] = x[idx+11];
1809:     y    += 12;
1810:   }
1811: }

1813: PETSC_STATIC_INLINE PetscErrorCode UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1814: {
1815:   PetscInt i,idy;

1818:   switch (addv) {
1819:   case INSERT_VALUES:
1820:   case INSERT_ALL_VALUES:
1821:     for (i=0; i<n; i++) {
1822:       idy       = *indicesy++;
1823:       y[idy]    = x[0];
1824:       y[idy+1]  = x[1];
1825:       y[idy+2]  = x[2];
1826:       y[idy+3]  = x[3];
1827:       y[idy+4]  = x[4];
1828:       y[idy+5]  = x[5];
1829:       y[idy+6]  = x[6];
1830:       y[idy+7]  = x[7];
1831:       y[idy+8]  = x[8];
1832:       y[idy+9]  = x[9];
1833:       y[idy+10] = x[10];
1834:       y[idy+11] = x[11];
1835:       x        += 12;
1836:     }
1837:     break;
1838:   case ADD_VALUES:
1839:   case ADD_ALL_VALUES:
1840:     for (i=0; i<n; i++) {
1841:       idy        = *indicesy++;
1842:       y[idy]    += x[0];
1843:       y[idy+1]  += x[1];
1844:       y[idy+2]  += x[2];
1845:       y[idy+3]  += x[3];
1846:       y[idy+4]  += x[4];
1847:       y[idy+5]  += x[5];
1848:       y[idy+6]  += x[6];
1849:       y[idy+7]  += x[7];
1850:       y[idy+8]  += x[8];
1851:       y[idy+9]  += x[9];
1852:       y[idy+10] += x[10];
1853:       y[idy+11] += x[11];
1854:       x         += 12;
1855:     }
1856:     break;
1857: #if !defined(PETSC_USE_COMPLEX)
1858:   case MAX_VALUES:
1859:     for (i=0; i<n; i++) {
1860:       idy       = *indicesy++;
1861:       y[idy]    = PetscMax(y[idy],x[0]);
1862:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1863:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1864:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1865:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1866:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1867:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1868:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1869:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1870:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1871:       y[idy+10] = PetscMax(y[idy+10],x[10]);
1872:       y[idy+11] = PetscMax(y[idy+11],x[11]);
1873:       x        += 12;
1874:     }
1875: #else
1876:   case MAX_VALUES:
1877: #endif
1878:   case NOT_SET_VALUES:
1879:     break;
1880:   default:
1881:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1882:   }
1883:   return(0);
1884: }

1886: PETSC_STATIC_INLINE PetscErrorCode Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1887: {
1888:   PetscInt i,idx,idy;

1891:   switch (addv) {
1892:   case INSERT_VALUES:
1893:   case INSERT_ALL_VALUES:
1894:     for (i=0; i<n; i++) {
1895:       idx       = *indicesx++;
1896:       idy       = *indicesy++;
1897:       y[idy]    = x[idx];
1898:       y[idy+1]  = x[idx+1];
1899:       y[idy+2]  = x[idx+2];
1900:       y[idy+3]  = x[idx+3];
1901:       y[idy+4]  = x[idx+4];
1902:       y[idy+5]  = x[idx+5];
1903:       y[idy+6]  = x[idx+6];
1904:       y[idy+7]  = x[idx+7];
1905:       y[idy+8]  = x[idx+8];
1906:       y[idy+9]  = x[idx+9];
1907:       y[idy+10] = x[idx+10];
1908:       y[idy+11] = x[idx+11];
1909:     }
1910:     break;
1911:   case ADD_VALUES:
1912:   case ADD_ALL_VALUES:
1913:     for (i=0; i<n; i++) {
1914:       idx        = *indicesx++;
1915:       idy        = *indicesy++;
1916:       y[idy]    += x[idx];
1917:       y[idy+1]  += x[idx+1];
1918:       y[idy+2]  += x[idx+2];
1919:       y[idy+3]  += x[idx+3];
1920:       y[idy+4]  += x[idx+4];
1921:       y[idy+5]  += x[idx+5];
1922:       y[idy+6]  += x[idx+6];
1923:       y[idy+7]  += x[idx+7];
1924:       y[idy+8]  += x[idx+8];
1925:       y[idy+9]  += x[idx+9];
1926:       y[idy+10] += x[idx+10];
1927:       y[idy+11] += x[idx+11];
1928:     }
1929:     break;
1930: #if !defined(PETSC_USE_COMPLEX)
1931:   case MAX_VALUES:
1932:     for (i=0; i<n; i++) {
1933:       idx       = *indicesx++;
1934:       idy       = *indicesy++;
1935:       y[idy]    = PetscMax(y[idy],x[idx]);
1936:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1937:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1938:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1939:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1940:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1941:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1942:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1943:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1944:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1945:       y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1946:       y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
1947:     }
1948: #else
1949:   case MAX_VALUES:
1950: #endif
1951:   case NOT_SET_VALUES:
1952:     break;
1953:   default:
1954:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1955:   }
1956:   return(0);
1957: }

1959: /* ----------------------------------------------------------------------------------------------- */
1960: PETSC_STATIC_INLINE void Pack_bs(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1961: {
1962:   PetscInt       i,idx;

1964:   for (i=0; i<n; i++) {
1965:     idx   = *indicesx++;
1966:     PetscMemcpy(y,x + idx,bs*sizeof(PetscScalar));
1967:     y    += bs;
1968:   }
1969: }

1971: PETSC_STATIC_INLINE PetscErrorCode UnPack_bs(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1972: {
1973:   PetscInt i,idy,j;

1976:   switch (addv) {
1977:   case INSERT_VALUES:
1978:   case INSERT_ALL_VALUES:
1979:     for (i=0; i<n; i++) {
1980:       idy       = *indicesy++;
1981:       PetscMemcpy(y + idy,x,bs*sizeof(PetscScalar));
1982:       x        += bs;
1983:     }
1984:     break;
1985:   case ADD_VALUES:
1986:   case ADD_ALL_VALUES:
1987:     for (i=0; i<n; i++) {
1988:       idy        = *indicesy++;
1989:       for (j=0; j<bs; j++) y[idy+j] += x[j];
1990:       x         += bs;
1991:     }
1992:     break;
1993: #if !defined(PETSC_USE_COMPLEX)
1994:   case MAX_VALUES:
1995:     for (i=0; i<n; i++) {
1996:       idy = *indicesy++;
1997:       for (j=0; j<bs; j++) y[idy+j] = PetscMax(y[idy+j],x[j]);
1998:       x  += bs;
1999:     }
2000: #else
2001:   case MAX_VALUES:
2002: #endif
2003:   case NOT_SET_VALUES:
2004:     break;
2005:   default:
2006:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
2007:   }
2008:   return(0);
2009: }

2011: PETSC_STATIC_INLINE PetscErrorCode Scatter_bs(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
2012: {
2013:   PetscInt i,idx,idy,j;

2016:   switch (addv) {
2017:   case INSERT_VALUES:
2018:   case INSERT_ALL_VALUES:
2019:     for (i=0; i<n; i++) {
2020:       idx       = *indicesx++;
2021:       idy       = *indicesy++;
2022:       PetscMemcpy(y + idy, x + idx,bs*sizeof(PetscScalar));
2023:     }
2024:     break;
2025:   case ADD_VALUES:
2026:   case ADD_ALL_VALUES:
2027:     for (i=0; i<n; i++) {
2028:       idx        = *indicesx++;
2029:       idy        = *indicesy++;
2030:       for (j=0; j<bs; j++ )  y[idy+j] += x[idx+j];
2031:     }
2032:     break;
2033: #if !defined(PETSC_USE_COMPLEX)
2034:   case MAX_VALUES:
2035:     for (i=0; i<n; i++) {
2036:       idx       = *indicesx++;
2037:       idy       = *indicesy++;
2038:       for (j=0; j<bs; j++ )  y[idy+j] = PetscMax(y[idy+j],x[idx+j]);
2039:     }
2040: #else
2041:   case MAX_VALUES:
2042: #endif
2043:   case NOT_SET_VALUES:
2044:     break;
2045:   default:
2046:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
2047:   }
2048:   return(0);
2049: }

2051: /* Create the VecScatterBegin/End_P for our chosen block sizes */
2052: #define BS 1
2053: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2054: #define BS 2
2055: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2056: #define BS 3
2057: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2058: #define BS 4
2059: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2060: #define BS 5
2061: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2062: #define BS 6
2063: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2064: #define BS 7
2065: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2066: #define BS 8
2067: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2068: #define BS 9
2069: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2070: #define BS 10
2071: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2072: #define BS 11
2073: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2074: #define BS 12
2075: #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2076: #define BS bs
2077: #include <../src/vec/vscat/impls/mpi3/vpscat.h>

2079: /* ==========================================================================================*/
2080: /*              create parallel to sequential scatter context                                */

2082: extern PetscErrorCode VecScatterCreateCommon_PtoS_MPI3(VecScatter_MPI_General*,VecScatter_MPI_General*,VecScatter);

2084: /*
2085:    bs indicates how many elements there are in each block. Normally this would be 1.

2087:    contains check that PetscMPIInt can handle the sizes needed
2088: */
2089: #include <petscbt.h>
2090: PetscErrorCode VecScatterCreateLocal_PtoS_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2091: {
2092:   VecScatter_MPI_General *from,*to;
2093:   PetscMPIInt            size,rank,mrank,imdex,tag,n;
2094:   PetscInt               *source = NULL,*owners = NULL,nxr;
2095:   PetscInt               *lowner = NULL,*start = NULL,*lsharedowner = NULL,*sharedstart = NULL,lengthy,lengthx;
2096:   PetscMPIInt            *nprocs = NULL,nrecvs,nrecvshared;
2097:   PetscInt               i,j,idx = 0,nsends;
2098:   PetscMPIInt            *owner = NULL;
2099:   PetscInt               *starts = NULL,count,slen,slenshared;
2100:   PetscInt               *rvalues,*svalues,base,*values,nprocslocal,recvtotal,*rsvalues;
2101:   PetscMPIInt            *onodes1,*olengths1;
2102:   MPI_Comm               comm;
2103:   MPI_Request            *send_waits = NULL,*recv_waits = NULL;
2104:   MPI_Status             recv_status,*send_status;
2105:   PetscErrorCode         ierr;
2106:   PetscShmComm           scomm;
2107:   PetscMPIInt            jj;
2108:   MPI_Info               info;
2109:   MPI_Comm               mscomm;
2110:   MPI_Win                sharedoffsetwin;  /* Window that owns sharedspaceoffset */
2111:   PetscInt               *sharedspaceoffset;
2112:   VecScatterType         type;
2113:   PetscBool              mpi3node;
2115:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2116:   PetscObjectGetComm((PetscObject)xin,&comm);
2117:   MPI_Comm_rank(comm,&rank);
2118:   MPI_Comm_size(comm,&size);

2120:   VecScatterGetType(ctx,&type);
2121:   PetscStrcmp(type,"mpi3node",&mpi3node);

2123:   PetscShmCommGet(comm,&scomm);
2124:   PetscShmCommGetMpiShmComm(scomm,&mscomm);

2126:   MPI_Info_create(&info);
2127:   MPI_Info_set(info, "alloc_shared_noncontig", "true");

2129:   if (mpi3node) {
2130:     /* Check if (parallel) inidx has duplicate indices.
2131:      Current VecScatterEndMPI3Node() (case StoP) writes the sequential vector to the parallel vector.
2132:      Writing to the same shared location without variable locking
2133:      leads to incorrect scattering. See src/vec/vscat/examples/runex2_5 and runex3_5 */

2135:     PetscInt    *mem,**optr;
2136:     MPI_Win     swin;
2137:     PetscMPIInt msize;

2139:     MPI_Comm_size(mscomm,&msize);
2140:     MPI_Comm_rank(mscomm,&mrank);

2142:     PetscMalloc1(msize,&optr);
2143:     MPIU_Win_allocate_shared((nx+1)*sizeof(PetscInt),sizeof(PetscInt),MPI_INFO_NULL,mscomm,&mem,&swin);

2145:     /* Write local nx and inidx into mem */
2146:     mem[0] = nx;
2147:     for (i=1; i<=nx; i++) mem[i] = inidx[i-1];
2148:     MPI_Barrier(mscomm); /* sync shared memory */

2150:     if (!mrank) {
2151:       PetscBT        table;
2152:       /* sz and dsp_unit are not used. Replacing them with NULL would cause MPI_Win_shared_query() crash */
2153:       MPI_Aint       sz;
2154:       PetscMPIInt    dsp_unit;
2155:       PetscInt       N = xin->map->N;

2157:       PetscBTCreate(N,&table); /* may not be scalable */
2158:       PetscBTMemzero(N,table);

2160:       jj = 0;
2161:       while (jj<msize && !ctx->is_duplicate) {
2162:         if (jj == mrank) {jj++; continue;}
2163:         MPIU_Win_shared_query(swin,jj,&sz,&dsp_unit,&optr[jj]);
2164:         for (j=1; j<=optr[jj][0]; j++) { /* optr[jj][0] = nx */
2165:           if (PetscBTLookupSet(table,optr[jj][j])) {
2166:             ctx->is_duplicate = PETSC_TRUE; /* only mrank=0 has this info., will be checked in VecScatterEndMPI3Node() */
2167:             break;
2168:           }
2169:         }
2170:         jj++;
2171:       }
2172:       PetscBTDestroy(&table);
2173:     }

2175:     if (swin != MPI_WIN_NULL) {MPI_Win_free(&swin);}
2176:     PetscFree(optr);
2177:   }

2179:   owners = xin->map->range;
2180:   VecGetSize(yin,&lengthy);
2181:   VecGetSize(xin,&lengthx);

2183:   /*  first count number of contributors to each processor */
2184:   PetscMalloc2(size,&nprocs,nx,&owner);
2185:   PetscMemzero(nprocs,size*sizeof(PetscMPIInt));

2187:   j      = 0;
2188:   nsends = 0;
2189:   for (i=0; i<nx; i++) {
2190:     PetscIntMultError(bs,inidx[i],&idx);
2191:     if (idx < owners[j]) j = 0;
2192:     for (; j<size; j++) {
2193:       if (idx < owners[j+1]) {
2194:         if (!nprocs[j]++) nsends++;
2195:         owner[i] = j;
2196:         break;
2197:       }
2198:     }
2199:     if (j == size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"ith %D block entry %D not owned by any process, upper bound %D",i,idx,owners[size]);
2200:   }

2202:   nprocslocal  = nprocs[rank];
2203:   nprocs[rank] = 0;
2204:   if (nprocslocal) nsends--;
2205:   /* inform other processors of number of messages and max length*/
2206:   PetscGatherNumberOfMessages(comm,NULL,nprocs,&nrecvs);
2207:   PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
2208:   PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
2209:   recvtotal = 0;
2210:   for (i=0; i<nrecvs; i++) {
2211:     PetscIntSumError(recvtotal,olengths1[i],&recvtotal);
2212:   }

2214:   /* post receives:   */
2215:   PetscMalloc3(recvtotal,&rvalues,nrecvs,&source,nrecvs,&recv_waits);
2216:   count = 0;
2217:   for (i=0; i<nrecvs; i++) {
2218:     MPI_Irecv((rvalues+count),olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
2219:     count += olengths1[i];
2220:   }

2222:   /* do sends:
2223:      1) starts[i] gives the starting index in svalues for stuff going to
2224:      the ith processor
2225:   */
2226:   nxr = 0;
2227:   for (i=0; i<nx; i++) {
2228:     if (owner[i] != rank) nxr++;
2229:   }
2230:   PetscMalloc3(nxr,&svalues,nsends,&send_waits,size+1,&starts);

2232:   starts[0]  = 0;
2233:   for (i=1; i<size; i++) starts[i] = starts[i-1] + nprocs[i-1];
2234:   for (i=0; i<nx; i++) {
2235:     if (owner[i] != rank) svalues[starts[owner[i]]++] = bs*inidx[i];
2236:   }
2237:   starts[0] = 0;
2238:   for (i=1; i<size+1; i++) starts[i] = starts[i-1] + nprocs[i-1];
2239:   count = 0;
2240:   for (i=0; i<size; i++) {
2241:     if (nprocs[i]) {
2242:       MPI_Isend(svalues+starts[i],nprocs[i],MPIU_INT,i,tag,comm,send_waits+count++);
2243:     }
2244:   }

2246:   /*  wait on receives; this is everyone who we need to deliver data to */
2247:   count = nrecvs;
2248:   slen  = 0;
2249:   nrecvshared = 0;
2250:   slenshared  = 0;
2251:   while (count) {
2252:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2253:     /* unpack receives into our local space */
2254:     MPI_Get_count(&recv_status,MPIU_INT,&n);
2255:     PetscShmCommGlobalToLocal(scomm,onodes1[imdex],&jj);
2256:     if (jj > -1) {
2257:       PetscInfo3(NULL,"[%d] Sending values to shared memory partner %d,global rank %d\n",rank,jj,onodes1[imdex]);
2258:       nrecvshared++;
2259:       slenshared += n;
2260:     }
2261:     slen += n;
2262:     count--;
2263:   }

2265:   if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not expected %D",slen,recvtotal);

2267:   /* allocate entire send scatter context */
2268:   PetscNewLog(ctx,&to);
2269:   to->sharedwin = MPI_WIN_NULL;
2270:   to->n         = nrecvs-nrecvshared;

2272:   PetscMalloc1(nrecvs-nrecvshared,&to->requests);
2273:   PetscMalloc4(bs*(slen-slenshared),&to->values,slen-slenshared,&to->indices,nrecvs-nrecvshared+1,&to->starts,nrecvs-nrecvshared,&to->procs);
2274:   PetscMalloc2(PetscMax(to->n,nsends),&to->sstatus,PetscMax(to->n,nsends),&to->rstatus);

2276:   MPI_Comm_size(mscomm,&to->msize);
2277:   PetscMalloc1(slenshared,&to->sharedspaceindices);
2278:   PetscCalloc1(to->msize+1,&to->sharedspacestarts);

2280:   ctx->todata              = (void*)to;
2281:   to->starts[0]            = 0;
2282:   to->sharedspacestarts[0] = 0;

2284:   /* Allocate shared memory space for shared memory partner communication */
2285:   MPIU_Win_allocate_shared(to->msize*sizeof(PetscInt),sizeof(PetscInt),info,mscomm,&sharedspaceoffset,&sharedoffsetwin);
2286:   for (i=0; i<to->msize; i++) sharedspaceoffset[i] = -1; /* mark with -1 for later error checking */
2287:   if (nrecvs) {
2288:     /* move the data into the send scatter */
2289:     base     = owners[rank];
2290:     rsvalues = rvalues;
2291:     to->n    = 0;
2292:     for (i=0; i<nrecvs; i++) {
2293:       values = rsvalues;
2294:       PetscShmCommGlobalToLocal(scomm,(PetscMPIInt)onodes1[i],&jj);
2295:       if (jj > -1) {
2296:         to->sharedspacestarts[jj]   = to->sharedcnt;
2297:         to->sharedspacestarts[jj+1] = to->sharedcnt + olengths1[i];
2298:         for (j=0; j<olengths1[i]; j++) to->sharedspaceindices[to->sharedcnt + j] = values[j] - base;
2299:         sharedspaceoffset[jj]      = to->sharedcnt;
2300:         to->sharedcnt             += olengths1[i];
2301:         PetscInfo4(NULL,"[%d] Sending %d values to shared memory partner %d,global rank %d\n",rank,olengths1[i],jj,onodes1[i]);
2302:       } else {
2303:         to->starts[to->n+1] = to->starts[to->n] + olengths1[i];
2304:         to->procs[to->n]    = onodes1[i];
2305:         for (j=0; j<olengths1[i]; j++) to->indices[to->starts[to->n] + j] = values[j] - base;
2306:         to->n++;
2307:       }
2308:       rsvalues += olengths1[i];
2309:     }
2310:   }
2311:   PetscFree(olengths1);
2312:   PetscFree(onodes1);
2313:   PetscFree3(rvalues,source,recv_waits);

2315:   if (mpi3node) {
2316:     /* to->sharedspace is used only as a flag */
2317:     i = 0;
2318:     if (to->sharedcnt) i = 1;
2319:     MPIU_Win_allocate_shared(i*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&to->sharedspace,&to->sharedwin);
2320:   } else { /* mpi3 */
2321:     MPIU_Win_allocate_shared(bs*to->sharedcnt*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&to->sharedspace,&to->sharedwin);
2322:   }
2323:   if (to->sharedwin == MPI_WIN_NULL) SETERRQ(PETSC_COMM_SELF,100,"what the");
2324:   MPI_Info_free(&info);

2326:   /* allocate entire receive scatter context */
2327:   PetscNewLog(ctx,&from);
2328:   from->sharedwin       = MPI_WIN_NULL;
2329:   from->msize           = to->msize;
2330:   /* we don't actually know the correct value for from->n at this point so we use the upper bound */
2331:   from->n = nsends;

2333:   PetscMalloc1(nsends,&from->requests);
2334:   PetscMalloc4((ny-nprocslocal)*bs,&from->values,ny-nprocslocal,&from->indices,nsends+1,&from->starts,from->n,&from->procs);
2335:   ctx->fromdata = (void*)from;

2337:   PetscCalloc1(to->msize+1,&from->sharedspacestarts);
2338:   /* move data into receive scatter */
2339:   PetscMalloc2(size,&lowner,nsends+1,&start);
2340:   PetscMalloc2(size,&lsharedowner,to->msize+1,&sharedstart);
2341:   count = 0; from->starts[0] = start[0] = 0;
2342:   from->sharedcnt = 0;
2343:   for (i=0; i<size; i++) {
2344:     lsharedowner[i] = -1;
2345:     lowner[i]       = -1;
2346:     if (nprocs[i]) {
2347:       PetscShmCommGlobalToLocal(scomm,i,&jj);
2348:       if (jj > -1) {
2349:         from->sharedspacestarts[jj] = sharedstart[jj] = from->sharedcnt;
2350:         from->sharedspacestarts[jj+1] = sharedstart[jj+1] = from->sharedspacestarts[jj] + nprocs[i];
2351:         from->sharedcnt += nprocs[i];
2352:         lsharedowner[i] = jj;
2353:       } else {
2354:         lowner[i]            = count++;
2355:         from->procs[count-1]  = i;
2356:         from->starts[count] = start[count] = start[count-1] + nprocs[i];
2357:       }
2358:     }
2359:   }
2360:   from->n = count;

2362:   for (i=0; i<nx; i++) {
2363:     if (owner[i] != rank && lowner[owner[i]] > -1) {
2364:       from->indices[start[lowner[owner[i]]]++] = bs*inidy[i];
2365:       if (bs*inidy[i] >= lengthy) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2366:     }
2367:   }

2369:   /* copy over appropriate parts of inidy[] into from->sharedspaceindices */
2370:   if (mpi3node) {
2371:     /* in addition, copy over appropriate parts of inidx[] into 2nd part of from->sharedspaceindices */
2372:     PetscInt *sharedspaceindicesx;
2373:     PetscMalloc1(2*from->sharedcnt,&from->sharedspaceindices);
2374:     sharedspaceindicesx = from->sharedspaceindices + from->sharedcnt;
2375:     for (i=0; i<nx; i++) {
2376:       if (owner[i] != rank && lsharedowner[owner[i]] > -1) {
2377:         if (sharedstart[lsharedowner[owner[i]]] > from->sharedcnt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"won");

2379:         from->sharedspaceindices[sharedstart[lsharedowner[owner[i]]]] = bs*inidy[i];
2380:         sharedspaceindicesx[sharedstart[lsharedowner[owner[i]]]] = bs*inidx[i]-owners[owner[i]]; /* local inidx */
2381:         sharedstart[lsharedowner[owner[i]]]++;
2382:       }
2383:     }
2384:   } else { /* mpi3 */
2385:     PetscMalloc1(from->sharedcnt,&from->sharedspaceindices);
2386:     for (i=0; i<nx; i++) {
2387:       if (owner[i] != rank && lsharedowner[owner[i]] > -1) {
2388:         if (sharedstart[lsharedowner[owner[i]]] > from->sharedcnt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"won");
2389:         from->sharedspaceindices[sharedstart[lsharedowner[owner[i]]]++] = bs*inidy[i];
2390:       }
2391:     }
2392:   }

2394:   PetscFree2(lowner,start);
2395:   PetscFree2(lsharedowner,sharedstart);
2396:   PetscFree2(nprocs,owner);

2398:   /* wait on sends */
2399:   if (nsends) {
2400:     PetscMalloc1(nsends,&send_status);
2401:     MPI_Waitall(nsends,send_waits,send_status);
2402:     PetscFree(send_status);
2403:   }
2404:   PetscFree3(svalues,send_waits,starts);

2406:   if (nprocslocal) {
2407:     PetscInt nt = from->local.n = to->local.n = nprocslocal;
2408:     /* we have a scatter to ourselves */
2409:     PetscMalloc1(nt,&to->local.vslots);
2410:     PetscMalloc1(nt,&from->local.vslots);
2411:     nt   = 0;
2412:     for (i=0; i<nx; i++) {
2413:       idx = bs*inidx[i];
2414:       if (idx >= owners[rank] && idx < owners[rank+1]) {
2415:         to->local.vslots[nt]     = idx - owners[rank];
2416:         from->local.vslots[nt++] = bs*inidy[i];
2417:         if (bs*inidy[i] >= lengthy) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2418:       }
2419:     }
2420:     PetscLogObjectMemory((PetscObject)ctx,2*nt*sizeof(PetscInt));
2421:   } else {
2422:     from->local.n      = 0;
2423:     from->local.vslots = 0;
2424:     to->local.n        = 0;
2425:     to->local.vslots   = 0;
2426:   }

2428:   /* Get the shared memory address for all processes we will be copying data from */
2429:   PetscCalloc1(to->msize,&from->sharedspaces);
2430:   PetscCalloc1(to->msize,&from->sharedspacesoffset);
2431:   MPI_Comm_rank(mscomm,&mrank);
2432:   for (jj=0; jj<to->msize; jj++) {
2433:     MPI_Aint    isize;
2434:     PetscMPIInt disp_unit;
2435:     PetscInt    *ptr;
2436:     MPIU_Win_shared_query(to->sharedwin,jj,&isize,&disp_unit,&from->sharedspaces[jj]);
2437:     MPIU_Win_shared_query(sharedoffsetwin,jj,&isize,&disp_unit,&ptr);
2438:     from->sharedspacesoffset[jj] = ptr[mrank];
2439:   }
2440:   MPI_Win_free(&sharedoffsetwin);

2442:   if (mpi3node && !from->sharedspace) {
2443:     /* comput from->notdone to be used by VecScatterEndMPI3Node() */
2444:     PetscInt notdone = 0;
2445:     for (i=0; i<from->msize; i++) {
2446:       if (from->sharedspacesoffset && from->sharedspacesoffset[i] > -1) {
2447:         notdone++;
2448:       }
2449:     }
2450:     from->notdone = notdone;
2451:   }

2453:   from->local.nonmatching_computed = PETSC_FALSE;
2454:   from->local.n_nonmatching        = 0;
2455:   from->local.slots_nonmatching    = 0;
2456:   to->local.nonmatching_computed   = PETSC_FALSE;
2457:   to->local.n_nonmatching          = 0;
2458:   to->local.slots_nonmatching      = 0;

2460:   from->format = VEC_SCATTER_MPI_GENERAL;
2461:   to->format   = VEC_SCATTER_MPI_GENERAL;
2462:   from->bs     = bs;
2463:   to->bs       = bs;

2465:   VecScatterCreateCommon_PtoS_MPI3(from,to,ctx);
2466:   return(0);
2467: }

2469: /*
2470:    bs indicates how many elements there are in each block. Normally this would be 1.
2471: */
2472: PetscErrorCode VecScatterCreateCommon_PtoS_MPI3(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
2473: {
2474:   MPI_Comm       comm;
2475:   PetscMPIInt    tag  = ((PetscObject)ctx)->tag, tagr;
2476:   PetscInt       bs   = to->bs;
2477:   PetscMPIInt    size;
2478:   PetscInt       i, n;
2480:   VecScatterType type;
2481:   PetscBool      mpi3node;

2484:   PetscObjectGetComm((PetscObject)ctx,&comm);
2485:   PetscObjectGetNewTag((PetscObject)ctx,&tagr);
2486:   ctx->ops->destroy = VecScatterDestroy_PtoP_MPI3;

2488:   MPI_Comm_size(comm,&size);
2489:   /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
2490:   to->contiq = PETSC_FALSE;
2491:   n = from->starts[from->n];
2492:   from->contiq = PETSC_TRUE;
2493:   for (i=1; i<n; i++) {
2494:     if (from->indices[i] != from->indices[i-1] + bs) {
2495:       from->contiq = PETSC_FALSE;
2496:       break;
2497:     }
2498:   }

2500:   ctx->ops->copy      = VecScatterCopy_PtoP_AllToAll;
2501:   {
2502:     PetscInt    *sstarts  = to->starts,  *rstarts = from->starts;
2503:     PetscMPIInt *sprocs   = to->procs,   *rprocs  = from->procs;
2504:     MPI_Request *swaits   = to->requests,*rwaits  = from->requests;
2505:     MPI_Request *rev_swaits,*rev_rwaits;
2506:     PetscScalar *Ssvalues = to->values, *Srvalues = from->values;

2508:     /* allocate additional wait variables for the "reverse" scatter */
2509:     PetscMalloc1(to->n,&rev_rwaits);
2510:     PetscMalloc1(from->n,&rev_swaits);
2511:     to->rev_requests   = rev_rwaits;
2512:     from->rev_requests = rev_swaits;

2514:     for (i=0; i<from->n; i++) {
2515:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
2516:     }

2518:     for (i=0; i<to->n; i++) {
2519:       MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
2520:     }
2521:     /* Register receives for scatter and reverse */
2522:     for (i=0; i<from->n; i++) {
2523:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
2524:     }
2525:     for (i=0; i<to->n; i++) {
2526:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tagr,comm,rev_rwaits+i);
2527:     }
2528:     ctx->ops->copy = VecScatterCopy_PtoP_X;
2529:   }
2530:   PetscInfo1(ctx,"Using blocksize %D scatter\n",bs);

2532: #if defined(PETSC_USE_DEBUG)
2533:   MPIU_Allreduce(&bs,&i,1,MPIU_INT,MPI_MIN,PetscObjectComm((PetscObject)ctx));
2534:   MPIU_Allreduce(&bs,&n,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)ctx));
2535:   if (bs!=i || bs!=n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Blocks size %D != %D or %D",bs,i,n);
2536: #endif

2538:   VecScatterGetType(ctx,&type);
2539:   PetscStrcmp(type,"mpi3node",&mpi3node);

2541:   if (mpi3node) {
2542:     switch (bs) {
2543:     case 12:
2544:       ctx->ops->begin = VecScatterBeginMPI3Node_12;
2545:       ctx->ops->end   = VecScatterEndMPI3Node_12;
2546:       break;
2547:     case 11:
2548:       ctx->ops->begin = VecScatterBeginMPI3Node_11;
2549:       ctx->ops->end   = VecScatterEndMPI3Node_11;
2550:       break;
2551:     case 10:
2552:       ctx->ops->begin = VecScatterBeginMPI3Node_10;
2553:       ctx->ops->end   = VecScatterEndMPI3Node_10;
2554:       break;
2555:     case 9:
2556:       ctx->ops->begin = VecScatterBeginMPI3Node_9;
2557:       ctx->ops->end   = VecScatterEndMPI3Node_9;
2558:       break;
2559:     case 8:
2560:       ctx->ops->begin = VecScatterBeginMPI3Node_8;
2561:       ctx->ops->end   = VecScatterEndMPI3Node_8;
2562:       break;
2563:     case 7:
2564:       ctx->ops->begin = VecScatterBeginMPI3Node_7;
2565:       ctx->ops->end   = VecScatterEndMPI3Node_7;
2566:       break;
2567:     case 6:
2568:       ctx->ops->begin = VecScatterBeginMPI3Node_6;
2569:       ctx->ops->end   = VecScatterEndMPI3Node_6;
2570:       break;
2571:     case 5:
2572:       ctx->ops->begin = VecScatterBeginMPI3Node_5;
2573:       ctx->ops->end   = VecScatterEndMPI3Node_5;
2574:       break;
2575:     case 4:
2576:       ctx->ops->begin = VecScatterBeginMPI3Node_4;
2577:       ctx->ops->end   = VecScatterEndMPI3Node_4;
2578:       break;
2579:     case 3:
2580:       ctx->ops->begin = VecScatterBeginMPI3Node_3;
2581:       ctx->ops->end   = VecScatterEndMPI3Node_3;
2582:       break;
2583:     case 2:
2584:       ctx->ops->begin = VecScatterBeginMPI3Node_2;
2585:       ctx->ops->end   = VecScatterEndMPI3Node_2;
2586:       break;
2587:     case 1:
2588:       ctx->ops->begin = VecScatterBeginMPI3Node_1;
2589:       ctx->ops->end   = VecScatterEndMPI3Node_1;
2590:       break;
2591:     default:
2592:       ctx->ops->begin = VecScatterBegin_bs;
2593:       ctx->ops->end   = VecScatterEnd_bs;
2594:     }
2595:   } else { /* !mpi3node */

2597:     switch (bs) {
2598:     case 12:
2599:       ctx->ops->begin = VecScatterBegin_12;
2600:       ctx->ops->end   = VecScatterEnd_12;
2601:       break;
2602:     case 11:
2603:       ctx->ops->begin = VecScatterBegin_11;
2604:       ctx->ops->end   = VecScatterEnd_11;
2605:       break;
2606:     case 10:
2607:       ctx->ops->begin = VecScatterBegin_10;
2608:       ctx->ops->end   = VecScatterEnd_10;
2609:       break;
2610:     case 9:
2611:       ctx->ops->begin = VecScatterBegin_9;
2612:       ctx->ops->end   = VecScatterEnd_9;
2613:       break;
2614:     case 8:
2615:       ctx->ops->begin = VecScatterBegin_8;
2616:       ctx->ops->end   = VecScatterEnd_8;
2617:       break;
2618:     case 7:
2619:       ctx->ops->begin = VecScatterBegin_7;
2620:       ctx->ops->end   = VecScatterEnd_7;
2621:       break;
2622:     case 6:
2623:       ctx->ops->begin = VecScatterBegin_6;
2624:       ctx->ops->end   = VecScatterEnd_6;
2625:       break;
2626:     case 5:
2627:       ctx->ops->begin = VecScatterBegin_5;
2628:       ctx->ops->end   = VecScatterEnd_5;
2629:       break;
2630:     case 4:
2631:       ctx->ops->begin = VecScatterBegin_4;
2632:       ctx->ops->end   = VecScatterEnd_4;
2633:       break;
2634:     case 3:
2635:       ctx->ops->begin = VecScatterBegin_3;
2636:       ctx->ops->end   = VecScatterEnd_3;
2637:       break;
2638:     case 2:
2639:       ctx->ops->begin = VecScatterBegin_2;
2640:       ctx->ops->end   = VecScatterEnd_2;
2641:       break;
2642:     case 1:
2643:       if (mpi3node) {
2644:         ctx->ops->begin = VecScatterBeginMPI3Node_1;
2645:         ctx->ops->end   = VecScatterEndMPI3Node_1;
2646:       } else {
2647:         ctx->ops->begin = VecScatterBegin_1;
2648:         ctx->ops->end   = VecScatterEnd_1;
2649:       }
2650:       break;
2651:     default:
2652:       ctx->ops->begin = VecScatterBegin_bs;
2653:       ctx->ops->end   = VecScatterEnd_bs;
2654:     }
2655:   }

2657:   ctx->ops->view = VecScatterView_MPI;

2659:   /* try to optimize PtoP vecscatter with memcpy's */
2660:   VecScatterMemcpyPlanCreate_PtoP(to,from);
2661:   return(0);
2662: }

2664: /* ------------------------------------------------------------------------------------*/
2665: /*
2666:          Scatter from local Seq vectors to a parallel vector.
2667:          Reverses the order of the arguments, calls VecScatterCreateLocal_PtoS() then
2668:          reverses the result.
2669: */
2670: PetscErrorCode VecScatterCreateLocal_StoP_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2671: {
2672:   PetscErrorCode         ierr;
2673:   MPI_Request            *waits;
2674:   VecScatter_MPI_General *to,*from;

2677:   VecScatterCreateLocal_PtoS_MPI3(ny,inidy,nx,inidx,yin,xin,bs,ctx);
2678:   to            = (VecScatter_MPI_General*)ctx->fromdata;
2679:   from          = (VecScatter_MPI_General*)ctx->todata;
2680:   ctx->todata   = (void*)to;
2681:   ctx->fromdata = (void*)from;
2682:   /* these two are special, they are ALWAYS stored in to struct */
2683:   to->sstatus   = from->sstatus;
2684:   to->rstatus   = from->rstatus;

2686:   from->sstatus = 0;
2687:   from->rstatus = 0;
2688:   waits              = from->rev_requests;
2689:   from->rev_requests = from->requests;
2690:   from->requests     = waits;
2691:   waits              = to->rev_requests;
2692:   to->rev_requests   = to->requests;
2693:   to->requests       = waits;
2694:   return(0);
2695: }

2697: /* ---------------------------------------------------------------------------------*/
2698: PetscErrorCode VecScatterCreateLocal_PtoP_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2699: {
2701:   PetscMPIInt    size,rank,tag,imdex,n;
2702:   PetscInt       *owners = xin->map->range;
2703:   PetscMPIInt    *nprocs = NULL;
2704:   PetscInt       i,j,idx,nsends,*local_inidx = NULL,*local_inidy = NULL;
2705:   PetscMPIInt    *owner   = NULL;
2706:   PetscInt       *starts  = NULL,count,slen;
2707:   PetscInt       *rvalues = NULL,*svalues = NULL,base,*values = NULL,*rsvalues,recvtotal,lastidx;
2708:   PetscMPIInt    *onodes1,*olengths1,nrecvs;
2709:   MPI_Comm       comm;
2710:   MPI_Request    *send_waits = NULL,*recv_waits = NULL;
2711:   MPI_Status     recv_status,*send_status = NULL;
2712:   PetscBool      duplicate = PETSC_FALSE;
2713: #if defined(PETSC_USE_DEBUG)
2714:   PetscBool      found = PETSC_FALSE;
2715: #endif

2718:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2719:   PetscObjectGetComm((PetscObject)xin,&comm);
2720:   MPI_Comm_size(comm,&size);
2721:   MPI_Comm_rank(comm,&rank);
2722:   if (size == 1) {
2723:     VecScatterCreateLocal_StoP_MPI3(nx,inidx,ny,inidy,xin,yin,bs,ctx);
2724:     return(0);
2725:   }

2727:   /*
2728:      Each processor ships off its inidx[j] and inidy[j] to the appropriate processor
2729:      They then call the StoPScatterCreate()
2730:   */
2731:   /*  first count number of contributors to each processor */
2732:   PetscMalloc3(size,&nprocs,nx,&owner,(size+1),&starts);
2733:   PetscMemzero(nprocs,size*sizeof(PetscMPIInt));

2735:   lastidx = -1;
2736:   j       = 0;
2737:   for (i=0; i<nx; i++) {
2738:     /* if indices are NOT locally sorted, need to start search at the beginning */
2739:     if (lastidx > (idx = bs*inidx[i])) j = 0;
2740:     lastidx = idx;
2741:     for (; j<size; j++) {
2742:       if (idx >= owners[j] && idx < owners[j+1]) {
2743:         nprocs[j]++;
2744:         owner[i] = j;
2745: #if defined(PETSC_USE_DEBUG)
2746:         found = PETSC_TRUE;
2747: #endif
2748:         break;
2749:       }
2750:     }
2751: #if defined(PETSC_USE_DEBUG)
2752:     if (!found) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
2753:     found = PETSC_FALSE;
2754: #endif
2755:   }
2756:   nsends = 0;
2757:   for (i=0; i<size; i++) nsends += (nprocs[i] > 0);

2759:   /* inform other processors of number of messages and max length*/
2760:   PetscGatherNumberOfMessages(comm,NULL,nprocs,&nrecvs);
2761:   PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
2762:   PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
2763:   recvtotal = 0;
2764:   for (i=0; i<nrecvs; i++) {
2765:     PetscIntSumError(recvtotal,olengths1[i],&recvtotal);
2766:   }

2768:   /* post receives:   */
2769:   PetscMalloc5(2*recvtotal,&rvalues,2*nx,&svalues,nrecvs,&recv_waits,nsends,&send_waits,nsends,&send_status);

2771:   count = 0;
2772:   for (i=0; i<nrecvs; i++) {
2773:     MPI_Irecv((rvalues+2*count),2*olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
2774:     PetscIntSumError(count,olengths1[i],&count);
2775:   }
2776:   PetscFree(onodes1);

2778:   /* do sends:
2779:       1) starts[i] gives the starting index in svalues for stuff going to
2780:          the ith processor
2781:   */
2782:   starts[0]= 0;
2783:   for (i=1; i<size; i++) starts[i] = starts[i-1] + nprocs[i-1];
2784:   for (i=0; i<nx; i++) {
2785:     svalues[2*starts[owner[i]]]       = bs*inidx[i];
2786:     svalues[1 + 2*starts[owner[i]]++] = bs*inidy[i];
2787:   }

2789:   starts[0] = 0;
2790:   for (i=1; i<size+1; i++) starts[i] = starts[i-1] + nprocs[i-1];
2791:   count = 0;
2792:   for (i=0; i<size; i++) {
2793:     if (nprocs[i]) {
2794:       MPI_Isend(svalues+2*starts[i],2*nprocs[i],MPIU_INT,i,tag,comm,send_waits+count);
2795:       count++;
2796:     }
2797:   }
2798:   PetscFree3(nprocs,owner,starts);

2800:   /*  wait on receives */
2801:   count = nrecvs;
2802:   slen  = 0;
2803:   while (count) {
2804:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2805:     /* unpack receives into our local space */
2806:     MPI_Get_count(&recv_status,MPIU_INT,&n);
2807:     slen += n/2;
2808:     count--;
2809:   }
2810:   if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not as expected %D",slen,recvtotal);

2812:   PetscMalloc2(slen,&local_inidx,slen,&local_inidy);
2813:   base     = owners[rank];
2814:   count    = 0;
2815:   rsvalues = rvalues;
2816:   for (i=0; i<nrecvs; i++) {
2817:     values    = rsvalues;
2818:     rsvalues += 2*olengths1[i];
2819:     for (j=0; j<olengths1[i]; j++) {
2820:       local_inidx[count]   = values[2*j] - base;
2821:       local_inidy[count++] = values[2*j+1];
2822:     }
2823:   }
2824:   PetscFree(olengths1);

2826:   /* wait on sends */
2827:   if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
2828:   PetscFree5(rvalues,svalues,recv_waits,send_waits,send_status);

2830:   /*
2831:      should sort and remove duplicates from local_inidx,local_inidy
2832:   */
2833: #if defined(do_it_slow)
2834:   /* sort on the from index */
2835:   PetscSortIntWithArray(slen,local_inidx,local_inidy);
2836:   start = 0;
2837:   while (start < slen) {
2838:     count = start+1;
2839:     last  = local_inidx[start];
2840:     while (count < slen && last == local_inidx[count]) count++;
2841:     if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2842:       /* sort on to index */
2843:       PetscSortInt(count-start,local_inidy+start);
2844:     }
2845:     /* remove duplicates; not most efficient way, but probably good enough */
2846:     i = start;
2847:     while (i < count-1) {
2848:       if (local_inidy[i] != local_inidy[i+1]) i++;
2849:       else { /* found a duplicate */
2850:         duplicate = PETSC_TRUE;
2851:         for (j=i; j<slen-1; j++) {
2852:           local_inidx[j] = local_inidx[j+1];
2853:           local_inidy[j] = local_inidy[j+1];
2854:         }
2855:         slen--;
2856:         count--;
2857:       }
2858:     }
2859:     start = count;
2860:   }
2861: #endif
2862:   if (duplicate) {
2863:     PetscInfo(ctx,"Duplicate from to indices passed in VecScatterCreate(), they are ignored\n");
2864:   }
2865:   VecScatterCreateLocal_StoP_MPI3(slen,local_inidx,slen,local_inidy,xin,yin,bs,ctx);
2866:   PetscFree2(local_inidx,local_inidy);
2867:   return(0);
2868: }

2870: PetscErrorCode VecScatterSetUp_MPI3(VecScatter ctx)
2871: {

2875:   VecScatterSetUp_vectype_private(ctx,VecScatterCreateLocal_PtoS_MPI3,VecScatterCreateLocal_StoP_MPI3,VecScatterCreateLocal_PtoP_MPI3);
2876:   return(0);
2877: }

2879: PetscErrorCode VecScatterCreate_MPI3(VecScatter ctx)
2880: {
2881:   PetscErrorCode    ierr;

2884:   ctx->ops->setup = VecScatterSetUp_MPI3;
2885:   PetscObjectChangeTypeName((PetscObject)ctx,VECSCATTERMPI3);
2886:   PetscInfo(ctx,"Using MPI3 for vector scatter\n");
2887:   return(0);
2888: }

2890: PetscErrorCode VecScatterCreate_MPI3Node(VecScatter ctx)
2891: {
2892:   PetscErrorCode    ierr;

2895:   ctx->ops->setup = VecScatterSetUp_MPI3;
2896:   PetscObjectChangeTypeName((PetscObject)ctx,VECSCATTERMPI3NODE);
2897:   PetscInfo(ctx,"Using MPI3NODE for vector scatter\n");
2898:   return(0);
2899: }