Actual source code: aomemscalable.c


/*
    The memory scalable AO application ordering routines. These store the
    orderings on each processor for that processor's range of values
*/

#include <../src/vec/is/ao/aoimpl.h>

typedef struct {
  PetscInt    *app_loc;    /* app_loc[i] is the partner for the ith local PETSc slot */
  PetscInt    *petsc_loc;  /* petsc_loc[j] is the partner for the jth local app slot */
  PetscLayout map;         /* determines the local sizes of ao */
} AO_MemoryScalable;
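
/*
   Illustrative sketch (not part of the library source): how the two local arrays are laid out
   for a hypothetical AO with N = 4 entries split over two processes, where the application
   ordering is {3,1,0,2} and the PETSc ordering is the natural one {0,1,2,3}.

     rank 0 owns PETSc/app indices 0..1:  app_loc   = {3,1}   PETSc 0 -> app 3,   PETSc 1 -> app 1
                                          petsc_loc = {2,1}   app   0 -> PETSc 2, app   1 -> PETSc 1
     rank 1 owns PETSc/app indices 2..3:  app_loc   = {0,2}   PETSc 2 -> app 0,   PETSc 3 -> app 2
                                          petsc_loc = {3,0}   app   2 -> PETSc 3, app   3 -> PETSc 0

   Each process stores only the entries for its own range described by map, which is what makes
   this type memory scalable compared with AOBASIC, which replicates the whole ordering on every
   process.
*/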

/*
       All processors ship the data to process 0 to be printed; note that this is not scalable because
       process 0 allocates space for all the ordering entries across all the processes
*/
PetscErrorCode AOView_MemoryScalable(AO ao,PetscViewer viewer)
{
  PetscMPIInt       rank,size;
  AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
  PetscBool         iascii;
  PetscMPIInt       tag_app,tag_petsc;
  PetscLayout       map = aomems->map;
  PetscInt          *app,*app_loc,*petsc,*petsc_loc,len,i,j;
  MPI_Status        status;

  PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);

  MPI_Comm_rank(PetscObjectComm((PetscObject)ao),&rank);
  MPI_Comm_size(PetscObjectComm((PetscObject)ao),&size);

  PetscObjectGetNewTag((PetscObject)ao,&tag_app);
  PetscObjectGetNewTag((PetscObject)ao,&tag_petsc);

  if (rank == 0) {
    PetscViewerASCIIPrintf(viewer,"Number of elements in ordering %" PetscInt_FMT "\n",ao->N);
    PetscViewerASCIIPrintf(viewer,  "PETSc->App  App->PETSc\n");

    PetscMalloc2(map->N,&app,map->N,&petsc);
    len  = map->n;
    /* print local AO */
    PetscViewerASCIIPrintf(viewer,"Process [%d]\n",rank);
    for (i=0; i<len; i++) {
      PetscViewerASCIIPrintf(viewer,"%3" PetscInt_FMT "  %3" PetscInt_FMT "    %3" PetscInt_FMT "  %3" PetscInt_FMT "\n",i,aomems->app_loc[i],i,aomems->petsc_loc[i]);
    }

    /* recv and print off-processor's AO */
    for (i=1; i<size; i++) {
      len       = map->range[i+1] - map->range[i];
      app_loc   = app  + map->range[i];
      petsc_loc = petsc+ map->range[i];
      MPI_Recv(app_loc,(PetscMPIInt)len,MPIU_INT,i,tag_app,PetscObjectComm((PetscObject)ao),&status);
      MPI_Recv(petsc_loc,(PetscMPIInt)len,MPIU_INT,i,tag_petsc,PetscObjectComm((PetscObject)ao),&status);
      PetscViewerASCIIPrintf(viewer,"Process [%" PetscInt_FMT "]\n",i);
      for (j=0; j<len; j++) {
        PetscViewerASCIIPrintf(viewer,"%3" PetscInt_FMT "  %3" PetscInt_FMT "    %3" PetscInt_FMT "  %3" PetscInt_FMT "\n",map->range[i]+j,app_loc[j],map->range[i]+j,petsc_loc[j]);
      }
    }
    PetscFree2(app,petsc);

  } else {
    /* send values */
    MPI_Send((void*)aomems->app_loc,map->n,MPIU_INT,0,tag_app,PetscObjectComm((PetscObject)ao));
    MPI_Send((void*)aomems->petsc_loc,map->n,MPIU_INT,0,tag_petsc,PetscObjectComm((PetscObject)ao));
  }
  PetscViewerFlush(viewer);
  return 0;
}
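
/*
   Illustrative sketch (not part of the library source): viewing the hypothetical two-process AO
   from the comment above with -ao_view (or AOView(ao,PETSC_VIEWER_STDOUT_WORLD)) prints all
   entries on process 0, roughly along these lines:

     Number of elements in ordering 4
     PETSc->App  App->PETSc
     Process [0]
       0    3      0    2
       1    1      1    1
     Process [1]
       2    0      2    3
       3    2      3    0
*/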

PetscErrorCode AODestroy_MemoryScalable(AO ao)
{
  AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;

  PetscFree2(aomems->app_loc,aomems->petsc_loc);
  PetscLayoutDestroy(&aomems->map);
  PetscFree(aomems);
  return 0;
}

/*
   Input Parameters:
+   ao - the application ordering context
.   n  - the number of integers in ia[]
.   ia - the integers; these are replaced with their mapped value
-   maploc - app_loc or petsc_loc in struct "AO_MemoryScalable"

   Output Parameter:
.   ia - the mapped integers
 */
PetscErrorCode AOMap_MemoryScalable_private(AO ao,PetscInt n,PetscInt *ia,const PetscInt *maploc)
{
  AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
  MPI_Comm          comm;
  PetscMPIInt       rank,size,tag1,tag2;
  PetscInt          *owner,*start,*sizes,nsends,nreceives;
  PetscInt          nmax,count,*sindices,*rindices,i,j,idx,lastidx,*sindices2,*rindices2;
  const PetscInt    *owners = aomems->map->range;
  MPI_Request       *send_waits,*recv_waits,*send_waits2,*recv_waits2;
  MPI_Status        recv_status;
  PetscMPIInt       nindices,source,widx;
  PetscInt          *rbuf,*sbuf;
  MPI_Status        *send_status,*send_status2;

  PetscObjectGetComm((PetscObject)ao,&comm);
  MPI_Comm_rank(comm,&rank);
  MPI_Comm_size(comm,&size);

  /*  first count number of contributors to each processor */
  PetscMalloc1(size,&start);
  PetscCalloc2(2*size,&sizes,n,&owner);

  j       = 0;
  lastidx = -1;
  for (i=0; i<n; i++) {
    if (ia[i] < 0) owner[i] = -1; /* mark negative entries (which are not to be mapped) with a special negative value */
    else if (ia[i] >= ao->N) owner[i] = -2; /* mark out of range entries with special negative value */
    else {
      /* if indices are NOT locally sorted, need to start search at the beginning */
      if (lastidx > (idx = ia[i])) j = 0;
      lastidx = idx;
      for (; j<size; j++) {
        if (idx >= owners[j] && idx < owners[j+1]) {
          sizes[2*j]++;     /* num of indices to be sent */
          sizes[2*j+1] = 1; /* send to proc[j] */
          owner[i]     = j;
          break;
        }
      }
    }
  }
  sizes[2*rank]=sizes[2*rank+1]=0; /* do not receive from self! */
  nsends        = 0;
  for (i=0; i<size; i++) nsends += sizes[2*i+1];

  /* inform other processors of number of messages and max length */
  PetscMaxSum(comm,sizes,&nmax,&nreceives);

  /* allocate arrays */
  PetscObjectGetNewTag((PetscObject)ao,&tag1);
  PetscObjectGetNewTag((PetscObject)ao,&tag2);

  PetscMalloc2(nreceives*nmax,&rindices,nreceives,&recv_waits);
  PetscMalloc2(nsends*nmax,&rindices2,nsends,&recv_waits2);

  PetscMalloc3(n,&sindices,nsends,&send_waits,nsends,&send_status);
  PetscMalloc3(n,&sindices2,nreceives,&send_waits2,nreceives,&send_status2);

  /* post 1st receives: receive others' requests
     since we don't know how long each individual message is we
     allocate the largest needed buffer for each receive. Potentially
     this is a lot of wasted space.
  */
  for (i=0,count=0; i<nreceives; i++) {
    MPI_Irecv(rindices+nmax*i,nmax,MPIU_INT,MPI_ANY_SOURCE,tag1,comm,recv_waits+count++);
  }

  /* do 1st sends:
      1) start[i] gives the starting index in sindices for stuff going to
         the ith processor
  */
  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + sizes[2*i-2];
  for (i=0; i<n; i++) {
    j = owner[i];
    if (j == -1) continue; /* do not remap negative entries in ia[] */
    else if (j == -2) { /* out of range entries get mapped to -1 */
      ia[i] = -1;
      continue;
    } else if (j != rank) {
      sindices[start[j]++]  = ia[i];
    } else { /* compute my own map */
      ia[i] = maploc[ia[i]-owners[rank]];
    }
  }

  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + sizes[2*i-2];
  for (i=0,count=0; i<size; i++) {
    if (sizes[2*i+1]) {
      /* send my request to others */
      MPI_Isend(sindices+start[i],sizes[2*i],MPIU_INT,i,tag1,comm,send_waits+count);
      /* post receive for the answer of my request */
      MPI_Irecv(sindices2+start[i],sizes[2*i],MPIU_INT,i,tag2,comm,recv_waits2+count);
      count++;
    }
  }

  /* wait on 1st sends */
  if (nsends) {
    MPI_Waitall(nsends,send_waits,send_status);
  }

  /* 1st recvs: others' requests */
  for (j=0; j< nreceives; j++) {
    MPI_Waitany(nreceives,recv_waits,&widx,&recv_status); /* widx: index of the receive that completed */
    MPI_Get_count(&recv_status,MPIU_INT,&nindices);
    rbuf   = rindices+nmax*widx; /* global index */
    source = recv_status.MPI_SOURCE;

    /* compute mapping */
    sbuf = rbuf;
    for (i=0; i<nindices; i++) sbuf[i] = maploc[rbuf[i]-owners[rank]];

    /* send mapping back to the sender */
    MPI_Isend(sbuf,nindices,MPIU_INT,source,tag2,comm,send_waits2+widx);
  }

  /* wait on 2nd sends */
  if (nreceives) {
    MPI_Waitall(nreceives,send_waits2,send_status2);
  }

  /* 2nd recvs: the answers to my requests */
  for (j=0; j< nsends; j++) {
    MPI_Waitany(nsends,recv_waits2,&widx,&recv_status);
    MPI_Get_count(&recv_status,MPIU_INT,&nindices);
    source = recv_status.MPI_SOURCE;
    /* pack output ia[] */
    rbuf  = sindices2+start[source];
    count = 0;
    for (i=0; i<n; i++) {
      if (source == owner[i]) ia[i] = rbuf[count++];
    }
  }

  /* free arrays */
  PetscFree(start);
  PetscFree2(sizes,owner);
  PetscFree2(rindices,recv_waits);
  PetscFree2(rindices2,recv_waits2);
  PetscFree3(sindices,send_waits,send_status);
  PetscFree3(sindices2,send_waits2,send_status2);
  return 0;
}
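
/*
   Illustrative sketch (not part of the library source): how the request/reply protocol above
   behaves for the hypothetical two-process AO used earlier (application ordering {3,1,0,2},
   natural PETSc ordering) when rank 0 calls AOApplicationToPetsc() with ia = {2,0}:

     - index 2 is owned by rank 1, so rank 0 sends the request {2} with tag1 and posts a
       receive for the reply with tag2;
     - index 0 is owned by rank 0 itself, so it is mapped locally: ia[1] = petsc_loc[0] = 2;
     - rank 1 receives {2}, maps it through its own petsc_loc (app 2 -> PETSc 3) and replies {3};
     - rank 0 unpacks the reply into the slots recorded in owner[], giving ia = {3,2}.

   Negative input entries are left untouched, and entries >= ao->N come back as -1.
*/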

PetscErrorCode AOPetscToApplication_MemoryScalable(AO ao,PetscInt n,PetscInt *ia)
{
  AO_MemoryScalable *aomems  = (AO_MemoryScalable*)ao->data;
  PetscInt          *app_loc = aomems->app_loc;

  AOMap_MemoryScalable_private(ao,n,ia,app_loc);
  return 0;
}

PetscErrorCode AOApplicationToPetsc_MemoryScalable(AO ao,PetscInt n,PetscInt *ia)
{
  AO_MemoryScalable *aomems    = (AO_MemoryScalable*)ao->data;
  PetscInt          *petsc_loc = aomems->petsc_loc;

  AOMap_MemoryScalable_private(ao,n,ia,petsc_loc);
  return 0;
}

static struct _AOOps AOOps_MemoryScalable = {
  PetscDesignatedInitializer(view,AOView_MemoryScalable),
  PetscDesignatedInitializer(destroy,AODestroy_MemoryScalable),
  PetscDesignatedInitializer(petsctoapplication,AOPetscToApplication_MemoryScalable),
  PetscDesignatedInitializer(applicationtopetsc,AOApplicationToPetsc_MemoryScalable),
};

PetscErrorCode AOCreateMemoryScalable_private(MPI_Comm comm,PetscInt napp,const PetscInt from_array[],const PetscInt to_array[],AO ao, PetscInt *aomap_loc)
{
  AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
  PetscLayout       map     = aomems->map;
  PetscInt          n_local = map->n,i,j;
  PetscMPIInt       rank,size,tag;
  PetscInt          *owner,*start,*sizes,nsends,nreceives;
  PetscInt          nmax,count,*sindices,*rindices,idx,lastidx;
  PetscInt          *owners = aomems->map->range;
  MPI_Request       *send_waits,*recv_waits;
  MPI_Status        recv_status;
  PetscMPIInt       nindices,widx;
  PetscInt          *rbuf;
  PetscInt          n=napp,ip,ia;
  MPI_Status        *send_status;

  PetscArrayzero(aomap_loc,n_local);

  MPI_Comm_rank(comm,&rank);
  MPI_Comm_size(comm,&size);

  /*  first count number of contributors (of from_array[]) to each processor */
  PetscCalloc1(2*size,&sizes);
  PetscMalloc1(n,&owner);

  j       = 0;
  lastidx = -1;
  for (i=0; i<n; i++) {
    /* if indices are NOT locally sorted, need to start search at the beginning */
    if (lastidx > (idx = from_array[i])) j = 0;
    lastidx = idx;
    for (; j<size; j++) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        sizes[2*j]  += 2; /* num of indices to be sent - in pairs (ip,ia) */
        sizes[2*j+1] = 1; /* send to proc[j] */
        owner[i]     = j;
        break;
      }
    }
  }
  sizes[2*rank]=sizes[2*rank+1]=0; /* do not receive from self! */
  nsends        = 0;
  for (i=0; i<size; i++) nsends += sizes[2*i+1];

  /* inform other processors of number of messages and max length */
  PetscMaxSum(comm,sizes,&nmax,&nreceives);

  /* allocate arrays */
  PetscObjectGetNewTag((PetscObject)ao,&tag);
  PetscMalloc2(nreceives*nmax,&rindices,nreceives,&recv_waits);
  PetscMalloc3(2*n,&sindices,nsends,&send_waits,nsends,&send_status);
  PetscMalloc1(size,&start);

  /* post receives: */
  for (i=0; i<nreceives; i++) {
    MPI_Irecv(rindices+nmax*i,nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
  }

  /* do sends:
      1) start[i] gives the starting index in sindices for stuff going to
         the ith processor
  */
  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + sizes[2*i-2];
  for (i=0; i<n; i++) {
    j = owner[i];
    if (j != rank) {
      ip                   = from_array[i];
      ia                   = to_array[i];
      sindices[start[j]++] = ip;
      sindices[start[j]++] = ia;
    } else { /* compute my own map */
      ip            = from_array[i] - owners[rank];
      ia            = to_array[i];
      aomap_loc[ip] = ia;
    }
  }

  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + sizes[2*i-2];
  for (i=0,count=0; i<size; i++) {
    if (sizes[2*i+1]) {
      MPI_Isend(sindices+start[i],sizes[2*i],MPIU_INT,i,tag,comm,send_waits+count);
      count++;
    }
  }

  /* wait on sends */
  if (nsends) {
    MPI_Waitall(nsends,send_waits,send_status);
  }

  /* recvs */
  count=0;
  for (j= nreceives; j>0; j--) {
    MPI_Waitany(nreceives,recv_waits,&widx,&recv_status);
    MPI_Get_count(&recv_status,MPIU_INT,&nindices);
    rbuf = rindices+nmax*widx; /* global index */

    /* compute local mapping */
    for (i=0; i<nindices; i+=2) { /* pack aomap_loc */
      ip            = rbuf[i] - owners[rank]; /* local index */
      ia            = rbuf[i+1];
      aomap_loc[ip] = ia;
    }
    count++;
  }

  PetscFree(start);
  PetscFree3(sindices,send_waits,send_status);
  PetscFree2(rindices,recv_waits);
  PetscFree(owner);
  PetscFree(sizes);
  return 0;
}
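
/*
   Illustrative sketch (not part of the library source): building app_loc with
   AOCreateMemoryScalable_private() for the hypothetical two-process AO above. Rank 0 calls it
   with from_array = petsc = {0,1} and to_array = myapp = {3,1}; rank 1 with {2,3} and {0,2}.
   Every (from,to) pair whose "from" index lies in another process's range is shipped there as
   the pair (ip,ia); locally owned pairs are written straight into aomap_loc. In the first call
   all "from" indices happen to be locally owned, so no messages are needed and the result is
   app_loc = {3,1} on rank 0 and app_loc = {0,2} on rank 1. The second call, with the roles of
   petsc and myapp swapped, fills petsc_loc the same way but does require communication: for
   example the pair (3,0) contributed by rank 0 is sent to rank 1, which owns application
   index 3, and ends up as petsc_loc[1] = 0 there.
*/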

PETSC_EXTERN PetscErrorCode AOCreate_MemoryScalable(AO ao)
{
  IS                isapp=ao->isapp,ispetsc=ao->ispetsc;
  const PetscInt    *mypetsc,*myapp;
  PetscInt          napp,n_local,N,i,start,*petsc,*lens,*disp;
  MPI_Comm          comm;
  AO_MemoryScalable *aomems;
  PetscLayout       map;
  PetscMPIInt       size,rank;

  /* create special struct aomems */
  PetscNewLog(ao,&aomems);
  ao->data = (void*) aomems;
  PetscMemcpy(ao->ops,&AOOps_MemoryScalable,sizeof(struct _AOOps));
  PetscObjectChangeTypeName((PetscObject)ao,AOMEMORYSCALABLE);

  /* transmit all local lengths of isapp to all processors */
  PetscObjectGetComm((PetscObject)isapp,&comm);
  MPI_Comm_size(comm, &size);
  MPI_Comm_rank(comm, &rank);
  PetscMalloc2(size,&lens,size,&disp);
  ISGetLocalSize(isapp,&napp);
  MPI_Allgather(&napp, 1, MPIU_INT, lens, 1, MPIU_INT, comm);

  N = 0;
  for (i = 0; i < size; i++) {
    disp[i] = N;
    N      += lens[i];
  }

  /* If ispetsc is NULL then use the "natural" numbering */
  if (napp) {
    if (!ispetsc) {
      start = disp[rank];
      PetscMalloc1(napp+1, &petsc);
      for (i=0; i<napp; i++) petsc[i] = start + i;
    } else {
      ISGetIndices(ispetsc,&mypetsc);
      petsc = (PetscInt*)mypetsc;
    }
  } else {
    petsc = NULL;
  }

  /* create a map with global size N - used to determine the local sizes of ao - shall we use local napp instead of N? */
  PetscLayoutCreate(comm,&map);
  map->bs = 1;
  map->N  = N;
  PetscLayoutSetUp(map);

  ao->N       = N;
  ao->n       = map->n;
  aomems->map = map;

  /* create distributed indices app_loc: petsc->app and petsc_loc: app->petsc */
  n_local = map->n;
  PetscCalloc2(n_local, &aomems->app_loc,n_local,&aomems->petsc_loc);
  PetscLogObjectMemory((PetscObject)ao,2*n_local*sizeof(PetscInt));
  ISGetIndices(isapp,&myapp);

  AOCreateMemoryScalable_private(comm,napp,petsc,myapp,ao,aomems->app_loc);
  AOCreateMemoryScalable_private(comm,napp,myapp,petsc,ao,aomems->petsc_loc);

  ISRestoreIndices(isapp,&myapp);
  if (napp) {
    if (ispetsc) {
      ISRestoreIndices(ispetsc,&mypetsc);
    } else {
      PetscFree(petsc);
    }
  }
  PetscFree2(lens,disp);
  return 0;
}

/*@C
   AOCreateMemoryScalable - Creates a memory scalable application ordering using two integer arrays.

   Collective

   Input Parameters:
+  comm - MPI communicator that is to share AO
.  napp - size of integer arrays
.  myapp - integer array that defines an ordering
-  mypetsc - integer array that defines another ordering (may be NULL to
             indicate the natural ordering, that is 0,1,2,3,...)

   Output Parameter:
.  aoout - the new application ordering

   Level: beginner

   Notes:
   The arrays myapp and mypetsc must contain all the integers 0 to napp-1 with no duplicates; that is, there cannot be any "holes"
   in the indices. Use AOCreateMapping() or AOCreateMappingIS() if you wish to have "holes" in the indices.
   Compared with AOCreateBasic(), this routine trades memory for message communication.

.seealso: AOCreateMemoryScalableIS(), AODestroy(), AOPetscToApplication(), AOApplicationToPetsc()
@*/
PetscErrorCode AOCreateMemoryScalable(MPI_Comm comm,PetscInt napp,const PetscInt myapp[],const PetscInt mypetsc[],AO *aoout)
{
  IS             isapp,ispetsc;
  const PetscInt *app=myapp,*petsc=mypetsc;

  ISCreateGeneral(comm,napp,app,PETSC_USE_POINTER,&isapp);
  if (mypetsc) {
    ISCreateGeneral(comm,napp,petsc,PETSC_USE_POINTER,&ispetsc);
  } else {
    ispetsc = NULL;
  }
  AOCreateMemoryScalableIS(isapp,ispetsc,aoout);
  ISDestroy(&isapp);
  if (mypetsc) {
    ISDestroy(&ispetsc);
  }
  return 0;
}
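
/*
   Illustrative usage sketch (not part of the library source; assumes PETSc has been initialized
   and the program runs on two MPI processes). The values reuse the hypothetical ordering from
   the comments above: rank 0 contributes application indices {3,1}, rank 1 contributes {0,2},
   and the PETSc ordering is the natural one (mypetsc = NULL).

     PetscInt    napp = 2,myapp[2],ia[2];
     PetscMPIInt rank;
     AO          ao;

     MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
     if (rank == 0) { myapp[0] = 3; myapp[1] = 1; }
     else           { myapp[0] = 0; myapp[1] = 2; }
     AOCreateMemoryScalable(PETSC_COMM_WORLD,napp,myapp,NULL,&ao);
     ia[0] = myapp[0]; ia[1] = myapp[1];
     AOApplicationToPetsc(ao,napp,ia);
     AODestroy(&ao);

   After AOApplicationToPetsc(), ia is {0,1} on rank 0 and {2,3} on rank 1: each rank's
   application indices map back to the natural PETSc positions at which they were contributed.
*/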

/*@C
   AOCreateMemoryScalableIS - Creates a memory scalable application ordering using two index sets.

   Collective on IS

   Input Parameters:
+  isapp - index set that defines an ordering
-  ispetsc - index set that defines another ordering (may be NULL to use the
             natural ordering)

   Output Parameter:
.  aoout - the new application ordering

   Level: beginner

   Notes:
   The index sets isapp and ispetsc must contain all the integers 0 to napp-1 (where napp is the length of the index sets) with no duplicates;
   that is, there cannot be any "holes".
   Compared with AOCreateBasicIS(), this routine trades memory for message communication.

.seealso: AOCreateMemoryScalable(), AODestroy()
@*/
PetscErrorCode AOCreateMemoryScalableIS(IS isapp,IS ispetsc,AO *aoout)
{
  MPI_Comm       comm;
  AO             ao;

  PetscObjectGetComm((PetscObject)isapp,&comm);
  AOCreate(comm,&ao);
  AOSetIS(ao,isapp,ispetsc);
  AOSetType(ao,AOMEMORYSCALABLE);
  AOViewFromOptions(ao,NULL,"-ao_view");
  *aoout = ao;
  return 0;
}
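
/*
   Illustrative usage sketch (not part of the library source; assumes PETSc has been initialized
   and the program runs on two MPI processes). The same ordering as above can be built from an
   index set instead of a raw array; ISCreateGeneral() with PETSC_COPY_VALUES keeps the example
   self-contained, and passing NULL for ispetsc again selects the natural PETSc ordering:

     IS          isapp;
     AO          ao;
     PetscInt    myapp[2];
     PetscMPIInt rank;

     MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
     if (rank == 0) { myapp[0] = 3; myapp[1] = 1; }
     else           { myapp[0] = 0; myapp[1] = 2; }
     ISCreateGeneral(PETSC_COMM_WORLD,2,myapp,PETSC_COPY_VALUES,&isapp);
     AOCreateMemoryScalableIS(isapp,NULL,&ao);
     ISDestroy(&isapp);
     AODestroy(&ao);
*/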