Actual source code: aomemscalable.c

/*
   The memory scalable AO application ordering routines. These store the
   orderings on each processor for that processor's range of values.
*/
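
/*
   Design note (summarizing the code below): each process stores only the slice of the two
   orderings that corresponds to its own range of PETSc indices, as defined by a PetscLayout;
   lookups for indices owned by other processes are resolved by message passing in
   AOMap_MemoryScalable_private().
*/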

#include <../src/vec/is/ao/aoimpl.h>

typedef struct {
  PetscInt    *app_loc;   /* app_loc[i] is the partner for the ith local PETSc slot */
  PetscInt    *petsc_loc; /* petsc_loc[j] is the partner for the jth local app slot */
  PetscLayout map;        /* determines the local sizes of ao */
} AO_MemoryScalable;
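
/*
   Illustration (hypothetical layout, 2 processes, N = 4, two entries per process):
     rank 0 owns PETSc indices 0,1: app_loc[0],app_loc[1] hold the application indices
                                    paired with PETSc indices 0 and 1
     rank 1 owns PETSc indices 2,3: app_loc[0],app_loc[1] hold the application indices
                                    paired with PETSc indices 2 and 3
   petsc_loc[] is stored the same way for this process's range of application indices.
*/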

/*
   All processors ship the data to process 0 to be printed; note that this is not scalable because
   process 0 allocates space for all the ordering entries across all the processes
*/
PetscErrorCode AOView_MemoryScalable(AO ao,PetscViewer viewer)
{
  PetscMPIInt       rank,size;
  AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
  PetscBool         iascii;
  PetscMPIInt       tag_app,tag_petsc;
  PetscLayout       map = aomems->map;
  PetscInt          *app,*app_loc,*petsc,*petsc_loc,len,i,j;
  MPI_Status        status;

  PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);

  MPI_Comm_rank(PetscObjectComm((PetscObject)ao),&rank);
  MPI_Comm_size(PetscObjectComm((PetscObject)ao),&size);

  PetscObjectGetNewTag((PetscObject)ao,&tag_app);
  PetscObjectGetNewTag((PetscObject)ao,&tag_petsc);

  if (rank == 0) {
    PetscViewerASCIIPrintf(viewer,"Number of elements in ordering %" PetscInt_FMT "\n",ao->N);
    PetscViewerASCIIPrintf(viewer,"PETSc->App App->PETSc\n");

    PetscMalloc2(map->N,&app,map->N,&petsc);
    len = map->n;
    /* print the local AO */
    PetscViewerASCIIPrintf(viewer,"Process [%d]\n",rank);
    for (i=0; i<len; i++) {
      PetscViewerASCIIPrintf(viewer,"%3" PetscInt_FMT " %3" PetscInt_FMT " %3" PetscInt_FMT " %3" PetscInt_FMT "\n",i,aomems->app_loc[i],i,aomems->petsc_loc[i]);
    }

    /* receive and print the other processes' AO entries */
    for (i=1; i<size; i++) {
      len       = map->range[i+1] - map->range[i];
      app_loc   = app + map->range[i];
      petsc_loc = petsc + map->range[i];
      MPI_Recv(app_loc,(PetscMPIInt)len,MPIU_INT,i,tag_app,PetscObjectComm((PetscObject)ao),&status);
      MPI_Recv(petsc_loc,(PetscMPIInt)len,MPIU_INT,i,tag_petsc,PetscObjectComm((PetscObject)ao),&status);
      PetscViewerASCIIPrintf(viewer,"Process [%" PetscInt_FMT "]\n",i);
      for (j=0; j<len; j++) {
        PetscViewerASCIIPrintf(viewer,"%3" PetscInt_FMT " %3" PetscInt_FMT " %3" PetscInt_FMT " %3" PetscInt_FMT "\n",map->range[i]+j,app_loc[j],map->range[i]+j,petsc_loc[j]);
      }
    }
    PetscFree2(app,petsc);
  } else {
    /* send this process's values to process 0 */
    MPI_Send((void*)aomems->app_loc,map->n,MPIU_INT,0,tag_app,PetscObjectComm((PetscObject)ao));
    MPI_Send((void*)aomems->petsc_loc,map->n,MPIU_INT,0,tag_petsc,PetscObjectComm((PetscObject)ao));
  }
  PetscViewerFlush(viewer);
  return 0;
}

PetscErrorCode AODestroy_MemoryScalable(AO ao)
{
  AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;

  PetscFree2(aomems->app_loc,aomems->petsc_loc);
  PetscLayoutDestroy(&aomems->map);
  PetscFree(aomems);
  return 0;
}

/*
   Input Parameters:
+  ao - the application ordering context
.  n - the number of integers in ia[]
.  ia - the integers; these are replaced with their mapped values
-  maploc - app_loc or petsc_loc in struct "AO_MemoryScalable"

   Output Parameter:
.  ia - the mapped integers
*/
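
/*
   Communication pattern used below (a request/reply rendezvous):
     1) each process determines which process owns each entry of ia[] (via the ranges in
        aomems->map) and counts how many entries must be sent to each owner
     2) PetscMaxSum() tells every process how many request messages it will receive and
        the maximum message length, so all receive buffers can be posted up front
     3) round 1 (tag1): the requested indices are sent to their owners, which look them
        up in their local maploc[] table
     4) round 2 (tag2): the mapped values are returned to the requesting process and
        scattered back into ia[]; locally owned entries are mapped without communication
*/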
PetscErrorCode AOMap_MemoryScalable_private(AO ao,PetscInt n,PetscInt *ia,const PetscInt *maploc)
{
  AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
  MPI_Comm          comm;
  PetscMPIInt       rank,size,tag1,tag2;
  PetscInt          *owner,*start,*sizes,nsends,nreceives;
  PetscInt          nmax,count,*sindices,*rindices,i,j,idx,lastidx,*sindices2,*rindices2;
  const PetscInt    *owners = aomems->map->range;
  MPI_Request       *send_waits,*recv_waits,*send_waits2,*recv_waits2;
  MPI_Status        recv_status;
  PetscMPIInt       nindices,source,widx;
  PetscInt          *rbuf,*sbuf;
  MPI_Status        *send_status,*send_status2;

  PetscObjectGetComm((PetscObject)ao,&comm);
  MPI_Comm_rank(comm,&rank);
  MPI_Comm_size(comm,&size);

  /* first count number of contributors to each processor */
  PetscMalloc1(size,&start);
  PetscCalloc2(2*size,&sizes,n,&owner);

  j       = 0;
  lastidx = -1;
  for (i=0; i<n; i++) {
    if (ia[i] < 0) owner[i] = -1;           /* mark negative entries (which are not to be mapped) with a special negative value */
    else if (ia[i] >= ao->N) owner[i] = -2; /* mark out of range entries with a special negative value */
    else {
      /* if indices are NOT locally sorted, need to start search at the beginning */
      if (lastidx > (idx = ia[i])) j = 0;
      lastidx = idx;
      for (; j<size; j++) {
        if (idx >= owners[j] && idx < owners[j+1]) {
          sizes[2*j]++;     /* num of indices to be sent */
          sizes[2*j+1] = 1; /* send to proc[j] */
          owner[i] = j;
          break;
        }
      }
    }
  }
  sizes[2*rank] = sizes[2*rank+1] = 0; /* do not receive from self! */
  nsends = 0;
  for (i=0; i<size; i++) nsends += sizes[2*i+1];

  /* inform other processors of number of messages and max length */
  PetscMaxSum(comm,sizes,&nmax,&nreceives);

  /* allocate arrays */
  PetscObjectGetNewTag((PetscObject)ao,&tag1);
  PetscObjectGetNewTag((PetscObject)ao,&tag2);

  PetscMalloc2(nreceives*nmax,&rindices,nreceives,&recv_waits);
  PetscMalloc2(nsends*nmax,&rindices2,nsends,&recv_waits2);

  PetscMalloc3(n,&sindices,nsends,&send_waits,nsends,&send_status);
  PetscMalloc3(n,&sindices2,nreceives,&send_waits2,nreceives,&send_status2);

  /* post 1st receives: receive the other processes' requests.
     Since we do not know how long each individual message is, we
     allocate the largest needed buffer for each receive. Potentially
     this is a lot of wasted space.
  */
  for (i=0,count=0; i<nreceives; i++) {
    MPI_Irecv(rindices+nmax*i,nmax,MPIU_INT,MPI_ANY_SOURCE,tag1,comm,recv_waits+count++);
  }

  /* do 1st sends:
     start[i] gives the starting index in sindices[] for the entries going to
     the ith processor
  */
  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + sizes[2*i-2];
  for (i=0; i<n; i++) {
    j = owner[i];
    if (j == -1) continue; /* do not remap negative entries in ia[] */
    else if (j == -2) {    /* out of range entries get mapped to -1 */
      ia[i] = -1;
      continue;
    } else if (j != rank) {
      sindices[start[j]++] = ia[i];
    } else {               /* compute my own map */
      ia[i] = maploc[ia[i]-owners[rank]];
    }
  }

  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + sizes[2*i-2];
  for (i=0,count=0; i<size; i++) {
    if (sizes[2*i+1]) {
      /* send my request to others */
      MPI_Isend(sindices+start[i],sizes[2*i],MPIU_INT,i,tag1,comm,send_waits+count);
      /* post the receive for the answer to my request */
      MPI_Irecv(sindices2+start[i],sizes[2*i],MPIU_INT,i,tag2,comm,recv_waits2+count);
      count++;
    }
  }

  /* wait on 1st sends */
  if (nsends) {
    MPI_Waitall(nsends,send_waits,send_status);
  }

  /* 1st recvs: the other processes' requests */
  for (j=0; j<nreceives; j++) {
    MPI_Waitany(nreceives,recv_waits,&widx,&recv_status); /* widx: index of the handle for the operation that completed */
    MPI_Get_count(&recv_status,MPIU_INT,&nindices);
    rbuf   = rindices+nmax*widx; /* global indices */
    source = recv_status.MPI_SOURCE;

    /* compute the mapping */
    sbuf = rbuf;
    for (i=0; i<nindices; i++) sbuf[i] = maploc[rbuf[i]-owners[rank]];

    /* send the mapping back to the sender */
    MPI_Isend(sbuf,nindices,MPIU_INT,source,tag2,comm,send_waits2+widx);
  }

  /* wait on 2nd sends */
  if (nreceives) {
    MPI_Waitall(nreceives,send_waits2,send_status2);
  }

  /* 2nd recvs: the answers to my requests */
  for (j=0; j<nsends; j++) {
    MPI_Waitany(nsends,recv_waits2,&widx,&recv_status);
    MPI_Get_count(&recv_status,MPIU_INT,&nindices);
    source = recv_status.MPI_SOURCE;
    /* pack the output ia[] */
    rbuf  = sindices2+start[source];
    count = 0;
    for (i=0; i<n; i++) {
      if (source == owner[i]) ia[i] = rbuf[count++];
    }
  }

  /* free arrays */
  PetscFree(start);
  PetscFree2(sizes,owner);
  PetscFree2(rindices,recv_waits);
  PetscFree2(rindices2,recv_waits2);
  PetscFree3(sindices,send_waits,send_status);
  PetscFree3(sindices2,send_waits2,send_status2);
  return 0;
}

PetscErrorCode AOPetscToApplication_MemoryScalable(AO ao,PetscInt n,PetscInt *ia)
{
  AO_MemoryScalable *aomems  = (AO_MemoryScalable*)ao->data;
  PetscInt          *app_loc = aomems->app_loc;

  AOMap_MemoryScalable_private(ao,n,ia,app_loc);
  return 0;
}

PetscErrorCode AOApplicationToPetsc_MemoryScalable(AO ao,PetscInt n,PetscInt *ia)
{
  AO_MemoryScalable *aomems    = (AO_MemoryScalable*)ao->data;
  PetscInt          *petsc_loc = aomems->petsc_loc;

  AOMap_MemoryScalable_private(ao,n,ia,petsc_loc);
  return 0;
}
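
/*
   Both wrappers above forward to AOMap_MemoryScalable_private(): PetscToApplication maps
   through the app_loc table and ApplicationToPetsc maps through the petsc_loc table. They
   are installed as the AO methods in the ops table below.
*/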

static struct _AOOps AOOps_MemoryScalable = {
  PetscDesignatedInitializer(view,AOView_MemoryScalable),
  PetscDesignatedInitializer(destroy,AODestroy_MemoryScalable),
  PetscDesignatedInitializer(petsctoapplication,AOPetscToApplication_MemoryScalable),
  PetscDesignatedInitializer(applicationtopetsc,AOApplicationToPetsc_MemoryScalable),
};
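
/*
   Builds the local lookup table aomap_loc[] for this process's range of "from" indices;
   conceptually aomap_loc[from_array[i]-owners[rank]] = to_array[i] for every pair, where
   pairs whose "from" index is owned by another process are shipped to that owner in a
   single round of communication.
*/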

PetscErrorCode AOCreateMemoryScalable_private(MPI_Comm comm,PetscInt napp,const PetscInt from_array[],const PetscInt to_array[],AO ao,PetscInt *aomap_loc)
{
  AO_MemoryScalable *aomems  = (AO_MemoryScalable*)ao->data;
  PetscLayout       map      = aomems->map;
  PetscInt          n_local  = map->n,i,j;
  PetscMPIInt       rank,size,tag;
  PetscInt          *owner,*start,*sizes,nsends,nreceives;
  PetscInt          nmax,count,*sindices,*rindices,idx,lastidx;
  PetscInt          *owners = aomems->map->range;
  MPI_Request       *send_waits,*recv_waits;
  MPI_Status        recv_status;
  PetscMPIInt       nindices,widx;
  PetscInt          *rbuf;
  PetscInt          n = napp,ip,ia;
  MPI_Status        *send_status;

  PetscArrayzero(aomap_loc,n_local);

  MPI_Comm_rank(comm,&rank);
  MPI_Comm_size(comm,&size);

  /* first count number of contributors (of from_array[]) to each processor */
  PetscCalloc1(2*size,&sizes);
  PetscMalloc1(n,&owner);

  j       = 0;
  lastidx = -1;
  for (i=0; i<n; i++) {
    /* if indices are NOT locally sorted, need to start search at the beginning */
    if (lastidx > (idx = from_array[i])) j = 0;
    lastidx = idx;
    for (; j<size; j++) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        sizes[2*j] += 2;  /* num of indices to be sent - in pairs (ip,ia) */
        sizes[2*j+1] = 1; /* send to proc[j] */
        owner[i] = j;
        break;
      }
    }
  }
  sizes[2*rank] = sizes[2*rank+1] = 0; /* do not receive from self! */
  nsends = 0;
  for (i=0; i<size; i++) nsends += sizes[2*i+1];

  /* inform other processors of number of messages and max length */
  PetscMaxSum(comm,sizes,&nmax,&nreceives);

  /* allocate arrays */
  PetscObjectGetNewTag((PetscObject)ao,&tag);
  PetscMalloc2(nreceives*nmax,&rindices,nreceives,&recv_waits);
  PetscMalloc3(2*n,&sindices,nsends,&send_waits,nsends,&send_status);
  PetscMalloc1(size,&start);

  /* post receives */
  for (i=0; i<nreceives; i++) {
    MPI_Irecv(rindices+nmax*i,nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
  }

  /* do sends:
     start[i] gives the starting index in sindices[] for the (ip,ia) pairs going to
     the ith processor
  */
  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + sizes[2*i-2];
  for (i=0; i<n; i++) {
    j = owner[i];
    if (j != rank) {
      ip = from_array[i];
      ia = to_array[i];
      sindices[start[j]++] = ip;
      sindices[start[j]++] = ia;
    } else { /* compute my own map */
      ip = from_array[i] - owners[rank];
      ia = to_array[i];
      aomap_loc[ip] = ia;
    }
  }

  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + sizes[2*i-2];
  for (i=0,count=0; i<size; i++) {
    if (sizes[2*i+1]) {
      MPI_Isend(sindices+start[i],sizes[2*i],MPIU_INT,i,tag,comm,send_waits+count);
      count++;
    }
  }

  /* wait on sends */
  if (nsends) {
    MPI_Waitall(nsends,send_waits,send_status);
  }

  /* recvs */
  count = 0;
  for (j=nreceives; j>0; j--) {
    MPI_Waitany(nreceives,recv_waits,&widx,&recv_status);
    MPI_Get_count(&recv_status,MPIU_INT,&nindices);
    rbuf = rindices+nmax*widx; /* global index */

    /* compute local mapping */
    for (i=0; i<nindices; i+=2) { /* pack aomap_loc */
      ip = rbuf[i] - owners[rank]; /* local index */
      ia = rbuf[i+1];
      aomap_loc[ip] = ia;
    }
    count++;
  }

  PetscFree(start);
  PetscFree3(sindices,send_waits,send_status);
  PetscFree2(rindices,recv_waits);
  PetscFree(owner);
  PetscFree(sizes);
  return 0;
}

PETSC_EXTERN PetscErrorCode AOCreate_MemoryScalable(AO ao)
{
  IS                isapp = ao->isapp,ispetsc = ao->ispetsc;
  const PetscInt    *mypetsc,*myapp;
  PetscInt          napp,n_local,N,i,start,*petsc,*lens,*disp;
  MPI_Comm          comm;
  AO_MemoryScalable *aomems;
  PetscLayout       map;
  PetscMPIInt       size,rank;

  /* create special struct aomems */
  PetscNewLog(ao,&aomems);
  ao->data = (void*)aomems;
  PetscMemcpy(ao->ops,&AOOps_MemoryScalable,sizeof(struct _AOOps));
  PetscObjectChangeTypeName((PetscObject)ao,AOMEMORYSCALABLE);

  /* transmit all local lengths of isapp to all processors */
  PetscObjectGetComm((PetscObject)isapp,&comm);
  MPI_Comm_size(comm,&size);
  MPI_Comm_rank(comm,&rank);
  PetscMalloc2(size,&lens,size,&disp);
  ISGetLocalSize(isapp,&napp);
  MPI_Allgather(&napp,1,MPIU_INT,lens,1,MPIU_INT,comm);

  N = 0;
  for (i=0; i<size; i++) {
    disp[i] = N;
    N      += lens[i];
  }

  /* if ispetsc is NULL then use the "natural" numbering */
  if (napp) {
    if (!ispetsc) {
      start = disp[rank];
      PetscMalloc1(napp+1,&petsc);
      for (i=0; i<napp; i++) petsc[i] = start + i;
    } else {
      ISGetIndices(ispetsc,&mypetsc);
      petsc = (PetscInt*)mypetsc;
    }
  } else {
    petsc = NULL;
  }

  /* create a map with global size N - used to determine the local sizes of ao - shall we use local napp instead of N? */
  PetscLayoutCreate(comm,&map);
  map->bs = 1;
  map->N  = N;
  PetscLayoutSetUp(map);

  ao->N       = N;
  ao->n       = map->n;
  aomems->map = map;

  /* create distributed indices app_loc: petsc->app and petsc_loc: app->petsc */
  n_local = map->n;
  PetscCalloc2(n_local,&aomems->app_loc,n_local,&aomems->petsc_loc);
  PetscLogObjectMemory((PetscObject)ao,2*n_local*sizeof(PetscInt));
  ISGetIndices(isapp,&myapp);
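
  /* after these two calls app_loc[i] holds the application index paired with PETSc index
     owners[rank]+i, and petsc_loc[i] holds the PETSc index paired with application index
     owners[rank]+i */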
  AOCreateMemoryScalable_private(comm,napp,petsc,myapp,ao,aomems->app_loc);
  AOCreateMemoryScalable_private(comm,napp,myapp,petsc,ao,aomems->petsc_loc);

  ISRestoreIndices(isapp,&myapp);
  if (napp) {
    if (ispetsc) {
      ISRestoreIndices(ispetsc,&mypetsc);
    } else {
      PetscFree(petsc);
    }
  }
  PetscFree2(lens,disp);
  return 0;
}

/*@C
   AOCreateMemoryScalable - Creates a memory scalable application ordering using two integer arrays.

   Collective

   Input Parameters:
+  comm - MPI communicator that is to share the AO
.  napp - size of the integer arrays
.  myapp - integer array that defines an ordering
-  mypetsc - integer array that defines another ordering (may be NULL to
             indicate the natural ordering, that is 0,1,2,3,...)

   Output Parameter:
.  aoout - the new application ordering

   Level: beginner

   Notes:
   The arrays myapp and mypetsc must contain all the integers 0 to napp-1 with no duplicates; that is, there cannot be any "holes"
   in the indices. Use AOCreateMapping() or AOCreateMappingIS() if you wish to have "holes" in the indices.
   Compared with AOCreateBasic(), this routine trades memory for additional message communication.
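
   Example Usage:
   A minimal sketch (error checking omitted, shown for a single MPI process; the values are purely illustrative):
.vb
   AO       ao;
   PetscInt app[] = {2,0,1};   /* this process's portion of the application ordering */
   PetscInt idx[] = {0,1,2};   /* indices to translate, overwritten in place */

   AOCreateMemoryScalable(PETSC_COMM_SELF,3,app,NULL,&ao);
   AOApplicationToPetsc(ao,3,idx);   /* idx[] now holds the corresponding PETSc indices */
   AODestroy(&ao);
.ve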

.seealso: AOCreateMemoryScalableIS(), AODestroy(), AOPetscToApplication(), AOApplicationToPetsc()
@*/
PetscErrorCode AOCreateMemoryScalable(MPI_Comm comm,PetscInt napp,const PetscInt myapp[],const PetscInt mypetsc[],AO *aoout)
{
  IS             isapp,ispetsc;
  const PetscInt *app = myapp,*petsc = mypetsc;

  ISCreateGeneral(comm,napp,app,PETSC_USE_POINTER,&isapp);
  if (mypetsc) {
    ISCreateGeneral(comm,napp,petsc,PETSC_USE_POINTER,&ispetsc);
  } else {
    ispetsc = NULL;
  }
  AOCreateMemoryScalableIS(isapp,ispetsc,aoout);
  ISDestroy(&isapp);
  if (mypetsc) {
    ISDestroy(&ispetsc);
  }
  return 0;
}

/*@C
   AOCreateMemoryScalableIS - Creates a memory scalable application ordering using two index sets.

   Collective on IS

   Input Parameters:
+  isapp - index set that defines an ordering
-  ispetsc - index set that defines another ordering (may be NULL to use the
             natural ordering)

   Output Parameter:
.  aoout - the new application ordering

   Level: beginner

   Notes:
   The index sets isapp and ispetsc must contain all the integers 0 to napp-1 (where napp is the length of the index sets) with no duplicates;
   that is, there cannot be any "holes".
   Compared with AOCreateBasicIS(), this routine trades memory for additional message communication.
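
   Example Usage:
   A minimal sketch (error checking omitted, shown for a single MPI process; the index values are purely illustrative):
.vb
   IS       isapp;
   AO       ao;
   PetscInt app[] = {2,0,1};

   ISCreateGeneral(PETSC_COMM_SELF,3,app,PETSC_COPY_VALUES,&isapp);
   AOCreateMemoryScalableIS(isapp,NULL,&ao);   /* NULL ispetsc selects the natural ordering */
   ISDestroy(&isapp);
   AODestroy(&ao);
.ve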

.seealso: AOCreateMemoryScalable(), AODestroy()
@*/
PetscErrorCode AOCreateMemoryScalableIS(IS isapp,IS ispetsc,AO *aoout)
{
  MPI_Comm comm;
  AO       ao;

  PetscObjectGetComm((PetscObject)isapp,&comm);
  AOCreate(comm,&ao);
  AOSetIS(ao,isapp,ispetsc);
  AOSetType(ao,AOMEMORYSCALABLE);
  AOViewFromOptions(ao,NULL,"-ao_view");
  *aoout = ao;
  return 0;
}