Actual source code: aomemscalable.c
petsc-3.3-p7 2013-05-11
2: /*
3: The memory scalable AO application ordering routines. These store the
4: local orderings on each processor.
5: */
7: #include <../src/dm/ao/aoimpl.h> /*I "petscao.h" I*/
9: typedef struct {
10: PetscInt *app_loc; /* app_loc[i] is the partner for the ith local PETSc slot */
11: PetscInt *petsc_loc; /* petsc_loc[j] is the partner for the jth local app slot */
12: PetscLayout map; /* determines the local sizes of ao */
13: } AO_MemoryScalable;
15: /*
16:     The ordering data is distributed across the processes, so process 0 receives each process's local piece and prints the entire ordering
17: */
20: PetscErrorCode AOView_MemoryScalable(AO ao,PetscViewer viewer)
21: {
22: PetscErrorCode ierr;
23: PetscMPIInt rank,size;
24: AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
25: PetscBool iascii;
26: PetscMPIInt tag_app,tag_petsc;
27: PetscLayout map = aomems->map;
28: PetscInt *app,*app_loc,*petsc,*petsc_loc,len,i,j;
29: MPI_Status status;
32: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
33: if (!iascii) SETERRQ1(((PetscObject)viewer)->comm,PETSC_ERR_SUP,"Viewer type %s not supported for AO MemoryScalable",((PetscObject)viewer)->type_name);
35: MPI_Comm_rank(((PetscObject)ao)->comm,&rank);
36: MPI_Comm_size(((PetscObject)ao)->comm,&size);
37:
38: PetscObjectGetNewTag((PetscObject)ao,&tag_app);
39: PetscObjectGetNewTag((PetscObject)ao,&tag_petsc);
41: if (!rank){
42: PetscViewerASCIIPrintf(viewer,"Number of elements in ordering %D\n",ao->N);
43: PetscViewerASCIIPrintf(viewer, "PETSc->App App->PETSc\n");
45: PetscMalloc2(map->N,PetscInt,&app,map->N,PetscInt,&petsc);
46: len = map->n;
47: /* print local AO */
48: PetscViewerASCIIPrintf(viewer,"Process [%D]\n",rank);
49: for (i=0; i<len; i++){
50: PetscViewerASCIIPrintf(viewer,"%3D %3D %3D %3D\n",i,aomems->app_loc[i],i,aomems->petsc_loc[i]);
51: }
53: /* recv and print off-processor's AO */
54: for (i=1; i<size; i++) {
55: len = map->range[i+1] - map->range[i];
56: app_loc = app + map->range[i];
57: petsc_loc = petsc+ map->range[i];
58: MPI_Recv(app_loc,(PetscMPIInt)len,MPIU_INT,i,tag_app,((PetscObject)ao)->comm,&status);
59: MPI_Recv(petsc_loc,(PetscMPIInt)len,MPIU_INT,i,tag_petsc,((PetscObject)ao)->comm,&status);
60: PetscViewerASCIIPrintf(viewer,"Process [%D]\n",i);
61: for (j=0; j<len; j++) {
62: PetscViewerASCIIPrintf(viewer,"%3D %3D %3D %3D\n",map->range[i]+j,app_loc[j],map->range[i]+j,petsc_loc[j]);
63: }
64: }
65: PetscFree2(app,petsc);
66:
67: } else {
68: /* send values */
69: MPI_Send((void*)aomems->app_loc,map->n,MPIU_INT,0,tag_app,((PetscObject)ao)->comm);
70: MPI_Send((void*)aomems->petsc_loc,map->n,MPIU_INT,0,tag_petsc,((PetscObject)ao)->comm);
71: }
72: PetscViewerFlush(viewer);
73: return(0);
74: }
78: PetscErrorCode AODestroy_MemoryScalable(AO ao)
79: {
80: AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
81: PetscErrorCode ierr;
84: PetscFree2(aomems->app_loc,aomems->petsc_loc);
85: PetscLayoutDestroy(&aomems->map);
86: PetscFree(aomems);
87: return(0);
88: }
90: /*
91: Input Parameters:
92: + ao - the application ordering context
93: . n - the number of integers in ia[]
94: . ia - the integers; these are replaced with their mapped value
95: - maploc - app_loc or petsc_loc in struct "AO_MemoryScalable"
97: Output Parameter:
98: . ia - the mapped integers
99: */
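/*
   Implementation outline (a summary of the routine below): each entry of ia[] is assigned to
   the process that owns it according to aomems->map.  Entries owned locally are mapped in place
   with maploc[]; the remaining entries are sent to their owners (tag1), each owner looks them up
   in its own maploc[] and sends the results back (tag2), and the answers are unpacked into ia[]
   in the positions they came from.
*/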
102: PetscErrorCode AOMap_MemoryScalable_private(AO ao,PetscInt n,PetscInt *ia,PetscInt *maploc)
103: {
104: PetscErrorCode ierr;
105: AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
106: MPI_Comm comm=((PetscObject)ao)->comm;
107: PetscMPIInt rank,size,tag1,tag2;
108: PetscInt *owner,*start,*nprocs,nsends,nreceives;
109: PetscInt nmax,count,*sindices,*rindices,i,j,idx,lastidx,*sindices2,*rindices2;
110: PetscInt *owners = aomems->map->range;
111: MPI_Request *send_waits,*recv_waits,*send_waits2,*recv_waits2;
112: MPI_Status recv_status;
113: PetscMPIInt nindices,source,widx;
114: PetscInt *rbuf,*sbuf;
115: MPI_Status *send_status,*send_status2;
116:
118: MPI_Comm_rank(comm,&rank);
119: MPI_Comm_size(comm,&size);
121: /* first count number of contributors to each processor */
122: PetscMalloc2(2*size,PetscInt,&nprocs,size,PetscInt,&start);
123: PetscMemzero(nprocs,2*size*sizeof(PetscInt));
124: PetscMalloc(n*sizeof(PetscInt),&owner);
125: PetscMemzero(owner,n*sizeof(PetscInt));
126:
127: j = 0;
128: lastidx = -1;
129: for (i=0; i<n; i++) {
130: /* if indices are NOT locally sorted, need to start search at the beginning */
131: if (lastidx > (idx = ia[i])) j = 0;
132: lastidx = idx;
133: for (; j<size; j++) {
134: if (idx >= owners[j] && idx < owners[j+1]) {
135: nprocs[2*j]++; /* num of indices to be sent */
136: nprocs[2*j+1] = 1; /* send to proc[j] */
137: owner[i] = j;
138: break;
139: }
140: }
141: }
142: nprocs[2*rank]=nprocs[2*rank+1]=0; /* do not receive from self! */
143: nsends = 0;
144: for (i=0; i<size; i++) nsends += nprocs[2*i+1];
146: /* inform other processors of number of messages and max length*/
147: PetscMaxSum(comm,nprocs,&nmax,&nreceives);
148:
149: /* allocate arrays */
150: PetscObjectGetNewTag((PetscObject)ao,&tag1);
151: PetscObjectGetNewTag((PetscObject)ao,&tag2);
153: PetscMalloc2(nreceives*nmax,PetscInt,&rindices,nreceives,MPI_Request,&recv_waits);
154: PetscMalloc2(nsends*nmax,PetscInt,&rindices2,nsends,MPI_Request,&recv_waits2);
155:
156: PetscMalloc3(n,PetscInt,&sindices,nsends,MPI_Request,&send_waits,nsends,MPI_Status,&send_status);
157: PetscMalloc3(n,PetscInt,&sindices2,nreceives,MPI_Request,&send_waits2,nreceives,MPI_Status,&send_status2);
159: /* post 1st receives: receive others' requests.
160: Since we don't know how long each individual message is, we
161: allocate the largest needed buffer for each receive. Potentially
162: this is a lot of wasted space.
163: */
164: for (i=0,count=0; i<nreceives; i++) {
165: MPI_Irecv(rindices+nmax*i,nmax,MPIU_INT,MPI_ANY_SOURCE,tag1,comm,recv_waits+count++);
166: }
168: /* do 1st sends:
169: start[i] gives the starting index in sindices[] for the entries going to
170: the ith processor
171: */
172: start[0] = 0;
173: for (i=1; i<size; i++) start[i] = start[i-1] + nprocs[2*i-2];
174: for (i=0; i<n; i++) {
175: j = owner[i];
176: if (j != rank){
177: sindices[start[j]++] = ia[i];
178: } else { /* compute my own map */
179: if (ia[i] >= owners[rank] && ia[i] < owners[rank+1] ) {
180: ia[i] = maploc[ia[i]-owners[rank]];
181: } else {
182: ia[i] = -1; /* ia[i] is not in the range 0 to N-1; map it to -1 */
183: }
184: }
185: }
187: start[0] = 0;
188: for (i=1; i<size; i++) { start[i] = start[i-1] + nprocs[2*i-2];}
189: for (i=0,count=0; i<size; i++) {
190: if (nprocs[2*i+1]) {
191: /* send my request to others */
192: MPI_Isend(sindices+start[i],nprocs[2*i],MPIU_INT,i,tag1,comm,send_waits+count);
193: /* post receive for the answer of my request */
194: MPI_Irecv(sindices2+start[i],nprocs[2*i],MPIU_INT,i,tag2,comm,recv_waits2+count);
195: count++;
196: }
197: }
198: if (nsends != count) SETERRQ2(comm,PETSC_ERR_SUP,"nsends %d != count %d",nsends,count);
200: /* wait on 1st sends */
201: if (nsends) {
202: MPI_Waitall(nsends,send_waits,send_status);
203: }
204:
205: /* 1st recvs: others' requests */
206: for (j=0; j< nreceives; j++){
207: MPI_Waitany(nreceives,recv_waits,&widx,&recv_status); /* widx: index of the request that completed */
208: MPI_Get_count(&recv_status,MPIU_INT,&nindices);
209: rbuf = rindices+nmax*widx; /* global index */
210: source = recv_status.MPI_SOURCE;
211:
212: /* compute mapping */
213: sbuf = rbuf;
214: for (i=0; i<nindices; i++) sbuf[i] = maploc[rbuf[i]-owners[rank]];
216: /* send mapping back to the sender */
217: MPI_Isend(sbuf,nindices,MPIU_INT,source,tag2,comm,send_waits2+widx);
218: }
220: /* wait on 2nd sends */
221: if (nreceives) {
222: MPI_Waitall(nreceives,send_waits2,send_status2);
223: }
225: /* 2nd recvs: the answers to my requests */
226: for (j=0; j< nsends; j++){
227: MPI_Waitany(nsends,recv_waits2,&widx,&recv_status);
228: MPI_Get_count(&recv_status,MPIU_INT,&nindices);
229: source = recv_status.MPI_SOURCE;
230: /* pack output ia[] */
231: rbuf=sindices2+start[source];
232: count=0;
233: for (i=0; i<n; i++){
234: if (source == owner[i]) ia[i] = rbuf[count++];
235: }
236: }
238: /* free arrays */
239: PetscFree2(nprocs,start);
240: PetscFree(owner);
241: PetscFree2(rindices,recv_waits);
242: PetscFree2(rindices2,recv_waits2);
243: PetscFree3(sindices,send_waits,send_status);
244: PetscFree3(sindices2,send_waits2,send_status2);
245: return(0);
246: }
250: PetscErrorCode AOPetscToApplication_MemoryScalable(AO ao,PetscInt n,PetscInt *ia)
251: {
252: PetscErrorCode ierr;
253: AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
254: PetscInt *app_loc = aomems->app_loc;
257: AOMap_MemoryScalable_private(ao,n,ia,app_loc);
258: return(0);
259: }
263: PetscErrorCode AOApplicationToPetsc_MemoryScalable(AO ao,PetscInt n,PetscInt *ia)
264: {
265: PetscErrorCode ierr;
266: AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
267: PetscInt *petsc_loc = aomems->petsc_loc;
270: AOMap_MemoryScalable_private(ao,n,ia,petsc_loc);
271: return(0);
272: }
274: static struct _AOOps AOOps_MemoryScalable = {
275: AOView_MemoryScalable,
276: AODestroy_MemoryScalable,
277: AOPetscToApplication_MemoryScalable,
278: AOApplicationToPetsc_MemoryScalable,
279: 0,
280: 0,
281: 0,
282: 0
283: };
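/*
   AOCreateMemoryScalable_private - builds the local piece aomap_loc[] of one direction of the
   ordering: each pair (from_array[i], to_array[i]) is routed to the process that owns
   from_array[i] according to aomems->map, and that owner stores
   aomap_loc[from_array[i] - owners[rank]] = to_array[i].  AOCreate_MemoryScalable() calls this
   routine twice, once for each direction of the mapping (app_loc and petsc_loc).
*/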
287: PetscErrorCode AOCreateMemoryScalable_private(MPI_Comm comm,PetscInt napp,const PetscInt from_array[],const PetscInt to_array[],AO ao, PetscInt *aomap_loc)
288: {
289: PetscErrorCode ierr;
290: AO_MemoryScalable *aomems=(AO_MemoryScalable*)ao->data;
291: PetscLayout map=aomems->map;
292: PetscInt n_local = map->n,i,j;
293: PetscMPIInt rank,size,tag;
294: PetscInt *owner,*start,*nprocs,nsends,nreceives;
295: PetscInt nmax,count,*sindices,*rindices,idx,lastidx;
296: PetscInt *owners = aomems->map->range;
297: MPI_Request *send_waits,*recv_waits;
298: MPI_Status recv_status;
299: PetscMPIInt nindices,widx;
300: PetscInt *rbuf;
301: PetscInt n=napp,ip,ia;
302: MPI_Status *send_status;
305: PetscMemzero(aomap_loc,n_local*sizeof(PetscInt));
307: MPI_Comm_rank(comm,&rank);
308: MPI_Comm_size(comm,&size);
309:
310: /* first count number of contributors (of from_array[]) to each processor */
311: PetscMalloc(2*size*sizeof(PetscInt),&nprocs);
312: PetscMemzero(nprocs,2*size*sizeof(PetscInt));
313: PetscMalloc(n*sizeof(PetscInt),&owner);
314:
315: j = 0;
316: lastidx = -1;
317: for (i=0; i<n; i++) {
318: /* if indices are NOT locally sorted, need to start search at the beginning */
319: if (lastidx > (idx = from_array[i])) j = 0;
320: lastidx = idx;
321: for (; j<size; j++) {
322: if (idx >= owners[j] && idx < owners[j+1]) {
323: nprocs[2*j] += 2; /* num of indices to be sent - in pairs (ip,ia) */
324: nprocs[2*j+1] = 1; /* send to proc[j] */
325: owner[i] = j;
326: break;
327: }
328: }
329: }
330: nprocs[2*rank]=nprocs[2*rank+1]=0; /* do not receive from self! */
331: nsends = 0;
332: for (i=0; i<size; i++) nsends += nprocs[2*i+1];
334: /* inform other processors of number of messages and max length*/
335: PetscMaxSum(comm,nprocs,&nmax,&nreceives);
337: /* allocate arrays */
338: PetscObjectGetNewTag((PetscObject)ao,&tag);
339: PetscMalloc2(nreceives*nmax,PetscInt,&rindices,nreceives,MPI_Request,&recv_waits);
340: PetscMalloc3(2*n,PetscInt,&sindices,nsends,MPI_Request,&send_waits,nsends,MPI_Status,&send_status);
341: PetscMalloc(size*sizeof(PetscInt),&start);
342:
343: /* post receives: */
344: for (i=0; i<nreceives; i++) {
345: MPI_Irecv(rindices+nmax*i,nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
346: }
348: /* do sends:
349: start[i] gives the starting index in sindices[] for the entries going to
350: the ith processor
351: */
352: start[0] = 0;
353: for (i=1; i<size; i++) start[i] = start[i-1] + nprocs[2*i-2];
354: for (i=0; i<n; i++) {
355: j = owner[i];
356: if (j != rank){
357: ip = from_array[i];
358: ia = to_array[i];
359: sindices[start[j]++] = ip; sindices[start[j]++] = ia;
360: } else { /* compute my own map */
361: ip = from_array[i] -owners[rank];
362: ia = to_array[i];
363: aomap_loc[ip] = ia;
364: }
365: }
367: start[0] = 0;
368: for (i=1; i<size; i++) { start[i] = start[i-1] + nprocs[2*i-2];}
369: for (i=0,count=0; i<size; i++) {
370: if (nprocs[2*i+1]) {
371: MPI_Isend(sindices+start[i],nprocs[2*i],MPIU_INT,i,tag,comm,send_waits+count);
372: count++;
373: }
374: }
375: if (nsends != count) SETERRQ2(comm,PETSC_ERR_SUP,"nsends %d != count %d",nsends,count);
377: /* wait on sends */
378: if (nsends) {
379: MPI_Waitall(nsends,send_waits,send_status);
380: }
382: /* recvs */
383: count=0;
384: for (j= nreceives; j>0; j--){
385: MPI_Waitany(nreceives,recv_waits,&widx,&recv_status);
386: MPI_Get_count(&recv_status,MPIU_INT,&nindices);
387: rbuf = rindices+nmax*widx; /* global index */
388:
389: /* compute local mapping */
390: for (i=0; i<nindices; i+=2){ /* pack aomap_loc */
391: ip = rbuf[i] - owners[rank]; /* local index */
392: ia = rbuf[i+1];
393: aomap_loc[ip] = ia;
394: }
395: count++;
396: }
398: PetscFree(start);
399: PetscFree3(sindices,send_waits,send_status);
400: PetscFree2(rindices,recv_waits);
401: PetscFree(owner);
402: PetscFree(nprocs);
403: return(0);
404: }
406: EXTERN_C_BEGIN
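/*
   AOCreate_MemoryScalable - the AOMEMORYSCALABLE constructor: it gathers the local lengths of
   isapp to compute the global size N, builds a PetscLayout that determines which process owns
   which part of the ordering, allocates the distributed arrays app_loc[] and petsc_loc[], and
   fills them with two calls to AOCreateMemoryScalable_private(), one per mapping direction.
*/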
409: PetscErrorCode AOCreate_MemoryScalable(AO ao)
410: {
411: PetscErrorCode ierr;
412: IS isapp=ao->isapp,ispetsc=ao->ispetsc;
413: const PetscInt *mypetsc,*myapp;
414: PetscInt napp,n_local,N,i,start,*petsc;
415: MPI_Comm comm;
416: AO_MemoryScalable *aomems;
417: PetscLayout map;
418: PetscMPIInt *lens,size,rank,*disp;
419:
421: /* create special struct aomems */
422: PetscNewLog(ao, AO_MemoryScalable, &aomems);
423: ao->data = (void*) aomems;
424: PetscMemcpy(ao->ops,&AOOps_MemoryScalable,sizeof(struct _AOOps));
425: PetscObjectChangeTypeName((PetscObject)ao,AOMEMORYSCALABLE);
427: /* transmit all local lengths of isapp to all processors */
428: PetscObjectGetComm((PetscObject)isapp,&comm);
429: MPI_Comm_size(comm, &size);
430: MPI_Comm_rank(comm, &rank);
431: PetscMalloc2(size,PetscMPIInt, &lens,size,PetscMPIInt,&disp);
432: ISGetLocalSize(isapp,&napp);
433: MPI_Allgather(&napp, 1, MPI_INT, lens, 1, MPI_INT, comm);
435: N = 0;
436: for(i = 0; i < size; i++) {
437: disp[i] = N;
438: N += lens[i];
439: }
441: /* If ispetsc is 0 then use "natural" numbering */
442: if (napp) {
443: if (!ispetsc) {
444: start = disp[rank];
445: PetscMalloc((napp+1) * sizeof(PetscInt), &petsc);
446: for (i=0; i<napp; i++) petsc[i] = start + i;
447: } else {
448: ISGetIndices(ispetsc,&mypetsc);
449: petsc = (PetscInt*)mypetsc;
450: }
451: }
452:
453: /* create a map with global size N - used to determine the local sizes of ao - shall we use local napp instead of N? */
454: PetscLayoutCreate(comm,&map);
455: map->bs = 1;
456: map->N = N;
457: PetscLayoutSetUp(map);
459: ao->N = N;
460: ao->n = map->n;
461: aomems->map = map;
463: /* create distributed indices app_loc: petsc->app and petsc_loc: app->petsc */
464: n_local = map->n;
465: PetscMalloc2(n_local,PetscInt, &aomems->app_loc,n_local,PetscInt,&aomems->petsc_loc);
466: PetscLogObjectMemory(ao,2*n_local*sizeof(PetscInt));
467: PetscMemzero(aomems->app_loc,n_local*sizeof(PetscInt));
468: PetscMemzero(aomems->petsc_loc,n_local*sizeof(PetscInt));
469: ISGetIndices(isapp,&myapp);
471: AOCreateMemoryScalable_private(comm,napp,petsc,myapp,ao,aomems->app_loc);
472: AOCreateMemoryScalable_private(comm,napp,myapp,petsc,ao,aomems->petsc_loc);
474: ISRestoreIndices(isapp,&myapp);
475: if (napp){
476: if (ispetsc){
477: ISRestoreIndices(ispetsc,&mypetsc);
478: } else {
479: PetscFree(petsc);
480: }
481: }
482: PetscFree2(lens,disp);
483: return(0);
484: }
485: EXTERN_C_END
489: /*@C
490: AOCreateMemoryScalable - Creates a memory scalable application ordering using two integer arrays.
492: Collective on MPI_Comm
494: Input Parameters:
495: + comm - MPI communicator that is to share AO
496: . napp - size of integer arrays
497: . myapp - integer array that defines an ordering
498: - mypetsc - integer array that defines another ordering (may be PETSC_NULL to
499: indicate the natural ordering, that is 0,1,2,3,...)
501: Output Parameter:
502: . aoout - the new application ordering
504: Level: beginner
506: Notes: The arrays myapp and mypetsc must contain all the integers 0 to napp-1 with no duplicates; that is, there cannot be any "holes"
507: in the indices. Use AOCreateMapping() or AOCreateMappingIS() if you wish to have "holes" in the indices.
508: Compared with AOCreateBasic(), this routine uses less memory, at the cost of extra message passing.
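   Sample usage (a minimal single-process sketch; error checking is omitted and the index
   values are only illustrative):
.vb
   AO       ao;
   PetscInt app[] = {3,2,1,0};                                    /* application indices owned by this process */
   AOCreateMemoryScalable(PETSC_COMM_WORLD,4,app,PETSC_NULL,&ao); /* PETSC_NULL: use the natural PETSc ordering */
   AOApplicationToPetsc(ao,4,app);                                /* app[] now holds the matching PETSc indices */
   AODestroy(&ao);
.ve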
510: .keywords: AO, create
512: .seealso: AOCreateMemoryScalableIS(), AODestroy(), AOPetscToApplication(), AOApplicationToPetsc()
513: @*/
514: PetscErrorCode AOCreateMemoryScalable(MPI_Comm comm,PetscInt napp,const PetscInt myapp[],const PetscInt mypetsc[],AO *aoout)
515: {
517: IS isapp,ispetsc;
518: const PetscInt *app=myapp,*petsc=mypetsc;
521: ISCreateGeneral(comm,napp,app,PETSC_USE_POINTER,&isapp);
522: if (mypetsc){
523: ISCreateGeneral(comm,napp,petsc,PETSC_USE_POINTER,&ispetsc);
524: } else {
525: ispetsc = PETSC_NULL;
526: }
527: AOCreateMemoryScalableIS(isapp,ispetsc,aoout);
528: ISDestroy(&isapp);
529: if (mypetsc){
530: ISDestroy(&ispetsc);
531: }
532: return(0);
533: }
537: /*@C
538: AOCreateMemoryScalableIS - Creates a memory scalable application ordering using two index sets.
540: Collective on IS
542: Input Parameters:
543: + isapp - index set that defines an ordering
544: - ispetsc - index set that defines another ordering (may be PETSC_NULL to use the
545: natural ordering)
547: Output Parameter:
548: . aoout - the new application ordering
550: Level: beginner
552: Notes: The index sets isapp and ispetsc must contain all the integers 0 to napp-1 (where napp is the length of the index sets) with no duplicates;
553: that is, there cannot be any "holes".
554: Compared with AOCreateBasicIS(), this routine uses less memory, at the cost of extra message passing.
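   Sample usage (a minimal single-process sketch; error checking is omitted and the index
   values are only illustrative):
.vb
   IS       isapp;
   AO       ao;
   PetscInt app[] = {3,2,1,0};
   ISCreateGeneral(PETSC_COMM_WORLD,4,app,PETSC_COPY_VALUES,&isapp);
   AOCreateMemoryScalableIS(isapp,PETSC_NULL,&ao);   /* PETSC_NULL: use the natural PETSc ordering */
   AOApplicationToPetsc(ao,4,app);                   /* app[] now holds the matching PETSc indices */
   AODestroy(&ao);
   ISDestroy(&isapp);
.ve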
555: .keywords: AO, create
557: .seealso: AOCreateMemoryScalable(), AODestroy()
558: @*/
559: PetscErrorCode AOCreateMemoryScalableIS(IS isapp,IS ispetsc,AO *aoout)
560: {
562: MPI_Comm comm;
563: AO ao;
566: PetscObjectGetComm((PetscObject)isapp,&comm);
567: AOCreate(comm,&ao);
568: AOSetIS(ao,isapp,ispetsc);
569: AOSetType(ao,AOMEMORYSCALABLE);
570: *aoout = ao;
571: return(0);
572: }