Actual source code: aomemscalable.c
petsc-3.11.4 2019-09-28
2: /*
3: The memory scalable AO application ordering routines. These store the
4: orderings on each processor for that processor's range of values
5: */
7: #include <../src/vec/is/ao/aoimpl.h>
9: typedef struct {
10: PetscInt *app_loc; /* app_loc[i] is the partner for the ith local PETSc slot */
11: PetscInt *petsc_loc; /* petsc_loc[j] is the partner for the jth local app slot */
12: PetscLayout map; /* determines the local sizes of ao */
13: } AO_MemoryScalable;
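/*
   An illustrative sketch of the layout (not tied to any particular run): with N = 6 entries split
   over two processes so that process 0 owns indices 0..2 and process 1 owns 3..5 (as recorded in
   map), process 0 stores app_loc[0..2], the application partners of PETSc indices 0,1,2, and
   petsc_loc[0..2], the PETSc partners of application indices 0,1,2; process 1 stores the analogous
   data for indices 3..5. Each process thus holds only its own slice of both orderings.
*/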
15: /*
16: All processors ship the data to process 0 to be printed; note that this is not scalable because
17: process 0 allocates space for all the ordering entries across all the processes
18: */
19: PetscErrorCode AOView_MemoryScalable(AO ao,PetscViewer viewer)
20: {
21: PetscErrorCode ierr;
22: PetscMPIInt rank,size;
23: AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
24: PetscBool iascii;
25: PetscMPIInt tag_app,tag_petsc;
26: PetscLayout map = aomems->map;
27: PetscInt *app,*app_loc,*petsc,*petsc_loc,len,i,j;
28: MPI_Status status;
31: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
32: if (!iascii) SETERRQ1(PetscObjectComm((PetscObject)viewer),PETSC_ERR_SUP,"Viewer type %s not supported for AO MemoryScalable",((PetscObject)viewer)->type_name);
34: MPI_Comm_rank(PetscObjectComm((PetscObject)ao),&rank);
35: MPI_Comm_size(PetscObjectComm((PetscObject)ao),&size);
37: PetscObjectGetNewTag((PetscObject)ao,&tag_app);
38: PetscObjectGetNewTag((PetscObject)ao,&tag_petsc);
40: if (!rank) {
41: PetscViewerASCIIPrintf(viewer,"Number of elements in ordering %D\n",ao->N);
42: PetscViewerASCIIPrintf(viewer, "PETSc->App App->PETSc\n");
44: PetscMalloc2(map->N,&app,map->N,&petsc);
45: len = map->n;
46: /* print local AO */
47: PetscViewerASCIIPrintf(viewer,"Process [%d]\n",rank);
48: for (i=0; i<len; i++) {
49: PetscViewerASCIIPrintf(viewer,"%3D %3D %3D %3D\n",i,aomems->app_loc[i],i,aomems->petsc_loc[i]);
50: }
52: /* recv and print off-processor's AO */
53: for (i=1; i<size; i++) {
54: len = map->range[i+1] - map->range[i];
55: app_loc = app + map->range[i];
56: petsc_loc = petsc+ map->range[i];
57: MPI_Recv(app_loc,(PetscMPIInt)len,MPIU_INT,i,tag_app,PetscObjectComm((PetscObject)ao),&status);
58: MPI_Recv(petsc_loc,(PetscMPIInt)len,MPIU_INT,i,tag_petsc,PetscObjectComm((PetscObject)ao),&status);
59: PetscViewerASCIIPrintf(viewer,"Process [%D]\n",i);
60: for (j=0; j<len; j++) {
61: PetscViewerASCIIPrintf(viewer,"%3D %3D %3D %3D\n",map->range[i]+j,app_loc[j],map->range[i]+j,petsc_loc[j]);
62: }
63: }
64: PetscFree2(app,petsc);
66: } else {
67: /* send values */
68: MPI_Send((void*)aomems->app_loc,map->n,MPIU_INT,0,tag_app,PetscObjectComm((PetscObject)ao));
69: MPI_Send((void*)aomems->petsc_loc,map->n,MPIU_INT,0,tag_petsc,PetscObjectComm((PetscObject)ao));
70: }
71: PetscViewerFlush(viewer);
72: return(0);
73: }
75: PetscErrorCode AODestroy_MemoryScalable(AO ao)
76: {
77: AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
78: PetscErrorCode ierr;
81: PetscFree2(aomems->app_loc,aomems->petsc_loc);
82: PetscLayoutDestroy(&aomems->map);
83: PetscFree(aomems);
84: return(0);
85: }
87: /*
88: Input Parameters:
89: + ao - the application ordering context
90: . n - the number of integers in ia[]
91: . ia - the integers; these are replaced with their mapped value
92: - maploc - app_loc or petsc_loc in struct "AO_MemoryScalable"
94: Output Parameter:
95: . ia - the mapped integers
96: */
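/*
   Communication pattern used below: each process determines from aomems->map which process owns
   each entry of ia[]; entries it owns itself are mapped locally. In a first round of messages it
   sends the indices it does not own to their owners; each owner looks the received indices up in
   its maploc[] and sends the mapped values back in a second round, where they are written into the
   matching slots of ia[].
*/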
97: PetscErrorCode AOMap_MemoryScalable_private(AO ao,PetscInt n,PetscInt *ia,const PetscInt *maploc)
98: {
99: PetscErrorCode ierr;
100: AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
101: MPI_Comm comm;
102: PetscMPIInt rank,size,tag1,tag2;
103: PetscInt *owner,*start,*sizes,nsends,nreceives;
104: PetscInt nmax,count,*sindices,*rindices,i,j,idx,lastidx,*sindices2,*rindices2;
105: const PetscInt *owners = aomems->map->range;
106: MPI_Request *send_waits,*recv_waits,*send_waits2,*recv_waits2;
107: MPI_Status recv_status;
108: PetscMPIInt nindices,source,widx;
109: PetscInt *rbuf,*sbuf;
110: MPI_Status *send_status,*send_status2;
113: PetscObjectGetComm((PetscObject)ao,&comm);
114: MPI_Comm_rank(comm,&rank);
115: MPI_Comm_size(comm,&size);
117: /* first count number of contributors to each processor */
118: PetscMalloc2(2*size,&sizes,size,&start);
119: PetscMemzero(sizes,2*size*sizeof(PetscInt));
120: PetscCalloc1(n,&owner);
122: j = 0;
123: lastidx = -1;
124: for (i=0; i<n; i++) {
125: if (ia[i] < 0) owner[i] = -1; /* mark negative entries (which are not to be mapped) with a special negative value */
126: else if (ia[i] >= ao->N) owner[i] = -2; /* mark out of range entries with a special negative value */
127: else {
128: /* if indices are NOT locally sorted, need to start search at the beginning */
129: if (lastidx > (idx = ia[i])) j = 0;
130: lastidx = idx;
131: for (; j<size; j++) {
132: if (idx >= owners[j] && idx < owners[j+1]) {
133: sizes[2*j]++; /* num of indices to be sent */
134: sizes[2*j+1] = 1; /* send to proc[j] */
135: owner[i] = j;
136: break;
137: }
138: }
139: }
140: }
141: sizes[2*rank]=sizes[2*rank+1]=0; /* do not send to (or receive from) self */
142: nsends = 0;
143: for (i=0; i<size; i++) nsends += sizes[2*i+1];
145: /* inform other processors of number of messages and max length*/
146: PetscMaxSum(comm,sizes,&nmax,&nreceives);
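/* after this call nmax is the length of the longest message this process will receive and
   nreceives is the number of processes that will send to it */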
148: /* allocate arrays */
149: PetscObjectGetNewTag((PetscObject)ao,&tag1);
150: PetscObjectGetNewTag((PetscObject)ao,&tag2);
152: PetscMalloc2(nreceives*nmax,&rindices,nreceives,&recv_waits);
153: PetscMalloc2(nsends*nmax,&rindices2,nsends,&recv_waits2);
155: PetscMalloc3(n,&sindices,nsends,&send_waits,nsends,&send_status);
156: PetscMalloc3(n,&sindices2,nreceives,&send_waits2,nreceives,&send_status2);
158: /* post 1st receives: receive others' requests.
159: Since we don't know how long each individual message is, we
160: allocate the largest needed buffer for each receive; potentially
161: this is a lot of wasted space.
162: */
163: for (i=0,count=0; i<nreceives; i++) {
164: MPI_Irecv(rindices+nmax*i,nmax,MPIU_INT,MPI_ANY_SOURCE,tag1,comm,recv_waits+count++);
165: }
167: /* do 1st sends:
168: start[i] gives the starting index in sindices[] for the entries going to
169: the ith processor
170: */
171: start[0] = 0;
172: for (i=1; i<size; i++) start[i] = start[i-1] + sizes[2*i-2];
173: for (i=0; i<n; i++) {
174: j = owner[i];
175: if (j == -1) continue; /* do not remap negative entries in ia[] */
176: else if (j == -2) { /* out of range entries get mapped to -1 */
177: ia[i] = -1;
178: continue;
179: } else if (j != rank) {
180: sindices[start[j]++] = ia[i];
181: } else { /* compute my own map */
182: ia[i] = maploc[ia[i]-owners[rank]];
183: }
184: }
186: start[0] = 0;
187: for (i=1; i<size; i++) start[i] = start[i-1] + sizes[2*i-2];
188: for (i=0,count=0; i<size; i++) {
189: if (sizes[2*i+1]) {
190: /* send my request to others */
191: MPI_Isend(sindices+start[i],sizes[2*i],MPIU_INT,i,tag1,comm,send_waits+count);
192: /* post receive for the answer of my request */
193: MPI_Irecv(sindices2+start[i],sizes[2*i],MPIU_INT,i,tag2,comm,recv_waits2+count);
194: count++;
195: }
196: }
197: if (nsends != count) SETERRQ2(comm,PETSC_ERR_SUP,"nsends %D != count %D",nsends,count);
199: /* wait on 1st sends */
200: if (nsends) {
201: MPI_Waitall(nsends,send_waits,send_status);
202: }
204: /* 1st recvs: other's requests */
205: for (j=0; j< nreceives; j++) {
206: MPI_Waitany(nreceives,recv_waits,&widx,&recv_status); /* widx: index of the request that completed */
207: MPI_Get_count(&recv_status,MPIU_INT,&nindices);
208: rbuf = rindices+nmax*widx; /* global index */
209: source = recv_status.MPI_SOURCE;
211: /* compute mapping */
212: sbuf = rbuf;
213: for (i=0; i<nindices; i++) sbuf[i] = maploc[rbuf[i]-owners[rank]];
215: /* send mapping back to the sender */
216: MPI_Isend(sbuf,nindices,MPIU_INT,source,tag2,comm,send_waits2+widx);
217: }
219: /* wait on 2nd sends */
220: if (nreceives) {
221: MPI_Waitall(nreceives,send_waits2,send_status2);
222: }
224: /* 2nd recvs: for the answer of my request */
225: for (j=0; j< nsends; j++) {
226: MPI_Waitany(nsends,recv_waits2,&widx,&recv_status);
227: MPI_Get_count(&recv_status,MPIU_INT,&nindices);
228: source = recv_status.MPI_SOURCE;
229: /* pack output ia[] */
230: rbuf = sindices2+start[source];
231: count = 0;
232: for (i=0; i<n; i++) {
233: if (source == owner[i]) ia[i] = rbuf[count++];
234: }
235: }
237: /* free arrays */
238: PetscFree2(sizes,start);
239: PetscFree(owner);
240: PetscFree2(rindices,recv_waits);
241: PetscFree2(rindices2,recv_waits2);
242: PetscFree3(sindices,send_waits,send_status);
243: PetscFree3(sindices2,send_waits2,send_status2);
244: return(0);
245: }
247: PetscErrorCode AOPetscToApplication_MemoryScalable(AO ao,PetscInt n,PetscInt *ia)
248: {
249: PetscErrorCode ierr;
250: AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
251: PetscInt *app_loc = aomems->app_loc;
254: AOMap_MemoryScalable_private(ao,n,ia,app_loc);
255: return(0);
256: }
258: PetscErrorCode AOApplicationToPetsc_MemoryScalable(AO ao,PetscInt n,PetscInt *ia)
259: {
260: PetscErrorCode ierr;
261: AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
262: PetscInt *petsc_loc = aomems->petsc_loc;
265: AOMap_MemoryScalable_private(ao,n,ia,petsc_loc);
266: return(0);
267: }
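/* function table for the memory-scalable AO type; the trailing zero entries are the optional
   permutation operations of struct _AOOps, which this implementation does not provide */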
269: static struct _AOOps AOOps_MemoryScalable = {
270: AOView_MemoryScalable,
271: AODestroy_MemoryScalable,
272: AOPetscToApplication_MemoryScalable,
273: AOApplicationToPetsc_MemoryScalable,
274: 0,
275: 0,
276: 0,
277: 0
278: };
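/*
   AOCreateMemoryScalable_private - builds one direction of the distributed mapping: for every pair
   (from_array[i],to_array[i]) contributed on any process, the owner (according to aomems->map) of
   the index from_array[i] ends up with aomap_loc[from_array[i]-owners[rank]] = to_array[i]. It is
   called twice from AOCreate_MemoryScalable(), once for each mapping direction.
*/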
280: PetscErrorCode AOCreateMemoryScalable_private(MPI_Comm comm,PetscInt napp,const PetscInt from_array[],const PetscInt to_array[],AO ao, PetscInt *aomap_loc)
281: {
282: PetscErrorCode ierr;
283: AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
284: PetscLayout map = aomems->map;
285: PetscInt n_local = map->n,i,j;
286: PetscMPIInt rank,size,tag;
287: PetscInt *owner,*start,*sizes,nsends,nreceives;
288: PetscInt nmax,count,*sindices,*rindices,idx,lastidx;
289: PetscInt *owners = aomems->map->range;
290: MPI_Request *send_waits,*recv_waits;
291: MPI_Status recv_status;
292: PetscMPIInt nindices,widx;
293: PetscInt *rbuf;
294: PetscInt n=napp,ip,ia;
295: MPI_Status *send_status;
298: PetscMemzero(aomap_loc,n_local*sizeof(PetscInt));
300: MPI_Comm_rank(comm,&rank);
301: MPI_Comm_size(comm,&size);
303: /* first count number of contributors (of from_array[]) to each processor */
304: PetscCalloc1(2*size,&sizes);
305: PetscMalloc1(n,&owner);
307: j = 0;
308: lastidx = -1;
309: for (i=0; i<n; i++) {
310: /* if indices are NOT locally sorted, need to start search at the beginning */
311: if (lastidx > (idx = from_array[i])) j = 0;
312: lastidx = idx;
313: for (; j<size; j++) {
314: if (idx >= owners[j] && idx < owners[j+1]) {
315: sizes[2*j] += 2; /* num of indices to be sent - in pairs (ip,ia) */
316: sizes[2*j+1] = 1; /* send to proc[j] */
317: owner[i] = j;
318: break;
319: }
320: }
321: }
322: sizes[2*rank]=sizes[2*rank+1]=0; /* do not send to (or receive from) self */
323: nsends = 0;
324: for (i=0; i<size; i++) nsends += sizes[2*i+1];
326: /* inform other processors of number of messages and max length*/
327: PetscMaxSum(comm,sizes,&nmax,&nreceives);
329: /* allocate arrays */
330: PetscObjectGetNewTag((PetscObject)ao,&tag);
331: PetscMalloc2(nreceives*nmax,&rindices,nreceives,&recv_waits);
332: PetscMalloc3(2*n,&sindices,nsends,&send_waits,nsends,&send_status);
333: PetscMalloc1(size,&start);
335: /* post receives: */
336: for (i=0; i<nreceives; i++) {
337: MPI_Irecv(rindices+nmax*i,nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
338: }
340: /* do sends:
341: start[i] gives the starting index in sindices[] for the (ip,ia) pairs going to
342: the ith processor
343: */
344: start[0] = 0;
345: for (i=1; i<size; i++) start[i] = start[i-1] + sizes[2*i-2];
346: for (i=0; i<n; i++) {
347: j = owner[i];
348: if (j != rank) {
349: ip = from_array[i];
350: ia = to_array[i];
351: sindices[start[j]++] = ip;
352: sindices[start[j]++] = ia;
353: } else { /* compute my own map */
354: ip = from_array[i] - owners[rank];
355: ia = to_array[i];
356: aomap_loc[ip] = ia;
357: }
358: }
360: start[0] = 0;
361: for (i=1; i<size; i++) start[i] = start[i-1] + sizes[2*i-2];
362: for (i=0,count=0; i<size; i++) {
363: if (sizes[2*i+1]) {
364: MPI_Isend(sindices+start[i],sizes[2*i],MPIU_INT,i,tag,comm,send_waits+count);
365: count++;
366: }
367: }
368: if (nsends != count) SETERRQ2(comm,PETSC_ERR_SUP,"nsends %D != count %D",nsends,count);
370: /* wait on sends */
371: if (nsends) {
372: MPI_Waitall(nsends,send_waits,send_status);
373: }
375: /* recvs */
376: count=0;
377: for (j= nreceives; j>0; j--) {
378: MPI_Waitany(nreceives,recv_waits,&widx,&recv_status);
379: MPI_Get_count(&recv_status,MPIU_INT,&nindices);
380: rbuf = rindices+nmax*widx; /* received (global index, application value) pairs */
382: /* compute local mapping */
383: for (i=0; i<nindices; i+=2) { /* pack aomap_loc */
384: ip = rbuf[i] - owners[rank]; /* local index */
385: ia = rbuf[i+1];
386: aomap_loc[ip] = ia;
387: }
388: count++;
389: }
391: PetscFree(start);
392: PetscFree3(sindices,send_waits,send_status);
393: PetscFree2(rindices,recv_waits);
394: PetscFree(owner);
395: PetscFree(sizes);
396: return(0);
397: }
399: PETSC_EXTERN PetscErrorCode AOCreate_MemoryScalable(AO ao)
400: {
401: PetscErrorCode ierr;
402: IS isapp=ao->isapp,ispetsc=ao->ispetsc;
403: const PetscInt *mypetsc,*myapp;
404: PetscInt napp,n_local,N,i,start,*petsc,*lens,*disp;
405: MPI_Comm comm;
406: AO_MemoryScalable *aomems;
407: PetscLayout map;
408: PetscMPIInt size,rank;
411: if (!isapp) SETERRQ(PetscObjectComm((PetscObject)ao),PETSC_ERR_ARG_WRONGSTATE,"AOSetIS() must be called before AOSetType()");
412: /* create special struct aomems */
413: PetscNewLog(ao,&aomems);
414: ao->data = (void*) aomems;
415: PetscMemcpy(ao->ops,&AOOps_MemoryScalable,sizeof(struct _AOOps));
416: PetscObjectChangeTypeName((PetscObject)ao,AOMEMORYSCALABLE);
418: /* transmit all local lengths of isapp to all processors */
419: PetscObjectGetComm((PetscObject)isapp,&comm);
420: MPI_Comm_size(comm, &size);
421: MPI_Comm_rank(comm, &rank);
422: PetscMalloc2(size,&lens,size,&disp);
423: ISGetLocalSize(isapp,&napp);
424: MPI_Allgather(&napp, 1, MPIU_INT, lens, 1, MPIU_INT, comm);
426: N = 0;
427: for (i = 0; i < size; i++) {
428: disp[i] = N;
429: N += lens[i];
430: }
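/* disp[i] now holds the global offset of process i's block of the ordering and N the total number of entries */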
432: /* If ispetsc is 0 then use "natural" numbering */
433: if (napp) {
434: if (!ispetsc) {
435: start = disp[rank];
436: PetscMalloc1(napp+1, &petsc);
437: for (i=0; i<napp; i++) petsc[i] = start + i;
438: } else {
439: ISGetIndices(ispetsc,&mypetsc);
440: petsc = (PetscInt*)mypetsc;
441: }
442: } else {
443: petsc = NULL;
444: }
446: /* create a map with global size N - used to determine the local sizes of ao - shall we use local napp instead of N? */
447: PetscLayoutCreate(comm,&map);
448: map->bs = 1;
449: map->N = N;
450: PetscLayoutSetUp(map);
452: ao->N = N;
453: ao->n = map->n;
454: aomems->map = map;
456: /* create distributed indices app_loc: petsc->app and petsc_loc: app->petsc */
457: n_local = map->n;
458: PetscMalloc2(n_local, &aomems->app_loc,n_local,&aomems->petsc_loc);
459: PetscLogObjectMemory((PetscObject)ao,2*n_local*sizeof(PetscInt));
460: PetscMemzero(aomems->app_loc,n_local*sizeof(PetscInt));
461: PetscMemzero(aomems->petsc_loc,n_local*sizeof(PetscInt));
462: ISGetIndices(isapp,&myapp);
464: AOCreateMemoryScalable_private(comm,napp,petsc,myapp,ao,aomems->app_loc);
465: AOCreateMemoryScalable_private(comm,napp,myapp,petsc,ao,aomems->petsc_loc);
467: ISRestoreIndices(isapp,&myapp);
468: if (napp) {
469: if (ispetsc) {
470: ISRestoreIndices(ispetsc,&mypetsc);
471: } else {
472: PetscFree(petsc);
473: }
474: }
475: PetscFree2(lens,disp);
476: return(0);
477: }
479: /*@C
480: AOCreateMemoryScalable - Creates a memory scalable application ordering using two integer arrays.
482: Collective on MPI_Comm
484: Input Parameters:
485: + comm - MPI communicator that is to share AO
486: . napp - size of integer arrays
487: . myapp - integer array that defines an ordering
488: - mypetsc - integer array that defines another ordering (may be NULL to
489: indicate the natural ordering, that is 0,1,2,3,...)
491: Output Parameter:
492: . aoout - the new application ordering
494: Level: beginner
496: Notes:
497: The arrays myapp and mypetsc must contain all the integers 0 to napp-1 with no duplicates; that is, there cannot be any "holes"
498: in the indices. Use AOCreateMapping() or AOCreateMappingIS() if you wish to have "holes" in the indices.
499: Compared with AOCreateBasic(), this routine uses less memory at the cost of extra message communication during the mapping operations.
501: .keywords: AO, create
503: .seealso: AOCreateMemoryScalableIS(), AODestroy(), AOPetscToApplication(), AOApplicationToPetsc()
504: @*/
505: PetscErrorCode AOCreateMemoryScalable(MPI_Comm comm,PetscInt napp,const PetscInt myapp[],const PetscInt mypetsc[],AO *aoout)
506: {
508: IS isapp,ispetsc;
509: const PetscInt *app=myapp,*petsc=mypetsc;
512: ISCreateGeneral(comm,napp,app,PETSC_USE_POINTER,&isapp);
513: if (mypetsc) {
514: ISCreateGeneral(comm,napp,petsc,PETSC_USE_POINTER,&ispetsc);
515: } else {
516: ispetsc = NULL;
517: }
518: AOCreateMemoryScalableIS(isapp,ispetsc,aoout);
519: ISDestroy(&isapp);
520: if (mypetsc) {
521: ISDestroy(&ispetsc);
522: }
523: return(0);
524: }
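/*
   A minimal usage sketch (hypothetical values, error checking omitted): two processes each
   contribute two entries of an application ordering and then map two application indices to
   their PETSc indices in place.

     PetscMPIInt rank;
     PetscInt    app[2],idx[2];
     AO          ao;

     MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
     app[0] = rank ? 1 : 3;                                     // application ordering {3,2} on rank 0, {1,0} on rank 1
     app[1] = rank ? 0 : 2;
     AOCreateMemoryScalable(PETSC_COMM_WORLD,2,app,NULL,&ao);   // NULL -> natural PETSc ordering 0,1,2,3
     idx[0] = 0; idx[1] = 3;
     AOApplicationToPetsc(ao,2,idx);                            // idx[] becomes {3,0}, the PETSc indices of application indices 0 and 3
     AODestroy(&ao);
*/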
526: /*@C
527: AOCreateMemoryScalableIS - Creates a memory scalable application ordering using two index sets.
529: Collective on IS
531: Input Parameters:
532: + isapp - index set that defines an ordering
533: - ispetsc - index set that defines another ordering (may be NULL to use the
534: natural ordering)
536: Output Parameter:
537: . aoout - the new application ordering
539: Level: beginner
541: Notes:
542: The index sets isapp and ispetsc must contain all the integers 0 to napp-1 (where napp is the length of the index sets) with no duplicates;
543: that is, there cannot be any "holes".
544: Compared with AOCreateBasicIS(), this routine uses less memory at the cost of extra message communication during the mapping operations.
545: .keywords: AO, create
547: .seealso: AOCreateMemoryScalable(), AODestroy()
548: @*/
549: PetscErrorCode AOCreateMemoryScalableIS(IS isapp,IS ispetsc,AO *aoout)
550: {
552: MPI_Comm comm;
553: AO ao;
556: PetscObjectGetComm((PetscObject)isapp,&comm);
557: AOCreate(comm,&ao);
558: AOSetIS(ao,isapp,ispetsc);
559: AOSetType(ao,AOMEMORYSCALABLE);
560: AOViewFromOptions(ao,NULL,"-ao_view");
561: *aoout = ao;
562: return(0);
563: }
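/*
   A minimal IS-based sketch (hypothetical napp and myapp[], error checking omitted), equivalent to
   calling AOCreateMemoryScalable() on the underlying array; running with -ao_view prints the
   ordering through the AOViewFromOptions() call above.

     IS isapp;
     AO ao;

     ISCreateGeneral(PETSC_COMM_WORLD,napp,myapp,PETSC_COPY_VALUES,&isapp);
     AOCreateMemoryScalableIS(isapp,NULL,&ao);   // NULL second IS -> natural PETSc ordering
     ISDestroy(&isapp);
     // ... map indices with AOApplicationToPetsc() or AOPetscToApplication() ...
     AODestroy(&ao);
*/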