Actual source code: ex2.c
petsc-3.3-p7 2013-05-11
static char help[] = "Reads a simple unstructured grid from a file. Partitions it,\n\
and distributes the grid data accordingly\n\n";
/*T
   Concepts: Mat^partitioning a matrix;
   Processors: n
T*/
/*
    Updates of this example may be found at
      http://www.mcs.anl.gov/petsc/src/dm/ao/examples/tutorials/ex2.c

    This is a very basic, even crude, example of managing an unstructured
    grid in parallel.

    This particular code is for a Galerkin-style finite element method.

    After the calls below, each processor will have
      1) a list of elements it "owns"; for each "owned" element it will have the global
         numbering of the three vertices; stored in gdata->ele;
      2) a list of vertices it "owns"; for each owned vertex it will have the x and y
         coordinates; stored in gdata->vert

    It will not have
      1) a list of ghost elements (since they are not needed for traditional
         Galerkin-style finite element methods). For various finite volume methods
         you may need the ghost element lists; these may be generated using the
         element neighbor information given in the file database.

    To compute the local element stiffness matrix and load vector, each processor
    will need the vertex coordinates for all of the vertices on the locally
    "owned" elements. This data could be obtained by doing the appropriate vector
    scatter on the data stored in gdata->vert; we haven't had time to demonstrate this,
    but a minimal sketch of such a gather is included after DataMoveVertices() below.

    Clearly writing a complete parallel unstructured grid code with PETSc is still
    a good deal of work and requires a lot of application-level coding. BUT, at least
    PETSc can manage all of the nonlinear and linear solvers (including matrix assembly,
    etc.), which allows the programmer to concentrate his or her efforts on managing
    the unstructured grid. The PETSc team is developing additional library objects
    to help manage parallel unstructured grid computations. Unfortunately we have
    not had time to complete these yet, so the application programmer still must
    manage much of the parallel grid manipulation as indicated below.
*/
/*
    Include "petscmat.h" so that we can use matrices; this automatically includes:
      petscsys.h    - base PETSc routines     petscvec.h    - vectors
      petscmat.h    - matrices
      petscis.h     - index sets              petscviewer.h - viewers

    Include "petscao.h" to allow use of the AO (application ordering) commands,
    used below for renumbering the vertex numbers after the partitioning.

    Include "petscbt.h" for managing logical bit arrays that are used to
    conserve space. Note that the code does use order N bit arrays on each
    processor, so it is theoretically not scalable; but even with 64 million
    vertices it only temporarily needs about 8 megabytes of memory for the
    bit array (64 million bits), so one can still do very large problems with
    this approach, since the bit arrays are freed before the vectors and
    matrices are created.
*/
#include <petscmat.h>
#include <petscao.h>
#include <petscbt.h>
/*
    This is the user-defined grid data context
*/
typedef struct {
  PetscInt    n_vert,n_ele;
  PetscInt    mlocal_vert,mlocal_ele;
  PetscInt    *ele;
  PetscScalar *vert;
  PetscInt    *ia,*ja;
  IS          isnewproc;
  PetscInt    *localvert,nlocal; /* used to temporarily stash the old global numbers of the newly owned vertices */
} GridData;
/*
    Variables on all processors:
       n_vert          - total number of vertices
       mlocal_vert     - number of vertices on this processor
       vert            - x,y coordinates of local vertices

       n_ele           - total number of elements
       mlocal_ele      - number of elements on this processor
       ele             - vertices of elements on this processor

       ia, ja          - adjacency graph of elements (for partitioning)

    Variables on processor 0 during data reading from file:
       mmlocal_vert[i] - number of vertices on each processor
       tmpvert         - x,y coordinates of vertices on any processor (as read in)

       mmlocal_ele[i]  - number of elements on each processor

       tmpia, tmpja    - adjacency graph of elements for other processors

    Notes:
    The code below has a great deal of IO (print statements). This is to allow one to track
    the renumbering and movement of data among processors. In an actual
    production run, IO of this type would be deactivated.

    To use the ParMETIS partitioner, run with the option -mat_partitioning_type parmetis;
    otherwise the code defaults to the initial element partitioning induced when the data
    is read in.

    To understand the parallel performance of this type of code, it is important
    to profile the time spent in various events in the code; running with the
    option -log_summary will indicate how much time is spent in the routines
    below. Of course, for very small problems, such as the sample grid used here,
    the profiling results are meaningless.
*/
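/*
    For example, a hypothetical run of this example on 3 processes (the executable name,
    MPI launcher, and build location depend on your installation) might look like

        mpiexec -n 3 ./ex2 -mat_partitioning_type parmetis -log_summary

    which partitions the elements with ParMETIS and prints a profiling summary of the
    events registered in main() at the end of the run.
*/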
extern PetscErrorCode DataRead(GridData *);
extern PetscErrorCode DataPartitionElements(GridData *);
extern PetscErrorCode DataMoveElements(GridData *);
extern PetscErrorCode DataPartitionVertices(GridData *);
extern PetscErrorCode DataMoveVertices(GridData *);
extern PetscErrorCode DataDestroy(GridData *);

int main(int argc,char **args)
{
#if defined(PETSC_USE_LOG)
  PetscLogEvent READ_EVENT,PARTITION_ELEMENT_EVENT,MOVE_ELEMENT_EVENT;
  PetscLogEvent PARTITION_VERTEX_EVENT,MOVE_VERTEX_EVENT;
#endif
  GridData gdata;

  PetscInitialize(&argc,&args,(char *)0,help);

  PetscLogEventRegister("Read Data",0,&READ_EVENT);
  PetscLogEventRegister("Partition elemen",0,&PARTITION_ELEMENT_EVENT);
  PetscLogEventRegister("Move elements",0,&MOVE_ELEMENT_EVENT);
  PetscLogEventRegister("Partition vertic",0,&PARTITION_VERTEX_EVENT);
  PetscLogEventRegister("Move vertices",0,&MOVE_VERTEX_EVENT);

  PetscLogEventBegin(READ_EVENT,0,0,0,0);
  DataRead(&gdata);
  PetscLogEventEnd(READ_EVENT,0,0,0,0);
  PetscLogEventBegin(PARTITION_ELEMENT_EVENT,0,0,0,0);
  DataPartitionElements(&gdata);
  PetscLogEventEnd(PARTITION_ELEMENT_EVENT,0,0,0,0);
  PetscLogEventBegin(MOVE_ELEMENT_EVENT,0,0,0,0);
  DataMoveElements(&gdata);
  PetscLogEventEnd(MOVE_ELEMENT_EVENT,0,0,0,0);
  PetscLogEventBegin(PARTITION_VERTEX_EVENT,0,0,0,0);
  DataPartitionVertices(&gdata);
  PetscLogEventEnd(PARTITION_VERTEX_EVENT,0,0,0,0);
  PetscLogEventBegin(MOVE_VERTEX_EVENT,0,0,0,0);
  DataMoveVertices(&gdata);
  PetscLogEventEnd(MOVE_VERTEX_EVENT,0,0,0,0);
  DataDestroy(&gdata);

  PetscFinalize();
  return 0;
}
/*
     Reads in the grid data from a file; each processor is naively
   assigned a contiguous chunk of vertex and element data. Later the data
   will be partitioned and moved to the appropriate processor.
*/
PetscErrorCode DataRead(GridData *gdata)
{
  PetscMPIInt rank,size;
  PetscInt    n_vert,*mmlocal_vert,mlocal_vert,i,*ia,*ja,cnt,j;
  PetscInt    mlocal_ele,*mmlocal_ele,*ele,*tmpele,n_ele,net,a1,a2,a3;
  PetscInt    *iatmp,*jatmp;
  char        msg[128];
  PetscScalar *vert,*tmpvert;
  MPI_Status  status;
  /*
     Processor 0 opens the file, reads in chunks of data and sends a portion off to
   each other processor.

     Note: For a truly scalable IO portion of the code, one would store
   the grid data in a binary file and use MPI-IO commands to have each
   processor read in the parts that it needs. (A minimal MPI-IO sketch is
   included after this routine.) However, in most circumstances involving
   up to, say, a million nodes and 100 processors, the approach here is fine.
  */
  MPI_Comm_size(PETSC_COMM_WORLD,&size);
  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
  if (!rank) {
    FILE *fd;
    fd = fopen("usgdata","r"); if (!fd) SETERRQ(PETSC_COMM_SELF,1,"Cannot open grid file");

    /* read in number of vertices */
    fgets(msg,128,fd);
    printf("File msg:%s",msg);
    fscanf(fd,"Number Vertices = %d\n",&n_vert);
    printf("Number of grid vertices %d\n",n_vert);

    /* broadcast number of vertices to all processors */
    MPI_Bcast(&n_vert,1,MPI_INT,0,PETSC_COMM_WORLD);
    mlocal_vert = n_vert/size + ((n_vert % size) > 0);

    /*
       Allocate enough room for the first processor to keep track of how many
       vertices are assigned to each processor, splitting the vertices as evenly
       as possible among all processors.
    */
    PetscMalloc(size*sizeof(PetscInt),&mmlocal_vert);
    for (i=0; i<size; i++) {
      mmlocal_vert[i] = n_vert/size + ((n_vert % size) > i);
      printf("Processor %d assigned %d vertices\n",i,mmlocal_vert[i]);
    }
    /*
       Read in vertices assigned to first processor
    */
    PetscMalloc(2*mmlocal_vert[0]*sizeof(PetscScalar),&vert);
    printf("Vertices assigned to processor 0\n");
    for (i=0; i<mlocal_vert; i++) {
      fscanf(fd,"%d %lf %lf\n",&cnt,vert+2*i,vert+2*i+1);
      printf("%d %g %g\n",cnt,PetscRealPart(vert[2*i]),PetscRealPart(vert[2*i+1]));
    }

    /*
       Read in vertices for all the other processors
    */
    PetscMalloc(2*mmlocal_vert[0]*sizeof(PetscScalar),&tmpvert);
    for (j=1; j<size; j++) {
      printf("Vertices assigned to processor %d\n",j);
      for (i=0; i<mmlocal_vert[j]; i++) {
        fscanf(fd,"%d %lf %lf\n",&cnt,tmpvert+2*i,tmpvert+2*i+1);
        printf("%d %g %g\n",cnt,tmpvert[2*i],tmpvert[2*i+1]);
      }
      MPI_Send(tmpvert,2*mmlocal_vert[j],MPIU_SCALAR,j,0,PETSC_COMM_WORLD);
    }
    PetscFree(tmpvert);
    PetscFree(mmlocal_vert);

    fscanf(fd,"Number Elements = %d\n",&n_ele);
    printf("Number of grid elements %d\n",n_ele);

    /*
       Broadcast number of elements to all processors
    */
    MPI_Bcast(&n_ele,1,MPI_INT,0,PETSC_COMM_WORLD);
    mlocal_ele = n_ele/size + ((n_ele % size) > 0);

    /*
       Allocate enough room for the first processor to keep track of how many
       elements are assigned to each processor.
    */
    PetscMalloc(size*sizeof(PetscInt),&mmlocal_ele);
    for (i=0; i<size; i++) {
      mmlocal_ele[i] = n_ele/size + ((n_ele % size) > i);
      printf("Processor %d assigned %d elements\n",i,mmlocal_ele[i]);
    }

    /*
       Read in element information for the first processor
    */
    PetscMalloc(3*mmlocal_ele[0]*sizeof(PetscInt),&ele);
    printf("Elements assigned to processor 0\n");
    for (i=0; i<mlocal_ele; i++) {
      fscanf(fd,"%d %d %d %d\n",&cnt,ele+3*i,ele+3*i+1,ele+3*i+2);
      printf("%d %d %d %d\n",cnt,ele[3*i],ele[3*i+1],ele[3*i+2]);
    }

    /*
       Read in elements for all the other processors
    */
    PetscMalloc(3*mmlocal_ele[0]*sizeof(PetscInt),&tmpele);
    for (j=1; j<size; j++) {
      printf("Elements assigned to processor %d\n",j);
      for (i=0; i<mmlocal_ele[j]; i++) {
        fscanf(fd,"%d %d %d %d\n",&cnt,tmpele+3*i,tmpele+3*i+1,tmpele+3*i+2);
        printf("%d %d %d %d\n",cnt,tmpele[3*i],tmpele[3*i+1],tmpele[3*i+2]);
      }
      MPI_Send(tmpele,3*mmlocal_ele[j],MPI_INT,j,0,PETSC_COMM_WORLD);
    }
    PetscFree(tmpele);
    /*
       Read in element neighbors for processor 0.
       We don't know how many entries ja[] needs, so we allocate
       3 times the number of local elements, which is the maximum it could be.
    */
    PetscMalloc((mlocal_ele+1)*sizeof(PetscInt),&ia);
    PetscMalloc((3*mlocal_ele+1)*sizeof(PetscInt),&ja);
    net   = 0;
    ia[0] = 0;
    printf("Element neighbors on processor 0\n");
    fgets(msg,128,fd);
    for (i=0; i<mlocal_ele; i++) {
      fscanf(fd,"%d %d %d %d\n",&cnt,&a1,&a2,&a3);
      printf("%d %d %d %d\n",cnt,a1,a2,a3);
      if (a1 >= 0) {ja[net++] = a1;}
      if (a2 >= 0) {ja[net++] = a2;}
      if (a3 >= 0) {ja[net++] = a3;}
      ia[i+1] = net;
    }
311: printf("ia values for processor 0\n");
312: for (i=0; i<mlocal_ele+1; i++) {
313: printf("%d ",ia[i]);
314: }
315: printf("\n");
316: printf("ja values for processor 0\n");
317: for (i=0; i<ia[mlocal_ele]; i++) {
318: printf("%d ",ja[i]);
319: }
320: printf("\n");
322: /*
323: Read in element neighbor information for all other processors
324: */
325: PetscMalloc((mlocal_ele+1)*sizeof(PetscInt),&iatmp);
326: PetscMalloc((3*mlocal_ele+1)*sizeof(PetscInt),&jatmp);
327: for (j=1; j<size; j++) {
328: net = 0;
329: iatmp[0] = 0;
330: printf("Element neighbors on processor %d\n",j);
331: for (i=0; i<mmlocal_ele[j]; i++) {
332: fscanf(fd,"%d %d %d %d\n",&cnt,&a1,&a2,&a3);
333: printf("%d %d %d %d\n",cnt,a1,a2,a3);
334: if (a1 >= 0) {jatmp[net++] = a1;}
335: if (a2 >= 0) {jatmp[net++] = a2;}
336: if (a3 >= 0) {jatmp[net++] = a3;}
337: iatmp[i+1] = net;
338: }
340: printf("ia values for processor %d\n",j);
341: for (i=0; i<mmlocal_ele[j]+1; i++) {
342: printf("%d ",iatmp[i]);
343: }
344: printf("\n");
345: printf("ja values for processor %d\n",j);
346: for (i=0; i<iatmp[mmlocal_ele[j]]; i++) {
347: printf("%d ",jatmp[i]);
348: }
349: printf("\n");
351: /* send graph off to appropriate processor */
352: MPI_Send(iatmp,mmlocal_ele[j]+1,MPI_INT,j,0,PETSC_COMM_WORLD);
353: MPI_Send(jatmp,iatmp[mmlocal_ele[j]],MPI_INT,j,0,PETSC_COMM_WORLD);
354: }
355: PetscFree(iatmp);
356: PetscFree(jatmp);
357: PetscFree(mmlocal_ele);
359: fclose(fd);
  } else {
    /*
       We are not the zeroth processor, so we do not open the file;
       rather we wait for processor 0 to send us our data.
    */

    /* receive total number of vertices */
    MPI_Bcast(&n_vert,1,MPI_INT,0,PETSC_COMM_WORLD);
    mlocal_vert = n_vert/size + ((n_vert % size) > rank);

    /* receive vertices */
    PetscMalloc(2*(mlocal_vert+1)*sizeof(PetscScalar),&vert);
    MPI_Recv(vert,2*mlocal_vert,MPIU_SCALAR,0,0,PETSC_COMM_WORLD,&status);

    /* receive total number of elements */
    MPI_Bcast(&n_ele,1,MPI_INT,0,PETSC_COMM_WORLD);
    mlocal_ele = n_ele/size + ((n_ele % size) > rank);

    /* receive elements */
    PetscMalloc(3*(mlocal_ele+1)*sizeof(PetscInt),&ele);
    MPI_Recv(ele,3*mlocal_ele,MPI_INT,0,0,PETSC_COMM_WORLD,&status);

    /* receive element adjacency graph */
    PetscMalloc((mlocal_ele+1)*sizeof(PetscInt),&ia);
    MPI_Recv(ia,mlocal_ele+1,MPI_INT,0,0,PETSC_COMM_WORLD,&status);

    PetscMalloc((ia[mlocal_ele]+1)*sizeof(PetscInt),&ja);
    MPI_Recv(ja,ia[mlocal_ele],MPI_INT,0,0,PETSC_COMM_WORLD,&status);
  }

  gdata->n_vert      = n_vert;
  gdata->n_ele       = n_ele;
  gdata->mlocal_vert = mlocal_vert;
  gdata->mlocal_ele  = mlocal_ele;
  gdata->ele         = ele;
  gdata->vert        = vert;

  gdata->ia = ia;
  gdata->ja = ja;

  return(0);
}
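/*
    The note at the top of DataRead() says that a truly scalable reader would store the
    grid in a binary file and use MPI-IO so that each process reads only its own part.
    The helper below is a minimal sketch of that idea for the vertex coordinates only;
    it is not called by the example, and the file name "usgdata.bin", its assumed layout
    (a contiguous array of x,y pairs stored as doubles, in the same order as the text
    file), and the helper itself are illustrative assumptions. It also assumes a real,
    double-precision PetscScalar and an MPI implementation that provides MPI-IO.
*/
static PetscErrorCode DataReadVerticesMPIIO(MPI_Comm comm,PetscInt mlocal_vert,PetscScalar *vert)
{
  MPI_File    fh;
  long long   mine = 2*(long long)mlocal_vert,before = 0;
  PetscMPIInt rank;

  /* exclusive prefix sum: number of doubles stored by the processes before me */
  MPI_Exscan(&mine,&before,1,MPI_LONG_LONG,MPI_SUM,comm);
  MPI_Comm_rank(comm,&rank);
  if (!rank) before = 0;  /* MPI_Exscan leaves the receive buffer undefined on rank 0 */

  MPI_File_open(comm,"usgdata.bin",MPI_MODE_RDONLY,MPI_INFO_NULL,&fh);
  /* every process reads its own contiguous block of x,y coordinates */
  MPI_File_read_at_all(fh,(MPI_Offset)(before*sizeof(double)),vert,(int)mine,MPI_DOUBLE,MPI_STATUS_IGNORE);
  MPI_File_close(&fh);
  return(0);
}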
/*
    Given the grid data spread across the processors, determines a new
  partitioning of the CELLS (elements) to reduce the number of cut edges
  between cells (elements).
*/
PetscErrorCode DataPartitionElements(GridData *gdata)
{
  Mat             Adj;       /* adjacency matrix */
  PetscInt        *ia,*ja;
  PetscInt        mlocal_ele,n_ele;
  MatPartitioning part;
  IS              isnewproc;

  n_ele      = gdata->n_ele;
  mlocal_ele = gdata->mlocal_ele;

  ia = gdata->ia;
  ja = gdata->ja;

  /*
     Create the adjacency graph matrix
  */
  MatCreateMPIAdj(PETSC_COMM_WORLD,mlocal_ele,n_ele,ia,ja,PETSC_NULL,&Adj);

  /*
     Create the partitioning object
  */
  MatPartitioningCreate(PETSC_COMM_WORLD,&part);
  MatPartitioningSetAdjacency(part,Adj);
  MatPartitioningSetFromOptions(part);
  MatPartitioningApply(part,&isnewproc);
  MatPartitioningDestroy(&part);

  /*
     isnewproc - indicates for each local element the new processor it is assigned to
  */
  PetscPrintf(PETSC_COMM_WORLD,"New processor assignment for each element\n");
  ISView(isnewproc,PETSC_VIEWER_STDOUT_WORLD);
  gdata->isnewproc = isnewproc;

  /*
     Free the adjacency graph data structures
  */
  MatDestroy(&Adj);

  return(0);
}
/*
    Moves the grid element data to be on the correct processor for the new
  element partitioning.
*/
PetscErrorCode DataMoveElements(GridData *gdata)
{
  PetscInt       *counts,i,*tidx;
  const PetscInt *idx;
  PetscMPIInt    rank,size;
  Vec            vele,veleold;
  PetscScalar    *array;
  IS             isscat,isnum;
  VecScatter     vecscat;

  MPI_Comm_size(PETSC_COMM_WORLD,&size);
  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

  /*
     Determine how many elements are assigned to each processor
  */
  PetscMalloc(size*sizeof(PetscInt),&counts);
  ISPartitioningCount(gdata->isnewproc,size,counts);
  /*
     Create a vector to contain the newly ordered element information.

     Note: we use vectors to communicate this data since we can use the powerful
     VecScatter routines to get the data to the correct location. This is a little
     wasteful since the vectors hold double-precision numbers instead of integers,
     but because this is just a setup phase of the entire numerical computation that
     is only called once, it is not a measurable performance bottleneck.
  */
  VecCreate(PETSC_COMM_WORLD,&vele);
  VecSetSizes(vele,3*counts[rank],PETSC_DECIDE);
  VecSetFromOptions(vele);

  /*
     Create an index set from the isnewproc index set to indicate the mapping TO
  */
  ISPartitioningToNumbering(gdata->isnewproc,&isnum);
  ISDestroy(&gdata->isnewproc);
  /*
     There are three data items per cell (element): the global numbers of its three
     vertices (converted to double so the scatter can be used). One can think of the
     vectors as having a block size of 3; then there is one index in idx[] for each element.
  */
  ISGetIndices(isnum,&idx);
  PetscMalloc(gdata->mlocal_ele*sizeof(PetscInt),&tidx);
  for (i=0; i<gdata->mlocal_ele; i++) {
    tidx[i] = idx[i];
  }
  ISCreateBlock(PETSC_COMM_WORLD,3,gdata->mlocal_ele,tidx,PETSC_COPY_VALUES,&isscat);
  ISRestoreIndices(isnum,&idx);
  PetscFree(tidx);
  ISDestroy(&isnum);
  /*
     Create a vector to contain the original vertex information for each element
  */
  VecCreateSeq(PETSC_COMM_SELF,3*gdata->mlocal_ele,&veleold);
  VecGetArray(veleold,&array);
  for (i=0; i<3*gdata->mlocal_ele; i++) {
    array[i] = gdata->ele[i];
  }
  VecRestoreArray(veleold,&array);
  /*
     Scatter the element vertex information (still in the original vertex ordering) to the correct processor
  */
  VecScatterCreate(veleold,PETSC_NULL,vele,isscat,&vecscat);
  ISDestroy(&isscat);
  VecScatterBegin(vecscat,veleold,vele,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(vecscat,veleold,vele,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterDestroy(&vecscat);
  VecDestroy(&veleold);
  /*
     Put the element vertex data into a new allocation of gdata->ele
  */
  PetscFree(gdata->ele);
  gdata->mlocal_ele = counts[rank];
  PetscFree(counts);
  PetscMalloc(3*gdata->mlocal_ele*sizeof(PetscInt),&gdata->ele);
  VecGetArray(vele,&array);
  for (i=0; i<3*gdata->mlocal_ele; i++) {
    gdata->ele[i] = (int)PetscRealPart(array[i]);
  }
  VecRestoreArray(vele,&array);
  VecDestroy(&vele);

  PetscPrintf(PETSC_COMM_WORLD,"Old vertex numbering in new element ordering\n");
  PetscSynchronizedPrintf(PETSC_COMM_WORLD,"Processor %d\n",rank);
  for (i=0; i<gdata->mlocal_ele; i++) {
    PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%d %d %d %d\n",i,gdata->ele[3*i],gdata->ele[3*i+1],
                            gdata->ele[3*i+2]);
  }
  PetscSynchronizedFlush(PETSC_COMM_WORLD);

  return(0);
}
/*
    Given the newly partitioned cells (elements), this routine partitions the
  vertices.

  The code is not completely scalable since it requires
  1) O(n_vert) bits of memory per processor
  2) O(size) stages of communication, each of size O(n_vert) bits
  3) sequential execution (each processor marks its vertices ONLY after all processors
     to the left have marked theirs)
  4) O(n_vert) work on the last processor

  The algorithm also does not take advantage of vertices that are "interior" to a
  processor's elements (that is, not contained in any element on another processor).
  A better algorithm would first have all processors determine the "interior" vertices and
  make sure they are retained on that processor before listing the "boundary" vertices.
  (A minimal sketch of finding the interior vertices is included after this routine.)

  The algorithm is:
     a) each processor waits for a message from the left containing the mask of all marked vertices
     b) it loops over all local elements, generating a list of vertices it will
        claim (not claiming ones that have already been marked in the bit array);
        it claims at most n_vert/size vertices
     c) it sends the mask on to the right

  This is a quick-and-dirty implementation; it should work fine for many problems,
  but will need to be replaced once profiling shows that it takes a large amount of
  time. An advantage is that it requires no searching or sorting.
*/
PetscErrorCode DataPartitionVertices(GridData *gdata)
{
  PetscInt    n_vert = gdata->n_vert,*localvert;
  PetscInt    mlocal_ele = gdata->mlocal_ele,*ele = gdata->ele,i,j,nlocal = 0,nmax;
  PetscMPIInt rank,size;
  PetscBT     mask;
  MPI_Status  status;

  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
  MPI_Comm_size(PETSC_COMM_WORLD,&size);

  /*
     Allocate space for the bit array indicating which vertices have been marked
  */
  PetscBTCreate(n_vert,&mask);

  /*
     All processors except the last can have a maximum of n_vert/size vertices assigned
     (because the last processor needs to handle any leftovers)
  */
  nmax = n_vert/size;
  if (rank == size-1) {
    nmax = n_vert;
  }

  /*
     Receive list of marked vertices from left
  */
  if (rank) {
    MPI_Recv(mask,PetscBTLength(n_vert),MPI_CHAR,rank-1,0,PETSC_COMM_WORLD,&status);
  }

  if (rank == size-1) {
    /* last processor gets all the rest */
    for (i=0; i<n_vert; i++) {
      if (!PetscBTLookup(mask,i)) {
        nlocal++;
      }
    }
    nmax = nlocal;
  }
  /*
     Now that we know how many vertices are local, allocate enough space for them and mark them
  */
  PetscMalloc((nmax+1)*sizeof(PetscInt),&localvert);

  /* generate local list and fill in mask */
  nlocal = 0;
  if (rank < size-1) {
    /* count my vertices */
    for (i=0; i<mlocal_ele; i++) {
      for (j=0; j<3; j++) {
        if (!PetscBTLookupSet(mask,ele[3*i+j])) {
          localvert[nlocal++] = ele[3*i+j];
          if (nlocal >= nmax) goto foundenough2;
        }
      }
    }
    foundenough2:;
  } else {
    /* last processor gets all the rest */
    for (i=0; i<n_vert; i++) {
      if (!PetscBTLookup(mask,i)) {
        localvert[nlocal++] = i;
      }
    }
  }
  /*
     Send bit mask on to next processor
  */
  if (rank < size-1) {
    MPI_Send(mask,PetscBTLength(n_vert),MPI_CHAR,rank+1,0,PETSC_COMM_WORLD);
  }
  PetscBTDestroy(&mask);

  gdata->localvert = localvert;
  gdata->nlocal    = nlocal;

  /* print lists of owned vertices */
  PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] Number vertices assigned %d\n",rank,nlocal);
  PetscSynchronizedFlush(PETSC_COMM_WORLD);
  PetscIntView(nlocal,localvert,PETSC_VIEWER_STDOUT_WORLD);

  return(0);
}
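/*
    The comment before DataPartitionVertices() notes that a better algorithm would first
    determine the "interior" vertices, i.e. vertices touched only by elements stored on a
    single process. The helper below is a minimal sketch of one way to find them; it is not
    called by the example, the helper and its name are illustrative assumptions, and it
    trades the O(n_vert) bit array above for two O(n_vert) integer arrays per process plus
    one MPI_Allreduce.
*/
static PetscErrorCode DataFindInteriorVertices(GridData *gdata,PetscBT *interior)
{
  PetscInt n_vert = gdata->n_vert,i,*touched,*ntouch;
  PetscBT  bt;

  PetscMalloc(n_vert*sizeof(PetscInt),&touched);
  PetscMalloc(n_vert*sizeof(PetscInt),&ntouch);
  PetscMemzero(touched,n_vert*sizeof(PetscInt));
  /* flag every vertex that appears in one of my locally stored elements */
  for (i=0; i<3*gdata->mlocal_ele; i++) touched[gdata->ele[i]] = 1;
  /* count how many processes touch each vertex */
  MPI_Allreduce(touched,ntouch,n_vert,MPIU_INT,MPI_SUM,PETSC_COMM_WORLD);
  /* a vertex that I touch and that only one process touches is interior to my elements */
  PetscBTCreate(n_vert,&bt);
  for (i=0; i<n_vert; i++) {
    if (touched[i] && ntouch[i] == 1) PetscBTSet(bt,i);
  }
  PetscFree(touched);
  PetscFree(ntouch);
  *interior = bt;
  return(0);
}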
/*
     Given the partitioning of the vertices, renumbers the element vertex lists for the
   new vertex numbering and moves the vertex coordinate values to the correct processor
*/
PetscErrorCode DataMoveVertices(GridData *gdata)
{
  AO          ao;
  PetscMPIInt rank;
  PetscInt    i;
  Vec         vert,overt;
  VecScatter  vecscat;
  IS          isscat;
  PetscScalar *avert;

  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
  /* ---------------------------------------------------------------------
       Create a global reordering of the vertex numbers
  */
  AOCreateBasic(PETSC_COMM_WORLD,gdata->nlocal,gdata->localvert,PETSC_NULL,&ao);

  /*
     Change the element vertex information to the new vertex numbering
  */
  AOApplicationToPetsc(ao,3*gdata->mlocal_ele,gdata->ele);
  PetscPrintf(PETSC_COMM_WORLD,"New vertex numbering in new element ordering\n");
  PetscSynchronizedPrintf(PETSC_COMM_WORLD,"Processor %d\n",rank);
  for (i=0; i<gdata->mlocal_ele; i++) {
    PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%d %d %d %d\n",i,gdata->ele[3*i],gdata->ele[3*i+1],
                            gdata->ele[3*i+2]);
  }
  PetscSynchronizedFlush(PETSC_COMM_WORLD);
  /*
     Destroy the AO that is no longer needed
  */
  AODestroy(&ao);

  /* --------------------------------------------------------------------
      Finally ship the vertex coordinate information to its owning process;
      note, we do this in a way very similar to what was done for the element info
  */
  /* create a vector to contain the newly ordered vertex information */
  VecCreateSeq(PETSC_COMM_SELF,2*gdata->nlocal,&vert);

  /* create a vector to contain the old ordered vertex information */
  VecCreateMPIWithArray(PETSC_COMM_WORLD,1,2*gdata->mlocal_vert,PETSC_DECIDE,gdata->vert,&overt);
  /*
     There are two data items per vertex, the x and y coordinates (i.e. one can think
     of the vectors as having a block size of 2, and there is one index in localvert[] for each block)
  */
  ISCreateBlock(PETSC_COMM_WORLD,2,gdata->nlocal,gdata->localvert,PETSC_COPY_VALUES,&isscat);
  PetscFree(gdata->localvert);
  /*
     Scatter the vertex coordinate information to the correct processor
  */
  VecScatterCreate(overt,isscat,vert,PETSC_NULL,&vecscat);
  ISDestroy(&isscat);
  VecScatterBegin(vecscat,overt,vert,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(vecscat,overt,vert,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterDestroy(&vecscat);

  VecDestroy(&overt);
  PetscFree(gdata->vert);

  /*
      Put resulting vertex information into gdata->vert array
  */
  PetscMalloc(2*gdata->nlocal*sizeof(PetscScalar),&gdata->vert);
  VecGetArray(vert,&avert);
  PetscMemcpy(gdata->vert,avert,2*gdata->nlocal*sizeof(PetscScalar));
  VecRestoreArray(vert,&avert);
  gdata->mlocal_vert = gdata->nlocal;
  VecDestroy(&vert);

  PetscPrintf(PETSC_COMM_WORLD,"Vertex coordinates in new numbering\n");
  for (i=0; i<gdata->mlocal_vert; i++) {
    PetscSynchronizedPrintf(PETSC_COMM_WORLD,"(%g,%g)\n",gdata->vert[2*i],gdata->vert[2*i+1]);
  }
  PetscSynchronizedFlush(PETSC_COMM_WORLD);

  return(0);
}
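/*
    The introductory comment says that, to compute the local element stiffness matrices, each
    process needs the coordinates of all vertices of its "owned" elements and that this data
    could be obtained with a vector scatter on gdata->vert. The helper below is a minimal
    sketch of such a gather, intended to be called after DataMoveVertices(); it is not called
    by the example, and the helper and the output layout (x1,y1,x2,y2,x3,y3 for each local
    element, duplicating shared vertices) are illustrative assumptions.
*/
static PetscErrorCode DataGatherElementCoords(GridData *gdata,PetscScalar **elecoords)
{
  Vec         overt,lvert;
  IS          isvert;
  VecScatter  vecscat;
  PetscScalar *array;
  PetscInt    nidx = 3*gdata->mlocal_ele;

  /* view the owned vertex coordinates as a parallel vector, as in DataMoveVertices() */
  VecCreateMPIWithArray(PETSC_COMM_WORLD,1,2*gdata->mlocal_vert,PETSC_DECIDE,gdata->vert,&overt);
  /* one block (x,y) index per element vertex; after the AO renumbering the vertex numbers
     in gdata->ele are exactly the global block indices of overt */
  ISCreateBlock(PETSC_COMM_WORLD,2,nidx,gdata->ele,PETSC_COPY_VALUES,&isvert);
  VecCreateSeq(PETSC_COMM_SELF,2*nidx,&lvert);

  /* gather the coordinates of every vertex of every local element */
  VecScatterCreate(overt,isvert,lvert,PETSC_NULL,&vecscat);
  ISDestroy(&isvert);
  VecScatterBegin(vecscat,overt,lvert,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(vecscat,overt,lvert,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterDestroy(&vecscat);
  VecDestroy(&overt);

  /* copy the gathered coordinates into a plain array owned by the caller */
  PetscMalloc(2*nidx*sizeof(PetscScalar),elecoords);
  VecGetArray(lvert,&array);
  PetscMemcpy(*elecoords,array,2*nidx*sizeof(PetscScalar));
  VecRestoreArray(lvert,&array);
  VecDestroy(&lvert);
  return(0);
}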
PetscErrorCode DataDestroy(GridData *gdata)
{
  PetscFree(gdata->ele);
  PetscFree(gdata->vert);
  return(0);
}