Actual source code: xmllogevent.c

petsc-3.7.3 2016-08-01
Report Typos and Errors
  1: /*************************************************************************************
  2:  *    M A R I T I M E  R E S E A R C H  I N S T I T U T E  N E T H E R L A N D S     *
  3:  *************************************************************************************
  4:  *    authors: Bas van 't Hof, Koos Huijssen, Christiaan M. Klaij                    *
  5:  *************************************************************************************
  6:  *    content: Support for nested PetscTimers                                        *
  7:  *************************************************************************************/
  8: #include <petsclog.h>
  9: #include <petsc/private/logimpl.h>
 10: #include <petsctime.h>
 11: #include <petscviewer.h>
 12:  #include ../src/sys/logging/xmllogevent.h
 13:  #include ../src/sys/logging/xmlviewer.h

 15: #if defined(PETSC_USE_LOG)

 17: /*
 18:  * Support for nested PetscTimers
 19:  *
 20:  * PetscTimers keep track of a lot of useful information: Wall clock times, 
 21:  * message passing statistics, flop counts.  Information about the nested structure 
 22:  * of the timers is lost. Example: 
 23:  *
 24:  * 7:30   Start: awake
 25:  * 7:30      Start: morning routine
 26:  * 7:40         Start: eat
 27:  * 7:49         Done:  eat
 28:  * 7:43      Done:  morning routine
 29:  * 8:15      Start: work
 30:  * 12:15        Start: eat
 31:  * 12:45        Done:  eat
 32:  * 16:00     Done:  work
 33:  * 16:30     Start: evening routine
 34:  * 18:30        Start: eat
 35:  * 19:15        Done:  eat
 36:  * 22:00     Done:  evening routine
 37:  * 22:00  Done:  awake
 38:  *
 39:  * Petsc timers provide the following timer results:
 40:  *
 41:  *    awake:              1 call    14:30 hours
 42:  *    morning routine:    1 call     0:13 hours
 43:  *    eat:                3 calls    1:24 hours
 44:  *    work:               1 call     7:45 hours
 45:  *    evening routine     1 call     5:30 hours
 46:  *
 47:  * Nested timers can be used to get the following table:
 48:  *
 49:  *   [1 call]: awake                14:30 hours
 50:  *   [1 call]:    morning routine         0:13 hours         ( 2 % of awake)
 51:  *   [1 call]:       eat                       0:09 hours         (69 % of morning routine)
 52:  *                   rest (morning routine)    0:04 hours         (31 % of morning routine)
 53:  *   [1 call]:    work                    7:45 hours         (53 % of awake)
 54:  *   [1 call]:       eat                       0:30 hours         ( 6 % of work)
 55:  *                   rest (work)               7:15 hours         (94 % of work)
 56:  *   [1 call]:    evening routine         5:30 hours         (38 % of awake)
 57:  *   [1 call]:       eat                       0:45 hours         (14 % of evening routine)
 58:  *                   rest (evening routine)    4:45 hours         (86 % of morning routine)
 59:  *
 60:  * We ignore the concept of 'stages', because these seem to be conflicting notions, or at least, 
 61:  * the nested timers make the stages unnecessary.
 62:  *
 63:  */

 65: /*
 66:  * Data structures for keeping track of nested timers:
 67:  *
 68:  *   nestedEvents: information about the timers that have actually been activated
 69:  *   dftParentActive: if a timer is started now, it is part of (nested inside) the dftParentActive
 70:  *
 71:  * The Default-timers are used to time the nested timers. Every nested timer corresponds to 
 72:  * (one or more) default timers, where one of the default timers has the same event-id as the 
 73:  * nested one.
 74:  *
 75:  * Because of the risk of confusion between nested timer ids and default timer ids, we 
 76:  * introduce a typedef for nested events (NestedEventId) and use the existing type PetscLogEvent 
 77:  * only for default events. Also, all nested event variables are prepended with 'nst', and
 78:  * default timers with 'dft'.
 79:  */
 80: typedef PetscLogEvent NestedEventId;
 81: typedef struct {
 82:   NestedEventId   nstEvent;         /* event-code for this nested event, argument 'event' in PetscLogEventStartNested */
 83:   int             nParents;         /* number of 'dftParents': the default timer which was the dftParentActive when this nested timer was activated */
 84:   PetscLogEvent  *dftParentsSorted; /* The default timers which were the dftParentActive when this nested event was started */
 85:   PetscLogEvent  *dftEvents;        /* The default timers which represent the different 'instances' of this nested event */

 87:   PetscLogEvent  *dftParents;       /* The default timers which were the dftParentActive when this nested event was started */
 88:   PetscLogEvent  *dftEventsSorted;  /* The default timers which represent the different 'instances' of this nested event */
 89: } PetscNestedEvent;

 91: static PetscLogEvent    dftParentActive  = 0;
 92: static int              nNestedEvents           = 0;
 93: static int              nNestedEventsAllocated  = 0;
 94: static PetscNestedEvent *nestedEvents = NULL;
 95: static PetscLogDouble   threshTime      = 0.01; /* initial value was .1 */

 97: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
 98: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);

102: PetscErrorCode PetscLogNestedBegin(void)
103: {
104:   PetscErrorCode    ierr;
106:   if (nestedEvents) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"nestedEvents already allocated");

108:   nNestedEventsAllocated=10;
109:   PetscMalloc1(nNestedEventsAllocated,&nestedEvents);


112:   dftParentActive = 0;
113:   nNestedEvents =1;

115:   /* 'Awake' is nested event 0. It has no parents */
116:   nestedEvents[0].nstEvent          = 0;
117:   nestedEvents[0].nParents          = 0;
118:   nestedEvents[0].dftParentsSorted  = NULL;
119:   nestedEvents[0].dftEvents         = NULL;
120:   nestedEvents[0].dftParents        = NULL;
121:   nestedEvents[0].dftEventsSorted   = NULL;

123:   PetscLogSet(PetscLogEventBeginNested, PetscLogEventEndNested);
124:   return(0);
125: }

127: /* Delete the data structures for the nested timers */
130: PetscErrorCode PetscLogNestedEnd(void)
131: {
132:   PetscErrorCode    ierr;
133:   int               i;

136:   if (!nestedEvents) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"nestedEvents does not exist");

138:   for (i=0; i<nNestedEvents; i++) {
139:     PetscFree4(nestedEvents[i].dftParentsSorted,nestedEvents[i].dftEventsSorted,nestedEvents[i].dftParents,nestedEvents[i].dftEvents);
140:   }
141:   PetscFree(nestedEvents);
142:   nestedEvents           = NULL;
143:   nNestedEvents          = 0;
144:   nNestedEventsAllocated = 0;
145:   return(0);
146: }


149: /*
150:  * UTILITIES: FIND STUFF IN SORTED ARRAYS
151:  *
152:  * Utility: find a default timer in a sorted array */
155: static PetscErrorCode PetscLogEventFindDefaultTimer(PetscLogEvent dftIndex,    /* index to be found */
156:                                                     const PetscLogEvent *dftArray,  /* sorted array of PetscLogEvent-ids */
157:                                                     int narray,   /* dimension of dftArray */
158:                                                     int *entry)         /* entry in the array where dftIndex may be found; 
159:                                                                             *   if dftArray[entry] != dftIndex, then dftIndex is not part of dftArray  
160:                                                                             *   In that case, the dftIndex can be inserted at this entry. */
161: {
163:   if (narray==0 || dftIndex <= dftArray[0]) {
164:     *entry = 0;
165:   } else if (dftIndex > dftArray[narray-1]) {
166:     *entry = narray;
167:   } else {
168:     int ihigh=narray-1,  ilow=0;
169:     while (ihigh>ilow) {
170:       const int imiddle = (ihigh+ilow)/2;
171:       if (dftArray[imiddle] > dftIndex) {
172:         ihigh=imiddle;
173:       } else if (dftArray[imiddle]<dftIndex) {
174:         ilow =imiddle+1;
175:       } else {
176:         ihigh=imiddle;
177:         ilow =imiddle;
178:       }
179:     }
180:     *entry = ihigh;
181:   }
182:   return(0);
183: }

185: /* Utility: find the nested event with given identification */
188: static PetscErrorCode PetscLogEventFindNestedTimer(NestedEventId nstEvent, /* Nested event to be found */
189:                                                    int *entry)          /* entry in the nestedEvents where nstEvent may be found;
190:                                                                               if nestedEvents[entry].nstEvent != nstEvent, then index is not part of iarray  */
191: {

194:   if (nNestedEvents==0 || nstEvent <= nestedEvents[0].nstEvent) {
195:     *entry = 0;
196:   } else if (nstEvent > nestedEvents[nNestedEvents-1].nstEvent) {
197:     *entry = nNestedEvents;
198:   } else {
199:     int ihigh=nNestedEvents-1,  ilow=0;
200:     while (ihigh>ilow) {
201:       const int imiddle = (ihigh+ilow)/2;
202:       if (nestedEvents[imiddle].nstEvent > nstEvent) {
203:         ihigh=imiddle;
204:       } else if (nestedEvents[imiddle].nstEvent<nstEvent) {
205:         ilow =imiddle+1;
206:       } else {
207:         ihigh=imiddle;
208:         ilow =imiddle;
209:       }
210:     }
211:     *entry = ihigh;
212:   }
213:   return(0);
214: }

216: /******************************************************************************************/
217: /* Start a nested event */
220: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
221: {
222:   PetscErrorCode  ierr;
223:   int             entry, pentry, tentry,i;
224:   PetscLogEvent   dftEvent;

227:   PetscLogEventFindNestedTimer(nstEvent, &entry);
228:   if (entry>=nNestedEvents || nestedEvents[entry].nstEvent != nstEvent) {
229:     /* Nested event doesn't exist yet: create it */

231:     if (nNestedEvents==nNestedEventsAllocated) {
232:       /* Enlarge and re-allocate nestedEvents if needed */
233:       PetscNestedEvent *tmp = nestedEvents;
234:       PetscMalloc1(2*nNestedEvents,&nestedEvents);
235:       nNestedEventsAllocated*=2;
236:       PetscMemcpy(nestedEvents, tmp, nNestedEvents*sizeof(PetscNestedEvent));
237:       PetscFree(tmp);
238:     }

240:     /* Clear space in nestedEvents for new nested event */
241:     nNestedEvents++;
242:     for (i = nNestedEvents-1; i>entry; i--) {
243:       nestedEvents[i] = nestedEvents[i-1];
244:     }

246:     /* Create event in nestedEvents */
247:     nestedEvents[entry].nstEvent = nstEvent;
248:     nestedEvents[entry].nParents=1;
249:     PetscMalloc4(1,&nestedEvents[entry].dftParentsSorted,1,&nestedEvents[entry].dftEventsSorted,1,&nestedEvents[entry].dftParents,1,&nestedEvents[entry].dftEvents);

251:     /* Fill in new event */
252:     pentry = 0;
253:     dftEvent = (PetscLogEvent) nstEvent;

255:     nestedEvents[entry].nstEvent                 = nstEvent;
256:     nestedEvents[entry].dftParents[pentry]       = dftParentActive;
257:     nestedEvents[entry].dftEvents[pentry]        = dftEvent;
258:     nestedEvents[entry].dftParentsSorted[pentry] = dftParentActive;
259:     nestedEvents[entry].dftEventsSorted[pentry]  = dftEvent;

261:   } else {
262:     /* Nested event exists: find current dftParentActive among parents */
263:     PetscLogEvent *dftParentsSorted = nestedEvents[entry].dftParentsSorted;
264:     PetscLogEvent *dftEvents        = nestedEvents[entry].dftEvents;
265:     int           nParents          = nestedEvents[entry].nParents;

267:     PetscLogEventFindDefaultTimer( dftParentActive, dftParentsSorted, nParents, &pentry);

269:     if (pentry>=nParents || dftParentActive != dftParentsSorted[pentry]) {
270:       /* dftParentActive not in the list: add it to the list */
271:       int           i;
272:       PetscLogEvent *dftParents      = nestedEvents[entry].dftParents;
273:       PetscLogEvent *dftEventsSorted = nestedEvents[entry].dftEventsSorted;
274:       char          name[100];

276:       /* Register a new default timer */
277:       sprintf(name, "%d -> %d", (int) dftParentActive, (int) nstEvent);
278:       PetscLogEventRegister(name, 0, &dftEvent);
279:       PetscLogEventFindDefaultTimer( dftEvent, dftEventsSorted, nParents, &tentry);

281:       /* Reallocate parents and dftEvents to make space for new parent */
282:       PetscMalloc4(1+nParents,&nestedEvents[entry].dftParentsSorted,1+nParents,&nestedEvents[entry].dftEventsSorted,1+nParents,&nestedEvents[entry].dftParents,1+nParents,&nestedEvents[entry].dftEvents);
283:       PetscMemcpy(nestedEvents[entry].dftParentsSorted, dftParentsSorted, nParents*sizeof(PetscLogEvent));
284:       PetscMemcpy(nestedEvents[entry].dftEventsSorted,  dftEventsSorted,  nParents*sizeof(PetscLogEvent));
285:       PetscMemcpy(nestedEvents[entry].dftParents,       dftParents,       nParents*sizeof(PetscLogEvent));
286:       PetscMemcpy(nestedEvents[entry].dftEvents,        dftEvents,        nParents*sizeof(PetscLogEvent));
287:       PetscFree4(dftParentsSorted,dftEventsSorted,dftParents,dftEvents);

289:       dftParents       = nestedEvents[entry].dftParents;
290:       dftEvents        = nestedEvents[entry].dftEvents;
291:       dftParentsSorted = nestedEvents[entry].dftParentsSorted;
292:       dftEventsSorted  = nestedEvents[entry].dftEventsSorted;

294:       nestedEvents[entry].nParents++;
295:       nParents++;

297:       for (i = nParents-1; i>pentry; i--) {
298:         dftParentsSorted[i] = dftParentsSorted[i-1];
299:         dftEvents[i]        = dftEvents[i-1];
300:       }
301:       for (i = nParents-1; i>tentry; i--) {
302:         dftParents[i]      = dftParents[i-1];
303:         dftEventsSorted[i] = dftEventsSorted[i-1];
304:       }

306:       /* Fill in the new default timer */
307:       dftParentsSorted[pentry] = dftParentActive;
308:       dftEvents[pentry]        = dftEvent;
309:       dftParents[tentry]       = dftParentActive;
310:       dftEventsSorted[tentry]  = dftEvent;

312:     } else {
313:       /* dftParentActive was found: find the corresponding default 'dftEvent'-timer */
314:       dftEvent = nestedEvents[entry].dftEvents[pentry];
315:     }
316:   }

318:   /* Start the default 'dftEvent'-timer and update the dftParentActive */
319:   PetscLogEventBeginDefault(dftEvent,t,o1,o2,o3,o4);
320:   dftParentActive = dftEvent;
321:   return(0);
322: }

324: /* End a nested event */
327: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
328: {
329:   PetscErrorCode  ierr;
330:   int             entry, pentry, nParents;
331:   PetscLogEvent  *dftEventsSorted;

334:   /* Find the nested event */
335:   PetscLogEventFindNestedTimer(nstEvent, &entry);
336:   if (entry>=nNestedEvents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d larger than number of events %d",entry,nNestedEvents);
337:   if (nestedEvents[entry].nstEvent != nstEvent) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d had unbalanced begin/end pairs does not match %d",entry,nstEvent);
338:   dftEventsSorted = nestedEvents[entry].dftEventsSorted;
339:   nParents        = nestedEvents[entry].nParents;

341:   /* Find the current default timer among the 'dftEvents' of this event */
342:   PetscLogEventFindDefaultTimer( dftParentActive, dftEventsSorted, nParents, &pentry);

344:   if (pentry>=nParents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Entry %d is larger than number of parents %d",pentry,nParents);
345:   if (dftEventsSorted[pentry] != dftParentActive) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Active parent is %d, but we seem to be closing %d",dftParentActive,dftEventsSorted[pentry]);

347:   /* Stop the default timer and update the dftParentActive */
348:   PetscLogEventEndDefault(dftParentActive,t,o1,o2,o3,o4);
349:   dftParentActive = nestedEvents[entry].dftParents[pentry];
350:   return(0);
351: }

353: /* Set the threshold time for logging the events 
354:  */
357: PetscErrorCode PetscLogSetThreshold(PetscLogDouble newThresh, PetscLogDouble *oldThresh)
358: {
360:   *oldThresh = threshTime;
361:   threshTime = newThresh;
362:   return(0);
363: }

367: static PetscErrorCode  PetscPrintExeSpecs(PetscViewer viewer)
368: {
369:   PetscErrorCode     ierr;
370:   char               arch[128],hostname[128],username[128],pname[PETSC_MAX_PATH_LEN],date[128];
371:   char               version[256], buildoptions[128];
372:   PetscMPIInt        size;
373:   MPI_Comm           comm;
374:   size_t             len;
375: 
377:   PetscObjectGetComm((PetscObject)viewer,&comm);
378:   PetscGetArchType(arch,sizeof(arch));
379:   PetscGetHostName(hostname,sizeof(hostname));
380:   PetscGetUserName(username,sizeof(username));
381:   PetscGetProgramName(pname,sizeof(pname));
382:   PetscGetDate(date,sizeof(date));
383:   PetscGetVersion(version,sizeof(version));
384:   MPI_Comm_size(comm, &size);

386:   PetscViewerXMLStartSection(viewer, "runspecification", "Run Specification");
387:   PetscViewerXMLPutString(   viewer, "executable"  , "Executable"   , pname );
388:   PetscViewerXMLPutString(   viewer, "architecture", "Architecture" , arch );
389:   PetscViewerXMLPutString(   viewer, "hostname"    , "Host"         , hostname);
390:   PetscViewerXMLPutInt(      viewer, "nprocesses"  , "Number of processes", size );
391:   PetscViewerXMLPutString(   viewer, "user"        , "Run by user"  , username);
392:   PetscViewerXMLPutString(   viewer, "date"        , "Started at"   , date);
393:   PetscViewerXMLPutString(   viewer, "petscrelease", "Petsc Release", version);
394: #if defined(PETSC_USE_DEBUG)
395: #  if defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_FORTRAN_KERNELS)
396:   sprintf(buildoptions, "Debug, ComplexC++Kernels");
397: #  else
398:   sprintf(buildoptions, "Debug");
399: #  endif
400: #else
401: #  if defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_FORTRAN_KERNELS)
402:   sprintf(buildoptions, "ComplexC++Kernels");
403: #  endif
404: #endif
405:   PetscStrlen(buildoptions,&len);
406:   if (len) {
407:     PetscViewerXMLPutString(viewer, "petscbuildoptions", "Petsc build options", buildoptions);
408:   }
409:   PetscViewerXMLEndSection(viewer, "runspecification");
410:   return(0);
411: }

413: /* Print the global performance: max, max/min, average and total of 
414:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
415:  */
418: static PetscErrorCode  PetscPrintXMLGlobalPerformanceElement(PetscViewer viewer, const char *name, const char *desc, PetscLogDouble max, PetscLogDouble ratio, PetscLogDouble avg, PetscLogDouble tot)
419: {

423:   PetscViewerXMLStartSection(viewer, name, desc);
424:   PetscViewerXMLPutDouble(viewer, "max", NULL, max, "%e");
425:   PetscViewerXMLPutDouble(viewer, "ratio", NULL, ratio, "%f");
426:   if (avg>-1.0) {
427:     PetscViewerXMLPutDouble(viewer, "average", NULL, avg, "%e");
428:   }
429:   if (tot>-1.0) {
430:     PetscViewerXMLPutDouble(viewer, "total", NULL, tot, "%e");
431:   }
432:   PetscViewerXMLEndSection(viewer, name);
433:   return(0);
434: }

436: /* Print the global performance: max, max/min, average and total of 
437:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
438:  */
441: static PetscErrorCode  PetscPrintGlobalPerformance(PetscViewer viewer, PetscLogDouble locTotalTime)
442: {
443:   PetscErrorCode     ierr;
444:   PetscLogDouble     min, max, tot, ratio, avg;
445:   PetscLogDouble     flops, mem, red, mess;
446:   PetscMPIInt        size;
447:   MPI_Comm           comm;

450:   PetscObjectGetComm((PetscObject)viewer,&comm);
451:   MPI_Comm_size(comm, &size);

453:   /* Must preserve reduction count before we go on */
454:   red = petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;

456:   /* Calculate summary information */
457:   PetscViewerXMLStartSection(viewer, "globalperformance", "Global performance");

459:   /*   Time */
460:   MPIU_Allreduce(&locTotalTime, &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
461:   MPIU_Allreduce(&locTotalTime, &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
462:   MPIU_Allreduce(&locTotalTime, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
463:   avg  = (tot)/((PetscLogDouble) size);
464:   if (min != 0.0) ratio = max/min;
465:   else ratio = 0.0;
466:   PetscPrintXMLGlobalPerformanceElement(viewer, "time", "Time (sec)", max, ratio, avg, -1.0);

468:   /*   Objects */
469:   avg  = (PetscLogDouble) petsc_numObjects;
470:   MPIU_Allreduce(&avg,          &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
471:   MPIU_Allreduce(&avg,          &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
472:   MPIU_Allreduce(&avg,          &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
473:   avg  = (tot)/((PetscLogDouble) size);
474:   if (min != 0.0) ratio = max/min;
475:   else ratio = 0.0;
476:   PetscPrintXMLGlobalPerformanceElement(viewer, "objects", "Objects", max, ratio, avg, -1.0);

478:   /*   Flop */
479:   MPIU_Allreduce(&petsc_TotalFlops,  &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
480:   MPIU_Allreduce(&petsc_TotalFlops,  &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
481:   MPIU_Allreduce(&petsc_TotalFlops,  &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
482:   avg  = (tot)/((PetscLogDouble) size);
483:   if (min != 0.0) ratio = max/min;
484:   else ratio = 0.0;
485:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflop", "MFlop", max/1.0E6, ratio, avg/1.0E6, tot/1.0E6);

487:   /*   Flop/sec -- Must talk to Barry here */
488:   if (locTotalTime != 0.0) flops = petsc_TotalFlops/locTotalTime;
489:   else flops = 0.0;
490:   MPIU_Allreduce(&flops,        &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
491:   MPIU_Allreduce(&flops,        &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
492:   MPIU_Allreduce(&flops,        &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
493:   avg  = (tot)/((PetscLogDouble) size);
494:   if (min != 0.0) ratio = max/min;
495:   else ratio = 0.0;
496:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflops", "MFlop/sec", max/1.0E6, ratio, avg/1.0E6, tot/1.0E6);

498:   /*   Memory */
499:   PetscMallocGetMaximumUsage(&mem);
500:   if (mem > 0.0) {
501:     MPIU_Allreduce(&mem,          &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
502:     MPIU_Allreduce(&mem,          &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
503:     MPIU_Allreduce(&mem,          &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
504:     avg  = (tot)/((PetscLogDouble) size);
505:     if (min != 0.0) ratio = max/min;
506:     else ratio = 0.0;
507:     PetscPrintXMLGlobalPerformanceElement(viewer, "memory", "Memory (MiB)", max/1024.0/1024.0, ratio, avg/1024.0/1024.0, tot/1024.0/1024.0);
508:   }
509:   /*   Messages */
510:   mess = 0.5*(petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct);
511:   MPIU_Allreduce(&mess,         &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
512:   MPIU_Allreduce(&mess,         &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
513:   MPIU_Allreduce(&mess,         &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
514:   avg  = (tot)/((PetscLogDouble) size);
515:   if (min != 0.0) ratio = max/min;
516:   else ratio = 0.0;
517:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagetransfers", "MPI Message Transfers", max, ratio, avg, tot);

519:   /*   Message Volume */
520:   mess = 0.5*(petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len);
521:   MPIU_Allreduce(&mess,         &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
522:   MPIU_Allreduce(&mess,         &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
523:   MPIU_Allreduce(&mess,         &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
524:   avg = (tot)/((PetscLogDouble) size);
525:   if (min != 0.0) ratio = max/min;
526:   else ratio = 0.0;
527:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagevolume", "MPI Message Volume (MiB)", max/1024.0/1024.0, ratio, avg/1024.0/1024.0, tot/1024.0/1024.0);

529:   /*   Reductions */
530:   MPIU_Allreduce(&red,          &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
531:   MPIU_Allreduce(&red,          &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
532:   MPIU_Allreduce(&red,          &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
533:   if (min != 0.0) ratio = max/min;
534:   else ratio = 0.0;
535:   PetscPrintXMLGlobalPerformanceElement(viewer, "reductions", "MPI Reductions", max, ratio, -1, -1);
536:   PetscViewerXMLEndSection(viewer, "globalperformance");
537:   return(0);
538: }

540: typedef struct {
541:   PetscLogEvent  dftEvent;
542:   NestedEventId  nstEvent;
543:   PetscLogEvent  dftParent;
544:   NestedEventId  nstParent;
545:   PetscBool      own;
546:   int            depth;
547:   NestedEventId* nstPath;
548: } PetscNestedEventTree;

550: /* Compare timers to sort them in the tree */
551: static int compareTreeItems(const void *item1_, const void *item2_)
552: {
553:   int                  i;
554:   PetscNestedEventTree *item1 = (PetscNestedEventTree *) item1_;
555:   PetscNestedEventTree *item2 = (PetscNestedEventTree *) item2_;
556:   for (i=0; i<PetscMin(item1->depth,item2->depth); i++) {
557:      if (item1->nstPath[i]<item2->nstPath[i]) return -1;
558:      if (item1->nstPath[i]>item2->nstPath[i]) return +1;
559:   }
560:   if (item1->depth < item2->depth) return -1;
561:   if (item1->depth > item2->depth) return 1;
562:   return 0;
563: }
564: /*
565:  * Do MPI communication to get the complete, nested calling tree for all processes: there may be
566:  * calls that happen in some processes, but not in others.
567:  *
568:  * The output, tree[nTimers] is an array of PetscNestedEventTree-structs.
569:  * The tree is sorted so that the timers can be printed in the order of appearance.
570:  *
571:  * For tree-items which appear in the trees of multiple processes (which will be most items), the 
572:  * following rule is followed:
573:  * + if information from my own process is available, then that is the information stored in tree.
574:  *   otherwise it is some other process's information.
575:  */
578: static PetscErrorCode  PetscCreateLogTreeNested(PetscViewer viewer, PetscNestedEventTree **p_tree, int *p_nTimers)
579: {
580:   PetscNestedEventTree *tree = NULL, *newTree;
581:   int                  *treeIndices;
582:   int                  nTimers, totalNTimers, i, j, iTimer0, maxDefaultTimer;
583:   int                  yesno;
584:   PetscBool            done;
585:   PetscErrorCode       ierr;
586:   int                  maxdepth;
587:   int                  depth;
588:   int                  illegalEvent;
589:   int                  iextra;
590:   PetscStageLog        stageLog;
591:   NestedEventId        *nstPath, *nstMyPath;
592:   MPI_Comm             comm;

595:   PetscObjectGetComm((PetscObject)viewer,&comm);
596:   PetscLogGetStageLog(&stageLog);

598:   /* Calculate memory needed to store everybody's information and allocate tree */
599:   nTimers = 0;
600:   for (i=0; i<nNestedEvents; i++) nTimers+=nestedEvents[i].nParents;

602:   PetscMalloc1(nTimers,&tree);

604:   /* Fill tree with readily available information */
605:   iTimer0 = 0;
606:   maxDefaultTimer =0;
607:   for (i=0; i<nNestedEvents; i++) {
608:     int           nParents          = nestedEvents[i].nParents;
609:     NestedEventId nstEvent          = nestedEvents[i].nstEvent;
610:     PetscLogEvent *dftParentsSorted = nestedEvents[i].dftParentsSorted;
611:     PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
612:     for (j=0; j<nParents; j++) {
613:       maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);

615:       tree[iTimer0+j].dftEvent   = dftEvents[j];
616:       tree[iTimer0+j].nstEvent   = nstEvent;
617:       tree[iTimer0+j].dftParent  = dftParentsSorted[j];
618:       tree[iTimer0+j].own        = PETSC_TRUE;

620:       tree[iTimer0+j].nstParent  = 0;
621:       tree[iTimer0+j].depth      = 0;
622:       tree[iTimer0+j].nstPath    = NULL;
623:     }
624:     iTimer0 += nParents;
625:   }

627:   /* Calculate the global maximum for the default timer index, so array treeIndices can 
628:    * be allocated only once */
629:   MPIU_Allreduce(&maxDefaultTimer, &j, 1, MPI_INT, MPI_MAX, comm);
630:   maxDefaultTimer = j;

632:   /* Find default timer's place in the tree */
633:   PetscCalloc1(maxDefaultTimer+1,&treeIndices);
634:   treeIndices[0] = 0;
635:   for (i=1; i<nTimers; i++) {
636:     PetscLogEvent dftEvent = tree[i].dftEvent;
637:     treeIndices[dftEvent] = i;
638:   }

640:   /* Find each dftParent's nested identification */
641:   for (i=0; i<nTimers; i++) {
642:     PetscLogEvent dftParent = tree[i].dftParent;
643:     if (dftParent) {
644:       int j = treeIndices[dftParent];
645:       tree[i].nstParent  = tree[j].nstEvent;
646:     }
647:   }

649:   /* Find depths for each timer path */
650:   done = PETSC_FALSE;
651:   maxdepth = 0;
652:   tree[0].depth=1;
653:   while (!done) {
654:     done = PETSC_TRUE;
655:     for (i=1; i<nTimers; i++) {
656:       int j = treeIndices[tree[i].dftParent];
657:       if (j==0) {
658:         tree[i].depth=1;
659:       } else if (tree[i].dftEvent!=0) {
660:         depth = 1+tree[j].depth;
661:         if (depth>tree[i].depth) {
662:           done          = PETSC_FALSE;
663:           tree[i].depth = depth;
664:           maxdepth      = PetscMax(depth,maxdepth);
665:         }
666:       }
667:     }
668:   }

670:   /* Allocate the paths in the entire tree */
671:   for (i=0; i<nTimers; i++) {
672:     depth = tree[i].depth;
673:     PetscCalloc1(depth,&tree[i].nstPath);
674:   }

676:   /* Calculate the paths for all timers */
677:   for (depth=1; depth<=maxdepth; depth++) {
678:     for (i=0; i<nTimers; i++) {
679:       if (tree[i].depth==depth) {
680:         if (depth>1) {
681:           int    j = treeIndices[tree[i].dftParent];
682:           PetscMemcpy(tree[i].nstPath,tree[j].nstPath,(depth-1)*sizeof(NestedEventId));
683:         }
684:         tree[i].nstPath[depth-1] = tree[i].nstEvent;
685:       }
686:     }
687:   }
688:   PetscFree(treeIndices);

690:   /* Sort the tree on basis of the paths */
691:   qsort(tree, nTimers, sizeof(PetscNestedEventTree), compareTreeItems);

693:   /* Allocate an array to store paths */
694:   depth = maxdepth;
695:   MPIU_Allreduce(&depth, &maxdepth, 1, MPI_INT, MPI_MAX, comm);
696:   PetscMalloc1(maxdepth+2, &nstPath);
697:   PetscMalloc1(maxdepth+2, &nstMyPath);

699:   /* Find an illegal nested event index (1+largest nested event index) */
700:   illegalEvent = 1+nestedEvents[nNestedEvents-1].nstEvent;
701:   i = illegalEvent;
702:   MPIU_Allreduce(&i, &illegalEvent, 1, MPI_INT, MPI_MAX, comm);

704:   /* First, detect timers which are not available in this process, but are available in others
705:    *        Allocate a new tree, that can contain all timers
706:    * Then,  fill the new tree with all (own and not-own) timers */
707:   newTree= NULL;
708:   for (yesno=0; yesno<=1; yesno++) {
709:     depth  = 1;
710:     i      = 0;
711:     iextra = 0;
712:     while (depth>0) {
713:       int       j;
714:       PetscBool same;

716:       /* Construct the next path in this process's tree:
717:        * if necessary, supplement with invalid path entries */
718:       depth++;
719:       if (depth > maxdepth+1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Depth %d > maxdepth + 1 %d",depth,maxdepth+1);
720:       if (i<nTimers) {
721:         for (j=0;             j<tree[i].depth; j++) nstMyPath[j] = tree[i].nstPath[j];
722:         for (j=tree[i].depth; j<=depth;         j++) nstMyPath[j] = illegalEvent;
723:       } else {
724:         for (j=0;             j<=depth;         j++) nstMyPath[j] = illegalEvent;
725:       }
726: 
727:       /* Communicate with other processes to obtain the next path and its depth */
728:       MPIU_Allreduce(nstMyPath, nstPath, depth+1, MPI_INT, MPI_MIN, comm);
729:       for (j=depth-1; (int) j>=0; j--) {
730:         if (nstPath[j]==illegalEvent) depth=j;
731:       }
732: 
733:       if (depth>0) {
734:         /* If the path exists */
735: 
736:         /* check whether the next path is the same as this process's next path */
737:         same = PETSC_TRUE;
738:         for (j=0; same && j<=depth; j++) { same = (same &&  nstMyPath[j] == nstPath[j]) ? PETSC_TRUE : PETSC_FALSE;}
739: 
740:         if (same) {
741:           /* Register 'own path' */
742:           if (newTree) newTree[i+iextra] = tree[i];
743:           i++;
744:         } else {
745:           /* Register 'not an own path' */
746:           if (newTree) {
747:             newTree[i+iextra].nstEvent   = nstPath[depth-1];
748:             newTree[i+iextra].own        = PETSC_FALSE;
749:             newTree[i+iextra].depth      = depth;
750:             PetscMalloc1(depth, &newTree[i+iextra].nstPath);
751:             for (j=0; j<depth; j++) {newTree[i+iextra].nstPath[j] = nstPath[j];}
752: 
753:             newTree[i+iextra].dftEvent  = 0;
754:             newTree[i+iextra].dftParent = 0;
755:             newTree[i+iextra].nstParent = 0;
756:           }
757:           iextra++;
758:         }
759: 
760:       }
761:     }

763:     /* Determine the size of the complete tree (with own and not-own timers) and allocate the new tree */
764:     totalNTimers = nTimers + iextra;
765:     if (!newTree) {
766:       PetscMalloc1(totalNTimers, &newTree);
767:     }
768:   }
769:   PetscFree(nstPath);
770:   PetscFree(nstMyPath);
771:   PetscFree(tree);
772:   tree = newTree;
773:   newTree = NULL;

775:   /* Set return value and return */
776:   *p_tree    = tree;
777:   *p_nTimers = totalNTimers;
778:   return(0);
779: }

781: /*
782:  * Delete the nested timer tree 
783:  */
786: static PetscErrorCode  PetscLogFreeNestedTree(PetscNestedEventTree *tree, int nTimers)
787: {
788:   int             i;
789:   PetscErrorCode  ierr;
790: 
792:   for (i=0; i<nTimers; i++) {
793:     PetscFree(tree[i].nstPath);
794:   }
795:   PetscFree(tree);
796:   return(0);
797: }

799: /* Print the global performance: max, max/min, average and total of 
800:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
801:  */
804: static PetscErrorCode  PetscPrintXMLNestedLinePerfResults(PetscViewer viewer, const char *name, PetscLogDouble minvalue, PetscLogDouble maxvalue, PetscLogDouble minmaxtreshold)
805: {

809:   PetscViewerXMLStartSection(viewer, name, NULL);
810:   if (maxvalue>minvalue*minmaxtreshold) {
811:     PetscViewerXMLPutDouble(viewer, "minvalue", NULL, minvalue, "%f");
812:     PetscViewerXMLPutDouble(viewer, "maxvalue", NULL, maxvalue, "%f");
813:   } else {
814:     PetscViewerXMLPutDouble(viewer, "value", NULL, (minvalue+maxvalue)/2.0, "%g");
815:   };
816:   PetscViewerXMLEndSection(viewer, name);
817:   return(0);
818: }

820: #define N_COMM 8
823: static PetscErrorCode  PetscLogPrintNestedLine(PetscViewer viewer,PetscEventPerfInfo perfInfo,PetscLogDouble countsPerCall,int parentCount,int depth,const char *name,PetscLogDouble totalTime,PetscBool *isPrinted)
824: {
825:   PetscLogDouble time = perfInfo.time;
826:   PetscLogDouble timeMx,          timeMn;
827:   PetscLogDouble countsPerCallMx, countsPerCallMn;
828:   PetscLogDouble reductSpeedMx,   reductSpeedMn;
829:   PetscLogDouble flopSpeedMx,     flopSpeedMn;
830:   PetscLogDouble msgSpeedMx,      msgSpeedMn;
831:   PetscLogDouble commarr_in[N_COMM], commarr_out[N_COMM];
833:   MPI_Comm       comm;

836:   PetscObjectGetComm((PetscObject)viewer,&comm);

838:   commarr_in[0] =  time;
839:   commarr_in[1] = -time;
840:   MPIU_Allreduce(commarr_in, commarr_out,    2, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
841:   timeMx =  commarr_out[0];
842:   timeMn = -commarr_out[1];

844:   commarr_in[0] = time>0.0 ?  perfInfo.flops/time         : 0;
845:   commarr_in[1] = time>0.0 ?  perfInfo.numReductions/time : 0;
846:   commarr_in[2] = time>0.0 ?  perfInfo.messageLength/time : 0;
847:   commarr_in[3] = parentCount>0    ?  countsPerCall      : 0;

849:   commarr_in[4] = time>0.0 ? -perfInfo.flops/time         : -1e30;
850:   commarr_in[5] = time>0.0 ? -perfInfo.numReductions/time : -1e30;
851:   commarr_in[6] = time>0.0 ? -perfInfo.messageLength/time : -1e30;
852:   commarr_in[7] = parentCount>0    ? -countsPerCall      : -1e30;

854:   MPIU_Allreduce(commarr_in, commarr_out,  N_COMM, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);

856:   flopSpeedMx     =  commarr_out[0];
857:   reductSpeedMx   =  commarr_out[1];
858:   msgSpeedMx      =  commarr_out[2];
859:   countsPerCallMx =  commarr_out[3];

861:   flopSpeedMn     = -commarr_out[4];
862:   reductSpeedMn   = -commarr_out[5];
863:   msgSpeedMn      = -commarr_out[6];
864:   countsPerCallMn = -commarr_out[7];

866:   *isPrinted = ((timeMx/totalTime) > (threshTime/100.0)) ? PETSC_TRUE : PETSC_FALSE;
867:   if (isPrinted) {
868:     PetscViewerXMLStartSection(viewer, "event", NULL);
869:     PetscViewerXMLPutString(viewer, "name", NULL, name);
870:     PetscPrintXMLNestedLinePerfResults(viewer, "time", timeMn/totalTime*100.0, timeMx/totalTime*100.0, 1.02);


873:     if (countsPerCallMx<1.01 && countsPerCallMn>0.99) {
874:       /* One call per parent */
875:     } else {
876:       PetscPrintXMLNestedLinePerfResults(viewer, "ncalls", countsPerCallMn, countsPerCallMx, 1.02);
877:     }
878: 
879:     if (flopSpeedMx<0.01) {
880:       /* NO flops: don't print */
881:     } else {
882:       PetscPrintXMLNestedLinePerfResults(viewer, "mflops", flopSpeedMn/1e6, flopSpeedMx/1e6, 1.05);
883:     }
884: 
885:     if (msgSpeedMx<0.01) {
886:       /* NO msgs: don't print */
887:     } else {
888:       PetscPrintXMLNestedLinePerfResults(viewer, "mbps", msgSpeedMn/1024.0/1024.0, msgSpeedMx/1024.0/1024.0, 1.05);
889:     }
890: 
891:     if (reductSpeedMx<0.01) {
892:       /* NO reductions: don't print */
893:     } else {
894:       PetscPrintXMLNestedLinePerfResults(viewer, "nreductsps", reductSpeedMn, reductSpeedMx, 1.05);
895:     }
896:   }
897:   return(0);
898: }

900: /* Count the number of times the parent event was called */

902: static int countParents( const PetscNestedEventTree *tree, PetscEventPerfInfo *eventPerfInfo, int i)
903: {
904:    if (tree[i].depth<=1) {
905:      return 1;  /* Main event: only once */
906:    } else if (!tree[i].own) {
907:      return 1;  /* This event didn't happen in this process, but did in another */
908:    } else {
909:      int iParent;
910:      for (iParent=i-1; iParent>=0; iParent--) {
911:        if (tree[iParent].depth == tree[i].depth-1) break;
912:      }
913:      if (tree[iParent].depth != tree[i].depth-1) {
914:        printf("\n\n   *****  Internal error: cannot find parent ****\n\n");
915:        return -2;
916:      } else {
917:        PetscLogEvent dftEvent  = tree[iParent].dftEvent;
918:        return eventPerfInfo[dftEvent].count;
919:      }
920:    }
921: }

923: typedef struct {
924:   int             id;
925:   PetscLogDouble  val;
926: } PetscSortItem;

928: static int compareSortItems(const void *item1_, const void *item2_)
929: {
930:   PetscSortItem *item1 = (PetscSortItem *) item1_;
931:   PetscSortItem *item2 = (PetscSortItem *) item2_;
932:   if (item1->val > item2->val) return -1;
933:   if (item1->val < item2->val) return +1;
934:   return 0;
935: }

937: static PetscErrorCode  PetscLogNestedPrint(PetscViewer viewer, PetscNestedEventTree *tree,int nTimers, int iStart, PetscLogDouble totalTime)
938: {
939:   int                depth   = tree[iStart].depth;
940:   const char         *name;
941:   int                parentCount, nChildren;
942:   PetscSortItem      *children;
943:   PetscErrorCode     ierr;
944:   PetscEventPerfInfo *eventPerfInfo;
945:   PetscEventPerfInfo myPerfInfo,  otherPerfInfo, selfPerfInfo;
946:   PetscLogDouble     countsPerCall;
947:   PetscBool          wasPrinted;
948:   PetscBool          childWasPrinted;
949:   MPI_Comm           comm;

951:   {
952:   /* Look up the name of the event and its PerfInfo */
953:      const int          stage=0;
954:      PetscStageLog      stageLog;
955:      PetscEventRegInfo  *eventRegInfo;
956:      PetscLogGetStageLog(&stageLog);
957:      eventRegInfo  = stageLog->eventLog->eventInfo;
958:      eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
959:      name          = eventRegInfo[(PetscLogEvent) tree[iStart].nstEvent].name;
960:   }

962:   PetscObjectGetComm((PetscObject)viewer,&comm);

964:   /* Count the number of child processes */
965:   nChildren = 0;
966:   {
967:     int i;
968:     for (i=iStart+1; i<nTimers; i++) {
969:       if (tree[i].depth<=depth) break;
970:       if (tree[i].depth == depth + 1) nChildren++;
971:     }
972:   }

974:   if (nChildren>0) {
975:     /* Create an array for the id-s and maxTimes of the children,
976:      *  leaving 2 spaces for self-time and other-time */
977:     int            i;
978:     PetscLogDouble *times, *maxTimes;

980:     PetscMalloc1(nChildren+2,&children);
981:     nChildren = 0;
982:     for (i=iStart+1; i<nTimers; i++) {
983:       if (tree[i].depth<=depth) break;
984:       if (tree[i].depth == depth + 1) {
985:         children[nChildren].id  = i;
986:         children[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
987:         nChildren++;
988:       }
989:     }

991:     /* Calculate the children's maximum times, to see whether children will be ignored or printed */
992:     PetscMalloc1(nChildren,&times);
993:     for (i=0; i<nChildren; i++) { times[i] = children[i].val; }

995:     PetscMalloc1(nChildren,&maxTimes);
996:     MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
997:     PetscFree(times);

999:     for (i=0; i<nChildren; i++) { children[i].val = maxTimes[i]; }
1000:     PetscFree(maxTimes);
1001:   }

1003:   if (!tree[iStart].own) {
1004:   /* Set values for a timer that was not activated in this process 
1005:    * (but was, in other processes of this run) */
1006:     PetscMemzero(&myPerfInfo,sizeof(myPerfInfo));

1008:     selfPerfInfo  = myPerfInfo;
1009:     otherPerfInfo = myPerfInfo;

1011:     parentCount   = 1;
1012:     countsPerCall = 0;
1013:   } else {
1014:   /* Set the values for a timer that was activated in this process */
1015:     int           i;
1016:     PetscLogEvent dftEvent   = tree[iStart].dftEvent;

1018:     parentCount    = countParents( tree, eventPerfInfo, iStart);
1019:     myPerfInfo     = eventPerfInfo[dftEvent];
1020:     countsPerCall  = (PetscLogDouble) myPerfInfo.count / (PetscLogDouble) parentCount;

1022:     selfPerfInfo                = myPerfInfo;
1023:     otherPerfInfo.time          = 0;
1024:     otherPerfInfo.flops         = 0;
1025:     otherPerfInfo.numMessages   = 0;
1026:     otherPerfInfo.messageLength = 0;
1027:     otherPerfInfo.numReductions = 0;

1029:     for (i=0; i<nChildren; i++) {
1030:       /* For all child counters: subtract the child values from self-timers */

1032:       PetscLogEvent      dftChild  = tree[children[i].id].dftEvent;
1033:       PetscEventPerfInfo childPerfInfo = eventPerfInfo[dftChild];

1035:       selfPerfInfo.time          -= childPerfInfo.time;
1036:       selfPerfInfo.flops         -= childPerfInfo.flops;
1037:       selfPerfInfo.numMessages   -= childPerfInfo.numMessages;
1038:       selfPerfInfo.messageLength -= childPerfInfo.messageLength;
1039:       selfPerfInfo.numReductions -= childPerfInfo.numReductions;

1041:       if ((children[i].val/totalTime) < (threshTime/100.0)) {
1042:         /* Add them to 'other' if the time is ignored in the output */
1043:         otherPerfInfo.time          += childPerfInfo.time;
1044:         otherPerfInfo.flops         += childPerfInfo.flops;
1045:         otherPerfInfo.numMessages   += childPerfInfo.numMessages;
1046:         otherPerfInfo.messageLength += childPerfInfo.messageLength;
1047:         otherPerfInfo.numReductions += childPerfInfo.numReductions;
1048:       }
1049:     }
1050:   }

1052:   /* Main output for this timer */
1053:   PetscLogPrintNestedLine(viewer, myPerfInfo, countsPerCall, parentCount, depth, name, totalTime, &wasPrinted);

1055:   /* Now print the lines for the children */
1056:   if (nChildren>0) {
1057:     /* Calculate max-times for 'self' and 'other' */
1058:     int            i;
1059:     PetscLogDouble times[2], maxTimes[2];
1060:     times[0] = selfPerfInfo.time;   times[1] = otherPerfInfo.time;
1061:     MPIU_Allreduce(times, maxTimes, 2, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1062:     children[nChildren+0].id = -1;
1063:     children[nChildren+0].val = maxTimes[0];
1064:     children[nChildren+1].id = -2;
1065:     children[nChildren+1].val = maxTimes[1];

1067:     /* Now sort the children (including 'self' and 'other') on total time */
1068:     qsort(children, nChildren+2, sizeof(PetscSortItem), compareSortItems);

1070:     /* Print (or ignore) the children in ascending order of total time */
1071:     PetscViewerXMLStartSection(viewer,"events", NULL);
1072:     for (i=0; i<nChildren+2; i++) {
1073:       if ((children[i].val/totalTime) < (threshTime/100.0)) {
1074:         /* ignored: no output */
1075:       } else if (children[i].id==-1) {
1076:         PetscLogPrintNestedLine(viewer, selfPerfInfo, 1, parentCount, depth+1, "self", totalTime, &childWasPrinted);
1077:         if (childWasPrinted) {
1078:           PetscViewerXMLEndSection(viewer,"event");
1079:         }
1080:       } else if (children[i].id==-2) {
1081:         size_t  len;
1082:         char    *otherName;

1084:         PetscStrlen(name,&len);
1085:         PetscMalloc1(16+len,&otherName);
1086:         sprintf(otherName,"%s: other-timed",name);
1087:         PetscLogPrintNestedLine(viewer, otherPerfInfo, 1, 1, depth+1, otherName, totalTime, &childWasPrinted);
1088:         PetscFree(otherName);
1089:         if (childWasPrinted) {
1090:           PetscViewerXMLEndSection(viewer,"event");
1091:         }
1092:       } else {
1093:         /* Print the child with a recursive call to this function */
1094:         PetscLogNestedPrint(viewer, tree, nTimers, children[i].id, totalTime);
1095:       }
1096:     }
1097:     PetscViewerXMLEndSection(viewer,"events");
1098:     PetscFree(children);
1099:   }

1101:   if (wasPrinted) {
1102:     PetscViewerXMLEndSection(viewer, "event");
1103:   }
1104:   return 0;
1105: }

1109: static PetscErrorCode  PetscLogNestedPrintTop(PetscViewer viewer, PetscNestedEventTree *tree,int nTimers, PetscLogDouble totalTime)
1110: {
1111:   int                nChildren;
1112:   PetscSortItem      *children;
1113:   PetscErrorCode     ierr;
1114:   PetscEventPerfInfo *eventPerfInfo;
1115:   MPI_Comm           comm;

1118:   PetscObjectGetComm((PetscObject)viewer,&comm);
1119:   {
1120:   /* Look up the PerfInfo */
1121:      const int          stage=0;
1122:      PetscStageLog      stageLog;
1123:      PetscLogGetStageLog(&stageLog);
1124:      eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
1125:   }

1127:   /* Count the number of child processes, and count total time */
1128:   nChildren = 0;
1129:   {
1130:     int i;
1131:     for (i=0; i<nTimers; i++) {
1132:       if (tree[i].depth==1) nChildren++;
1133:     }
1134:   }

1136:   if (nChildren>0) {
1137:     /* Create an array for the id-s and maxTimes of the children,
1138:      *  leaving 2 spaces for self-time and other-time */
1139:     int            i;
1140:     PetscLogDouble *times, *maxTimes;

1142:     PetscMalloc1(nChildren,&children);
1143:     nChildren = 0;
1144:     for (i=0; i<nTimers; i++) {
1145:       if (tree[i].depth == 1) {
1146:         children[nChildren].id  = i;
1147:         children[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
1148:         nChildren++;
1149:       }
1150:      }
1151: 
1152:     /* Calculate the children's maximum times, to sort them */
1153:     PetscMalloc1(nChildren,&times);
1154:     for (i=0; i<nChildren; i++) { times[i] = children[i].val; }

1156:     PetscMalloc1(nChildren,&maxTimes);
1157:     MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1158:     PetscFree(times);

1160:     for (i=0; i<nChildren; i++) { children[i].val = maxTimes[i]; }
1161:     PetscFree(maxTimes);

1163:     /* Now sort the children on total time */
1164:     qsort(children, nChildren, sizeof(PetscSortItem), compareSortItems);
1165:     /* Print (or ignore) the children in ascending order of total time */
1166:     PetscViewerXMLStartSection(viewer, "timertree", "Timings tree");
1167:     PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1168:     PetscViewerXMLPutDouble(viewer, "timethreshold", NULL, threshTime, "%f");

1170:     for (i=0; i<nChildren; i++) {
1171:       if ((children[i].val/totalTime) < (threshTime/100.0)) {
1172:         /* ignored: no output */
1173:       } else {
1174:         /* Print the child with a recursive call to this function */
1175:         PetscLogNestedPrint(viewer, tree, nTimers, children[i].id, totalTime);
1176:       }
1177:     }
1178:     PetscViewerXMLEndSection(viewer, "timertree");
1179:     PetscFree(children);
1180:   }
1181:   return(0);
1182: }

1184: typedef struct {
1185:   char           *name;
1186:   PetscLogDouble time;
1187:   PetscLogDouble flops;
1188:   PetscLogDouble numMessages;
1189:   PetscLogDouble messageLength;
1190:   PetscLogDouble numReductions;
1191: } PetscSelfTimer;

1195: static PetscErrorCode  PetscCalcSelfTime(PetscViewer viewer, PetscSelfTimer **p_self, int *p_nstMax)
1196: {
1197:   PetscErrorCode     ierr;
1198:   PetscEventPerfInfo *eventPerfInfo;
1199:   PetscEventRegInfo  *eventRegInfo;
1200:   PetscSelfTimer     *selftimes;
1201:   PetscSelfTimer     *totaltimes;
1202:   NestedEventId      *nstEvents;
1203:   int                i, maxDefaultTimer;
1204:   NestedEventId      nst;
1205:   PetscLogEvent      dft;
1206:   int                nstMax, nstMax_local;
1207:   MPI_Comm           comm;
1208: 
1210:   PetscObjectGetComm((PetscObject)viewer,&comm);
1211:   {
1212:     const int          stage=0;
1213:     PetscStageLog      stageLog;
1214:     PetscLogGetStageLog(&stageLog);
1215:     eventRegInfo  = stageLog->eventLog->eventInfo;
1216:     eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
1217:   }

1219:   /* For each default timer, calculate the (one) nested timer that it corresponds to. */
1220:   maxDefaultTimer =0;
1221:   for (i=0; i<nNestedEvents; i++) {
1222:     int            nParents         = nestedEvents[i].nParents;
1223:     PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
1224:      int j;
1225:      for (j=0; j<nParents; j++) {
1226:        maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);
1227:      }
1228:   }
1229:   PetscMalloc1(maxDefaultTimer+1,&nstEvents);
1230:   for (dft=0; dft<maxDefaultTimer; dft++) {nstEvents[dft] = 0;}
1231:   for (i=0; i<nNestedEvents; i++) {
1232:     int           nParents          = nestedEvents[i].nParents;
1233:     NestedEventId nstEvent          = nestedEvents[i].nstEvent;
1234:     PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
1235:     int           j;
1236:     for (j=0; j<nParents; j++) { nstEvents[dftEvents[j]] = nstEvent; }
1237:   }

1239:   /* Calculate largest nested event-ID */
1240:   nstMax_local = 0;
1241:   for (i=0; i<nNestedEvents; i++) { if (nestedEvents[i].nstEvent>nstMax_local) {nstMax_local = nestedEvents[i].nstEvent;} }
1242:   MPIU_Allreduce(&nstMax_local, &nstMax, 1, MPI_INT, MPI_MAX, comm);


1245:   /* Initialize all total-times with zero */
1246:   PetscMalloc1(nstMax+1,&selftimes);
1247:   PetscMalloc1(nstMax+1,&totaltimes);
1248:   for (nst=0; nst<=nstMax; nst++) {
1249:     totaltimes[nst].time          = 0;
1250:     totaltimes[nst].flops         = 0;
1251:     totaltimes[nst].numMessages   = 0;
1252:     totaltimes[nst].messageLength = 0;
1253:     totaltimes[nst].numReductions = 0;
1254:     totaltimes[nst].name          = NULL;
1255:   }

1257:   /* Calculate total-times */
1258:   for (i=0; i<nNestedEvents; i++) {
1259:     const int            nParents  = nestedEvents[i].nParents;
1260:     const NestedEventId  nstEvent  = nestedEvents[i].nstEvent;
1261:     const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1262:     int                  j;
1263:     for (j=0; j<nParents; j++) {
1264:       const PetscLogEvent dftEvent = dftEvents[j];
1265:       totaltimes[nstEvent].time          += eventPerfInfo[dftEvent].time;
1266:       totaltimes[nstEvent].flops         += eventPerfInfo[dftEvent].flops;
1267:       totaltimes[nstEvent].numMessages   += eventPerfInfo[dftEvent].numMessages;
1268:       totaltimes[nstEvent].messageLength += eventPerfInfo[dftEvent].messageLength;
1269:       totaltimes[nstEvent].numReductions += eventPerfInfo[dftEvent].numReductions;
1270:     }
1271:     totaltimes[nstEvent].name    = eventRegInfo[(PetscLogEvent) nstEvent].name;
1272:   }

1274:   /* Initialize: self-times := totaltimes */
1275:   for (nst=0; nst<=nstMax; nst++) { selftimes[nst] = totaltimes[nst]; }

1277:   /* Subtract timed supprocesses from self-times */
1278:   for (i=0; i<nNestedEvents; i++) {
1279:     const int           nParents          = nestedEvents[i].nParents;
1280:     const PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
1281:     const NestedEventId *dftParentsSorted = nestedEvents[i].dftParentsSorted;
1282:     int                 j;
1283:     for (j=0; j<nParents; j++) {
1284:       const PetscLogEvent dftEvent  = dftEvents[j];
1285:       const NestedEventId nstParent = nstEvents[dftParentsSorted[j]];
1286:       selftimes[nstParent].time          -= eventPerfInfo[dftEvent].time;
1287:       selftimes[nstParent].flops         -= eventPerfInfo[dftEvent].flops;
1288:       selftimes[nstParent].numMessages   -= eventPerfInfo[dftEvent].numMessages;
1289:       selftimes[nstParent].messageLength -= eventPerfInfo[dftEvent].messageLength;
1290:       selftimes[nstParent].numReductions -= eventPerfInfo[dftEvent].numReductions;
1291:     }
1292:   }

1294:   PetscFree(nstEvents);
1295:   PetscFree(totaltimes);

1297:   /* Set outputs */
1298:   *p_self  = selftimes;
1299:   *p_nstMax = nstMax;
1300:   return(0);
1301: }

1305: static PetscErrorCode  PetscPrintSelfTime(PetscViewer viewer, const PetscSelfTimer *selftimes, int nstMax, PetscLogDouble totalTime)
1306: {
1307:   PetscErrorCode     ierr;
1308:   int                i;
1309:   NestedEventId      nst;
1310:   PetscSortItem      *sortSelfTimes;
1311:   PetscLogDouble     *times, *maxTimes;
1312:   PetscEventRegInfo  *eventRegInfo;
1313:   const int          dum_depth = 1, dum_count=1, dum_parentcount=1;
1314:   PetscBool          wasPrinted;
1315:   MPI_Comm           comm;

1318:   PetscObjectGetComm((PetscObject)viewer,&comm);
1319:   {
1320:     PetscStageLog      stageLog;
1321:     PetscLogGetStageLog(&stageLog);
1322:     eventRegInfo  = stageLog->eventLog->eventInfo;
1323:   }

1325:   PetscMalloc1(nstMax+1,&times);
1326:   PetscMalloc1(nstMax+1,&maxTimes);
1327:   for (nst=0; nst<=nstMax; nst++) { times[nst] = selftimes[nst].time;}
1328:   MPIU_Allreduce(times, maxTimes, nstMax+1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1329:   PetscFree(times);

1331:   PetscMalloc1(nstMax+1,&sortSelfTimes);

1333:   /* Sort the self-timers on basis of the largest time needed */
1334:   for (nst=0; nst<=nstMax; nst++) {
1335:     sortSelfTimes[nst].id  = nst;
1336:     sortSelfTimes[nst].val = maxTimes[nst];
1337:   }
1338:   PetscFree(maxTimes);
1339:   qsort(sortSelfTimes, nstMax+1, sizeof(PetscSortItem), compareSortItems);

1341:   PetscViewerXMLStartSection(viewer, "selftimertable", "Self-timings");
1342:   PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");

1344:   for (i=0; i<=nstMax; i++) {
1345:     if ((sortSelfTimes[i].val/totalTime) >= (threshTime/100.0)) {
1346:       NestedEventId      nstEvent = sortSelfTimes[i].id;
1347:       PetscEventPerfInfo selfPerfInfo;
1348:       const char         *name     = eventRegInfo[(PetscLogEvent) nstEvent].name;

1350:       selfPerfInfo.time          = selftimes[nstEvent].time ;
1351:       selfPerfInfo.flops         = selftimes[nstEvent].flops;
1352:       selfPerfInfo.numMessages   = selftimes[nstEvent].numMessages;
1353:       selfPerfInfo.messageLength = selftimes[nstEvent].messageLength;
1354:       selfPerfInfo.numReductions = selftimes[nstEvent].numReductions;
1355: 
1356:       PetscLogPrintNestedLine(viewer, selfPerfInfo, dum_count, dum_parentcount, dum_depth, name, totalTime, &wasPrinted);
1357:       if (wasPrinted){
1358:         PetscViewerXMLEndSection(viewer, "event");
1359:       }
1360:     }
1361:   }
1362:   PetscViewerXMLEndSection(viewer, "selftimertable");
1363:   PetscFree(sortSelfTimes);
1364:   return(0);
1365: }

1369: PetscErrorCode  PetscLogView_Nested(PetscViewer viewer)
1370: {
1371:   MPI_Comm             comm;
1372:   PetscErrorCode       ierr;
1373:   PetscLogDouble       locTotalTime, globTotalTime;
1374:   PetscNestedEventTree *tree = NULL;
1375:   PetscSelfTimer       *selftimers = NULL;
1376:   int                  nTimers = 0, nstMax = 0;
1377:   PetscViewerType      vType;

1380:   PetscViewerGetType(viewer,&vType);

1382:   /* Set useXMLFormat that controls the format in all local PetscPrint.. functions */
1383:   PetscViewerInitASCII_XML(viewer);

1385:   PetscObjectGetComm((PetscObject)viewer,&comm);

1387:   PetscViewerASCIIPrintf(viewer, "<!-- PETSc Performance Summary: -->\n");
1388:   PetscViewerXMLStartSection(viewer, "petscroot", NULL);

1390:   /* Get the total elapsed time, local and global maximum */
1391:   PetscTime(&locTotalTime);  locTotalTime -= petsc_BaseTime;
1392:   MPIU_Allreduce(&locTotalTime, &globTotalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);

1394:   /* Print global information about this run */
1395:   PetscPrintExeSpecs(viewer);
1396:   PetscPrintGlobalPerformance(viewer, locTotalTime);
1397: 
1398:   /* Collect nested timer tree info from all processes */
1399:   PetscCreateLogTreeNested(viewer, &tree, &nTimers);
1400:   PetscLogNestedPrintTop(viewer, tree, nTimers, globTotalTime);
1401:   PetscLogFreeNestedTree(tree, nTimers);

1403:   /* Calculate self-time for all (not-nested) events */
1404:   PetscCalcSelfTime(viewer, &selftimers, &nstMax);
1405:   PetscPrintSelfTime(viewer, selftimers, nstMax, globTotalTime);
1406:   PetscFree(selftimers);

1408:   PetscViewerXMLEndSection(viewer, "petscroot");
1409:   PetscViewerFinalASCII_XML(viewer);
1410:   PetscLogNestedEnd();
1411:   return(0);
1412: }

1414: #endif