Actual source code: xmllogevent.c

petsc-3.11.4 2019-09-28
Report Typos and Errors
  1: /*************************************************************************************
  2:  *    M A R I T I M E  R E S E A R C H  I N S T I T U T E  N E T H E R L A N D S     *
  3:  *************************************************************************************
  4:  *    authors: Bas van 't Hof, Koos Huijssen, Christiaan M. Klaij                    *
  5:  *************************************************************************************
  6:  *    content: Support for nested PetscTimers                                        *
  7:  *************************************************************************************/
  8:  #include <petsclog.h>
  9:  #include <petsc/private/logimpl.h>
 10:  #include <petsctime.h>
 11:  #include <petscviewer.h>
 12: #include "../src/sys/logging/xmlviewer.h"

 14: #if defined(PETSC_USE_LOG)

 16: /*
 17:  * Support for nested PetscTimers
 18:  *
 19:  * PetscTimers keep track of a lot of useful information: Wall clock times,
 20:  * message passing statistics, flop counts.  Information about the nested structure
 21:  * of the timers is lost. Example:
 22:  *
 23:  * 7:30   Start: awake
 24:  * 7:30      Start: morning routine
 25:  * 7:40         Start: eat
 26:  * 7:49         Done:  eat
 27:  * 7:43      Done:  morning routine
 28:  * 8:15      Start: work
 29:  * 12:15        Start: eat
 30:  * 12:45        Done:  eat
 31:  * 16:00     Done:  work
 32:  * 16:30     Start: evening routine
 33:  * 18:30        Start: eat
 34:  * 19:15        Done:  eat
 35:  * 22:00     Done:  evening routine
 36:  * 22:00  Done:  awake
 37:  *
 38:  * Petsc timers provide the following timer results:
 39:  *
 40:  *    awake:              1 call    14:30 hours
 41:  *    morning routine:    1 call     0:13 hours
 42:  *    eat:                3 calls    1:24 hours
 43:  *    work:               1 call     7:45 hours
 44:  *    evening routine     1 call     5:30 hours
 45:  *
 46:  * Nested timers can be used to get the following table:
 47:  *
 48:  *   [1 call]: awake                14:30 hours
 49:  *   [1 call]:    morning routine         0:13 hours         ( 2 % of awake)
 50:  *   [1 call]:       eat                       0:09 hours         (69 % of morning routine)
 51:  *                   rest (morning routine)    0:04 hours         (31 % of morning routine)
 52:  *   [1 call]:    work                    7:45 hours         (53 % of awake)
 53:  *   [1 call]:       eat                       0:30 hours         ( 6 % of work)
 54:  *                   rest (work)               7:15 hours         (94 % of work)
 55:  *   [1 call]:    evening routine         5:30 hours         (38 % of awake)
 56:  *   [1 call]:       eat                       0:45 hours         (14 % of evening routine)
 57:  *                   rest (evening routine)    4:45 hours         (86 % of morning routine)
 58:  *
 59:  * We ignore the concept of 'stages', because these seem to be conflicting notions, or at least,
 60:  * the nested timers make the stages unnecessary.
 61:  *
 62:  */

 64: /*
 65:  * Data structures for keeping track of nested timers:
 66:  *
 67:  *   nestedEvents: information about the timers that have actually been activated
 68:  *   dftParentActive: if a timer is started now, it is part of (nested inside) the dftParentActive
 69:  *
 70:  * The Default-timers are used to time the nested timers. Every nested timer corresponds to
 71:  * (one or more) default timers, where one of the default timers has the same event-id as the
 72:  * nested one.
 73:  *
 74:  * Because of the risk of confusion between nested timer ids and default timer ids, we
 75:  * introduce a typedef for nested events (NestedEventId) and use the existing type PetscLogEvent
 76:  * only for default events. Also, all nested event variables are prepended with 'nst', and
 77:  * default timers with 'dft'.
 78:  */

 80: #define DFT_ID_AWAKE -1

 82: typedef PetscLogEvent NestedEventId;
 83: typedef struct {
 84:   NestedEventId   nstEvent;         /* event-code for this nested event, argument 'event' in PetscLogEventStartNested */
 85:   int             nParents;         /* number of 'dftParents': the default timer which was the dftParentActive when this nested timer was activated */
 86:   PetscLogEvent  *dftParentsSorted; /* The default timers which were the dftParentActive when this nested event was started */
 87:   PetscLogEvent  *dftEvents;        /* The default timers which represent the different 'instances' of this nested event */

 89:   PetscLogEvent  *dftParents;       /* The default timers which were the dftParentActive when this nested event was started */
 90:   PetscLogEvent  *dftEventsSorted;  /* The default timers which represent the different 'instances' of this nested event */
 91: } PetscNestedEvent;

 93: static PetscLogEvent    dftParentActive        = DFT_ID_AWAKE;
 94: static int              nNestedEvents          = 0;
 95: static int              nNestedEventsAllocated = 0;
 96: static PetscNestedEvent *nestedEvents          = NULL;
 97: static PetscLogDouble   thresholdTime          = 0.01; /* initial value was 0.1 */

 99: #define THRESHOLD (thresholdTime/100.0+1e-12)

101: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
102: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
103: PETSC_INTERN PetscErrorCode PetscLogView_Nested(PetscViewer);


106: /*@C
107:   PetscLogNestedBegin - Turns on nested logging of objects and events. This logs flop
108:   rates and object creation and should not slow programs down too much.

110:   Logically Collective over PETSC_COMM_WORLD

112:   Options Database Keys:
113: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file

115:   Usage:
116: .vb
117:       PetscInitialize(...);
118:       PetscLogNestedBegin();
119:        ... code ...
120:       PetscLogView(viewer);
121:       PetscFinalize();
122: .ve

124:   Level: advanced

126: .keywords: log, begin
127: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin()
128: @*/
129: PetscErrorCode PetscLogNestedBegin(void)
130: {

134:   if (nestedEvents) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"nestedEvents already allocated");

136:   nNestedEventsAllocated = 10;
137:   PetscMalloc1(nNestedEventsAllocated,&nestedEvents);
138:   dftParentActive = DFT_ID_AWAKE;
139:   nNestedEvents =1;

141:   /* 'Awake' is nested event 0. It has no parents */
142:   nestedEvents[0].nstEvent          = 0;
143:   nestedEvents[0].nParents          = 0;
144:   nestedEvents[0].dftParentsSorted  = NULL;
145:   nestedEvents[0].dftEvents         = NULL;
146:   nestedEvents[0].dftParents        = NULL;
147:   nestedEvents[0].dftEventsSorted   = NULL;

149:   PetscLogSet(PetscLogEventBeginNested,PetscLogEventEndNested);
150:   return(0);
151: }

153: /* Delete the data structures for the nested timers */
154: PetscErrorCode PetscLogNestedEnd(void)
155: {
157:   int            i;

160:   if (!nestedEvents) return(0);
161:   for (i=0; i<nNestedEvents; i++) {
162:     PetscFree4(nestedEvents[i].dftParentsSorted,nestedEvents[i].dftEventsSorted,nestedEvents[i].dftParents,nestedEvents[i].dftEvents);
163:   }
164:   PetscFree(nestedEvents);
165:   nestedEvents           = NULL;
166:   nNestedEvents          = 0;
167:   nNestedEventsAllocated = 0;
168:   return(0);
169: }


172: /*
173:  UTILITIES: FIND STUFF IN SORTED ARRAYS

175:     dftIndex - index to be found
176:     dftArray - sorted array of PetscLogEvent-ids
177:     narray - dimension of dftArray 
178:     entry - entry in the array where dftIndex may be found;

180:      if dftArray[entry] != dftIndex, then dftIndex is not part of dftArray
181:      In that case, the dftIndex can be inserted at this entry. 
182: */
183: static PetscErrorCode PetscLogEventFindDefaultTimer(PetscLogEvent dftIndex,const PetscLogEvent *dftArray,int narray,int *entry)
184: {
186:   if (narray==0 || dftIndex <= dftArray[0]) {
187:     *entry = 0;
188:   } else if (dftIndex > dftArray[narray-1]) {
189:     *entry = narray;
190:   } else {
191:     int ihigh = narray-1, ilow=0;
192:     while (ihigh>ilow) {
193:       const int imiddle = (ihigh+ilow)/2;
194:       if (dftArray[imiddle] > dftIndex) {
195:         ihigh = imiddle;
196:       } else if (dftArray[imiddle]<dftIndex) {
197:         ilow = imiddle+1;
198:       } else {
199:         ihigh = imiddle;
200:         ilow  = imiddle;
201:       }
202:     }
203:     *entry = ihigh;
204:   }
205:   return(0);
206: }

208: /*
209:     Utility: find the nested event with given identification

211:     nstEvent - Nested event to be found
212:     entry - entry in the nestedEvents where nstEvent may be found;

214:     if nestedEvents[entry].nstEvent != nstEvent, then index is not part of iarray
215: */
216: static PetscErrorCode PetscLogEventFindNestedTimer(NestedEventId nstEvent,int *entry)
217: {
219:   if (nNestedEvents==0 || nstEvent <= nestedEvents[0].nstEvent) {
220:     *entry = 0;
221:   } else if (nstEvent > nestedEvents[nNestedEvents-1].nstEvent) {
222:     *entry = nNestedEvents;
223:   } else {
224:     int ihigh = nNestedEvents-1,  ilow = 0;
225:     while (ihigh>ilow) {
226:       const int imiddle = (ihigh+ilow)/2;
227:       if (nestedEvents[imiddle].nstEvent > nstEvent) {
228:         ihigh = imiddle;
229:       } else if (nestedEvents[imiddle].nstEvent<nstEvent) {
230:         ilow = imiddle+1;
231:       } else {
232:         ihigh = imiddle;
233:         ilow  = imiddle;
234:       }
235:     }
236:     *entry = ihigh;
237:   }
238:   return(0);
239: }

241: /*
242:  Nested logging is not prepared yet to support user-defined logging stages, so for now we force logging on the main stage.
243:  Using PetscLogStage{Push/Pop}() would be more appropriate, but these two calls do extra bookkeeping work we don't need.
244: */

246: #define MAINSTAGE 0

248: static PetscLogStage savedStage = 0;

250: PETSC_STATIC_INLINE PetscErrorCode PetscLogStageOverride(void)
251: {
252:   PetscStageLog  stageLog = petsc_stageLog;

256:   if (stageLog->curStage == MAINSTAGE) return(0);
257:   savedStage = stageLog->curStage;
258:   stageLog->curStage = MAINSTAGE;
259:   PetscIntStackPush(stageLog->stack, MAINSTAGE);
260:   return(0);
261: }

263: PETSC_STATIC_INLINE PetscErrorCode PetscLogStageRestore(void)
264: {
265:   PetscStageLog  stageLog = petsc_stageLog;

269:   if (savedStage == MAINSTAGE) return(0);
270:   stageLog->curStage = savedStage;
271:   PetscIntStackPop(stageLog->stack, &savedStage);
272:   return(0);
273: }

275: /******************************************************************************************/
276: /* Start a nested event */
277: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
278: {
279:   PetscErrorCode  ierr;
280:   int             entry, pentry, tentry,i;
281:   PetscLogEvent   dftEvent;

284:   PetscLogEventFindNestedTimer(nstEvent, &entry);
285:   if (entry>=nNestedEvents || nestedEvents[entry].nstEvent != nstEvent) {
286:     /* Nested event doesn't exist yet: create it */

288:     if (nNestedEvents==nNestedEventsAllocated) {
289:       /* Enlarge and re-allocate nestedEvents if needed */
290:       PetscNestedEvent *tmp = nestedEvents;
291:       PetscMalloc1(2*nNestedEvents,&nestedEvents);
292:       nNestedEventsAllocated*=2;
293:       PetscMemcpy(nestedEvents, tmp, nNestedEvents*sizeof(PetscNestedEvent));
294:       PetscFree(tmp);
295:     }

297:     /* Clear space in nestedEvents for new nested event */
298:     nNestedEvents++;
299:     for (i = nNestedEvents-1; i>entry; i--) {
300:       nestedEvents[i] = nestedEvents[i-1];
301:     }

303:     /* Create event in nestedEvents */
304:     nestedEvents[entry].nstEvent = nstEvent;
305:     nestedEvents[entry].nParents=1;
306:     PetscMalloc4(1,&nestedEvents[entry].dftParentsSorted,1,&nestedEvents[entry].dftEventsSorted,1,&nestedEvents[entry].dftParents,1,&nestedEvents[entry].dftEvents);

308:     /* Fill in new event */
309:     pentry = 0;
310:     dftEvent = (PetscLogEvent) nstEvent;

312:     nestedEvents[entry].nstEvent                 = nstEvent;
313:     nestedEvents[entry].dftParents[pentry]       = dftParentActive;
314:     nestedEvents[entry].dftEvents[pentry]        = dftEvent;
315:     nestedEvents[entry].dftParentsSorted[pentry] = dftParentActive;
316:     nestedEvents[entry].dftEventsSorted[pentry]  = dftEvent;

318:   } else {
319:     /* Nested event exists: find current dftParentActive among parents */
320:     PetscLogEvent *dftParentsSorted = nestedEvents[entry].dftParentsSorted;
321:     PetscLogEvent *dftEvents        = nestedEvents[entry].dftEvents;
322:     int           nParents          = nestedEvents[entry].nParents;

324:     PetscLogEventFindDefaultTimer( dftParentActive, dftParentsSorted, nParents, &pentry);

326:     if (pentry>=nParents || dftParentActive != dftParentsSorted[pentry]) {
327:       /* dftParentActive not in the list: add it to the list */
328:       int           i;
329:       PetscLogEvent *dftParents      = nestedEvents[entry].dftParents;
330:       PetscLogEvent *dftEventsSorted = nestedEvents[entry].dftEventsSorted;
331:       char          name[100];

333:       /* Register a new default timer */
334:       sprintf(name, "%d -> %d", (int) dftParentActive, (int) nstEvent);
335:       PetscLogEventRegister(name, 0, &dftEvent);
336:       PetscLogEventFindDefaultTimer( dftEvent, dftEventsSorted, nParents, &tentry);

338:       /* Reallocate parents and dftEvents to make space for new parent */
339:       PetscMalloc4(1+nParents,&nestedEvents[entry].dftParentsSorted,1+nParents,&nestedEvents[entry].dftEventsSorted,1+nParents,&nestedEvents[entry].dftParents,1+nParents,&nestedEvents[entry].dftEvents);
340:       PetscMemcpy(nestedEvents[entry].dftParentsSorted, dftParentsSorted, nParents*sizeof(PetscLogEvent));
341:       PetscMemcpy(nestedEvents[entry].dftEventsSorted,  dftEventsSorted,  nParents*sizeof(PetscLogEvent));
342:       PetscMemcpy(nestedEvents[entry].dftParents,       dftParents,       nParents*sizeof(PetscLogEvent));
343:       PetscMemcpy(nestedEvents[entry].dftEvents,        dftEvents,        nParents*sizeof(PetscLogEvent));
344:       PetscFree4(dftParentsSorted,dftEventsSorted,dftParents,dftEvents);

346:       dftParents       = nestedEvents[entry].dftParents;
347:       dftEvents        = nestedEvents[entry].dftEvents;
348:       dftParentsSorted = nestedEvents[entry].dftParentsSorted;
349:       dftEventsSorted  = nestedEvents[entry].dftEventsSorted;

351:       nestedEvents[entry].nParents++;
352:       nParents++;

354:       for (i = nParents-1; i>pentry; i--) {
355:         dftParentsSorted[i] = dftParentsSorted[i-1];
356:         dftEvents[i]        = dftEvents[i-1];
357:       }
358:       for (i = nParents-1; i>tentry; i--) {
359:         dftParents[i]      = dftParents[i-1];
360:         dftEventsSorted[i] = dftEventsSorted[i-1];
361:       }

363:       /* Fill in the new default timer */
364:       dftParentsSorted[pentry] = dftParentActive;
365:       dftEvents[pentry]        = dftEvent;
366:       dftParents[tentry]       = dftParentActive;
367:       dftEventsSorted[tentry]  = dftEvent;

369:     } else {
370:       /* dftParentActive was found: find the corresponding default 'dftEvent'-timer */
371:       dftEvent = nestedEvents[entry].dftEvents[pentry];
372:     }
373:   }

375:   /* Start the default 'dftEvent'-timer and update the dftParentActive */
376:   PetscLogStageOverride();
377:   PetscLogEventBeginDefault(dftEvent,t,o1,o2,o3,o4);
378:   PetscLogStageRestore();
379:   dftParentActive = dftEvent;
380:   return(0);
381: }

383: /* End a nested event */
384: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
385: {
386:   PetscErrorCode  ierr;
387:   int             entry, pentry, nParents;
388:   PetscLogEvent  *dftEventsSorted;

391:   /* Find the nested event */
392:   PetscLogEventFindNestedTimer(nstEvent, &entry);
393:   if (entry>=nNestedEvents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d larger than number of events %d",entry,nNestedEvents);
394:   if (nestedEvents[entry].nstEvent != nstEvent) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d had unbalanced begin/end pairs does not match %d",entry,nstEvent);
395:   dftEventsSorted = nestedEvents[entry].dftEventsSorted;
396:   nParents        = nestedEvents[entry].nParents;

398:   /* Find the current default timer among the 'dftEvents' of this event */
399:   PetscLogEventFindDefaultTimer( dftParentActive, dftEventsSorted, nParents, &pentry);

401:   if (pentry>=nParents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Entry %d is larger than number of parents %d",pentry,nParents);
402:   if (dftEventsSorted[pentry] != dftParentActive) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Active parent is %d, but we seem to be closing %d",dftParentActive,dftEventsSorted[pentry]);

404:   /* Stop the default timer and update the dftParentActive */
405:   PetscLogStageOverride();
406:   PetscLogEventEndDefault(dftParentActive,t,o1,o2,o3,o4);
407:   PetscLogStageRestore();
408:   dftParentActive = nestedEvents[entry].dftParents[pentry];
409:   return(0);
410: }

412: /*@
413:    PetscLogSetThreshold - Set the threshold time for logging the events; this is a percentage out of 100, so 1. means any event
414:           that takes 1 or more percent of the time.

416:   Logically Collective over PETSC_COMM_WORLD

418:   Input Parameter:
419: .   newThresh - the threshold to use

421:   Output Parameter:
422: .   oldThresh - the previously set threshold value

424:   Options Database Keys:
425: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file

427:   Usage:
428: .vb
429:       PetscInitialize(...);
430:       PetscLogNestedBegin();
431:       PetscLogSetThreshold(0.1,&oldthresh);
432:        ... code ...
433:       PetscLogView(viewer);
434:       PetscFinalize();
435: .ve

437:   Level: advanced

439: .keywords: log, begin
440: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin(),
441:           PetscLogNestedBegin()
442: @*/
443: PetscErrorCode PetscLogSetThreshold(PetscLogDouble newThresh, PetscLogDouble *oldThresh)
444: {
446:   if (oldThresh) *oldThresh = thresholdTime;
447:   if (newThresh == PETSC_DECIDE)  newThresh = 0.01;
448:   if (newThresh == PETSC_DEFAULT) newThresh = 0.01;
449:   thresholdTime = PetscMax(newThresh, 0.0);
450:   return(0);
451: }

453: static PetscErrorCode PetscPrintExeSpecs(PetscViewer viewer)
454: {
455:   PetscErrorCode     ierr;
456:   char               arch[128],hostname[128],username[128],pname[PETSC_MAX_PATH_LEN],date[128];
457:   char               version[256], buildoptions[128] = "";
458:   PetscMPIInt        size;
459:   size_t             len;

462:   MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
463:   PetscGetArchType(arch,sizeof(arch));
464:   PetscGetHostName(hostname,sizeof(hostname));
465:   PetscGetUserName(username,sizeof(username));
466:   PetscGetProgramName(pname,sizeof(pname));
467:   PetscGetDate(date,sizeof(date));
468:   PetscGetVersion(version,sizeof(version));

470:   PetscViewerXMLStartSection(viewer, "runspecification", "Run Specification");
471:   PetscViewerXMLPutString(   viewer, "executable"  , "Executable"   , pname );
472:   PetscViewerXMLPutString(   viewer, "architecture", "Architecture" , arch );
473:   PetscViewerXMLPutString(   viewer, "hostname"    , "Host"         , hostname);
474:   PetscViewerXMLPutInt(      viewer, "nprocesses"  , "Number of processes", size );
475:   PetscViewerXMLPutString(   viewer, "user"        , "Run by user"  , username);
476:   PetscViewerXMLPutString(   viewer, "date"        , "Started at"   , date);
477:   PetscViewerXMLPutString(   viewer, "petscrelease", "Petsc Release", version);

479: #if defined(PETSC_USE_DEBUG)
480:   PetscStrlcat(buildoptions, "Debug ", sizeof(buildoptions));
481: #endif
482: #if defined(PETSC_USE_COMPLEX)
483:   PetscStrlcat(buildoptions, "Complex ", sizeof(buildoptions));
484: #endif
485: #if defined(PETSC_USE_REAL_SINGLE)
486:   PetscStrlcat(buildoptions, "Single ", sizeof(buildoptions));
487: #elif defined(PETSC_USE_REAL___FLOAT128)
488:   PetscStrlcat(buildoptions, "Quadruple ", sizeof(buildoptions));
489: #elif defined(PETSC_USE_REAL___FP16)
490:   PetscStrlcat(buildoptions, "Half ", sizeof(buildoptions));
491: #endif
492: #if defined(PETSC_USE_64BIT_INDICES)
493:   PetscStrlcat(buildoptions, "Int64 ", sizeof(buildoptions));
494: #endif
495: #if defined(__cplusplus)
496:   PetscStrlcat(buildoptions, "C++ ", sizeof(buildoptions));
497: #endif
498:   PetscStrlen(buildoptions,&len);
499:   if (len) {
500:     PetscViewerXMLPutString(viewer, "petscbuildoptions", "Petsc build options", buildoptions);
501:   }
502:   PetscViewerXMLEndSection(viewer, "runspecification");
503:   return(0);
504: }

506: /* Print the global performance: max, max/min, average and total of
507:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
508:  */
509: static PetscErrorCode PetscPrintXMLGlobalPerformanceElement(PetscViewer viewer, const char *name, const char *desc, PetscLogDouble local_val, const PetscBool print_average, const PetscBool print_total)
510: {
511:   PetscErrorCode  ierr;
512:   PetscLogDouble  min, tot, ratio, avg;
513:   MPI_Comm        comm;
514:   PetscMPIInt     rank, size;
515:   PetscLogDouble  valrank[2], max[2];

518:   PetscObjectGetComm((PetscObject)viewer,&comm);
519:   MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
520:   MPI_Comm_rank(comm, &rank);

522:   valrank[0] = local_val;
523:   valrank[1] = (PetscLogDouble) rank;
524:   MPIU_Allreduce(&local_val, &min, 1, MPIU_PETSCLOGDOUBLE,  MPI_MIN,    comm);
525:   MPIU_Allreduce(valrank,    &max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
526:   MPIU_Allreduce(&local_val, &tot, 1, MPIU_PETSCLOGDOUBLE,  MPI_SUM,    comm);
527:   avg  = tot/((PetscLogDouble) size);
528:   if (min != 0.0) ratio = max[0]/min;
529:   else ratio = 0.0;

531:   PetscViewerXMLStartSection(viewer, name, desc);
532:   PetscViewerXMLPutDouble(viewer, "max", NULL, max[0], "%e");
533:   PetscViewerXMLPutInt(   viewer, "maxrank"  , "rank at which max was found" , (PetscMPIInt) max[1] );
534:   PetscViewerXMLPutDouble(viewer, "ratio", NULL, ratio, "%f");
535:   if (print_average) {
536:     PetscViewerXMLPutDouble(viewer, "average", NULL, avg, "%e");
537:   }
538:   if (print_total) {
539:     PetscViewerXMLPutDouble(viewer, "total", NULL, tot, "%e");
540:   }
541:   PetscViewerXMLEndSection(viewer, name);
542:   return(0);
543: }

545: /* Print the global performance: max, max/min, average and total of
546:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
547:  */
548: static PetscErrorCode PetscPrintGlobalPerformance(PetscViewer viewer, PetscLogDouble locTotalTime)
549: {
550:   PetscErrorCode  ierr;
551:   PetscLogDouble  flops, mem, red, mess;
552:   const PetscBool print_total_yes   = PETSC_TRUE,
553:                   print_total_no    = PETSC_FALSE,
554:                   print_average_no  = PETSC_FALSE,
555:                   print_average_yes = PETSC_TRUE;

558:   /* Must preserve reduction count before we go on */
559:   red = petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;

561:   /* Calculate summary information */
562:   PetscViewerXMLStartSection(viewer, "globalperformance", "Global performance");

564:   /*   Time */
565:   PetscPrintXMLGlobalPerformanceElement(viewer, "time", "Time (sec)", locTotalTime, print_average_yes, print_total_no);

567:   /*   Objects */
568:   PetscPrintXMLGlobalPerformanceElement(viewer, "objects", "Objects", (PetscLogDouble) petsc_numObjects, print_average_yes, print_total_no);

570:   /*   Flop */
571:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflop", "MFlop", petsc_TotalFlops/1.0E6, print_average_yes, print_total_yes);

573:   /*   Flop/sec -- Must talk to Barry here */
574:   if (locTotalTime != 0.0) flops = petsc_TotalFlops/locTotalTime;
575:   else flops = 0.0;
576:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflops", "MFlop/sec", flops/1.0E6, print_average_yes, print_total_yes);

578:   /*   Memory */
579:   PetscMallocGetMaximumUsage(&mem);
580:   if (mem > 0.0) {
581:     PetscPrintXMLGlobalPerformanceElement(viewer, "memory", "Memory (MiB)", mem/1024.0/1024.0, print_average_yes, print_total_yes);
582:   }
583:   /*   Messages */
584:   mess = 0.5*(petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct);
585:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagetransfers", "MPI Message Transfers", mess, print_average_yes, print_total_yes);

587:   /*   Message Volume */
588:   mess = 0.5*(petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len);
589:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagevolume", "MPI Message Volume (MiB)", mess/1024.0/1024.0, print_average_yes, print_total_yes);

591:   /*   Reductions */
592:   PetscPrintXMLGlobalPerformanceElement(viewer, "reductions", "MPI Reductions", red , print_average_no, print_total_no);
593:   PetscViewerXMLEndSection(viewer, "globalperformance");
594:   return(0);
595: }

597: typedef struct {
598:   PetscLogEvent  dftEvent;
599:   NestedEventId  nstEvent;
600:   PetscLogEvent  dftParent;
601:   NestedEventId  nstParent;
602:   PetscBool      own;
603:   int            depth;
604:   NestedEventId* nstPath;
605: } PetscNestedEventTree;

607: /* Compare timers to sort them in the tree */
608: static int compareTreeItems(const void *item1_, const void *item2_)
609: {
610:   int                  i;
611:   PetscNestedEventTree *item1 = (PetscNestedEventTree *) item1_;
612:   PetscNestedEventTree *item2 = (PetscNestedEventTree *) item2_;

614:   for (i=0; i<PetscMin(item1->depth,item2->depth); i++) {
615:     if (item1->nstPath[i]<item2->nstPath[i]) return -1;
616:     if (item1->nstPath[i]>item2->nstPath[i]) return +1;
617:   }
618:   if (item1->depth < item2->depth) return -1;
619:   if (item1->depth > item2->depth) return 1;
620:   return 0;
621: }
622: /*
623:  * Do MPI communication to get the complete, nested calling tree for all processes: there may be
624:  * calls that happen in some processes, but not in others.
625:  *
626:  * The output, tree[nTimers] is an array of PetscNestedEventTree-structs.
627:  * The tree is sorted so that the timers can be printed in the order of appearance.
628:  *
629:  * For tree-items which appear in the trees of multiple processes (which will be most items), the
630:  * following rule is followed:
631:  * + if information from my own process is available, then that is the information stored in tree.
632:  *   otherwise it is some other process's information.
633:  */
634: static PetscErrorCode PetscLogNestedTreeCreate(PetscViewer viewer, PetscNestedEventTree **p_tree, int *p_nTimers)
635: {
636:   PetscNestedEventTree *tree = NULL, *newTree;
637:   int                  *treeIndices;
638:   int                  nTimers, totalNTimers, i, j, iTimer0, maxDefaultTimer;
639:   int                  yesno;
640:   PetscBool            done;
641:   PetscErrorCode       ierr;
642:   int                  maxdepth;
643:   int                  depth;
644:   int                  illegalEvent;
645:   int                  iextra;
646:   NestedEventId        *nstPath, *nstMyPath;
647:   MPI_Comm             comm;

650:   PetscObjectGetComm((PetscObject)viewer,&comm);

652:   /* Calculate memory needed to store everybody's information and allocate tree */
653:   nTimers = 0;
654:   for (i=0; i<nNestedEvents; i++) nTimers += nestedEvents[i].nParents;

656:   PetscMalloc1(nTimers,&tree);

658:   /* Fill tree with readily available information */
659:   iTimer0 = 0;
660:   maxDefaultTimer =0;
661:   for (i=0; i<nNestedEvents; i++) {
662:     int           nParents          = nestedEvents[i].nParents;
663:     NestedEventId nstEvent          = nestedEvents[i].nstEvent;
664:     PetscLogEvent *dftParentsSorted = nestedEvents[i].dftParentsSorted;
665:     PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
666:     for (j=0; j<nParents; j++) {
667:       maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);

669:       tree[iTimer0+j].dftEvent   = dftEvents[j];
670:       tree[iTimer0+j].nstEvent   = nstEvent;
671:       tree[iTimer0+j].dftParent  = dftParentsSorted[j];
672:       tree[iTimer0+j].own        = PETSC_TRUE;

674:       tree[iTimer0+j].nstParent  = 0;
675:       tree[iTimer0+j].depth      = 0;
676:       tree[iTimer0+j].nstPath    = NULL;
677:     }
678:     iTimer0 += nParents;
679:   }

681:   /* Calculate the global maximum for the default timer index, so array treeIndices can
682:    * be allocated only once */
683:   MPIU_Allreduce(&maxDefaultTimer, &j, 1, MPI_INT, MPI_MAX, comm);
684:   maxDefaultTimer = j;

686:   /* Find default timer's place in the tree */
687:   PetscCalloc1(maxDefaultTimer+1,&treeIndices);
688:   treeIndices[0] = 0;
689:   for (i=0; i<nTimers; i++) {
690:     PetscLogEvent dftEvent = tree[i].dftEvent;
691:     treeIndices[dftEvent] = i;
692:   }

694:   /* Find each dftParent's nested identification */
695:   for (i=0; i<nTimers; i++) {
696:     PetscLogEvent dftParent = tree[i].dftParent;
697:     if (dftParent!= DFT_ID_AWAKE) {
698:       int j = treeIndices[dftParent];
699:       tree[i].nstParent = tree[j].nstEvent;
700:     }
701:   }

703:   /* Find depths for each timer path */
704:   done = PETSC_FALSE;
705:   maxdepth = 0;
706:   while (!done) {
707:     done = PETSC_TRUE;
708:     for (i=0; i<nTimers; i++) {
709:       if (tree[i].dftParent == DFT_ID_AWAKE) {
710:         tree[i].depth = 1;
711:         maxdepth = PetscMax(1,maxdepth);
712:       } else {
713:         int j = treeIndices[tree[i].dftParent];
714:         depth = 1+tree[j].depth;
715:         if (depth>tree[i].depth) {
716:           done          = PETSC_FALSE;
717:           tree[i].depth = depth;
718:           maxdepth      = PetscMax(depth,maxdepth);
719:         }
720:       }
721:     }
722:   }

724:   /* Allocate the paths in the entire tree */
725:   for (i=0; i<nTimers; i++) {
726:     depth = tree[i].depth;
727:     PetscCalloc1(depth,&tree[i].nstPath);
728:   }

730:   /* Calculate the paths for all timers */
731:   for (depth=1; depth<=maxdepth; depth++) {
732:     for (i=0; i<nTimers; i++) {
733:       if (tree[i].depth==depth) {
734:         if (depth>1) {
735:           int    j = treeIndices[tree[i].dftParent];
736:           PetscMemcpy(tree[i].nstPath,tree[j].nstPath,(depth-1)*sizeof(NestedEventId));
737:         }
738:         tree[i].nstPath[depth-1] = tree[i].nstEvent;
739:       }
740:     }
741:   }
742:   PetscFree(treeIndices);

744:   /* Sort the tree on basis of the paths */
745:   qsort(tree, nTimers, sizeof(PetscNestedEventTree), compareTreeItems);

747:   /* Allocate an array to store paths */
748:   depth = maxdepth;
749:   MPIU_Allreduce(&depth, &maxdepth, 1, MPI_INT, MPI_MAX, comm);
750:   PetscMalloc1(maxdepth+1, &nstPath);
751:   PetscMalloc1(maxdepth+1, &nstMyPath);

753:   /* Find an illegal nested event index (1+largest nested event index) */
754:   illegalEvent = 1+nestedEvents[nNestedEvents-1].nstEvent;
755:   i = illegalEvent;
756:   MPIU_Allreduce(&i, &illegalEvent, 1, MPI_INT, MPI_MAX, comm);

758:   /* First, detect timers which are not available in this process, but are available in others
759:    *        Allocate a new tree, that can contain all timers
760:    * Then,  fill the new tree with all (own and not-own) timers */
761:   newTree= NULL;
762:   for (yesno=0; yesno<=1; yesno++) {
763:     depth  = 1;
764:     i      = 0;
765:     iextra = 0;
766:     while (depth>0) {
767:       int       j;
768:       PetscBool same;

770:       /* Construct the next path in this process's tree:
771:        * if necessary, supplement with invalid path entries */
772:       depth++;
773:       if (depth > maxdepth + 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Depth %d > maxdepth+1 %d",depth,maxdepth+1);
774:       if (i<nTimers) {
775:         for (j=0;             j<tree[i].depth; j++) nstMyPath[j] = tree[i].nstPath[j];
776:         for (j=tree[i].depth; j<depth;         j++) nstMyPath[j] = illegalEvent;
777:       } else {
778:         for (j=0;             j<depth;         j++) nstMyPath[j] = illegalEvent;
779:       }

781:       /* Communicate with other processes to obtain the next path and its depth */
782:       MPIU_Allreduce(nstMyPath, nstPath, depth, MPI_INT, MPI_MIN, comm);
783:       for (j=depth-1; (int) j>=0; j--) {
784:         if (nstPath[j]==illegalEvent) depth=j;
785:       }

787:       if (depth>0) {
788:         /* If the path exists */

790:         /* check whether the next path is the same as this process's next path */
791:         same = PETSC_TRUE;
792:         for (j=0; same && j<depth; j++) { same = (same &&  nstMyPath[j] == nstPath[j]) ? PETSC_TRUE : PETSC_FALSE;}

794:         if (same) {
795:           /* Register 'own path' */
796:           if (newTree) newTree[i+iextra] = tree[i];
797:           i++;
798:         } else {
799:           /* Register 'not an own path' */
800:           if (newTree) {
801:             newTree[i+iextra].nstEvent   = nstPath[depth-1];
802:             newTree[i+iextra].own        = PETSC_FALSE;
803:             newTree[i+iextra].depth      = depth;
804:             PetscMalloc1(depth, &newTree[i+iextra].nstPath);
805:             for (j=0; j<depth; j++) {newTree[i+iextra].nstPath[j] = nstPath[j];}

807:             newTree[i+iextra].dftEvent  = 0;
808:             newTree[i+iextra].dftParent = 0;
809:             newTree[i+iextra].nstParent = 0;
810:           }
811:           iextra++;
812:         }

814:       }
815:     }

817:     /* Determine the size of the complete tree (with own and not-own timers) and allocate the new tree */
818:     totalNTimers = nTimers + iextra;
819:     if (!newTree) {
820:       PetscMalloc1(totalNTimers, &newTree);
821:     }
822:   }
823:   PetscFree(nstPath);
824:   PetscFree(nstMyPath);
825:   PetscFree(tree);
826:   tree = newTree;
827:   newTree = NULL;

829:   /* Set return value and return */
830:   *p_tree    = tree;
831:   *p_nTimers = totalNTimers;
832:   return(0);
833: }

835: /*
836:  * Delete the nested timer tree
837:  */
838: static PetscErrorCode PetscLogNestedTreeDestroy(PetscNestedEventTree *tree, int nTimers)
839: {
840:   int             i;
841:   PetscErrorCode  ierr;

844:   for (i=0; i<nTimers; i++) {
845:     PetscFree(tree[i].nstPath);
846:   }
847:   PetscFree(tree);
848:   return(0);
849: }

851: /* Print the global performance: max, max/min, average and total of
852:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
853:  */
854: static PetscErrorCode PetscPrintXMLNestedLinePerfResults(PetscViewer viewer,const char *name,PetscLogDouble value,PetscLogDouble minthreshold,PetscLogDouble maxthreshold,PetscLogDouble minmaxtreshold)
855: {
856:   MPI_Comm       comm;                          /* MPI communicator in reduction */
857:   PetscMPIInt    rank;                          /* rank of this process */
858:   PetscLogDouble val_in[2], max[2], min[2];
859:   PetscLogDouble minvalue, maxvalue, tot;
860:   PetscMPIInt    size;
861:   PetscMPIInt    minLoc, maxLoc;

865:   PetscObjectGetComm((PetscObject)viewer,&comm);
866:   MPI_Comm_size(comm, &size);
867:   MPI_Comm_rank(comm, &rank);
868:   val_in[0] = value;
869:   val_in[1] = (PetscLogDouble) rank;
870:   MPIU_Allreduce(val_in, max,  1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
871:   MPIU_Allreduce(val_in, min,  1, MPIU_2PETSCLOGDOUBLE, MPI_MINLOC, comm);
872:   maxvalue = max[0];
873:   maxLoc   = (PetscMPIInt) max[1];
874:   minvalue = min[0];
875:   minLoc   = (PetscMPIInt) min[1];
876:   MPIU_Allreduce(&value, &tot, 1, MPIU_PETSCLOGDOUBLE,  MPI_SUM,    comm);

878:   if (maxvalue<maxthreshold && minvalue>=minthreshold) {
879:     /* One call per parent or NO value: don't print */
880:   } else {
881:      PetscViewerXMLStartSection(viewer, name, NULL);
882:      if (maxvalue>minvalue*minmaxtreshold) {
883:        PetscViewerXMLPutDouble(viewer, "avgvalue", NULL, tot/size, "%g");
884:        PetscViewerXMLPutDouble(viewer, "minvalue", NULL, minvalue, "%g");
885:        PetscViewerXMLPutDouble(viewer, "maxvalue", NULL, maxvalue, "%g");
886:        PetscViewerXMLPutInt(   viewer, "minloc"  , NULL, minLoc);
887:        PetscViewerXMLPutInt(   viewer, "maxloc"  , NULL, maxLoc);
888:      } else {
889:        PetscViewerXMLPutDouble(viewer, "value", NULL, tot/size, "%g");
890:      }
891:      PetscViewerXMLEndSection(viewer, name);
892:   }
893:   return(0);
894: }

896: #define N_COMM 8
897: static PetscErrorCode PetscLogNestedTreePrintLine(PetscViewer viewer,PetscEventPerfInfo perfInfo,PetscLogDouble countsPerCall,int parentCount,int depth,const char *name,PetscLogDouble totalTime,PetscBool *isPrinted)
898: {
899:   PetscLogDouble time = perfInfo.time;
900:   PetscLogDouble timeMx;
902:   MPI_Comm       comm;

905:   PetscObjectGetComm((PetscObject)viewer,&comm);
906:   MPIU_Allreduce(&time, &timeMx, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
907:   *isPrinted = ((timeMx/totalTime) >= THRESHOLD) ? PETSC_TRUE : PETSC_FALSE;
908:   if (*isPrinted) {
909:     PetscViewerXMLStartSection(viewer, "event", NULL);
910:     PetscViewerXMLPutString(viewer, "name", NULL, name);
911:     PetscPrintXMLNestedLinePerfResults(viewer, "time", time/totalTime*100.0, 0, 0, 1.02);
912:     PetscPrintXMLNestedLinePerfResults(viewer, "ncalls", parentCount>0 ? countsPerCall : 0, 0.99, 1.01, 1.02);
913:     PetscPrintXMLNestedLinePerfResults(viewer, "mflops", time>=timeMx*0.001 ? 1e-6*perfInfo.flops/time : 0, 0, 0.01, 1.05);
914:     PetscPrintXMLNestedLinePerfResults(viewer, "mbps",time>=timeMx*0.001 ? perfInfo.messageLength/(1024*1024*time) : 0, 0, 0.01, 1.05);
915:     PetscPrintXMLNestedLinePerfResults(viewer, "nreductsps", time>=timeMx*0.001 ? perfInfo.numReductions/time : 0, 0, 0.01, 1.05);
916:   }
917:   return(0);
918: }

920: /* Count the number of times the parent event was called */

922: static int countParents( const PetscNestedEventTree *tree, PetscEventPerfInfo *eventPerfInfo, int i)
923: {
924:   if (tree[i].depth<=1) {
925:     return 1;  /* Main event: only once */
926:   } else if (!tree[i].own) {
927:     return 1;  /* This event didn't happen in this process, but did in another */
928:   } else {
929:     int iParent;
930:     for (iParent=i-1; iParent>=0; iParent--) {
931:       if (tree[iParent].depth == tree[i].depth-1) break;
932:     }
933:     if (tree[iParent].depth != tree[i].depth-1) {
934:       /* *****  Internal error: cannot find parent */
935:       return -2;
936:     } else {
937:       PetscLogEvent dftEvent  = tree[iParent].dftEvent;
938:       return eventPerfInfo[dftEvent].count;
939:     }
940:   }
941: }

943: typedef struct {
944:   int             id;
945:   PetscLogDouble  val;
946: } PetscSortItem;

948: static int compareSortItems(const void *item1_, const void *item2_)
949: {
950:   PetscSortItem *item1 = (PetscSortItem *) item1_;
951:   PetscSortItem *item2 = (PetscSortItem *) item2_;
952:   if (item1->val > item2->val) return -1;
953:   if (item1->val < item2->val) return +1;
954:   return 0;
955: }

957: static PetscErrorCode PetscLogNestedTreePrint(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, int iStart, PetscLogDouble totalTime)
958: {
959:   int                depth = tree[iStart].depth;
960:   const char         *name;
961:   int                parentCount, nChildren;
962:   PetscSortItem      *children;
963:   PetscErrorCode     ierr;
964:   const int          stage = MAINSTAGE;
965:   PetscStageLog      stageLog;
966:   PetscEventRegInfo  *eventRegInfo;
967:   PetscEventPerfInfo *eventPerfInfo;
968:   PetscEventPerfInfo myPerfInfo,  otherPerfInfo, selfPerfInfo;
969:   PetscLogDouble     countsPerCall;
970:   PetscBool          wasPrinted;
971:   PetscBool          childWasPrinted;
972:   MPI_Comm           comm;

975:   /* Look up the name of the event and its PerfInfo */
976:   PetscLogGetStageLog(&stageLog);
977:   eventRegInfo  = stageLog->eventLog->eventInfo;
978:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
979:   name = eventRegInfo[(PetscLogEvent)tree[iStart].nstEvent].name;
980:   PetscObjectGetComm((PetscObject)viewer,&comm);

982:   /* Count the number of child processes */
983:   nChildren = 0;
984:   {
985:     int i;
986:     for (i=iStart+1; i<nTimers; i++) {
987:       if (tree[i].depth <= depth) break;
988:       if (tree[i].depth == depth + 1) nChildren++;
989:     }
990:   }

992:   if (nChildren>0) {
993:     /* Create an array for the id-s and maxTimes of the children,
994:      *  leaving 2 spaces for self-time and other-time */
995:     int            i;
996:     PetscLogDouble *times, *maxTimes;

998:     PetscMalloc1(nChildren+2,&children);
999:     nChildren = 0;
1000:     for (i=iStart+1; i<nTimers; i++) {
1001:       if (tree[i].depth<=depth) break;
1002:       if (tree[i].depth == depth + 1) {
1003:         children[nChildren].id  = i;
1004:         children[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
1005:         nChildren++;
1006:       }
1007:     }

1009:     /* Calculate the children's maximum times, to see whether children will be ignored or printed */
1010:     PetscMalloc1(nChildren,&times);
1011:     for (i=0; i<nChildren; i++) { times[i] = children[i].val; }

1013:     PetscMalloc1(nChildren,&maxTimes);
1014:     MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1015:     PetscFree(times);

1017:     for (i=0; i<nChildren; i++) { children[i].val = maxTimes[i]; }
1018:     PetscFree(maxTimes);
1019:   }

1021:   if (!tree[iStart].own) {
1022:   /* Set values for a timer that was not activated in this process
1023:    * (but was, in other processes of this run) */
1024:     PetscMemzero(&myPerfInfo,sizeof(myPerfInfo));

1026:     selfPerfInfo  = myPerfInfo;
1027:     otherPerfInfo = myPerfInfo;

1029:     parentCount   = 1;
1030:     countsPerCall = 0;
1031:   } else {
1032:   /* Set the values for a timer that was activated in this process */
1033:     int           i;
1034:     PetscLogEvent dftEvent   = tree[iStart].dftEvent;

1036:     parentCount    = countParents( tree, eventPerfInfo, iStart);
1037:     myPerfInfo     = eventPerfInfo[dftEvent];
1038:     countsPerCall  = (PetscLogDouble) myPerfInfo.count / (PetscLogDouble) parentCount;

1040:     selfPerfInfo                = myPerfInfo;
1041:     otherPerfInfo.time          = 0;
1042:     otherPerfInfo.flops         = 0;
1043:     otherPerfInfo.numMessages   = 0;
1044:     otherPerfInfo.messageLength = 0;
1045:     otherPerfInfo.numReductions = 0;

1047:     for (i=0; i<nChildren; i++) {
1048:       /* For all child counters: subtract the child values from self-timers */

1050:       PetscLogEvent      dftChild  = tree[children[i].id].dftEvent;
1051:       PetscEventPerfInfo childPerfInfo = eventPerfInfo[dftChild];

1053:       selfPerfInfo.time          -= childPerfInfo.time;
1054:       selfPerfInfo.flops         -= childPerfInfo.flops;
1055:       selfPerfInfo.numMessages   -= childPerfInfo.numMessages;
1056:       selfPerfInfo.messageLength -= childPerfInfo.messageLength;
1057:       selfPerfInfo.numReductions -= childPerfInfo.numReductions;

1059:       if ((children[i].val/totalTime) < THRESHOLD) {
1060:         /* Add them to 'other' if the time is ignored in the output */
1061:         otherPerfInfo.time          += childPerfInfo.time;
1062:         otherPerfInfo.flops         += childPerfInfo.flops;
1063:         otherPerfInfo.numMessages   += childPerfInfo.numMessages;
1064:         otherPerfInfo.messageLength += childPerfInfo.messageLength;
1065:         otherPerfInfo.numReductions += childPerfInfo.numReductions;
1066:       }
1067:     }
1068:   }

1070:   /* Main output for this timer */
1071:   PetscLogNestedTreePrintLine(viewer, myPerfInfo, countsPerCall, parentCount, depth, name, totalTime, &wasPrinted);

1073:   /* Now print the lines for the children */
1074:   if (nChildren > 0) {
1075:     /* Calculate max-times for 'self' and 'other' */
1076:     int            i;
1077:     PetscLogDouble times[2], maxTimes[2];
1078:     times[0] = selfPerfInfo.time;   times[1] = otherPerfInfo.time;
1079:     MPIU_Allreduce(times, maxTimes, 2, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1080:     children[nChildren+0].id = -1;
1081:     children[nChildren+0].val = maxTimes[0];
1082:     children[nChildren+1].id = -2;
1083:     children[nChildren+1].val = maxTimes[1];

1085:     /* Now sort the children (including 'self' and 'other') on total time */
1086:     qsort(children, nChildren+2, sizeof(PetscSortItem), compareSortItems);

1088:     /* Print (or ignore) the children in ascending order of total time */
1089:     PetscViewerXMLStartSection(viewer,"events", NULL);
1090:     for (i=0; i<nChildren+2; i++) {
1091:       if ((children[i].val/totalTime) < THRESHOLD) {
1092:         /* ignored: no output */
1093:       } else if (children[i].id==-1) {
1094:         PetscLogNestedTreePrintLine(viewer, selfPerfInfo, 1, parentCount, depth+1, "self", totalTime, &childWasPrinted);
1095:         if (childWasPrinted) {
1096:           PetscViewerXMLEndSection(viewer,"event");
1097:         }
1098:       } else if (children[i].id==-2) {
1099:         size_t  len;
1100:         char    *otherName;

1102:         PetscStrlen(name,&len);
1103:         PetscMalloc1(len+16,&otherName);
1104:         PetscSNPrintf(otherName,len+16,"%s: other-timed",name);
1105:         PetscLogNestedTreePrintLine(viewer, otherPerfInfo, 1, 1, depth+1, otherName, totalTime, &childWasPrinted);
1106:         PetscFree(otherName);
1107:         if (childWasPrinted) {
1108:           PetscViewerXMLEndSection(viewer,"event");
1109:         }
1110:       } else {
1111:         /* Print the child with a recursive call to this function */
1112:         PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1113:       }
1114:     }
1115:     PetscViewerXMLEndSection(viewer,"events");
1116:     PetscFree(children);
1117:   }

1119:   if (wasPrinted) {
1120:     PetscViewerXMLEndSection(viewer, "event");
1121:   }
1122:   return(0);
1123: }

1125: static PetscErrorCode PetscLogNestedTreePrintTop(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, PetscLogDouble totalTime)
1126: {
1127:   int                i, nChildren;
1128:   PetscSortItem      *children;
1129:   PetscErrorCode     ierr;
1130:   const int          stage = MAINSTAGE;
1131:   PetscStageLog      stageLog;
1132:   PetscEventPerfInfo *eventPerfInfo;
1133:   MPI_Comm           comm;

1136:   PetscObjectGetComm((PetscObject)viewer,&comm);

1138:   /* Look up the PerfInfo */
1139:   PetscLogGetStageLog(&stageLog);
1140:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;

1142:   /* Count the number of child processes, and count total time */
1143:   nChildren = 0;
1144:   for (i=0; i<nTimers; i++)
1145:     if (tree[i].depth==1) nChildren++;

1147:   if (nChildren>0) {
1148:     /* Create an array for the id-s and maxTimes of the children,
1149:      *  leaving 2 spaces for self-time and other-time */
1150:     PetscLogDouble *times, *maxTimes;

1152:     PetscMalloc1(nChildren,&children);
1153:     nChildren = 0;
1154:     for (i=0; i<nTimers; i++) {
1155:       if (tree[i].depth == 1) {
1156:         children[nChildren].id  = i;
1157:         children[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
1158:         nChildren++;
1159:       }
1160:     }

1162:     /* Calculate the children's maximum times, to sort them */
1163:     PetscMalloc1(nChildren,&times);
1164:     for (i=0; i<nChildren; i++) { times[i] = children[i].val; }

1166:     PetscMalloc1(nChildren,&maxTimes);
1167:     MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1168:     PetscFree(times);

1170:     for (i=0; i<nChildren; i++) { children[i].val = maxTimes[i]; }
1171:     PetscFree(maxTimes);

1173:     /* Now sort the children on total time */
1174:     qsort(children, nChildren, sizeof(PetscSortItem), compareSortItems);
1175:     /* Print (or ignore) the children in ascending order of total time */
1176:     PetscViewerXMLStartSection(viewer, "timertree", "Timings tree");
1177:     PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1178:     PetscViewerXMLPutDouble(viewer, "timethreshold", NULL, thresholdTime, "%f");

1180:     for (i=0; i<nChildren; i++) {
1181:       if ((children[i].val/totalTime) < THRESHOLD) {
1182:         /* ignored: no output */
1183:       } else {
1184:         /* Print the child with a recursive call to this function */
1185:         PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1186:       }
1187:     }
1188:     PetscViewerXMLEndSection(viewer, "timertree");
1189:     PetscFree(children);
1190:   }
1191:   return(0);
1192: }

1194: typedef struct {
1195:   char           *name;
1196:   PetscLogDouble time;
1197:   PetscLogDouble flops;
1198:   PetscLogDouble numMessages;
1199:   PetscLogDouble messageLength;
1200:   PetscLogDouble numReductions;
1201: } PetscSelfTimer;

1203: static PetscErrorCode PetscCalcSelfTime(PetscViewer viewer, PetscSelfTimer **p_self, int *p_nstMax)
1204: {
1205:   PetscErrorCode     ierr;
1206:   const int          stage = MAINSTAGE;
1207:   PetscStageLog      stageLog;
1208:   PetscEventRegInfo  *eventRegInfo;
1209:   PetscEventPerfInfo *eventPerfInfo;
1210:   PetscSelfTimer     *selftimes;
1211:   PetscSelfTimer     *totaltimes;
1212:   NestedEventId      *nstEvents;
1213:   int                i, j, maxDefaultTimer;
1214:   NestedEventId      nst;
1215:   PetscLogEvent      dft;
1216:   int                nstMax, nstMax_local;
1217:   MPI_Comm           comm;

1220:   PetscObjectGetComm((PetscObject)viewer,&comm);
1221:   PetscLogGetStageLog(&stageLog);
1222:   eventRegInfo  = stageLog->eventLog->eventInfo;
1223:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;

1225:   /* For each default timer, calculate the (one) nested timer that it corresponds to. */
1226:   maxDefaultTimer =0;
1227:   for (i=0; i<nNestedEvents; i++) {
1228:     int           nParents   = nestedEvents[i].nParents;
1229:     PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1230:     for (j=0; j<nParents; j++) maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);
1231:   }
1232:   PetscMalloc1(maxDefaultTimer+1,&nstEvents);
1233:   for (dft=0; dft<maxDefaultTimer; dft++) {nstEvents[dft] = 0;}
1234:   for (i=0; i<nNestedEvents; i++) {
1235:     int           nParents   = nestedEvents[i].nParents;
1236:     NestedEventId nstEvent   = nestedEvents[i].nstEvent;
1237:     PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1238:     for (j=0; j<nParents; j++) nstEvents[dftEvents[j]] = nstEvent;
1239:   }

1241:   /* Calculate largest nested event-ID */
1242:   nstMax_local = 0;
1243:   for (i=0; i<nNestedEvents; i++) nstMax_local = PetscMax(nestedEvents[i].nstEvent,nstMax_local);
1244:   MPIU_Allreduce(&nstMax_local, &nstMax, 1, MPI_INT, MPI_MAX, comm);

1246:   /* Initialize all total-times with zero */
1247:   PetscMalloc1(nstMax+1,&selftimes);
1248:   PetscMalloc1(nstMax+1,&totaltimes);
1249:   for (nst=0; nst<=nstMax; nst++) {
1250:     totaltimes[nst].time          = 0;
1251:     totaltimes[nst].flops         = 0;
1252:     totaltimes[nst].numMessages   = 0;
1253:     totaltimes[nst].messageLength = 0;
1254:     totaltimes[nst].numReductions = 0;
1255:     totaltimes[nst].name          = NULL;
1256:   }

1258:   /* Calculate total-times */
1259:   for (i=0; i<nNestedEvents; i++) {
1260:     const int            nParents  = nestedEvents[i].nParents;
1261:     const NestedEventId  nstEvent  = nestedEvents[i].nstEvent;
1262:     const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1263:     for (j=0; j<nParents; j++) {
1264:       const PetscLogEvent dftEvent = dftEvents[j];
1265:       totaltimes[nstEvent].time          += eventPerfInfo[dftEvent].time;
1266:       totaltimes[nstEvent].flops         += eventPerfInfo[dftEvent].flops;
1267:       totaltimes[nstEvent].numMessages   += eventPerfInfo[dftEvent].numMessages;
1268:       totaltimes[nstEvent].messageLength += eventPerfInfo[dftEvent].messageLength;
1269:       totaltimes[nstEvent].numReductions += eventPerfInfo[dftEvent].numReductions;
1270:     }
1271:     totaltimes[nstEvent].name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1272:   }

1274:   /* Initialize: self-times := totaltimes */
1275:   for (nst=0; nst<=nstMax; nst++) { selftimes[nst] = totaltimes[nst]; }

1277:   /* Subtract timed subprocesses from self-times */
1278:   for (i=0; i<nNestedEvents; i++) {
1279:     const int           nParents          = nestedEvents[i].nParents;
1280:     const PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
1281:     const NestedEventId *dftParentsSorted = nestedEvents[i].dftParentsSorted;
1282:     for (j=0; j<nParents; j++) {
1283:       if (dftParentsSorted[j] != DFT_ID_AWAKE) {
1284:         const PetscLogEvent dftEvent  = dftEvents[j];
1285:         const NestedEventId nstParent = nstEvents[dftParentsSorted[j]];
1286:         selftimes[nstParent].time          -= eventPerfInfo[dftEvent].time;
1287:         selftimes[nstParent].flops         -= eventPerfInfo[dftEvent].flops;
1288:         selftimes[nstParent].numMessages   -= eventPerfInfo[dftEvent].numMessages;
1289:         selftimes[nstParent].messageLength -= eventPerfInfo[dftEvent].messageLength;
1290:         selftimes[nstParent].numReductions -= eventPerfInfo[dftEvent].numReductions;
1291:       }
1292:     }
1293:   }

1295:   PetscFree(nstEvents);
1296:   PetscFree(totaltimes);

1298:   /* Set outputs */
1299:   *p_self  = selftimes;
1300:   *p_nstMax = nstMax;
1301:   return(0);
1302: }

1304: static PetscErrorCode PetscPrintSelfTime(PetscViewer viewer, const PetscSelfTimer *selftimes, int nstMax, PetscLogDouble totalTime)
1305: {
1306:   PetscErrorCode     ierr;
1307:   int                i;
1308:   NestedEventId      nst;
1309:   PetscSortItem      *sortSelfTimes;
1310:   PetscLogDouble     *times, *maxTimes;
1311:   PetscStageLog      stageLog;
1312:   PetscEventRegInfo  *eventRegInfo;
1313:   const int          dum_depth = 1, dum_count=1, dum_parentcount=1;
1314:   PetscBool          wasPrinted;
1315:   MPI_Comm           comm;

1318:   PetscObjectGetComm((PetscObject)viewer,&comm);
1319:   PetscLogGetStageLog(&stageLog);
1320:   eventRegInfo  = stageLog->eventLog->eventInfo;

1322:   PetscMalloc1(nstMax+1,&times);
1323:   PetscMalloc1(nstMax+1,&maxTimes);
1324:   for (nst=0; nst<=nstMax; nst++) { times[nst] = selftimes[nst].time;}
1325:   MPIU_Allreduce(times, maxTimes, nstMax+1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1326:   PetscFree(times);

1328:   PetscMalloc1(nstMax+1,&sortSelfTimes);

1330:   /* Sort the self-timers on basis of the largest time needed */
1331:   for (nst=0; nst<=nstMax; nst++) {
1332:     sortSelfTimes[nst].id  = nst;
1333:     sortSelfTimes[nst].val = maxTimes[nst];
1334:   }
1335:   PetscFree(maxTimes);
1336:   qsort(sortSelfTimes, nstMax+1, sizeof(PetscSortItem), compareSortItems);

1338:   PetscViewerXMLStartSection(viewer, "selftimertable", "Self-timings");
1339:   PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");

1341:   for (i=0; i<=nstMax; i++) {
1342:     if ((sortSelfTimes[i].val/totalTime) >= THRESHOLD) {
1343:       NestedEventId      nstEvent = sortSelfTimes[i].id;
1344:       const char         *name    = eventRegInfo[(PetscLogEvent)nstEvent].name;
1345:       PetscEventPerfInfo selfPerfInfo;

1347:       selfPerfInfo.time          = selftimes[nstEvent].time ;
1348:       selfPerfInfo.flops         = selftimes[nstEvent].flops;
1349:       selfPerfInfo.numMessages   = selftimes[nstEvent].numMessages;
1350:       selfPerfInfo.messageLength = selftimes[nstEvent].messageLength;
1351:       selfPerfInfo.numReductions = selftimes[nstEvent].numReductions;

1353:       PetscLogNestedTreePrintLine(viewer, selfPerfInfo, dum_count, dum_parentcount, dum_depth, name, totalTime, &wasPrinted);
1354:       if (wasPrinted){
1355:         PetscViewerXMLEndSection(viewer, "event");
1356:       }
1357:     }
1358:   }
1359:   PetscViewerXMLEndSection(viewer, "selftimertable");
1360:   PetscFree(sortSelfTimes);
1361:   return(0);
1362: }

1364: PetscErrorCode PetscLogView_Nested(PetscViewer viewer)
1365: {
1366:   PetscErrorCode       ierr;
1367:   PetscLogDouble       locTotalTime, globTotalTime;
1368:   PetscNestedEventTree *tree = NULL;
1369:   PetscSelfTimer       *selftimers = NULL;
1370:   int                  nTimers = 0, nstMax = 0;
1371:   MPI_Comm             comm;

1374:   PetscObjectGetComm((PetscObject)viewer,&comm);
1375:   PetscViewerInitASCII_XML(viewer);
1376:   PetscViewerASCIIPrintf(viewer, "<!-- PETSc Performance Summary: -->\n");
1377:   PetscViewerXMLStartSection(viewer, "petscroot", NULL);

1379:   /* Get the total elapsed time, local and global maximum */
1380:   PetscTime(&locTotalTime);  locTotalTime -= petsc_BaseTime;
1381:   MPIU_Allreduce(&locTotalTime, &globTotalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);

1383:   /* Print global information about this run */
1384:   PetscPrintExeSpecs(viewer);
1385:   PetscPrintGlobalPerformance(viewer, locTotalTime);

1387:   /* Collect nested timer tree info from all processes */
1388:   PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1389:   PetscLogNestedTreePrintTop(viewer, tree, nTimers, globTotalTime);
1390:   PetscLogNestedTreeDestroy(tree, nTimers);

1392:   /* Calculate self-time for all (not-nested) events */
1393:   PetscCalcSelfTime(viewer, &selftimers, &nstMax);
1394:   PetscPrintSelfTime(viewer, selftimers, nstMax, globTotalTime);
1395:   PetscFree(selftimers);

1397:   PetscViewerXMLEndSection(viewer, "petscroot");
1398:   PetscViewerFinalASCII_XML(viewer);
1399:   return(0);
1400: }

1402: PETSC_EXTERN PetscErrorCode PetscASend(int count, int datatype)
1403: {
1404: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1406: #endif

1409:   petsc_send_ct++;
1410: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1411:   PetscMPITypeSize(&petsc_send_len,count, MPI_Type_f2c((MPI_Fint) datatype));
1412: #endif
1413:   return(0);
1414: }

1416: PETSC_EXTERN PetscErrorCode PetscARecv(int count, int datatype)
1417: {
1418: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1420: #endif

1423:   petsc_recv_ct++;
1424: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1425:   PetscMPITypeSize(&petsc_recv_len,count, MPI_Type_f2c((MPI_Fint) datatype));
1426: #endif
1427:   return(0);
1428: }

1430: PETSC_EXTERN PetscErrorCode PetscAReduce()
1431: {
1433:   petsc_allreduce_ct++;
1434:   return(0);
1435: }

1437: #endif