Actual source code: xmllogevent.c

petsc-3.14.6 2021-03-30
Report Typos and Errors
  1: /*************************************************************************************
  2:  *    M A R I T I M E  R E S E A R C H  I N S T I T U T E  N E T H E R L A N D S     *
  3:  *************************************************************************************
  4:  *    authors: Bas van 't Hof, Koos Huijssen, Christiaan M. Klaij                    *
  5:  *************************************************************************************
  6:  *    content: Support for nested PetscTimers                                        *
  7:  *************************************************************************************/
  8: #include <petsclog.h>
  9: #include <petsc/private/logimpl.h>
 10: #include <petsctime.h>
 11: #include <petscviewer.h>
 12: #include "../src/sys/logging/xmlviewer.h"

 14: #if defined(PETSC_USE_LOG)

 16: /*
 17:  * Support for nested PetscTimers
 18:  *
 19:  * PetscTimers keep track of a lot of useful information: Wall clock times,
 20:  * message passing statistics, flop counts.  Information about the nested structure
 21:  * of the timers is lost. Example:
 22:  *
 23:  * 7:30   Start: awake
 24:  * 7:30      Start: morning routine
 25:  * 7:40         Start: eat
 26:  * 7:49         Done:  eat
 27:  * 7:43      Done:  morning routine
 28:  * 8:15      Start: work
 29:  * 12:15        Start: eat
 30:  * 12:45        Done:  eat
 31:  * 16:00     Done:  work
 32:  * 16:30     Start: evening routine
 33:  * 18:30        Start: eat
 34:  * 19:15        Done:  eat
 35:  * 22:00     Done:  evening routine
 36:  * 22:00  Done:  awake
 37:  *
 38:  * Petsc timers provide the following timer results:
 39:  *
 40:  *    awake:              1 call    14:30 hours
 41:  *    morning routine:    1 call     0:13 hours
 42:  *    eat:                3 calls    1:24 hours
 43:  *    work:               1 call     7:45 hours
 44:  *    evening routine     1 call     5:30 hours
 45:  *
 46:  * Nested timers can be used to get the following table:
 47:  *
 48:  *   [1 call]: awake                14:30 hours
 49:  *   [1 call]:    morning routine         0:13 hours         ( 2 % of awake)
 50:  *   [1 call]:       eat                       0:09 hours         (69 % of morning routine)
 51:  *                   rest (morning routine)    0:04 hours         (31 % of morning routine)
 52:  *   [1 call]:    work                    7:45 hours         (53 % of awake)
 53:  *   [1 call]:       eat                       0:30 hours         ( 6 % of work)
 54:  *                   rest (work)               7:15 hours         (94 % of work)
 55:  *   [1 call]:    evening routine         5:30 hours         (38 % of awake)
 56:  *   [1 call]:       eat                       0:45 hours         (14 % of evening routine)
 57:  *                   rest (evening routine)    4:45 hours         (86 % of morning routine)
 58:  *
 59:  * We ignore the concept of 'stages', because these seem to be conflicting notions, or at least,
 60:  * the nested timers make the stages unnecessary.
 61:  *
 62:  */

 64: /*
 65:  * Data structures for keeping track of nested timers:
 66:  *
 67:  *   nestedEvents: information about the timers that have actually been activated
 68:  *   dftParentActive: if a timer is started now, it is part of (nested inside) the dftParentActive
 69:  *
 70:  * The Default-timers are used to time the nested timers. Every nested timer corresponds to
 71:  * (one or more) default timers, where one of the default timers has the same event-id as the
 72:  * nested one.
 73:  *
 74:  * Because of the risk of confusion between nested timer ids and default timer ids, we
 75:  * introduce a typedef for nested events (NestedEventId) and use the existing type PetscLogEvent
 76:  * only for default events. Also, all nested event variables are prepended with 'nst', and
 77:  * default timers with 'dft'.
 78:  */

 80: #define DFT_ID_AWAKE -1

 82: typedef PetscLogEvent NestedEventId;
 83: typedef struct {
 84:   NestedEventId   nstEvent;         /* event-code for this nested event, argument 'event' in PetscLogEventStartNested */
 85:   int             nParents;         /* number of 'dftParents': the default timer which was the dftParentActive when this nested timer was activated */
 86:   PetscLogEvent  *dftParentsSorted; /* The default timers which were the dftParentActive when this nested event was started */
 87:   PetscLogEvent  *dftEvents;        /* The default timers which represent the different 'instances' of this nested event */

 89:   PetscLogEvent  *dftParents;       /* The default timers which were the dftParentActive when this nested event was started */
 90:   PetscLogEvent  *dftEventsSorted;  /* The default timers which represent the different 'instances' of this nested event */
 91: } PetscNestedEvent;

 93: static PetscLogEvent    dftParentActive        = DFT_ID_AWAKE;
 94: static int              nNestedEvents          = 0;
 95: static int              nNestedEventsAllocated = 0;
 96: static PetscNestedEvent *nestedEvents          = NULL;
 97: static PetscLogDouble   thresholdTime          = 0.01; /* initial value was 0.1 */

 99: #define THRESHOLD (thresholdTime/100.0+1e-12)

101: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
102: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
103: PETSC_INTERN PetscErrorCode PetscLogView_Nested(PetscViewer);


106: /*@C
107:   PetscLogNestedBegin - Turns on nested logging of objects and events. This logs flop
108:   rates and object creation and should not slow programs down too much.

110:   Logically Collective over PETSC_COMM_WORLD

112:   Options Database Keys:
113: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file

115:   Usage:
116: .vb
117:       PetscInitialize(...);
118:       PetscLogNestedBegin();
119:        ... code ...
120:       PetscLogView(viewer);
121:       PetscFinalize();
122: .ve

124:   Level: advanced

126: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin()
127: @*/
128: PetscErrorCode PetscLogNestedBegin(void)
129: {

133:   if (nestedEvents) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"nestedEvents already allocated");

135:   nNestedEventsAllocated = 10;
136:   PetscMalloc1(nNestedEventsAllocated,&nestedEvents);
137:   dftParentActive = DFT_ID_AWAKE;
138:   nNestedEvents =1;

140:   /* 'Awake' is nested event 0. It has no parents */
141:   nestedEvents[0].nstEvent          = 0;
142:   nestedEvents[0].nParents          = 0;
143:   nestedEvents[0].dftParentsSorted  = NULL;
144:   nestedEvents[0].dftEvents         = NULL;
145:   nestedEvents[0].dftParents        = NULL;
146:   nestedEvents[0].dftEventsSorted   = NULL;

148:   PetscLogSet(PetscLogEventBeginNested,PetscLogEventEndNested);
149:   return(0);
150: }

152: /* Delete the data structures for the nested timers */
153: PetscErrorCode PetscLogNestedEnd(void)
154: {
156:   int            i;

159:   if (!nestedEvents) return(0);
160:   for (i=0; i<nNestedEvents; i++) {
161:     PetscFree4(nestedEvents[i].dftParentsSorted,nestedEvents[i].dftEventsSorted,nestedEvents[i].dftParents,nestedEvents[i].dftEvents);
162:   }
163:   PetscFree(nestedEvents);
164:   nestedEvents           = NULL;
165:   nNestedEvents          = 0;
166:   nNestedEventsAllocated = 0;
167:   return(0);
168: }


171: /*
172:  UTILITIES: FIND STUFF IN SORTED ARRAYS

174:     dftIndex - index to be found
175:     dftArray - sorted array of PetscLogEvent-ids
176:     narray - dimension of dftArray
177:     entry - entry in the array where dftIndex may be found;

179:      if dftArray[entry] != dftIndex, then dftIndex is not part of dftArray
180:      In that case, the dftIndex can be inserted at this entry.
181: */
182: static PetscErrorCode PetscLogEventFindDefaultTimer(PetscLogEvent dftIndex,const PetscLogEvent *dftArray,int narray,int *entry)
183: {
185:   if (narray==0 || dftIndex <= dftArray[0]) {
186:     *entry = 0;
187:   } else if (dftIndex > dftArray[narray-1]) {
188:     *entry = narray;
189:   } else {
190:     int ihigh = narray-1, ilow=0;
191:     while (ihigh>ilow) {
192:       const int imiddle = (ihigh+ilow)/2;
193:       if (dftArray[imiddle] > dftIndex) {
194:         ihigh = imiddle;
195:       } else if (dftArray[imiddle]<dftIndex) {
196:         ilow = imiddle+1;
197:       } else {
198:         ihigh = imiddle;
199:         ilow  = imiddle;
200:       }
201:     }
202:     *entry = ihigh;
203:   }
204:   return(0);
205: }

207: /*
208:     Utility: find the nested event with given identification

210:     nstEvent - Nested event to be found
211:     entry - entry in the nestedEvents where nstEvent may be found;

213:     if nestedEvents[entry].nstEvent != nstEvent, then index is not part of iarray
214: */
215: static PetscErrorCode PetscLogEventFindNestedTimer(NestedEventId nstEvent,int *entry)
216: {
218:   if (nNestedEvents==0 || nstEvent <= nestedEvents[0].nstEvent) {
219:     *entry = 0;
220:   } else if (nstEvent > nestedEvents[nNestedEvents-1].nstEvent) {
221:     *entry = nNestedEvents;
222:   } else {
223:     int ihigh = nNestedEvents-1,  ilow = 0;
224:     while (ihigh>ilow) {
225:       const int imiddle = (ihigh+ilow)/2;
226:       if (nestedEvents[imiddle].nstEvent > nstEvent) {
227:         ihigh = imiddle;
228:       } else if (nestedEvents[imiddle].nstEvent<nstEvent) {
229:         ilow = imiddle+1;
230:       } else {
231:         ihigh = imiddle;
232:         ilow  = imiddle;
233:       }
234:     }
235:     *entry = ihigh;
236:   }
237:   return(0);
238: }

240: /*
241:  Nested logging is not prepared yet to support user-defined logging stages, so for now we force logging on the main stage.
242:  Using PetscLogStage{Push/Pop}() would be more appropriate, but these two calls do extra bookkeeping work we don't need.
243: */

245: #define MAINSTAGE 0

247: static PetscLogStage savedStage = 0;

249: PETSC_STATIC_INLINE PetscErrorCode PetscLogStageOverride(void)
250: {
251:   PetscStageLog  stageLog = petsc_stageLog;

255:   if (stageLog->curStage == MAINSTAGE) return(0);
256:   savedStage = stageLog->curStage;
257:   stageLog->curStage = MAINSTAGE;
258:   PetscIntStackPush(stageLog->stack, MAINSTAGE);
259:   return(0);
260: }

262: PETSC_STATIC_INLINE PetscErrorCode PetscLogStageRestore(void)
263: {
264:   PetscStageLog  stageLog = petsc_stageLog;

268:   if (savedStage == MAINSTAGE) return(0);
269:   stageLog->curStage = savedStage;
270:   PetscIntStackPop(stageLog->stack, &savedStage);
271:   return(0);
272: }

274: /******************************************************************************************/
275: /* Start a nested event */
276: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
277: {
278:   PetscErrorCode  ierr;
279:   int             entry, pentry, tentry,i;
280:   PetscLogEvent   dftEvent;

283:   PetscLogEventFindNestedTimer(nstEvent, &entry);
284:   if (entry>=nNestedEvents || nestedEvents[entry].nstEvent != nstEvent) {
285:     /* Nested event doesn't exist yet: create it */

287:     if (nNestedEvents==nNestedEventsAllocated) {
288:       /* Enlarge and re-allocate nestedEvents if needed */
289:       PetscNestedEvent *tmp = nestedEvents;
290:       PetscMalloc1(2*nNestedEvents,&nestedEvents);
291:       nNestedEventsAllocated*=2;
292:       PetscArraycpy(nestedEvents, tmp, nNestedEvents);
293:       PetscFree(tmp);
294:     }

296:     /* Clear space in nestedEvents for new nested event */
297:     nNestedEvents++;
298:     for (i = nNestedEvents-1; i>entry; i--) {
299:       nestedEvents[i] = nestedEvents[i-1];
300:     }

302:     /* Create event in nestedEvents */
303:     nestedEvents[entry].nstEvent = nstEvent;
304:     nestedEvents[entry].nParents=1;
305:     PetscMalloc4(1,&nestedEvents[entry].dftParentsSorted,1,&nestedEvents[entry].dftEventsSorted,1,&nestedEvents[entry].dftParents,1,&nestedEvents[entry].dftEvents);

307:     /* Fill in new event */
308:     pentry = 0;
309:     dftEvent = (PetscLogEvent) nstEvent;

311:     nestedEvents[entry].nstEvent                 = nstEvent;
312:     nestedEvents[entry].dftParents[pentry]       = dftParentActive;
313:     nestedEvents[entry].dftEvents[pentry]        = dftEvent;
314:     nestedEvents[entry].dftParentsSorted[pentry] = dftParentActive;
315:     nestedEvents[entry].dftEventsSorted[pentry]  = dftEvent;

317:   } else {
318:     /* Nested event exists: find current dftParentActive among parents */
319:     PetscLogEvent *dftParentsSorted = nestedEvents[entry].dftParentsSorted;
320:     PetscLogEvent *dftEvents        = nestedEvents[entry].dftEvents;
321:     int           nParents          = nestedEvents[entry].nParents;

323:     PetscLogEventFindDefaultTimer( dftParentActive, dftParentsSorted, nParents, &pentry);

325:     if (pentry>=nParents || dftParentActive != dftParentsSorted[pentry]) {
326:       /* dftParentActive not in the list: add it to the list */
327:       int           i;
328:       PetscLogEvent *dftParents      = nestedEvents[entry].dftParents;
329:       PetscLogEvent *dftEventsSorted = nestedEvents[entry].dftEventsSorted;
330:       char          name[100];

332:       /* Register a new default timer */
333:       sprintf(name, "%d -> %d", (int) dftParentActive, (int) nstEvent);
334:       PetscLogEventRegister(name, 0, &dftEvent);
335:       PetscLogEventFindDefaultTimer( dftEvent, dftEventsSorted, nParents, &tentry);

337:       /* Reallocate parents and dftEvents to make space for new parent */
338:       PetscMalloc4(1+nParents,&nestedEvents[entry].dftParentsSorted,1+nParents,&nestedEvents[entry].dftEventsSorted,1+nParents,&nestedEvents[entry].dftParents,1+nParents,&nestedEvents[entry].dftEvents);
339:       PetscArraycpy(nestedEvents[entry].dftParentsSorted, dftParentsSorted, nParents);
340:       PetscArraycpy(nestedEvents[entry].dftEventsSorted,  dftEventsSorted,  nParents);
341:       PetscArraycpy(nestedEvents[entry].dftParents,       dftParents,       nParents);
342:       PetscArraycpy(nestedEvents[entry].dftEvents,        dftEvents,        nParents);
343:       PetscFree4(dftParentsSorted,dftEventsSorted,dftParents,dftEvents);

345:       dftParents       = nestedEvents[entry].dftParents;
346:       dftEvents        = nestedEvents[entry].dftEvents;
347:       dftParentsSorted = nestedEvents[entry].dftParentsSorted;
348:       dftEventsSorted  = nestedEvents[entry].dftEventsSorted;

350:       nestedEvents[entry].nParents++;
351:       nParents++;

353:       for (i = nParents-1; i>pentry; i--) {
354:         dftParentsSorted[i] = dftParentsSorted[i-1];
355:         dftEvents[i]        = dftEvents[i-1];
356:       }
357:       for (i = nParents-1; i>tentry; i--) {
358:         dftParents[i]      = dftParents[i-1];
359:         dftEventsSorted[i] = dftEventsSorted[i-1];
360:       }

362:       /* Fill in the new default timer */
363:       dftParentsSorted[pentry] = dftParentActive;
364:       dftEvents[pentry]        = dftEvent;
365:       dftParents[tentry]       = dftParentActive;
366:       dftEventsSorted[tentry]  = dftEvent;

368:     } else {
369:       /* dftParentActive was found: find the corresponding default 'dftEvent'-timer */
370:       dftEvent = nestedEvents[entry].dftEvents[pentry];
371:     }
372:   }

374:   /* Start the default 'dftEvent'-timer and update the dftParentActive */
375:   PetscLogStageOverride();
376:   PetscLogEventBeginDefault(dftEvent,t,o1,o2,o3,o4);
377:   PetscLogStageRestore();
378:   dftParentActive = dftEvent;
379:   return(0);
380: }

382: /* End a nested event */
383: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
384: {
385:   PetscErrorCode  ierr;
386:   int             entry, pentry, nParents;
387:   PetscLogEvent  *dftEventsSorted;

390:   /* Find the nested event */
391:   PetscLogEventFindNestedTimer(nstEvent, &entry);
392:   if (entry>=nNestedEvents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d larger than number of events %d",entry,nNestedEvents);
393:   if (nestedEvents[entry].nstEvent != nstEvent) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d had unbalanced begin/end pairs does not match %d",entry,nstEvent);
394:   dftEventsSorted = nestedEvents[entry].dftEventsSorted;
395:   nParents        = nestedEvents[entry].nParents;

397:   /* Find the current default timer among the 'dftEvents' of this event */
398:   PetscLogEventFindDefaultTimer( dftParentActive, dftEventsSorted, nParents, &pentry);

400:   if (pentry>=nParents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Entry %d is larger than number of parents %d",pentry,nParents);
401:   if (dftEventsSorted[pentry] != dftParentActive) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Active parent is %d, but we seem to be closing %d",dftParentActive,dftEventsSorted[pentry]);

403:   /* Stop the default timer and update the dftParentActive */
404:   PetscLogStageOverride();
405:   PetscLogEventEndDefault(dftParentActive,t,o1,o2,o3,o4);
406:   PetscLogStageRestore();
407:   dftParentActive = nestedEvents[entry].dftParents[pentry];
408:   return(0);
409: }

411: /*@
412:    PetscLogSetThreshold - Set the threshold time for logging the events; this is a percentage out of 100, so 1. means any event
413:           that takes 1 or more percent of the time.

415:   Logically Collective over PETSC_COMM_WORLD

417:   Input Parameter:
418: .   newThresh - the threshold to use

420:   Output Parameter:
421: .   oldThresh - the previously set threshold value

423:   Options Database Keys:
424: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file

426:   Usage:
427: .vb
428:       PetscInitialize(...);
429:       PetscLogNestedBegin();
430:       PetscLogSetThreshold(0.1,&oldthresh);
431:        ... code ...
432:       PetscLogView(viewer);
433:       PetscFinalize();
434: .ve

436:   Level: advanced

438: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin(),
439:           PetscLogNestedBegin()
440: @*/
441: PetscErrorCode PetscLogSetThreshold(PetscLogDouble newThresh, PetscLogDouble *oldThresh)
442: {
444:   if (oldThresh) *oldThresh = thresholdTime;
445:   if (newThresh == PETSC_DECIDE)  newThresh = 0.01;
446:   if (newThresh == PETSC_DEFAULT) newThresh = 0.01;
447:   thresholdTime = PetscMax(newThresh, 0.0);
448:   return(0);
449: }

451: static PetscErrorCode PetscPrintExeSpecs(PetscViewer viewer)
452: {
453:   PetscErrorCode     ierr;
454:   char               arch[128],hostname[128],username[128],pname[PETSC_MAX_PATH_LEN],date[128];
455:   char               version[256], buildoptions[128] = "";
456:   PetscMPIInt        size;
457:   size_t             len;

460:   MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
461:   PetscGetArchType(arch,sizeof(arch));
462:   PetscGetHostName(hostname,sizeof(hostname));
463:   PetscGetUserName(username,sizeof(username));
464:   PetscGetProgramName(pname,sizeof(pname));
465:   PetscGetDate(date,sizeof(date));
466:   PetscGetVersion(version,sizeof(version));

468:   PetscViewerXMLStartSection(viewer, "runspecification", "Run Specification");
469:   PetscViewerXMLPutString(   viewer, "executable"  , "Executable"   , pname);
470:   PetscViewerXMLPutString(   viewer, "architecture", "Architecture" , arch);
471:   PetscViewerXMLPutString(   viewer, "hostname"    , "Host"         , hostname);
472:   PetscViewerXMLPutInt(      viewer, "nprocesses"  , "Number of processes", size);
473:   PetscViewerXMLPutString(   viewer, "user"        , "Run by user"  , username);
474:   PetscViewerXMLPutString(   viewer, "date"        , "Started at"   , date);
475:   PetscViewerXMLPutString(   viewer, "petscrelease", "Petsc Release", version);

477:   if (PetscDefined(USE_DEBUG)) {
478:     PetscStrlcat(buildoptions, "Debug ", sizeof(buildoptions));
479:   }
480:   if (PetscDefined(USE_COMPLEX)) {
481:     PetscStrlcat(buildoptions, "Complex ", sizeof(buildoptions));
482:   }
483:   if (PetscDefined(USE_REAL_SINGLE)) {
484:     PetscStrlcat(buildoptions, "Single ", sizeof(buildoptions));
485:   } else if (PetscDefined(USE_REAL___FLOAT128)) {
486:     PetscStrlcat(buildoptions, "Quadruple ", sizeof(buildoptions));
487:   } else if (PetscDefined(USE_REAL___FP16)) {
488:     PetscStrlcat(buildoptions, "Half ", sizeof(buildoptions));
489:   }
490:   if (PetscDefined(USE_64BIT_INDICES)) {
491:     PetscStrlcat(buildoptions, "Int64 ", sizeof(buildoptions));
492:   }
493: #if defined(__cplusplus)
494:   PetscStrlcat(buildoptions, "C++ ", sizeof(buildoptions));
495: #endif
496:   PetscStrlen(buildoptions,&len);
497:   if (len) {
498:     PetscViewerXMLPutString(viewer, "petscbuildoptions", "Petsc build options", buildoptions);
499:   }
500:   PetscViewerXMLEndSection(viewer, "runspecification");
501:   return(0);
502: }

504: /* Print the global performance: max, max/min, average and total of
505:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
506:  */
507: static PetscErrorCode PetscPrintXMLGlobalPerformanceElement(PetscViewer viewer, const char *name, const char *desc, PetscLogDouble local_val, const PetscBool print_average, const PetscBool print_total)
508: {
509:   PetscErrorCode  ierr;
510:   PetscLogDouble  min, tot, ratio, avg;
511:   MPI_Comm        comm;
512:   PetscMPIInt     rank, size;
513:   PetscLogDouble  valrank[2], max[2];

516:   PetscObjectGetComm((PetscObject)viewer,&comm);
517:   MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
518:   MPI_Comm_rank(comm, &rank);

520:   valrank[0] = local_val;
521:   valrank[1] = (PetscLogDouble) rank;
522:   MPIU_Allreduce(&local_val, &min, 1, MPIU_PETSCLOGDOUBLE,  MPI_MIN,    comm);
523:   MPIU_Allreduce(valrank,    &max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
524:   MPIU_Allreduce(&local_val, &tot, 1, MPIU_PETSCLOGDOUBLE,  MPI_SUM,    comm);
525:   avg  = tot/((PetscLogDouble) size);
526:   if (min != 0.0) ratio = max[0]/min;
527:   else ratio = 0.0;

529:   PetscViewerXMLStartSection(viewer, name, desc);
530:   PetscViewerXMLPutDouble(viewer, "max", NULL, max[0], "%e");
531:   PetscViewerXMLPutInt(   viewer, "maxrank"  , "rank at which max was found" , (PetscMPIInt) max[1]);
532:   PetscViewerXMLPutDouble(viewer, "ratio", NULL, ratio, "%f");
533:   if (print_average) {
534:     PetscViewerXMLPutDouble(viewer, "average", NULL, avg, "%e");
535:   }
536:   if (print_total) {
537:     PetscViewerXMLPutDouble(viewer, "total", NULL, tot, "%e");
538:   }
539:   PetscViewerXMLEndSection(viewer, name);
540:   return(0);
541: }

543: /* Print the global performance: max, max/min, average and total of
544:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
545:  */
546: static PetscErrorCode PetscPrintGlobalPerformance(PetscViewer viewer, PetscLogDouble locTotalTime)
547: {
548:   PetscErrorCode  ierr;
549:   PetscLogDouble  flops, mem, red, mess;
550:   const PetscBool print_total_yes   = PETSC_TRUE,
551:                   print_total_no    = PETSC_FALSE,
552:                   print_average_no  = PETSC_FALSE,
553:                   print_average_yes = PETSC_TRUE;

556:   /* Must preserve reduction count before we go on */
557:   red = petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;

559:   /* Calculate summary information */
560:   PetscViewerXMLStartSection(viewer, "globalperformance", "Global performance");

562:   /*   Time */
563:   PetscPrintXMLGlobalPerformanceElement(viewer, "time", "Time (sec)", locTotalTime, print_average_yes, print_total_no);

565:   /*   Objects */
566:   PetscPrintXMLGlobalPerformanceElement(viewer, "objects", "Objects", (PetscLogDouble) petsc_numObjects, print_average_yes, print_total_no);

568:   /*   Flop */
569:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflop", "MFlop", petsc_TotalFlops/1.0E6, print_average_yes, print_total_yes);

571:   /*   Flop/sec -- Must talk to Barry here */
572:   if (locTotalTime != 0.0) flops = petsc_TotalFlops/locTotalTime;
573:   else flops = 0.0;
574:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflops", "MFlop/sec", flops/1.0E6, print_average_yes, print_total_yes);

576:   /*   Memory */
577:   PetscMallocGetMaximumUsage(&mem);
578:   if (mem > 0.0) {
579:     PetscPrintXMLGlobalPerformanceElement(viewer, "memory", "Memory (MiB)", mem/1024.0/1024.0, print_average_yes, print_total_yes);
580:   }
581:   /*   Messages */
582:   mess = 0.5*(petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct);
583:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagetransfers", "MPI Message Transfers", mess, print_average_yes, print_total_yes);

585:   /*   Message Volume */
586:   mess = 0.5*(petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len);
587:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagevolume", "MPI Message Volume (MiB)", mess/1024.0/1024.0, print_average_yes, print_total_yes);

589:   /*   Reductions */
590:   PetscPrintXMLGlobalPerformanceElement(viewer, "reductions", "MPI Reductions", red , print_average_no, print_total_no);
591:   PetscViewerXMLEndSection(viewer, "globalperformance");
592:   return(0);
593: }

595: typedef struct {
596:   PetscLogEvent  dftEvent;
597:   NestedEventId  nstEvent;
598:   PetscLogEvent  dftParent;
599:   NestedEventId  nstParent;
600:   PetscBool      own;
601:   int            depth;
602:   NestedEventId* nstPath;
603: } PetscNestedEventTree;

605: /* Compare timers to sort them in the tree */
606: static int compareTreeItems(const void *item1_, const void *item2_)
607: {
608:   int                  i;
609:   PetscNestedEventTree *item1 = (PetscNestedEventTree *) item1_;
610:   PetscNestedEventTree *item2 = (PetscNestedEventTree *) item2_;

612:   for (i=0; i<PetscMin(item1->depth,item2->depth); i++) {
613:     if (item1->nstPath[i]<item2->nstPath[i]) return -1;
614:     if (item1->nstPath[i]>item2->nstPath[i]) return +1;
615:   }
616:   if (item1->depth < item2->depth) return -1;
617:   if (item1->depth > item2->depth) return 1;
618:   return 0;
619: }
620: /*
621:  * Do MPI communication to get the complete, nested calling tree for all processes: there may be
622:  * calls that happen in some processes, but not in others.
623:  *
624:  * The output, tree[nTimers] is an array of PetscNestedEventTree-structs.
625:  * The tree is sorted so that the timers can be printed in the order of appearance.
626:  *
627:  * For tree-items which appear in the trees of multiple processes (which will be most items), the
628:  * following rule is followed:
629:  * + if information from my own process is available, then that is the information stored in tree.
630:  *   otherwise it is some other process's information.
631:  */
632: static PetscErrorCode PetscLogNestedTreeCreate(PetscViewer viewer, PetscNestedEventTree **p_tree, int *p_nTimers)
633: {
634:   PetscNestedEventTree *tree = NULL, *newTree;
635:   int                  *treeIndices;
636:   int                  nTimers, totalNTimers, i, j, iTimer0, maxDefaultTimer;
637:   int                  yesno;
638:   PetscBool            done;
639:   PetscErrorCode       ierr;
640:   int                  maxdepth;
641:   int                  depth;
642:   int                  illegalEvent;
643:   int                  iextra;
644:   NestedEventId        *nstPath, *nstMyPath;
645:   MPI_Comm             comm;

648:   PetscObjectGetComm((PetscObject)viewer,&comm);

650:   /* Calculate memory needed to store everybody's information and allocate tree */
651:   nTimers = 0;
652:   for (i=0; i<nNestedEvents; i++) nTimers += nestedEvents[i].nParents;

654:   PetscMalloc1(nTimers,&tree);

656:   /* Fill tree with readily available information */
657:   iTimer0 = 0;
658:   maxDefaultTimer =0;
659:   for (i=0; i<nNestedEvents; i++) {
660:     int           nParents          = nestedEvents[i].nParents;
661:     NestedEventId nstEvent          = nestedEvents[i].nstEvent;
662:     PetscLogEvent *dftParentsSorted = nestedEvents[i].dftParentsSorted;
663:     PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
664:     for (j=0; j<nParents; j++) {
665:       maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);

667:       tree[iTimer0+j].dftEvent   = dftEvents[j];
668:       tree[iTimer0+j].nstEvent   = nstEvent;
669:       tree[iTimer0+j].dftParent  = dftParentsSorted[j];
670:       tree[iTimer0+j].own        = PETSC_TRUE;

672:       tree[iTimer0+j].nstParent  = 0;
673:       tree[iTimer0+j].depth      = 0;
674:       tree[iTimer0+j].nstPath    = NULL;
675:     }
676:     iTimer0 += nParents;
677:   }

679:   /* Calculate the global maximum for the default timer index, so array treeIndices can
680:    * be allocated only once */
681:   MPIU_Allreduce(&maxDefaultTimer, &j, 1, MPI_INT, MPI_MAX, comm);
682:   maxDefaultTimer = j;

684:   /* Find default timer's place in the tree */
685:   PetscCalloc1(maxDefaultTimer+1,&treeIndices);
686:   treeIndices[0] = 0;
687:   for (i=0; i<nTimers; i++) {
688:     PetscLogEvent dftEvent = tree[i].dftEvent;
689:     treeIndices[dftEvent] = i;
690:   }

692:   /* Find each dftParent's nested identification */
693:   for (i=0; i<nTimers; i++) {
694:     PetscLogEvent dftParent = tree[i].dftParent;
695:     if (dftParent!= DFT_ID_AWAKE) {
696:       int j = treeIndices[dftParent];
697:       tree[i].nstParent = tree[j].nstEvent;
698:     }
699:   }

701:   /* Find depths for each timer path */
702:   done = PETSC_FALSE;
703:   maxdepth = 0;
704:   while (!done) {
705:     done = PETSC_TRUE;
706:     for (i=0; i<nTimers; i++) {
707:       if (tree[i].dftParent == DFT_ID_AWAKE) {
708:         tree[i].depth = 1;
709:         maxdepth = PetscMax(1,maxdepth);
710:       } else {
711:         int j = treeIndices[tree[i].dftParent];
712:         depth = 1+tree[j].depth;
713:         if (depth>tree[i].depth) {
714:           done          = PETSC_FALSE;
715:           tree[i].depth = depth;
716:           maxdepth      = PetscMax(depth,maxdepth);
717:         }
718:       }
719:     }
720:   }

722:   /* Allocate the paths in the entire tree */
723:   for (i=0; i<nTimers; i++) {
724:     depth = tree[i].depth;
725:     PetscCalloc1(depth,&tree[i].nstPath);
726:   }

728:   /* Calculate the paths for all timers */
729:   for (depth=1; depth<=maxdepth; depth++) {
730:     for (i=0; i<nTimers; i++) {
731:       if (tree[i].depth==depth) {
732:         if (depth>1) {
733:           int    j = treeIndices[tree[i].dftParent];
734:           PetscArraycpy(tree[i].nstPath,tree[j].nstPath,depth-1);
735:         }
736:         tree[i].nstPath[depth-1] = tree[i].nstEvent;
737:       }
738:     }
739:   }
740:   PetscFree(treeIndices);

742:   /* Sort the tree on basis of the paths */
743:   qsort(tree, nTimers, sizeof(PetscNestedEventTree), compareTreeItems);

745:   /* Allocate an array to store paths */
746:   depth = maxdepth;
747:   MPIU_Allreduce(&depth, &maxdepth, 1, MPI_INT, MPI_MAX, comm);
748:   PetscMalloc1(maxdepth+1, &nstPath);
749:   PetscMalloc1(maxdepth+1, &nstMyPath);

751:   /* Find an illegal nested event index (1+largest nested event index) */
752:   illegalEvent = 1+nestedEvents[nNestedEvents-1].nstEvent;
753:   i = illegalEvent;
754:   MPIU_Allreduce(&i, &illegalEvent, 1, MPI_INT, MPI_MAX, comm);

756:   /* First, detect timers which are not available in this process, but are available in others
757:    *        Allocate a new tree, that can contain all timers
758:    * Then,  fill the new tree with all (own and not-own) timers */
759:   newTree= NULL;
760:   for (yesno=0; yesno<=1; yesno++) {
761:     depth  = 1;
762:     i      = 0;
763:     iextra = 0;
764:     while (depth>0) {
765:       int       j;
766:       PetscBool same;

768:       /* Construct the next path in this process's tree:
769:        * if necessary, supplement with invalid path entries */
770:       depth++;
771:       if (depth > maxdepth + 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Depth %d > maxdepth+1 %d",depth,maxdepth+1);
772:       if (i<nTimers) {
773:         for (j=0;             j<tree[i].depth; j++) nstMyPath[j] = tree[i].nstPath[j];
774:         for (j=tree[i].depth; j<depth;         j++) nstMyPath[j] = illegalEvent;
775:       } else {
776:         for (j=0;             j<depth;         j++) nstMyPath[j] = illegalEvent;
777:       }

779:       /* Communicate with other processes to obtain the next path and its depth */
780:       MPIU_Allreduce(nstMyPath, nstPath, depth, MPI_INT, MPI_MIN, comm);
781:       for (j=depth-1; (int) j>=0; j--) {
782:         if (nstPath[j]==illegalEvent) depth=j;
783:       }

785:       if (depth>0) {
786:         /* If the path exists */

788:         /* check whether the next path is the same as this process's next path */
789:         same = PETSC_TRUE;
790:         for (j=0; same && j<depth; j++) { same = (same &&  nstMyPath[j] == nstPath[j]) ? PETSC_TRUE : PETSC_FALSE;}

792:         if (same) {
793:           /* Register 'own path' */
794:           if (newTree) newTree[i+iextra] = tree[i];
795:           i++;
796:         } else {
797:           /* Register 'not an own path' */
798:           if (newTree) {
799:             newTree[i+iextra].nstEvent   = nstPath[depth-1];
800:             newTree[i+iextra].own        = PETSC_FALSE;
801:             newTree[i+iextra].depth      = depth;
802:             PetscMalloc1(depth, &newTree[i+iextra].nstPath);
803:             for (j=0; j<depth; j++) {newTree[i+iextra].nstPath[j] = nstPath[j];}

805:             newTree[i+iextra].dftEvent  = 0;
806:             newTree[i+iextra].dftParent = 0;
807:             newTree[i+iextra].nstParent = 0;
808:           }
809:           iextra++;
810:         }

812:       }
813:     }

815:     /* Determine the size of the complete tree (with own and not-own timers) and allocate the new tree */
816:     totalNTimers = nTimers + iextra;
817:     if (!newTree) {
818:       PetscMalloc1(totalNTimers, &newTree);
819:     }
820:   }
821:   PetscFree(nstPath);
822:   PetscFree(nstMyPath);
823:   PetscFree(tree);
824:   tree = newTree;
825:   newTree = NULL;

827:   /* Set return value and return */
828:   *p_tree    = tree;
829:   *p_nTimers = totalNTimers;
830:   return(0);
831: }

833: /*
834:  * Delete the nested timer tree
835:  */
836: static PetscErrorCode PetscLogNestedTreeDestroy(PetscNestedEventTree *tree, int nTimers)
837: {
838:   int             i;
839:   PetscErrorCode  ierr;

842:   for (i=0; i<nTimers; i++) {
843:     PetscFree(tree[i].nstPath);
844:   }
845:   PetscFree(tree);
846:   return(0);
847: }

849: /* Print the global performance: max, max/min, average and total of
850:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
851:  */
852: static PetscErrorCode PetscPrintXMLNestedLinePerfResults(PetscViewer viewer,const char *name,PetscLogDouble value,PetscLogDouble minthreshold,PetscLogDouble maxthreshold,PetscLogDouble minmaxtreshold)
853: {
854:   MPI_Comm       comm;                          /* MPI communicator in reduction */
855:   PetscMPIInt    rank;                          /* rank of this process */
856:   PetscLogDouble val_in[2], max[2], min[2];
857:   PetscLogDouble minvalue, maxvalue, tot;
858:   PetscMPIInt    size;
859:   PetscMPIInt    minLoc, maxLoc;

863:   PetscObjectGetComm((PetscObject)viewer,&comm);
864:   MPI_Comm_size(comm, &size);
865:   MPI_Comm_rank(comm, &rank);
866:   val_in[0] = value;
867:   val_in[1] = (PetscLogDouble) rank;
868:   MPIU_Allreduce(val_in, max,  1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
869:   MPIU_Allreduce(val_in, min,  1, MPIU_2PETSCLOGDOUBLE, MPI_MINLOC, comm);
870:   maxvalue = max[0];
871:   maxLoc   = (PetscMPIInt) max[1];
872:   minvalue = min[0];
873:   minLoc   = (PetscMPIInt) min[1];
874:   MPIU_Allreduce(&value, &tot, 1, MPIU_PETSCLOGDOUBLE,  MPI_SUM,    comm);

876:   if (maxvalue<maxthreshold && minvalue>=minthreshold) {
877:     /* One call per parent or NO value: don't print */
878:   } else {
879:      PetscViewerXMLStartSection(viewer, name, NULL);
880:      if (maxvalue>minvalue*minmaxtreshold) {
881:        PetscViewerXMLPutDouble(viewer, "avgvalue", NULL, tot/size, "%g");
882:        PetscViewerXMLPutDouble(viewer, "minvalue", NULL, minvalue, "%g");
883:        PetscViewerXMLPutDouble(viewer, "maxvalue", NULL, maxvalue, "%g");
884:        PetscViewerXMLPutInt(   viewer, "minloc"  , NULL, minLoc);
885:        PetscViewerXMLPutInt(   viewer, "maxloc"  , NULL, maxLoc);
886:      } else {
887:        PetscViewerXMLPutDouble(viewer, "value", NULL, tot/size, "%g");
888:      }
889:      PetscViewerXMLEndSection(viewer, name);
890:   }
891:   return(0);
892: }

894: #define N_COMM 8
895: static PetscErrorCode PetscLogNestedTreePrintLine(PetscViewer viewer,PetscEventPerfInfo perfInfo,PetscLogDouble countsPerCall,int parentCount,int depth,const char *name,PetscLogDouble totalTime,PetscBool *isPrinted)
896: {
897:   PetscLogDouble time = perfInfo.time;
898:   PetscLogDouble timeMx;
900:   MPI_Comm       comm;

903:   PetscObjectGetComm((PetscObject)viewer,&comm);
904:   MPIU_Allreduce(&time, &timeMx, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
905:   *isPrinted = ((timeMx/totalTime) >= THRESHOLD) ? PETSC_TRUE : PETSC_FALSE;
906:   if (*isPrinted) {
907:     PetscViewerXMLStartSection(viewer, "event", NULL);
908:     PetscViewerXMLPutString(viewer, "name", NULL, name);
909:     PetscPrintXMLNestedLinePerfResults(viewer, "time", time/totalTime*100.0, 0, 0, 1.02);
910:     PetscPrintXMLNestedLinePerfResults(viewer, "ncalls", parentCount>0 ? countsPerCall : 0, 0.99, 1.01, 1.02);
911:     PetscPrintXMLNestedLinePerfResults(viewer, "mflops", time>=timeMx*0.001 ? 1e-6*perfInfo.flops/time : 0, 0, 0.01, 1.05);
912:     PetscPrintXMLNestedLinePerfResults(viewer, "mbps",time>=timeMx*0.001 ? perfInfo.messageLength/(1024*1024*time) : 0, 0, 0.01, 1.05);
913:     PetscPrintXMLNestedLinePerfResults(viewer, "nreductsps", time>=timeMx*0.001 ? perfInfo.numReductions/time : 0, 0, 0.01, 1.05);
914:   }
915:   return(0);
916: }

918: /* Count the number of times the parent event was called */

920: static int countParents( const PetscNestedEventTree *tree, PetscEventPerfInfo *eventPerfInfo, int i)
921: {
922:   if (tree[i].depth<=1) {
923:     return 1;  /* Main event: only once */
924:   } else if (!tree[i].own) {
925:     return 1;  /* This event didn't happen in this process, but did in another */
926:   } else {
927:     int iParent;
928:     for (iParent=i-1; iParent>=0; iParent--) {
929:       if (tree[iParent].depth == tree[i].depth-1) break;
930:     }
931:     if (tree[iParent].depth != tree[i].depth-1) {
932:       /* *****  Internal error: cannot find parent */
933:       return -2;
934:     } else {
935:       PetscLogEvent dftEvent  = tree[iParent].dftEvent;
936:       return eventPerfInfo[dftEvent].count;
937:     }
938:   }
939: }

941: typedef struct {
942:   int             id;
943:   PetscLogDouble  val;
944: } PetscSortItem;

946: static int compareSortItems(const void *item1_, const void *item2_)
947: {
948:   PetscSortItem *item1 = (PetscSortItem *) item1_;
949:   PetscSortItem *item2 = (PetscSortItem *) item2_;
950:   if (item1->val > item2->val) return -1;
951:   if (item1->val < item2->val) return +1;
952:   return 0;
953: }

955: static PetscErrorCode PetscLogNestedTreePrint(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, int iStart, PetscLogDouble totalTime)
956: {
957:   int                depth = tree[iStart].depth;
958:   const char         *name;
959:   int                parentCount, nChildren;
960:   PetscSortItem      *children;
961:   PetscErrorCode     ierr;
962:   const int          stage = MAINSTAGE;
963:   PetscStageLog      stageLog;
964:   PetscEventRegInfo  *eventRegInfo;
965:   PetscEventPerfInfo *eventPerfInfo;
966:   PetscEventPerfInfo myPerfInfo,  otherPerfInfo, selfPerfInfo;
967:   PetscLogDouble     countsPerCall;
968:   PetscBool          wasPrinted;
969:   PetscBool          childWasPrinted;
970:   MPI_Comm           comm;

973:   /* Look up the name of the event and its PerfInfo */
974:   PetscLogGetStageLog(&stageLog);
975:   eventRegInfo  = stageLog->eventLog->eventInfo;
976:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
977:   name = eventRegInfo[(PetscLogEvent)tree[iStart].nstEvent].name;
978:   PetscObjectGetComm((PetscObject)viewer,&comm);

980:   /* Count the number of child processes */
981:   nChildren = 0;
982:   {
983:     int i;
984:     for (i=iStart+1; i<nTimers; i++) {
985:       if (tree[i].depth <= depth) break;
986:       if (tree[i].depth == depth + 1) nChildren++;
987:     }
988:   }

990:   if (nChildren>0) {
991:     /* Create an array for the id-s and maxTimes of the children,
992:      *  leaving 2 spaces for self-time and other-time */
993:     int            i;
994:     PetscLogDouble *times, *maxTimes;

996:     PetscMalloc1(nChildren+2,&children);
997:     nChildren = 0;
998:     for (i=iStart+1; i<nTimers; i++) {
999:       if (tree[i].depth<=depth) break;
1000:       if (tree[i].depth == depth + 1) {
1001:         children[nChildren].id  = i;
1002:         children[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
1003:         nChildren++;
1004:       }
1005:     }

1007:     /* Calculate the children's maximum times, to see whether children will be ignored or printed */
1008:     PetscMalloc1(nChildren,&times);
1009:     for (i=0; i<nChildren; i++) { times[i] = children[i].val; }

1011:     PetscMalloc1(nChildren,&maxTimes);
1012:     MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1013:     PetscFree(times);

1015:     for (i=0; i<nChildren; i++) { children[i].val = maxTimes[i]; }
1016:     PetscFree(maxTimes);
1017:   }

1019:   if (!tree[iStart].own) {
1020:   /* Set values for a timer that was not activated in this process
1021:    * (but was, in other processes of this run) */
1022:     PetscMemzero(&myPerfInfo,sizeof(myPerfInfo));

1024:     selfPerfInfo  = myPerfInfo;
1025:     otherPerfInfo = myPerfInfo;

1027:     parentCount   = 1;
1028:     countsPerCall = 0;
1029:   } else {
1030:   /* Set the values for a timer that was activated in this process */
1031:     int           i;
1032:     PetscLogEvent dftEvent   = tree[iStart].dftEvent;

1034:     parentCount    = countParents( tree, eventPerfInfo, iStart);
1035:     myPerfInfo     = eventPerfInfo[dftEvent];
1036:     countsPerCall  = (PetscLogDouble) myPerfInfo.count / (PetscLogDouble) parentCount;

1038:     selfPerfInfo                = myPerfInfo;
1039:     otherPerfInfo.time          = 0;
1040:     otherPerfInfo.flops         = 0;
1041:     otherPerfInfo.numMessages   = 0;
1042:     otherPerfInfo.messageLength = 0;
1043:     otherPerfInfo.numReductions = 0;

1045:     for (i=0; i<nChildren; i++) {
1046:       /* For all child counters: subtract the child values from self-timers */

1048:       PetscLogEvent      dftChild  = tree[children[i].id].dftEvent;
1049:       PetscEventPerfInfo childPerfInfo = eventPerfInfo[dftChild];

1051:       selfPerfInfo.time          -= childPerfInfo.time;
1052:       selfPerfInfo.flops         -= childPerfInfo.flops;
1053:       selfPerfInfo.numMessages   -= childPerfInfo.numMessages;
1054:       selfPerfInfo.messageLength -= childPerfInfo.messageLength;
1055:       selfPerfInfo.numReductions -= childPerfInfo.numReductions;

1057:       if ((children[i].val/totalTime) < THRESHOLD) {
1058:         /* Add them to 'other' if the time is ignored in the output */
1059:         otherPerfInfo.time          += childPerfInfo.time;
1060:         otherPerfInfo.flops         += childPerfInfo.flops;
1061:         otherPerfInfo.numMessages   += childPerfInfo.numMessages;
1062:         otherPerfInfo.messageLength += childPerfInfo.messageLength;
1063:         otherPerfInfo.numReductions += childPerfInfo.numReductions;
1064:       }
1065:     }
1066:   }

1068:   /* Main output for this timer */
1069:   PetscLogNestedTreePrintLine(viewer, myPerfInfo, countsPerCall, parentCount, depth, name, totalTime, &wasPrinted);

1071:   /* Now print the lines for the children */
1072:   if (nChildren > 0) {
1073:     /* Calculate max-times for 'self' and 'other' */
1074:     int            i;
1075:     PetscLogDouble times[2], maxTimes[2];
1076:     times[0] = selfPerfInfo.time;   times[1] = otherPerfInfo.time;
1077:     MPIU_Allreduce(times, maxTimes, 2, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1078:     children[nChildren+0].id = -1;
1079:     children[nChildren+0].val = maxTimes[0];
1080:     children[nChildren+1].id = -2;
1081:     children[nChildren+1].val = maxTimes[1];

1083:     /* Now sort the children (including 'self' and 'other') on total time */
1084:     qsort(children, nChildren+2, sizeof(PetscSortItem), compareSortItems);

1086:     /* Print (or ignore) the children in ascending order of total time */
1087:     PetscViewerXMLStartSection(viewer,"events", NULL);
1088:     for (i=0; i<nChildren+2; i++) {
1089:       if ((children[i].val/totalTime) < THRESHOLD) {
1090:         /* ignored: no output */
1091:       } else if (children[i].id==-1) {
1092:         PetscLogNestedTreePrintLine(viewer, selfPerfInfo, 1, parentCount, depth+1, "self", totalTime, &childWasPrinted);
1093:         if (childWasPrinted) {
1094:           PetscViewerXMLEndSection(viewer,"event");
1095:         }
1096:       } else if (children[i].id==-2) {
1097:         size_t  len;
1098:         char    *otherName;

1100:         PetscStrlen(name,&len);
1101:         PetscMalloc1(len+16,&otherName);
1102:         PetscSNPrintf(otherName,len+16,"%s: other-timed",name);
1103:         PetscLogNestedTreePrintLine(viewer, otherPerfInfo, 1, 1, depth+1, otherName, totalTime, &childWasPrinted);
1104:         PetscFree(otherName);
1105:         if (childWasPrinted) {
1106:           PetscViewerXMLEndSection(viewer,"event");
1107:         }
1108:       } else {
1109:         /* Print the child with a recursive call to this function */
1110:         PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1111:       }
1112:     }
1113:     PetscViewerXMLEndSection(viewer,"events");
1114:     PetscFree(children);
1115:   }

1117:   if (wasPrinted) {
1118:     PetscViewerXMLEndSection(viewer, "event");
1119:   }
1120:   return(0);
1121: }

1123: static PetscErrorCode PetscLogNestedTreePrintTop(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, PetscLogDouble totalTime)
1124: {
1125:   int                i, nChildren;
1126:   PetscSortItem      *children;
1127:   PetscErrorCode     ierr;
1128:   const int          stage = MAINSTAGE;
1129:   PetscStageLog      stageLog;
1130:   PetscEventPerfInfo *eventPerfInfo;
1131:   MPI_Comm           comm;

1134:   PetscObjectGetComm((PetscObject)viewer,&comm);

1136:   /* Look up the PerfInfo */
1137:   PetscLogGetStageLog(&stageLog);
1138:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;

1140:   /* Count the number of child processes, and count total time */
1141:   nChildren = 0;
1142:   for (i=0; i<nTimers; i++)
1143:     if (tree[i].depth==1) nChildren++;

1145:   if (nChildren>0) {
1146:     /* Create an array for the id-s and maxTimes of the children,
1147:      *  leaving 2 spaces for self-time and other-time */
1148:     PetscLogDouble *times, *maxTimes;

1150:     PetscMalloc1(nChildren,&children);
1151:     nChildren = 0;
1152:     for (i=0; i<nTimers; i++) {
1153:       if (tree[i].depth == 1) {
1154:         children[nChildren].id  = i;
1155:         children[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
1156:         nChildren++;
1157:       }
1158:     }

1160:     /* Calculate the children's maximum times, to sort them */
1161:     PetscMalloc1(nChildren,&times);
1162:     for (i=0; i<nChildren; i++) { times[i] = children[i].val; }

1164:     PetscMalloc1(nChildren,&maxTimes);
1165:     MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1166:     PetscFree(times);

1168:     for (i=0; i<nChildren; i++) { children[i].val = maxTimes[i]; }
1169:     PetscFree(maxTimes);

1171:     /* Now sort the children on total time */
1172:     qsort(children, nChildren, sizeof(PetscSortItem), compareSortItems);
1173:     /* Print (or ignore) the children in ascending order of total time */
1174:     PetscViewerXMLStartSection(viewer, "timertree", "Timings tree");
1175:     PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1176:     PetscViewerXMLPutDouble(viewer, "timethreshold", NULL, thresholdTime, "%f");

1178:     for (i=0; i<nChildren; i++) {
1179:       if ((children[i].val/totalTime) < THRESHOLD) {
1180:         /* ignored: no output */
1181:       } else {
1182:         /* Print the child with a recursive call to this function */
1183:         PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1184:       }
1185:     }
1186:     PetscViewerXMLEndSection(viewer, "timertree");
1187:     PetscFree(children);
1188:   }
1189:   return(0);
1190: }

1192: typedef struct {
1193:   char           *name;
1194:   PetscLogDouble time;
1195:   PetscLogDouble flops;
1196:   PetscLogDouble numMessages;
1197:   PetscLogDouble messageLength;
1198:   PetscLogDouble numReductions;
1199: } PetscSelfTimer;

1201: static PetscErrorCode PetscCalcSelfTime(PetscViewer viewer, PetscSelfTimer **p_self, int *p_nstMax)
1202: {
1203:   PetscErrorCode     ierr;
1204:   const int          stage = MAINSTAGE;
1205:   PetscStageLog      stageLog;
1206:   PetscEventRegInfo  *eventRegInfo;
1207:   PetscEventPerfInfo *eventPerfInfo;
1208:   PetscSelfTimer     *selftimes;
1209:   PetscSelfTimer     *totaltimes;
1210:   NestedEventId      *nstEvents;
1211:   int                i, j, maxDefaultTimer;
1212:   NestedEventId      nst;
1213:   PetscLogEvent      dft;
1214:   int                nstMax, nstMax_local;
1215:   MPI_Comm           comm;

1218:   PetscObjectGetComm((PetscObject)viewer,&comm);
1219:   PetscLogGetStageLog(&stageLog);
1220:   eventRegInfo  = stageLog->eventLog->eventInfo;
1221:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;

1223:   /* For each default timer, calculate the (one) nested timer that it corresponds to. */
1224:   maxDefaultTimer =0;
1225:   for (i=0; i<nNestedEvents; i++) {
1226:     int           nParents   = nestedEvents[i].nParents;
1227:     PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1228:     for (j=0; j<nParents; j++) maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);
1229:   }
1230:   PetscMalloc1(maxDefaultTimer+1,&nstEvents);
1231:   for (dft=0; dft<maxDefaultTimer; dft++) {nstEvents[dft] = 0;}
1232:   for (i=0; i<nNestedEvents; i++) {
1233:     int           nParents   = nestedEvents[i].nParents;
1234:     NestedEventId nstEvent   = nestedEvents[i].nstEvent;
1235:     PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1236:     for (j=0; j<nParents; j++) nstEvents[dftEvents[j]] = nstEvent;
1237:   }

1239:   /* Calculate largest nested event-ID */
1240:   nstMax_local = 0;
1241:   for (i=0; i<nNestedEvents; i++) nstMax_local = PetscMax(nestedEvents[i].nstEvent,nstMax_local);
1242:   MPIU_Allreduce(&nstMax_local, &nstMax, 1, MPI_INT, MPI_MAX, comm);

1244:   /* Initialize all total-times with zero */
1245:   PetscMalloc1(nstMax+1,&selftimes);
1246:   PetscMalloc1(nstMax+1,&totaltimes);
1247:   for (nst=0; nst<=nstMax; nst++) {
1248:     totaltimes[nst].time          = 0;
1249:     totaltimes[nst].flops         = 0;
1250:     totaltimes[nst].numMessages   = 0;
1251:     totaltimes[nst].messageLength = 0;
1252:     totaltimes[nst].numReductions = 0;
1253:     totaltimes[nst].name          = NULL;
1254:   }

1256:   /* Calculate total-times */
1257:   for (i=0; i<nNestedEvents; i++) {
1258:     const int            nParents  = nestedEvents[i].nParents;
1259:     const NestedEventId  nstEvent  = nestedEvents[i].nstEvent;
1260:     const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1261:     for (j=0; j<nParents; j++) {
1262:       const PetscLogEvent dftEvent = dftEvents[j];
1263:       totaltimes[nstEvent].time          += eventPerfInfo[dftEvent].time;
1264:       totaltimes[nstEvent].flops         += eventPerfInfo[dftEvent].flops;
1265:       totaltimes[nstEvent].numMessages   += eventPerfInfo[dftEvent].numMessages;
1266:       totaltimes[nstEvent].messageLength += eventPerfInfo[dftEvent].messageLength;
1267:       totaltimes[nstEvent].numReductions += eventPerfInfo[dftEvent].numReductions;
1268:     }
1269:     totaltimes[nstEvent].name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1270:   }

1272:   /* Initialize: self-times := totaltimes */
1273:   for (nst=0; nst<=nstMax; nst++) { selftimes[nst] = totaltimes[nst]; }

1275:   /* Subtract timed subprocesses from self-times */
1276:   for (i=0; i<nNestedEvents; i++) {
1277:     const int           nParents          = nestedEvents[i].nParents;
1278:     const PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
1279:     const NestedEventId *dftParentsSorted = nestedEvents[i].dftParentsSorted;
1280:     for (j=0; j<nParents; j++) {
1281:       if (dftParentsSorted[j] != DFT_ID_AWAKE) {
1282:         const PetscLogEvent dftEvent  = dftEvents[j];
1283:         const NestedEventId nstParent = nstEvents[dftParentsSorted[j]];
1284:         selftimes[nstParent].time          -= eventPerfInfo[dftEvent].time;
1285:         selftimes[nstParent].flops         -= eventPerfInfo[dftEvent].flops;
1286:         selftimes[nstParent].numMessages   -= eventPerfInfo[dftEvent].numMessages;
1287:         selftimes[nstParent].messageLength -= eventPerfInfo[dftEvent].messageLength;
1288:         selftimes[nstParent].numReductions -= eventPerfInfo[dftEvent].numReductions;
1289:       }
1290:     }
1291:   }

1293:   PetscFree(nstEvents);
1294:   PetscFree(totaltimes);

1296:   /* Set outputs */
1297:   *p_self  = selftimes;
1298:   *p_nstMax = nstMax;
1299:   return(0);
1300: }

1302: static PetscErrorCode PetscPrintSelfTime(PetscViewer viewer, const PetscSelfTimer *selftimes, int nstMax, PetscLogDouble totalTime)
1303: {
1304:   PetscErrorCode     ierr;
1305:   int                i;
1306:   NestedEventId      nst;
1307:   PetscSortItem      *sortSelfTimes;
1308:   PetscLogDouble     *times, *maxTimes;
1309:   PetscStageLog      stageLog;
1310:   PetscEventRegInfo  *eventRegInfo;
1311:   const int          dum_depth = 1, dum_count=1, dum_parentcount=1;
1312:   PetscBool          wasPrinted;
1313:   MPI_Comm           comm;

1316:   PetscObjectGetComm((PetscObject)viewer,&comm);
1317:   PetscLogGetStageLog(&stageLog);
1318:   eventRegInfo  = stageLog->eventLog->eventInfo;

1320:   PetscMalloc1(nstMax+1,&times);
1321:   PetscMalloc1(nstMax+1,&maxTimes);
1322:   for (nst=0; nst<=nstMax; nst++) { times[nst] = selftimes[nst].time;}
1323:   MPIU_Allreduce(times, maxTimes, nstMax+1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1324:   PetscFree(times);

1326:   PetscMalloc1(nstMax+1,&sortSelfTimes);

1328:   /* Sort the self-timers on basis of the largest time needed */
1329:   for (nst=0; nst<=nstMax; nst++) {
1330:     sortSelfTimes[nst].id  = nst;
1331:     sortSelfTimes[nst].val = maxTimes[nst];
1332:   }
1333:   PetscFree(maxTimes);
1334:   qsort(sortSelfTimes, nstMax+1, sizeof(PetscSortItem), compareSortItems);

1336:   PetscViewerXMLStartSection(viewer, "selftimertable", "Self-timings");
1337:   PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");

1339:   for (i=0; i<=nstMax; i++) {
1340:     if ((sortSelfTimes[i].val/totalTime) >= THRESHOLD) {
1341:       NestedEventId      nstEvent = sortSelfTimes[i].id;
1342:       const char         *name    = eventRegInfo[(PetscLogEvent)nstEvent].name;
1343:       PetscEventPerfInfo selfPerfInfo;

1345:       selfPerfInfo.time          = selftimes[nstEvent].time ;
1346:       selfPerfInfo.flops         = selftimes[nstEvent].flops;
1347:       selfPerfInfo.numMessages   = selftimes[nstEvent].numMessages;
1348:       selfPerfInfo.messageLength = selftimes[nstEvent].messageLength;
1349:       selfPerfInfo.numReductions = selftimes[nstEvent].numReductions;

1351:       PetscLogNestedTreePrintLine(viewer, selfPerfInfo, dum_count, dum_parentcount, dum_depth, name, totalTime, &wasPrinted);
1352:       if (wasPrinted){
1353:         PetscViewerXMLEndSection(viewer, "event");
1354:       }
1355:     }
1356:   }
1357:   PetscViewerXMLEndSection(viewer, "selftimertable");
1358:   PetscFree(sortSelfTimes);
1359:   return(0);
1360: }

1362: PetscErrorCode PetscLogView_Nested(PetscViewer viewer)
1363: {
1364:   PetscErrorCode       ierr;
1365:   PetscLogDouble       locTotalTime, globTotalTime;
1366:   PetscNestedEventTree *tree = NULL;
1367:   PetscSelfTimer       *selftimers = NULL;
1368:   int                  nTimers = 0, nstMax = 0;
1369:   MPI_Comm             comm;

1372:   PetscObjectGetComm((PetscObject)viewer,&comm);
1373:   PetscViewerInitASCII_XML(viewer);
1374:   PetscViewerASCIIPrintf(viewer, "<!-- PETSc Performance Summary: -->\n");
1375:   PetscViewerXMLStartSection(viewer, "petscroot", NULL);

1377:   /* Get the total elapsed time, local and global maximum */
1378:   PetscTime(&locTotalTime);  locTotalTime -= petsc_BaseTime;
1379:   MPIU_Allreduce(&locTotalTime, &globTotalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);

1381:   /* Print global information about this run */
1382:   PetscPrintExeSpecs(viewer);
1383:   PetscPrintGlobalPerformance(viewer, locTotalTime);

1385:   /* Collect nested timer tree info from all processes */
1386:   PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1387:   PetscLogNestedTreePrintTop(viewer, tree, nTimers, globTotalTime);
1388:   PetscLogNestedTreeDestroy(tree, nTimers);

1390:   /* Calculate self-time for all (not-nested) events */
1391:   PetscCalcSelfTime(viewer, &selftimers, &nstMax);
1392:   PetscPrintSelfTime(viewer, selftimers, nstMax, globTotalTime);
1393:   PetscFree(selftimers);

1395:   PetscViewerXMLEndSection(viewer, "petscroot");
1396:   PetscViewerFinalASCII_XML(viewer);
1397:   return(0);
1398: }

1400: PETSC_EXTERN PetscErrorCode PetscASend(int count, int datatype)
1401: {
1402: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1404: #endif

1407:   petsc_send_ct++;
1408: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1409:   PetscMPITypeSize(count,MPI_Type_f2c((MPI_Fint) datatype),&petsc_send_len);
1410: #endif
1411:   return(0);
1412: }

1414: PETSC_EXTERN PetscErrorCode PetscARecv(int count, int datatype)
1415: {
1416: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1418: #endif

1421:   petsc_recv_ct++;
1422: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1423:   PetscMPITypeSize(count,MPI_Type_f2c((MPI_Fint) datatype),&petsc_recv_len);
1424: #endif
1425:   return(0);
1426: }

1428: PETSC_EXTERN PetscErrorCode PetscAReduce()
1429: {
1431:   petsc_allreduce_ct++;
1432:   return(0);
1433: }

1435: #endif