Actual source code: xmllogevent.c

petsc-3.9.4 2018-09-11
Report Typos and Errors
  1: /*************************************************************************************
  2:  *    M A R I T I M E  R E S E A R C H  I N S T I T U T E  N E T H E R L A N D S     *
  3:  *************************************************************************************
  4:  *    authors: Bas van 't Hof, Koos Huijssen, Christiaan M. Klaij                    *
  5:  *************************************************************************************
  6:  *    content: Support for nested PetscTimers                                        *
  7:  *************************************************************************************/
  8:  #include <petsclog.h>
  9:  #include <petsc/private/logimpl.h>
 10:  #include <petsctime.h>
 11:  #include <petscviewer.h>
 12: #include "../src/sys/logging/xmllogevent.h"
 13: #include "../src/sys/logging/xmlviewer.h"

 15: #if defined(PETSC_USE_LOG)

 17: /*
 18:  * Support for nested PetscTimers
 19:  *
 20:  * PetscTimers keep track of a lot of useful information: Wall clock times,
 21:  * message passing statistics, flop counts.  Information about the nested structure
 22:  * of the timers is lost. Example:
 23:  *
 24:  * 7:30   Start: awake
 25:  * 7:30      Start: morning routine
 26:  * 7:40         Start: eat
 27:  * 7:49         Done:  eat
 28:  * 7:43      Done:  morning routine
 29:  * 8:15      Start: work
 30:  * 12:15        Start: eat
 31:  * 12:45        Done:  eat
 32:  * 16:00     Done:  work
 33:  * 16:30     Start: evening routine
 34:  * 18:30        Start: eat
 35:  * 19:15        Done:  eat
 36:  * 22:00     Done:  evening routine
 37:  * 22:00  Done:  awake
 38:  *
 39:  * Petsc timers provide the following timer results:
 40:  *
 41:  *    awake:              1 call    14:30 hours
 42:  *    morning routine:    1 call     0:13 hours
 43:  *    eat:                3 calls    1:24 hours
 44:  *    work:               1 call     7:45 hours
 45:  *    evening routine     1 call     5:30 hours
 46:  *
 47:  * Nested timers can be used to get the following table:
 48:  *
 49:  *   [1 call]: awake                14:30 hours
 50:  *   [1 call]:    morning routine         0:13 hours         ( 2 % of awake)
 51:  *   [1 call]:       eat                       0:09 hours         (69 % of morning routine)
 52:  *                   rest (morning routine)    0:04 hours         (31 % of morning routine)
 53:  *   [1 call]:    work                    7:45 hours         (53 % of awake)
 54:  *   [1 call]:       eat                       0:30 hours         ( 6 % of work)
 55:  *                   rest (work)               7:15 hours         (94 % of work)
 56:  *   [1 call]:    evening routine         5:30 hours         (38 % of awake)
 57:  *   [1 call]:       eat                       0:45 hours         (14 % of evening routine)
 58:  *                   rest (evening routine)    4:45 hours         (86 % of morning routine)
 59:  *
 60:  * We ignore the concept of 'stages', because these seem to be conflicting notions, or at least,
 61:  * the nested timers make the stages unnecessary.
 62:  *
 63:  */

 65: /*
 66:  * Data structures for keeping track of nested timers:
 67:  *
 68:  *   nestedEvents: information about the timers that have actually been activated
 69:  *   dftParentActive: if a timer is started now, it is part of (nested inside) the dftParentActive
 70:  *
 71:  * The Default-timers are used to time the nested timers. Every nested timer corresponds to
 72:  * (one or more) default timers, where one of the default timers has the same event-id as the
 73:  * nested one.
 74:  *
 75:  * Because of the risk of confusion between nested timer ids and default timer ids, we
 76:  * introduce a typedef for nested events (NestedEventId) and use the existing type PetscLogEvent
 77:  * only for default events. Also, all nested event variables are prepended with 'nst', and
 78:  * default timers with 'dft'.
 79:  */

 81: #define DFT_ID_AWAKE -1

 83: typedef PetscLogEvent NestedEventId;
 84: typedef struct {
 85:   NestedEventId   nstEvent;         /* event-code for this nested event, argument 'event' in PetscLogEventStartNested */
 86:   int             nParents;         /* number of 'dftParents': the default timer which was the dftParentActive when this nested timer was activated */
 87:   PetscLogEvent  *dftParentsSorted; /* The default timers which were the dftParentActive when this nested event was started */
 88:   PetscLogEvent  *dftEvents;        /* The default timers which represent the different 'instances' of this nested event */

 90:   PetscLogEvent  *dftParents;       /* The default timers which were the dftParentActive when this nested event was started */
 91:   PetscLogEvent  *dftEventsSorted;  /* The default timers which represent the different 'instances' of this nested event */
 92: } PetscNestedEvent;

 94: static PetscLogEvent    dftParentActive        = DFT_ID_AWAKE;
 95: static int              nNestedEvents          = 0;
 96: static int              nNestedEventsAllocated = 0;
 97: static PetscNestedEvent *nestedEvents          = NULL;
 98: static PetscLogDouble   thresholdTime          = 0.01; /* initial value was 0.1 */

100: #define THRESHOLD (thresholdTime/100.0+1e-12)

102: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
103: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
104: PETSC_INTERN PetscErrorCode PetscLogView_Nested(PetscViewer);


107: /*@C
108:   PetscLogNestedBegin - Turns on nested logging of objects and events. This logs flop
109:   rates and object creation and should not slow programs down too much.

111:   Logically Collective over PETSC_COMM_WORLD

113:   Options Database Keys:
114: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file

116:   Usage:
117: .vb
118:       PetscInitialize(...);
119:       PetscLogNestedBegin();
120:        ... code ...
121:       PetscLogView(viewer);
122:       PetscFinalize();
123: .ve

125:   Level: advanced

127: .keywords: log, begin
128: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin()
129: @*/
130: PetscErrorCode PetscLogNestedBegin(void)
131: {
134:   if (nestedEvents) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"nestedEvents already allocated");

136:   nNestedEventsAllocated = 10;
137:   PetscMalloc1(nNestedEventsAllocated,&nestedEvents);


140:   dftParentActive = DFT_ID_AWAKE;
141:   nNestedEvents =1;

143:   /* 'Awake' is nested event 0. It has no parents */
144:   nestedEvents[0].nstEvent          = 0;
145:   nestedEvents[0].nParents          = 0;
146:   nestedEvents[0].dftParentsSorted  = NULL;
147:   nestedEvents[0].dftEvents         = NULL;
148:   nestedEvents[0].dftParents        = NULL;
149:   nestedEvents[0].dftEventsSorted   = NULL;

151:   PetscLogSet(PetscLogEventBeginNested,PetscLogEventEndNested);
152:   return(0);
153: }

155: /* Delete the data structures for the nested timers */
156: PetscErrorCode PetscLogNestedEnd(void)
157: {
159:   int            i;
161:   if (!nestedEvents) return(0);
162:   for (i=0; i<nNestedEvents; i++) {
163:     PetscFree4(nestedEvents[i].dftParentsSorted,nestedEvents[i].dftEventsSorted,nestedEvents[i].dftParents,nestedEvents[i].dftEvents);
164:   }
165:   PetscFree(nestedEvents);
166:   nestedEvents           = NULL;
167:   nNestedEvents          = 0;
168:   nNestedEventsAllocated = 0;
169:   return(0);
170: }


173: /*
174:  * UTILITIES: FIND STUFF IN SORTED ARRAYS
175:  *
176:  * Utility: find a default timer in a sorted array */
177: static PetscErrorCode PetscLogEventFindDefaultTimer(PetscLogEvent dftIndex, /* index to be found */
178:                                                     const PetscLogEvent *dftArray,  /* sorted array of PetscLogEvent-ids */
179:                                                     int narray,  /* dimension of dftArray */
180:                                                     int *entry)  /* entry in the array where dftIndex may be found;
181:                                                                   * if dftArray[entry] != dftIndex, then dftIndex is not part of dftArray
182:                                                                   * In that case, the dftIndex can be inserted at this entry. */
183: {
185:   if (narray==0 || dftIndex <= dftArray[0]) {
186:     *entry = 0;
187:   } else if (dftIndex > dftArray[narray-1]) {
188:     *entry = narray;
189:   } else {
190:     int ihigh=narray-1,  ilow=0;
191:     while (ihigh>ilow) {
192:       const int imiddle = (ihigh+ilow)/2;
193:       if (dftArray[imiddle] > dftIndex) {
194:         ihigh=imiddle;
195:       } else if (dftArray[imiddle]<dftIndex) {
196:         ilow =imiddle+1;
197:       } else {
198:         ihigh=imiddle;
199:         ilow =imiddle;
200:       }
201:     }
202:     *entry = ihigh;
203:   }
204:   return(0);
205: }

207: /* Utility: find the nested event with given identification */
208: static PetscErrorCode PetscLogEventFindNestedTimer(NestedEventId nstEvent, /* Nested event to be found */
209:                                                    int *entry)  /* entry in the nestedEvents where nstEvent may be found;
210:                                                                  * if nestedEvents[entry].nstEvent != nstEvent, then index is not part of iarray */
211: {

214:   if (nNestedEvents==0 || nstEvent <= nestedEvents[0].nstEvent) {
215:     *entry = 0;
216:   } else if (nstEvent > nestedEvents[nNestedEvents-1].nstEvent) {
217:     *entry = nNestedEvents;
218:   } else {
219:     int ihigh=nNestedEvents-1,  ilow=0;
220:     while (ihigh>ilow) {
221:       const int imiddle = (ihigh+ilow)/2;
222:       if (nestedEvents[imiddle].nstEvent > nstEvent) {
223:         ihigh=imiddle;
224:       } else if (nestedEvents[imiddle].nstEvent<nstEvent) {
225:         ilow =imiddle+1;
226:       } else {
227:         ihigh=imiddle;
228:         ilow =imiddle;
229:       }
230:     }
231:     *entry = ihigh;
232:   }
233:   return(0);
234: }

236: /*
237:  Nested logging is not prepared yet to support user-defined logging stages, so for now we force logging on the main stage.
238:  Using PetscLogStage{Push/Pop}() would be more appropriate, but these two calls do extra bookkeeping work we don't need.
239: */

241: #define MAINSTAGE 0

243: static PetscLogStage savedStage = 0;

245: PETSC_STATIC_INLINE PetscErrorCode PetscLogStageOverride(void)
246: {
247:   PetscStageLog  stageLog = petsc_stageLog;

251:   if (stageLog->curStage == MAINSTAGE) return(0);
252:   savedStage = stageLog->curStage;
253:   stageLog->curStage = MAINSTAGE;
254:   PetscIntStackPush(stageLog->stack, MAINSTAGE);
255:   return(0);
256: }

258: PETSC_STATIC_INLINE PetscErrorCode PetscLogStageRestore(void)
259: {
260:   PetscStageLog  stageLog = petsc_stageLog;

264:   if (savedStage == MAINSTAGE) return(0);
265:   stageLog->curStage = savedStage;
266:   PetscIntStackPop(stageLog->stack, &savedStage);
267:   return(0);
268: }

270: /******************************************************************************************/
271: /* Start a nested event */
272: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
273: {
274:   PetscErrorCode  ierr;
275:   int             entry, pentry, tentry,i;
276:   PetscLogEvent   dftEvent;

279:   PetscLogEventFindNestedTimer(nstEvent, &entry);
280:   if (entry>=nNestedEvents || nestedEvents[entry].nstEvent != nstEvent) {
281:     /* Nested event doesn't exist yet: create it */

283:     if (nNestedEvents==nNestedEventsAllocated) {
284:       /* Enlarge and re-allocate nestedEvents if needed */
285:       PetscNestedEvent *tmp = nestedEvents;
286:       PetscMalloc1(2*nNestedEvents,&nestedEvents);
287:       nNestedEventsAllocated*=2;
288:       PetscMemcpy(nestedEvents, tmp, nNestedEvents*sizeof(PetscNestedEvent));
289:       PetscFree(tmp);
290:     }

292:     /* Clear space in nestedEvents for new nested event */
293:     nNestedEvents++;
294:     for (i = nNestedEvents-1; i>entry; i--) {
295:       nestedEvents[i] = nestedEvents[i-1];
296:     }

298:     /* Create event in nestedEvents */
299:     nestedEvents[entry].nstEvent = nstEvent;
300:     nestedEvents[entry].nParents=1;
301:     PetscMalloc4(1,&nestedEvents[entry].dftParentsSorted,1,&nestedEvents[entry].dftEventsSorted,1,&nestedEvents[entry].dftParents,1,&nestedEvents[entry].dftEvents);

303:     /* Fill in new event */
304:     pentry = 0;
305:     dftEvent = (PetscLogEvent) nstEvent;

307:     nestedEvents[entry].nstEvent                 = nstEvent;
308:     nestedEvents[entry].dftParents[pentry]       = dftParentActive;
309:     nestedEvents[entry].dftEvents[pentry]        = dftEvent;
310:     nestedEvents[entry].dftParentsSorted[pentry] = dftParentActive;
311:     nestedEvents[entry].dftEventsSorted[pentry]  = dftEvent;

313:   } else {
314:     /* Nested event exists: find current dftParentActive among parents */
315:     PetscLogEvent *dftParentsSorted = nestedEvents[entry].dftParentsSorted;
316:     PetscLogEvent *dftEvents        = nestedEvents[entry].dftEvents;
317:     int           nParents          = nestedEvents[entry].nParents;

319:     PetscLogEventFindDefaultTimer( dftParentActive, dftParentsSorted, nParents, &pentry);

321:     if (pentry>=nParents || dftParentActive != dftParentsSorted[pentry]) {
322:       /* dftParentActive not in the list: add it to the list */
323:       int           i;
324:       PetscLogEvent *dftParents      = nestedEvents[entry].dftParents;
325:       PetscLogEvent *dftEventsSorted = nestedEvents[entry].dftEventsSorted;
326:       char          name[100];

328:       /* Register a new default timer */
329:       sprintf(name, "%d -> %d", (int) dftParentActive, (int) nstEvent);
330:       PetscLogEventRegister(name, 0, &dftEvent);
331:       PetscLogEventFindDefaultTimer( dftEvent, dftEventsSorted, nParents, &tentry);

333:       /* Reallocate parents and dftEvents to make space for new parent */
334:       PetscMalloc4(1+nParents,&nestedEvents[entry].dftParentsSorted,1+nParents,&nestedEvents[entry].dftEventsSorted,1+nParents,&nestedEvents[entry].dftParents,1+nParents,&nestedEvents[entry].dftEvents);
335:       PetscMemcpy(nestedEvents[entry].dftParentsSorted, dftParentsSorted, nParents*sizeof(PetscLogEvent));
336:       PetscMemcpy(nestedEvents[entry].dftEventsSorted,  dftEventsSorted,  nParents*sizeof(PetscLogEvent));
337:       PetscMemcpy(nestedEvents[entry].dftParents,       dftParents,       nParents*sizeof(PetscLogEvent));
338:       PetscMemcpy(nestedEvents[entry].dftEvents,        dftEvents,        nParents*sizeof(PetscLogEvent));
339:       PetscFree4(dftParentsSorted,dftEventsSorted,dftParents,dftEvents);

341:       dftParents       = nestedEvents[entry].dftParents;
342:       dftEvents        = nestedEvents[entry].dftEvents;
343:       dftParentsSorted = nestedEvents[entry].dftParentsSorted;
344:       dftEventsSorted  = nestedEvents[entry].dftEventsSorted;

346:       nestedEvents[entry].nParents++;
347:       nParents++;

349:       for (i = nParents-1; i>pentry; i--) {
350:         dftParentsSorted[i] = dftParentsSorted[i-1];
351:         dftEvents[i]        = dftEvents[i-1];
352:       }
353:       for (i = nParents-1; i>tentry; i--) {
354:         dftParents[i]      = dftParents[i-1];
355:         dftEventsSorted[i] = dftEventsSorted[i-1];
356:       }

358:       /* Fill in the new default timer */
359:       dftParentsSorted[pentry] = dftParentActive;
360:       dftEvents[pentry]        = dftEvent;
361:       dftParents[tentry]       = dftParentActive;
362:       dftEventsSorted[tentry]  = dftEvent;

364:     } else {
365:       /* dftParentActive was found: find the corresponding default 'dftEvent'-timer */
366:       dftEvent = nestedEvents[entry].dftEvents[pentry];
367:     }
368:   }

370:   /* Start the default 'dftEvent'-timer and update the dftParentActive */
371:   PetscLogStageOverride();
372:   PetscLogEventBeginDefault(dftEvent,t,o1,o2,o3,o4);
373:   PetscLogStageRestore();
374:   dftParentActive = dftEvent;
375:   return(0);
376: }

378: /* End a nested event */
379: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
380: {
381:   PetscErrorCode  ierr;
382:   int             entry, pentry, nParents;
383:   PetscLogEvent  *dftEventsSorted;

386:   /* Find the nested event */
387:   PetscLogEventFindNestedTimer(nstEvent, &entry);
388:   if (entry>=nNestedEvents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d larger than number of events %d",entry,nNestedEvents);
389:   if (nestedEvents[entry].nstEvent != nstEvent) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d had unbalanced begin/end pairs does not match %d",entry,nstEvent);
390:   dftEventsSorted = nestedEvents[entry].dftEventsSorted;
391:   nParents        = nestedEvents[entry].nParents;

393:   /* Find the current default timer among the 'dftEvents' of this event */
394:   PetscLogEventFindDefaultTimer( dftParentActive, dftEventsSorted, nParents, &pentry);

396:   if (pentry>=nParents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Entry %d is larger than number of parents %d",pentry,nParents);
397:   if (dftEventsSorted[pentry] != dftParentActive) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Active parent is %d, but we seem to be closing %d",dftParentActive,dftEventsSorted[pentry]);

399:   /* Stop the default timer and update the dftParentActive */
400:   PetscLogStageOverride();
401:   PetscLogEventEndDefault(dftParentActive,t,o1,o2,o3,o4);
402:   PetscLogStageRestore();
403:   dftParentActive = nestedEvents[entry].dftParents[pentry];
404:   return(0);
405: }

407: /*@
408:    PetscLogSetThreshold - Set the threshold time for logging the events; this is a percentage out of 100, so 1. means any event
409:           that takes 1 or more percent of the time.

411:   Logically Collective over PETSC_COMM_WORLD

413:   Input Parameter:
414: .   newThresh - the threshold to use

416:   Output Parameter:
417: .   oldThresh - the previously set threshold value

419:   Options Database Keys:
420: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file

422:   Usage:
423: .vb
424:       PetscInitialize(...);
425:       PetscLogNestedBegin();
426:       PetscLogSetThreshold(0.1,&oldthresh);
427:        ... code ...
428:       PetscLogView(viewer);
429:       PetscFinalize();
430: .ve

432:   Level: advanced

434: .keywords: log, begin
435: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin(),
436:           PetscLogNestedBegin()
437: @*/
438: PetscErrorCode PetscLogSetThreshold(PetscLogDouble newThresh, PetscLogDouble *oldThresh)
439: {
441:   if (oldThresh) *oldThresh = thresholdTime;
442:   if (newThresh == PETSC_DECIDE)  newThresh = 0.01;
443:   if (newThresh == PETSC_DEFAULT) newThresh = 0.01;
444:   thresholdTime = PetscMax(newThresh, 0.0);
445:   return(0);
446: }

448: static PetscErrorCode PetscPrintExeSpecs(PetscViewer viewer)
449: {
450:   PetscErrorCode     ierr;
451:   char               arch[128],hostname[128],username[128],pname[PETSC_MAX_PATH_LEN],date[128];
452:   char               version[256], buildoptions[128] = "";
453:   PetscMPIInt        size;
454:   size_t             len;

457:   MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
458:   PetscGetArchType(arch,sizeof(arch));
459:   PetscGetHostName(hostname,sizeof(hostname));
460:   PetscGetUserName(username,sizeof(username));
461:   PetscGetProgramName(pname,sizeof(pname));
462:   PetscGetDate(date,sizeof(date));
463:   PetscGetVersion(version,sizeof(version));

465:   PetscViewerXMLStartSection(viewer, "runspecification", "Run Specification");
466:   PetscViewerXMLPutString(   viewer, "executable"  , "Executable"   , pname );
467:   PetscViewerXMLPutString(   viewer, "architecture", "Architecture" , arch );
468:   PetscViewerXMLPutString(   viewer, "hostname"    , "Host"         , hostname);
469:   PetscViewerXMLPutInt(      viewer, "nprocesses"  , "Number of processes", size );
470:   PetscViewerXMLPutString(   viewer, "user"        , "Run by user"  , username);
471:   PetscViewerXMLPutString(   viewer, "date"        , "Started at"   , date);
472:   PetscViewerXMLPutString(   viewer, "petscrelease", "Petsc Release", version);

474: #if defined(PETSC_USE_DEBUG)
475:   PetscStrlcat(buildoptions, "Debug ", sizeof(buildoptions));
476: #endif
477: #if defined(PETSC_USE_COMPLEX)
478:   PetscStrlcat(buildoptions, "Complex ", sizeof(buildoptions));
479: #endif
480: #if defined(PETSC_USE_REAL_SINGLE)
481:   PetscStrlcat(buildoptions, "Single ", sizeof(buildoptions));
482: #elif defined(PETSC_USE_REAL___FLOAT128)
483:   PetscStrlcat(buildoptions, "Quadruple ", sizeof(buildoptions));
484: #elif defined(PETSC_USE_REAL___FP16)
485:   PetscStrlcat(buildoptions, "Half ", sizeof(buildoptions));
486: #endif
487: #if defined(PETSC_USE_64BIT_INDICES)
488:   PetscStrlcat(buildoptions, "Int64 ", sizeof(buildoptions));
489: #endif
490: #if defined(__cplusplus)
491:   PetscStrlcat(buildoptions, "C++ ", sizeof(buildoptions));
492: #endif
493:   PetscStrlen(buildoptions,&len);
494:   if (len) {
495:     PetscViewerXMLPutString(viewer, "petscbuildoptions", "Petsc build options", buildoptions);
496:   }
497:   PetscViewerXMLEndSection(viewer, "runspecification");
498:   return(0);
499: }

501: /* Print the global performance: max, max/min, average and total of
502:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
503:  */
504: static PetscErrorCode PetscPrintXMLGlobalPerformanceElement(PetscViewer viewer, const char *name, const char *desc, PetscLogDouble max, PetscLogDouble ratio, PetscLogDouble avg, PetscLogDouble tot)
505: {

509:   PetscViewerXMLStartSection(viewer, name, desc);
510:   PetscViewerXMLPutDouble(viewer, "max", NULL, max, "%e");
511:   PetscViewerXMLPutDouble(viewer, "ratio", NULL, ratio, "%f");
512:   if (avg>-1.0) {
513:     PetscViewerXMLPutDouble(viewer, "average", NULL, avg, "%e");
514:   }
515:   if (tot>-1.0) {
516:     PetscViewerXMLPutDouble(viewer, "total", NULL, tot, "%e");
517:   }
518:   PetscViewerXMLEndSection(viewer, name);
519:   return(0);
520: }

522: /* Print the global performance: max, max/min, average and total of
523:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
524:  */
525: static PetscErrorCode PetscPrintGlobalPerformance(PetscViewer viewer, PetscLogDouble locTotalTime)
526: {
527:   PetscErrorCode     ierr;
528:   PetscLogDouble     min, max, tot, ratio, avg;
529:   PetscLogDouble     flops, mem, red, mess;
530:   PetscMPIInt        size;
531:   MPI_Comm           comm;

534:   PetscObjectGetComm((PetscObject)viewer,&comm);
535:   MPI_Comm_size(comm, &size);

537:   /* Must preserve reduction count before we go on */
538:   red = petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;

540:   /* Calculate summary information */
541:   PetscViewerXMLStartSection(viewer, "globalperformance", "Global performance");

543:   /*   Time */
544:   MPIU_Allreduce(&locTotalTime, &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
545:   MPIU_Allreduce(&locTotalTime, &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
546:   MPIU_Allreduce(&locTotalTime, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
547:   avg  = (tot)/((PetscLogDouble) size);
548:   if (min != 0.0) ratio = max/min;
549:   else ratio = 0.0;
550:   PetscPrintXMLGlobalPerformanceElement(viewer, "time", "Time (sec)", max, ratio, avg, -1.0);

552:   /*   Objects */
553:   avg  = (PetscLogDouble) petsc_numObjects;
554:   MPIU_Allreduce(&avg,          &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
555:   MPIU_Allreduce(&avg,          &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
556:   MPIU_Allreduce(&avg,          &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
557:   avg  = (tot)/((PetscLogDouble) size);
558:   if (min != 0.0) ratio = max/min;
559:   else ratio = 0.0;
560:   PetscPrintXMLGlobalPerformanceElement(viewer, "objects", "Objects", max, ratio, avg, -1.0);

562:   /*   Flop */
563:   MPIU_Allreduce(&petsc_TotalFlops,  &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
564:   MPIU_Allreduce(&petsc_TotalFlops,  &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
565:   MPIU_Allreduce(&petsc_TotalFlops,  &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
566:   avg  = (tot)/((PetscLogDouble) size);
567:   if (min != 0.0) ratio = max/min;
568:   else ratio = 0.0;
569:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflop", "MFlop", max/1.0E6, ratio, avg/1.0E6, tot/1.0E6);

571:   /*   Flop/sec -- Must talk to Barry here */
572:   if (locTotalTime != 0.0) flops = petsc_TotalFlops/locTotalTime;
573:   else flops = 0.0;
574:   MPIU_Allreduce(&flops,        &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
575:   MPIU_Allreduce(&flops,        &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
576:   MPIU_Allreduce(&flops,        &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
577:   avg  = (tot)/((PetscLogDouble) size);
578:   if (min != 0.0) ratio = max/min;
579:   else ratio = 0.0;
580:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflops", "MFlop/sec", max/1.0E6, ratio, avg/1.0E6, tot/1.0E6);

582:   /*   Memory */
583:   PetscMallocGetMaximumUsage(&mem);
584:   if (mem > 0.0) {
585:     MPIU_Allreduce(&mem,          &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
586:     MPIU_Allreduce(&mem,          &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
587:     MPIU_Allreduce(&mem,          &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
588:     avg  = (tot)/((PetscLogDouble) size);
589:     if (min != 0.0) ratio = max/min;
590:     else ratio = 0.0;
591:     PetscPrintXMLGlobalPerformanceElement(viewer, "memory", "Memory (MiB)", max/1024.0/1024.0, ratio, avg/1024.0/1024.0, tot/1024.0/1024.0);
592:   }
593:   /*   Messages */
594:   mess = 0.5*(petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct);
595:   MPIU_Allreduce(&mess,         &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
596:   MPIU_Allreduce(&mess,         &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
597:   MPIU_Allreduce(&mess,         &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
598:   avg  = (tot)/((PetscLogDouble) size);
599:   if (min != 0.0) ratio = max/min;
600:   else ratio = 0.0;
601:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagetransfers", "MPI Message Transfers", max, ratio, avg, tot);

603:   /*   Message Volume */
604:   mess = 0.5*(petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len);
605:   MPIU_Allreduce(&mess,         &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
606:   MPIU_Allreduce(&mess,         &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
607:   MPIU_Allreduce(&mess,         &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
608:   avg = (tot)/((PetscLogDouble) size);
609:   if (min != 0.0) ratio = max/min;
610:   else ratio = 0.0;
611:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagevolume", "MPI Message Volume (MiB)", max/1024.0/1024.0, ratio, avg/1024.0/1024.0, tot/1024.0/1024.0);

613:   /*   Reductions */
614:   MPIU_Allreduce(&red,          &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
615:   MPIU_Allreduce(&red,          &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
616:   MPIU_Allreduce(&red,          &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
617:   if (min != 0.0) ratio = max/min;
618:   else ratio = 0.0;
619:   PetscPrintXMLGlobalPerformanceElement(viewer, "reductions", "MPI Reductions", max, ratio, -1, -1);
620:   PetscViewerXMLEndSection(viewer, "globalperformance");
621:   return(0);
622: }

624: typedef struct {
625:   PetscLogEvent  dftEvent;
626:   NestedEventId  nstEvent;
627:   PetscLogEvent  dftParent;
628:   NestedEventId  nstParent;
629:   PetscBool      own;
630:   int            depth;
631:   NestedEventId* nstPath;
632: } PetscNestedEventTree;

634: /* Compare timers to sort them in the tree */
635: static int compareTreeItems(const void *item1_, const void *item2_)
636: {
637:   int                  i;
638:   PetscNestedEventTree *item1 = (PetscNestedEventTree *) item1_;
639:   PetscNestedEventTree *item2 = (PetscNestedEventTree *) item2_;
640:   for (i=0; i<PetscMin(item1->depth,item2->depth); i++) {
641:     if (item1->nstPath[i]<item2->nstPath[i]) return -1;
642:     if (item1->nstPath[i]>item2->nstPath[i]) return +1;
643:   }
644:   if (item1->depth < item2->depth) return -1;
645:   if (item1->depth > item2->depth) return 1;
646:   return 0;
647: }
648: /*
649:  * Do MPI communication to get the complete, nested calling tree for all processes: there may be
650:  * calls that happen in some processes, but not in others.
651:  *
652:  * The output, tree[nTimers] is an array of PetscNestedEventTree-structs.
653:  * The tree is sorted so that the timers can be printed in the order of appearance.
654:  *
655:  * For tree-items which appear in the trees of multiple processes (which will be most items), the
656:  * following rule is followed:
657:  * + if information from my own process is available, then that is the information stored in tree.
658:  *   otherwise it is some other process's information.
659:  */
660: static PetscErrorCode PetscLogNestedTreeCreate(PetscViewer viewer, PetscNestedEventTree **p_tree, int *p_nTimers)
661: {
662:   PetscNestedEventTree *tree = NULL, *newTree;
663:   int                  *treeIndices;
664:   int                  nTimers, totalNTimers, i, j, iTimer0, maxDefaultTimer;
665:   int                  yesno;
666:   PetscBool            done;
667:   PetscErrorCode       ierr;
668:   int                  maxdepth;
669:   int                  depth;
670:   int                  illegalEvent;
671:   int                  iextra;
672:   NestedEventId        *nstPath, *nstMyPath;
673:   MPI_Comm             comm;

676:   PetscObjectGetComm((PetscObject)viewer,&comm);

678:   /* Calculate memory needed to store everybody's information and allocate tree */
679:   nTimers = 0;
680:   for (i=0; i<nNestedEvents; i++) nTimers += nestedEvents[i].nParents;

682:   PetscMalloc1(nTimers,&tree);

684:   /* Fill tree with readily available information */
685:   iTimer0 = 0;
686:   maxDefaultTimer =0;
687:   for (i=0; i<nNestedEvents; i++) {
688:     int           nParents          = nestedEvents[i].nParents;
689:     NestedEventId nstEvent          = nestedEvents[i].nstEvent;
690:     PetscLogEvent *dftParentsSorted = nestedEvents[i].dftParentsSorted;
691:     PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
692:     for (j=0; j<nParents; j++) {
693:       maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);

695:       tree[iTimer0+j].dftEvent   = dftEvents[j];
696:       tree[iTimer0+j].nstEvent   = nstEvent;
697:       tree[iTimer0+j].dftParent  = dftParentsSorted[j];
698:       tree[iTimer0+j].own        = PETSC_TRUE;

700:       tree[iTimer0+j].nstParent  = 0;
701:       tree[iTimer0+j].depth      = 0;
702:       tree[iTimer0+j].nstPath    = NULL;
703:     }
704:     iTimer0 += nParents;
705:   }

707:   /* Calculate the global maximum for the default timer index, so array treeIndices can
708:    * be allocated only once */
709:   MPIU_Allreduce(&maxDefaultTimer, &j, 1, MPI_INT, MPI_MAX, comm);
710:   maxDefaultTimer = j;

712:   /* Find default timer's place in the tree */
713:   PetscCalloc1(maxDefaultTimer+1,&treeIndices);
714:   treeIndices[0] = 0;
715:   for (i=0; i<nTimers; i++) {
716:     PetscLogEvent dftEvent = tree[i].dftEvent;
717:     treeIndices[dftEvent] = i;
718:   }

720:   /* Find each dftParent's nested identification */
721:   for (i=0; i<nTimers; i++) {
722:     PetscLogEvent dftParent = tree[i].dftParent;
723:     if (dftParent!= DFT_ID_AWAKE) {
724:       int j = treeIndices[dftParent];
725:       tree[i].nstParent = tree[j].nstEvent;
726:     }
727:   }

729:   /* Find depths for each timer path */
730:   done = PETSC_FALSE;
731:   maxdepth = 0;
732:   while (!done) {
733:     done = PETSC_TRUE;
734:     for (i=0; i<nTimers; i++) {
735:       if (tree[i].dftParent == DFT_ID_AWAKE) {
736:         tree[i].depth = 1;
737:         maxdepth = PetscMax(1,maxdepth);
738:       } else {
739:         int j = treeIndices[tree[i].dftParent];
740:         depth = 1+tree[j].depth;
741:         if (depth>tree[i].depth) {
742:           done          = PETSC_FALSE;
743:           tree[i].depth = depth;
744:           maxdepth      = PetscMax(depth,maxdepth);
745:         }
746:       }
747:     }
748:   }

750:   /* Allocate the paths in the entire tree */
751:   for (i=0; i<nTimers; i++) {
752:     depth = tree[i].depth;
753:     PetscCalloc1(depth,&tree[i].nstPath);
754:   }

756:   /* Calculate the paths for all timers */
757:   for (depth=1; depth<=maxdepth; depth++) {
758:     for (i=0; i<nTimers; i++) {
759:       if (tree[i].depth==depth) {
760:         if (depth>1) {
761:           int    j = treeIndices[tree[i].dftParent];
762:           PetscMemcpy(tree[i].nstPath,tree[j].nstPath,(depth-1)*sizeof(NestedEventId));
763:         }
764:         tree[i].nstPath[depth-1] = tree[i].nstEvent;
765:       }
766:     }
767:   }
768:   PetscFree(treeIndices);

770:   /* Sort the tree on basis of the paths */
771:   qsort(tree, nTimers, sizeof(PetscNestedEventTree), compareTreeItems);

773:   /* Allocate an array to store paths */
774:   depth = maxdepth;
775:   MPIU_Allreduce(&depth, &maxdepth, 1, MPI_INT, MPI_MAX, comm);
776:   PetscMalloc1(maxdepth+1, &nstPath);
777:   PetscMalloc1(maxdepth+1, &nstMyPath);

779:   /* Find an illegal nested event index (1+largest nested event index) */
780:   illegalEvent = 1+nestedEvents[nNestedEvents-1].nstEvent;
781:   i = illegalEvent;
782:   MPIU_Allreduce(&i, &illegalEvent, 1, MPI_INT, MPI_MAX, comm);

784:   /* First, detect timers which are not available in this process, but are available in others
785:    *        Allocate a new tree, that can contain all timers
786:    * Then,  fill the new tree with all (own and not-own) timers */
787:   newTree= NULL;
788:   for (yesno=0; yesno<=1; yesno++) {
789:     depth  = 1;
790:     i      = 0;
791:     iextra = 0;
792:     while (depth>0) {
793:       int       j;
794:       PetscBool same;

796:       /* Construct the next path in this process's tree:
797:        * if necessary, supplement with invalid path entries */
798:       depth++;
799:       if (depth > maxdepth + 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Depth %d > maxdepth+1 %d",depth,maxdepth+1);
800:       if (i<nTimers) {
801:         for (j=0;             j<tree[i].depth; j++) nstMyPath[j] = tree[i].nstPath[j];
802:         for (j=tree[i].depth; j<depth;         j++) nstMyPath[j] = illegalEvent;
803:       } else {
804:         for (j=0;             j<depth;         j++) nstMyPath[j] = illegalEvent;
805:       }

807:       /* Communicate with other processes to obtain the next path and its depth */
808:       MPIU_Allreduce(nstMyPath, nstPath, depth, MPI_INT, MPI_MIN, comm);
809:       for (j=depth-1; (int) j>=0; j--) {
810:         if (nstPath[j]==illegalEvent) depth=j;
811:       }

813:       if (depth>0) {
814:         /* If the path exists */

816:         /* check whether the next path is the same as this process's next path */
817:         same = PETSC_TRUE;
818:         for (j=0; same && j<depth; j++) { same = (same &&  nstMyPath[j] == nstPath[j]) ? PETSC_TRUE : PETSC_FALSE;}

820:         if (same) {
821:           /* Register 'own path' */
822:           if (newTree) newTree[i+iextra] = tree[i];
823:           i++;
824:         } else {
825:           /* Register 'not an own path' */
826:           if (newTree) {
827:             newTree[i+iextra].nstEvent   = nstPath[depth-1];
828:             newTree[i+iextra].own        = PETSC_FALSE;
829:             newTree[i+iextra].depth      = depth;
830:             PetscMalloc1(depth, &newTree[i+iextra].nstPath);
831:             for (j=0; j<depth; j++) {newTree[i+iextra].nstPath[j] = nstPath[j];}

833:             newTree[i+iextra].dftEvent  = 0;
834:             newTree[i+iextra].dftParent = 0;
835:             newTree[i+iextra].nstParent = 0;
836:           }
837:           iextra++;
838:         }

840:       }
841:     }

843:     /* Determine the size of the complete tree (with own and not-own timers) and allocate the new tree */
844:     totalNTimers = nTimers + iextra;
845:     if (!newTree) {
846:       PetscMalloc1(totalNTimers, &newTree);
847:     }
848:   }
849:   PetscFree(nstPath);
850:   PetscFree(nstMyPath);
851:   PetscFree(tree);
852:   tree = newTree;
853:   newTree = NULL;

855:   /* Set return value and return */
856:   *p_tree    = tree;
857:   *p_nTimers = totalNTimers;
858:   return(0);
859: }

861: /*
862:  * Delete the nested timer tree
863:  */
864: static PetscErrorCode PetscLogNestedTreeDestroy(PetscNestedEventTree *tree, int nTimers)
865: {
866:   int             i;
867:   PetscErrorCode  ierr;

870:   for (i=0; i<nTimers; i++) {
871:     PetscFree(tree[i].nstPath);
872:   }
873:   PetscFree(tree);
874:   return(0);
875: }

877: /* Print the global performance: max, max/min, average and total of
878:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
879:  */
880: static PetscErrorCode PetscPrintXMLNestedLinePerfResults(PetscViewer viewer, const char *name, PetscLogDouble minvalue, PetscLogDouble maxvalue, PetscLogDouble minmaxtreshold)
881: {

885:   PetscViewerXMLStartSection(viewer, name, NULL);
886:   if (maxvalue>minvalue*minmaxtreshold) {
887:     PetscViewerXMLPutDouble(viewer, "minvalue", NULL, minvalue, "%f");
888:     PetscViewerXMLPutDouble(viewer, "maxvalue", NULL, maxvalue, "%f");
889:   } else {
890:     PetscViewerXMLPutDouble(viewer, "value", NULL, (minvalue+maxvalue)/2.0, "%g");
891:   };
892:   PetscViewerXMLEndSection(viewer, name);
893:   return(0);
894: }

896: #define N_COMM 8
897: static PetscErrorCode PetscLogNestedTreePrintLine(PetscViewer viewer,PetscEventPerfInfo perfInfo,PetscLogDouble countsPerCall,int parentCount,int depth,const char *name,PetscLogDouble totalTime,PetscBool *isPrinted)
898: {
899:   PetscLogDouble time = perfInfo.time;
900:   PetscLogDouble timeMx,          timeMn;
901:   PetscLogDouble countsPerCallMx, countsPerCallMn;
902:   PetscLogDouble reductSpeedMx,   reductSpeedMn;
903:   PetscLogDouble flopSpeedMx,     flopSpeedMn;
904:   PetscLogDouble msgSpeedMx,      msgSpeedMn;
905:   PetscLogDouble commarr_in[N_COMM], commarr_out[N_COMM];
907:   MPI_Comm       comm;

910:   PetscObjectGetComm((PetscObject)viewer,&comm);

912:   commarr_in[0] =  time;
913:   commarr_in[1] = -time;
914:   MPIU_Allreduce(commarr_in, commarr_out,    2, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
915:   timeMx =  commarr_out[0];
916:   timeMn = -commarr_out[1];

918:   commarr_in[0] = time>=timeMx*0.001 ?  perfInfo.flops/time         : 0;
919:   commarr_in[1] = time>=timeMx*0.001 ?  perfInfo.numReductions/time : 0;
920:   commarr_in[2] = time>=timeMx*0.001 ?  perfInfo.messageLength/time : 0;
921:   commarr_in[3] = parentCount>0      ?  countsPerCall               : 0;

923:   commarr_in[4] = time>=timeMx*0.001 ? -perfInfo.flops/time         : -1e30;
924:   commarr_in[5] = time>=timeMx*0.001 ? -perfInfo.numReductions/time : -1e30;
925:   commarr_in[6] = time>=timeMx*0.001 ? -perfInfo.messageLength/time : -1e30;
926:   commarr_in[7] = parentCount>0      ? -countsPerCall               : -1e30;

928:   MPIU_Allreduce(commarr_in, commarr_out,  N_COMM, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);

930:   flopSpeedMx     =  commarr_out[0];
931:   reductSpeedMx   =  commarr_out[1];
932:   msgSpeedMx      =  commarr_out[2];
933:   countsPerCallMx =  commarr_out[3];

935:   flopSpeedMn     = -commarr_out[4];
936:   reductSpeedMn   = -commarr_out[5];
937:   msgSpeedMn      = -commarr_out[6];
938:   countsPerCallMn = -commarr_out[7];

940:   *isPrinted = ((timeMx/totalTime) >= THRESHOLD) ? PETSC_TRUE : PETSC_FALSE;
941:   if (*isPrinted) {
942:     PetscViewerXMLStartSection(viewer, "event", NULL);
943:     PetscViewerXMLPutString(viewer, "name", NULL, name);
944:     PetscPrintXMLNestedLinePerfResults(viewer, "time", timeMn/totalTime*100.0, timeMx/totalTime*100.0, 1.02);

946:     if (countsPerCallMx<1.01 && countsPerCallMn>0.99) {
947:       /* One call per parent */
948:     } else {
949:       PetscPrintXMLNestedLinePerfResults(viewer, "ncalls", countsPerCallMn, countsPerCallMx, 1.02);
950:     }

952:     if (flopSpeedMx<0.01) {
953:       /* NO flops: don't print */
954:     } else {
955:       PetscPrintXMLNestedLinePerfResults(viewer, "mflops", flopSpeedMn/1e6, flopSpeedMx/1e6, 1.05);
956:     }

958:     if (msgSpeedMx<0.01) {
959:       /* NO msgs: don't print */
960:     } else {
961:       PetscPrintXMLNestedLinePerfResults(viewer, "mbps", msgSpeedMn/1024.0/1024.0, msgSpeedMx/1024.0/1024.0, 1.05);
962:     }

964:     if (reductSpeedMx<0.01) {
965:       /* NO reductions: don't print */
966:     } else {
967:       PetscPrintXMLNestedLinePerfResults(viewer, "nreductsps", reductSpeedMn, reductSpeedMx, 1.05);
968:     }
969:   }
970:   return(0);
971: }

973: /* Count the number of times the parent event was called */

975: static int countParents( const PetscNestedEventTree *tree, PetscEventPerfInfo *eventPerfInfo, int i)
976: {
977:    if (tree[i].depth<=1) {
978:      return 1;  /* Main event: only once */
979:    } else if (!tree[i].own) {
980:      return 1;  /* This event didn't happen in this process, but did in another */
981:    } else {
982:      int iParent;
983:      for (iParent=i-1; iParent>=0; iParent--) {
984:        if (tree[iParent].depth == tree[i].depth-1) break;
985:      }
986:      if (tree[iParent].depth != tree[i].depth-1) {
987:        printf("\n\n   *****  Internal error: cannot find parent ****\n\n");
988:        return -2;
989:      } else {
990:        PetscLogEvent dftEvent  = tree[iParent].dftEvent;
991:        return eventPerfInfo[dftEvent].count;
992:      }
993:    }
994: }

996: typedef struct {
997:   int             id;
998:   PetscLogDouble  val;
999: } PetscSortItem;

1001: static int compareSortItems(const void *item1_, const void *item2_)
1002: {
1003:   PetscSortItem *item1 = (PetscSortItem *) item1_;
1004:   PetscSortItem *item2 = (PetscSortItem *) item2_;
1005:   if (item1->val > item2->val) return -1;
1006:   if (item1->val < item2->val) return +1;
1007:   return 0;
1008: }

1010: static PetscErrorCode PetscLogNestedTreePrint(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, int iStart, PetscLogDouble totalTime)
1011: {
1012:   int                depth = tree[iStart].depth;
1013:   const char         *name;
1014:   int                parentCount, nChildren;
1015:   PetscSortItem      *children;
1016:   PetscErrorCode     ierr;
1017:   const int          stage = MAINSTAGE;
1018:   PetscStageLog      stageLog;
1019:   PetscEventRegInfo  *eventRegInfo;
1020:   PetscEventPerfInfo *eventPerfInfo;
1021:   PetscEventPerfInfo myPerfInfo,  otherPerfInfo, selfPerfInfo;
1022:   PetscLogDouble     countsPerCall;
1023:   PetscBool          wasPrinted;
1024:   PetscBool          childWasPrinted;
1025:   MPI_Comm           comm;

1027:   /* Look up the name of the event and its PerfInfo */
1028:   PetscLogGetStageLog(&stageLog);
1029:   eventRegInfo  = stageLog->eventLog->eventInfo;
1030:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
1031:   name = eventRegInfo[(PetscLogEvent)tree[iStart].nstEvent].name;

1033:   PetscObjectGetComm((PetscObject)viewer,&comm);

1035:   /* Count the number of child processes */
1036:   nChildren = 0;
1037:   {
1038:     int i;
1039:     for (i=iStart+1; i<nTimers; i++) {
1040:       if (tree[i].depth <= depth) break;
1041:       if (tree[i].depth == depth + 1) nChildren++;
1042:     }
1043:   }

1045:   if (nChildren>0) {
1046:     /* Create an array for the id-s and maxTimes of the children,
1047:      *  leaving 2 spaces for self-time and other-time */
1048:     int            i;
1049:     PetscLogDouble *times, *maxTimes;

1051:     PetscMalloc1(nChildren+2,&children);
1052:     nChildren = 0;
1053:     for (i=iStart+1; i<nTimers; i++) {
1054:       if (tree[i].depth<=depth) break;
1055:       if (tree[i].depth == depth + 1) {
1056:         children[nChildren].id  = i;
1057:         children[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
1058:         nChildren++;
1059:       }
1060:     }

1062:     /* Calculate the children's maximum times, to see whether children will be ignored or printed */
1063:     PetscMalloc1(nChildren,&times);
1064:     for (i=0; i<nChildren; i++) { times[i] = children[i].val; }

1066:     PetscMalloc1(nChildren,&maxTimes);
1067:     MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1068:     PetscFree(times);

1070:     for (i=0; i<nChildren; i++) { children[i].val = maxTimes[i]; }
1071:     PetscFree(maxTimes);
1072:   }

1074:   if (!tree[iStart].own) {
1075:   /* Set values for a timer that was not activated in this process
1076:    * (but was, in other processes of this run) */
1077:     PetscMemzero(&myPerfInfo,sizeof(myPerfInfo));

1079:     selfPerfInfo  = myPerfInfo;
1080:     otherPerfInfo = myPerfInfo;

1082:     parentCount   = 1;
1083:     countsPerCall = 0;
1084:   } else {
1085:   /* Set the values for a timer that was activated in this process */
1086:     int           i;
1087:     PetscLogEvent dftEvent   = tree[iStart].dftEvent;

1089:     parentCount    = countParents( tree, eventPerfInfo, iStart);
1090:     myPerfInfo     = eventPerfInfo[dftEvent];
1091:     countsPerCall  = (PetscLogDouble) myPerfInfo.count / (PetscLogDouble) parentCount;

1093:     selfPerfInfo                = myPerfInfo;
1094:     otherPerfInfo.time          = 0;
1095:     otherPerfInfo.flops         = 0;
1096:     otherPerfInfo.numMessages   = 0;
1097:     otherPerfInfo.messageLength = 0;
1098:     otherPerfInfo.numReductions = 0;

1100:     for (i=0; i<nChildren; i++) {
1101:       /* For all child counters: subtract the child values from self-timers */

1103:       PetscLogEvent      dftChild  = tree[children[i].id].dftEvent;
1104:       PetscEventPerfInfo childPerfInfo = eventPerfInfo[dftChild];

1106:       selfPerfInfo.time          -= childPerfInfo.time;
1107:       selfPerfInfo.flops         -= childPerfInfo.flops;
1108:       selfPerfInfo.numMessages   -= childPerfInfo.numMessages;
1109:       selfPerfInfo.messageLength -= childPerfInfo.messageLength;
1110:       selfPerfInfo.numReductions -= childPerfInfo.numReductions;

1112:       if ((children[i].val/totalTime) < THRESHOLD) {
1113:         /* Add them to 'other' if the time is ignored in the output */
1114:         otherPerfInfo.time          += childPerfInfo.time;
1115:         otherPerfInfo.flops         += childPerfInfo.flops;
1116:         otherPerfInfo.numMessages   += childPerfInfo.numMessages;
1117:         otherPerfInfo.messageLength += childPerfInfo.messageLength;
1118:         otherPerfInfo.numReductions += childPerfInfo.numReductions;
1119:       }
1120:     }
1121:   }

1123:   /* Main output for this timer */
1124:   PetscLogNestedTreePrintLine(viewer, myPerfInfo, countsPerCall, parentCount, depth, name, totalTime, &wasPrinted);

1126:   /* Now print the lines for the children */
1127:   if (nChildren > 0) {
1128:     /* Calculate max-times for 'self' and 'other' */
1129:     int            i;
1130:     PetscLogDouble times[2], maxTimes[2];
1131:     times[0] = selfPerfInfo.time;   times[1] = otherPerfInfo.time;
1132:     MPIU_Allreduce(times, maxTimes, 2, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1133:     children[nChildren+0].id = -1;
1134:     children[nChildren+0].val = maxTimes[0];
1135:     children[nChildren+1].id = -2;
1136:     children[nChildren+1].val = maxTimes[1];

1138:     /* Now sort the children (including 'self' and 'other') on total time */
1139:     qsort(children, nChildren+2, sizeof(PetscSortItem), compareSortItems);

1141:     /* Print (or ignore) the children in ascending order of total time */
1142:     PetscViewerXMLStartSection(viewer,"events", NULL);
1143:     for (i=0; i<nChildren+2; i++) {
1144:       if ((children[i].val/totalTime) < THRESHOLD) {
1145:         /* ignored: no output */
1146:       } else if (children[i].id==-1) {
1147:         PetscLogNestedTreePrintLine(viewer, selfPerfInfo, 1, parentCount, depth+1, "self", totalTime, &childWasPrinted);
1148:         if (childWasPrinted) {
1149:           PetscViewerXMLEndSection(viewer,"event");
1150:         }
1151:       } else if (children[i].id==-2) {
1152:         size_t  len;
1153:         char    *otherName;

1155:         PetscStrlen(name,&len);
1156:         PetscMalloc1(len+16,&otherName);
1157:         PetscSNPrintf(otherName,len+16,"%s: other-timed",name);
1158:         PetscLogNestedTreePrintLine(viewer, otherPerfInfo, 1, 1, depth+1, otherName, totalTime, &childWasPrinted);
1159:         PetscFree(otherName);
1160:         if (childWasPrinted) {
1161:           PetscViewerXMLEndSection(viewer,"event");
1162:         }
1163:       } else {
1164:         /* Print the child with a recursive call to this function */
1165:         PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1166:       }
1167:     }
1168:     PetscViewerXMLEndSection(viewer,"events");
1169:     PetscFree(children);
1170:   }

1172:   if (wasPrinted) {
1173:     PetscViewerXMLEndSection(viewer, "event");
1174:   }
1175:   return 0;
1176: }

1178: static PetscErrorCode PetscLogNestedTreePrintTop(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, PetscLogDouble totalTime)
1179: {
1180:   int                i, nChildren;
1181:   PetscSortItem      *children;
1182:   PetscErrorCode     ierr;
1183:   const int          stage = MAINSTAGE;
1184:   PetscStageLog      stageLog;
1185:   PetscEventPerfInfo *eventPerfInfo;
1186:   MPI_Comm           comm;

1189:   PetscObjectGetComm((PetscObject)viewer,&comm);

1191:   /* Look up the PerfInfo */
1192:   PetscLogGetStageLog(&stageLog);
1193:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;

1195:   /* Count the number of child processes, and count total time */
1196:   nChildren = 0;
1197:   for (i=0; i<nTimers; i++)
1198:     if (tree[i].depth==1) nChildren++;

1200:   if (nChildren>0) {
1201:     /* Create an array for the id-s and maxTimes of the children,
1202:      *  leaving 2 spaces for self-time and other-time */
1203:     PetscLogDouble *times, *maxTimes;

1205:     PetscMalloc1(nChildren,&children);
1206:     nChildren = 0;
1207:     for (i=0; i<nTimers; i++) {
1208:       if (tree[i].depth == 1) {
1209:         children[nChildren].id  = i;
1210:         children[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
1211:         nChildren++;
1212:       }
1213:     }

1215:     /* Calculate the children's maximum times, to sort them */
1216:     PetscMalloc1(nChildren,&times);
1217:     for (i=0; i<nChildren; i++) { times[i] = children[i].val; }

1219:     PetscMalloc1(nChildren,&maxTimes);
1220:     MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1221:     PetscFree(times);

1223:     for (i=0; i<nChildren; i++) { children[i].val = maxTimes[i]; }
1224:     PetscFree(maxTimes);

1226:     /* Now sort the children on total time */
1227:     qsort(children, nChildren, sizeof(PetscSortItem), compareSortItems);
1228:     /* Print (or ignore) the children in ascending order of total time */
1229:     PetscViewerXMLStartSection(viewer, "timertree", "Timings tree");
1230:     PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1231:     PetscViewerXMLPutDouble(viewer, "timethreshold", NULL, thresholdTime, "%f");

1233:     for (i=0; i<nChildren; i++) {
1234:       if ((children[i].val/totalTime) < THRESHOLD) {
1235:         /* ignored: no output */
1236:       } else {
1237:         /* Print the child with a recursive call to this function */
1238:         PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1239:       }
1240:     }
1241:     PetscViewerXMLEndSection(viewer, "timertree");
1242:     PetscFree(children);
1243:   }
1244:   return(0);
1245: }

1247: typedef struct {
1248:   char           *name;
1249:   PetscLogDouble time;
1250:   PetscLogDouble flops;
1251:   PetscLogDouble numMessages;
1252:   PetscLogDouble messageLength;
1253:   PetscLogDouble numReductions;
1254: } PetscSelfTimer;

1256: static PetscErrorCode PetscCalcSelfTime(PetscViewer viewer, PetscSelfTimer **p_self, int *p_nstMax)
1257: {
1258:   PetscErrorCode     ierr;
1259:   const int          stage = MAINSTAGE;
1260:   PetscStageLog      stageLog;
1261:   PetscEventRegInfo  *eventRegInfo;
1262:   PetscEventPerfInfo *eventPerfInfo;
1263:   PetscSelfTimer     *selftimes;
1264:   PetscSelfTimer     *totaltimes;
1265:   NestedEventId      *nstEvents;
1266:   int                i, j, maxDefaultTimer;
1267:   NestedEventId      nst;
1268:   PetscLogEvent      dft;
1269:   int                nstMax, nstMax_local;
1270:   MPI_Comm           comm;

1273:   PetscObjectGetComm((PetscObject)viewer,&comm);
1274:   PetscLogGetStageLog(&stageLog);
1275:   eventRegInfo  = stageLog->eventLog->eventInfo;
1276:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;

1278:   /* For each default timer, calculate the (one) nested timer that it corresponds to. */
1279:   maxDefaultTimer =0;
1280:   for (i=0; i<nNestedEvents; i++) {
1281:     int           nParents   = nestedEvents[i].nParents;
1282:     PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1283:     for (j=0; j<nParents; j++) maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);
1284:   }
1285:   PetscMalloc1(maxDefaultTimer+1,&nstEvents);
1286:   for (dft=0; dft<maxDefaultTimer; dft++) {nstEvents[dft] = 0;}
1287:   for (i=0; i<nNestedEvents; i++) {
1288:     int           nParents   = nestedEvents[i].nParents;
1289:     NestedEventId nstEvent   = nestedEvents[i].nstEvent;
1290:     PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1291:     for (j=0; j<nParents; j++) nstEvents[dftEvents[j]] = nstEvent;
1292:   }

1294:   /* Calculate largest nested event-ID */
1295:   nstMax_local = 0;
1296:   for (i=0; i<nNestedEvents; i++) nstMax_local = PetscMax(nestedEvents[i].nstEvent,nstMax_local);
1297:   MPIU_Allreduce(&nstMax_local, &nstMax, 1, MPI_INT, MPI_MAX, comm);


1300:   /* Initialize all total-times with zero */
1301:   PetscMalloc1(nstMax+1,&selftimes);
1302:   PetscMalloc1(nstMax+1,&totaltimes);
1303:   for (nst=0; nst<=nstMax; nst++) {
1304:     totaltimes[nst].time          = 0;
1305:     totaltimes[nst].flops         = 0;
1306:     totaltimes[nst].numMessages   = 0;
1307:     totaltimes[nst].messageLength = 0;
1308:     totaltimes[nst].numReductions = 0;
1309:     totaltimes[nst].name          = NULL;
1310:   }

1312:   /* Calculate total-times */
1313:   for (i=0; i<nNestedEvents; i++) {
1314:     const int            nParents  = nestedEvents[i].nParents;
1315:     const NestedEventId  nstEvent  = nestedEvents[i].nstEvent;
1316:     const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1317:     for (j=0; j<nParents; j++) {
1318:       const PetscLogEvent dftEvent = dftEvents[j];
1319:       totaltimes[nstEvent].time          += eventPerfInfo[dftEvent].time;
1320:       totaltimes[nstEvent].flops         += eventPerfInfo[dftEvent].flops;
1321:       totaltimes[nstEvent].numMessages   += eventPerfInfo[dftEvent].numMessages;
1322:       totaltimes[nstEvent].messageLength += eventPerfInfo[dftEvent].messageLength;
1323:       totaltimes[nstEvent].numReductions += eventPerfInfo[dftEvent].numReductions;
1324:     }
1325:     totaltimes[nstEvent].name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1326:   }

1328:   /* Initialize: self-times := totaltimes */
1329:   for (nst=0; nst<=nstMax; nst++) { selftimes[nst] = totaltimes[nst]; }

1331:   /* Subtract timed subprocesses from self-times */
1332:   for (i=0; i<nNestedEvents; i++) {
1333:     const int           nParents          = nestedEvents[i].nParents;
1334:     const PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
1335:     const NestedEventId *dftParentsSorted = nestedEvents[i].dftParentsSorted;
1336:     for (j=0; j<nParents; j++) {
1337:       if (dftParentsSorted[j] != DFT_ID_AWAKE) {
1338:         const PetscLogEvent dftEvent  = dftEvents[j];
1339:         const NestedEventId nstParent = nstEvents[dftParentsSorted[j]];
1340:         selftimes[nstParent].time          -= eventPerfInfo[dftEvent].time;
1341:         selftimes[nstParent].flops         -= eventPerfInfo[dftEvent].flops;
1342:         selftimes[nstParent].numMessages   -= eventPerfInfo[dftEvent].numMessages;
1343:         selftimes[nstParent].messageLength -= eventPerfInfo[dftEvent].messageLength;
1344:         selftimes[nstParent].numReductions -= eventPerfInfo[dftEvent].numReductions;
1345:       }
1346:     }
1347:   }

1349:   PetscFree(nstEvents);
1350:   PetscFree(totaltimes);

1352:   /* Set outputs */
1353:   *p_self  = selftimes;
1354:   *p_nstMax = nstMax;
1355:   return(0);
1356: }

1358: static PetscErrorCode PetscPrintSelfTime(PetscViewer viewer, const PetscSelfTimer *selftimes, int nstMax, PetscLogDouble totalTime)
1359: {
1360:   PetscErrorCode     ierr;
1361:   int                i;
1362:   NestedEventId      nst;
1363:   PetscSortItem      *sortSelfTimes;
1364:   PetscLogDouble     *times, *maxTimes;
1365:   PetscStageLog      stageLog;
1366:   PetscEventRegInfo  *eventRegInfo;
1367:   const int          dum_depth = 1, dum_count=1, dum_parentcount=1;
1368:   PetscBool          wasPrinted;
1369:   MPI_Comm           comm;

1372:   PetscObjectGetComm((PetscObject)viewer,&comm);
1373:   PetscLogGetStageLog(&stageLog);
1374:   eventRegInfo  = stageLog->eventLog->eventInfo;

1376:   PetscMalloc1(nstMax+1,&times);
1377:   PetscMalloc1(nstMax+1,&maxTimes);
1378:   for (nst=0; nst<=nstMax; nst++) { times[nst] = selftimes[nst].time;}
1379:   MPIU_Allreduce(times, maxTimes, nstMax+1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1380:   PetscFree(times);

1382:   PetscMalloc1(nstMax+1,&sortSelfTimes);

1384:   /* Sort the self-timers on basis of the largest time needed */
1385:   for (nst=0; nst<=nstMax; nst++) {
1386:     sortSelfTimes[nst].id  = nst;
1387:     sortSelfTimes[nst].val = maxTimes[nst];
1388:   }
1389:   PetscFree(maxTimes);
1390:   qsort(sortSelfTimes, nstMax+1, sizeof(PetscSortItem), compareSortItems);

1392:   PetscViewerXMLStartSection(viewer, "selftimertable", "Self-timings");
1393:   PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");

1395:   for (i=0; i<=nstMax; i++) {
1396:     if ((sortSelfTimes[i].val/totalTime) >= THRESHOLD) {
1397:       NestedEventId      nstEvent = sortSelfTimes[i].id;
1398:       const char         *name    = eventRegInfo[(PetscLogEvent)nstEvent].name;
1399:       PetscEventPerfInfo selfPerfInfo;

1401:       selfPerfInfo.time          = selftimes[nstEvent].time ;
1402:       selfPerfInfo.flops         = selftimes[nstEvent].flops;
1403:       selfPerfInfo.numMessages   = selftimes[nstEvent].numMessages;
1404:       selfPerfInfo.messageLength = selftimes[nstEvent].messageLength;
1405:       selfPerfInfo.numReductions = selftimes[nstEvent].numReductions;

1407:       PetscLogNestedTreePrintLine(viewer, selfPerfInfo, dum_count, dum_parentcount, dum_depth, name, totalTime, &wasPrinted);
1408:       if (wasPrinted){
1409:         PetscViewerXMLEndSection(viewer, "event");
1410:       }
1411:     }
1412:   }
1413:   PetscViewerXMLEndSection(viewer, "selftimertable");
1414:   PetscFree(sortSelfTimes);
1415:   return(0);
1416: }

1418: PetscErrorCode PetscLogView_Nested(PetscViewer viewer)
1419: {
1420:   PetscErrorCode     ierr;
1421:   PetscLogDouble     locTotalTime, globTotalTime;
1422:   PetscNestedEventTree *tree = NULL;
1423:   PetscSelfTimer     *selftimers = NULL;
1424:   int                nTimers = 0, nstMax = 0;
1425:   MPI_Comm           comm;

1428:   PetscObjectGetComm((PetscObject)viewer,&comm);
1429:   PetscViewerInitASCII_XML(viewer);
1430:   PetscViewerASCIIPrintf(viewer, "<!-- PETSc Performance Summary: -->\n");
1431:   PetscViewerXMLStartSection(viewer, "petscroot", NULL);

1433:   /* Get the total elapsed time, local and global maximum */
1434:   PetscTime(&locTotalTime);  locTotalTime -= petsc_BaseTime;
1435:   MPIU_Allreduce(&locTotalTime, &globTotalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);

1437:   /* Print global information about this run */
1438:   PetscPrintExeSpecs(viewer);
1439:   PetscPrintGlobalPerformance(viewer, locTotalTime);

1441:   /* Collect nested timer tree info from all processes */
1442:   PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1443:   PetscLogNestedTreePrintTop(viewer, tree, nTimers, globTotalTime);
1444:   PetscLogNestedTreeDestroy(tree, nTimers);

1446:   /* Calculate self-time for all (not-nested) events */
1447:   PetscCalcSelfTime(viewer, &selftimers, &nstMax);
1448:   PetscPrintSelfTime(viewer, selftimers, nstMax, globTotalTime);
1449:   PetscFree(selftimers);

1451:   PetscViewerXMLEndSection(viewer, "petscroot");
1452:   PetscViewerFinalASCII_XML(viewer);
1453:   return(0);
1454: }

1456: PETSC_EXTERN PetscErrorCode PetscASend(int count, int datatype)
1457: {
1460: #endif

1463:   petsc_send_ct++;
1465:   PetscMPITypeSize(&petsc_send_len,count, MPI_Type_f2c((MPI_Fint) datatype));
1466: #endif
1467:   return(0);
1468: }

1470: PETSC_EXTERN PetscErrorCode PetscARecv(int count, int datatype)
1471: {
1474: #endif

1477:   petsc_recv_ct++;
1479:   PetscMPITypeSize(&petsc_recv_len,count, MPI_Type_f2c((MPI_Fint) datatype));
1480: #endif
1481:   return(0);
1482: }

1484: PETSC_EXTERN PetscErrorCode PetscAReduce()
1485: {
1487:   petsc_allreduce_ct++;
1488:   return(0);
1489: }

1491: #endif