LLVM OpenMP* Runtime Library
kmp_stats.h
1 #ifndef KMP_STATS_H
2 #define KMP_STATS_H
3 
8 //===----------------------------------------------------------------------===//
9 //
10 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
11 // See https://llvm.org/LICENSE.txt for license information.
12 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "kmp_config.h"
17 #include "kmp_debug.h"
18 
19 #if KMP_STATS_ENABLED
20 /* Statistics accumulator.
21  Accumulates number of samples and computes min, max, mean, standard deviation
22  on the fly.
23 
24  Online variance calculation algorithm from
25  http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
26  */
27 
28 #include "kmp_stats_timing.h"
29 #include <limits>
30 #include <math.h>
31 #include <new> // placement new
32 #include <stdint.h>
33 #include <string>
34 #include <vector>
35 
36 /* Enable developer statistics here if you want them. They are more detailed
37  than is useful for application characterisation and are intended for the
38  runtime library developer. */
39 #define KMP_DEVELOPER_STATS 0
40 
41 /* Enable/Disable histogram output */
42 #define KMP_STATS_HIST 0
43 
50  noTotal = 1 << 0,
51  onlyInMaster = 1 << 1,
52  noUnits = 1 << 2,
53  notInMaster = 1 << 3,
54  logEvent = 1 << 4
55 };
57 
64  IDLE,
65  SERIAL_REGION,
66  FORK_JOIN_BARRIER,
67  PLAIN_BARRIER,
68  TASKWAIT,
69  TASKYIELD,
70  TASKGROUP,
71  IMPLICIT_TASK,
72  EXPLICIT_TASK,
73  TEAMS_REGION
74 };
75 
94 // clang-format off
95 #define KMP_FOREACH_COUNTER(macro, arg) \
96  macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg) \
97  macro(OMP_NESTED_PARALLEL, 0, arg) \
98  macro(OMP_LOOP_STATIC, 0, arg) \
99  macro(OMP_LOOP_STATIC_STEAL, 0, arg) \
100  macro(OMP_LOOP_DYNAMIC, 0, arg) \
101  macro(OMP_DISTRIBUTE, 0, arg) \
102  macro(OMP_BARRIER, 0, arg) \
103  macro(OMP_CRITICAL, 0, arg) \
104  macro(OMP_SINGLE, 0, arg) \
105  macro(OMP_MASTER, 0, arg) \
106  macro(OMP_MASKED, 0, arg) \
107  macro(OMP_TEAMS, 0, arg) \
108  macro(OMP_set_lock, 0, arg) \
109  macro(OMP_test_lock, 0, arg) \
110  macro(REDUCE_wait, 0, arg) \
111  macro(REDUCE_nowait, 0, arg) \
112  macro(OMP_TASKYIELD, 0, arg) \
113  macro(OMP_TASKLOOP, 0, arg) \
114  macro(TASK_executed, 0, arg) \
115  macro(TASK_cancelled, 0, arg) \
116  macro(TASK_stolen, 0, arg)
117 // clang-format on
118 
137 // clang-format off
138 #define KMP_FOREACH_TIMER(macro, arg) \
139  macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \
140  macro (OMP_parallel, stats_flags_e::logEvent, arg) \
141  macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \
142  macro (OMP_teams, stats_flags_e::logEvent, arg) \
143  macro (OMP_teams_overhead, stats_flags_e::logEvent, arg) \
144  macro (OMP_loop_static, 0, arg) \
145  macro (OMP_loop_static_scheduling, 0, arg) \
146  macro (OMP_loop_dynamic, 0, arg) \
147  macro (OMP_loop_dynamic_scheduling, 0, arg) \
148  macro (OMP_distribute, 0, arg) \
149  macro (OMP_distribute_scheduling, 0, arg) \
150  macro (OMP_critical, 0, arg) \
151  macro (OMP_critical_wait, 0, arg) \
152  macro (OMP_single, 0, arg) \
153  macro (OMP_master, 0, arg) \
154  macro (OMP_masked, 0, arg) \
155  macro (OMP_task_immediate, 0, arg) \
156  macro (OMP_task_taskwait, 0, arg) \
157  macro (OMP_task_taskyield, 0, arg) \
158  macro (OMP_task_taskgroup, 0, arg) \
159  macro (OMP_task_join_bar, 0, arg) \
160  macro (OMP_task_plain_bar, 0, arg) \
161  macro (OMP_taskloop_scheduling, 0, arg) \
162  macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \
163  macro (OMP_idle, stats_flags_e::logEvent, arg) \
164  macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \
165  macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \
166  macro (OMP_serial, stats_flags_e::logEvent, arg) \
167  macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \
168  arg) \
169  macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \
170  arg) \
171  macro (OMP_loop_static_iterations, \
172  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
173  macro (OMP_loop_static_total_iterations, \
174  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
175  macro (OMP_loop_dynamic_iterations, \
176  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
177  macro (OMP_loop_dynamic_total_iterations, \
178  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
179  macro (OMP_distribute_iterations, \
180  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
181  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
182 // clang-format on
183 
184 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
185 // initializing OpenMP or being created by a primary
186 // thread) until the thread is destroyed
187 // OMP_parallel -- Time thread spends executing work directly
188 // within a #pragma omp parallel
189 // OMP_parallel_overhead -- Time thread spends setting up a parallel region
190 // OMP_loop_static -- Time thread spends executing loop iterations from
191 // a statically scheduled loop
192 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
193 // from a statically scheduled loop
194 // OMP_loop_dynamic -- Time thread spends executing loop iterations from
195 // a dynamically scheduled loop
196 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
197 // from a dynamically scheduled loop
198 // OMP_critical -- Time thread spends executing critical section
199 // OMP_critical_wait -- Time thread spends waiting to enter
200 // a critical section
201 // OMP_single -- Time spent executing a "single" region
202 // OMP_master -- Time spent executing a "master" region
203 // OMP_masked -- Time spent executing a "masked" region
204 // OMP_task_immediate -- Time spent executing non-deferred tasks
205 // OMP_task_taskwait -- Time spent executing tasks inside a taskwait
206 // construct
207 // OMP_task_taskyield -- Time spent executing tasks inside a taskyield
208 // construct
209 // OMP_task_taskgroup -- Time spent executing tasks inside a taskygroup
210 // construct
211 // OMP_task_join_bar -- Time spent executing tasks inside a join barrier
212 // OMP_task_plain_bar -- Time spent executing tasks inside a barrier
213 // construct
214 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
215 // construct
216 // OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or
217 // inside implicit barrier at end of worksharing
218 // construct
219 // OMP_idle -- Time worker threads spend waiting for next
220 // parallel region
221 // OMP_fork_barrier -- Time spent in a the fork barrier surrounding a
222 // parallel region
223 // OMP_join_barrier -- Time spent in a the join barrier surrounding a
224 // parallel region
225 // OMP_serial -- Time thread zero spends executing serial code
226 // OMP_set_numthreads -- Values passed to omp_set_num_threads
227 // OMP_PARALLEL_args -- Number of arguments passed to a parallel region
228 // OMP_loop_static_iterations -- Number of iterations thread is assigned for
229 // statically scheduled loops
230 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
231 // dynamically scheduled loops
232 
233 #if (KMP_DEVELOPER_STATS)
234 // Timers which are of interest to runtime library developers, not end users.
235 // These have to be explicitly enabled in addition to the other stats.
236 
237 // KMP_fork_barrier -- time in __kmp_fork_barrier
238 // KMP_join_barrier -- time in __kmp_join_barrier
239 // KMP_barrier -- time in __kmp_barrier
240 // KMP_end_split_barrier -- time in __kmp_end_split_barrier
241 // KMP_setup_icv_copy -- time in __kmp_setup_icv_copy
242 // KMP_icv_copy -- start/stop timer for any ICV copying
243 // KMP_linear_gather -- time in __kmp_linear_barrier_gather
244 // KMP_linear_release -- time in __kmp_linear_barrier_release
245 // KMP_tree_gather -- time in __kmp_tree_barrier_gather
246 // KMP_tree_release -- time in __kmp_tree_barrier_release
247 // KMP_hyper_gather -- time in __kmp_hyper_barrier_gather
248 // KMP_hyper_release -- time in __kmp_hyper_barrier_release
249 // clang-format off
250 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \
251  macro(KMP_fork_call, 0, arg) \
252  macro(KMP_join_call, 0, arg) \
253  macro(KMP_end_split_barrier, 0, arg) \
254  macro(KMP_hier_gather, 0, arg) \
255  macro(KMP_hier_release, 0, arg) \
256  macro(KMP_hyper_gather, 0, arg) \
257  macro(KMP_hyper_release, 0, arg) \
258  macro(KMP_linear_gather, 0, arg) \
259  macro(KMP_linear_release, 0, arg) \
260  macro(KMP_tree_gather, 0, arg) \
261  macro(KMP_tree_release, 0, arg) \
262  macro(USER_resume, 0, arg) \
263  macro(USER_suspend, 0, arg) \
264  macro(USER_mwait, 0, arg) \
265  macro(KMP_allocate_team, 0, arg) \
266  macro(KMP_setup_icv_copy, 0, arg) \
267  macro(USER_icv_copy, 0, arg) \
268  macro (FOR_static_steal_stolen, \
269  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
270  macro (FOR_static_steal_chunks, \
271  stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
272 #else
273 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
274 #endif
275 // clang-format on
276 
296 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
297 
298 #define ENUMERATE(name, ignore, prefix) prefix##name,
299 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
300 
301 enum explicit_timer_e {
302  KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
303 };
304 
305 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
306 #undef ENUMERATE
307 
308 /*
309  * A logarithmic histogram. It accumulates the number of values in each power of
310  * ten bin. So 1<=x<10, 10<=x<100, ...
311  * Mostly useful where we have some big outliers and want to see information
312  * about them.
313  */
314 class logHistogram {
315  enum {
316  numBins = 31, /* Number of powers of 10. If this changes you need to change
317  * the initializer for binMax */
318 
319  /*
320  * If you want to use this to analyse values that may be less than 1, (for
321  * instance times in s), then the logOffset gives you negative powers.
322  * In our case here, we're just looking at times in ticks, or counts, so we
323  * can never see values with magnitude < 1 (other than zero), so we can set
324  * it to 0. As above change the initializer if you change this.
325  */
326  logOffset = 0
327  };
328  uint32_t KMP_ALIGN_CACHE zeroCount;
329  struct {
330  uint32_t count;
331  double total;
332  } bins[numBins];
333 
334  static double binMax[numBins];
335 
336 #ifdef KMP_DEBUG
337  uint64_t _total;
338 
339  void check() const {
340  uint64_t t = zeroCount;
341  for (int i = 0; i < numBins; i++)
342  t += bins[i].count;
343  KMP_DEBUG_ASSERT(t == _total);
344  }
345 #else
346  void check() const {}
347 #endif
348 
349 public:
350  logHistogram() { reset(); }
351 
352  logHistogram(logHistogram const &o) {
353  for (int i = 0; i < numBins; i++)
354  bins[i] = o.bins[i];
355 #ifdef KMP_DEBUG
356  _total = o._total;
357 #endif
358  }
359 
360  void reset() {
361  zeroCount = 0;
362  for (int i = 0; i < numBins; i++) {
363  bins[i].count = 0;
364  bins[i].total = 0;
365  }
366 
367 #ifdef KMP_DEBUG
368  _total = 0;
369 #endif
370  }
371  uint32_t count(int b) const { return bins[b + logOffset].count; }
372  double total(int b) const { return bins[b + logOffset].total; }
373  static uint32_t findBin(double sample);
374 
375  logHistogram &operator+=(logHistogram const &o) {
376  zeroCount += o.zeroCount;
377  for (int i = 0; i < numBins; i++) {
378  bins[i].count += o.bins[i].count;
379  bins[i].total += o.bins[i].total;
380  }
381 #ifdef KMP_DEBUG
382  _total += o._total;
383  check();
384 #endif
385 
386  return *this;
387  }
388 
389  void addSample(double sample);
390  int minBin() const;
391  int maxBin() const;
392 
393  std::string format(char) const;
394 };
395 
396 class statistic {
397  double KMP_ALIGN_CACHE minVal;
398  double maxVal;
399  double meanVal;
400  double m2;
401  uint64_t sampleCount;
402  double offset;
403  bool collectingHist;
404  logHistogram hist;
405 
406 public:
407  statistic(bool doHist = bool(KMP_STATS_HIST)) {
408  reset();
409  collectingHist = doHist;
410  }
411  statistic(statistic const &o)
412  : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
413  sampleCount(o.sampleCount), offset(o.offset),
414  collectingHist(o.collectingHist), hist(o.hist) {}
415  statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
416  : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
417  sampleCount(sc), offset(0.0), collectingHist(false) {}
418  bool haveHist() const { return collectingHist; }
419  double getMin() const { return minVal; }
420  double getMean() const { return meanVal; }
421  double getMax() const { return maxVal; }
422  uint64_t getCount() const { return sampleCount; }
423  double getSD() const { return sqrt(m2 / sampleCount); }
424  double getTotal() const { return sampleCount * meanVal; }
425  logHistogram const *getHist() const { return &hist; }
426  void setOffset(double d) { offset = d; }
427 
428  void reset() {
429  minVal = (std::numeric_limits<double>::max)();
430  maxVal = -minVal;
431  meanVal = 0.0;
432  m2 = 0.0;
433  sampleCount = 0;
434  offset = 0.0;
435  hist.reset();
436  }
437  void addSample(double sample);
438  void scale(double factor);
439  void scaleDown(double f) { scale(1. / f); }
440  void forceCount(uint64_t count) { sampleCount = count; }
441  statistic &operator+=(statistic const &other);
442 
443  std::string format(char unit, bool total = false) const;
444  std::string formatHist(char unit) const { return hist.format(unit); }
445 };
446 
447 struct statInfo {
448  const char *name;
449  uint32_t flags;
450 };
451 
452 class timeStat : public statistic {
453  static statInfo timerInfo[];
454 
455 public:
456  timeStat() : statistic() {}
457  static const char *name(timer_e e) { return timerInfo[e].name; }
458  static bool noTotal(timer_e e) {
459  return timerInfo[e].flags & stats_flags_e::noTotal;
460  }
461  static bool masterOnly(timer_e e) {
462  return timerInfo[e].flags & stats_flags_e::onlyInMaster;
463  }
464  static bool workerOnly(timer_e e) {
465  return timerInfo[e].flags & stats_flags_e::notInMaster;
466  }
467  static bool noUnits(timer_e e) {
468  return timerInfo[e].flags & stats_flags_e::noUnits;
469  }
470  static bool logEvent(timer_e e) {
471  return timerInfo[e].flags & stats_flags_e::logEvent;
472  }
473  static void clearEventFlags() {
474  for (int i = 0; i < TIMER_LAST; i++) {
475  timerInfo[i].flags &= (~(stats_flags_e::logEvent));
476  }
477  }
478 };
479 
480 // Where we need explicitly to start and end the timer, this version can be used
481 // Since these timers normally aren't nicely scoped, so don't have a good place
482 // to live on the stack of the thread, they're more work to use.
483 class explicitTimer {
484  timeStat *stat;
485  timer_e timerEnumValue;
486  tsc_tick_count startTime;
487  tsc_tick_count pauseStartTime;
488  tsc_tick_count::tsc_interval_t totalPauseTime;
489 
490 public:
491  explicitTimer(timeStat *s, timer_e te)
492  : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
493  totalPauseTime() {}
494 
495  // void setStat(timeStat *s) { stat = s; }
496  void start(tsc_tick_count tick);
497  void pause(tsc_tick_count tick) { pauseStartTime = tick; }
498  void resume(tsc_tick_count tick) {
499  totalPauseTime += (tick - pauseStartTime);
500  }
501  void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
502  void reset() {
503  startTime = 0;
504  pauseStartTime = 0;
505  totalPauseTime = 0;
506  }
507  timer_e get_type() const { return timerEnumValue; }
508 };
509 
510 // Where you need to partition a threads clock ticks into separate states
511 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
512 // DOING_NOTHING would render these conditions:
513 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
514 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
515 // versa
516 class partitionedTimers {
517 private:
518  std::vector<explicitTimer> timer_stack;
519 
520 public:
521  partitionedTimers();
522  void init(explicitTimer timer);
523  void exchange(explicitTimer timer);
524  void push(explicitTimer timer);
525  void pop();
526  void windup();
527 };
528 
529 // Special wrapper around the partitioned timers to aid timing code blocks
530 // It avoids the need to have an explicit end, leaving the scope suffices.
531 class blockPartitionedTimer {
532  partitionedTimers *part_timers;
533 
534 public:
535  blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
536  : part_timers(pt) {
537  part_timers->push(timer);
538  }
539  ~blockPartitionedTimer() { part_timers->pop(); }
540 };
541 
542 // Special wrapper around the thread state to aid in keeping state in code
543 // blocks It avoids the need to have an explicit end, leaving the scope
544 // suffices.
545 class blockThreadState {
546  stats_state_e *state_pointer;
547  stats_state_e old_state;
548 
549 public:
550  blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
551  : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
552  *state_pointer = new_state;
553  }
554  ~blockThreadState() { *state_pointer = old_state; }
555 };
556 
557 // If all you want is a count, then you can use this...
558 // The individual per-thread counts will be aggregated into a statistic at
559 // program exit.
560 class counter {
561  uint64_t value;
562  static const statInfo counterInfo[];
563 
564 public:
565  counter() : value(0) {}
566  void increment() { value++; }
567  uint64_t getValue() const { return value; }
568  void reset() { value = 0; }
569  static const char *name(counter_e e) { return counterInfo[e].name; }
570  static bool masterOnly(counter_e e) {
571  return counterInfo[e].flags & stats_flags_e::onlyInMaster;
572  }
573 };
574 
575 /* ****************************************************************
576  Class to implement an event
577 
578  There are four components to an event: start time, stop time
579  nest_level, and timer_name.
580  The start and stop time should be obvious (recorded in clock ticks).
581  The nest_level relates to the bar width in the timeline graph.
582  The timer_name is used to determine which timer event triggered this event.
583 
584  the interface to this class is through four read-only operations:
585  1) getStart() -- returns the start time as 64 bit integer
586  2) getStop() -- returns the stop time as 64 bit integer
587  3) getNestLevel() -- returns the nest level of the event
588  4) getTimerName() -- returns the timer name that triggered event
589 
590  *MORE ON NEST_LEVEL*
591  The nest level is used in the bar graph that represents the timeline.
592  Its main purpose is for showing how events are nested inside eachother.
593  For example, say events, A, B, and C are recorded. If the timeline
594  looks like this:
595 
596 Begin -------------------------------------------------------------> Time
597  | | | | | |
598  A B C C B A
599  start start start end end end
600 
601  Then A, B, C will have a nest level of 1, 2, 3 respectively.
602  These values are then used to calculate the barwidth so you can
603  see that inside A, B has occurred, and inside B, C has occurred.
604  Currently, this is shown with A's bar width being larger than B's
605  bar width, and B's bar width being larger than C's bar width.
606 
607 **************************************************************** */
608 class kmp_stats_event {
609  uint64_t start;
610  uint64_t stop;
611  int nest_level;
612  timer_e timer_name;
613 
614 public:
615  kmp_stats_event()
616  : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
617  kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
618  : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
619  inline uint64_t getStart() const { return start; }
620  inline uint64_t getStop() const { return stop; }
621  inline int getNestLevel() const { return nest_level; }
622  inline timer_e getTimerName() const { return timer_name; }
623 };
624 
625 /* ****************************************************************
626  Class to implement a dynamically expandable array of events
627 
628  ---------------------------------------------------------
629  | event 1 | event 2 | event 3 | event 4 | ... | event N |
630  ---------------------------------------------------------
631 
632  An event is pushed onto the back of this array at every
633  explicitTimer->stop() call. The event records the thread #,
634  start time, stop time, and nest level related to the bar width.
635 
636  The event vector starts at size INIT_SIZE and grows (doubles in size)
637  if needed. An implication of this behavior is that log(N)
638  reallocations are needed (where N is number of events). If you want
639  to avoid reallocations, then set INIT_SIZE to a large value.
640 
641  the interface to this class is through six operations:
642  1) reset() -- sets the internal_size back to 0 but does not deallocate any
643  memory
644  2) size() -- returns the number of valid elements in the vector
645  3) push_back(start, stop, nest, timer_name) -- pushes an event onto
646  the back of the array
647  4) deallocate() -- frees all memory associated with the vector
648  5) sort() -- sorts the vector by start time
649  6) operator[index] or at(index) -- returns event reference at that index
650 **************************************************************** */
651 class kmp_stats_event_vector {
652  kmp_stats_event *events;
653  int internal_size;
654  int allocated_size;
655  static const int INIT_SIZE = 1024;
656 
657 public:
658  kmp_stats_event_vector() {
659  events =
660  (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
661  internal_size = 0;
662  allocated_size = INIT_SIZE;
663  }
664  ~kmp_stats_event_vector() {}
665  inline void reset() { internal_size = 0; }
666  inline int size() const { return internal_size; }
667  void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
668  timer_e name) {
669  int i;
670  if (internal_size == allocated_size) {
671  kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
672  sizeof(kmp_stats_event) * allocated_size * 2);
673  for (i = 0; i < internal_size; i++)
674  tmp[i] = events[i];
675  __kmp_free(events);
676  events = tmp;
677  allocated_size *= 2;
678  }
679  events[internal_size] =
680  kmp_stats_event(start_time, stop_time, nest_level, name);
681  internal_size++;
682  return;
683  }
684  void deallocate();
685  void sort();
686  const kmp_stats_event &operator[](int index) const { return events[index]; }
687  kmp_stats_event &operator[](int index) { return events[index]; }
688  const kmp_stats_event &at(int index) const { return events[index]; }
689  kmp_stats_event &at(int index) { return events[index]; }
690 };
691 
692 /* ****************************************************************
693  Class to implement a doubly-linked, circular, statistics list
694 
695  |---| ---> |---| ---> |---| ---> |---| ---> ... next
696  | | | | | | | |
697  |---| <--- |---| <--- |---| <--- |---| <--- ... prev
698  Sentinel first second third
699  Node node node node
700 
701  The Sentinel Node is the user handle on the list.
702  The first node corresponds to thread 0's statistics.
703  The second node corresponds to thread 1's statistics and so on...
704 
705  Each node has a _timers, _counters, and _explicitTimers array to hold that
706  thread's statistics. The _explicitTimers point to the correct _timer and
707  update its statistics at every stop() call. The explicitTimers' pointers are
708  set up in the constructor. Each node also has an event vector to hold that
709  thread's timing events. The event vector expands as necessary and records
710  the start-stop times for each timer.
711 
712  The nestLevel variable is for plotting events and is related
713  to the bar width in the timeline graph.
714 
715  Every thread will have a thread local pointer to its node in
716  the list. The sentinel node is used by the primary thread to
717  store "dummy" statistics before __kmp_create_worker() is called.
718 **************************************************************** */
719 class kmp_stats_list {
720  int gtid;
721  timeStat _timers[TIMER_LAST + 1];
722  counter _counters[COUNTER_LAST + 1];
723  explicitTimer thread_life_timer;
724  partitionedTimers _partitionedTimers;
725  int _nestLevel; // one per thread
726  kmp_stats_event_vector _event_vector;
727  kmp_stats_list *next;
728  kmp_stats_list *prev;
729  stats_state_e state;
730  int thread_is_idle_flag;
731 
732 public:
733  kmp_stats_list()
734  : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
735  TIMER_OMP_worker_thread_life),
736  _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
737  thread_is_idle_flag(0) {}
738  ~kmp_stats_list() {}
739  inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
740  inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
741  inline partitionedTimers *getPartitionedTimers() {
742  return &_partitionedTimers;
743  }
744  inline timeStat *getTimers() { return _timers; }
745  inline counter *getCounters() { return _counters; }
746  inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
747  inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
748  inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
749  inline void resetEventVector() { _event_vector.reset(); }
750  inline void incrementNestValue() { _nestLevel++; }
751  inline int getNestValue() { return _nestLevel; }
752  inline void decrementNestValue() { _nestLevel--; }
753  inline int getGtid() const { return gtid; }
754  inline void setGtid(int newgtid) { gtid = newgtid; }
755  inline void setState(stats_state_e newstate) { state = newstate; }
756  inline stats_state_e getState() const { return state; }
757  inline stats_state_e *getStatePointer() { return &state; }
758  inline bool isIdle() { return thread_is_idle_flag == 1; }
759  inline void setIdleFlag() { thread_is_idle_flag = 1; }
760  inline void resetIdleFlag() { thread_is_idle_flag = 0; }
761  kmp_stats_list *push_back(int gtid); // returns newly created list node
762  inline void push_event(uint64_t start_time, uint64_t stop_time,
763  int nest_level, timer_e name) {
764  _event_vector.push_back(start_time, stop_time, nest_level, name);
765  }
766  void deallocate();
767  class iterator;
768  kmp_stats_list::iterator begin();
769  kmp_stats_list::iterator end();
770  int size();
771  class iterator {
772  kmp_stats_list *ptr;
773  friend kmp_stats_list::iterator kmp_stats_list::begin();
774  friend kmp_stats_list::iterator kmp_stats_list::end();
775 
776  public:
777  iterator();
778  ~iterator();
779  iterator operator++();
780  iterator operator++(int dummy);
781  iterator operator--();
782  iterator operator--(int dummy);
783  bool operator!=(const iterator &rhs);
784  bool operator==(const iterator &rhs);
785  kmp_stats_list *operator*() const; // dereference operator
786  };
787 };
788 
789 /* ****************************************************************
790  Class to encapsulate all output functions and the environment variables
791 
792  This module holds filenames for various outputs (normal stats, events, plot
793  file), as well as coloring information for the plot file.
794 
795  The filenames and flags variables are read from environment variables.
796  These are read once by the constructor of the global variable
797  __kmp_stats_output which calls init().
798 
799  During this init() call, event flags for the timeStat::timerInfo[] global
800  array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
801 
802  The only interface function that is public is outputStats(heading). This
803  function should print out everything it needs to, either to files or stderr,
804  depending on the environment variables described below
805 
806  ENVIRONMENT VARIABLES:
807  KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
808  file, otherwise, print to stderr
809  KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
810  either KMP_STATS_FILE or stderr
811  KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
812  otherwise, the plot file is sent to "events.plt"
813  KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
814  events
815  KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
816  otherwise, output is sent to "events.dat"
817 **************************************************************** */
818 class kmp_stats_output_module {
819 
820 public:
821  struct rgb_color {
822  float r;
823  float g;
824  float b;
825  };
826 
827 private:
828  std::string outputFileName;
829  static const char *eventsFileName;
830  static const char *plotFileName;
831  static int printPerThreadFlag;
832  static int printPerThreadEventsFlag;
833  static const rgb_color globalColorArray[];
834  static rgb_color timerColorInfo[];
835 
836  void init();
837  static void setupEventColors();
838  static void printPloticusFile();
839  static void printHeaderInfo(FILE *statsOut);
840  static void printTimerStats(FILE *statsOut, statistic const *theStats,
841  statistic const *totalStats);
842  static void printCounterStats(FILE *statsOut, statistic const *theStats);
843  static void printCounters(FILE *statsOut, counter const *theCounters);
844  static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
845  int gtid);
846  static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
847  static void windupExplicitTimers();
848  bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
849 
850 public:
851  kmp_stats_output_module() { init(); }
852  void outputStats(const char *heading);
853 };
854 
855 #ifdef __cplusplus
856 extern "C" {
857 #endif
858 void __kmp_stats_init();
859 void __kmp_stats_fini();
860 void __kmp_reset_stats();
861 void __kmp_output_stats(const char *);
862 void __kmp_accumulate_stats_at_exit(void);
863 // thread local pointer to stats node within list
864 extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
865 // head to stats list.
866 extern kmp_stats_list *__kmp_stats_list;
867 // lock for __kmp_stats_list
868 extern kmp_tas_lock_t __kmp_stats_lock;
869 // reference start time
870 extern tsc_tick_count __kmp_stats_start_time;
871 // interface to output
872 extern kmp_stats_output_module __kmp_stats_output;
873 
874 #ifdef __cplusplus
875 }
876 #endif
877 
878 // Simple, standard interfaces that drop out completely if stats aren't enabled
879 
891 #define KMP_COUNT_VALUE(name, value) \
892  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)value)
893 
904 #define KMP_COUNT_BLOCK(name) \
905  __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
906 
924 #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
925 
933 #define KMP_INIT_PARTITIONED_TIMERS(name) \
934  __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \
935  __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
936 
937 #define KMP_TIME_PARTITIONED_BLOCK(name) \
938  blockPartitionedTimer __PBLOCKTIME__( \
939  __kmp_stats_thread_ptr->getPartitionedTimers(), \
940  explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \
941  TIMER_##name))
942 
943 #define KMP_PUSH_PARTITIONED_TIMER(name) \
944  __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \
945  __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
946 
947 #define KMP_POP_PARTITIONED_TIMER() \
948  __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
949 
950 #define KMP_EXCHANGE_PARTITIONED_TIMER(name) \
951  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \
952  __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
953 
954 #define KMP_SET_THREAD_STATE(state_name) \
955  __kmp_stats_thread_ptr->setState(state_name)
956 
957 #define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()
958 
959 #define KMP_SET_THREAD_STATE_BLOCK(state_name) \
960  blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
961  state_name)
962 
970 #define KMP_RESET_STATS() __kmp_reset_stats()
971 
972 #if (KMP_DEVELOPER_STATS)
973 #define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
974 #define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
975 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
976 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
977 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER(n)
978 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) \
979  KMP_EXCHANGE_PARTITIONED_TIMER(n)
980 #else
981 // Null definitions
982 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
983 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
984 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
985 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
986 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
987 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
988 #endif
989 
990 #else // KMP_STATS_ENABLED
991 
992 // Null definitions
993 #define KMP_COUNT_VALUE(n, v) ((void)0)
994 #define KMP_COUNT_BLOCK(n) ((void)0)
995 
996 #define KMP_OUTPUT_STATS(heading_string) ((void)0)
997 #define KMP_RESET_STATS() ((void)0)
998 
999 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
1000 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
1001 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
1002 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1003 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1004 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1005 #define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
1006 #define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
1007 #define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
1008 #define KMP_POP_PARTITIONED_TIMER() ((void)0)
1009 #define KMP_SET_THREAD_STATE(state_name) ((void)0)
1010 #define KMP_GET_THREAD_STATE() ((void)0)
1011 #define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1012 #endif // KMP_STATS_ENABLED
1013 
1014 #endif // KMP_STATS_H
statistic is valid only for primary thread
Definition: kmp_stats.h:51
statistic is valid only for non-primary threads
Definition: kmp_stats.h:53
do not show a TOTAL_aggregation for this statistic
Definition: kmp_stats.h:50
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg)
Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
Definition: kmp_stats.h:296
statistic doesn&#39;t need units printed next to it
Definition: kmp_stats.h:52
stats_flags_e
flags to describe the statistic (timer or counter)
Definition: kmp_stats.h:49
#define KMP_FOREACH_COUNTER(macro, arg)
Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h.
Definition: kmp_stats.h:95
stats_state_e
the states which a thread can be in
Definition: kmp_stats.h:63