LLVM OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
16  * it may change values between parallel regions. __kmp_max_nth
17  * is the largest value __kmp_nth may take, and 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43  kmp_info_t *th;
44 
45  KMP_DEBUG_ASSERT(gtid_ref);
46 
47  if (__kmp_env_consistency_check) {
48  th = __kmp_threads[*gtid_ref];
49  if (th->th.th_root->r.r_active &&
50  (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56  }
57  }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61  kmp_info_t *th;
62 
63  if (__kmp_env_consistency_check) {
64  th = __kmp_threads[*gtid_ref];
65  if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66  __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67  }
68  }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73  bool use_hier = false) {
74  // Pick up the nonmonotonic/monotonic bits from the scheduling type
75  // TODO: make nonmonotonic when static_steal is fixed
76  int monotonicity = SCHEDULE_MONOTONIC;
77 
78  // Let the default be monotonic for executables
79  // compiled by OpenMP* 4.5 or earlier compilers
80  if (loc != NULL && loc->get_openmp_version() < 50)
81  monotonicity = SCHEDULE_MONOTONIC;
82 
83  if (use_hier || __kmp_force_monotonic)
84  monotonicity = SCHEDULE_MONOTONIC;
85  else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86  monotonicity = SCHEDULE_NONMONOTONIC;
87  else if (SCHEDULE_HAS_MONOTONIC(schedule))
88  monotonicity = SCHEDULE_MONOTONIC;
89 
90  return monotonicity;
91 }
92 
93 #if KMP_STATIC_STEAL_ENABLED
94 enum { // values for steal_flag (possible states of private per-loop buffer)
95  UNUSED = 0,
96  CLAIMED = 1, // owner thread started initialization
97  READY = 2, // available for stealing
98  THIEF = 3 // finished by owner, or claimed by thief
99  // possible state changes:
100  // 0 -> 1 owner only, sync
101  // 0 -> 3 thief only, sync
102  // 1 -> 2 owner only, async
103  // 2 -> 3 owner only, async
104  // 3 -> 2 owner only, async
105  // 3 -> 0 last thread finishing the loop, async
106 };
107 #endif
108 
109 // Initialize a dispatch_private_info_template<T> buffer for a particular
110 // type of schedule and chunk. The loop description is found in lb (lower bound),
111 // ub (upper bound), and st (stride). nproc is the number of threads relevant
112 // to the scheduling (often the number of threads in a team, but not always if
113 // hierarchical scheduling is used). tid is the id of the thread calling
114 // the function within the group of nproc threads. It will have a value
115 // between 0 and nproc - 1. This is often just the thread id within a team, but
116 // is not necessarily the case when using hierarchical scheduling.
117 // loc is the source file location of the corresponding loop;
118 // gtid is the global thread id.
119 template <typename T>
120 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
121  dispatch_private_info_template<T> *pr,
122  enum sched_type schedule, T lb, T ub,
123  typename traits_t<T>::signed_t st,
124 #if USE_ITT_BUILD
125  kmp_uint64 *cur_chunk,
126 #endif
127  typename traits_t<T>::signed_t chunk,
128  T nproc, T tid) {
129  typedef typename traits_t<T>::unsigned_t UT;
130  typedef typename traits_t<T>::floating_t DBL;
131 
132  int active;
133  T tc;
134  kmp_info_t *th;
135  kmp_team_t *team;
136  int monotonicity;
137  bool use_hier;
138 
139 #ifdef KMP_DEBUG
140  typedef typename traits_t<T>::signed_t ST;
141  {
142  char *buff;
143  // create format specifiers before the debug output
144  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
145  "pr:%%p lb:%%%s ub:%%%s st:%%%s "
146  "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
147  traits_t<T>::spec, traits_t<T>::spec,
148  traits_t<ST>::spec, traits_t<ST>::spec,
149  traits_t<T>::spec, traits_t<T>::spec);
150  KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
151  __kmp_str_free(&buff);
152  }
153 #endif
154  /* setup data */
155  th = __kmp_threads[gtid];
156  team = th->th.th_team;
157  active = !team->t.t_serialized;
158 
159 #if USE_ITT_BUILD
160  int itt_need_metadata_reporting =
161  __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
162  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
163  team->t.t_active_level == 1;
164 #endif
165 
166 #if KMP_USE_HIER_SCHED
167  use_hier = pr->flags.use_hier;
168 #else
169  use_hier = false;
170 #endif
171 
172  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
173  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
174  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
175 
176  /* Pick up the nomerge/ordered bits from the scheduling type */
177  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
178  pr->flags.nomerge = TRUE;
179  schedule =
180  (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
181  } else {
182  pr->flags.nomerge = FALSE;
183  }
184  pr->type_size = traits_t<T>::type_size; // remember the size of variables
185  if (kmp_ord_lower & schedule) {
186  pr->flags.ordered = TRUE;
187  schedule =
188  (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
189  } else {
190  pr->flags.ordered = FALSE;
191  }
192  // Ordered overrides nonmonotonic
193  if (pr->flags.ordered) {
194  monotonicity = SCHEDULE_MONOTONIC;
195  }
196 
197  if (schedule == kmp_sch_static) {
198  schedule = __kmp_static;
199  } else {
200  if (schedule == kmp_sch_runtime) {
201  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
202  // not specified)
203  schedule = team->t.t_sched.r_sched_type;
204  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
205  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
206  if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
207  monotonicity = SCHEDULE_MONOTONIC;
208  // Detail the schedule if needed (global controls are differentiated
209  // appropriately)
210  if (schedule == kmp_sch_guided_chunked) {
211  schedule = __kmp_guided;
212  } else if (schedule == kmp_sch_static) {
213  schedule = __kmp_static;
214  }
215  // Use the chunk size specified by OMP_SCHEDULE (or default if not
216  // specified)
217  chunk = team->t.t_sched.chunk;
218 #if USE_ITT_BUILD
219  if (cur_chunk)
220  *cur_chunk = chunk;
221 #endif
222 #ifdef KMP_DEBUG
223  {
224  char *buff;
225  // create format specifiers before the debug output
226  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
227  "schedule:%%d chunk:%%%s\n",
228  traits_t<ST>::spec);
229  KD_TRACE(10, (buff, gtid, schedule, chunk));
230  __kmp_str_free(&buff);
231  }
232 #endif
233  } else {
234  if (schedule == kmp_sch_guided_chunked) {
235  schedule = __kmp_guided;
236  }
237  if (chunk <= 0) {
238  chunk = KMP_DEFAULT_CHUNK;
239  }
240  }
241 
242  if (schedule == kmp_sch_auto) {
243  // mapping and differentiation: done in __kmp_do_serial_initialize()
244  schedule = __kmp_auto;
245 #ifdef KMP_DEBUG
246  {
247  char *buff;
248  // create format specifiers before the debug output
249  buff = __kmp_str_format(
250  "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
251  "schedule:%%d chunk:%%%s\n",
252  traits_t<ST>::spec);
253  KD_TRACE(10, (buff, gtid, schedule, chunk));
254  __kmp_str_free(&buff);
255  }
256 #endif
257  }
258 #if KMP_STATIC_STEAL_ENABLED
259  // map nonmonotonic:dynamic to static steal
260  if (schedule == kmp_sch_dynamic_chunked) {
261  if (monotonicity == SCHEDULE_NONMONOTONIC)
262  schedule = kmp_sch_static_steal;
263  }
264 #endif
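  // e.g. a loop with schedule(nonmonotonic : dynamic, 4) is served by the
  // work-stealing implementation (when enough chunks exist, see the
  // kmp_sch_static_steal case below), while monotonic or unmodified dynamic
  // stays on the shared-counter kmp_sch_dynamic_chunked path.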
265  /* guided analytical not safe for too many threads */
266  if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
267  schedule = kmp_sch_guided_iterative_chunked;
268  KMP_WARNING(DispatchManyThreads);
269  }
270  if (schedule == kmp_sch_runtime_simd) {
271  // compiler provides simd_width in the chunk parameter
272  schedule = team->t.t_sched.r_sched_type;
273  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
274  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
275  // Detail the schedule if needed (global controls are differentiated
276  // appropriately)
277  if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
278  schedule == __kmp_static) {
279  schedule = kmp_sch_static_balanced_chunked;
280  } else {
281  if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
282  schedule = kmp_sch_guided_simd;
283  }
284  chunk = team->t.t_sched.chunk * chunk;
285  }
286 #if USE_ITT_BUILD
287  if (cur_chunk)
288  *cur_chunk = chunk;
289 #endif
290 #ifdef KMP_DEBUG
291  {
292  char *buff;
293  // create format specifiers before the debug output
294  buff = __kmp_str_format(
295  "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
296  " chunk:%%%s\n",
297  traits_t<ST>::spec);
298  KD_TRACE(10, (buff, gtid, schedule, chunk));
299  __kmp_str_free(&buff);
300  }
301 #endif
302  }
303  pr->u.p.parm1 = chunk;
304  }
305  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
306  "unknown scheduling type");
307 
308  pr->u.p.count = 0;
309 
310  if (__kmp_env_consistency_check) {
311  if (st == 0) {
312  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
313  (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
314  }
315  }
316  // compute trip count
317  if (st == 1) { // most common case
318  if (ub >= lb) {
319  tc = ub - lb + 1;
320  } else { // ub < lb
321  tc = 0; // zero-trip
322  }
323  } else if (st < 0) {
324  if (lb >= ub) {
325  // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
326  // where the division needs to be unsigned regardless of the result type
327  tc = (UT)(lb - ub) / (-st) + 1;
328  } else { // lb < ub
329  tc = 0; // zero-trip
330  }
331  } else { // st > 0
332  if (ub >= lb) {
333  // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
334  // where the division needs to be unsigned regardless of the result type
335  tc = (UT)(ub - lb) / st + 1;
336  } else { // ub < lb
337  tc = 0; // zero-trip
338  }
339  }
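  // e.g. lb=0, ub=9, st=3   -> tc = 9/3 + 1 = 4  (iterations 0,3,6,9);
  //      lb=10, ub=1, st=-4 -> tc = 9/4 + 1 = 3  (iterations 10,6,2)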
340 
341 #if KMP_STATS_ENABLED
342  if (KMP_MASTER_GTID(gtid)) {
343  KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
344  }
345 #endif
346 
347  pr->u.p.lb = lb;
348  pr->u.p.ub = ub;
349  pr->u.p.st = st;
350  pr->u.p.tc = tc;
351 
352 #if KMP_OS_WINDOWS
353  pr->u.p.last_upper = ub + st;
354 #endif /* KMP_OS_WINDOWS */
355 
356  /* NOTE: only the active parallel region(s) have active ordered sections */
357 
358  if (active) {
359  if (pr->flags.ordered) {
360  pr->ordered_bumped = 0;
361  pr->u.p.ordered_lower = 1;
362  pr->u.p.ordered_upper = 0;
363  }
364  }
365 
366  switch (schedule) {
367 #if KMP_STATIC_STEAL_ENABLED
368  case kmp_sch_static_steal: {
369  T ntc, init;
370 
371  KD_TRACE(100,
372  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
373  gtid));
374 
375  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
376  if (nproc > 1 && ntc >= nproc) {
377  KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
378  T id = tid;
379  T small_chunk, extras;
380  kmp_uint32 old = UNUSED;
381  int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
382  if (traits_t<T>::type_size > 4) {
383  // AC: TODO: check if 16-byte CAS available and use it to
384  // improve performance (probably wait for explicit request
385  // before spending time on this).
386  // For now use dynamically allocated per-private-buffer lock,
387  // free memory in __kmp_dispatch_next when status==0.
388  pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
389  __kmp_init_lock(pr->u.p.steal_lock);
390  }
391  small_chunk = ntc / nproc;
392  extras = ntc % nproc;
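  // e.g. ntc=10 chunks, nproc=4 -> small_chunk=2, extras=2, so threads 0..3
  // start at chunk indices 0,3,6,8 and own 3,3,2,2 chunks respectively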
393 
394  init = id * small_chunk + (id < extras ? id : extras);
395  pr->u.p.count = init;
396  if (claimed) { // did we succeed in claiming our own buffer?
397  pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
398  // Other threads will inspect steal_flag when searching for a victim.
399  // READY means other threads may steal from this thread from now on.
400  KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
401  } else {
402  // another thread has stolen our whole range
403  KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
404  pr->u.p.ub = init; // mark that there are no iterations to work on
405  }
406  pr->u.p.parm2 = ntc; // save number of chunks
407  // parm3 is the number of times to attempt stealing, which is
408  // nproc (just a heuristic; could be optimized later on).
409  pr->u.p.parm3 = nproc;
410  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
411  break;
412  } else {
413  /* too few chunks: switching to kmp_sch_dynamic_chunked */
414  schedule = kmp_sch_dynamic_chunked;
415  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
416  "kmp_sch_dynamic_chunked\n",
417  gtid));
418  goto dynamic_init;
419  break;
420  } // if
421  } // case
422 #endif
423  case kmp_sch_static_balanced: {
424  T init, limit;
425 
426  KD_TRACE(
427  100,
428  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
429  gtid));
430 
431  if (nproc > 1) {
432  T id = tid;
433 
434  if (tc < nproc) {
435  if (id < tc) {
436  init = id;
437  limit = id;
438  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
439  } else {
440  pr->u.p.count = 1; /* means no more chunks to execute */
441  pr->u.p.parm1 = FALSE;
442  break;
443  }
444  } else {
445  T small_chunk = tc / nproc;
446  T extras = tc % nproc;
447  init = id * small_chunk + (id < extras ? id : extras);
448  limit = init + small_chunk - (id < extras ? 0 : 1);
449  pr->u.p.parm1 = (id == nproc - 1);
450  }
451  } else {
452  if (tc > 0) {
453  init = 0;
454  limit = tc - 1;
455  pr->u.p.parm1 = TRUE;
456  } else {
457  // zero trip count
458  pr->u.p.count = 1; /* means no more chunks to execute */
459  pr->u.p.parm1 = FALSE;
460  break;
461  }
462  }
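  // e.g. tc=10, nproc=4 -> small_chunk=2, extras=2, giving per-thread
  // [init,limit] offsets [0,2] [3,5] [6,7] [8,9]; only thread nproc-1 has
  // the lastiter flag (parm1) set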
463 #if USE_ITT_BUILD
464  // Calculate chunk for metadata report
465  if (itt_need_metadata_reporting)
466  if (cur_chunk)
467  *cur_chunk = limit - init + 1;
468 #endif
469  if (st == 1) {
470  pr->u.p.lb = lb + init;
471  pr->u.p.ub = lb + limit;
472  } else {
473  // ub_tmp is the calculated upper bound; "ub" is the user-defined upper bound
474  T ub_tmp = lb + limit * st;
475  pr->u.p.lb = lb + init * st;
476  // adjust upper bound to "ub" if needed, so that MS lastprivate will match
477  // it exactly
478  if (st > 0) {
479  pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
480  } else {
481  pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
482  }
483  }
484  if (pr->flags.ordered) {
485  pr->u.p.ordered_lower = init;
486  pr->u.p.ordered_upper = limit;
487  }
488  break;
489  } // case
490  case kmp_sch_static_balanced_chunked: {
491  // similar to balanced, but chunk adjusted to multiple of simd width
492  T nth = nproc;
493  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
494  " -> falling-through to static_greedy\n",
495  gtid));
496  schedule = kmp_sch_static_greedy;
497  if (nth > 1)
498  pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
499  else
500  pr->u.p.parm1 = tc;
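  // e.g. tc=100, nth=4, chunk(simd width)=8 -> ceil(100/4)=25 is rounded up
  // to parm1=32 iterations per thread (the mask assumes chunk is a power of 2)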
501  break;
502  } // case
503  case kmp_sch_guided_simd:
504  case kmp_sch_guided_iterative_chunked: {
505  KD_TRACE(
506  100,
507  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
508  " case\n",
509  gtid));
510 
511  if (nproc > 1) {
512  if ((2L * chunk + 1) * nproc >= tc) {
513  /* chunk size too large, switch to dynamic */
514  schedule = kmp_sch_dynamic_chunked;
515  goto dynamic_init;
516  } else {
517  // when remaining iters become less than parm2 - switch to dynamic
518  pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
519  *(double *)&pr->u.p.parm3 =
520  guided_flt_param / (double)nproc; // may occupy parm3 and parm4
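  // e.g. nproc=4, chunk=7 -> parm2 = 2*4*(7+1) = 64 with the default
  // guided_int_param (the "K=2" noted in __kmp_dispatch_next_algorithm), so
  // the loop degrades to plain dynamic once fewer than 64 iterations remain;
  // parm3 holds roughly 1/(2*nproc), the factor used to size each grab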
521  }
522  } else {
523  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
524  "kmp_sch_static_greedy\n",
525  gtid));
526  schedule = kmp_sch_static_greedy;
527  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
528  KD_TRACE(
529  100,
530  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
531  gtid));
532  pr->u.p.parm1 = tc;
533  } // if
534  } // case
535  break;
536  case kmp_sch_guided_analytical_chunked: {
537  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
538  "kmp_sch_guided_analytical_chunked case\n",
539  gtid));
540 
541  if (nproc > 1) {
542  if ((2L * chunk + 1) * nproc >= tc) {
543  /* chunk size too large, switch to dynamic */
544  schedule = kmp_sch_dynamic_chunked;
545  goto dynamic_init;
546  } else {
547  /* commonly used term: (2 nproc - 1)/(2 nproc) */
548  DBL x;
549 
550 #if KMP_USE_X87CONTROL
551  /* Linux* OS already has 64-bit computation by default for long double,
552  and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
553  Windows* OS on IA-32 architecture, we need to set precision to 64-bit
554  instead of the default 53-bit. Even though long double doesn't work
555  on Windows* OS on Intel(R) 64, the resulting lack of precision is not
556  expected to impact the correctness of the algorithm, but this has not
557  been mathematically proven. */
558  // save original FPCW and set precision to 64-bit, as
559  // Windows* OS on IA-32 architecture defaults to 53-bit
560  unsigned int oldFpcw = _control87(0, 0);
561  _control87(_PC_64, _MCW_PC); // 0,0x30000
562 #endif
563  /* value used for comparison in solver for cross-over point */
564  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
565 
566  /* crossover point--chunk indexes equal to or greater than
567  this point switch to dynamic-style scheduling */
568  UT cross;
569 
570  /* commonly used term: (2 nproc - 1)/(2 nproc) */
571  x = 1.0 - 0.5 / (double)nproc;
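  // After i guided chunks roughly tc*x^i iterations remain, so the crossover
  // is the smallest i with x^i <= target, i.e. the point where the
  // analytically sized chunk would shrink to the user-specified chunk;
  // the bisection below finds that i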
572 
573 #ifdef KMP_DEBUG
574  { // test natural alignment
575  struct _test_a {
576  char a;
577  union {
578  char b;
579  DBL d;
580  };
581  } t;
582  ptrdiff_t natural_alignment =
583  (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
584  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
585  // long)natural_alignment );
586  KMP_DEBUG_ASSERT(
587  (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
588  }
589 #endif // KMP_DEBUG
590 
591  /* save the term in thread private dispatch structure */
592  *(DBL *)&pr->u.p.parm3 = x;
593 
594  /* solve for the crossover point to the nearest integer i for which C_i
595  <= chunk */
596  {
597  UT left, right, mid;
598  long double p;
599 
600  /* estimate initial upper and lower bound */
601 
602  /* doesn't matter what value right is as long as it is positive, but
603  it affects performance of the solver */
604  right = 229;
605  p = __kmp_pow<UT>(x, right);
606  if (p > target) {
607  do {
608  p *= p;
609  right <<= 1;
610  } while (p > target && right < (1 << 27));
611  /* lower bound is previous (failed) estimate of upper bound */
612  left = right >> 1;
613  } else {
614  left = 0;
615  }
616 
617  /* bisection root-finding method */
618  while (left + 1 < right) {
619  mid = (left + right) / 2;
620  if (__kmp_pow<UT>(x, mid) > target) {
621  left = mid;
622  } else {
623  right = mid;
624  }
625  } // while
626  cross = right;
627  }
628  /* assert sanity of computed crossover point */
629  KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
630  __kmp_pow<UT>(x, cross) <= target);
631 
632  /* save the crossover point in thread private dispatch structure */
633  pr->u.p.parm2 = cross;
634 
635 // C75803
636 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
637 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
638 #else
639 #define GUIDED_ANALYTICAL_WORKAROUND (x)
640 #endif
641  /* dynamic-style scheduling offset */
642  pr->u.p.count = tc -
643  __kmp_dispatch_guided_remaining(
644  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
645  cross * chunk;
646 #if KMP_USE_X87CONTROL
647  // restore FPCW
648  _control87(oldFpcw, _MCW_PC);
649 #endif
650  } // if
651  } else {
652  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
653  "kmp_sch_static_greedy\n",
654  gtid));
655  schedule = kmp_sch_static_greedy;
656  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
657  pr->u.p.parm1 = tc;
658  } // if
659  } // case
660  break;
661  case kmp_sch_static_greedy:
662  KD_TRACE(
663  100,
664  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
665  gtid));
666  pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
667  break;
668  case kmp_sch_static_chunked:
669  case kmp_sch_dynamic_chunked:
670  dynamic_init:
671  if (pr->u.p.parm1 <= 0)
672  pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
673  else if (pr->u.p.parm1 > tc)
674  pr->u.p.parm1 = tc;
675  // Store the total number of chunks to prevent integer overflow during
676  // bounds calculations in the get next chunk routine.
677  pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
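  // e.g. tc=100, chunk=7 -> parm2 = 14 + 1 = 15 chunks (the last chunk holds
  // only 2 iterations)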
678  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
679  "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
680  gtid));
681  break;
682  case kmp_sch_trapezoidal: {
683  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
684 
685  T parm1, parm2, parm3, parm4;
686  KD_TRACE(100,
687  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
688  gtid));
689 
690  parm1 = chunk;
691 
692  /* F : size of the first cycle */
693  parm2 = (tc / (2 * nproc));
694 
695  if (parm2 < 1) {
696  parm2 = 1;
697  }
698 
699  /* L : size of the last cycle. Make sure the last cycle is not larger
700  than the first cycle. */
701  if (parm1 < 1) {
702  parm1 = 1;
703  } else if (parm1 > parm2) {
704  parm1 = parm2;
705  }
706 
707  /* N : number of cycles */
708  parm3 = (parm2 + parm1);
709  parm3 = (2 * tc + parm3 - 1) / parm3;
710 
711  if (parm3 < 2) {
712  parm3 = 2;
713  }
714 
715  /* sigma : decreasing incr of the trapezoid */
716  parm4 = (parm3 - 1);
717  parm4 = (parm2 - parm1) / parm4;
718 
719  // pointless check, because parm4 >= 0 always
720  // if ( parm4 < 0 ) {
721  // parm4 = 0;
722  //}
723 
724  pr->u.p.parm1 = parm1;
725  pr->u.p.parm2 = parm2;
726  pr->u.p.parm3 = parm3;
727  pr->u.p.parm4 = parm4;
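  // e.g. tc=1000, nproc=4, chunk=10 -> F=parm2=125, L=parm1=10, N=parm3=15
  // cycles, sigma=parm4=8; chunk sizes then decrease 125,117,109,...,
  // covering at least tc iterations in total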
728  } // case
729  break;
730 
731  default: {
732  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
733  KMP_HNT(GetNewerLibrary), // Hint
734  __kmp_msg_null // Variadic argument list terminator
735  );
736  } break;
737  } // switch
738  pr->schedule = schedule;
739 }
740 
741 #if KMP_USE_HIER_SCHED
742 template <typename T>
743 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
744  typename traits_t<T>::signed_t st);
745 template <>
746 inline void
747 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
748  kmp_int32 ub, kmp_int32 st) {
749  __kmp_dispatch_init_hierarchy<kmp_int32>(
750  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
751  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
752 }
753 template <>
754 inline void
755 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
756  kmp_uint32 ub, kmp_int32 st) {
757  __kmp_dispatch_init_hierarchy<kmp_uint32>(
758  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
759  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
760 }
761 template <>
762 inline void
763 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
764  kmp_int64 ub, kmp_int64 st) {
765  __kmp_dispatch_init_hierarchy<kmp_int64>(
766  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
767  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
768 }
769 template <>
770 inline void
771 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
772  kmp_uint64 ub, kmp_int64 st) {
773  __kmp_dispatch_init_hierarchy<kmp_uint64>(
774  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
775  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
776 }
777 
778 // free all the hierarchy scheduling memory associated with the team
779 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
780  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
781  for (int i = 0; i < num_disp_buff; ++i) {
782  // type does not matter here so use kmp_int32
783  auto sh =
784  reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
785  &team->t.t_disp_buffer[i]);
786  if (sh->hier) {
787  sh->hier->deallocate();
788  __kmp_free(sh->hier);
789  }
790  }
791 }
792 #endif
793 
794 // UT - unsigned flavor of T, ST - signed flavor of T,
795 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
796 template <typename T>
797 static void
798 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
799  T ub, typename traits_t<T>::signed_t st,
800  typename traits_t<T>::signed_t chunk, int push_ws) {
801  typedef typename traits_t<T>::unsigned_t UT;
802 
803  int active;
804  kmp_info_t *th;
805  kmp_team_t *team;
806  kmp_uint32 my_buffer_index;
807  dispatch_private_info_template<T> *pr;
808  dispatch_shared_info_template<T> volatile *sh;
809 
810  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
811  sizeof(dispatch_private_info));
812  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
813  sizeof(dispatch_shared_info));
814  __kmp_assert_valid_gtid(gtid);
815 
816  if (!TCR_4(__kmp_init_parallel))
817  __kmp_parallel_initialize();
818 
819  __kmp_resume_if_soft_paused();
820 
821 #if INCLUDE_SSC_MARKS
822  SSC_MARK_DISPATCH_INIT();
823 #endif
824 #ifdef KMP_DEBUG
825  typedef typename traits_t<T>::signed_t ST;
826  {
827  char *buff;
828  // create format specifiers before the debug output
829  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
830  "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
831  traits_t<ST>::spec, traits_t<T>::spec,
832  traits_t<T>::spec, traits_t<ST>::spec);
833  KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
834  __kmp_str_free(&buff);
835  }
836 #endif
837  /* setup data */
838  th = __kmp_threads[gtid];
839  team = th->th.th_team;
840  active = !team->t.t_serialized;
841  th->th.th_ident = loc;
842 
843  // Any half-decent optimizer will remove this test when the blocks are empty
844  // since the macros expand to nothing
845  // when statistics are disabled.
846  if (schedule == __kmp_static) {
847  KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
848  } else {
849  KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
850  }
851 
852 #if KMP_USE_HIER_SCHED
853  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
854  // environment variable. Hierarchical scheduling does not work with ordered,
855  // so if ordered is detected, revert to threaded scheduling.
856  bool ordered;
857  enum sched_type my_sched = schedule;
858  my_buffer_index = th->th.th_dispatch->th_disp_index;
859  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
860  &th->th.th_dispatch
861  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
862  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
863  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
864  my_sched =
865  (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
866  ordered = (kmp_ord_lower & my_sched);
867  if (pr->flags.use_hier) {
868  if (ordered) {
869  KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
870  "Disabling hierarchical scheduling.\n",
871  gtid));
872  pr->flags.use_hier = FALSE;
873  }
874  }
875  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
876  // Don't use hierarchical for ordered parallel loops and don't
877  // use the runtime hierarchy if one was specified in the program
878  if (!ordered && !pr->flags.use_hier)
879  __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
880  }
881 #endif // KMP_USE_HIER_SCHED
882 
883 #if USE_ITT_BUILD
884  kmp_uint64 cur_chunk = chunk;
885  int itt_need_metadata_reporting =
886  __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
887  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
888  team->t.t_active_level == 1;
889 #endif
890  if (!active) {
891  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
892  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
893  } else {
894  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
895  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
896 
897  my_buffer_index = th->th.th_dispatch->th_disp_index++;
898 
899  /* What happens when number of threads changes, need to resize buffer? */
900  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
901  &th->th.th_dispatch
902  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
903  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
904  &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
905  KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
906  my_buffer_index));
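  // The dispatch buffers form a ring of __kmp_dispatch_num_buffers entries
  // shared by consecutive worksharing constructs; a slot can only be reused
  // once the team has released it, i.e. once sh->buffer_index catches up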
907  if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
908  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
909  " sh->buffer_index:%d\n",
910  gtid, my_buffer_index, sh->buffer_index));
911  __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
912  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
913  // Note: KMP_WAIT() cannot be used there: buffer index and
914  // my_buffer_index are *always* 32-bit integers.
915  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
916  "sh->buffer_index:%d\n",
917  gtid, my_buffer_index, sh->buffer_index));
918  }
919  }
920 
921  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
922 #if USE_ITT_BUILD
923  &cur_chunk,
924 #endif
925  chunk, (T)th->th.th_team_nproc,
926  (T)th->th.th_info.ds.ds_tid);
927  if (active) {
928  if (pr->flags.ordered == 0) {
929  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
930  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
931  } else {
932  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
933  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
934  }
935  th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
936  th->th.th_dispatch->th_dispatch_sh_current =
937  CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
938 #if USE_ITT_BUILD
939  if (pr->flags.ordered) {
940  __kmp_itt_ordered_init(gtid);
941  }
942  // Report loop metadata
943  if (itt_need_metadata_reporting) {
944  // Only report metadata by primary thread of active team at level 1
945  kmp_uint64 schedtype = 0;
946  switch (schedule) {
947  case kmp_sch_static_chunked:
948  case kmp_sch_static_balanced: // Chunk is calculated in the switch above
949  break;
950  case kmp_sch_static_greedy:
951  cur_chunk = pr->u.p.parm1;
952  break;
953  case kmp_sch_dynamic_chunked:
954  schedtype = 1;
955  break;
956  case kmp_sch_guided_iterative_chunked:
957  case kmp_sch_guided_analytical_chunked:
958  case kmp_sch_guided_simd:
959  schedtype = 2;
960  break;
961  default:
962  // Should we put this case under "static"?
963  // case kmp_sch_static_steal:
964  schedtype = 3;
965  break;
966  }
967  __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
968  }
969 #if KMP_USE_HIER_SCHED
970  if (pr->flags.use_hier) {
971  pr->u.p.count = 0;
972  pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
973  }
974 #endif // KMP_USE_HIER_SCHED
975 #endif /* USE_ITT_BUILD */
976  }
977 
978 #ifdef KMP_DEBUG
979  {
980  char *buff;
981  // create format specifiers before the debug output
982  buff = __kmp_str_format(
983  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
984  "lb:%%%s ub:%%%s"
985  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
986  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
987  traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
988  traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
989  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
990  traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
991  KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
992  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
993  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
994  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
995  __kmp_str_free(&buff);
996  }
997 #endif
998 #if OMPT_SUPPORT && OMPT_OPTIONAL
999  if (ompt_enabled.ompt_callback_work) {
1000  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1001  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1002  ompt_callbacks.ompt_callback(ompt_callback_work)(
1003  ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1004  &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1005  }
1006 #endif
1007  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1008 }
1009 
1010 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1011  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1012  * every chunk of iterations. If the ordered section(s) were not executed
1013  * for this iteration (or every iteration in this chunk), we need to set the
1014  * ordered iteration counters so that the next thread can proceed. */
1015 template <typename UT>
1016 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1017  typedef typename traits_t<UT>::signed_t ST;
1018  __kmp_assert_valid_gtid(gtid);
1019  kmp_info_t *th = __kmp_threads[gtid];
1020 
1021  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1022  if (!th->th.th_team->t.t_serialized) {
1023 
1024  dispatch_private_info_template<UT> *pr =
1025  reinterpret_cast<dispatch_private_info_template<UT> *>(
1026  th->th.th_dispatch->th_dispatch_pr_current);
1027  dispatch_shared_info_template<UT> volatile *sh =
1028  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1029  th->th.th_dispatch->th_dispatch_sh_current);
1030  KMP_DEBUG_ASSERT(pr);
1031  KMP_DEBUG_ASSERT(sh);
1032  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1033  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1034 
1035  if (pr->ordered_bumped) {
1036  KD_TRACE(
1037  1000,
1038  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1039  gtid));
1040  pr->ordered_bumped = 0;
1041  } else {
1042  UT lower = pr->u.p.ordered_lower;
1043 
1044 #ifdef KMP_DEBUG
1045  {
1046  char *buff;
1047  // create format specifiers before the debug output
1048  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1049  "ordered_iteration:%%%s lower:%%%s\n",
1050  traits_t<UT>::spec, traits_t<UT>::spec);
1051  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1052  __kmp_str_free(&buff);
1053  }
1054 #endif
1055 
1056  __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1057  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1058  KMP_MB(); /* is this necessary? */
1059 #ifdef KMP_DEBUG
1060  {
1061  char *buff;
1062  // create format specifiers before the debug output
1063  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1064  "ordered_iteration:%%%s lower:%%%s\n",
1065  traits_t<UT>::spec, traits_t<UT>::spec);
1066  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1067  __kmp_str_free(&buff);
1068  }
1069 #endif
1070 
1071  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1072  } // if
1073  } // if
1074  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1075 }
1076 
1077 #ifdef KMP_GOMP_COMPAT
1078 
1079 template <typename UT>
1080 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1081  typedef typename traits_t<UT>::signed_t ST;
1082  __kmp_assert_valid_gtid(gtid);
1083  kmp_info_t *th = __kmp_threads[gtid];
1084 
1085  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1086  if (!th->th.th_team->t.t_serialized) {
1087  dispatch_private_info_template<UT> *pr =
1088  reinterpret_cast<dispatch_private_info_template<UT> *>(
1089  th->th.th_dispatch->th_dispatch_pr_current);
1090  dispatch_shared_info_template<UT> volatile *sh =
1091  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1092  th->th.th_dispatch->th_dispatch_sh_current);
1093  KMP_DEBUG_ASSERT(pr);
1094  KMP_DEBUG_ASSERT(sh);
1095  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1096  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1097 
1098  UT lower = pr->u.p.ordered_lower;
1099  UT upper = pr->u.p.ordered_upper;
1100  UT inc = upper - lower + 1;
1101 
1102  if (pr->ordered_bumped == inc) {
1103  KD_TRACE(
1104  1000,
1105  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1106  gtid));
1107  pr->ordered_bumped = 0;
1108  } else {
1109  inc -= pr->ordered_bumped;
1110 
1111 #ifdef KMP_DEBUG
1112  {
1113  char *buff;
1114  // create format specifiers before the debug output
1115  buff = __kmp_str_format(
1116  "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1117  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1118  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1119  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1120  __kmp_str_free(&buff);
1121  }
1122 #endif
1123 
1124  __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1125  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1126 
1127  KMP_MB(); /* is this necessary? */
1128  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1129  "ordered_bumped to zero\n",
1130  gtid));
1131  pr->ordered_bumped = 0;
1133 #ifdef KMP_DEBUG
1134  {
1135  char *buff;
1136  // create format specifiers before the debug output
1137  buff = __kmp_str_format(
1138  "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1139  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1140  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1141  traits_t<UT>::spec);
1142  KD_TRACE(1000,
1143  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1144  __kmp_str_free(&buff);
1145  }
1146 #endif
1147 
1148  test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1149  }
1150  // }
1151  }
1152  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1153 }
1154 
1155 #endif /* KMP_GOMP_COMPAT */
1156 
1157 template <typename T>
1158 int __kmp_dispatch_next_algorithm(int gtid,
1159  dispatch_private_info_template<T> *pr,
1160  dispatch_shared_info_template<T> volatile *sh,
1161  kmp_int32 *p_last, T *p_lb, T *p_ub,
1162  typename traits_t<T>::signed_t *p_st, T nproc,
1163  T tid) {
1164  typedef typename traits_t<T>::unsigned_t UT;
1165  typedef typename traits_t<T>::signed_t ST;
1166  typedef typename traits_t<T>::floating_t DBL;
1167  int status = 0;
1168  bool last = false;
1169  T start;
1170  ST incr;
1171  UT limit, trip, init;
1172  kmp_info_t *th = __kmp_threads[gtid];
1173  kmp_team_t *team = th->th.th_team;
1174 
1175  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1176  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1177  KMP_DEBUG_ASSERT(pr);
1178  KMP_DEBUG_ASSERT(sh);
1179  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1180 #ifdef KMP_DEBUG
1181  {
1182  char *buff;
1183  // create format specifiers before the debug output
1184  buff =
1185  __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1186  "sh:%%p nproc:%%%s tid:%%%s\n",
1187  traits_t<T>::spec, traits_t<T>::spec);
1188  KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1189  __kmp_str_free(&buff);
1190  }
1191 #endif
1192 
1193  // zero trip count
1194  if (pr->u.p.tc == 0) {
1195  KD_TRACE(10,
1196  ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1197  "zero status:%d\n",
1198  gtid, status));
1199  return 0;
1200  }
1201 
1202  switch (pr->schedule) {
1203 #if KMP_STATIC_STEAL_ENABLED
1204  case kmp_sch_static_steal: {
1205  T chunk = pr->u.p.parm1;
1206  UT nchunks = pr->u.p.parm2;
1207  KD_TRACE(100,
1208  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1209  gtid));
1210 
1211  trip = pr->u.p.tc - 1;
1212 
1213  if (traits_t<T>::type_size > 4) {
1214  // use lock for 8-byte induction variable.
1215  // TODO (optional): check presence and use 16-byte CAS
1216  kmp_lock_t *lck = pr->u.p.steal_lock;
1217  KMP_DEBUG_ASSERT(lck != NULL);
1218  if (pr->u.p.count < (UT)pr->u.p.ub) {
1219  KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1220  __kmp_acquire_lock(lck, gtid);
1221  // try to get own chunk of iterations
1222  init = (pr->u.p.count)++;
1223  status = (init < (UT)pr->u.p.ub);
1224  __kmp_release_lock(lck, gtid);
1225  } else {
1226  status = 0; // no own chunks
1227  }
1228  if (!status) { // try to steal
1229  kmp_lock_t *lckv; // victim buffer's lock
1230  T while_limit = pr->u.p.parm3;
1231  T while_index = 0;
1232  int idx = (th->th.th_dispatch->th_disp_index - 1) %
1233  __kmp_dispatch_num_buffers; // current loop index
1234  // note: victim thread can potentially execute another loop
1235  KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1236  while ((!status) && (while_limit != ++while_index)) {
1237  dispatch_private_info_template<T> *v;
1238  T remaining;
1239  T victimId = pr->u.p.parm4;
1240  T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1241  v = reinterpret_cast<dispatch_private_info_template<T> *>(
1242  &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1243  KMP_DEBUG_ASSERT(v);
1244  while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1245  oldVictimId != victimId) {
1246  victimId = (victimId + 1) % nproc;
1247  v = reinterpret_cast<dispatch_private_info_template<T> *>(
1248  &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1249  KMP_DEBUG_ASSERT(v);
1250  }
1251  if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1252  continue; // try once more (nproc attempts in total)
1253  }
1254  if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1255  kmp_uint32 old = UNUSED;
1256  // try to steal whole range from inactive victim
1257  status = v->steal_flag.compare_exchange_strong(old, THIEF);
1258  if (status) {
1259  // initialize self buffer with victim's whole range of chunks
1260  T id = victimId;
1261  T small_chunk, extras;
1262  small_chunk = nchunks / nproc; // chunks per thread
1263  extras = nchunks % nproc;
1264  init = id * small_chunk + (id < extras ? id : extras);
1265  __kmp_acquire_lock(lck, gtid);
1266  pr->u.p.count = init + 1; // exclude one we execute immediately
1267  pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1268  __kmp_release_lock(lck, gtid);
1269  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1270  // no need to reinitialize other thread invariants: lb, st, etc.
1271 #ifdef KMP_DEBUG
1272  {
1273  char *buff;
1274  // create format specifiers before the debug output
1275  buff = __kmp_str_format(
1276  "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1277  "count:%%%s ub:%%%s\n",
1278  traits_t<UT>::spec, traits_t<T>::spec);
1279  KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1280  __kmp_str_free(&buff);
1281  }
1282 #endif
1283  // activate non-empty buffer and let others steal from us
1284  if (pr->u.p.count < (UT)pr->u.p.ub)
1285  KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1286  break;
1287  }
1288  }
1289  if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
1290  v->u.p.count >= (UT)v->u.p.ub) {
1291  pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1292  continue; // no chunks to steal, try next victim
1293  }
1294  lckv = v->u.p.steal_lock;
1295  KMP_ASSERT(lckv != NULL);
1296  __kmp_acquire_lock(lckv, gtid);
1297  limit = v->u.p.ub; // keep initial ub
1298  if (v->u.p.count >= limit) {
1299  __kmp_release_lock(lckv, gtid);
1300  pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1301  continue; // no chunks to steal, try next victim
1302  }
1303 
1304  // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1305  // TODO: is this heuristic good enough??
1306  remaining = limit - v->u.p.count;
1307  if (remaining > 7) {
1308  // steal 1/4 of remaining
1309  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1310  init = (v->u.p.ub -= (remaining >> 2));
1311  } else {
1312  // steal 1 chunk of 1..7 remaining
1313  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1314  init = (v->u.p.ub -= 1);
1315  }
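  // e.g. the victim still has 20 unclaimed chunks -> its ub drops by 5 and
  // the thief takes chunk indices [ub-5, ub-1], returning chunk 'init' from
  // this call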
1316  __kmp_release_lock(lckv, gtid);
1317 #ifdef KMP_DEBUG
1318  {
1319  char *buff;
1320  // create format specifiers before the debug output
1321  buff = __kmp_str_format(
1322  "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1323  "count:%%%s ub:%%%s\n",
1324  traits_t<UT>::spec, traits_t<UT>::spec);
1325  KD_TRACE(10, (buff, gtid, victimId, init, limit));
1326  __kmp_str_free(&buff);
1327  }
1328 #endif
1329  KMP_DEBUG_ASSERT(init + 1 <= limit);
1330  pr->u.p.parm4 = victimId; // remember victim to steal from
1331  status = 1;
1332  // now update own count and ub with stolen range excluding init chunk
1333  __kmp_acquire_lock(lck, gtid);
1334  pr->u.p.count = init + 1;
1335  pr->u.p.ub = limit;
1336  __kmp_release_lock(lck, gtid);
1337  // activate non-empty buffer and let others steal from us
1338  if (init + 1 < limit)
1339  KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1340  } // while (search for victim)
1341  } // if (try to find victim and steal)
1342  } else {
1343  // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1344  // as all operations on pair (count, ub) must be done atomically
1345  typedef union {
1346  struct {
1347  UT count;
1348  T ub;
1349  } p;
1350  kmp_int64 b;
1351  } union_i4;
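  // Packing (count, ub) into one 64-bit word lets a single CAS atomically
  // claim the next chunk from the head (count++) while thieves trim ub at
  // the tail, so no per-buffer lock is needed for 4-byte induction variables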
1352  union_i4 vold, vnew;
1353  if (pr->u.p.count < (UT)pr->u.p.ub) {
1354  KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1355  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1356  vnew.b = vold.b;
1357  vnew.p.count++; // get chunk from head of self range
1358  while (!KMP_COMPARE_AND_STORE_REL64(
1359  (volatile kmp_int64 *)&pr->u.p.count,
1360  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1361  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1362  KMP_CPU_PAUSE();
1363  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1364  vnew.b = vold.b;
1365  vnew.p.count++;
1366  }
1367  init = vold.p.count;
1368  status = (init < (UT)vold.p.ub);
1369  } else {
1370  status = 0; // no own chunks
1371  }
1372  if (!status) { // try to steal
1373  T while_limit = pr->u.p.parm3;
1374  T while_index = 0;
1375  int idx = (th->th.th_dispatch->th_disp_index - 1) %
1376  __kmp_dispatch_num_buffers; // current loop index
1377  // note: victim thread can potentially execute another loop
1378  KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1379  while ((!status) && (while_limit != ++while_index)) {
1380  dispatch_private_info_template<T> *v;
1381  T remaining;
1382  T victimId = pr->u.p.parm4;
1383  T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1384  v = reinterpret_cast<dispatch_private_info_template<T> *>(
1385  &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1386  KMP_DEBUG_ASSERT(v);
1387  while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1388  oldVictimId != victimId) {
1389  victimId = (victimId + 1) % nproc;
1390  v = reinterpret_cast<dispatch_private_info_template<T> *>(
1391  &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1392  KMP_DEBUG_ASSERT(v);
1393  }
1394  if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1395  continue; // try once more (nproc attempts in total)
1396  }
1397  if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1398  kmp_uint32 old = UNUSED;
1399  // try to steal whole range from inactive victim
1400  status = v->steal_flag.compare_exchange_strong(old, THIEF);
1401  if (status) {
1402  // initialize self buffer with victim's whole range of chunks
1403  T id = victimId;
1404  T small_chunk, extras;
1405  small_chunk = nchunks / nproc; // chunks per thread
1406  extras = nchunks % nproc;
1407  init = id * small_chunk + (id < extras ? id : extras);
1408  vnew.p.count = init + 1;
1409  vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1410  // write pair (count, ub) at once atomically
1411 #if KMP_ARCH_X86
1412  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1413 #else
1414  *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1415 #endif
1416  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1417  // no need to initialize other thread invariants: lb, st, etc.
1418 #ifdef KMP_DEBUG
1419  {
1420  char *buff;
1421  // create format specifiers before the debug output
1422  buff = __kmp_str_format(
1423  "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1424  "count:%%%s ub:%%%s\n",
1425  traits_t<UT>::spec, traits_t<T>::spec);
1426  KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1427  __kmp_str_free(&buff);
1428  }
1429 #endif
1430  // activate non-empty buffer and let others steal from us
1431  if (pr->u.p.count < (UT)pr->u.p.ub)
1432  KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1433  break;
1434  }
1435  }
1436  while (1) { // CAS loop with check if victim still has enough chunks
1437  // many threads may be stealing concurrently from same victim
1438  vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1439  if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1440  vold.p.count >= (UT)vold.p.ub) {
1441  pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1442  break; // no chunks to steal, try next victim
1443  }
1444  vnew.b = vold.b;
1445  remaining = vold.p.ub - vold.p.count;
1446  // try to steal 1/4 of remaining
1447  // TODO: is this heuristic good enough??
1448  if (remaining > 7) {
1449  vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1450  } else {
1451  vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1452  }
1453  KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1454  if (KMP_COMPARE_AND_STORE_REL64(
1455  (volatile kmp_int64 *)&v->u.p.count,
1456  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1457  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1458  // stealing succeeded
1459 #ifdef KMP_DEBUG
1460  {
1461  char *buff;
1462  // create format specifiers before the debug output
1463  buff = __kmp_str_format(
1464  "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1465  "count:%%%s ub:%%%s\n",
1466  traits_t<T>::spec, traits_t<T>::spec);
1467  KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1468  __kmp_str_free(&buff);
1469  }
1470 #endif
1471  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1472  vold.p.ub - vnew.p.ub);
1473  status = 1;
1474  pr->u.p.parm4 = victimId; // keep victim id
1475  // now update own count and ub
1476  init = vnew.p.ub;
1477  vold.p.count = init + 1;
1478 #if KMP_ARCH_X86
1479  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1480 #else
1481  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1482 #endif
1483  // activate non-empty buffer and let others steal from us
1484  if (vold.p.count < (UT)vold.p.ub)
1485  KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1486  break;
1487  } // if (check CAS result)
1488  KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1489  } // while (try to steal from particular victim)
1490  } // while (search for victim)
1491  } // if (try to find victim and steal)
1492  } // if (4-byte induction variable)
1493  if (!status) {
1494  *p_lb = 0;
1495  *p_ub = 0;
1496  if (p_st != NULL)
1497  *p_st = 0;
1498  } else {
1499  start = pr->u.p.lb;
1500  init *= chunk;
1501  limit = chunk + init - 1;
1502  incr = pr->u.p.st;
1503  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1504 
1505  KMP_DEBUG_ASSERT(init <= trip);
1506  // keep track of done chunks for possible early exit from stealing
1507  // TODO: count executed chunks locally with rare update of shared location
1508  // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1509  if ((last = (limit >= trip)) != 0)
1510  limit = trip;
1511  if (p_st != NULL)
1512  *p_st = incr;
1513 
1514  if (incr == 1) {
1515  *p_lb = start + init;
1516  *p_ub = start + limit;
1517  } else {
1518  *p_lb = start + init * incr;
1519  *p_ub = start + limit * incr;
1520  }
1521  } // if
1522  break;
1523  } // case
1524 #endif // KMP_STATIC_STEAL_ENABLED
1525  case kmp_sch_static_balanced: {
1526  KD_TRACE(
1527  10,
1528  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1529  gtid));
1530  /* check if thread has any iteration to do */
1531  if ((status = !pr->u.p.count) != 0) {
1532  pr->u.p.count = 1;
1533  *p_lb = pr->u.p.lb;
1534  *p_ub = pr->u.p.ub;
1535  last = (pr->u.p.parm1 != 0);
1536  if (p_st != NULL)
1537  *p_st = pr->u.p.st;
1538  } else { /* no iterations to do */
1539  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1540  }
1541  } // case
1542  break;
1543  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1544  merged here */
1545  case kmp_sch_static_chunked: {
1546  T parm1;
1547 
1548  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1549  "kmp_sch_static_[affinity|chunked] case\n",
1550  gtid));
1551  parm1 = pr->u.p.parm1;
1552 
1553  trip = pr->u.p.tc - 1;
1554  init = parm1 * (pr->u.p.count + tid);
1555 
1556  if ((status = (init <= trip)) != 0) {
1557  start = pr->u.p.lb;
1558  incr = pr->u.p.st;
1559  limit = parm1 + init - 1;
1560 
1561  if ((last = (limit >= trip)) != 0)
1562  limit = trip;
1563 
1564  if (p_st != NULL)
1565  *p_st = incr;
1566 
1567  pr->u.p.count += nproc;
1568 
1569  if (incr == 1) {
1570  *p_lb = start + init;
1571  *p_ub = start + limit;
1572  } else {
1573  *p_lb = start + init * incr;
1574  *p_ub = start + limit * incr;
1575  }
1576 
1577  if (pr->flags.ordered) {
1578  pr->u.p.ordered_lower = init;
1579  pr->u.p.ordered_upper = limit;
1580  } // if
1581  } // if
1582  } // case
1583  break;
1584 
1585  case kmp_sch_dynamic_chunked: {
1586  UT chunk_number;
1587  UT chunk_size = pr->u.p.parm1;
1588  UT nchunks = pr->u.p.parm2;
1589 
1590  KD_TRACE(
1591  100,
1592  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1593  gtid));
1594 
1595  chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1596  status = (chunk_number < nchunks);
1597  if (!status) {
1598  *p_lb = 0;
1599  *p_ub = 0;
1600  if (p_st != NULL)
1601  *p_st = 0;
1602  } else {
1603  init = chunk_size * chunk_number;
1604  trip = pr->u.p.tc - 1;
1605  start = pr->u.p.lb;
1606  incr = pr->u.p.st;
1607 
1608  if ((last = (trip - init < (UT)chunk_size)))
1609  limit = trip;
1610  else
1611  limit = chunk_size + init - 1;
1612 
1613  if (p_st != NULL)
1614  *p_st = incr;
1615 
1616  if (incr == 1) {
1617  *p_lb = start + init;
1618  *p_ub = start + limit;
1619  } else {
1620  *p_lb = start + init * incr;
1621  *p_ub = start + limit * incr;
1622  }
1623 
1624  if (pr->flags.ordered) {
1625  pr->u.p.ordered_lower = init;
1626  pr->u.p.ordered_upper = limit;
1627  } // if
1628  } // if
1629  } // case
1630  break;
1631 
1632  case kmp_sch_guided_iterative_chunked: {
1633  T chunkspec = pr->u.p.parm1;
1634  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1635  "iterative case\n",
1636  gtid));
1637  trip = pr->u.p.tc;
1638  // Start atomic part of calculations
1639  while (1) {
1640  ST remaining; // signed, because can be < 0
1641  init = sh->u.s.iteration; // shared value
1642  remaining = trip - init;
1643  if (remaining <= 0) { // AC: need to compare with 0 first
1644  // nothing to do, don't try atomic op
1645  status = 0;
1646  break;
1647  }
1648  if ((T)remaining <
1649  pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1650  // use dynamic-style schedule
1651  // atomically increment iterations, get old value
1652  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1653  (ST)chunkspec);
1654  remaining = trip - init;
1655  if (remaining <= 0) {
1656  status = 0; // all iterations were taken by other threads
1657  } else {
1658  // got some iterations to work on
1659  status = 1;
1660  if ((T)remaining > chunkspec) {
1661  limit = init + chunkspec - 1;
1662  } else {
1663  last = true; // the last chunk
1664  limit = init + remaining - 1;
1665  } // if
1666  } // if
1667  break;
1668  } // if
1669  limit = init + (UT)((double)remaining *
1670  *(double *)&pr->u.p.parm3); // divide by K*nproc
1671  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1672  (ST)init, (ST)limit)) {
1673  // CAS was successful, chunk obtained
1674  status = 1;
1675  --limit;
1676  break;
1677  } // if
1678  } // while
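  // Summary of the loop above: while plenty of work remains, each successful
  // CAS claims roughly remaining/(K*nproc) iterations (parm3 stores the
  // precomputed factor, per the "divide by K*nproc" comment); once remaining
  // drops below the parm2 threshold, the schedule falls back to plain dynamic
  // chunks of size chunkspec. Illustrative numbers: with about 1000
  // iterations remaining, 4 threads and K=2, one CAS claims roughly
  // 1000/8 ~ 125 iterations.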
1679  if (status != 0) {
1680  start = pr->u.p.lb;
1681  incr = pr->u.p.st;
1682  if (p_st != NULL)
1683  *p_st = incr;
1684  *p_lb = start + init * incr;
1685  *p_ub = start + limit * incr;
1686  if (pr->flags.ordered) {
1687  pr->u.p.ordered_lower = init;
1688  pr->u.p.ordered_upper = limit;
1689  } // if
1690  } else {
1691  *p_lb = 0;
1692  *p_ub = 0;
1693  if (p_st != NULL)
1694  *p_st = 0;
1695  } // if
1696  } // case
1697  break;
1698 
1699  case kmp_sch_guided_simd: {
1700  // same as the iterative variant, but the current chunk is adjusted to
1701  // be a multiple of the given chunk
1702  T chunk = pr->u.p.parm1;
1703  KD_TRACE(100,
1704  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1705  gtid));
1706  trip = pr->u.p.tc;
1707  // Start atomic part of calculations
1708  while (1) {
1709  ST remaining; // signed, because can be < 0
1710  init = sh->u.s.iteration; // shared value
1711  remaining = trip - init;
1712  if (remaining <= 0) { // AC: need to compare with 0 first
1713  status = 0; // nothing to do, don't try atomic op
1714  break;
1715  }
1716  KMP_DEBUG_ASSERT(init % chunk == 0);
1717  // compare with K*nproc*(chunk+1), K=2 by default
1718  if ((T)remaining < pr->u.p.parm2) {
1719  // use dynamic-style schedule
1720  // atomically increment iterations, get old value
1721  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1722  (ST)chunk);
1723  remaining = trip - init;
1724  if (remaining <= 0) {
1725  status = 0; // all iterations were taken by other threads
1726  } else {
1727  // got some iterations to work on
1728  status = 1;
1729  if ((T)remaining > chunk) {
1730  limit = init + chunk - 1;
1731  } else {
1732  last = true; // the last chunk
1733  limit = init + remaining - 1;
1734  } // if
1735  } // if
1736  break;
1737  } // if
1738  // divide by K*nproc
1739  UT span;
1740  __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1741  &span);
1742  UT rem = span % chunk;
1743  if (rem) // adjust so that span%chunk == 0
1744  span += chunk - rem;
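  // Keep each claimed span a whole number of SIMD chunks. Illustrative
  // example: a computed span of 13 with chunk == 8 has rem == 5, so span is
  // rounded up to 16 before the CAS below.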
1745  limit = init + span;
1746  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1747  (ST)init, (ST)limit)) {
1748  // CAS was successful, chunk obtained
1749  status = 1;
1750  --limit;
1751  break;
1752  } // if
1753  } // while
1754  if (status != 0) {
1755  start = pr->u.p.lb;
1756  incr = pr->u.p.st;
1757  if (p_st != NULL)
1758  *p_st = incr;
1759  *p_lb = start + init * incr;
1760  *p_ub = start + limit * incr;
1761  if (pr->flags.ordered) {
1762  pr->u.p.ordered_lower = init;
1763  pr->u.p.ordered_upper = limit;
1764  } // if
1765  } else {
1766  *p_lb = 0;
1767  *p_ub = 0;
1768  if (p_st != NULL)
1769  *p_st = 0;
1770  } // if
1771  } // case
1772  break;
1773 
1774  case kmp_sch_guided_analytical_chunked: {
1775  T chunkspec = pr->u.p.parm1;
1776  UT chunkIdx;
1777 #if KMP_USE_X87CONTROL
1778  /* for storing the original FPCW value for Windows* OS on the
1779  IA-32 architecture (8-byte version) */
1780  unsigned int oldFpcw;
1781  unsigned int fpcwSet = 0;
1782 #endif
1783  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1784  "kmp_sch_guided_analytical_chunked case\n",
1785  gtid));
1786 
1787  trip = pr->u.p.tc;
1788 
1789  KMP_DEBUG_ASSERT(nproc > 1);
1790  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1791 
1792  while (1) { /* this while loop is a safeguard against unexpected zero
1793  chunk sizes */
1794  chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1795  if (chunkIdx >= (UT)pr->u.p.parm2) {
1796  --trip;
1797  /* use dynamic-style scheduling */
1798  init = chunkIdx * chunkspec + pr->u.p.count;
1799  /* need to verify init > 0 in case of overflow in the above
1800  * calculation */
1801  if ((status = (init > 0 && init <= trip)) != 0) {
1802  limit = init + chunkspec - 1;
1803 
1804  if ((last = (limit >= trip)) != 0)
1805  limit = trip;
1806  }
1807  break;
1808  } else {
1809 /* use exponential-style scheduling */
1810 /* The following check is to work around the lack of long double precision on
1811  Windows* OS.
1812  This check works around the possible effect that init != 0 for chunkIdx == 0.
1813  */
1814 #if KMP_USE_X87CONTROL
1815  /* If we haven't already done so, save original
1816  FPCW and set precision to 64-bit, as Windows* OS
1817  on IA-32 architecture defaults to 53-bit */
1818  if (!fpcwSet) {
1819  oldFpcw = _control87(0, 0);
1820  _control87(_PC_64, _MCW_PC);
1821  fpcwSet = 0x30000;
1822  }
1823 #endif
1824  if (chunkIdx) {
1825  init = __kmp_dispatch_guided_remaining<T>(
1826  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1827  KMP_DEBUG_ASSERT(init);
1828  init = trip - init;
1829  } else
1830  init = 0;
1831  limit = trip - __kmp_dispatch_guided_remaining<T>(
1832  trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
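  // The analytic model parameterized by parm3 predicts how many iterations
  // should remain after a given number of chunks have been handed out; the
  // difference between its predictions for chunkIdx and chunkIdx + 1 is this
  // thread's chunk, [init, limit) here, made inclusive by the --limit below.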
1833  KMP_ASSERT(init <= limit);
1834  if (init < limit) {
1835  KMP_DEBUG_ASSERT(limit <= trip);
1836  --limit;
1837  status = 1;
1838  break;
1839  } // if
1840  } // if
1841  } // while (1)
1842 #if KMP_USE_X87CONTROL
1843  /* restore FPCW if necessary
1844  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1845  */
1846  if (fpcwSet && (oldFpcw & fpcwSet))
1847  _control87(oldFpcw, _MCW_PC);
1848 #endif
1849  if (status != 0) {
1850  start = pr->u.p.lb;
1851  incr = pr->u.p.st;
1852  if (p_st != NULL)
1853  *p_st = incr;
1854  *p_lb = start + init * incr;
1855  *p_ub = start + limit * incr;
1856  if (pr->flags.ordered) {
1857  pr->u.p.ordered_lower = init;
1858  pr->u.p.ordered_upper = limit;
1859  }
1860  } else {
1861  *p_lb = 0;
1862  *p_ub = 0;
1863  if (p_st != NULL)
1864  *p_st = 0;
1865  }
1866  } // case
1867  break;
1868 
1869  case kmp_sch_trapezoidal: {
1870  UT index;
1871  T parm2 = pr->u.p.parm2;
1872  T parm3 = pr->u.p.parm3;
1873  T parm4 = pr->u.p.parm4;
1874  KD_TRACE(100,
1875  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1876  gtid));
1877 
1878  index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1879 
1880  init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1881  trip = pr->u.p.tc - 1;
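  // init is the total size of the first `index` chunks, where chunk sizes
  // start at parm2 and shrink by parm4 per chunk (parm3 chunks in total), so
  // by the arithmetic-series sum it equals index*(2*parm2 - (index-1)*parm4)/2;
  // limit below is the same sum for index + 1, minus one. Illustrative
  // example: parm2=10, parm4=2 gives chunk sizes 10, 8, 6, ... and chunk 2
  // spans iterations 18..23.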
1882 
1883  if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1884  *p_lb = 0;
1885  *p_ub = 0;
1886  if (p_st != NULL)
1887  *p_st = 0;
1888  } else {
1889  start = pr->u.p.lb;
1890  limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1891  incr = pr->u.p.st;
1892 
1893  if ((last = (limit >= trip)) != 0)
1894  limit = trip;
1895 
1896  if (p_st != NULL)
1897  *p_st = incr;
1898 
1899  if (incr == 1) {
1900  *p_lb = start + init;
1901  *p_ub = start + limit;
1902  } else {
1903  *p_lb = start + init * incr;
1904  *p_ub = start + limit * incr;
1905  }
1906 
1907  if (pr->flags.ordered) {
1908  pr->u.p.ordered_lower = init;
1909  pr->u.p.ordered_upper = limit;
1910  } // if
1911  } // if
1912  } // case
1913  break;
1914  default: {
1915  status = 0; // to avoid complaints on uninitialized variable use
1916  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1917  KMP_HNT(GetNewerLibrary), // Hint
1918  __kmp_msg_null // Variadic argument list terminator
1919  );
1920  } break;
1921  } // switch
1922  if (p_last)
1923  *p_last = last;
1924 #ifdef KMP_DEBUG
1925  if (pr->flags.ordered) {
1926  char *buff;
1927  // create format specifiers before the debug output
1928  buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1929  "ordered_lower:%%%s ordered_upper:%%%s\n",
1930  traits_t<UT>::spec, traits_t<UT>::spec);
1931  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1932  __kmp_str_free(&buff);
1933  }
1934  {
1935  char *buff;
1936  // create format specifiers before the debug output
1937  buff = __kmp_str_format(
1938  "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1939  "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1940  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1941  KMP_DEBUG_ASSERT(p_last);
1942  KMP_DEBUG_ASSERT(p_st);
1943  KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1944  __kmp_str_free(&buff);
1945  }
1946 #endif
1947  return status;
1948 }
1949 
1950 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1951  work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1952  is not called. */
1953 #if OMPT_SUPPORT && OMPT_OPTIONAL
1954 #define OMPT_LOOP_END \
1955  if (status == 0) { \
1956  if (ompt_enabled.ompt_callback_work) { \
1957  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1958  ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1959  ompt_callbacks.ompt_callback(ompt_callback_work)( \
1960  ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1961  &(task_info->task_data), 0, codeptr); \
1962  } \
1963  }
1964 // TODO: implement count
1965 #else
1966 #define OMPT_LOOP_END // no-op
1967 #endif
1968 
1969 #if KMP_STATS_ENABLED
1970 #define KMP_STATS_LOOP_END \
1971  { \
1972  kmp_int64 u, l, t, i; \
1973  l = (kmp_int64)(*p_lb); \
1974  u = (kmp_int64)(*p_ub); \
1975  i = (kmp_int64)(pr->u.p.st); \
1976  if (status == 0) { \
1977  t = 0; \
1978  KMP_POP_PARTITIONED_TIMER(); \
1979  } else if (i == 1) { \
1980  if (u >= l) \
1981  t = u - l + 1; \
1982  else \
1983  t = 0; \
1984  } else if (i < 0) { \
1985  if (l >= u) \
1986  t = (l - u) / (-i) + 1; \
1987  else \
1988  t = 0; \
1989  } else { \
1990  if (u >= l) \
1991  t = (u - l) / i + 1; \
1992  else \
1993  t = 0; \
1994  } \
1995  KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1996  }
1997 #else
1998 #define KMP_STATS_LOOP_END /* Nothing */
1999 #endif
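// The KMP_STATS_LOOP_END macro above (when KMP_STATS_ENABLED) counts the
// iterations in the chunk just handed out from its inclusive bounds and
// stride: t = (u - l)/i + 1 for positive strides, with the mirrored form for
// negative ones. Illustrative example: l=5, u=1, i=-2 gives
// t = (5 - 1)/2 + 1 = 3 iterations (5, 3, 1).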
2000 
2001 template <typename T>
2002 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2003  T *p_lb, T *p_ub,
2004  typename traits_t<T>::signed_t *p_st
2005 #if OMPT_SUPPORT && OMPT_OPTIONAL
2006  ,
2007  void *codeptr
2008 #endif
2009 ) {
2010 
2011  typedef typename traits_t<T>::unsigned_t UT;
2012  typedef typename traits_t<T>::signed_t ST;
2013  // This is potentially slightly misleading: schedule(runtime) will appear here
2014  // even if the actual runtime schedule is static. (Which points out a
2015  // disadvantage of schedule(runtime): even when static scheduling is used, it
2016  // costs more than a compile-time choice to use static scheduling would.)
2017  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2018 
2019  int status;
2020  dispatch_private_info_template<T> *pr;
2021  __kmp_assert_valid_gtid(gtid);
2022  kmp_info_t *th = __kmp_threads[gtid];
2023  kmp_team_t *team = th->th.th_team;
2024 
2025  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2026  KD_TRACE(
2027  1000,
2028  ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2029  gtid, p_lb, p_ub, p_st, p_last));
2030 
2031  if (team->t.t_serialized) {
2032  /* NOTE: serialize this dispatch because we are not at the active level */
2033  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2034  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2035  KMP_DEBUG_ASSERT(pr);
2036 
2037  if ((status = (pr->u.p.tc != 0)) == 0) {
2038  *p_lb = 0;
2039  *p_ub = 0;
2040  // if ( p_last != NULL )
2041  // *p_last = 0;
2042  if (p_st != NULL)
2043  *p_st = 0;
2044  if (__kmp_env_consistency_check) {
2045  if (pr->pushed_ws != ct_none) {
2046  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2047  }
2048  }
2049  } else if (pr->flags.nomerge) {
2050  kmp_int32 last;
2051  T start;
2052  UT limit, trip, init;
2053  ST incr;
2054  T chunk = pr->u.p.parm1;
2055 
2056  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2057  gtid));
2058 
2059  init = chunk * pr->u.p.count++;
2060  trip = pr->u.p.tc - 1;
2061 
2062  if ((status = (init <= trip)) == 0) {
2063  *p_lb = 0;
2064  *p_ub = 0;
2065  // if ( p_last != NULL )
2066  // *p_last = 0;
2067  if (p_st != NULL)
2068  *p_st = 0;
2069  if (__kmp_env_consistency_check) {
2070  if (pr->pushed_ws != ct_none) {
2071  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2072  }
2073  }
2074  } else {
2075  start = pr->u.p.lb;
2076  limit = chunk + init - 1;
2077  incr = pr->u.p.st;
2078 
2079  if ((last = (limit >= trip)) != 0) {
2080  limit = trip;
2081 #if KMP_OS_WINDOWS
2082  pr->u.p.last_upper = pr->u.p.ub;
2083 #endif /* KMP_OS_WINDOWS */
2084  }
2085  if (p_last != NULL)
2086  *p_last = last;
2087  if (p_st != NULL)
2088  *p_st = incr;
2089  if (incr == 1) {
2090  *p_lb = start + init;
2091  *p_ub = start + limit;
2092  } else {
2093  *p_lb = start + init * incr;
2094  *p_ub = start + limit * incr;
2095  }
2096 
2097  if (pr->flags.ordered) {
2098  pr->u.p.ordered_lower = init;
2099  pr->u.p.ordered_upper = limit;
2100 #ifdef KMP_DEBUG
2101  {
2102  char *buff;
2103  // create format specifiers before the debug output
2104  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2105  "ordered_lower:%%%s ordered_upper:%%%s\n",
2106  traits_t<UT>::spec, traits_t<UT>::spec);
2107  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2108  pr->u.p.ordered_upper));
2109  __kmp_str_free(&buff);
2110  }
2111 #endif
2112  } // if
2113  } // if
2114  } else {
2115  pr->u.p.tc = 0;
2116  *p_lb = pr->u.p.lb;
2117  *p_ub = pr->u.p.ub;
2118 #if KMP_OS_WINDOWS
2119  pr->u.p.last_upper = *p_ub;
2120 #endif /* KMP_OS_WINDOWS */
2121  if (p_last != NULL)
2122  *p_last = TRUE;
2123  if (p_st != NULL)
2124  *p_st = pr->u.p.st;
2125  } // if
2126 #ifdef KMP_DEBUG
2127  {
2128  char *buff;
2129  // create format specifiers before the debug output
2130  buff = __kmp_str_format(
2131  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2132  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2133  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2134  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2135  (p_last ? *p_last : 0), status));
2136  __kmp_str_free(&buff);
2137  }
2138 #endif
2139 #if INCLUDE_SSC_MARKS
2140  SSC_MARK_DISPATCH_NEXT();
2141 #endif
2142  OMPT_LOOP_END;
2143  KMP_STATS_LOOP_END;
2144  return status;
2145  } else {
2146  kmp_int32 last = 0;
2147  dispatch_shared_info_template<T> volatile *sh;
2148 
2149  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2150  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2151 
2152  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2153  th->th.th_dispatch->th_dispatch_pr_current);
2154  KMP_DEBUG_ASSERT(pr);
2155  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2156  th->th.th_dispatch->th_dispatch_sh_current);
2157  KMP_DEBUG_ASSERT(sh);
2158 
2159 #if KMP_USE_HIER_SCHED
2160  if (pr->flags.use_hier)
2161  status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2162  else
2163 #endif // KMP_USE_HIER_SCHED
2164  status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2165  p_st, th->th.th_team_nproc,
2166  th->th.th_info.ds.ds_tid);
2167  // status == 0: no more iterations to execute
2168  if (status == 0) {
2169  ST num_done;
2170  num_done = test_then_inc<ST>(&sh->u.s.num_done);
2171 #ifdef KMP_DEBUG
2172  {
2173  char *buff;
2174  // create format specifiers before the debug output
2175  buff = __kmp_str_format(
2176  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2177  traits_t<ST>::spec);
2178  KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2179  __kmp_str_free(&buff);
2180  }
2181 #endif
2182 
2183 #if KMP_USE_HIER_SCHED
2184  pr->flags.use_hier = FALSE;
2185 #endif
2186  if (num_done == th->th.th_team_nproc - 1) {
2187 #if KMP_STATIC_STEAL_ENABLED
2188  if (pr->schedule == kmp_sch_static_steal) {
2189  int i;
2190  int idx = (th->th.th_dispatch->th_disp_index - 1) %
2191  __kmp_dispatch_num_buffers; // current loop index
2192  // loop complete, safe to destroy locks used for stealing
2193  for (i = 0; i < th->th.th_team_nproc; ++i) {
2194  dispatch_private_info_template<T> *buf =
2195  reinterpret_cast<dispatch_private_info_template<T> *>(
2196  &team->t.t_dispatch[i].th_disp_buffer[idx]);
2197  KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2198  KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2199  if (traits_t<T>::type_size > 4) {
2200  // destroy locks used for stealing
2201  kmp_lock_t *lck = buf->u.p.steal_lock;
2202  KMP_ASSERT(lck != NULL);
2203  __kmp_destroy_lock(lck);
2204  __kmp_free(lck);
2205  buf->u.p.steal_lock = NULL;
2206  }
2207  }
2208  }
2209 #endif
2210  /* NOTE: release shared buffer to be reused */
2211 
2212  KMP_MB(); /* Flush all pending memory write invalidates. */
2213 
2214  sh->u.s.num_done = 0;
2215  sh->u.s.iteration = 0;
2216 
2217  /* TODO replace with general release procedure? */
2218  if (pr->flags.ordered) {
2219  sh->u.s.ordered_iteration = 0;
2220  }
2221 
2222  sh->buffer_index += __kmp_dispatch_num_buffers;
2223  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2224  gtid, sh->buffer_index));
2225 
2226  KMP_MB(); /* Flush all pending memory write invalidates. */
2227 
2228  } // if
2229  if (__kmp_env_consistency_check) {
2230  if (pr->pushed_ws != ct_none) {
2231  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2232  }
2233  }
2234 
2235  th->th.th_dispatch->th_deo_fcn = NULL;
2236  th->th.th_dispatch->th_dxo_fcn = NULL;
2237  th->th.th_dispatch->th_dispatch_sh_current = NULL;
2238  th->th.th_dispatch->th_dispatch_pr_current = NULL;
2239  } // if (status == 0)
2240 #if KMP_OS_WINDOWS
2241  else if (last) {
2242  pr->u.p.last_upper = pr->u.p.ub;
2243  }
2244 #endif /* KMP_OS_WINDOWS */
2245  if (p_last != NULL && status != 0)
2246  *p_last = last;
2247  } // if
2248 
2249 #ifdef KMP_DEBUG
2250  {
2251  char *buff;
2252  // create format specifiers before the debug output
2253  buff = __kmp_str_format(
2254  "__kmp_dispatch_next: T#%%d normal case: "
2255  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2256  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2257  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2258  (p_last ? *p_last : 0), status));
2259  __kmp_str_free(&buff);
2260  }
2261 #endif
2262 #if INCLUDE_SSC_MARKS
2263  SSC_MARK_DISPATCH_NEXT();
2264 #endif
2265  OMPT_LOOP_END;
2266  KMP_STATS_LOOP_END;
2267  return status;
2268 }
2269 
2270 template <typename T>
2271 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2272  kmp_int32 *plastiter, T *plower, T *pupper,
2273  typename traits_t<T>::signed_t incr) {
2274  typedef typename traits_t<T>::unsigned_t UT;
2275  kmp_uint32 team_id;
2276  kmp_uint32 nteams;
2277  UT trip_count;
2278  kmp_team_t *team;
2279  kmp_info_t *th;
2280 
2281  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2282  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2283 #ifdef KMP_DEBUG
2284  typedef typename traits_t<T>::signed_t ST;
2285  {
2286  char *buff;
2287  // create format specifiers before the debug output
2288  buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2289  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2290  traits_t<T>::spec, traits_t<T>::spec,
2291  traits_t<ST>::spec, traits_t<T>::spec);
2292  KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2293  __kmp_str_free(&buff);
2294  }
2295 #endif
2296 
2297  if (__kmp_env_consistency_check) {
2298  if (incr == 0) {
2299  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2300  loc);
2301  }
2302  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2303  // The loop is illegal.
2304  // Some zero-trip loops are maintained by the compiler, e.g.:
2305  // for(i=10;i<0;++i) // lower >= upper - run-time check
2306  // for(i=0;i>10;--i) // lower <= upper - run-time check
2307  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2308  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2309  // Compiler does not check the following illegal loops:
2310  // for(i=0;i<10;i+=incr) // where incr<0
2311  // for(i=10;i>0;i-=incr) // where incr<0
2312  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2313  }
2314  }
2315  __kmp_assert_valid_gtid(gtid);
2316  th = __kmp_threads[gtid];
2317  team = th->th.th_team;
2318  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2319  nteams = th->th.th_teams_size.nteams;
2320  team_id = team->t.t_master_tid;
2321  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2322 
2323  // compute global trip count
2324  if (incr == 1) {
2325  trip_count = *pupper - *plower + 1;
2326  } else if (incr == -1) {
2327  trip_count = *plower - *pupper + 1;
2328  } else if (incr > 0) {
2329  // upper-lower can exceed the limit of signed type
2330  trip_count = (UT)(*pupper - *plower) / incr + 1;
2331  } else {
2332  trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2333  }
2334 
2335  if (trip_count <= nteams) {
2336  KMP_DEBUG_ASSERT(
2337  __kmp_static == kmp_sch_static_greedy ||
2338  __kmp_static ==
2339  kmp_sch_static_balanced); // Unknown static scheduling type.
2340  // only some teams get a single iteration; others get nothing
2341  if (team_id < trip_count) {
2342  *pupper = *plower = *plower + team_id * incr;
2343  } else {
2344  *plower = *pupper + incr; // zero-trip loop
2345  }
2346  if (plastiter != NULL)
2347  *plastiter = (team_id == trip_count - 1);
2348  } else {
2349  if (__kmp_static == kmp_sch_static_balanced) {
2350  UT chunk = trip_count / nteams;
2351  UT extras = trip_count % nteams;
2352  *plower +=
2353  incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2354  *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2355  if (plastiter != NULL)
2356  *plastiter = (team_id == nteams - 1);
2357  } else {
2358  T chunk_inc_count =
2359  (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2360  T upper = *pupper;
2361  KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2362  // Unknown static scheduling type.
2363  *plower += team_id * chunk_inc_count;
2364  *pupper = *plower + chunk_inc_count - incr;
2365  // Check/correct bounds if needed
2366  if (incr > 0) {
2367  if (*pupper < *plower)
2368  *pupper = traits_t<T>::max_value;
2369  if (plastiter != NULL)
2370  *plastiter = *plower <= upper && *pupper > upper - incr;
2371  if (*pupper > upper)
2372  *pupper = upper; // tracker C73258
2373  } else {
2374  if (*pupper > *plower)
2375  *pupper = traits_t<T>::min_value;
2376  if (plastiter != NULL)
2377  *plastiter = *plower >= upper && *pupper < upper - incr;
2378  if (*pupper < upper)
2379  *pupper = upper; // tracker C73258
2380  }
2381  }
2382  }
2383 }
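// Illustrative example of the distribution above (assuming
// __kmp_static == kmp_sch_static_balanced): 10 iterations (0..9, incr 1)
// across nteams = 4 give chunk = 2 and extras = 2, so teams 0..3 receive
// 0..2, 3..5, 6..7 and 8..9 respectively, and only team 3 sets *plastiter.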
2384 
2385 //-----------------------------------------------------------------------------
2386 // Dispatch routines
2387 // Transfer call to template< type T >
2388 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2389 // T lb, T ub, ST st, ST chunk )
2390 extern "C" {
2391 
2408 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2409  enum sched_type schedule, kmp_int32 lb,
2410  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2411  KMP_DEBUG_ASSERT(__kmp_init_serial);
2412 #if OMPT_SUPPORT && OMPT_OPTIONAL
2413  OMPT_STORE_RETURN_ADDRESS(gtid);
2414 #endif
2415  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2416 }
2420 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2421  enum sched_type schedule, kmp_uint32 lb,
2422  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2423  KMP_DEBUG_ASSERT(__kmp_init_serial);
2424 #if OMPT_SUPPORT && OMPT_OPTIONAL
2425  OMPT_STORE_RETURN_ADDRESS(gtid);
2426 #endif
2427  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2428 }
2429 
2433 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2434  enum sched_type schedule, kmp_int64 lb,
2435  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2436  KMP_DEBUG_ASSERT(__kmp_init_serial);
2437 #if OMPT_SUPPORT && OMPT_OPTIONAL
2438  OMPT_STORE_RETURN_ADDRESS(gtid);
2439 #endif
2440  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2441 }
2442 
2446 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2447  enum sched_type schedule, kmp_uint64 lb,
2448  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2449  KMP_DEBUG_ASSERT(__kmp_init_serial);
2450 #if OMPT_SUPPORT && OMPT_OPTIONAL
2451  OMPT_STORE_RETURN_ADDRESS(gtid);
2452 #endif
2453  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2454 }
2455 
2465 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2466  enum sched_type schedule, kmp_int32 *p_last,
2467  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2468  kmp_int32 chunk) {
2469  KMP_DEBUG_ASSERT(__kmp_init_serial);
2470 #if OMPT_SUPPORT && OMPT_OPTIONAL
2471  OMPT_STORE_RETURN_ADDRESS(gtid);
2472 #endif
2473  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2474  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2475 }
2476 
2477 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2478  enum sched_type schedule, kmp_int32 *p_last,
2479  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2480  kmp_int32 chunk) {
2481  KMP_DEBUG_ASSERT(__kmp_init_serial);
2482 #if OMPT_SUPPORT && OMPT_OPTIONAL
2483  OMPT_STORE_RETURN_ADDRESS(gtid);
2484 #endif
2485  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2486  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2487 }
2488 
2489 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2490  enum sched_type schedule, kmp_int32 *p_last,
2491  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2492  kmp_int64 chunk) {
2493  KMP_DEBUG_ASSERT(__kmp_init_serial);
2494 #if OMPT_SUPPORT && OMPT_OPTIONAL
2495  OMPT_STORE_RETURN_ADDRESS(gtid);
2496 #endif
2497  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2498  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2499 }
2500 
2501 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2502  enum sched_type schedule, kmp_int32 *p_last,
2503  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2504  kmp_int64 chunk) {
2505  KMP_DEBUG_ASSERT(__kmp_init_serial);
2506 #if OMPT_SUPPORT && OMPT_OPTIONAL
2507  OMPT_STORE_RETURN_ADDRESS(gtid);
2508 #endif
2509  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2510  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2511 }
2512 
2526 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2527  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2528 #if OMPT_SUPPORT && OMPT_OPTIONAL
2529  OMPT_STORE_RETURN_ADDRESS(gtid);
2530 #endif
2531  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2532 #if OMPT_SUPPORT && OMPT_OPTIONAL
2533  ,
2534  OMPT_LOAD_RETURN_ADDRESS(gtid)
2535 #endif
2536  );
2537 }
2538 
2542 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2543  kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2544  kmp_int32 *p_st) {
2545 #if OMPT_SUPPORT && OMPT_OPTIONAL
2546  OMPT_STORE_RETURN_ADDRESS(gtid);
2547 #endif
2548  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2549 #if OMPT_SUPPORT && OMPT_OPTIONAL
2550  ,
2551  OMPT_LOAD_RETURN_ADDRESS(gtid)
2552 #endif
2553  );
2554 }
2555 
2559 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2560  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2561 #if OMPT_SUPPORT && OMPT_OPTIONAL
2562  OMPT_STORE_RETURN_ADDRESS(gtid);
2563 #endif
2564  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2565 #if OMPT_SUPPORT && OMPT_OPTIONAL
2566  ,
2567  OMPT_LOAD_RETURN_ADDRESS(gtid)
2568 #endif
2569  );
2570 }
2571 
2575 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2576  kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2577  kmp_int64 *p_st) {
2578 #if OMPT_SUPPORT && OMPT_OPTIONAL
2579  OMPT_STORE_RETURN_ADDRESS(gtid);
2580 #endif
2581  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2582 #if OMPT_SUPPORT && OMPT_OPTIONAL
2583  ,
2584  OMPT_LOAD_RETURN_ADDRESS(gtid)
2585 #endif
2586  );
2587 }
2588 
2595 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2596  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2597 }
2598 
2602 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2603  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2604 }
2605 
2609 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2610  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2611 }
2612 
2616 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2617  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2618 }
2621 //-----------------------------------------------------------------------------
2622 // Non-template routines from kmp_dispatch.cpp used in other sources
2623 
2624 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2625  return value == checker;
2626 }
2627 
2628 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2629  return value != checker;
2630 }
2631 
2632 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2633  return value < checker;
2634 }
2635 
2636 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2637  return value >= checker;
2638 }
2639 
2640 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2641  return value <= checker;
2642 }
2643 
2644 kmp_uint32
2645 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2646  kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2647  void *obj // Higher-level synchronization object, or NULL.
2648 ) {
2649  // note: we may not belong to a team at this point
2650  volatile kmp_uint32 *spin = spinner;
2651  kmp_uint32 check = checker;
2652  kmp_uint32 spins;
2653  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2654  kmp_uint32 r;
2655 
2656  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2657  KMP_INIT_YIELD(spins);
2658  // main wait spin loop
2659  while (!f(r = TCR_4(*spin), check)) {
2660  KMP_FSYNC_SPIN_PREPARE(obj);
2661  /* GEH - remove this since it was accidentally introduced when kmp_wait was
2662  split. It causes problems with infinite recursion because of exit lock */
2663  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2664  __kmp_abort_thread(); */
2665  KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2666  }
2667  KMP_FSYNC_SPIN_ACQUIRED(obj);
2668  return r;
2669 }
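// Illustrative usage: __kmp_wait_4(&flag, 1, __kmp_eq_4, NULL) spins until
// flag (a kmp_uint32) becomes 1, yielding when the machine is oversubscribed.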
2670 
2671 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2672  kmp_uint32 (*pred)(void *, kmp_uint32),
2673  void *obj // Higher-level synchronization object, or NULL.
2674 ) {
2675  // note: we may not belong to a team at this point
2676  void *spin = spinner;
2677  kmp_uint32 check = checker;
2678  kmp_uint32 spins;
2679  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2680 
2681  KMP_FSYNC_SPIN_INIT(obj, spin);
2682  KMP_INIT_YIELD(spins);
2683  // main wait spin loop
2684  while (!f(spin, check)) {
2685  KMP_FSYNC_SPIN_PREPARE(obj);
2686  /* if we have waited a bit, or are oversubscribed, yield */
2687  /* pause is in the following code */
2688  KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2689  }
2690  KMP_FSYNC_SPIN_ACQUIRED(obj);
2691 }
2692 
2693 } // extern "C"
2694 
2695 #ifdef KMP_GOMP_COMPAT
2696 
2697 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2698  enum sched_type schedule, kmp_int32 lb,
2699  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2700  int push_ws) {
2701  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2702  push_ws);
2703 }
2704 
2705 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2706  enum sched_type schedule, kmp_uint32 lb,
2707  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2708  int push_ws) {
2709  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2710  push_ws);
2711 }
2712 
2713 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2714  enum sched_type schedule, kmp_int64 lb,
2715  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2716  int push_ws) {
2717  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2718  push_ws);
2719 }
2720 
2721 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2722  enum sched_type schedule, kmp_uint64 lb,
2723  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2724  int push_ws) {
2725  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2726  push_ws);
2727 }
2728 
2729 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2730  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2731 }
2732 
2733 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2734  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2735 }
2736 
2737 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2738  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2739 }
2740 
2741 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2742  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2743 }
2744 
2745 #endif /* KMP_GOMP_COMPAT */
2746 
2747 /* ------------------------------------------------------------------------ */