#ifndef KMP_DISPATCH_HIER_H
#define KMP_DISPATCH_HIER_H

#include "kmp_dispatch.h"

enum kmp_hier_layer_e {
  LAYER_THREAD = -1,
  LAYER_L1,
  LAYER_L2,
  LAYER_L3,
  LAYER_NUMA,
  LAYER_LOOP,
  LAYER_LAST
};
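// Return a human-readable name for a hierarchy layer (used in trace output
// and warnings).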
static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) {
  switch (type) {
  case kmp_hier_layer_e::LAYER_THREAD: return "THREAD";
  case kmp_hier_layer_e::LAYER_L1: return "L1";
  case kmp_hier_layer_e::LAYER_L2: return "L2";
  case kmp_hier_layer_e::LAYER_L3: return "L3";
  case kmp_hier_layer_e::LAYER_NUMA: return "NUMA";
  case kmp_hier_layer_e::LAYER_LOOP: return "WHOLE_LOOP";
  case kmp_hier_layer_e::LAYER_LAST: return "LAST";
  }
  KMP_ASSERT(0);
  // Appease compilers, should never get here
  return "ERROR";
}
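// Holds the per-layer schedule, chunk size, and layer type requested for
// hierarchical scheduling (one triple appended per layer, then sorted from
// the innermost to the outermost layer).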
typedef struct kmp_hier_sched_env_t {
  int size;
  int capacity;
  enum sched_type *scheds;
  kmp_int32 *small_chunks;
  kmp_int64 *large_chunks;
  kmp_hier_layer_e *layers;
  // Append a level of the hierarchy
  void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) {
    if (capacity == 0) {
      scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) *
                                                  kmp_hier_layer_e::LAYER_LAST);
      capacity = kmp_hier_layer_e::LAYER_LAST;
    }
    int current_size = size;
    KMP_DEBUG_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST);
    scheds[current_size] = sched;
    layers[current_size] = layer;
    small_chunks[current_size] = chunk;
    large_chunks[current_size] = (kmp_int64)chunk;
    size++;
  }
  // Sort the hierarchy by layer (selection sort; size is at most LAYER_LAST,
  // so an O(n^2) sort is acceptable)
  void sort() {
    if (size <= 1)
      return;
    for (int i = 0; i < size; ++i) {
      int switch_index = i;
      for (int j = i + 1; j < size; ++j) {
        if (layers[j] < layers[switch_index])
          switch_index = j;
      }
      if (switch_index != i) {
        kmp_hier_layer_e temp1 = layers[i];
        enum sched_type temp2 = scheds[i];
        kmp_int32 temp3 = small_chunks[i];
        kmp_int64 temp4 = large_chunks[i];
        layers[i] = layers[switch_index];
        scheds[i] = scheds[switch_index];
        small_chunks[i] = small_chunks[switch_index];
        large_chunks[i] = large_chunks[switch_index];
        layers[switch_index] = temp1;
        scheds[switch_index] = temp2;
        small_chunks[switch_index] = temp3;
        large_chunks[switch_index] = temp4;
      }
    }
  }
  // Free all allocated memory
  void deallocate() {
    if (capacity > 0) {
      __kmp_free(scheds);
      __kmp_free(layers);
      __kmp_free(small_chunks);
      __kmp_free(large_chunks);
      scheds = NULL;
      layers = NULL;
      small_chunks = NULL;
      large_chunks = NULL;
    }
    size = 0;
    capacity = 0;
  }
} kmp_hier_sched_env_t;
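// Hierarchical scheduling globals: the hand-threading mode flag, the schedule
// description collected for the hierarchy, per-layer topology sizes, and
// helpers that map a thread id onto its unit within a given layer.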
extern int __kmp_dispatch_hand_threading;
extern kmp_hier_sched_env_t __kmp_hier_scheds;

extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];

extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type);
extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type);
extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1,
                                        kmp_hier_layer_e t2);
extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team);
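// Shared, double-buffered bounds/status/dispatch data owned by one unit of
// the hierarchy.  The "curr" half holds the iterations currently being
// distributed inside the unit; the "next" half is filled by the unit's
// primary thread before the unit's barrier flips the buffer index.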
template <typename T> struct kmp_hier_shared_bdata_t {
  typedef typename traits_t<T>::signed_t ST;
  volatile kmp_uint64 val[2];
  kmp_int32 status[2];
  T lb[2];
  T ub[2];
  ST st[2];
  dispatch_shared_info_template<T> sh[2];
  void zero() {
    val[0] = val[1] = 0;
    status[0] = status[1] = 0;
    lb[0] = lb[1] = 0;
    ub[0] = ub[1] = 0;
    st[0] = st[1] = 0;
    sh[0].u.s.iteration = sh[1].u.s.iteration = 0;
  }
  void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus,
                            kmp_uint64 index) {
    lb[1 - index] = nlb;
    ub[1 - index] = nub;
    st[1 - index] = nst;
    status[1 - index] = nstatus;
  }
  void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) {
    lb[1 - index] = nlb;
    ub[1 - index] = nub;
    st[1 - index] = nst;
    status[1 - index] = nstatus;
    sh[1 - index].u.s.iteration = 0;
  }

  kmp_int32 get_next_status(kmp_uint64 index) const {
    return status[1 - index];
  }
  T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; }
  T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; }
  ST get_next_st(kmp_uint64 index) const { return st[1 - index]; }
  dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
    return &(sh[1 - index]);
  }

  kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; }
  T get_curr_lb(kmp_uint64 index) const { return lb[index]; }
  T get_curr_ub(kmp_uint64 index) const { return ub[index]; }
  ST get_curr_st(kmp_uint64 index) const { return st[index]; }
  dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
    return &(sh[index]);
  }
};
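/*
 * Two barrier implementations are used inside a hierarchy unit:
 * core_barrier_impl - for 2-8 active threads; each thread writes one byte of
 *   a shared 64-bit word and spins until the word matches the expected
 *   pattern (the wait value alternates between an all-0x01 pattern and zero
 *   from one barrier episode to the next).
 * counter_barrier_impl - for any number of threads; each thread atomically
 *   increments a shared counter and spins until it reaches the expected
 *   count for this episode.
 */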
template <typename T> class core_barrier_impl {
  static inline kmp_uint64 get_wait_val(int num_active) {
    kmp_uint64 wait_val = 0LL;
    switch (num_active) {
    case 2:
      wait_val = 0x0101LL;
      break;
    case 3:
      wait_val = 0x010101LL;
      break;
    case 4:
      wait_val = 0x01010101LL;
      break;
    case 5:
      wait_val = 0x0101010101LL;
      break;
    case 6:
      wait_val = 0x010101010101LL;
      break;
    case 7:
      wait_val = 0x01010101010101LL;
      break;
    case 8:
      wait_val = 0x0101010101010101LL;
      break;
    default:
      // core_barrier_impl is only used for 2-8 active threads
      KMP_ASSERT(0);
    }
    return wait_val;
  }

public:
  static void reset_private(kmp_int32 num_active,
                            kmp_hier_private_bdata_t *tdata);
  static void reset_shared(kmp_int32 num_active,
                           kmp_hier_shared_bdata_t<T> *bdata);
  static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
                      kmp_hier_private_bdata_t *tdata);
};
template <typename T>
void core_barrier_impl<T>::reset_private(kmp_int32 num_active,
                                         kmp_hier_private_bdata_t *tdata) {
  tdata->num_active = num_active;
  tdata->index = 0;
  tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active);
}
template <typename T>
void core_barrier_impl<T>::reset_shared(kmp_int32 num_active,
                                        kmp_hier_shared_bdata_t<T> *bdata) {
  bdata->val[0] = bdata->val[1] = 0LL;
  bdata->status[0] = bdata->status[1] = 0LL;
}
template <typename T>
void core_barrier_impl<T>::barrier(kmp_int32 id,
                                   kmp_hier_shared_bdata_t<T> *bdata,
                                   kmp_hier_private_bdata_t *tdata) {
  kmp_uint64 current_index = tdata->index;
  kmp_uint64 next_index = 1 - current_index;
  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
  kmp_uint64 next_wait_value =
      (current_wait_value ? 0 : get_wait_val(tdata->num_active));
  KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu "
                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
                __kmp_get_gtid(), current_index, next_index, current_wait_value,
                next_wait_value));
  char v = (current_wait_value ? '\1' : '\0');
  (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
                         __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
  tdata->wait_val[current_index] = next_wait_value;
  tdata->index = next_index;
}
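// Counter-based barrier, used when a hierarchy unit has more than 8 active
// threads (the byte-per-thread trick above only covers up to 8).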
template <typename T> class counter_barrier_impl {
public:
  static void reset_private(kmp_int32 num_active,
                            kmp_hier_private_bdata_t *tdata);
  static void reset_shared(kmp_int32 num_active,
                           kmp_hier_shared_bdata_t<T> *bdata);
  static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
                      kmp_hier_private_bdata_t *tdata);
};
template <typename T>
void counter_barrier_impl<T>::reset_private(kmp_int32 num_active,
                                            kmp_hier_private_bdata_t *tdata) {
  tdata->num_active = num_active;
  tdata->index = 0;
  tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active;
}
template <typename T>
void counter_barrier_impl<T>::reset_shared(kmp_int32 num_active,
                                           kmp_hier_shared_bdata_t<T> *bdata) {
  bdata->val[0] = bdata->val[1] = 0LL;
  bdata->status[0] = bdata->status[1] = 0LL;
}
template <typename T>
void counter_barrier_impl<T>::barrier(kmp_int32 id,
                                      kmp_hier_shared_bdata_t<T> *bdata,
                                      kmp_hier_private_bdata_t *tdata) {
  volatile kmp_int64 *val;
  kmp_uint64 current_index = tdata->index;
  kmp_uint64 next_index = 1 - current_index;
  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
  kmp_uint64 next_wait_value = current_wait_value + tdata->num_active;

  KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu "
                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
                __kmp_get_gtid(), current_index, next_index, current_wait_value,
                next_wait_value));
  val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index]));
  KMP_TEST_THEN_INC64(val);
  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
                         __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
  tdata->wait_val[current_index] = next_wait_value;
  tdata->index = next_index;
}
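// Data associated with one topology unit within a layer, e.g., one L1 cache
// or one NUMA domain.  A unit owns the private dispatch buffer used at its
// level, a pointer to its parent unit in the layer above, and the shared
// barrier/bounds data for the threads (or child units) working inside it.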
template <typename T> struct kmp_hier_top_unit_t {
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_int32 active; // number of active threads/units below this unit
  dispatch_private_info_template<T> hier_pr; // dispatch buffer for this unit
  kmp_hier_top_unit_t<T> *hier_parent; // parent unit (NULL at the top layer)
  kmp_hier_shared_bdata_t<T> hier_barrier; // shared barrier data for this unit

  kmp_int32 get_hier_id() const { return hier_pr.hier_id; }
  void reset_shared_barrier() {
    KMP_DEBUG_ASSERT(active > 0);
    if (active == 1)
      return;
    hier_barrier.zero();
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::reset_shared(active, &hier_barrier);
    } else {
      counter_barrier_impl<T>::reset_shared(active, &hier_barrier);
    }
  }
  void reset_private_barrier(kmp_hier_private_bdata_t *tdata) {
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(active > 0);
    if (active == 1)
      return;
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::reset_private(active, tdata);
    } else {
      counter_barrier_impl<T>::reset_private(active, tdata);
    }
  }
  void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) {
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(active > 0);
    KMP_DEBUG_ASSERT(id >= 0 && id < active);
    if (active == 1) {
      tdata->index = 1 - tdata->index;
      return;
    }
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
    } else {
      counter_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
    }
  }

  kmp_int32 get_next_status(kmp_uint64 index) const {
    return hier_barrier.get_next_status(index);
  }
  T get_next_lb(kmp_uint64 index) const {
    return hier_barrier.get_next_lb(index);
  }
  T get_next_ub(kmp_uint64 index) const {
    return hier_barrier.get_next_ub(index);
  }
  ST get_next_st(kmp_uint64 index) const {
    return hier_barrier.get_next_st(index);
  }
  dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
    return hier_barrier.get_next_sh(index);
  }

  kmp_int32 get_curr_status(kmp_uint64 index) const {
    return hier_barrier.get_curr_status(index);
  }
  T get_curr_lb(kmp_uint64 index) const {
    return hier_barrier.get_curr_lb(index);
  }
  T get_curr_ub(kmp_uint64 index) const {
    return hier_barrier.get_curr_ub(index);
  }
  ST get_curr_st(kmp_uint64 index) const {
    return hier_barrier.get_curr_st(index);
  }
  dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
    return hier_barrier.get_curr_sh(index);
  }

  void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status,
                            kmp_uint64 index) {
    hier_barrier.set_next_hand_thread(lb, ub, st, status, index);
  }
  void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) {
    hier_barrier.set_next(lb, ub, st, status, index);
  }
  dispatch_private_info_template<T> *get_my_pr() { return &hier_pr; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
  dispatch_private_info_template<T> *get_parent_pr() {
    return &(hier_parent->hier_pr);
  }

  kmp_int32 is_active() const { return active; }
  kmp_int32 get_num_active() const { return active; }
  void print() {
    KD_TRACE(
        10,
        ("    kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n",
         active, &hier_pr, hier_pr.u.p.lb, hier_pr.u.p.ub, hier_pr.u.p.st,
         hier_pr.u.p.tc));
  }
};
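// Information describing one layer of the hierarchy: its type, the schedule
// and chunk size used when units of this layer request iterations, the number
// of units in the layer, and how many of them are currently active.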
template <typename T> struct kmp_hier_layer_info_t {
  int num_active; // number of active units in this layer
  kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc.
  enum sched_type sched; // static, dynamic, guided, etc.
  typename traits_t<T>::signed_t chunk; // chunk size associated with schedule
  int length; // number of units in this layer
  void print() {
    const char *t = __kmp_get_hier_str(type);
    KD_TRACE(
        10,
        ("    kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d "
         "length:%d\n",
         num_active, t, sched, chunk, length));
  }
};
template <typename T> struct kmp_hier_t {
public:
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;

private:
  int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t<T> *current,
                   kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st,
                   kmp_int32 previous_id, int hier_level) {
    int status;
    kmp_info_t *th = __kmp_threads[gtid];
    auto parent = current->get_parent();
    bool last_layer = (hier_level == get_num_layers() - 1);
    KMP_DEBUG_ASSERT(th);
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]);
    KMP_DEBUG_ASSERT(current);
    KMP_DEBUG_ASSERT(hier_level >= 0);
    KMP_DEBUG_ASSERT(hier_level < get_num_layers());
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(parent || last_layer);

    KD_TRACE(
        1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level));

    T hier_id = (T)current->get_hier_id();
    // Check to see if we have updated iterations to get
    if (previous_id == 0) {
      KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is primary of unit\n",
                   gtid, hier_level));
      kmp_int32 contains_last;
      T my_lb, my_ub;
      ST my_st;
      T nproc;
      dispatch_shared_info_template<T> volatile *my_sh;
      dispatch_private_info_template<T> *my_pr;
      if (last_layer) {
        // Top layer: use the team's shared dispatch buffer
        KD_TRACE(10,
                 ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n",
                  gtid, hier_level));
        my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
        nproc = (T)get_top_level_nproc();
      } else {
        // Middle layer: use the parent unit's shared buffer
        KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n",
                      gtid, hier_level));
        my_sh =
            parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index);
        nproc = (T)parent->get_num_active();
      }
      my_pr = current->get_my_pr();
      KMP_DEBUG_ASSERT(my_sh);
      KMP_DEBUG_ASSERT(my_pr);
      enum sched_type schedule = get_sched(hier_level);
      ST chunk = (ST)get_chunk(hier_level);
      status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh,
                                                &contains_last, &my_lb, &my_ub,
                                                &my_st, nproc, hier_id);
      KD_TRACE(
          10,
          ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n",
           gtid, hier_level, status));
      // If no iterations were found and this is not the top layer, go up the
      // hierarchy to refill this unit's dispatch buffer
      if (status == 0 && !last_layer) {
        kmp_int32 hid;
        __kmp_type_convert(hier_id, &hid);
        status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub,
                              &my_st, hid, hier_level + 1);
        KD_TRACE(
            10,
            ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n",
             gtid, hier_level, status));
        if (status == 1) {
          kmp_hier_private_bdata_t *upper_tdata =
              &(th->th.th_hier_bar_data[hier_level + 1]);
          my_sh = parent->get_curr_sh(upper_tdata->index);
          KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n",
                        gtid, hier_level));
          __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule,
                                        parent->get_curr_lb(upper_tdata->index),
                                        parent->get_curr_ub(upper_tdata->index),
                                        parent->get_curr_st(upper_tdata->index),
#if USE_ITT_BUILD
                                        NULL,
#endif
                                        chunk, nproc, hier_id);
          status = __kmp_dispatch_next_algorithm<T>(
              gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc,
              hier_id);
          if (!status) {
            KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 "
                          "setting to 2!\n",
                          gtid, hier_level));
            status = 2;
          }
        }
      }
      current->set_next(my_lb, my_ub, my_st, status, tdata->index);
      // Propagate whether this unit's chunk contains the last serial iteration
      if (contains_last) {
        if (last_layer || parent->hier_pr.flags.contains_last) {
          KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr "
                        "to contain last.\n",
                        gtid, hier_level));
          current->hier_pr.flags.contains_last = contains_last;
        }
        if (!current->hier_pr.flags.contains_last)
          contains_last = FALSE;
      }
      if (p_last)
        *p_last = contains_last;
    } // if primary thread of this unit
    // Wait for the primary thread of this unit to publish the next iterations
    if (hier_level > 0 || !__kmp_dispatch_hand_threading) {
      KD_TRACE(10,
               ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
                gtid, hier_level));
      current->barrier(previous_id, tdata);
      KD_TRACE(10,
               ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n",
                gtid, hier_level, current->get_curr_status(tdata->index)));
    } else {
      KMP_DEBUG_ASSERT(previous_id == 0);
      return status;
    }
    return current->get_curr_status(tdata->index);
  }
public:
  int top_level_nproc;
  int num_layers;
  bool valid;
  int type_size;
  kmp_hier_layer_info_t<T> *info;
  kmp_hier_top_unit_t<T> **layers;
  // Deallocate all memory owned by the hierarchy
  void deallocate() {
    for (int i = 0; i < num_layers; ++i)
      if (layers[i] != NULL) {
        __kmp_free(layers[i]);
      }
    if (layers != NULL) {
      __kmp_free(layers);
      layers = NULL;
    }
    if (info != NULL) {
      __kmp_free(info);
      info = NULL;
    }
    num_layers = 0;
    valid = false;
  }
  // Returns true if the hierarchy must be (re)allocated for the requested
  // layers, schedules, and chunks; false if the existing one can be reused
  bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers,
                          const enum sched_type *new_scheds,
                          const ST *new_chunks) const {
    if (!valid || layers == NULL || info == NULL ||
        traits_t<T>::type_size != type_size || n != num_layers)
      return true;
    for (int i = 0; i < n; ++i) {
      if (info[i].type != new_layers[i])
        return true;
      if (info[i].sched != new_scheds[i])
        return true;
      if (info[i].chunk != new_chunks[i])
        return true;
    }
    return false;
  }
  // A single thread should call this while the other threads wait;
  // allocates (or reuses) the hierarchy for the requested layers
  void allocate_hier(int n, const kmp_hier_layer_e *new_layers,
                     const enum sched_type *new_scheds, const ST *new_chunks) {
    int max = 0;
    if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) {
      KD_TRACE(
          10,
          ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n"));
      for (int i = 0; i < n; ++i) {
        info[i].num_active = 0;
        for (int j = 0; j < get_length(i); ++j)
          layers[i][j].active = 0;
      }
      return;
    }
    KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n"));
    deallocate();
    type_size = traits_t<T>::type_size;
    num_layers = n;
    info = (kmp_hier_layer_info_t<T> *)__kmp_allocate(
        sizeof(kmp_hier_layer_info_t<T>) * n);
    layers = (kmp_hier_top_unit_t<T> **)__kmp_allocate(
        sizeof(kmp_hier_top_unit_t<T> *) * n);
    for (int i = 0; i < n; ++i) {
      kmp_hier_layer_e layer = new_layers[i];
      info[i].num_active = 0;
      info[i].type = layer;
      info[i].sched = new_scheds[i];
      info[i].chunk = new_chunks[i];
      max = __kmp_hier_max_units[layer + 1];
      if (max == 0) {
        valid = false;
        KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer));
        deallocate();
        return;
      }
      info[i].length = max;
      layers[i] = (kmp_hier_top_unit_t<T> *)__kmp_allocate(
          sizeof(kmp_hier_top_unit_t<T>) * max);
      for (int j = 0; j < max; ++j) {
        layers[i][j].active = 0;
        layers[i][j].hier_pr.flags.use_hier = TRUE;
      }
    }
    valid = true;
  }
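  // loc - source location
  // gtid - global thread id
  // pr - this thread's private dispatch buffer (corresponding to gtid)
  // p_last (return value) - flag indicating whether this chunk contains the
  //                         last serial iteration
  // p_lb, p_ub, p_st (return values) - bounds and stride of the next chunk
  //
  // Returns 1 if a chunk of iterations was obtained, 0 when the loop is done.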
  int next(ident_t *loc, int gtid, dispatch_private_info_template<T> *pr,
           kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) {
    int status;
    kmp_int32 contains_last = 0;
    kmp_info_t *th = __kmp_threads[gtid];
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]);
    auto parent = pr->get_parent();
    KMP_DEBUG_ASSERT(parent);
    KMP_DEBUG_ASSERT(th);
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(parent);
    T nproc = (T)parent->get_num_active();
    T unit_id = (T)pr->get_hier_id();
    KD_TRACE(
        10,
        ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n",
         gtid, nproc, unit_id));
    if (__kmp_dispatch_hand_threading) {
      // Hand-threading: only the primary thread of the lowest unit grabs
      // iterations; the other threads of the unit wait at the barrier and
      // then execute the same chunk.
      KD_TRACE(10,
               ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n",
                gtid));
      if (unit_id == 0) {
        // The sh buffer on the lowest level is only ever modified and read by
        // the unit's primary thread, so the first sh buffer can always be
        // used.
        auto sh = &(parent->hier_barrier.sh[0]);
        KMP_DEBUG_ASSERT(sh);
        status = __kmp_dispatch_next_algorithm<T>(
            gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
        if (!status) {
          bool done = false;
          while (!done) {
            done = true;
            kmp_int32 uid;
            __kmp_type_convert(unit_id, &uid);
            status = next_recurse(loc, gtid, parent, &contains_last, p_lb,
                                  p_ub, p_st, uid, 0);
            if (status == 1) {
              __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
                                            parent->get_next_lb(tdata->index),
                                            parent->get_next_ub(tdata->index),
                                            parent->get_next_st(tdata->index),
#if USE_ITT_BUILD
                                            NULL,
#endif
                                            pr->u.p.parm1, nproc, unit_id);
              sh->u.s.iteration = 0;
              status = __kmp_dispatch_next_algorithm<T>(
                  gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc,
                  unit_id);
              if (!status) {
                KD_TRACE(10,
                         ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
                          "after next_pr_sh() trying again.\n",
                          gtid));
                done = false;
              }
            } else if (status == 2) {
              KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
                            "trying again.\n",
                            gtid));
              done = false;
            }
          }
        }
        parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
      } // if primary thread of the lowest unit
      parent->barrier(pr->get_hier_id(), tdata);
      if (unit_id != 0) {
        *p_lb = parent->get_curr_lb(tdata->index);
        *p_ub = parent->get_curr_ub(tdata->index);
        *p_st = parent->get_curr_st(tdata->index);
        status = parent->get_curr_status(tdata->index);
      }
    } else {
      // Normal implementation: each thread grabs its own chunk from the
      // lowest unit's shared buffer.
      auto sh = parent->get_curr_sh(tdata->index);
      KMP_DEBUG_ASSERT(sh);
      status = __kmp_dispatch_next_algorithm<T>(
          gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
      KD_TRACE(10,
               ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d "
                "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n",
                gtid, status, contains_last, *p_lb, *p_ub, *p_st));
      if (!status) {
        bool done = false;
        while (!done) {
          done = true;
          kmp_int32 uid;
          __kmp_type_convert(unit_id, &uid);
          status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
                                p_st, uid, 0);
          if (status == 1) {
            sh = parent->get_curr_sh(tdata->index);
            __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
                                          parent->get_curr_lb(tdata->index),
                                          parent->get_curr_ub(tdata->index),
                                          parent->get_curr_st(tdata->index),
#if USE_ITT_BUILD
                                          NULL,
#endif
                                          pr->u.p.parm1, nproc, unit_id);
            status = __kmp_dispatch_next_algorithm<T>(
                gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
            if (!status) {
              KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
                            "after next_pr_sh() trying again.\n",
                            gtid));
              done = false;
            }
          } else if (status == 2) {
            KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
                          "trying again.\n",
                          gtid));
            done = false;
          }
        }
      }
    }
    if (contains_last && !parent->hier_pr.flags.contains_last) {
      KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting "
                    "contains_last to FALSE\n",
                    gtid));
      contains_last = FALSE;
    }
    if (p_last)
      *p_last = contains_last;
    KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid,
                  status));
    return status;
  }
  // These functions probe the layer info structure
  kmp_hier_layer_e get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].type;
  }
  enum sched_type get_sched(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].sched;
  }
  ST get_chunk(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].chunk;
  }
  int get_num_active(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].num_active;
  }
  int get_length(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].length;
  }
  kmp_hier_top_unit_t<T> *get_unit(int level, int index) {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    KMP_DEBUG_ASSERT(index >= 0);
    KMP_DEBUG_ASSERT(index < get_length(level));
    return &(layers[level][index]);
  }
  int get_num_layers() const { return num_layers; }
  // Returns the number of active units at the outermost layer, i.e., the
  // number of units competing for iterations at the whole-loop level
  int get_top_level_nproc() const { return top_level_nproc; }
  bool is_valid() const { return valid; }
  // Print the hierarchy, layer by layer (debug trace only)
  void print() {
    KD_TRACE(10, ("kmp_hier_t:\n"));
    for (int i = num_layers - 1; i >= 0; --i) {
      KD_TRACE(10, ("Info[%d] = ", i));
      info[i].print();
    }
    for (int i = num_layers - 1; i >= 0; --i) {
      KD_TRACE(10, ("Layer[%d] =\n", i));
      for (int j = 0; j < info[i].length; ++j) {
        layers[i][j].print();
      }
    }
  }
};
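// Sets up hierarchical scheduling for the calling thread before a
// hierarchically scheduled loop runs.  Every thread of the team calls this
// with the same layer/schedule/chunk description and the loop bounds.
// Thread 0 allocates (or reuses) the shared kmp_hier_t hanging off the team's
// dispatch buffer; each thread then registers itself with its unit at every
// layer, and the per-unit barriers are reset.
//
// Illustrative example (not taken from this file): with
//   new_layers = {LAYER_L1, LAYER_NUMA}
//   new_chunks = {8, 1024}
// each L1 unit pulls 8-iteration chunks from its NUMA-level parent, each NUMA
// unit pulls 1024-iteration chunks from the loop as a whole, and threads draw
// their own chunks from their L1 unit's buffer.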
template <typename T>
void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
                                   kmp_hier_layer_e *new_layers,
                                   enum sched_type *new_scheds,
                                   typename traits_t<T>::signed_t *new_chunks,
                                   T lb, T ub,
                                   typename traits_t<T>::signed_t st) {
  int tid, gtid, num_hw_threads, num_threads_per_layer1, active;
  unsigned int my_buffer_index;
  kmp_info_t *th;
  kmp_team_t *team;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;
  gtid = __kmp_entry_gtid();
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n",
                gtid, n));
  for (int i = 0; i < n; ++i) {
    const char *layer = __kmp_get_hier_str(new_layers[i]);
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, "
                  "new_scheds[%d] = %d, new_chunks[%d] = %u\n",
                  gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i]));
  }
#endif // KMP_DEBUG
  KMP_DEBUG_ASSERT(n > 0);
  KMP_DEBUG_ASSERT(new_layers);
  KMP_DEBUG_ASSERT(new_scheds);
  KMP_DEBUG_ASSERT(new_chunks);
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;
  num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
      &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  if (!active) {
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
                  "Using normal dispatch functions.\n",
                  gtid));
    KMP_DEBUG_ASSERT(pr);
    pr->flags.use_hier = FALSE;
    pr->flags.contains_last = FALSE;
    return;
  }
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  pr->flags.use_hier = TRUE;
  pr->u.p.tc = 0;
  // Have thread 0 allocate (or reuse) the hierarchy
  if (__kmp_tid_from_gtid(gtid) == 0) {
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
                  "hierarchy\n",
                  gtid, pr, sh));
    if (sh->hier == NULL) {
      sh->hier = (kmp_hier_t<T> *)__kmp_allocate(sizeof(kmp_hier_t<T>));
    }
    sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks);
    sh->u.s.iteration = 0;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  // Check that the hierarchy is valid
  kmp_hier_t<T> *hier = sh->hier;
  if (!sh->hier->is_valid()) {
    pr->flags.use_hier = FALSE;
    return;
  }
  // Have threads allocate their thread-private barrier data if it hasn't
  // already been allocated
  if (th->th.th_hier_bar_data == NULL) {
    th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate(
        sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST);
  }
  // Have threads "register" themselves by incrementing the active count for
  // each layer they are involved in
  for (int i = 0; i < n; ++i) {
    int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
    kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
    // Setup the thread's private dispatch buffer's hierarchy pointers
    if (i == 0)
      pr->hier_parent = my_unit;
    // If this unit is already active, increment the active count and stop
    if (my_unit->is_active()) {
      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
                    "is already active (%d)\n",
                    gtid, my_unit, my_unit->active));
      KMP_TEST_THEN_INC32(&(my_unit->active));
      break;
    }
    // Flag that this unit is now active
    if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) {
      // Middle layers get a pointer to their parent unit
      if (i < n - 1) {
        my_unit->get_my_pr()->hier_id =
            index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
                                                 hier->get_type(i + 1));
        int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1));
        my_unit->hier_parent = hier->get_unit(i + 1, parent_index);
      } else {
        // Top layer unit has no parent
        my_unit->get_my_pr()->hier_id =
            index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
                                                 kmp_hier_layer_e::LAYER_LOOP);
        KMP_TEST_THEN_INC32(&(hier->top_level_nproc));
        my_unit->hier_parent = nullptr;
      }
      // Set trip count to 0 so that next() operates correctly
      my_unit->get_my_pr()->u.p.tc = 0;
      KMP_TEST_THEN_INC32(&(hier->info[i].num_active));
      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
                    "incrementing num_active\n",
                    gtid, my_unit));
    } else {
      KMP_TEST_THEN_INC32(&(my_unit->active));
      break;
    }
  }
  // Set this thread's id within its lowest-layer unit
  num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2(
      kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0));
  pr->hier_id = tid % num_threads_per_layer1;
  // For oversubscribed threads, increment their index within the lowest unit
  if (tid >= num_hw_threads)
    pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1);
  KD_TRACE(
      10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n",
           gtid, pr->hier_id));
  pr->flags.contains_last = FALSE;
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  // Now that the number of active threads at each level is determined,
  // the barrier data for each unit can be initialized, and the last layer's
  // private buffers can be given the entire loop's information
  int prev_id = pr->get_hier_id();
  for (int i = 0; i < n; ++i) {
    if (prev_id != 0)
      break;
    int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
    kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
    // Only primary threads of this unit within the hierarchy do initialization
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
                  gtid, i));
    my_unit->reset_shared_barrier();
    my_unit->hier_pr.flags.contains_last = FALSE;
    // Last layer: initialize the private buffer with the entire loop so the
    // next next_algorithm() call gets the first chunk of iterations properly
    if (i == n - 1) {
      __kmp_dispatch_init_algorithm<T>(
          loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st,
#if USE_ITT_BUILD
          NULL,
#endif
          hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id());
    }
    prev_id = my_unit->get_hier_id();
  }
  // Initialize each layer of this thread's private barrier data
  kmp_hier_top_unit_t<T> *unit = pr->hier_parent;
  for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) {
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]);
    unit->reset_private_barrier(tdata);
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

#ifdef KMP_DEBUG
  if (__kmp_tid_from_gtid(gtid) == 0) {
    for (int i = 0; i < n; ++i) {
      KD_TRACE(10,
               ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n",
                gtid, i, hier->get_num_active(i)));
    }
    hier->print();
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#endif // KMP_DEBUG
}

#endif // KMP_DISPATCH_HIER_H