#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "ompt-specific.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

void __kmp_print_structure(void); // Forward declaration
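// ----------------------------- Linear Barrier -----------------------------
// In the linear gather every worker signals its own b_arrived flag and the
// primary thread (tid 0) waits for each worker in turn, folding in the
// optional reduction as arrivals come in; the matching release below has the
// primary thread set each worker's b_go flag (pushing ICVs first when
// requested).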
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to primary thread.
    /* After performing this write, a worker thread may not assume that the
       team is valid any more - it could be deallocated by the primary thread
       at any time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
    for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    team->t.t_id, i,
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for worker thread to arrive
      if (cancellable) {
        kmp_flag_64<true, false> flag(
            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
        if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
          return true;
      } else {
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                           new_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other
      // thread's time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
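// Release phase of the linear barrier. The template parameter mirrors the
// gather above: when instantiated with cancellable=true the wait uses a
// cancellable flag type and the function returns true if the barrier was
// cancelled; otherwise it always returns false.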
template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

      // Now, release all of the worker threads
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                           other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the PRIMARY thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    if (cancellable) {
      kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
        return true;
    } else {
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled); cancel wait on previous parallel region...
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return false;
// The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}
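// ------------------------------ Tree Barrier ------------------------------
// Gather phase of the tree barrier: each thread waits for its subtree of
// children (branching factor 2^branch_bits), folds in their reduction data,
// then reports its own arrival to its parent; the primary thread finally
// bumps the team-wide arrived counter.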
static void __kmp_tree_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint64 new_state = 0;

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
  child_tid = (tid << branch_bits) + 1;
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the
       team is valid any more - it could be deallocated by the primary thread
       at any time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the primary thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
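// Release phase of the tree barrier: workers first wait on their own b_go
// flag, then every thread releases its children top-down, optionally pushing
// ICVs into each child's implicit task before setting the child's b_go flag.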
static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;

  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(tid)) { // Worker threads
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled); cancel wait on previous parallel region...
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  } else {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  }
  nproc = this_thr->th.th_team_nproc;
  child_tid = (tid << branch_bits) + 1;

  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    child = 1;
    // Parent threads release all their children
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
      }
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(20,
               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                "go(%p): %u => %u\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
      // Release child from barrier
      kmp_flag_64<> flag(&child_bar->b_go, child_thr);
      flag.release();
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }
  KA_TRACE(
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
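// ----------------------------- Hyper Barrier ------------------------------
// Hypercube-embedded tree with minimum branching factor 2^n: at each level a
// thread either reports its arrival to its parent (if its low tid bits are
// non-zero at that level) or gathers and reduces the children that map to it
// at that level.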
static void __kmp_hyper_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 offset;
  kmp_uint32 level;

  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go. */
  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;

    if (((tid >> level) & (branch_factor - 1)) != 0) {
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

      KMP_MB(); // Synchronize parent and child threads.
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                "arrived(%p): %llu => %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
                team->t.t_id, parent_tid, &thr_bar->b_arrived,
                thr_bar->b_arrived,
                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing
         for loop), a worker thread may not assume that the team is valid any
         more - it could be deallocated by the primary thread at any time. */
      p_flag.set_waiter(other_threads[parent_tid]);
      p_flag.release();
      break;
    }

    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      KMP_MB(); // Synchronize parent and child threads.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
  }

  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the primary thread
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    else
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
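// Release phase of the hypercube barrier. With KMP_REVERSE_HYPER_BAR defined
// (the default below), threads are released in the reverse of the order used
// by the gather, which has been observed to perform better overall than the
// forward order.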
#define KMP_REVERSE_HYPER_BAR
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads;
  kmp_uint32 num_threads;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint32 offset;
  kmp_uint32 level;

  /* Perform a hypercube-embedded tree release for all of the threads that
     have been gathered. If KMP_REVERSE_HYPER_BAR is defined (default), the
     threads are released in the reverse order of the corresponding gather,
     otherwise threads are released in the same order. */
  if (KMP_MASTER_TID(tid)) { // primary thread
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // primary already has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
  } else { // Worker threads
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  num_threads = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
  // Count up to correct level for parent
  for (level = 0, offset = 1;
       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
       level += branch_bits, offset <<= branch_bits)
    ;

  // Now go down from there
  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
       level -= branch_bits, offset >>= branch_bits)
#else
  // Go down the tree, level by level
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
  {
#ifdef KMP_REVERSE_HYPER_BAR
    /* Now go in reverse order through the children, highest to lowest.
       Initial setting of child is conservative here. */
    child = num_threads >> ((level == 0) ? level : level - 1);
    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
        child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
#else
    if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go lower than this, since this is the level parent would
      // be notified
      break;
    // Iterate through children on this level of the tree
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
    {
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
      else {
        kmp_info_t *child_thr = other_threads[child_tid];
        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
        kmp_uint32 next_child_tid = child_tid - (1 << level);
// Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
        if (child - 1 >= 1 && next_child_tid < num_threads)
#else
        if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
          KMP_CACHE_PREFETCH(
              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) // push my fixed ICVs to my child
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

        KA_TRACE(
            20,
            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
        // Release child from barrier
        kmp_flag_64<> flag(&child_bar->b_go, child_thr);
        flag.release();
      }
    }
  }
#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs &&
      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final destination
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
  }
#endif
  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
}
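// --------------------------- Hierarchical Barrier --------------------------
// Sets up (or re-initializes) the per-thread hierarchical barrier data: the
// thread's level in the machine hierarchy, its parent tid, the byte offset it
// uses in its parent's flag word, and the leaf-children bookkeeping. Returns
// true when the thread was uninitialized or the team/tid changed, so callers
// know the cached on-core relationships cannot be trusted.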
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;

  if (uninitialized || team_sz_changed) {
    __kmp_get_hierarchy(nproc, thr_bar);
  }

  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
    thr_bar->parent_tid = -1; // default for primary thread
    if (!KMP_MASTER_TID(tid)) {
      // If not primary thread, find parent thread in hierarchy
      kmp_uint32 d = 0;
      while (d < thr_bar->depth) { // find parent based on level of thread in
        // hierarchy, and note level
        kmp_uint32 rem;
        if (d == thr_bar->depth - 2) { // reached level right below the primary
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
          break;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
          // thread is not a subtree root at next level, so this is max
          thr_bar->parent_tid = tid - rem;
          thr_bar->my_level = d;
          break;
        }
        ++d;
      }
    }
    __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
                            (thr_bar->skip_per_level[thr_bar->my_level])),
                       &(thr_bar->offset));
    thr_bar->old_tid = tid;
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  }
  if (uninitialized || team_changed || tid_changed) {
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    retval = true;
  }
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->nproc = nproc;
    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
    if (thr_bar->my_level == 0)
      thr_bar->leaf_kids = 0;
    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
      __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
    thr_bar->leaf_state = 0;
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
  }
  return retval;
}
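// Gather phase of the hierarchical barrier: with infinite blocktime and the
// on-core barrier enabled, leaf children check in on designated bytes of this
// thread's b_arrived word; higher-level children are gathered through their
// own b_arrived flags, after which this thread reports to its parent.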
static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = 0;

  int level = team->t.t_level;
  if (other_threads[0]
          ->th.th_teams_microtask) // are we inside the teams construct?
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
  }
#endif

  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
                                               team);

  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
    kmp_int32 child_tid;
    new_state =
        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (thr_bar->leaf_kids) {
        // First, wait for leaf children to check-in on my b_arrived flag
        kmp_uint64 leaf_state =
            KMP_MASTER_TID(tid)
                ? thr_bar->b_arrived | thr_bar->leaf_state
                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
                      "for leaf kids\n",
                      gtid, team->t.t_id, tid));
        kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (reduce) {
          OMPT_REDUCTION_DECL(this_thr, gtid);
          OMPT_REDUCTION_BEGIN;
          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
               ++child_tid) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      other_threads[child_tid]->th.th_local.reduce_data);
          }
          OMPT_REDUCTION_END;
        }
        // Clear leaf_state bits
        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
      }
      // Next, wait for higher level children on each child's b_arrived flag
      for (kmp_uint32 d = 1; d < thr_bar->my_level;
           ++d) { // gather lowest level threads first, but skip 0
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
          }
        }
      }
    } else { // Blocktime is not infinite
      for (kmp_uint32 d = 0; d < thr_bar->my_level;
           ++d) { // Gather lowest level threads first
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
          }
        }
      }
    }
  }
  // All subordinates are gathered; now release parent if not primary thread
  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
                  gtid, team->t.t_id, tid,
                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    /* Mark arrival to parent: After performing this write, a worker thread may
       not assume that the team is valid any more - it could be deallocated by
       the primary thread at any time. */
    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
        !thr_bar->use_oncore_barrier) {
      // Parent is waiting on my b_arrived flag; release it
      kmp_flag_64<> flag(&thr_bar->b_arrived,
                         other_threads[thr_bar->parent_tid]);
      flag.release();
    } else {
      // Leaf does special release on "offset" bits of parent's b_arrived flag
      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
                           thr_bar->offset + 1);
      flag.set_waiter(other_threads[thr_bar->parent_tid]);
      flag.release();
    }
  } else { // Primary thread needs to update the team's b_arrived value
    team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
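// Release phase of the hierarchical barrier: workers wait either on their own
// b_go flag or, for leaves with infinite blocktime, on their byte of the
// parent's b_go flag; each non-leaf thread then releases its subtree, using
// NGO stores of the ICV cache line where available.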
static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  bool team_change = false; // indicates on-core barrier shouldn't be used

  if (KMP_MASTER_TID(tid)) {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
                  "entered barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  } else { // Worker threads
    // Wait for parent thread to release me
    if (!thr_bar->use_oncore_barrier ||
        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
        thr_bar->team == NULL) {
      // Use traditional method of waiting on my own b_go flag
      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
    } else { // Thread barrier data is initialized, this is a leaf, blocktime
      // is infinite, not nested: wait on my "offset" bits on parent's b_go
      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
                           thr_bar->offset + 1, bt,
                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
      flag.wait(this_thr, TRUE);
      if (thr_bar->wait_flag ==
          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
        TCW_8(thr_bar->b_go,
              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      } else { // Reset my bits on parent's b_go flag
        (RCAST(volatile char *,
               &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
      }
    }
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    KA_TRACE(
        20,
        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }

  nproc = this_thr->th.th_team_nproc;
  int level = team->t.t_level;
  if (team->t.t_threads[0]
          ->th.th_teams_microtask) { // are we inside the teams construct?
    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
        this_thr->th.th_teams_level == level)
      ++level; // level was not increased in teams construct for team_of_workers
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  }
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  // If the team size has increased, we still communicate with old leaves via
  // oncore barrier.
  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
                                                       tid, team);
  // But if the entire team changes, we won't use oncore barrier at all
  if (team_change)
    old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs) {
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    if (KMP_MASTER_TID(
            tid)) { // primary already has copy in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
        // Leaves (on-core children) pull parent's fixed ICVs directly to local
        // ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
      // Non-leaves will get ICVs piggybacked with b_go via NGO store
    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs so child
        // can access them
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
      else // leaves copy parent's fixed ICVs directly to local ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PUSH

  // Now, release my children
  if (thr_bar->my_level) { // not a leaf
    kmp_int32 child_tid;
    kmp_uint32 last;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (KMP_MASTER_TID(tid)) { // do a flat release
        // Set local b_go to bump children via NGO store of the cache line
        // containing ICVs and b_go.
        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
        // the cache line
        ngo_load(&thr_bar->th_fixed_icvs);
        // This loops over all the threads, skipping only the leaf nodes in the
        // hierarchy
        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
             child_tid += thr_bar->skip_per_level[1]) {
          kmp_bstate_t *child_bar =
              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d)"
                        " go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Use ngo store (if available) to both store ICVs and release child
          // via child's b_go
          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
        }
        ngo_sync();
      }
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      // Now, release leaf children
      if (thr_bar->leaf_kids) {
        // We test team_change on the off-chance that the level 1 team changed
        if (team_change ||
            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
          if (old_leaf_kids) { // release old leaf kids
            thr_bar->b_go |= old_leaf_state;
          }
          // Release new leaf kids
          last = tid + thr_bar->skip_per_level[1];
          if (last > nproc)
            last = nproc;
          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
               ++child_tid) { // skip_per_level[0]=1
            kmp_info_t *child_thr = team->t.t_threads[child_tid];
            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
            KA_TRACE(
                20,
                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                 " T#%d(%d:%d) go(%p): %u => %u\n",
                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child using child's b_go flag
            kmp_flag_64<> flag(&child_bar->b_go, child_thr);
            flag.release();
          }
        } else { // Release all children at once with leaf_state bits on my own
          // b_go flag
          thr_bar->b_go |= thr_bar->leaf_state;
        }
      }
    } else { // Blocktime is not infinite; do a simple hierarchical release
      for (int d = thr_bar->my_level - 1; d >= 0;
           --d) { // Release highest level threads first
        last = tid + thr_bar->skip_per_level[d + 1];
        kmp_uint32 skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = team->t.t_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Release child using child's b_go flag
          kmp_flag_64<> flag(&child_bar->b_go, child_thr);
          flag.release();
        }
      }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid))
      // Non-leaves copy ICVs from fixed ICVs to local dest
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
  }
  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
// End of Barrier Algorithms

// Type traits for cancellable value
// true if cancellable is true, false otherwise
template <bool cancellable> struct is_cancellable {};
template <> struct is_cancellable<true> {
  bool value;
  is_cancellable() : value(false) {}
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool b) { return *this; }
  constexpr operator bool() const { return false; }
};
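// Internal barrier entry point used by __kmp_barrier() and the GOMP
// cancellation wrapper below. It dispatches the gather/release phases
// according to __kmp_barrier_gather_pattern / __kmp_barrier_release_pattern;
// when cancellable is true only the linear (cancellable) algorithm is used.
// Returns 0 for the primary thread and 1 for workers, or 0/1 for
// not-cancelled/cancelled in the cancellable instantiation.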
template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
                                  size_t reduce_size, void *reduce_data,
                                  void (*reduce)(void *, void *)) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
  int status = 0;
  is_cancellable<cancellable> cancelled;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  void *return_address;
  ompt_sync_region_t barrier_kind;
#endif

  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
  }
#endif

  if (!team->t.t_serialized) {
#if USE_ITT_BUILD
    // This value will be used in itt notify events below.
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
#endif
#endif /* USE_ITT_BUILD */
    if (__kmp_tasking_mode == tskm_extra_barrier) {
      __kmp_tasking_barrier(team, this_thr, gtid);
      KA_TRACE(15,
               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
    }

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
       access it when the team struct is not guaranteed to exist. */
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
    // Let the debugger know: the thread arrived to the barrier and is waiting.
    if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
      team->t.t_bar[bt].b_master_arrived += 1;
    } else {
      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
    }
#endif /* USE_DEBUGGER */
    if (reduce != NULL) {
      // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
      this_thr->th.th_local.reduce_data = reduce_data;
    }

    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
      // use 0 to only setup the current team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);

    if (cancellable) {
      cancelled = __kmp_linear_barrier_gather_cancellable(
          bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
    } else {
      switch (__kmp_barrier_gather_pattern[bt]) {
      case bp_hyper_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(
            bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_tree_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      default: {
        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      }
    }

    KMP_MB();

    if (KMP_MASTER_TID(tid)) {
      status = 0;
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_DEBUGGER
      // Let the debugger know: All threads arrived and are starting to leave
      // the barrier.
      team->t.t_bar[bt].b_team_arrived += 1;
#endif

      if (__kmp_omp_cancellation) {
        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
        // Reset cancellation flag for worksharing constructs
        if (cancel_request == cancel_loop ||
            cancel_request == cancel_sections) {
          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
        }
      }
#if USE_ITT_BUILD
      /* TODO: In case of split reduction barrier, primary thread may send
         acquired event early, before the final summation into the shared
         variable is done (final summation can be a long operation for array
         reductions). */
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier - report frame end (only if active_level == 1)
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode &&
          (this_thr->th.th_teams_microtask == NULL || // either not in teams
           this_thr->th.th_teams_size.nteams == 1) && // or inside single team
          team->t.t_active_level == 1) {
        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
        kmp_uint64 cur_time = __itt_get_timestamp();
        kmp_info_t **other_threads = team->t.t_threads;
        int nproc = this_thr->th.th_team_nproc;
        int i;
        switch (__kmp_forkjoin_frames_mode) {
        case 1:
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        case 2:
          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
                                 1, loc, nproc);
          break;
        case 3:
          if (__itt_metadata_add_ptr) {
            // Initialize with primary thread's wait time
            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
            // Set arrive time to zero to be able to check it in
            // __kmp_invoke_task(); the same is done inside the loop below
            this_thr->th.th_bar_arrive_time = 0;
            for (i = 1; i < nproc; ++i) {
              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
              other_threads[i]->th.th_bar_arrive_time = 0;
            }
            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                         cur_time, delta,
                                         (kmp_uint64)(reduce != NULL));
          }
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        }
      }
#endif /* USE_ITT_BUILD */
    } else {
      status = 1;
#if USE_ITT_BUILD
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    }
    if ((status == 1 || !is_split) && !cancelled) {
      if (cancellable) {
        cancelled = __kmp_linear_barrier_release_cancellable(
            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      } else {
        switch (__kmp_barrier_release_pattern[bt]) {
        case bp_hyper_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hierarchical_bar: {
          __kmp_hierarchical_barrier_release(
              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_tree_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        default: {
          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        }
        }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_sync(this_thr, team);
      }
    }

#if USE_ITT_BUILD
    /* GEH: TODO: Move this under the if-condition above and also include in
       __kmp_end_split_barrier(). This will more accurately represent the
       actual release time of the threads for split barriers. */
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  } else { // Team is serialized.
    status = 0;
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      if (this_thr->th.th_task_team != NULL) {
#if USE_ITT_NOTIFY
        void *itt_sync_obj = NULL;
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
        }
#endif

        KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
                         TRUE);
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
        __kmp_task_team_setup(this_thr, team, 0);

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
      }
    }
  }
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
                __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
  }
#endif

  if (cancellable)
    return (int)cancelled;
  return status;
}
// Returns 0 if primary thread, 1 if worker thread.
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
                  size_t reduce_size, void *reduce_data,
                  void (*reduce)(void *, void *)) {
  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
                                  reduce);
}
#if defined(KMP_GOMP_COMPAT)
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  if (__kmp_omp_cancellation) {
    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
                                                 0, NULL, NULL);
    if (cancelled) {
      int tid = __kmp_tid_from_gtid(gtid);
      kmp_info_t *this_thr = __kmp_threads[gtid];
      if (KMP_MASTER_TID(tid)) {
        // Primary thread does not need to revert anything
      } else {
        // Workers need to revert their private b_arrived flag
        this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
            KMP_BARRIER_STATE_BUMP;
      }
    }
    return cancelled;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  return FALSE;
}
#endif
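// Finishes a split barrier: only the primary thread runs the release phase
// that was deferred by __kmp_barrier(..., is_split=TRUE, ...).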
void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  KMP_DEBUG_ASSERT(bt < bs_last_barrier);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  if (!team->t.t_serialized) {
    if (KMP_MASTER_GTID(gtid)) {
      switch (__kmp_barrier_release_pattern[bt]) {
      case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                    FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
                                           FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                   FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      default: {
        __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(NULL));
      }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
      }
    }
  }
}
void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);

  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  kmp_uint nproc;
  kmp_info_t *master_thread;
  int tid;
  int team_id;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
  // Get object created at fork barrier
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#endif
#endif /* USE_ITT_BUILD */
  KMP_MB();

  // Get current info
  team = this_thr->th.th_team;
  nproc = this_thr->th.th_team_nproc;
  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
  tid = __kmp_tid_from_gtid(gtid);
  team_id = team->t.t_id;
  master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
  if (master_thread != team->t.t_threads[0]) {
    __kmp_print_structure();
  }
#endif /* KMP_DEBUG */
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
  KMP_MB();

  // Verify state
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
  }
#endif

  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
                  gtid, team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
                     team->t.t_task_team[this_thr->th.th_task_state]);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss that slows down EPCC parallel by 2x. As a
     workaround, we do not perform the copy if blocktime=infinite, since the
     values are not used by __kmp_wait_template() in that case. */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

  /* From this point on, the team data structure may be deallocated at any
     time by the primary thread - it is unsafe to reference it in any of the
     worker threads. Any per-team data items that need to be referenced before
     the end of the barrier should be moved to the kmp_task_team_t structs. */
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have primary thread flag the workers to indicate they are now waiting
    // for next parallel region, and also wake them up so they switch their
    // timers to idle.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
                                  team_thread->th.th_sleep_loc);
    }
#endif
#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode &&
        (this_thr->th.th_teams_microtask == NULL || // either not in teams
         this_thr->th.th_teams_size.nteams == 1) && // or inside single team
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      int nproc = this_thr->th.th_team_nproc;
      int i;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with primary thread's wait time
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          // Set arrive time to zero to be able to check it in
          // __kmp_invoke_task(); the same is done inside the loop below
          this_thr->th.th_bar_arrive_time = 0;
          for (i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD */
  }
#if USE_ITT_BUILD
  else {
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }
#endif /* KMP_DEBUG */

  // TODO now, mark worker threads as done so they may be disbanded
  KMP_MB(); // Flush all pending memory write invalidates.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
}
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */

  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                (team != NULL) ? team->t.t_id : -1, tid));

  // The primary thread's behavior at the barrier
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create itt barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
    KMP_DEBUG_ASSERT(team);
    kmp_info_t **other_threads = team->t.t_threads;
    int i;

    // Verify state
    KMP_MB();
    for (i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u.\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // 0 indicates setup current task team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);
    }

    /* The primary thread may have changed its blocktime between join barrier
       and fork barrier. Copy the blocktime info to the thread, where
       __kmp_wait_template() can access it when the team struct is not
       guaranteed to exist. */
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }
  } // primary thread

  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     primary thread and propagated to all worker threads. The current thread,
     however, may not be part of the team, so we can't blindly assume that the
     team pointer is non-null. */
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
     implicit task has this data before this function is called. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
      // Copy the initial ICVs from the primary thread's thread struct to the
      // implicit task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }

#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_set_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}
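// Chooses how ICVs reach the workers' implicit tasks for the next parallel
// region: stash them on the primary thread for a PULL at the fork barrier,
// leave them for a PUSH during the fork-barrier release, or copy them
// linearly to every implicit task right here.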
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
     implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy ICVs to the primary thread's thread structure into th_fixed_icvs
     (which remains untouched), where all of the worker threads can access
     them and make their own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-primary threads.  This takes O(nthreads)
  // time.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}