LLVM OpenMP* Runtime Library
kmp_csupport.cpp
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37  // By default __kmpc_begin() is no-op.
38  char *env;
39  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40  __kmp_str_match_true(env)) {
41  __kmp_middle_initialize();
42  __kmp_assign_root_init_mask();
43  KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
44  } else if (__kmp_ignore_mppbeg() == FALSE) {
45  // By default __kmp_ignore_mppbeg() returns TRUE.
46  __kmp_internal_begin();
47  KC_TRACE(10, ("__kmpc_begin: called\n"));
48  }
49 }
50 
59 void __kmpc_end(ident_t *loc) {
60  // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
61  // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
62  // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
63  // returns FALSE and __kmpc_end() will unregister this root (it can cause
64  // library shut down).
65  if (__kmp_ignore_mppend() == FALSE) {
66  KC_TRACE(10, ("__kmpc_end: called\n"));
67  KA_TRACE(30, ("__kmpc_end\n"));
68 
69  __kmp_internal_end_thread(-1);
70  }
71 #if KMP_OS_WINDOWS && OMPT_SUPPORT
72  // Normal exit process on Windows does not allow worker threads of the final
73  // parallel region to finish reporting their events, so shutting down the
74  // library here fixes the issue at least for the cases where __kmpc_end() is
75  // placed properly.
76  if (ompt_enabled.enabled)
77  __kmp_internal_end_library(__kmp_gtid_get_specific());
78 #endif
79 }
80 
99 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
100  kmp_int32 gtid = __kmp_entry_gtid();
101 
102  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103 
104  return gtid;
105 }
106 
121 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122  KC_TRACE(10,
123  ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124 
125  return TCR_4(__kmp_all_nth);
126 }
127 
134 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
135  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
136  return __kmp_tid_from_gtid(__kmp_entry_gtid());
137 }
138 
144 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
145  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146 
147  return __kmp_entry_thread()->th.th_team->t.t_nproc;
148 }
149 
156 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
157 #ifndef KMP_DEBUG
158 
159  return TRUE;
160 
161 #else
162 
163  const char *semi2;
164  const char *semi3;
165  int line_no;
166 
167  if (__kmp_par_range == 0) {
168  return TRUE;
169  }
170  semi2 = loc->psource;
171  if (semi2 == NULL) {
172  return TRUE;
173  }
174  semi2 = strchr(semi2, ';');
175  if (semi2 == NULL) {
176  return TRUE;
177  }
178  semi2 = strchr(semi2 + 1, ';');
179  if (semi2 == NULL) {
180  return TRUE;
181  }
182  if (__kmp_par_range_filename[0]) {
183  const char *name = semi2 - 1;
184  while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
185  name--;
186  }
187  if ((*name == '/') || (*name == ';')) {
188  name++;
189  }
190  if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
191  return __kmp_par_range < 0;
192  }
193  }
194  semi3 = strchr(semi2 + 1, ';');
195  if (__kmp_par_range_routine[0]) {
196  if ((semi3 != NULL) && (semi3 > semi2) &&
197  (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
198  return __kmp_par_range < 0;
199  }
200  }
201  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
202  if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
203  return __kmp_par_range > 0;
204  }
205  return __kmp_par_range < 0;
206  }
207  return TRUE;
208 
209 #endif /* KMP_DEBUG */
210 }
211 
218 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
219  return __kmp_entry_thread()->th.th_root->r.r_active;
220 }
221 
231 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
232  kmp_int32 num_threads) {
233  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
234  global_tid, num_threads));
235  __kmp_assert_valid_gtid(global_tid);
236  __kmp_push_num_threads(loc, global_tid, num_threads);
237 }
238 
239 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
240  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
241  /* the num_threads are automatically popped */
242 }
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245  kmp_int32 proc_bind) {
246  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247  proc_bind));
248  __kmp_assert_valid_gtid(global_tid);
249  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263  int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266  // If we were in a serial region, then stop the serial timer, record
267  // the event, and start parallel region timer
268  stats_state_e previous_state = KMP_GET_THREAD_STATE();
269  if (previous_state == stats_state_e::SERIAL_REGION) {
270  KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271  } else {
272  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273  }
274  int inParallel = __kmpc_in_parallel(loc);
275  if (inParallel) {
276  KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277  } else {
278  KMP_COUNT_BLOCK(OMP_PARALLEL);
279  }
280 #endif
281 
282  // maybe saving thr_state is enough here
283  {
284  va_list ap;
285  va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288  ompt_frame_t *ompt_frame;
289  if (ompt_enabled.enabled) {
290  kmp_info_t *master_th = __kmp_threads[gtid];
291  kmp_team_t *parent_team = master_th->th.th_team;
292  ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
293  if (lwt)
294  ompt_frame = &(lwt->ompt_task_info.frame);
295  else {
296  int tid = __kmp_tid_from_gtid(gtid);
297  ompt_frame = &(
298  parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
299  }
300  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
301  }
302  OMPT_STORE_RETURN_ADDRESS(gtid);
303 #endif
304 
305 #if INCLUDE_SSC_MARKS
306  SSC_MARK_FORKING();
307 #endif
308  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
309  VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
310  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
311  kmp_va_addr_of(ap));
312 #if INCLUDE_SSC_MARKS
313  SSC_MARK_JOINING();
314 #endif
315  __kmp_join_call(loc, gtid
316 #if OMPT_SUPPORT
317  ,
318  fork_context_intel
319 #endif
320  );
321 
322  va_end(ap);
323  }
324 
325 #if KMP_STATS_ENABLED
326  if (previous_state == stats_state_e::SERIAL_REGION) {
327  KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
328  KMP_SET_THREAD_STATE(previous_state);
329  } else {
330  KMP_POP_PARTITIONED_TIMER();
331  }
332 #endif // KMP_STATS_ENABLED
333 }
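// --- Illustrative sketch (not part of the upstream source) ------------------
// A rough picture of how a compiler might lower
//   #pragma omp parallel num_threads(4)
// onto the entry points above. The "example_" names and the ident_t
// initializer are hypothetical; a real compiler emits a ";file;func;line;col;;"
// string into ident_t::psource.
#if 0
static ident_t example_loc = {0, KMP_IDENT_KMPC, 0, 0, ";unknown;unknown;0;0;;"};

// The outlined parallel-region body. Its first two parameters are always the
// global and bound thread ids; the remaining ones are the shared variables
// forwarded through the variadic part of __kmpc_fork_call().
static void example_microtask(kmp_int32 *gtid, kmp_int32 *bound_tid, int *a) {
  // parallel-region body; *a is the shared variable captured below
  (void)gtid;
  (void)bound_tid;
  (void)a;
}

void example_parallel(void) {
  int a = 0;
  kmp_int32 gtid = __kmpc_global_thread_num(&example_loc);
  __kmpc_push_num_threads(&example_loc, gtid, 4); // num_threads(4) clause
  __kmpc_fork_call(&example_loc, /*argc=*/1, (kmpc_micro)example_microtask, &a);
}
#endif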
334 
346 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
347  kmp_int32 num_teams, kmp_int32 num_threads) {
348  KA_TRACE(20,
349  ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
350  global_tid, num_teams, num_threads));
351  __kmp_assert_valid_gtid(global_tid);
352  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
353 }
354 
371 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
372  kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
373  kmp_int32 num_threads) {
374  KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
375  " num_teams_ub=%d num_threads=%d\n",
376  global_tid, num_teams_lb, num_teams_ub, num_threads));
377  __kmp_assert_valid_gtid(global_tid);
378  __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
379  num_threads);
380 }
381 
392 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
393  ...) {
394  int gtid = __kmp_entry_gtid();
395  kmp_info_t *this_thr = __kmp_threads[gtid];
396  va_list ap;
397  va_start(ap, microtask);
398 
399 #if KMP_STATS_ENABLED
400  KMP_COUNT_BLOCK(OMP_TEAMS);
401  stats_state_e previous_state = KMP_GET_THREAD_STATE();
402  if (previous_state == stats_state_e::SERIAL_REGION) {
403  KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
404  } else {
405  KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
406  }
407 #endif
408 
409  // remember teams entry point and nesting level
410  this_thr->th.th_teams_microtask = microtask;
411  this_thr->th.th_teams_level =
412  this_thr->th.th_team->t.t_level; // AC: can be >0 on host
413 
414 #if OMPT_SUPPORT
415  kmp_team_t *parent_team = this_thr->th.th_team;
416  int tid = __kmp_tid_from_gtid(gtid);
417  if (ompt_enabled.enabled) {
418  parent_team->t.t_implicit_task_taskdata[tid]
419  .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
420  }
421  OMPT_STORE_RETURN_ADDRESS(gtid);
422 #endif
423 
424  // check if __kmpc_push_num_teams was called; set the default number of
425  // teams otherwise
426  if (this_thr->th.th_teams_size.nteams == 0) {
427  __kmp_push_num_teams(loc, gtid, 0, 0);
428  }
429  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
430  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
431  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
432 
433  __kmp_fork_call(
434  loc, gtid, fork_context_intel, argc,
435  VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
436  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
437  __kmp_join_call(loc, gtid
438 #if OMPT_SUPPORT
439  ,
440  fork_context_intel
441 #endif
442  );
443 
444  // Pop current CG root off list
445  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
446  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
447  this_thr->th.th_cg_roots = tmp->up;
448  KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
449  " to node %p. cg_nthreads was %d\n",
450  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
451  KMP_DEBUG_ASSERT(tmp->cg_nthreads);
452  int i = tmp->cg_nthreads--;
453  if (i == 1) { // check if we are the last thread in CG (not always the case)
454  __kmp_free(tmp);
455  }
456  // Restore current task's thread_limit from CG root
457  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
458  this_thr->th.th_current_task->td_icvs.thread_limit =
459  this_thr->th.th_cg_roots->cg_thread_limit;
460 
461  this_thr->th.th_teams_microtask = NULL;
462  this_thr->th.th_teams_level = 0;
463  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
464  va_end(ap);
465 #if KMP_STATS_ENABLED
466  if (previous_state == stats_state_e::SERIAL_REGION) {
467  KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
468  KMP_SET_THREAD_STATE(previous_state);
469  } else {
470  KMP_POP_PARTITIONED_TIMER();
471  }
472 #endif // KMP_STATS_ENABLED
473 }
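// --- Illustrative sketch (not part of the upstream source) ------------------
// A rough lowering of
//   #pragma omp teams num_teams(2) thread_limit(8)
// onto __kmpc_push_num_teams()/__kmpc_fork_teams(). Names are hypothetical and
// the loc argument is assumed to come from a compiler-generated ident_t.
#if 0
static void example_teams_outlined(kmp_int32 *gtid, kmp_int32 *bound_tid,
                                   int *counter) {
  // teams-region body, run by the initial thread of each team
  (void)gtid;
  (void)bound_tid;
  (void)counter;
}

void example_teams(ident_t *loc) {
  int counter = 0;
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_push_num_teams(loc, gtid, /*num_teams=*/2, /*num_threads=*/8);
  __kmpc_fork_teams(loc, /*argc=*/1, (kmpc_micro)example_teams_outlined,
                    &counter);
}
#endif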
474 
475 // I don't think this function should ever have been exported.
476 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
477 // openmp code ever called it, but it's been exported from the RTL for so
478 // long that I'm afraid to remove the definition.
479 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
480 
493 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
494  // The implementation is now in kmp_runtime.cpp so that it can share static
495  // functions with kmp_fork_call since the tasks to be done are similar in
496  // each case.
497  __kmp_assert_valid_gtid(global_tid);
498 #if OMPT_SUPPORT
499  OMPT_STORE_RETURN_ADDRESS(global_tid);
500 #endif
501  __kmp_serialized_parallel(loc, global_tid);
502 }
503 
511 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
512  kmp_internal_control_t *top;
513  kmp_info_t *this_thr;
514  kmp_team_t *serial_team;
515 
516  KC_TRACE(10,
517  ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
518 
519  /* skip all this code for autopar serialized loops since it results in
520  unacceptable overhead */
521  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
522  return;
523 
524  // Not autopar code
525  __kmp_assert_valid_gtid(global_tid);
526  if (!TCR_4(__kmp_init_parallel))
527  __kmp_parallel_initialize();
528 
529  __kmp_resume_if_soft_paused();
530 
531  this_thr = __kmp_threads[global_tid];
532  serial_team = this_thr->th.th_serial_team;
533 
534  kmp_task_team_t *task_team = this_thr->th.th_task_team;
535  // we need to wait for the proxy tasks before finishing the thread
536  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
537  __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
538 
539  KMP_MB();
540  KMP_DEBUG_ASSERT(serial_team);
541  KMP_ASSERT(serial_team->t.t_serialized);
542  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
543  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
544  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
545  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
546 
547 #if OMPT_SUPPORT
548  if (ompt_enabled.enabled &&
549  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
550  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
551  if (ompt_enabled.ompt_callback_implicit_task) {
552  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
553  ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
554  OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
555  }
556 
557  // reset/clear the task id only after unlinking the task
558  ompt_data_t *parent_task_data;
559  __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
560 
561  if (ompt_enabled.ompt_callback_parallel_end) {
562  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
563  &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
564  ompt_parallel_invoker_program | ompt_parallel_team,
565  OMPT_LOAD_RETURN_ADDRESS(global_tid));
566  }
567  __ompt_lw_taskteam_unlink(this_thr);
568  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
569  }
570 #endif
571 
572  /* If necessary, pop the internal control stack values and replace the team
573  * values */
574  top = serial_team->t.t_control_stack_top;
575  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
576  copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
577  serial_team->t.t_control_stack_top = top->next;
578  __kmp_free(top);
579  }
580 
581  // if( serial_team -> t.t_serialized > 1 )
582  serial_team->t.t_level--;
583 
584  /* pop dispatch buffers stack */
585  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
586  {
587  dispatch_private_info_t *disp_buffer =
588  serial_team->t.t_dispatch->th_disp_buffer;
589  serial_team->t.t_dispatch->th_disp_buffer =
590  serial_team->t.t_dispatch->th_disp_buffer->next;
591  __kmp_free(disp_buffer);
592  }
593  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
594 
595  --serial_team->t.t_serialized;
596  if (serial_team->t.t_serialized == 0) {
597 
598  /* return to the parallel section */
599 
600 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
601  if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
602  __kmp_clear_x87_fpu_status_word();
603  __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
604  __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
605  }
606 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
607 
608 #if OMPD_SUPPORT
609  if (ompd_state & OMPD_ENABLE_BP)
610  ompd_bp_parallel_end();
611 #endif
612 
613  this_thr->th.th_team = serial_team->t.t_parent;
614  this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
615 
616  /* restore values cached in the thread */
617  this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */
618  this_thr->th.th_team_master =
619  serial_team->t.t_parent->t.t_threads[0]; /* JPH */
620  this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
621 
622  /* TODO the below shouldn't need to be adjusted for serialized teams */
623  this_thr->th.th_dispatch =
624  &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
625 
626  __kmp_pop_current_task_from_thread(this_thr);
627 
628  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
629  this_thr->th.th_current_task->td_flags.executing = 1;
630 
631  if (__kmp_tasking_mode != tskm_immediate_exec) {
632  // Copy the task team from the new child / old parent team to the thread.
633  this_thr->th.th_task_team =
634  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
635  KA_TRACE(20,
636  ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
637  "team %p\n",
638  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
639  }
640  } else {
641  if (__kmp_tasking_mode != tskm_immediate_exec) {
642  KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
643  "depth of serial team %p to %d\n",
644  global_tid, serial_team, serial_team->t.t_serialized));
645  }
646  }
647 
648  if (__kmp_env_consistency_check)
649  __kmp_pop_parallel(global_tid, NULL);
650 #if OMPT_SUPPORT
651  if (ompt_enabled.enabled)
652  this_thr->th.ompt_thread_info.state =
653  ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
654  : ompt_state_work_parallel);
655 #endif
656 }
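// --- Illustrative sketch (not part of the upstream source) ------------------
// For "#pragma omp parallel if(cond)", a compiler can serialize the region when
// cond is false: it brackets a direct call of the outlined body between
// __kmpc_serialized_parallel() and __kmpc_end_serialized_parallel() instead of
// going through __kmpc_fork_call(). example_microtask is the hypothetical
// outlined body from the sketch after __kmpc_fork_call above.
#if 0
void example_parallel_if(ident_t *loc, int cond, int *a) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  if (cond) {
    __kmpc_fork_call(loc, /*argc=*/1, (kmpc_micro)example_microtask, a);
  } else {
    __kmpc_serialized_parallel(loc, gtid);
    kmp_int32 zero_bound_tid = 0;
    example_microtask(&gtid, &zero_bound_tid, a); // body runs on this thread
    __kmpc_end_serialized_parallel(loc, gtid);
  }
}
#endif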
657 
666 void __kmpc_flush(ident_t *loc) {
667  KC_TRACE(10, ("__kmpc_flush: called\n"));
668 
669  /* need an explicit __mf() here since the library uses volatile instead */
670  KMP_MB(); /* Flush all pending memory write invalidates. */
671 
672 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
673 #if KMP_MIC
674 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
675 // We shouldn't need it, though, since the ABI rules require that
676 // * If the compiler generates NGO stores it also generates the fence
677 // * If users hand-code NGO stores they should insert the fence
678 // therefore no incomplete unordered stores should be visible.
679 #else
680  // C74404
681  // This is to address non-temporal store instructions (sfence needed).
682  // The clflush instruction is also addressed (mfence needed).
683  // Probably the non-temporal load movntdqa instruction should also be
684  // addressed.
685  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
686  if (!__kmp_cpuinfo.initialized) {
687  __kmp_query_cpuid(&__kmp_cpuinfo);
688  }
689  if (!__kmp_cpuinfo.sse2) {
690  // CPU cannot execute SSE2 instructions.
691  } else {
692 #if KMP_COMPILER_ICC
693  _mm_mfence();
694 #elif KMP_COMPILER_MSVC
695  MemoryBarrier();
696 #else
697  __sync_synchronize();
698 #endif // KMP_COMPILER_ICC
699  }
700 #endif // KMP_MIC
701 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
702  KMP_ARCH_RISCV64)
703 // Nothing to see here move along
704 #elif KMP_ARCH_PPC64
705 // Nothing needed here (we have a real MB above).
706 #else
707 #error Unknown or unsupported architecture
708 #endif
709 
710 #if OMPT_SUPPORT && OMPT_OPTIONAL
711  if (ompt_enabled.ompt_callback_flush) {
712  ompt_callbacks.ompt_callback(ompt_callback_flush)(
713  __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
714  }
715 #endif
716 }
717 
718 /* -------------------------------------------------------------------------- */
726 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
727  KMP_COUNT_BLOCK(OMP_BARRIER);
728  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
729  __kmp_assert_valid_gtid(global_tid);
730 
731  if (!TCR_4(__kmp_init_parallel))
732  __kmp_parallel_initialize();
733 
734  __kmp_resume_if_soft_paused();
735 
736  if (__kmp_env_consistency_check) {
737  if (loc == 0) {
738  KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
739  }
740  __kmp_check_barrier(global_tid, ct_barrier, loc);
741  }
742 
743 #if OMPT_SUPPORT
744  ompt_frame_t *ompt_frame;
745  if (ompt_enabled.enabled) {
746  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
747  if (ompt_frame->enter_frame.ptr == NULL)
748  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
749  }
750  OMPT_STORE_RETURN_ADDRESS(global_tid);
751 #endif
752  __kmp_threads[global_tid]->th.th_ident = loc;
753  // TODO: explicit barrier_wait_id:
754  // this function is called when 'barrier' directive is present or
755  // implicit barrier at the end of a worksharing construct.
756  // 1) better to add a per-thread barrier counter to a thread data structure
757  // 2) set to 0 when a new team is created
758  // 3) no sync is required
759 
760  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
761 #if OMPT_SUPPORT && OMPT_OPTIONAL
762  if (ompt_enabled.enabled) {
763  ompt_frame->enter_frame = ompt_data_none;
764  }
765 #endif
766 }
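// --- Illustrative sketch (not part of the upstream source) ------------------
// A standalone "#pragma omp barrier" lowers to a single call on each thread of
// the team; the wrapper function and names are hypothetical.
#if 0
void example_barrier(ident_t *loc) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_barrier(loc, gtid); // every thread of the current team waits here
}
#endif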
767 
768 /* The BARRIER for a MASTER section is always explicit */
775 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
776  int status = 0;
777 
778  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
779  __kmp_assert_valid_gtid(global_tid);
780 
781  if (!TCR_4(__kmp_init_parallel))
782  __kmp_parallel_initialize();
783 
784  __kmp_resume_if_soft_paused();
785 
786  if (KMP_MASTER_GTID(global_tid)) {
787  KMP_COUNT_BLOCK(OMP_MASTER);
788  KMP_PUSH_PARTITIONED_TIMER(OMP_master);
789  status = 1;
790  }
791 
792 #if OMPT_SUPPORT && OMPT_OPTIONAL
793  if (status) {
794  if (ompt_enabled.ompt_callback_masked) {
795  kmp_info_t *this_thr = __kmp_threads[global_tid];
796  kmp_team_t *team = this_thr->th.th_team;
797 
798  int tid = __kmp_tid_from_gtid(global_tid);
799  ompt_callbacks.ompt_callback(ompt_callback_masked)(
800  ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
801  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
802  OMPT_GET_RETURN_ADDRESS(0));
803  }
804  }
805 #endif
806 
807  if (__kmp_env_consistency_check) {
808 #if KMP_USE_DYNAMIC_LOCK
809  if (status)
810  __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
811  else
812  __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
813 #else
814  if (status)
815  __kmp_push_sync(global_tid, ct_master, loc, NULL);
816  else
817  __kmp_check_sync(global_tid, ct_master, loc, NULL);
818 #endif
819  }
820 
821  return status;
822 }
823 
832 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
833  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
834  __kmp_assert_valid_gtid(global_tid);
835  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
836  KMP_POP_PARTITIONED_TIMER();
837 
838 #if OMPT_SUPPORT && OMPT_OPTIONAL
839  kmp_info_t *this_thr = __kmp_threads[global_tid];
840  kmp_team_t *team = this_thr->th.th_team;
841  if (ompt_enabled.ompt_callback_masked) {
842  int tid = __kmp_tid_from_gtid(global_tid);
843  ompt_callbacks.ompt_callback(ompt_callback_masked)(
844  ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
845  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
846  OMPT_GET_RETURN_ADDRESS(0));
847  }
848 #endif
849 
850  if (__kmp_env_consistency_check) {
851  if (KMP_MASTER_GTID(global_tid))
852  __kmp_pop_sync(global_tid, ct_master, loc);
853  }
854 }
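// --- Illustrative sketch (not part of the upstream source) ------------------
// "#pragma omp master" lowers to a guarded block: only the thread for which
// __kmpc_master() returns 1 executes the body and calls __kmpc_end_master();
// the construct implies no barrier. Names are hypothetical.
#if 0
void example_master(ident_t *loc, kmp_int32 gtid) {
  if (__kmpc_master(loc, gtid)) {
    // master-only body
    __kmpc_end_master(loc, gtid);
  }
}
#endif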
855 
864 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
865  int status = 0;
866  int tid;
867  KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
868  __kmp_assert_valid_gtid(global_tid);
869 
870  if (!TCR_4(__kmp_init_parallel))
871  __kmp_parallel_initialize();
872 
873  __kmp_resume_if_soft_paused();
874 
875  tid = __kmp_tid_from_gtid(global_tid);
876  if (tid == filter) {
877  KMP_COUNT_BLOCK(OMP_MASKED);
878  KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
879  status = 1;
880  }
881 
882 #if OMPT_SUPPORT && OMPT_OPTIONAL
883  if (status) {
884  if (ompt_enabled.ompt_callback_masked) {
885  kmp_info_t *this_thr = __kmp_threads[global_tid];
886  kmp_team_t *team = this_thr->th.th_team;
887  ompt_callbacks.ompt_callback(ompt_callback_masked)(
888  ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
889  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
890  OMPT_GET_RETURN_ADDRESS(0));
891  }
892  }
893 #endif
894 
895  if (__kmp_env_consistency_check) {
896 #if KMP_USE_DYNAMIC_LOCK
897  if (status)
898  __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
899  else
900  __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
901 #else
902  if (status)
903  __kmp_push_sync(global_tid, ct_masked, loc, NULL);
904  else
905  __kmp_check_sync(global_tid, ct_masked, loc, NULL);
906 #endif
907  }
908 
909  return status;
910 }
911 
920 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
921  KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
922  __kmp_assert_valid_gtid(global_tid);
923  KMP_POP_PARTITIONED_TIMER();
924 
925 #if OMPT_SUPPORT && OMPT_OPTIONAL
926  kmp_info_t *this_thr = __kmp_threads[global_tid];
927  kmp_team_t *team = this_thr->th.th_team;
928  if (ompt_enabled.ompt_callback_masked) {
929  int tid = __kmp_tid_from_gtid(global_tid);
930  ompt_callbacks.ompt_callback(ompt_callback_masked)(
931  ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
932  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
933  OMPT_GET_RETURN_ADDRESS(0));
934  }
935 #endif
936 
937  if (__kmp_env_consistency_check) {
938  __kmp_pop_sync(global_tid, ct_masked, loc);
939  }
940 }
941 
949 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
950  int cid = 0;
951  kmp_info_t *th;
952  KMP_DEBUG_ASSERT(__kmp_init_serial);
953 
954  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
955  __kmp_assert_valid_gtid(gtid);
956 
957  if (!TCR_4(__kmp_init_parallel))
958  __kmp_parallel_initialize();
959 
960  __kmp_resume_if_soft_paused();
961 
962 #if USE_ITT_BUILD
963  __kmp_itt_ordered_prep(gtid);
964 // TODO: ordered_wait_id
965 #endif /* USE_ITT_BUILD */
966 
967  th = __kmp_threads[gtid];
968 
969 #if OMPT_SUPPORT && OMPT_OPTIONAL
970  kmp_team_t *team;
971  ompt_wait_id_t lck;
972  void *codeptr_ra;
973  OMPT_STORE_RETURN_ADDRESS(gtid);
974  if (ompt_enabled.enabled) {
975  team = __kmp_team_from_gtid(gtid);
976  lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
977  /* OMPT state update */
978  th->th.ompt_thread_info.wait_id = lck;
979  th->th.ompt_thread_info.state = ompt_state_wait_ordered;
980 
981  /* OMPT event callback */
982  codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
983  if (ompt_enabled.ompt_callback_mutex_acquire) {
984  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
985  ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
986  codeptr_ra);
987  }
988  }
989 #endif
990 
991  if (th->th.th_dispatch->th_deo_fcn != 0)
992  (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
993  else
994  __kmp_parallel_deo(&gtid, &cid, loc);
995 
996 #if OMPT_SUPPORT && OMPT_OPTIONAL
997  if (ompt_enabled.enabled) {
998  /* OMPT state update */
999  th->th.ompt_thread_info.state = ompt_state_work_parallel;
1000  th->th.ompt_thread_info.wait_id = 0;
1001 
1002  /* OMPT event callback */
1003  if (ompt_enabled.ompt_callback_mutex_acquired) {
1004  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1005  ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1006  }
1007  }
1008 #endif
1009 
1010 #if USE_ITT_BUILD
1011  __kmp_itt_ordered_start(gtid);
1012 #endif /* USE_ITT_BUILD */
1013 }
1014 
1022 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1023  int cid = 0;
1024  kmp_info_t *th;
1025 
1026  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1027  __kmp_assert_valid_gtid(gtid);
1028 
1029 #if USE_ITT_BUILD
1030  __kmp_itt_ordered_end(gtid);
1031 // TODO: ordered_wait_id
1032 #endif /* USE_ITT_BUILD */
1033 
1034  th = __kmp_threads[gtid];
1035 
1036  if (th->th.th_dispatch->th_dxo_fcn != 0)
1037  (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1038  else
1039  __kmp_parallel_dxo(&gtid, &cid, loc);
1040 
1041 #if OMPT_SUPPORT && OMPT_OPTIONAL
1042  OMPT_STORE_RETURN_ADDRESS(gtid);
1043  if (ompt_enabled.ompt_callback_mutex_released) {
1044  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1045  ompt_mutex_ordered,
1046  (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1047  ->t.t_ordered.dt.t_value,
1048  OMPT_LOAD_RETURN_ADDRESS(gtid));
1049  }
1050 #endif
1051 }
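// --- Illustrative sketch (not part of the upstream source) ------------------
// Inside a loop with an ordered clause, each "#pragma omp ordered" block is
// bracketed by __kmpc_ordered()/__kmpc_end_ordered() so the blocks complete in
// iteration order. The dynamic loop-dispatch calls that normally surround this
// are omitted; names are hypothetical.
#if 0
void example_ordered_block(ident_t *loc, kmp_int32 gtid, int i) {
  __kmpc_ordered(loc, gtid);
  // ordered-region body for iteration i
  (void)i;
  __kmpc_end_ordered(loc, gtid);
}
#endif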
1052 
1053 #if KMP_USE_DYNAMIC_LOCK
1054 
1055 static __forceinline void
1056 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1057  kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1058  // Pointer to the allocated indirect lock is written to crit, while indexing
1059  // is ignored.
1060  void *idx;
1061  kmp_indirect_lock_t **lck;
1062  lck = (kmp_indirect_lock_t **)crit;
1063  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1064  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1065  KMP_SET_I_LOCK_LOCATION(ilk, loc);
1066  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1067  KA_TRACE(20,
1068  ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1069 #if USE_ITT_BUILD
1070  __kmp_itt_critical_creating(ilk->lock, loc);
1071 #endif
1072  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1073  if (status == 0) {
1074 #if USE_ITT_BUILD
1075  __kmp_itt_critical_destroyed(ilk->lock);
1076 #endif
1077  // We don't really need to destroy the unclaimed lock here since it will be
1078  // cleaned up at program exit.
1079  // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1080  }
1081  KMP_DEBUG_ASSERT(*lck != NULL);
1082 }
1083 
1084 // Fast-path acquire tas lock
1085 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \
1086  { \
1087  kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
1088  kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
1089  kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
1090  if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
1091  !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \
1092  kmp_uint32 spins; \
1093  KMP_FSYNC_PREPARE(l); \
1094  KMP_INIT_YIELD(spins); \
1095  kmp_backoff_t backoff = __kmp_spin_backoff_params; \
1096  do { \
1097  if (TCR_4(__kmp_nth) > \
1098  (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
1099  KMP_YIELD(TRUE); \
1100  } else { \
1101  KMP_YIELD_SPIN(spins); \
1102  } \
1103  __kmp_spin_backoff(&backoff); \
1104  } while ( \
1105  KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
1106  !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)); \
1107  } \
1108  KMP_FSYNC_ACQUIRED(l); \
1109  }
1110 
1111 // Fast-path test tas lock
1112 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) \
1113  { \
1114  kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
1115  kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
1116  kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
1117  rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \
1118  __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \
1119  }
1120 
1121 // Fast-path release tas lock
1122 #define KMP_RELEASE_TAS_LOCK(lock, gtid) \
1123  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1124 
1125 #if KMP_USE_FUTEX
1126 
1127 #include <sys/syscall.h>
1128 #include <unistd.h>
1129 #ifndef FUTEX_WAIT
1130 #define FUTEX_WAIT 0
1131 #endif
1132 #ifndef FUTEX_WAKE
1133 #define FUTEX_WAKE 1
1134 #endif
1135 
1136 // Fast-path acquire futex lock
1137 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \
1138  { \
1139  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1140  kmp_int32 gtid_code = (gtid + 1) << 1; \
1141  KMP_MB(); \
1142  KMP_FSYNC_PREPARE(ftx); \
1143  kmp_int32 poll_val; \
1144  while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \
1145  &(ftx->lk.poll), KMP_LOCK_FREE(futex), \
1146  KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \
1147  kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \
1148  if (!cond) { \
1149  if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \
1150  poll_val | \
1151  KMP_LOCK_BUSY(1, futex))) { \
1152  continue; \
1153  } \
1154  poll_val |= KMP_LOCK_BUSY(1, futex); \
1155  } \
1156  kmp_int32 rc; \
1157  if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \
1158  NULL, NULL, 0)) != 0) { \
1159  continue; \
1160  } \
1161  gtid_code |= 1; \
1162  } \
1163  KMP_FSYNC_ACQUIRED(ftx); \
1164  }
1165 
1166 // Fast-path test futex lock
1167 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \
1168  { \
1169  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1170  if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \
1171  KMP_LOCK_BUSY(gtid + 1 << 1, futex))) { \
1172  KMP_FSYNC_ACQUIRED(ftx); \
1173  rc = TRUE; \
1174  } else { \
1175  rc = FALSE; \
1176  } \
1177  }
1178 
1179 // Fast-path release futex lock
1180 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \
1181  { \
1182  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1183  KMP_MB(); \
1184  KMP_FSYNC_RELEASING(ftx); \
1185  kmp_int32 poll_val = \
1186  KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \
1187  if (KMP_LOCK_STRIP(poll_val) & 1) { \
1188  syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \
1189  KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \
1190  } \
1191  KMP_MB(); \
1192  KMP_YIELD_OVERSUB(); \
1193  }
1194 
1195 #endif // KMP_USE_FUTEX
1196 
1197 #else // KMP_USE_DYNAMIC_LOCK
1198 
1199 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1200  ident_t const *loc,
1201  kmp_int32 gtid) {
1202  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1203 
1204  // Because of the double-check, the following load doesn't need to be volatile
1205  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1206 
1207  if (lck == NULL) {
1208  void *idx;
1209 
1210  // Allocate & initialize the lock.
1211  // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1212  lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1213  __kmp_init_user_lock_with_checks(lck);
1214  __kmp_set_user_lock_location(lck, loc);
1215 #if USE_ITT_BUILD
1216  __kmp_itt_critical_creating(lck);
1217 // __kmp_itt_critical_creating() should be called *before* the first usage
1218 // of the underlying lock. It is the only place where we can guarantee it.
1219 // There is a chance the lock will be destroyed with no usage, but that is
1220 // not a problem, because this is not a real event seen by the user but
1221 // rather setting a name for the object (lock). See more details in kmp_itt.h.
1222 #endif /* USE_ITT_BUILD */
1223 
1224  // Use a cmpxchg instruction to slam the start of the critical section with
1225  // the lock pointer. If another thread beat us to it, deallocate the lock,
1226  // and use the lock that the other thread allocated.
1227  int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1228 
1229  if (status == 0) {
1230 // Deallocate the lock and reload the value.
1231 #if USE_ITT_BUILD
1232  __kmp_itt_critical_destroyed(lck);
1233 // Let ITT know the lock is destroyed and the same memory location may be reused
1234 // for another purpose.
1235 #endif /* USE_ITT_BUILD */
1236  __kmp_destroy_user_lock_with_checks(lck);
1237  __kmp_user_lock_free(&idx, gtid, lck);
1238  lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1239  KMP_DEBUG_ASSERT(lck != NULL);
1240  }
1241  }
1242  return lck;
1243 }
1244 
1245 #endif // KMP_USE_DYNAMIC_LOCK
1246 
1257 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1258  kmp_critical_name *crit) {
1259 #if KMP_USE_DYNAMIC_LOCK
1260 #if OMPT_SUPPORT && OMPT_OPTIONAL
1261  OMPT_STORE_RETURN_ADDRESS(global_tid);
1262 #endif // OMPT_SUPPORT
1263  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1264 #else
1265  KMP_COUNT_BLOCK(OMP_CRITICAL);
1266 #if OMPT_SUPPORT && OMPT_OPTIONAL
1267  ompt_state_t prev_state = ompt_state_undefined;
1268  ompt_thread_info_t ti;
1269 #endif
1270  kmp_user_lock_p lck;
1271 
1272  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1273  __kmp_assert_valid_gtid(global_tid);
1274 
1275  // TODO: add THR_OVHD_STATE
1276 
1277  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1278  KMP_CHECK_USER_LOCK_INIT();
1279 
1280  if ((__kmp_user_lock_kind == lk_tas) &&
1281  (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1282  lck = (kmp_user_lock_p)crit;
1283  }
1284 #if KMP_USE_FUTEX
1285  else if ((__kmp_user_lock_kind == lk_futex) &&
1286  (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1287  lck = (kmp_user_lock_p)crit;
1288  }
1289 #endif
1290  else { // ticket, queuing or drdpa
1291  lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1292  }
1293 
1294  if (__kmp_env_consistency_check)
1295  __kmp_push_sync(global_tid, ct_critical, loc, lck);
1296 
1297  // Since the critical directive binds to all threads, not just the current
1298  // team, we have to check this even if we are in a serialized team.
1299  // Also, even if we are the uber thread, we still have to acquire the lock,
1300  // as we have to contend with sibling threads.
1301 
1302 #if USE_ITT_BUILD
1303  __kmp_itt_critical_acquiring(lck);
1304 #endif /* USE_ITT_BUILD */
1305 #if OMPT_SUPPORT && OMPT_OPTIONAL
1306  OMPT_STORE_RETURN_ADDRESS(global_tid);
1307  void *codeptr_ra = NULL;
1308  if (ompt_enabled.enabled) {
1309  ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1310  /* OMPT state update */
1311  prev_state = ti.state;
1312  ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1313  ti.state = ompt_state_wait_critical;
1314 
1315  /* OMPT event callback */
1316  codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1317  if (ompt_enabled.ompt_callback_mutex_acquire) {
1318  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1319  ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1320  (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1321  }
1322  }
1323 #endif
1324  // The value of 'crit' should be usable as the critical_id of the critical
1325  // section directive.
1326  __kmp_acquire_user_lock_with_checks(lck, global_tid);
1327 
1328 #if USE_ITT_BUILD
1329  __kmp_itt_critical_acquired(lck);
1330 #endif /* USE_ITT_BUILD */
1331 #if OMPT_SUPPORT && OMPT_OPTIONAL
1332  if (ompt_enabled.enabled) {
1333  /* OMPT state update */
1334  ti.state = prev_state;
1335  ti.wait_id = 0;
1336 
1337  /* OMPT event callback */
1338  if (ompt_enabled.ompt_callback_mutex_acquired) {
1339  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1340  ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1341  }
1342  }
1343 #endif
1344  KMP_POP_PARTITIONED_TIMER();
1345 
1346  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1347  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1348 #endif // KMP_USE_DYNAMIC_LOCK
1349 }
1350 
1351 #if KMP_USE_DYNAMIC_LOCK
1352 
1353 // Converts the given hint to an internal lock implementation
1354 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1355 #if KMP_USE_TSX
1356 #define KMP_TSX_LOCK(seq) lockseq_##seq
1357 #else
1358 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1359 #endif
1360 
1361 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1362 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1363 #else
1364 #define KMP_CPUINFO_RTM 0
1365 #endif
1366 
1367  // Hints that do not require further logic
1368  if (hint & kmp_lock_hint_hle)
1369  return KMP_TSX_LOCK(hle);
1370  if (hint & kmp_lock_hint_rtm)
1371  return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1372  if (hint & kmp_lock_hint_adaptive)
1373  return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1374 
1375  // Rule out conflicting hints first by returning the default lock
1376  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1377  return __kmp_user_lock_seq;
1378  if ((hint & omp_lock_hint_speculative) &&
1379  (hint & omp_lock_hint_nonspeculative))
1380  return __kmp_user_lock_seq;
1381 
1382  // Do not even consider speculation when it appears to be contended
1383  if (hint & omp_lock_hint_contended)
1384  return lockseq_queuing;
1385 
1386  // Uncontended lock without speculation
1387  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1388  return lockseq_tas;
1389 
1390  // Use RTM lock for speculation
1391  if (hint & omp_lock_hint_speculative)
1392  return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1393 
1394  return __kmp_user_lock_seq;
1395 }
1396 
1397 #if OMPT_SUPPORT && OMPT_OPTIONAL
1398 #if KMP_USE_DYNAMIC_LOCK
1399 static kmp_mutex_impl_t
1400 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1401  if (user_lock) {
1402  switch (KMP_EXTRACT_D_TAG(user_lock)) {
1403  case 0:
1404  break;
1405 #if KMP_USE_FUTEX
1406  case locktag_futex:
1407  return kmp_mutex_impl_queuing;
1408 #endif
1409  case locktag_tas:
1410  return kmp_mutex_impl_spin;
1411 #if KMP_USE_TSX
1412  case locktag_hle:
1413  case locktag_rtm_spin:
1414  return kmp_mutex_impl_speculative;
1415 #endif
1416  default:
1417  return kmp_mutex_impl_none;
1418  }
1419  ilock = KMP_LOOKUP_I_LOCK(user_lock);
1420  }
1421  KMP_ASSERT(ilock);
1422  switch (ilock->type) {
1423 #if KMP_USE_TSX
1424  case locktag_adaptive:
1425  case locktag_rtm_queuing:
1426  return kmp_mutex_impl_speculative;
1427 #endif
1428  case locktag_nested_tas:
1429  return kmp_mutex_impl_spin;
1430 #if KMP_USE_FUTEX
1431  case locktag_nested_futex:
1432 #endif
1433  case locktag_ticket:
1434  case locktag_queuing:
1435  case locktag_drdpa:
1436  case locktag_nested_ticket:
1437  case locktag_nested_queuing:
1438  case locktag_nested_drdpa:
1439  return kmp_mutex_impl_queuing;
1440  default:
1441  return kmp_mutex_impl_none;
1442  }
1443 }
1444 #else
1445 // For locks without dynamic binding
1446 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1447  switch (__kmp_user_lock_kind) {
1448  case lk_tas:
1449  return kmp_mutex_impl_spin;
1450 #if KMP_USE_FUTEX
1451  case lk_futex:
1452 #endif
1453  case lk_ticket:
1454  case lk_queuing:
1455  case lk_drdpa:
1456  return kmp_mutex_impl_queuing;
1457 #if KMP_USE_TSX
1458  case lk_hle:
1459  case lk_rtm_queuing:
1460  case lk_rtm_spin:
1461  case lk_adaptive:
1462  return kmp_mutex_impl_speculative;
1463 #endif
1464  default:
1465  return kmp_mutex_impl_none;
1466  }
1467 }
1468 #endif // KMP_USE_DYNAMIC_LOCK
1469 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1470 
1484 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1485  kmp_critical_name *crit, uint32_t hint) {
1486  KMP_COUNT_BLOCK(OMP_CRITICAL);
1487  kmp_user_lock_p lck;
1488 #if OMPT_SUPPORT && OMPT_OPTIONAL
1489  ompt_state_t prev_state = ompt_state_undefined;
1490  ompt_thread_info_t ti;
1491  // This is the case if called from __kmpc_critical:
1492  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1493  if (!codeptr)
1494  codeptr = OMPT_GET_RETURN_ADDRESS(0);
1495 #endif
1496 
1497  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1498  __kmp_assert_valid_gtid(global_tid);
1499 
1500  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1501  // Check if it is initialized.
1502  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1503  kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1504  if (*lk == 0) {
1505  if (KMP_IS_D_LOCK(lockseq)) {
1506  KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1507  KMP_GET_D_TAG(lockseq));
1508  } else {
1509  __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1510  }
1511  }
1512  // Branch for accessing the actual lock object and set operation. This
1513  // branching is inevitable since this lock initialization does not follow the
1514  // normal dispatch path (lock table is not used).
1515  if (KMP_EXTRACT_D_TAG(lk) != 0) {
1516  lck = (kmp_user_lock_p)lk;
1517  if (__kmp_env_consistency_check) {
1518  __kmp_push_sync(global_tid, ct_critical, loc, lck,
1519  __kmp_map_hint_to_lock(hint));
1520  }
1521 #if USE_ITT_BUILD
1522  __kmp_itt_critical_acquiring(lck);
1523 #endif
1524 #if OMPT_SUPPORT && OMPT_OPTIONAL
1525  if (ompt_enabled.enabled) {
1526  ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1527  /* OMPT state update */
1528  prev_state = ti.state;
1529  ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1530  ti.state = ompt_state_wait_critical;
1531 
1532  /* OMPT event callback */
1533  if (ompt_enabled.ompt_callback_mutex_acquire) {
1534  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1535  ompt_mutex_critical, (unsigned int)hint,
1536  __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1537  codeptr);
1538  }
1539  }
1540 #endif
1541 #if KMP_USE_INLINED_TAS
1542  if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1543  KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1544  } else
1545 #elif KMP_USE_INLINED_FUTEX
1546  if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1547  KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1548  } else
1549 #endif
1550  {
1551  KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1552  }
1553  } else {
1554  kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1555  lck = ilk->lock;
1556  if (__kmp_env_consistency_check) {
1557  __kmp_push_sync(global_tid, ct_critical, loc, lck,
1558  __kmp_map_hint_to_lock(hint));
1559  }
1560 #if USE_ITT_BUILD
1561  __kmp_itt_critical_acquiring(lck);
1562 #endif
1563 #if OMPT_SUPPORT && OMPT_OPTIONAL
1564  if (ompt_enabled.enabled) {
1565  ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1566  /* OMPT state update */
1567  prev_state = ti.state;
1568  ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1569  ti.state = ompt_state_wait_critical;
1570 
1571  /* OMPT event callback */
1572  if (ompt_enabled.ompt_callback_mutex_acquire) {
1573  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1574  ompt_mutex_critical, (unsigned int)hint,
1575  __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1576  codeptr);
1577  }
1578  }
1579 #endif
1580  KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1581  }
1582  KMP_POP_PARTITIONED_TIMER();
1583 
1584 #if USE_ITT_BUILD
1585  __kmp_itt_critical_acquired(lck);
1586 #endif /* USE_ITT_BUILD */
1587 #if OMPT_SUPPORT && OMPT_OPTIONAL
1588  if (ompt_enabled.enabled) {
1589  /* OMPT state update */
1590  ti.state = prev_state;
1591  ti.wait_id = 0;
1592 
1593  /* OMPT event callback */
1594  if (ompt_enabled.ompt_callback_mutex_acquired) {
1595  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1596  ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1597  }
1598  }
1599 #endif
1600 
1601  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1602  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1603 } // __kmpc_critical_with_hint
1604 
1605 #endif // KMP_USE_DYNAMIC_LOCK
1606 
1616 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1617  kmp_critical_name *crit) {
1618  kmp_user_lock_p lck;
1619 
1620  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1621 
1622 #if KMP_USE_DYNAMIC_LOCK
1623  int locktag = KMP_EXTRACT_D_TAG(crit);
1624  if (locktag) {
1625  lck = (kmp_user_lock_p)crit;
1626  KMP_ASSERT(lck != NULL);
1627  if (__kmp_env_consistency_check) {
1628  __kmp_pop_sync(global_tid, ct_critical, loc);
1629  }
1630 #if USE_ITT_BUILD
1631  __kmp_itt_critical_releasing(lck);
1632 #endif
1633 #if KMP_USE_INLINED_TAS
1634  if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1635  KMP_RELEASE_TAS_LOCK(lck, global_tid);
1636  } else
1637 #elif KMP_USE_INLINED_FUTEX
1638  if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1639  KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1640  } else
1641 #endif
1642  {
1643  KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1644  }
1645  } else {
1646  kmp_indirect_lock_t *ilk =
1647  (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1648  KMP_ASSERT(ilk != NULL);
1649  lck = ilk->lock;
1650  if (__kmp_env_consistency_check) {
1651  __kmp_pop_sync(global_tid, ct_critical, loc);
1652  }
1653 #if USE_ITT_BUILD
1654  __kmp_itt_critical_releasing(lck);
1655 #endif
1656  KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1657  }
1658 
1659 #else // KMP_USE_DYNAMIC_LOCK
1660 
1661  if ((__kmp_user_lock_kind == lk_tas) &&
1662  (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1663  lck = (kmp_user_lock_p)crit;
1664  }
1665 #if KMP_USE_FUTEX
1666  else if ((__kmp_user_lock_kind == lk_futex) &&
1667  (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1668  lck = (kmp_user_lock_p)crit;
1669  }
1670 #endif
1671  else { // ticket, queuing or drdpa
1672  lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1673  }
1674 
1675  KMP_ASSERT(lck != NULL);
1676 
1677  if (__kmp_env_consistency_check)
1678  __kmp_pop_sync(global_tid, ct_critical, loc);
1679 
1680 #if USE_ITT_BUILD
1681  __kmp_itt_critical_releasing(lck);
1682 #endif /* USE_ITT_BUILD */
1683  // The value of 'crit' should be usable as the critical_id of the critical
1684  // section directive.
1685  __kmp_release_user_lock_with_checks(lck, global_tid);
1686 
1687 #endif // KMP_USE_DYNAMIC_LOCK
1688 
1689 #if OMPT_SUPPORT && OMPT_OPTIONAL
1690  /* OMPT release event triggers after lock is released; place here to trigger
1691  * for all #if branches */
1692  OMPT_STORE_RETURN_ADDRESS(global_tid);
1693  if (ompt_enabled.ompt_callback_mutex_released) {
1694  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1695  ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1696  OMPT_LOAD_RETURN_ADDRESS(0));
1697  }
1698 #endif
1699 
1700  KMP_POP_PARTITIONED_TIMER();
1701  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1702 }
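// --- Illustrative sketch (not part of the upstream source) ------------------
// "#pragma omp critical(name)" lowers to a zero-initialized, per-name
// kmp_critical_name that the runtime lazily turns into a lock, bracketed by
// __kmpc_critical()/__kmpc_end_critical(). With a hint clause (and a dynamic-
// lock build) the entry call becomes __kmpc_critical_with_hint(). Names are
// hypothetical.
#if 0
static kmp_critical_name example_crit; // one zeroed object per critical name

void example_critical(ident_t *loc, kmp_int32 gtid, int *sum, int v) {
  __kmpc_critical(loc, gtid, &example_crit);
  *sum += v; // critical-region body
  __kmpc_end_critical(loc, gtid, &example_crit);
}
#endif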
1703 
1713 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1714  int status;
1715  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1716  __kmp_assert_valid_gtid(global_tid);
1717 
1718  if (!TCR_4(__kmp_init_parallel))
1719  __kmp_parallel_initialize();
1720 
1721  __kmp_resume_if_soft_paused();
1722 
1723  if (__kmp_env_consistency_check)
1724  __kmp_check_barrier(global_tid, ct_barrier, loc);
1725 
1726 #if OMPT_SUPPORT
1727  ompt_frame_t *ompt_frame;
1728  if (ompt_enabled.enabled) {
1729  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1730  if (ompt_frame->enter_frame.ptr == NULL)
1731  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1732  }
1733  OMPT_STORE_RETURN_ADDRESS(global_tid);
1734 #endif
1735 #if USE_ITT_NOTIFY
1736  __kmp_threads[global_tid]->th.th_ident = loc;
1737 #endif
1738  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1739 #if OMPT_SUPPORT && OMPT_OPTIONAL
1740  if (ompt_enabled.enabled) {
1741  ompt_frame->enter_frame = ompt_data_none;
1742  }
1743 #endif
1744 
1745  return (status != 0) ? 0 : 1;
1746 }
1747 
1757 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1758  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1759  __kmp_assert_valid_gtid(global_tid);
1760  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1761 }
1762 
1773 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1774  kmp_int32 ret;
1775  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1776  __kmp_assert_valid_gtid(global_tid);
1777 
1778  if (!TCR_4(__kmp_init_parallel))
1779  __kmp_parallel_initialize();
1780 
1781  __kmp_resume_if_soft_paused();
1782 
1783  if (__kmp_env_consistency_check) {
1784  if (loc == 0) {
1785  KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1786  }
1787  __kmp_check_barrier(global_tid, ct_barrier, loc);
1788  }
1789 
1790 #if OMPT_SUPPORT
1791  ompt_frame_t *ompt_frame;
1792  if (ompt_enabled.enabled) {
1793  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1794  if (ompt_frame->enter_frame.ptr == NULL)
1795  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1796  }
1797  OMPT_STORE_RETURN_ADDRESS(global_tid);
1798 #endif
1799 #if USE_ITT_NOTIFY
1800  __kmp_threads[global_tid]->th.th_ident = loc;
1801 #endif
1802  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1803 #if OMPT_SUPPORT && OMPT_OPTIONAL
1804  if (ompt_enabled.enabled) {
1805  ompt_frame->enter_frame = ompt_data_none;
1806  }
1807 #endif
1808 
1809  ret = __kmpc_master(loc, global_tid);
1810 
1811  if (__kmp_env_consistency_check) {
1812  /* there's no __kmpc_end_master called; so the (stats) */
1813  /* actions of __kmpc_end_master are done here */
1814  if (ret) {
1815  /* only one thread should do the pop since only */
1816  /* one did the push (see __kmpc_master()) */
1817  __kmp_pop_sync(global_tid, ct_master, loc);
1818  }
1819  }
1820 
1821  return (ret);
1822 }
1823 
1824 /* The BARRIER for a SINGLE process section is always explicit */
1836 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1837  __kmp_assert_valid_gtid(global_tid);
1838  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1839 
1840  if (rc) {
1841  // We are going to execute the single statement, so we should count it.
1842  KMP_COUNT_BLOCK(OMP_SINGLE);
1843  KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1844  }
1845 
1846 #if OMPT_SUPPORT && OMPT_OPTIONAL
1847  kmp_info_t *this_thr = __kmp_threads[global_tid];
1848  kmp_team_t *team = this_thr->th.th_team;
1849  int tid = __kmp_tid_from_gtid(global_tid);
1850 
1851  if (ompt_enabled.enabled) {
1852  if (rc) {
1853  if (ompt_enabled.ompt_callback_work) {
1854  ompt_callbacks.ompt_callback(ompt_callback_work)(
1855  ompt_work_single_executor, ompt_scope_begin,
1856  &(team->t.ompt_team_info.parallel_data),
1857  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1858  1, OMPT_GET_RETURN_ADDRESS(0));
1859  }
1860  } else {
1861  if (ompt_enabled.ompt_callback_work) {
1862  ompt_callbacks.ompt_callback(ompt_callback_work)(
1863  ompt_work_single_other, ompt_scope_begin,
1864  &(team->t.ompt_team_info.parallel_data),
1865  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1866  1, OMPT_GET_RETURN_ADDRESS(0));
1867  ompt_callbacks.ompt_callback(ompt_callback_work)(
1868  ompt_work_single_other, ompt_scope_end,
1869  &(team->t.ompt_team_info.parallel_data),
1870  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1871  1, OMPT_GET_RETURN_ADDRESS(0));
1872  }
1873  }
1874  }
1875 #endif
1876 
1877  return rc;
1878 }
1879 
1889 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1890  __kmp_assert_valid_gtid(global_tid);
1891  __kmp_exit_single(global_tid);
1892  KMP_POP_PARTITIONED_TIMER();
1893 
1894 #if OMPT_SUPPORT && OMPT_OPTIONAL
1895  kmp_info_t *this_thr = __kmp_threads[global_tid];
1896  kmp_team_t *team = this_thr->th.th_team;
1897  int tid = __kmp_tid_from_gtid(global_tid);
1898 
1899  if (ompt_enabled.ompt_callback_work) {
1900  ompt_callbacks.ompt_callback(ompt_callback_work)(
1901  ompt_work_single_executor, ompt_scope_end,
1902  &(team->t.ompt_team_info.parallel_data),
1903  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1904  OMPT_GET_RETURN_ADDRESS(0));
1905  }
1906 #endif
1907 }
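// --- Illustrative sketch (not part of the upstream source) ------------------
// "#pragma omp single" lowers to a guarded block followed by the construct's
// implicit barrier (omitted when nowait is specified). Names are hypothetical.
#if 0
void example_single(ident_t *loc, kmp_int32 gtid) {
  if (__kmpc_single(loc, gtid)) {
    // body executed by exactly one thread of the team
    __kmpc_end_single(loc, gtid);
  }
  __kmpc_barrier(loc, gtid); // implicit barrier; dropped for nowait
}
#endif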
1908 
1916 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1917  KMP_POP_PARTITIONED_TIMER();
1918  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1919 
1920 #if OMPT_SUPPORT && OMPT_OPTIONAL
1921  if (ompt_enabled.ompt_callback_work) {
1922  ompt_work_t ompt_work_type = ompt_work_loop;
1923  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1924  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1925  // Determine workshare type
1926  if (loc != NULL) {
1927  if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1928  ompt_work_type = ompt_work_loop;
1929  } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1930  ompt_work_type = ompt_work_sections;
1931  } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1932  ompt_work_type = ompt_work_distribute;
1933  } else {
1934  // use default set above.
1935  // a warning about this case is provided in __kmpc_for_static_init
1936  }
1937  KMP_DEBUG_ASSERT(ompt_work_type);
1938  }
1939  ompt_callbacks.ompt_callback(ompt_callback_work)(
1940  ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1941  &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1942  }
1943 #endif
1944  if (__kmp_env_consistency_check)
1945  __kmp_pop_workshare(global_tid, ct_pdo, loc);
1946 }
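// --- Illustrative sketch (not part of the upstream source) ------------------
// __kmpc_for_static_fini() closes a statically scheduled worksharing loop that
// was opened by one of the __kmpc_for_static_init_* entry points (defined in
// kmp_sched.cpp). A rough lowering of "#pragma omp for schedule(static) nowait"
// over i = 0..n-1 might look like the sketch below; names are hypothetical and
// the exact init call and arguments depend on the compiler.
#if 0
void example_static_for(ident_t *loc, kmp_int32 gtid, int n) {
  kmp_int32 lower = 0, upper = n - 1, stride = 1, last = 0;
  __kmpc_for_static_init_4(loc, gtid, kmp_sch_static, &last, &lower, &upper,
                           &stride, /*incr=*/1, /*chunk=*/1);
  for (int i = lower; i <= upper; ++i) {
    // loop body for iteration i
  }
  __kmpc_for_static_fini(loc, gtid);
  // without nowait the compiler would also emit __kmpc_barrier(loc, gtid) here
}
#endif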
1947 
1948 // User routines which take C-style arguments (call by value)
1949 // different from the Fortran equivalent routines
1950 
1951 void ompc_set_num_threads(int arg) {
1952  // !!!!! TODO: check the per-task binding
1953  __kmp_set_num_threads(arg, __kmp_entry_gtid());
1954 }
1955 
1956 void ompc_set_dynamic(int flag) {
1957  kmp_info_t *thread;
1958 
1959  /* For the thread-private implementation of the internal controls */
1960  thread = __kmp_entry_thread();
1961 
1962  __kmp_save_internal_controls(thread);
1963 
1964  set__dynamic(thread, flag ? true : false);
1965 }
1966 
1967 void ompc_set_nested(int flag) {
1968  kmp_info_t *thread;
1969 
1970  /* For the thread-private internal controls implementation */
1971  thread = __kmp_entry_thread();
1972 
1973  __kmp_save_internal_controls(thread);
1974 
1975  set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1976 }
1977 
1978 void ompc_set_max_active_levels(int max_active_levels) {
1979  /* TO DO */
1980  /* we want per-task implementation of this internal control */
1981 
1982  /* For the per-thread internal controls implementation */
1983  __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1984 }
1985 
1986 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1987  // !!!!! TODO: check the per-task binding
1988  __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1989 }
1990 
1991 int ompc_get_ancestor_thread_num(int level) {
1992  return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1993 }
1994 
1995 int ompc_get_team_size(int level) {
1996  return __kmp_get_team_size(__kmp_entry_gtid(), level);
1997 }
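// Illustrative only: a hedged sketch of the user-facing omp_* API that
// corresponds to the call-by-value helpers above (the wrappers that forward to
// these entry points live elsewhere in the runtime).
/*
  #include <omp.h>
  #include <stdio.h>

  void icv_example(void) {
    omp_set_dynamic(0);           // cf. ompc_set_dynamic()
    omp_set_max_active_levels(2); // cf. ompc_set_max_active_levels()
    omp_set_schedule(omp_sched_static, 4);
    #pragma omp parallel num_threads(4)
    {
      #pragma omp parallel num_threads(2)
      {
        // ancestor thread number and team size at nesting level 1,
        // cf. ompc_get_ancestor_thread_num() and ompc_get_team_size()
        printf("ancestor=%d team_size=%d\n", omp_get_ancestor_thread_num(1),
               omp_get_team_size(1));
      }
    }
  }
*/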
1998 
1999 /* OpenMP 5.0 Affinity Format API */
2000 void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
2001  if (!__kmp_init_serial) {
2002  __kmp_serial_initialize();
2003  }
2004  __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2005  format, KMP_STRLEN(format) + 1);
2006 }
2007 
2008 size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2009  size_t format_size;
2010  if (!__kmp_init_serial) {
2011  __kmp_serial_initialize();
2012  }
2013  format_size = KMP_STRLEN(__kmp_affinity_format);
2014  if (buffer && size) {
2015  __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2016  format_size + 1);
2017  }
2018  return format_size;
2019 }
2020 
2021 void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
2022  int gtid;
2023  if (!TCR_4(__kmp_init_middle)) {
2024  __kmp_middle_initialize();
2025  }
2026  __kmp_assign_root_init_mask();
2027  gtid = __kmp_get_gtid();
2028  __kmp_aux_display_affinity(gtid, format);
2029 }
2030 
2031 size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
2032  char const *format) {
2033  int gtid;
2034  size_t num_required;
2035  kmp_str_buf_t capture_buf;
2036  if (!TCR_4(__kmp_init_middle)) {
2037  __kmp_middle_initialize();
2038  }
2039  __kmp_assign_root_init_mask();
2040  gtid = __kmp_get_gtid();
2041  __kmp_str_buf_init(&capture_buf);
2042  num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2043  if (buffer && buf_size) {
2044  __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2045  capture_buf.used + 1);
2046  }
2047  __kmp_str_buf_free(&capture_buf);
2048  return num_required;
2049 }
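// Illustrative only: a hedged sketch of the OpenMP 5.0 affinity-format API
// that reaches the entry points above. The size-query idiom (NULL buffer,
// size 0) matches the "if (buffer && size)" guards in the implementations; the
// format string below is just an example of the spec's field specifiers.
/*
  #include <omp.h>
  #include <stdio.h>
  #include <stdlib.h>

  void affinity_format_example(void) {
    omp_set_affinity_format("OMP: pid %P thread %n bound to {%A}");
    #pragma omp parallel
    {
      size_t need = omp_capture_affinity(NULL, 0, NULL); // query size only
      char *buf = (char *)malloc(need + 1); // +1 for the terminating null
      if (buf) {
        omp_capture_affinity(buf, need + 1, NULL); // NULL format => current ICV
        printf("%s\n", buf);
        free(buf);
      }
    }
  }
*/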
2050 
2051 void kmpc_set_stacksize(int arg) {
2052  // __kmp_aux_set_stacksize initializes the library if needed
2053  __kmp_aux_set_stacksize(arg);
2054 }
2055 
2056 void kmpc_set_stacksize_s(size_t arg) {
2057  // __kmp_aux_set_stacksize initializes the library if needed
2058  __kmp_aux_set_stacksize(arg);
2059 }
2060 
2061 void kmpc_set_blocktime(int arg) {
2062  int gtid, tid;
2063  kmp_info_t *thread;
2064 
2065  gtid = __kmp_entry_gtid();
2066  tid = __kmp_tid_from_gtid(gtid);
2067  thread = __kmp_thread_from_gtid(gtid);
2068 
2069  __kmp_aux_set_blocktime(arg, thread, tid);
2070 }
2071 
2072 void kmpc_set_library(int arg) {
2073  // __kmp_user_set_library initializes the library if needed
2074  __kmp_user_set_library((enum library_type)arg);
2075 }
2076 
2077 void kmpc_set_defaults(char const *str) {
2078  // __kmp_aux_set_defaults initializes the library if needed
2079  __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2080 }
2081 
2082 void kmpc_set_disp_num_buffers(int arg) {
2083  // ignore after initialization because some teams have already
2084  // allocated dispatch buffers
2085  if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2086  arg <= KMP_MAX_DISP_NUM_BUFF) {
2087  __kmp_dispatch_num_buffers = arg;
2088  }
2089 }
2090 
2091 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2092 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2093  return -1;
2094 #else
2095  if (!TCR_4(__kmp_init_middle)) {
2096  __kmp_middle_initialize();
2097  }
2098  __kmp_assign_root_init_mask();
2099  return __kmp_aux_set_affinity_mask_proc(proc, mask);
2100 #endif
2101 }
2102 
2103 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2104 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2105  return -1;
2106 #else
2107  if (!TCR_4(__kmp_init_middle)) {
2108  __kmp_middle_initialize();
2109  }
2110  __kmp_assign_root_init_mask();
2111  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2112 #endif
2113 }
2114 
2115 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2116 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2117  return -1;
2118 #else
2119  if (!TCR_4(__kmp_init_middle)) {
2120  __kmp_middle_initialize();
2121  }
2122  __kmp_assign_root_init_mask();
2123  return __kmp_aux_get_affinity_mask_proc(proc, mask);
2124 #endif
2125 }
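// Illustrative only: a hedged sketch of the kmp_* affinity extension declared
// in this runtime's omp.h that reaches the kmpc_*_affinity_mask_proc entry
// points above. Return-code conventions are runtime-specific; -1 indicates
// that affinity is not supported, as in the guards above.
/*
  #include <omp.h>

  void pin_to_proc0_example(void) {
    kmp_affinity_mask_t mask;
    kmp_create_affinity_mask(&mask);
    kmp_set_affinity_mask_proc(0, &mask); // add OS proc 0 to the mask
    kmp_set_affinity(&mask);              // bind the calling thread
    kmp_destroy_affinity_mask(&mask);
  }
*/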
2126 
2127 /* -------------------------------------------------------------------------- */
2172 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2173  void *cpy_data, void (*cpy_func)(void *, void *),
2174  kmp_int32 didit) {
2175  void **data_ptr;
2176  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2177  __kmp_assert_valid_gtid(gtid);
2178 
2179  KMP_MB();
2180 
2181  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2182 
2183  if (__kmp_env_consistency_check) {
2184  if (loc == 0) {
2185  KMP_WARNING(ConstructIdentInvalid);
2186  }
2187  }
2188 
2189  // ToDo: Optimize the following two barriers into some kind of split barrier
2190 
2191  if (didit)
2192  *data_ptr = cpy_data;
2193 
2194 #if OMPT_SUPPORT
2195  ompt_frame_t *ompt_frame;
2196  if (ompt_enabled.enabled) {
2197  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2198  if (ompt_frame->enter_frame.ptr == NULL)
2199  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2200  }
2201  OMPT_STORE_RETURN_ADDRESS(gtid);
2202 #endif
2203 /* This barrier is not a barrier region boundary */
2204 #if USE_ITT_NOTIFY
2205  __kmp_threads[gtid]->th.th_ident = loc;
2206 #endif
2207  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2208 
2209  if (!didit)
2210  (*cpy_func)(cpy_data, *data_ptr);
2211 
2212  // Consider next barrier a user-visible barrier for barrier region boundaries
2213  // Nesting checks are already handled by the single construct checks
2214  {
2215 #if OMPT_SUPPORT
2216  OMPT_STORE_RETURN_ADDRESS(gtid);
2217 #endif
2218 #if USE_ITT_NOTIFY
2219  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed
2220  // (e.g. tasks can overwrite the location)
2221 #endif
2222  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2223 #if OMPT_SUPPORT && OMPT_OPTIONAL
2224  if (ompt_enabled.enabled) {
2225  ompt_frame->enter_frame = ompt_data_none;
2226  }
2227 #endif
2228  }
2229 }
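// Illustrative only: the user-level pattern that __kmpc_copyprivate implements.
// One thread executes the single region and its private value is broadcast to
// the other threads' private copies before any of them proceed.
/*
  #include <stdio.h>

  void broadcast_example(void) {
    int v;
    #pragma omp parallel private(v)
    {
      #pragma omp single copyprivate(v)
      v = 42;            // executed by one thread only
      printf("%d\n", v); // every thread now observes 42
    }
  }
*/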
2230 
2231 /* -------------------------------------------------------------------------- */
2232 
2233 #define INIT_LOCK __kmp_init_user_lock_with_checks
2234 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2235 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2236 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2237 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2238 #define ACQUIRE_NESTED_LOCK_TIMED \
2239  __kmp_acquire_nested_user_lock_with_checks_timed
2240 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2241 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2242 #define TEST_LOCK __kmp_test_user_lock_with_checks
2243 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2244 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2245 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2246 
2247 // TODO: Make check abort messages use location info & pass it into
2248 // with_checks routines
2249 
2250 #if KMP_USE_DYNAMIC_LOCK
2251 
2252 // internal lock initializer
2253 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2254  kmp_dyna_lockseq_t seq) {
2255  if (KMP_IS_D_LOCK(seq)) {
2256  KMP_INIT_D_LOCK(lock, seq);
2257 #if USE_ITT_BUILD
2258  __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2259 #endif
2260  } else {
2261  KMP_INIT_I_LOCK(lock, seq);
2262 #if USE_ITT_BUILD
2263  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2264  __kmp_itt_lock_creating(ilk->lock, loc);
2265 #endif
2266  }
2267 }
2268 
2269 // internal nest lock initializer
2270 static __forceinline void
2271 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2272  kmp_dyna_lockseq_t seq) {
2273 #if KMP_USE_TSX
2274  // Don't have nested lock implementation for speculative locks
2275  if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2276  seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2277  seq = __kmp_user_lock_seq;
2278 #endif
2279  switch (seq) {
2280  case lockseq_tas:
2281  seq = lockseq_nested_tas;
2282  break;
2283 #if KMP_USE_FUTEX
2284  case lockseq_futex:
2285  seq = lockseq_nested_futex;
2286  break;
2287 #endif
2288  case lockseq_ticket:
2289  seq = lockseq_nested_ticket;
2290  break;
2291  case lockseq_queuing:
2292  seq = lockseq_nested_queuing;
2293  break;
2294  case lockseq_drdpa:
2295  seq = lockseq_nested_drdpa;
2296  break;
2297  default:
2298  seq = lockseq_nested_queuing;
2299  }
2300  KMP_INIT_I_LOCK(lock, seq);
2301 #if USE_ITT_BUILD
2302  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2303  __kmp_itt_lock_creating(ilk->lock, loc);
2304 #endif
2305 }
2306 
2307 /* initialize the lock with a hint */
2308 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2309  uintptr_t hint) {
2310  KMP_DEBUG_ASSERT(__kmp_init_serial);
2311  if (__kmp_env_consistency_check && user_lock == NULL) {
2312  KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2313  }
2314 
2315  __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2316 
2317 #if OMPT_SUPPORT && OMPT_OPTIONAL
2318  // This is the case, if called from omp_init_lock_with_hint:
2319  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2320  if (!codeptr)
2321  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2322  if (ompt_enabled.ompt_callback_lock_init) {
2323  ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2324  ompt_mutex_lock, (omp_lock_hint_t)hint,
2325  __ompt_get_mutex_impl_type(user_lock),
2326  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2327  }
2328 #endif
2329 }
2330 
2331 /* initialize the lock with a hint */
2332 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2333  void **user_lock, uintptr_t hint) {
2334  KMP_DEBUG_ASSERT(__kmp_init_serial);
2335  if (__kmp_env_consistency_check && user_lock == NULL) {
2336  KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2337  }
2338 
2339  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2340 
2341 #if OMPT_SUPPORT && OMPT_OPTIONAL
2342  // This is the case, if called from omp_init_lock_with_hint:
2343  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2344  if (!codeptr)
2345  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2346  if (ompt_enabled.ompt_callback_lock_init) {
2347  ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2348  ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2349  __ompt_get_mutex_impl_type(user_lock),
2350  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2351  }
2352 #endif
2353 }
2354 
2355 #endif // KMP_USE_DYNAMIC_LOCK
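// Illustrative only: a hedged sketch of the user-level calls that reach the
// hinted initializers above. The hint is mapped to a lock implementation via
// __kmp_map_hint_to_lock(); for example, a speculative hint may select a
// TSX-based lock where the hardware supports it.
/*
  #include <omp.h>

  omp_lock_t hinted_lock;

  void init_hinted_lock_example(void) {
    omp_init_lock_with_hint(&hinted_lock, omp_sync_hint_speculative);
    omp_set_lock(&hinted_lock);
    omp_unset_lock(&hinted_lock);
    omp_destroy_lock(&hinted_lock);
  }
*/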
2356 
2357 /* initialize the lock */
2358 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2359 #if KMP_USE_DYNAMIC_LOCK
2360 
2361  KMP_DEBUG_ASSERT(__kmp_init_serial);
2362  if (__kmp_env_consistency_check && user_lock == NULL) {
2363  KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2364  }
2365  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2366 
2367 #if OMPT_SUPPORT && OMPT_OPTIONAL
2368  // This is the case, if called from omp_init_lock_with_hint:
2369  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2370  if (!codeptr)
2371  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2372  if (ompt_enabled.ompt_callback_lock_init) {
2373  ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2374  ompt_mutex_lock, omp_lock_hint_none,
2375  __ompt_get_mutex_impl_type(user_lock),
2376  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2377  }
2378 #endif
2379 
2380 #else // KMP_USE_DYNAMIC_LOCK
2381 
2382  static char const *const func = "omp_init_lock";
2383  kmp_user_lock_p lck;
2384  KMP_DEBUG_ASSERT(__kmp_init_serial);
2385 
2386  if (__kmp_env_consistency_check) {
2387  if (user_lock == NULL) {
2388  KMP_FATAL(LockIsUninitialized, func);
2389  }
2390  }
2391 
2392  KMP_CHECK_USER_LOCK_INIT();
2393 
2394  if ((__kmp_user_lock_kind == lk_tas) &&
2395  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2396  lck = (kmp_user_lock_p)user_lock;
2397  }
2398 #if KMP_USE_FUTEX
2399  else if ((__kmp_user_lock_kind == lk_futex) &&
2400  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2401  lck = (kmp_user_lock_p)user_lock;
2402  }
2403 #endif
2404  else {
2405  lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2406  }
2407  INIT_LOCK(lck);
2408  __kmp_set_user_lock_location(lck, loc);
2409 
2410 #if OMPT_SUPPORT && OMPT_OPTIONAL
2411  // This is the case, if called from omp_init_lock_with_hint:
2412  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2413  if (!codeptr)
2414  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2415  if (ompt_enabled.ompt_callback_lock_init) {
2416  ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2417  ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2418  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2419  }
2420 #endif
2421 
2422 #if USE_ITT_BUILD
2423  __kmp_itt_lock_creating(lck);
2424 #endif /* USE_ITT_BUILD */
2425 
2426 #endif // KMP_USE_DYNAMIC_LOCK
2427 } // __kmpc_init_lock
2428 
2429 /* initialize the lock */
2430 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2431 #if KMP_USE_DYNAMIC_LOCK
2432 
2433  KMP_DEBUG_ASSERT(__kmp_init_serial);
2434  if (__kmp_env_consistency_check && user_lock == NULL) {
2435  KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2436  }
2437  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2438 
2439 #if OMPT_SUPPORT && OMPT_OPTIONAL
2440  // This is the case, if called from omp_init_lock_with_hint:
2441  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2442  if (!codeptr)
2443  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2444  if (ompt_enabled.ompt_callback_lock_init) {
2445  ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2446  ompt_mutex_nest_lock, omp_lock_hint_none,
2447  __ompt_get_mutex_impl_type(user_lock),
2448  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2449  }
2450 #endif
2451 
2452 #else // KMP_USE_DYNAMIC_LOCK
2453 
2454  static char const *const func = "omp_init_nest_lock";
2455  kmp_user_lock_p lck;
2456  KMP_DEBUG_ASSERT(__kmp_init_serial);
2457 
2458  if (__kmp_env_consistency_check) {
2459  if (user_lock == NULL) {
2460  KMP_FATAL(LockIsUninitialized, func);
2461  }
2462  }
2463 
2464  KMP_CHECK_USER_LOCK_INIT();
2465 
2466  if ((__kmp_user_lock_kind == lk_tas) &&
2467  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2468  OMP_NEST_LOCK_T_SIZE)) {
2469  lck = (kmp_user_lock_p)user_lock;
2470  }
2471 #if KMP_USE_FUTEX
2472  else if ((__kmp_user_lock_kind == lk_futex) &&
2473  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2474  OMP_NEST_LOCK_T_SIZE)) {
2475  lck = (kmp_user_lock_p)user_lock;
2476  }
2477 #endif
2478  else {
2479  lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2480  }
2481 
2482  INIT_NESTED_LOCK(lck);
2483  __kmp_set_user_lock_location(lck, loc);
2484 
2485 #if OMPT_SUPPORT && OMPT_OPTIONAL
2486  // This is the case, if called from omp_init_lock_with_hint:
2487  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2488  if (!codeptr)
2489  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2490  if (ompt_enabled.ompt_callback_lock_init) {
2491  ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2492  ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2493  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2494  }
2495 #endif
2496 
2497 #if USE_ITT_BUILD
2498  __kmp_itt_lock_creating(lck);
2499 #endif /* USE_ITT_BUILD */
2500 
2501 #endif // KMP_USE_DYNAMIC_LOCK
2502 } // __kmpc_init_nest_lock
2503 
2504 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2505 #if KMP_USE_DYNAMIC_LOCK
2506 
2507 #if USE_ITT_BUILD
2508  kmp_user_lock_p lck;
2509  if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2510  lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2511  } else {
2512  lck = (kmp_user_lock_p)user_lock;
2513  }
2514  __kmp_itt_lock_destroyed(lck);
2515 #endif
2516 #if OMPT_SUPPORT && OMPT_OPTIONAL
2517  // This is the case, if called from omp_init_lock_with_hint:
2518  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2519  if (!codeptr)
2520  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2521  if (ompt_enabled.ompt_callback_lock_destroy) {
2522  ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2523  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2524  }
2525 #endif
2526  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2527 #else
2528  kmp_user_lock_p lck;
2529 
2530  if ((__kmp_user_lock_kind == lk_tas) &&
2531  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2532  lck = (kmp_user_lock_p)user_lock;
2533  }
2534 #if KMP_USE_FUTEX
2535  else if ((__kmp_user_lock_kind == lk_futex) &&
2536  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2537  lck = (kmp_user_lock_p)user_lock;
2538  }
2539 #endif
2540  else {
2541  lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2542  }
2543 
2544 #if OMPT_SUPPORT && OMPT_OPTIONAL
2545  // This is the case, if called from omp_init_lock_with_hint:
2546  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2547  if (!codeptr)
2548  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2549  if (ompt_enabled.ompt_callback_lock_destroy) {
2550  ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2551  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2552  }
2553 #endif
2554 
2555 #if USE_ITT_BUILD
2556  __kmp_itt_lock_destroyed(lck);
2557 #endif /* USE_ITT_BUILD */
2558  DESTROY_LOCK(lck);
2559 
2560  if ((__kmp_user_lock_kind == lk_tas) &&
2561  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2562  ;
2563  }
2564 #if KMP_USE_FUTEX
2565  else if ((__kmp_user_lock_kind == lk_futex) &&
2566  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2567  ;
2568  }
2569 #endif
2570  else {
2571  __kmp_user_lock_free(user_lock, gtid, lck);
2572  }
2573 #endif // KMP_USE_DYNAMIC_LOCK
2574 } // __kmpc_destroy_lock
2575 
2576 /* destroy the lock */
2577 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2578 #if KMP_USE_DYNAMIC_LOCK
2579 
2580 #if USE_ITT_BUILD
2581  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2582  __kmp_itt_lock_destroyed(ilk->lock);
2583 #endif
2584 #if OMPT_SUPPORT && OMPT_OPTIONAL
2585  // This is the case, if called from omp_init_lock_with_hint:
2586  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2587  if (!codeptr)
2588  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2589  if (ompt_enabled.ompt_callback_lock_destroy) {
2590  ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2591  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2592  }
2593 #endif
2594  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2595 
2596 #else // KMP_USE_DYNAMIC_LOCK
2597 
2598  kmp_user_lock_p lck;
2599 
2600  if ((__kmp_user_lock_kind == lk_tas) &&
2601  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2602  OMP_NEST_LOCK_T_SIZE)) {
2603  lck = (kmp_user_lock_p)user_lock;
2604  }
2605 #if KMP_USE_FUTEX
2606  else if ((__kmp_user_lock_kind == lk_futex) &&
2607  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2608  OMP_NEST_LOCK_T_SIZE)) {
2609  lck = (kmp_user_lock_p)user_lock;
2610  }
2611 #endif
2612  else {
2613  lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2614  }
2615 
2616 #if OMPT_SUPPORT && OMPT_OPTIONAL
2617  // This is the case, if called from omp_init_lock_with_hint:
2618  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2619  if (!codeptr)
2620  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2621  if (ompt_enabled.ompt_callback_lock_destroy) {
2622  ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2623  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2624  }
2625 #endif
2626 
2627 #if USE_ITT_BUILD
2628  __kmp_itt_lock_destroyed(lck);
2629 #endif /* USE_ITT_BUILD */
2630 
2631  DESTROY_NESTED_LOCK(lck);
2632 
2633  if ((__kmp_user_lock_kind == lk_tas) &&
2634  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2635  OMP_NEST_LOCK_T_SIZE)) {
2636  ;
2637  }
2638 #if KMP_USE_FUTEX
2639  else if ((__kmp_user_lock_kind == lk_futex) &&
2640  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2641  OMP_NEST_LOCK_T_SIZE)) {
2642  ;
2643  }
2644 #endif
2645  else {
2646  __kmp_user_lock_free(user_lock, gtid, lck);
2647  }
2648 #endif // KMP_USE_DYNAMIC_LOCK
2649 } // __kmpc_destroy_nest_lock
2650 
2651 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2652  KMP_COUNT_BLOCK(OMP_set_lock);
2653 #if KMP_USE_DYNAMIC_LOCK
2654  int tag = KMP_EXTRACT_D_TAG(user_lock);
2655 #if USE_ITT_BUILD
2656  __kmp_itt_lock_acquiring(
2657  (kmp_user_lock_p)
2658  user_lock); // itt function will get to the right lock object.
2659 #endif
2660 #if OMPT_SUPPORT && OMPT_OPTIONAL
2661  // This is the case, if called from omp_init_lock_with_hint:
2662  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2663  if (!codeptr)
2664  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2665  if (ompt_enabled.ompt_callback_mutex_acquire) {
2666  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2667  ompt_mutex_lock, omp_lock_hint_none,
2668  __ompt_get_mutex_impl_type(user_lock),
2669  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2670  }
2671 #endif
2672 #if KMP_USE_INLINED_TAS
2673  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2674  KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2675  } else
2676 #elif KMP_USE_INLINED_FUTEX
2677  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2678  KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2679  } else
2680 #endif
2681  {
2682  __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2683  }
2684 #if USE_ITT_BUILD
2685  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2686 #endif
2687 #if OMPT_SUPPORT && OMPT_OPTIONAL
2688  if (ompt_enabled.ompt_callback_mutex_acquired) {
2689  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2690  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2691  }
2692 #endif
2693 
2694 #else // KMP_USE_DYNAMIC_LOCK
2695 
2696  kmp_user_lock_p lck;
2697 
2698  if ((__kmp_user_lock_kind == lk_tas) &&
2699  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2700  lck = (kmp_user_lock_p)user_lock;
2701  }
2702 #if KMP_USE_FUTEX
2703  else if ((__kmp_user_lock_kind == lk_futex) &&
2704  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2705  lck = (kmp_user_lock_p)user_lock;
2706  }
2707 #endif
2708  else {
2709  lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2710  }
2711 
2712 #if USE_ITT_BUILD
2713  __kmp_itt_lock_acquiring(lck);
2714 #endif /* USE_ITT_BUILD */
2715 #if OMPT_SUPPORT && OMPT_OPTIONAL
2716  // This is the case, if called from omp_init_lock_with_hint:
2717  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2718  if (!codeptr)
2719  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2720  if (ompt_enabled.ompt_callback_mutex_acquire) {
2721  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2722  ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2723  (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2724  }
2725 #endif
2726 
2727  ACQUIRE_LOCK(lck, gtid);
2728 
2729 #if USE_ITT_BUILD
2730  __kmp_itt_lock_acquired(lck);
2731 #endif /* USE_ITT_BUILD */
2732 
2733 #if OMPT_SUPPORT && OMPT_OPTIONAL
2734  if (ompt_enabled.ompt_callback_mutex_acquired) {
2735  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2736  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2737  }
2738 #endif
2739 
2740 #endif // KMP_USE_DYNAMIC_LOCK
2741 }
2742 
2743 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2744 #if KMP_USE_DYNAMIC_LOCK
2745 
2746 #if USE_ITT_BUILD
2747  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2748 #endif
2749 #if OMPT_SUPPORT && OMPT_OPTIONAL
2750  // This is the case, if called from omp_init_lock_with_hint:
2751  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2752  if (!codeptr)
2753  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2754  if (ompt_enabled.enabled) {
2755  if (ompt_enabled.ompt_callback_mutex_acquire) {
2756  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2757  ompt_mutex_nest_lock, omp_lock_hint_none,
2758  __ompt_get_mutex_impl_type(user_lock),
2759  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2760  }
2761  }
2762 #endif
2763  int acquire_status =
2764  KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2765  (void)acquire_status;
2766 #if USE_ITT_BUILD
2767  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2768 #endif
2769 
2770 #if OMPT_SUPPORT && OMPT_OPTIONAL
2771  if (ompt_enabled.enabled) {
2772  if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2773  if (ompt_enabled.ompt_callback_mutex_acquired) {
2774  // lock_first
2775  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2776  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2777  codeptr);
2778  }
2779  } else {
2780  if (ompt_enabled.ompt_callback_nest_lock) {
2781  // lock_next
2782  ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2783  ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2784  }
2785  }
2786  }
2787 #endif
2788 
2789 #else // KMP_USE_DYNAMIC_LOCK
2790  int acquire_status;
2791  kmp_user_lock_p lck;
2792 
2793  if ((__kmp_user_lock_kind == lk_tas) &&
2794  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2795  OMP_NEST_LOCK_T_SIZE)) {
2796  lck = (kmp_user_lock_p)user_lock;
2797  }
2798 #if KMP_USE_FUTEX
2799  else if ((__kmp_user_lock_kind == lk_futex) &&
2800  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2801  OMP_NEST_LOCK_T_SIZE)) {
2802  lck = (kmp_user_lock_p)user_lock;
2803  }
2804 #endif
2805  else {
2806  lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2807  }
2808 
2809 #if USE_ITT_BUILD
2810  __kmp_itt_lock_acquiring(lck);
2811 #endif /* USE_ITT_BUILD */
2812 #if OMPT_SUPPORT && OMPT_OPTIONAL
2813  // This is the case, if called from omp_init_lock_with_hint:
2814  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2815  if (!codeptr)
2816  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2817  if (ompt_enabled.enabled) {
2818  if (ompt_enabled.ompt_callback_mutex_acquire) {
2819  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2820  ompt_mutex_nest_lock, omp_lock_hint_none,
2821  __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2822  codeptr);
2823  }
2824  }
2825 #endif
2826 
2827  ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2828 
2829 #if USE_ITT_BUILD
2830  __kmp_itt_lock_acquired(lck);
2831 #endif /* USE_ITT_BUILD */
2832 
2833 #if OMPT_SUPPORT && OMPT_OPTIONAL
2834  if (ompt_enabled.enabled) {
2835  if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2836  if (ompt_enabled.ompt_callback_mutex_acquired) {
2837  // lock_first
2838  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2839  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2840  }
2841  } else {
2842  if (ompt_enabled.ompt_callback_nest_lock) {
2843  // lock_next
2844  ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2845  ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2846  }
2847  }
2848  }
2849 #endif
2850 
2851 #endif // KMP_USE_DYNAMIC_LOCK
2852 }
2853 
2854 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2855 #if KMP_USE_DYNAMIC_LOCK
2856 
2857  int tag = KMP_EXTRACT_D_TAG(user_lock);
2858 #if USE_ITT_BUILD
2859  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2860 #endif
2861 #if KMP_USE_INLINED_TAS
2862  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2863  KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2864  } else
2865 #elif KMP_USE_INLINED_FUTEX
2866  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2867  KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2868  } else
2869 #endif
2870  {
2871  __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2872  }
2873 
2874 #if OMPT_SUPPORT && OMPT_OPTIONAL
2875  // This is the case, if called from omp_init_lock_with_hint:
2876  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2877  if (!codeptr)
2878  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2879  if (ompt_enabled.ompt_callback_mutex_released) {
2880  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2881  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2882  }
2883 #endif
2884 
2885 #else // KMP_USE_DYNAMIC_LOCK
2886 
2887  kmp_user_lock_p lck;
2888 
2889  /* Can't use serial interval since not block structured */
2890  /* release the lock */
2891 
2892  if ((__kmp_user_lock_kind == lk_tas) &&
2893  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2894 #if KMP_OS_LINUX && \
2895  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2896 // "fast" path implemented to fix customer performance issue
2897 #if USE_ITT_BUILD
2898  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2899 #endif /* USE_ITT_BUILD */
2900  TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2901  KMP_MB();
2902 
2903 #if OMPT_SUPPORT && OMPT_OPTIONAL
2904  // This is the case, if called from omp_init_lock_with_hint:
2905  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2906  if (!codeptr)
2907  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2908  if (ompt_enabled.ompt_callback_mutex_released) {
2909  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2910  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2911  }
2912 #endif
2913 
2914  return;
2915 #else
2916  lck = (kmp_user_lock_p)user_lock;
2917 #endif
2918  }
2919 #if KMP_USE_FUTEX
2920  else if ((__kmp_user_lock_kind == lk_futex) &&
2921  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2922  lck = (kmp_user_lock_p)user_lock;
2923  }
2924 #endif
2925  else {
2926  lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2927  }
2928 
2929 #if USE_ITT_BUILD
2930  __kmp_itt_lock_releasing(lck);
2931 #endif /* USE_ITT_BUILD */
2932 
2933  RELEASE_LOCK(lck, gtid);
2934 
2935 #if OMPT_SUPPORT && OMPT_OPTIONAL
2936  // This is the case, if called from omp_init_lock_with_hint:
2937  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2938  if (!codeptr)
2939  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2940  if (ompt_enabled.ompt_callback_mutex_released) {
2941  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2942  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2943  }
2944 #endif
2945 
2946 #endif // KMP_USE_DYNAMIC_LOCK
2947 }
2948 
2949 /* release the lock */
2950 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2951 #if KMP_USE_DYNAMIC_LOCK
2952 
2953 #if USE_ITT_BUILD
2954  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2955 #endif
2956  int release_status =
2957  KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2958  (void)release_status;
2959 
2960 #if OMPT_SUPPORT && OMPT_OPTIONAL
2961  // This is the case, if called from omp_init_lock_with_hint:
2962  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2963  if (!codeptr)
2964  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2965  if (ompt_enabled.enabled) {
2966  if (release_status == KMP_LOCK_RELEASED) {
2967  if (ompt_enabled.ompt_callback_mutex_released) {
2968  // release_lock_last
2969  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2970  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2971  codeptr);
2972  }
2973  } else if (ompt_enabled.ompt_callback_nest_lock) {
2974  // release_lock_prev
2975  ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2976  ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2977  }
2978  }
2979 #endif
2980 
2981 #else // KMP_USE_DYNAMIC_LOCK
2982 
2983  kmp_user_lock_p lck;
2984 
2985  /* Can't use serial interval since not block structured */
2986 
2987  if ((__kmp_user_lock_kind == lk_tas) &&
2988  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2989  OMP_NEST_LOCK_T_SIZE)) {
2990 #if KMP_OS_LINUX && \
2991  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2992  // "fast" path implemented to fix customer performance issue
2993  kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2994 #if USE_ITT_BUILD
2995  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2996 #endif /* USE_ITT_BUILD */
2997 
2998 #if OMPT_SUPPORT && OMPT_OPTIONAL
2999  int release_status = KMP_LOCK_STILL_HELD;
3000 #endif
3001 
3002  if (--(tl->lk.depth_locked) == 0) {
3003  TCW_4(tl->lk.poll, 0);
3004 #if OMPT_SUPPORT && OMPT_OPTIONAL
3005  release_status = KMP_LOCK_RELEASED;
3006 #endif
3007  }
3008  KMP_MB();
3009 
3010 #if OMPT_SUPPORT && OMPT_OPTIONAL
3011  // This is the case, if called from omp_init_lock_with_hint:
3012  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3013  if (!codeptr)
3014  codeptr = OMPT_GET_RETURN_ADDRESS(0);
3015  if (ompt_enabled.enabled) {
3016  if (release_status == KMP_LOCK_RELEASED) {
3017  if (ompt_enabled.ompt_callback_mutex_released) {
3018  // release_lock_last
3019  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3020  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3021  }
3022  } else if (ompt_enabled.ompt_callback_nest_lock) {
3023  // release_lock_previous
3024  ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3025  ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3026  }
3027  }
3028 #endif
3029 
3030  return;
3031 #else
3032  lck = (kmp_user_lock_p)user_lock;
3033 #endif
3034  }
3035 #if KMP_USE_FUTEX
3036  else if ((__kmp_user_lock_kind == lk_futex) &&
3037  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3038  OMP_NEST_LOCK_T_SIZE)) {
3039  lck = (kmp_user_lock_p)user_lock;
3040  }
3041 #endif
3042  else {
3043  lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3044  }
3045 
3046 #if USE_ITT_BUILD
3047  __kmp_itt_lock_releasing(lck);
3048 #endif /* USE_ITT_BUILD */
3049 
3050  int release_status;
3051  release_status = RELEASE_NESTED_LOCK(lck, gtid);
3052 #if OMPT_SUPPORT && OMPT_OPTIONAL
3053  // This is the case, if called from omp_init_lock_with_hint:
3054  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3055  if (!codeptr)
3056  codeptr = OMPT_GET_RETURN_ADDRESS(0);
3057  if (ompt_enabled.enabled) {
3058  if (release_status == KMP_LOCK_RELEASED) {
3059  if (ompt_enabled.ompt_callback_mutex_released) {
3060  // release_lock_last
3061  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3062  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3063  }
3064  } else if (ompt_enabled.ompt_callback_nest_lock) {
3065  // release_lock_previous
3066  ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3067  ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3068  }
3069  }
3070 #endif
3071 
3072 #endif // KMP_USE_DYNAMIC_LOCK
3073 }
3074 
3075 /* try to acquire the lock */
3076 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3077  KMP_COUNT_BLOCK(OMP_test_lock);
3078 
3079 #if KMP_USE_DYNAMIC_LOCK
3080  int rc;
3081  int tag = KMP_EXTRACT_D_TAG(user_lock);
3082 #if USE_ITT_BUILD
3083  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3084 #endif
3085 #if OMPT_SUPPORT && OMPT_OPTIONAL
3086  // This is the case, if called from omp_init_lock_with_hint:
3087  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3088  if (!codeptr)
3089  codeptr = OMPT_GET_RETURN_ADDRESS(0);
3090  if (ompt_enabled.ompt_callback_mutex_acquire) {
3091  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3092  ompt_mutex_lock, omp_lock_hint_none,
3093  __ompt_get_mutex_impl_type(user_lock),
3094  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3095  }
3096 #endif
3097 #if KMP_USE_INLINED_TAS
3098  if (tag == locktag_tas && !__kmp_env_consistency_check) {
3099  KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3100  } else
3101 #elif KMP_USE_INLINED_FUTEX
3102  if (tag == locktag_futex && !__kmp_env_consistency_check) {
3103  KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3104  } else
3105 #endif
3106  {
3107  rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3108  }
3109  if (rc) {
3110 #if USE_ITT_BUILD
3111  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3112 #endif
3113 #if OMPT_SUPPORT && OMPT_OPTIONAL
3114  if (ompt_enabled.ompt_callback_mutex_acquired) {
3115  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3116  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3117  }
3118 #endif
3119  return FTN_TRUE;
3120  } else {
3121 #if USE_ITT_BUILD
3122  __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3123 #endif
3124  return FTN_FALSE;
3125  }
3126 
3127 #else // KMP_USE_DYNAMIC_LOCK
3128 
3129  kmp_user_lock_p lck;
3130  int rc;
3131 
3132  if ((__kmp_user_lock_kind == lk_tas) &&
3133  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3134  lck = (kmp_user_lock_p)user_lock;
3135  }
3136 #if KMP_USE_FUTEX
3137  else if ((__kmp_user_lock_kind == lk_futex) &&
3138  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3139  lck = (kmp_user_lock_p)user_lock;
3140  }
3141 #endif
3142  else {
3143  lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3144  }
3145 
3146 #if USE_ITT_BUILD
3147  __kmp_itt_lock_acquiring(lck);
3148 #endif /* USE_ITT_BUILD */
3149 #if OMPT_SUPPORT && OMPT_OPTIONAL
3150  // This is the case, if called from omp_init_lock_with_hint:
3151  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3152  if (!codeptr)
3153  codeptr = OMPT_GET_RETURN_ADDRESS(0);
3154  if (ompt_enabled.ompt_callback_mutex_acquire) {
3155  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3156  ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3157  (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3158  }
3159 #endif
3160 
3161  rc = TEST_LOCK(lck, gtid);
3162 #if USE_ITT_BUILD
3163  if (rc) {
3164  __kmp_itt_lock_acquired(lck);
3165  } else {
3166  __kmp_itt_lock_cancelled(lck);
3167  }
3168 #endif /* USE_ITT_BUILD */
3169 #if OMPT_SUPPORT && OMPT_OPTIONAL
3170  if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3171  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3172  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3173  }
3174 #endif
3175 
3176  return (rc ? FTN_TRUE : FTN_FALSE);
3177 
3178  /* Can't use serial interval since not block structured */
3179 
3180 #endif // KMP_USE_DYNAMIC_LOCK
3181 }
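// Illustrative only: the user-level try-lock pattern served by __kmpc_test_lock.
// counter_lock is assumed to have been initialized with omp_init_lock()
// elsewhere; omp_test_lock() returns nonzero only when the lock was acquired.
/*
  #include <omp.h>

  omp_lock_t counter_lock; // assumed: omp_init_lock(&counter_lock) already done
  int counter;

  void try_update(void) {
    if (omp_test_lock(&counter_lock)) {
      ++counter;
      omp_unset_lock(&counter_lock);
    } // otherwise do other useful work instead of blocking
  }
*/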
3182 
3183 /* try to acquire the lock */
3184 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3185 #if KMP_USE_DYNAMIC_LOCK
3186  int rc;
3187 #if USE_ITT_BUILD
3188  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3189 #endif
3190 #if OMPT_SUPPORT && OMPT_OPTIONAL
3191  // This is the case, if called from omp_init_lock_with_hint:
3192  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3193  if (!codeptr)
3194  codeptr = OMPT_GET_RETURN_ADDRESS(0);
3195  if (ompt_enabled.ompt_callback_mutex_acquire) {
3196  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3197  ompt_mutex_nest_lock, omp_lock_hint_none,
3198  __ompt_get_mutex_impl_type(user_lock),
3199  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3200  }
3201 #endif
3202  rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3203 #if USE_ITT_BUILD
3204  if (rc) {
3205  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3206  } else {
3207  __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3208  }
3209 #endif
3210 #if OMPT_SUPPORT && OMPT_OPTIONAL
3211  if (ompt_enabled.enabled && rc) {
3212  if (rc == 1) {
3213  if (ompt_enabled.ompt_callback_mutex_acquired) {
3214  // lock_first
3215  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3216  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3217  codeptr);
3218  }
3219  } else {
3220  if (ompt_enabled.ompt_callback_nest_lock) {
3221  // lock_next
3222  ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3223  ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3224  }
3225  }
3226  }
3227 #endif
3228  return rc;
3229 
3230 #else // KMP_USE_DYNAMIC_LOCK
3231 
3232  kmp_user_lock_p lck;
3233  int rc;
3234 
3235  if ((__kmp_user_lock_kind == lk_tas) &&
3236  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3237  OMP_NEST_LOCK_T_SIZE)) {
3238  lck = (kmp_user_lock_p)user_lock;
3239  }
3240 #if KMP_USE_FUTEX
3241  else if ((__kmp_user_lock_kind == lk_futex) &&
3242  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3243  OMP_NEST_LOCK_T_SIZE)) {
3244  lck = (kmp_user_lock_p)user_lock;
3245  }
3246 #endif
3247  else {
3248  lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3249  }
3250 
3251 #if USE_ITT_BUILD
3252  __kmp_itt_lock_acquiring(lck);
3253 #endif /* USE_ITT_BUILD */
3254 
3255 #if OMPT_SUPPORT && OMPT_OPTIONAL
3256  // This is the case, if called from omp_init_lock_with_hint:
3257  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3258  if (!codeptr)
3259  codeptr = OMPT_GET_RETURN_ADDRESS(0);
3260  if (ompt_enabled.enabled &&
3261  ompt_enabled.ompt_callback_mutex_acquire) {
3262  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3263  ompt_mutex_nest_lock, omp_lock_hint_none,
3264  __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3265  codeptr);
3266  }
3267 #endif
3268 
3269  rc = TEST_NESTED_LOCK(lck, gtid);
3270 #if USE_ITT_BUILD
3271  if (rc) {
3272  __kmp_itt_lock_acquired(lck);
3273  } else {
3274  __kmp_itt_lock_cancelled(lck);
3275  }
3276 #endif /* USE_ITT_BUILD */
3277 #if OMPT_SUPPORT && OMPT_OPTIONAL
3278  if (ompt_enabled.enabled && rc) {
3279  if (rc == 1) {
3280  if (ompt_enabled.ompt_callback_mutex_acquired) {
3281  // lock_first
3282  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3283  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3284  }
3285  } else {
3286  if (ompt_enabled.ompt_callback_nest_lock) {
3287  // lock_next
3288  ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3289  ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3290  }
3291  }
3292  }
3293 #endif
3294  return rc;
3295 
3296  /* Can't use serial interval since not block structured */
3297 
3298 #endif // KMP_USE_DYNAMIC_LOCK
3299 }
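// Illustrative only: a nestable-lock sketch matching the lock_first/lock_next
// distinction above. omp_test_nest_lock() returns the new nesting count on
// success (1 for the first acquisition, >1 for re-acquisition) and 0 on
// failure; nl is assumed to have been initialized with omp_init_nest_lock().
/*
  #include <omp.h>

  omp_nest_lock_t nl; // assumed: omp_init_nest_lock(&nl) already done

  void nest_lock_example(void) {
    omp_set_nest_lock(&nl);      // first acquisition by this thread
    omp_set_nest_lock(&nl);      // re-acquisition, nesting count is now 2
    if (omp_test_nest_lock(&nl)) // succeeds, returns 3
      omp_unset_nest_lock(&nl);
    omp_unset_nest_lock(&nl);
    omp_unset_nest_lock(&nl);    // nesting count back to 0, lock released
  }
*/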
3300 
3301 // Interface to fast scalable reduce methods routines
3302 
3303 // keep the selected method in a thread local structure for cross-function
3304 // usage: will be used in __kmpc_end_reduce* functions;
3305 // another solution: to re-determine the method one more time in
3306 // __kmpc_end_reduce* functions (new prototype required then)
3307 // AT: which solution is better?
3308 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \
3309  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3310 
3311 #define __KMP_GET_REDUCTION_METHOD(gtid) \
3312  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3313 
3314 // description of the packed_reduction_method variable: look at the macros in
3315 // kmp.h
3316 
3317 // used in a critical section reduce block
3318 static __forceinline void
3319 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3320  kmp_critical_name *crit) {
3321 
3322  // this lock was visible to a customer and to the threading profile tool as a
3323  // serial overhead span (although it's used for an internal purpose only)
3324  // why was it visible in previous implementation?
3325  // should we keep it visible in new reduce block?
3326  kmp_user_lock_p lck;
3327 
3328 #if KMP_USE_DYNAMIC_LOCK
3329 
3330  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3331  // Check if it is initialized.
3332  if (*lk == 0) {
3333  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3334  KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3335  KMP_GET_D_TAG(__kmp_user_lock_seq));
3336  } else {
3337  __kmp_init_indirect_csptr(crit, loc, global_tid,
3338  KMP_GET_I_TAG(__kmp_user_lock_seq));
3339  }
3340  }
3341  // Branch for accessing the actual lock object and set operation. This
3342  // branching is inevitable since this lock initialization does not follow the
3343  // normal dispatch path (lock table is not used).
3344  if (KMP_EXTRACT_D_TAG(lk) != 0) {
3345  lck = (kmp_user_lock_p)lk;
3346  KMP_DEBUG_ASSERT(lck != NULL);
3347  if (__kmp_env_consistency_check) {
3348  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3349  }
3350  KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3351  } else {
3352  kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3353  lck = ilk->lock;
3354  KMP_DEBUG_ASSERT(lck != NULL);
3355  if (__kmp_env_consistency_check) {
3356  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3357  }
3358  KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3359  }
3360 
3361 #else // KMP_USE_DYNAMIC_LOCK
3362 
3363  // We know that the fast reduction code is only emitted by Intel compilers
3364  // with 32 byte critical sections. If there isn't enough space, then we
3365  // have to use a pointer.
3366  if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3367  lck = (kmp_user_lock_p)crit;
3368  } else {
3369  lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3370  }
3371  KMP_DEBUG_ASSERT(lck != NULL);
3372 
3373  if (__kmp_env_consistency_check)
3374  __kmp_push_sync(global_tid, ct_critical, loc, lck);
3375 
3376  __kmp_acquire_user_lock_with_checks(lck, global_tid);
3377 
3378 #endif // KMP_USE_DYNAMIC_LOCK
3379 }
3380 
3381 // used in a critical section reduce block
3382 static __forceinline void
3383 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3384  kmp_critical_name *crit) {
3385 
3386  kmp_user_lock_p lck;
3387 
3388 #if KMP_USE_DYNAMIC_LOCK
3389 
3390  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3391  lck = (kmp_user_lock_p)crit;
3392  if (__kmp_env_consistency_check)
3393  __kmp_pop_sync(global_tid, ct_critical, loc);
3394  KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3395  } else {
3396  kmp_indirect_lock_t *ilk =
3397  (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3398  if (__kmp_env_consistency_check)
3399  __kmp_pop_sync(global_tid, ct_critical, loc);
3400  KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3401  }
3402 
3403 #else // KMP_USE_DYNAMIC_LOCK
3404 
3405  // We know that the fast reduction code is only emitted by Intel compilers
3406  // with 32 byte critical sections. If there isn't enough space, then we have
3407  // to use a pointer.
3408  if (__kmp_base_user_lock_size > 32) {
3409  lck = *((kmp_user_lock_p *)crit);
3410  KMP_ASSERT(lck != NULL);
3411  } else {
3412  lck = (kmp_user_lock_p)crit;
3413  }
3414 
3415  if (__kmp_env_consistency_check)
3416  __kmp_pop_sync(global_tid, ct_critical, loc);
3417 
3418  __kmp_release_user_lock_with_checks(lck, global_tid);
3419 
3420 #endif // KMP_USE_DYNAMIC_LOCK
3421 } // __kmp_end_critical_section_reduce_block
3422 
3423 static __forceinline int
3424 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3425  int *task_state) {
3426  kmp_team_t *team;
3427 
3428  // Check if we are inside the teams construct?
3429  if (th->th.th_teams_microtask) {
3430  *team_p = team = th->th.th_team;
3431  if (team->t.t_level == th->th.th_teams_level) {
3432  // This is reduction at teams construct.
3433  KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3434  // Let's swap teams temporarily for the reduction.
3435  th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3436  th->th.th_team = team->t.t_parent;
3437  th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3438  th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3439  *task_state = th->th.th_task_state;
3440  th->th.th_task_state = 0;
3441 
3442  return 1;
3443  }
3444  }
3445  return 0;
3446 }
3447 
3448 static __forceinline void
3449 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3450  // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3451  th->th.th_info.ds.ds_tid = 0;
3452  th->th.th_team = team;
3453  th->th.th_team_nproc = team->t.t_nproc;
3454  th->th.th_task_team = team->t.t_task_team[task_state];
3455  __kmp_type_convert(task_state, &(th->th.th_task_state));
3456 }
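// Illustrative only: the source-level case that the two helpers above handle.
// With a reduction on a teams construct, the initial thread of each team is
// temporarily switched to the parent (teams) team so the regular reduction
// machinery can combine across teams. Host teams are shown for brevity; older
// specifications require teams to be nested inside a target region.
/*
  double teams_reduce_example(int nteams) {
    double sum = 0.0;
    #pragma omp teams num_teams(nteams) reduction(+ : sum)
    sum += 1.0; // each team's initial thread contributes one term
    return sum; // equals the number of teams actually created
  }
*/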
3457 
3458 /* 2.a.i. Reduce Block without a terminating barrier */
3474 kmp_int32
3475 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3476  size_t reduce_size, void *reduce_data,
3477  void (*reduce_func)(void *lhs_data, void *rhs_data),
3478  kmp_critical_name *lck) {
3479 
3480  KMP_COUNT_BLOCK(REDUCE_nowait);
3481  int retval = 0;
3482  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3483  kmp_info_t *th;
3484  kmp_team_t *team;
3485  int teams_swapped = 0, task_state;
3486  KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3487  __kmp_assert_valid_gtid(global_tid);
3488 
3489  // why do we need this initialization here at all?
3490  // A reduction clause cannot be used as a stand-alone directive.
3491 
3492  // do not call __kmp_serial_initialize(), it will be called by
3493  // __kmp_parallel_initialize() if needed
3494  // possible detection of false-positive race by the threadchecker ???
3495  if (!TCR_4(__kmp_init_parallel))
3496  __kmp_parallel_initialize();
3497 
3498  __kmp_resume_if_soft_paused();
3499 
3500 // check correctness of reduce block nesting
3501 #if KMP_USE_DYNAMIC_LOCK
3502  if (__kmp_env_consistency_check)
3503  __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3504 #else
3505  if (__kmp_env_consistency_check)
3506  __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3507 #endif
3508 
3509  th = __kmp_thread_from_gtid(global_tid);
3510  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3511 
3512  // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3513  // the value should be kept in a variable
3514  // the variable should be either a construct-specific or thread-specific
3515  // property, not a team specific property
3516  // (a thread can reach the next reduce block on the next construct, reduce
3517  // method may differ on the next construct)
3518  // an ident_t "loc" parameter could be used as a construct-specific property
3519  // (what if loc == 0?)
3520  // (if both construct-specific and team-specific variables were shared,
3521  // then unnecessary extra syncs would be needed)
3522  // a thread-specific variable is better regarding two issues above (next
3523  // construct and extra syncs)
3524  // a thread-specific "th_local.reduction_method" variable is used currently
3525  // each thread executes the 'determine' and 'set' lines (no need to have only
3526  // one thread execute them, which avoids unnecessary extra syncs)
3527 
3528  packed_reduction_method = __kmp_determine_reduction_method(
3529  loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3530  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3531 
3532  OMPT_REDUCTION_DECL(th, global_tid);
3533  if (packed_reduction_method == critical_reduce_block) {
3534 
3535  OMPT_REDUCTION_BEGIN;
3536 
3537  __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3538  retval = 1;
3539 
3540  } else if (packed_reduction_method == empty_reduce_block) {
3541 
3542  OMPT_REDUCTION_BEGIN;
3543 
3544  // usage: if team size == 1, no synchronization is required ( Intel
3545  // platforms only )
3546  retval = 1;
3547 
3548  } else if (packed_reduction_method == atomic_reduce_block) {
3549 
3550  retval = 2;
3551 
3552  // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3553  // won't be called by the code gen)
3554  // (it's not quite good, because the checking block has been closed by
3555  // this 'pop',
3556  // but atomic operation has not been executed yet, will be executed
3557  // slightly later, literally on next instruction)
3558  if (__kmp_env_consistency_check)
3559  __kmp_pop_sync(global_tid, ct_reduce, loc);
3560 
3561  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3562  tree_reduce_block)) {
3563 
3564 // AT: performance issue: a real barrier here
3565 // AT: (if primary thread is slow, other threads are blocked here waiting for
3566 // the primary thread to come and release them)
3567 // AT: (it's not what a customer might expect when specifying the NOWAIT clause)
3568 // AT: (specifying NOWAIT won't result in a performance improvement; it'll
3569 // be confusing to a customer)
3570 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3571 // might go faster and be more in line with the sense of NOWAIT
3572 // AT: TO DO: do epcc test and compare times
3573 
3574 // this barrier should be invisible to a customer and to the threading profile
3575 // tool (it's neither a terminating barrier nor customer's code, it's
3576 // used for an internal purpose)
3577 #if OMPT_SUPPORT
3578  // JP: can this barrier potentially lead to task scheduling?
3579  // JP: as long as there is a barrier in the implementation, OMPT should and
3580  // will provide the barrier events
3581  // so we set-up the necessary frame/return addresses.
3582  ompt_frame_t *ompt_frame;
3583  if (ompt_enabled.enabled) {
3584  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3585  if (ompt_frame->enter_frame.ptr == NULL)
3586  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3587  }
3588  OMPT_STORE_RETURN_ADDRESS(global_tid);
3589 #endif
3590 #if USE_ITT_NOTIFY
3591  __kmp_threads[global_tid]->th.th_ident = loc;
3592 #endif
3593  retval =
3594  __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3595  global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3596  retval = (retval != 0) ? (0) : (1);
3597 #if OMPT_SUPPORT && OMPT_OPTIONAL
3598  if (ompt_enabled.enabled) {
3599  ompt_frame->enter_frame = ompt_data_none;
3600  }
3601 #endif
3602 
3603  // all other workers except primary thread should do this pop here
3604  // ( none of other workers will get to __kmpc_end_reduce_nowait() )
3605  if (__kmp_env_consistency_check) {
3606  if (retval == 0) {
3607  __kmp_pop_sync(global_tid, ct_reduce, loc);
3608  }
3609  }
3610 
3611  } else {
3612 
3613  // should never reach this block
3614  KMP_ASSERT(0); // "unexpected method"
3615  }
3616  if (teams_swapped) {
3617  __kmp_restore_swapped_teams(th, team, task_state);
3618  }
3619  KA_TRACE(
3620  10,
3621  ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3622  global_tid, packed_reduction_method, retval));
3623 
3624  return retval;
3625 }
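// Illustrative only: user code like the sketch below is what compilers
// typically lower into a __kmpc_reduce_nowait() / __kmpc_end_reduce_nowait()
// pair around the combine step; the method chosen by
// __kmp_determine_reduction_method() (critical, atomic, or tree) decides which
// of the return-value branches the generated code takes.
/*
  int sum_squares(int n) {
    int total = 0;
    #pragma omp parallel
    {
      #pragma omp for nowait reduction(+ : total)
      for (int i = 0; i < n; ++i)
        total += i * i;
    }
    return total;
  }
*/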
3626 
3635 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3636  kmp_critical_name *lck) {
3637 
3638  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3639 
3640  KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3641  __kmp_assert_valid_gtid(global_tid);
3642 
3643  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3644 
3645  OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3646 
3647  if (packed_reduction_method == critical_reduce_block) {
3648 
3649  __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3650  OMPT_REDUCTION_END;
3651 
3652  } else if (packed_reduction_method == empty_reduce_block) {
3653 
3654  // usage: if team size == 1, no synchronization is required ( on Intel
3655  // platforms only )
3656 
3657  OMPT_REDUCTION_END;
3658 
3659  } else if (packed_reduction_method == atomic_reduce_block) {
3660 
3661  // neither primary thread nor other workers should get here
3662  // (code gen does not generate this call in case 2: atomic reduce block)
3663  // actually it's better to remove this elseif altogether;
3664  // after removal this value will be checked by the 'else' branch and will assert
3665 
3666  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3667  tree_reduce_block)) {
3668 
3669  // only primary thread gets here
3670  // OMPT: tree reduction is annotated in the barrier code
3671 
3672  } else {
3673 
3674  // should never reach this block
3675  KMP_ASSERT(0); // "unexpected method"
3676  }
3677 
3678  if (__kmp_env_consistency_check)
3679  __kmp_pop_sync(global_tid, ct_reduce, loc);
3680 
3681  KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3682  global_tid, packed_reduction_method));
3683 
3684  return;
3685 }
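
/* Illustrative sketch (not part of the runtime): roughly how a compiler lowers
 * "reduction(+:sum) nowait" onto the two entry points above. The return value
 * selects the combining strategy: 1 means this thread should combine into the
 * shared variable and then call __kmpc_end_reduce_nowait(); 2 means each
 * thread should update the shared variable atomically; 0 means there is
 * nothing left for this thread to do. The demo_* names are hypothetical. */
static void demo_add_ints(void *lhs_data, void *rhs_data) {
  *(int *)lhs_data += *(int *)rhs_data; // combine two partial results
}
static void demo_reduce_nowait(ident_t *loc, kmp_int32 gtid,
                               kmp_critical_name *crit, int *shared_sum,
                               int local_sum) {
  switch (__kmpc_reduce_nowait(loc, gtid, /*num_vars=*/1, sizeof(int),
                               &local_sum, demo_add_ints, crit)) {
  case 1: // critical or tree method: this thread holds the combined value
    *shared_sum += local_sum;
    __kmpc_end_reduce_nowait(loc, gtid, crit);
    break;
  case 2: // atomic method: compilers typically emit a __kmpc_atomic_* call;
          // a plain builtin stands in for it in this sketch
    __atomic_fetch_add(shared_sum, local_sum, __ATOMIC_RELAXED);
    break;
  default: // 0: tree method worker; its value was combined during the gather
    break;
  }
}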
3686 
3687 /* 2.a.ii. Reduce Block with a terminating barrier */
3688 
3704 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3705  size_t reduce_size, void *reduce_data,
3706  void (*reduce_func)(void *lhs_data, void *rhs_data),
3707  kmp_critical_name *lck) {
3708  KMP_COUNT_BLOCK(REDUCE_wait);
3709  int retval = 0;
3710  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3711  kmp_info_t *th;
3712  kmp_team_t *team;
3713  int teams_swapped = 0, task_state;
3714 
3715  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3716  __kmp_assert_valid_gtid(global_tid);
3717 
3718  // why do we need this initialization here at all?
3719  // A reduction clause cannot be a stand-alone directive.
3720 
3721  // do not call __kmp_serial_initialize(), it will be called by
3722  // __kmp_parallel_initialize() if needed
3723  // the thread checker may report a false-positive race here
3724  if (!TCR_4(__kmp_init_parallel))
3725  __kmp_parallel_initialize();
3726 
3727  __kmp_resume_if_soft_paused();
3728 
3729 // check correctness of reduce block nesting
3730 #if KMP_USE_DYNAMIC_LOCK
3731  if (__kmp_env_consistency_check)
3732  __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3733 #else
3734  if (__kmp_env_consistency_check)
3735  __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3736 #endif
3737 
3738  th = __kmp_thread_from_gtid(global_tid);
3739  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3740 
3741  packed_reduction_method = __kmp_determine_reduction_method(
3742  loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3743  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3744 
3745  OMPT_REDUCTION_DECL(th, global_tid);
3746 
3747  if (packed_reduction_method == critical_reduce_block) {
3748 
3749  OMPT_REDUCTION_BEGIN;
3750  __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3751  retval = 1;
3752 
3753  } else if (packed_reduction_method == empty_reduce_block) {
3754 
3755  OMPT_REDUCTION_BEGIN;
3756  // usage: if team size == 1, no synchronization is required (Intel
3757  // platforms only)
3758  retval = 1;
3759 
3760  } else if (packed_reduction_method == atomic_reduce_block) {
3761 
3762  retval = 2;
3763 
3764  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3765  tree_reduce_block)) {
3766 
3767 // case tree_reduce_block:
3768 // this barrier should be visible to a customer and to the threading profile
3769 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3770 #if OMPT_SUPPORT
3771  ompt_frame_t *ompt_frame;
3772  if (ompt_enabled.enabled) {
3773  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3774  if (ompt_frame->enter_frame.ptr == NULL)
3775  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3776  }
3777  OMPT_STORE_RETURN_ADDRESS(global_tid);
3778 #endif
3779 #if USE_ITT_NOTIFY
3780  __kmp_threads[global_tid]->th.th_ident =
3781  loc; // needed for correct notification of frames
3782 #endif
3783  retval =
3784  __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3785  global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3786  retval = (retval != 0) ? (0) : (1);
3787 #if OMPT_SUPPORT && OMPT_OPTIONAL
3788  if (ompt_enabled.enabled) {
3789  ompt_frame->enter_frame = ompt_data_none;
3790  }
3791 #endif
3792 
3793  // all workers except the primary thread should do this pop here
3794  // (none of the other workers will enter __kmpc_end_reduce())
3795  if (__kmp_env_consistency_check) {
3796  if (retval == 0) { // 0: all other workers; 1: primary thread
3797  __kmp_pop_sync(global_tid, ct_reduce, loc);
3798  }
3799  }
3800 
3801  } else {
3802 
3803  // should never reach this block
3804  KMP_ASSERT(0); // "unexpected method"
3805  }
3806  if (teams_swapped) {
3807  __kmp_restore_swapped_teams(th, team, task_state);
3808  }
3809 
3810  KA_TRACE(10,
3811  ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3812  global_tid, packed_reduction_method, retval));
3813  return retval;
3814 }
3815 
3826 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3827  kmp_critical_name *lck) {
3828 
3829  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3830  kmp_info_t *th;
3831  kmp_team_t *team;
3832  int teams_swapped = 0, task_state;
3833 
3834  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3835  __kmp_assert_valid_gtid(global_tid);
3836 
3837  th = __kmp_thread_from_gtid(global_tid);
3838  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3839 
3840  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3841 
3842  // this barrier should be visible to a customer and to the threading profile
3843  // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3844  OMPT_REDUCTION_DECL(th, global_tid);
3845 
3846  if (packed_reduction_method == critical_reduce_block) {
3847  __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3848 
3849  OMPT_REDUCTION_END;
3850 
3851 // TODO: implicit barrier: should be exposed
3852 #if OMPT_SUPPORT
3853  ompt_frame_t *ompt_frame;
3854  if (ompt_enabled.enabled) {
3855  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3856  if (ompt_frame->enter_frame.ptr == NULL)
3857  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3858  }
3859  OMPT_STORE_RETURN_ADDRESS(global_tid);
3860 #endif
3861 #if USE_ITT_NOTIFY
3862  __kmp_threads[global_tid]->th.th_ident = loc;
3863 #endif
3864  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3865 #if OMPT_SUPPORT && OMPT_OPTIONAL
3866  if (ompt_enabled.enabled) {
3867  ompt_frame->enter_frame = ompt_data_none;
3868  }
3869 #endif
3870 
3871  } else if (packed_reduction_method == empty_reduce_block) {
3872 
3873  OMPT_REDUCTION_END;
3874 
3875 // usage: if team size == 1, no synchronization is required (Intel platforms only)
3876 
3877 // TODO: implicit barrier: should be exposed
3878 #if OMPT_SUPPORT
3879  ompt_frame_t *ompt_frame;
3880  if (ompt_enabled.enabled) {
3881  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3882  if (ompt_frame->enter_frame.ptr == NULL)
3883  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3884  }
3885  OMPT_STORE_RETURN_ADDRESS(global_tid);
3886 #endif
3887 #if USE_ITT_NOTIFY
3888  __kmp_threads[global_tid]->th.th_ident = loc;
3889 #endif
3890  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3891 #if OMPT_SUPPORT && OMPT_OPTIONAL
3892  if (ompt_enabled.enabled) {
3893  ompt_frame->enter_frame = ompt_data_none;
3894  }
3895 #endif
3896 
3897  } else if (packed_reduction_method == atomic_reduce_block) {
3898 
3899 #if OMPT_SUPPORT
3900  ompt_frame_t *ompt_frame;
3901  if (ompt_enabled.enabled) {
3902  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3903  if (ompt_frame->enter_frame.ptr == NULL)
3904  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3905  }
3906  OMPT_STORE_RETURN_ADDRESS(global_tid);
3907 #endif
3908 // TODO: implicit barrier: should be exposed
3909 #if USE_ITT_NOTIFY
3910  __kmp_threads[global_tid]->th.th_ident = loc;
3911 #endif
3912  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3913 #if OMPT_SUPPORT && OMPT_OPTIONAL
3914  if (ompt_enabled.enabled) {
3915  ompt_frame->enter_frame = ompt_data_none;
3916  }
3917 #endif
3918 
3919  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3920  tree_reduce_block)) {
3921 
3922  // only primary thread executes here (primary releases all other workers)
3923  __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3924  global_tid);
3925 
3926  } else {
3927 
3928  // should never reach this block
3929  KMP_ASSERT(0); // "unexpected method"
3930  }
3931  if (teams_swapped) {
3932  __kmp_restore_swapped_teams(th, team, task_state);
3933  }
3934 
3935  if (__kmp_env_consistency_check)
3936  __kmp_pop_sync(global_tid, ct_reduce, loc);
3937 
3938  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3939  global_tid, packed_reduction_method));
3940 
3941  return;
3942 }
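
/* Sketch of the source-level construct that typically maps onto __kmpc_reduce /
 * __kmpc_end_reduce (no "nowait" clause, so the reduce block is closed by a
 * barrier that profiling tools are allowed to see):
 *
 *   int sum = 0;
 *   #pragma omp parallel for reduction(+ : sum)
 *   for (int i = 0; i < n; ++i)
 *     sum += a[i];
 *
 * The generated code follows the same switch-on-return-value pattern as the
 * nowait sketch above, but calls __kmpc_reduce()/__kmpc_end_reduce() so that
 * the terminating barrier is performed inside the runtime. */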
3943 
3944 #undef __KMP_GET_REDUCTION_METHOD
3945 #undef __KMP_SET_REDUCTION_METHOD
3946 
3947 /* end of interface to fast scalable reduce routines */
3948 
3949 kmp_uint64 __kmpc_get_taskid() {
3950 
3951  kmp_int32 gtid;
3952  kmp_info_t *thread;
3953 
3954  gtid = __kmp_get_gtid();
3955  if (gtid < 0) {
3956  return 0;
3957  }
3958  thread = __kmp_thread_from_gtid(gtid);
3959  return thread->th.th_current_task->td_task_id;
3960 
3961 } // __kmpc_get_taskid
3962 
3963 kmp_uint64 __kmpc_get_parent_taskid() {
3964 
3965  kmp_int32 gtid;
3966  kmp_info_t *thread;
3967  kmp_taskdata_t *parent_task;
3968 
3969  gtid = __kmp_get_gtid();
3970  if (gtid < 0) {
3971  return 0;
3972  }
3973  thread = __kmp_thread_from_gtid(gtid);
3974  parent_task = thread->th.th_current_task->td_parent;
3975  return (parent_task == NULL ? 0 : parent_task->td_task_id);
3976 
3977 } // __kmpc_get_parent_taskid
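
/* Illustrative use from application code (hypothetical): both queries return 0
 * when called from a thread the runtime does not manage (gtid < 0), otherwise
 * the id of the current task or of its parent, e.g.
 *
 *   kmp_uint64 task_id = __kmpc_get_taskid();
 *   kmp_uint64 parent_id = __kmpc_get_parent_taskid();
 */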
3978 
3990 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3991  const struct kmp_dim *dims) {
3992  __kmp_assert_valid_gtid(gtid);
3993  int j, idx;
3994  kmp_int64 last, trace_count;
3995  kmp_info_t *th = __kmp_threads[gtid];
3996  kmp_team_t *team = th->th.th_team;
3997  kmp_uint32 *flags;
3998  kmp_disp_t *pr_buf = th->th.th_dispatch;
3999  dispatch_shared_info_t *sh_buf;
4000 
4001  KA_TRACE(
4002  20,
4003  ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4004  gtid, num_dims, !team->t.t_serialized));
4005  KMP_DEBUG_ASSERT(dims != NULL);
4006  KMP_DEBUG_ASSERT(num_dims > 0);
4007 
4008  if (team->t.t_serialized) {
4009  KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4010  return; // no dependencies if team is serialized
4011  }
4012  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4013  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4014  // the next loop
4015  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4016 
4017  // Save bounds info into allocated private buffer
4018  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4019  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4020  th, sizeof(kmp_int64) * (4 * num_dims + 1));
4021  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4022  pr_buf->th_doacross_info[0] =
4023  (kmp_int64)num_dims; // first element is number of dimensions
4024  // Also save the address of num_done so it can be accessed later without
4025  // knowing the buffer index
4026  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4027  pr_buf->th_doacross_info[2] = dims[0].lo;
4028  pr_buf->th_doacross_info[3] = dims[0].up;
4029  pr_buf->th_doacross_info[4] = dims[0].st;
4030  last = 5;
4031  for (j = 1; j < num_dims; ++j) {
4032  kmp_int64
4033  range_length; // holds the range length of each dimension except dims[0]
4034  if (dims[j].st == 1) { // most common case
4035  // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4036  range_length = dims[j].up - dims[j].lo + 1;
4037  } else {
4038  if (dims[j].st > 0) {
4039  KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4040  range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4041  } else { // negative increment
4042  KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4043  range_length =
4044  (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4045  }
4046  }
4047  pr_buf->th_doacross_info[last++] = range_length;
4048  pr_buf->th_doacross_info[last++] = dims[j].lo;
4049  pr_buf->th_doacross_info[last++] = dims[j].up;
4050  pr_buf->th_doacross_info[last++] = dims[j].st;
4051  }
4052 
4053  // Compute total trip count.
4054  // Start with range of dims[0] which we don't need to keep in the buffer.
4055  if (dims[0].st == 1) { // most common case
4056  trace_count = dims[0].up - dims[0].lo + 1;
4057  } else if (dims[0].st > 0) {
4058  KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4059  trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4060  } else { // negative increment
4061  KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4062  trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4063  }
4064  for (j = 1; j < num_dims; ++j) {
4065  trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4066  }
4067  KMP_DEBUG_ASSERT(trace_count > 0);
4068 
4069  // Check whether the shared buffer is still occupied by a previous loop
4070  // (buffers are reused every __kmp_dispatch_num_buffers loops)
4071  if (idx != sh_buf->doacross_buf_idx) {
4072  // Shared buffer is occupied, wait for it to be free
4073  __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4074  __kmp_eq_4, NULL);
4075  }
4076 #if KMP_32_BIT_ARCH
4077  // Check if we are the first thread: after the CAS the first thread gets back
4078  // 0 (NULL), others get 1 while initialization is in progress, or the allocated
4079  // pointer otherwise; treat the pointer as a volatile integer until allocated.
4080  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4081  (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4082 #else
4083  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4084  (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4085 #endif
4086  if (flags == NULL) {
4087  // we are the first thread, allocate the array of flags
4088  size_t size =
4089  (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4090  flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4091  KMP_MB();
4092  sh_buf->doacross_flags = flags;
4093  } else if (flags == (kmp_uint32 *)1) {
4094 #if KMP_32_BIT_ARCH
4095  // initialization is still in progress, need to wait
4096  while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4097 #else
4098  while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4099 #endif
4100  KMP_YIELD(TRUE);
4101  KMP_MB();
4102  } else {
4103  KMP_MB();
4104  }
4105  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4106  pr_buf->th_doacross_flags =
4107  sh_buf->doacross_flags; // save private copy in order to not
4108  // touch shared buffer on each iteration
4109  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4110 }
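
/* Worked example of the bookkeeping above, assuming a 2-D doacross loop with
 * dims[0] = {lo 0, up 99, st 1} and dims[1] = {lo 0, up 9, st 1}:
 *   th_doacross_info = { 2, &num_done, 0, 99, 1, 10, 0, 9, 1 }
 *     [0] number of dimensions, [1] address of the shared num_done counter,
 *     [2..4] lo/up/st of dims[0], [5..8] range/lo/up/st of dims[1].
 *   trace_count = 100 * 10 = 1000 iterations, so the shared flag array is
 *   1000 / 8 + 8 = 133 bytes, giving one bit per collapsed iteration. */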
4111 
4112 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4113  __kmp_assert_valid_gtid(gtid);
4114  kmp_int64 shft;
4115  size_t num_dims, i;
4116  kmp_uint32 flag;
4117  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4118  kmp_info_t *th = __kmp_threads[gtid];
4119  kmp_team_t *team = th->th.th_team;
4120  kmp_disp_t *pr_buf;
4121  kmp_int64 lo, up, st;
4122 
4123  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4124  if (team->t.t_serialized) {
4125  KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4126  return; // no dependencies if team is serialized
4127  }
4128 
4129  // calculate sequential iteration number and check out-of-bounds condition
4130  pr_buf = th->th.th_dispatch;
4131  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4132  num_dims = (size_t)pr_buf->th_doacross_info[0];
4133  lo = pr_buf->th_doacross_info[2];
4134  up = pr_buf->th_doacross_info[3];
4135  st = pr_buf->th_doacross_info[4];
4136 #if OMPT_SUPPORT && OMPT_OPTIONAL
4137  ompt_dependence_t deps[num_dims];
4138 #endif
4139  if (st == 1) { // most common case
4140  if (vec[0] < lo || vec[0] > up) {
4141  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4142  "bounds [%lld,%lld]\n",
4143  gtid, vec[0], lo, up));
4144  return;
4145  }
4146  iter_number = vec[0] - lo;
4147  } else if (st > 0) {
4148  if (vec[0] < lo || vec[0] > up) {
4149  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4150  "bounds [%lld,%lld]\n",
4151  gtid, vec[0], lo, up));
4152  return;
4153  }
4154  iter_number = (kmp_uint64)(vec[0] - lo) / st;
4155  } else { // negative increment
4156  if (vec[0] > lo || vec[0] < up) {
4157  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4158  "bounds [%lld,%lld]\n",
4159  gtid, vec[0], lo, up));
4160  return;
4161  }
4162  iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4163  }
4164 #if OMPT_SUPPORT && OMPT_OPTIONAL
4165  deps[0].variable.value = iter_number;
4166  deps[0].dependence_type = ompt_dependence_type_sink;
4167 #endif
4168  for (i = 1; i < num_dims; ++i) {
4169  kmp_int64 iter, ln;
4170  size_t j = i * 4;
4171  ln = pr_buf->th_doacross_info[j + 1];
4172  lo = pr_buf->th_doacross_info[j + 2];
4173  up = pr_buf->th_doacross_info[j + 3];
4174  st = pr_buf->th_doacross_info[j + 4];
4175  if (st == 1) {
4176  if (vec[i] < lo || vec[i] > up) {
4177  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4178  "bounds [%lld,%lld]\n",
4179  gtid, vec[i], lo, up));
4180  return;
4181  }
4182  iter = vec[i] - lo;
4183  } else if (st > 0) {
4184  if (vec[i] < lo || vec[i] > up) {
4185  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4186  "bounds [%lld,%lld]\n",
4187  gtid, vec[i], lo, up));
4188  return;
4189  }
4190  iter = (kmp_uint64)(vec[i] - lo) / st;
4191  } else { // st < 0
4192  if (vec[i] > lo || vec[i] < up) {
4193  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4194  "bounds [%lld,%lld]\n",
4195  gtid, vec[i], lo, up));
4196  return;
4197  }
4198  iter = (kmp_uint64)(lo - vec[i]) / (-st);
4199  }
4200  iter_number = iter + ln * iter_number;
4201 #if OMPT_SUPPORT && OMPT_OPTIONAL
4202  deps[i].variable.value = iter;
4203  deps[i].dependence_type = ompt_dependence_type_sink;
4204 #endif
4205  }
4206  shft = iter_number % 32; // use 32-bit granularity
4207  iter_number >>= 5; // divided by 32
4208  flag = 1 << shft;
4209  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4210  KMP_YIELD(TRUE);
4211  }
4212  KMP_MB();
4213 #if OMPT_SUPPORT && OMPT_OPTIONAL
4214  if (ompt_enabled.ompt_callback_dependences) {
4215  ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4216  &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4217  }
4218 #endif
4219  KA_TRACE(20,
4220  ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4221  gtid, (iter_number << 5) + shft));
4222 }
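
/* Worked example of the index math above (2-D loop, strides of 1, inner range
 * length ln = 10): a sink vector (i, j) collapses to
 *   iter_number = (i - lo0) * 10 + (j - lo1)
 * so waiting on (i, j) = (7, 3) with lo0 = lo1 = 0 gives iter_number = 73,
 * i.e. bit 73 % 32 = 9 of word 73 >> 5 = 2 in th_doacross_flags. The loop
 * above spins until __kmpc_doacross_post() of the matching source iteration
 * sets that bit. */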
4223 
4224 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4225  __kmp_assert_valid_gtid(gtid);
4226  kmp_int64 shft;
4227  size_t num_dims, i;
4228  kmp_uint32 flag;
4229  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4230  kmp_info_t *th = __kmp_threads[gtid];
4231  kmp_team_t *team = th->th.th_team;
4232  kmp_disp_t *pr_buf;
4233  kmp_int64 lo, st;
4234 
4235  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4236  if (team->t.t_serialized) {
4237  KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4238  return; // no dependencies if team is serialized
4239  }
4240 
4241  // calculate sequential iteration number (same as in "wait" but no
4242  // out-of-bounds checks)
4243  pr_buf = th->th.th_dispatch;
4244  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4245  num_dims = (size_t)pr_buf->th_doacross_info[0];
4246  lo = pr_buf->th_doacross_info[2];
4247  st = pr_buf->th_doacross_info[4];
4248 #if OMPT_SUPPORT && OMPT_OPTIONAL
4249  ompt_dependence_t deps[num_dims];
4250 #endif
4251  if (st == 1) { // most common case
4252  iter_number = vec[0] - lo;
4253  } else if (st > 0) {
4254  iter_number = (kmp_uint64)(vec[0] - lo) / st;
4255  } else { // negative increment
4256  iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4257  }
4258 #if OMPT_SUPPORT && OMPT_OPTIONAL
4259  deps[0].variable.value = iter_number;
4260  deps[0].dependence_type = ompt_dependence_type_source;
4261 #endif
4262  for (i = 1; i < num_dims; ++i) {
4263  kmp_int64 iter, ln;
4264  size_t j = i * 4;
4265  ln = pr_buf->th_doacross_info[j + 1];
4266  lo = pr_buf->th_doacross_info[j + 2];
4267  st = pr_buf->th_doacross_info[j + 4];
4268  if (st == 1) {
4269  iter = vec[i] - lo;
4270  } else if (st > 0) {
4271  iter = (kmp_uint64)(vec[i] - lo) / st;
4272  } else { // st < 0
4273  iter = (kmp_uint64)(lo - vec[i]) / (-st);
4274  }
4275  iter_number = iter + ln * iter_number;
4276 #if OMPT_SUPPORT && OMPT_OPTIONAL
4277  deps[i].variable.value = iter;
4278  deps[i].dependence_type = ompt_dependence_type_source;
4279 #endif
4280  }
4281 #if OMPT_SUPPORT && OMPT_OPTIONAL
4282  if (ompt_enabled.ompt_callback_dependences) {
4283  ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4284  &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4285  }
4286 #endif
4287  shft = iter_number % 32; // use 32-bit granularity
4288  iter_number >>= 5; // divided by 32
4289  flag = 1 << shft;
4290  KMP_MB();
4291  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4292  KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4293  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4294  (iter_number << 5) + shft));
4295 }
4296 
4297 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4298  __kmp_assert_valid_gtid(gtid);
4299  kmp_int32 num_done;
4300  kmp_info_t *th = __kmp_threads[gtid];
4301  kmp_team_t *team = th->th.th_team;
4302  kmp_disp_t *pr_buf = th->th.th_dispatch;
4303 
4304  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4305  if (team->t.t_serialized) {
4306  KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4307  return; // nothing to do
4308  }
4309  num_done =
4310  KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4311  if (num_done == th->th.th_team_nproc) {
4312  // we are the last thread, need to free shared resources
4313  int idx = pr_buf->th_doacross_buf_idx - 1;
4314  dispatch_shared_info_t *sh_buf =
4315  &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4316  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4317  (kmp_int64)&sh_buf->doacross_num_done);
4318  KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4319  KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4320  __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4321  sh_buf->doacross_flags = NULL;
4322  sh_buf->doacross_num_done = 0;
4323  sh_buf->doacross_buf_idx +=
4324  __kmp_dispatch_num_buffers; // free buffer for future re-use
4325  }
4326  // free private resources (need to keep buffer index forever)
4327  pr_buf->th_doacross_flags = NULL;
4328  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4329  pr_buf->th_doacross_info = NULL;
4330  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4331 }
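
/* Illustrative sketch (not part of the runtime): the call sequence a compiler
 * typically generates for a doacross loop such as
 *
 *   #pragma omp for ordered(2)
 *   for (int i = 0; i < n; ++i)
 *     for (int j = 0; j < m; ++j) {
 *       #pragma omp ordered depend(sink : i - 1, j)
 *       work(i, j);
 *       #pragma omp ordered depend(source)
 *     }
 *
 * Each thread in the team executes, once before its chunk:
 *   struct kmp_dim dims[2];
 *   dims[0].lo = 0; dims[0].up = n - 1; dims[0].st = 1;
 *   dims[1].lo = 0; dims[1].up = m - 1; dims[1].st = 1;
 *   __kmpc_doacross_init(loc, gtid, 2, dims);
 * then, for each iteration (i, j) it owns:
 *   kmp_int64 sink[2] = {i - 1, j};
 *   __kmpc_doacross_wait(loc, gtid, sink);  // out-of-range sinks (i == 0)
 *                                           // return immediately
 *   work(i, j);
 *   kmp_int64 src[2] = {i, j};
 *   __kmpc_doacross_post(loc, gtid, src);
 * and once after the loop:
 *   __kmpc_doacross_fini(loc, gtid);
 * (loc, gtid and work() are placeholders in this sketch.) */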
4332 
4333 /* omp_alloc/omp_calloc/omp_free only defined for C/C++, not for Fortran */
4334 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4335  return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4336 }
4337 
4338 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4339  return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator);
4340 }
4341 
4342 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4343  omp_allocator_handle_t free_allocator) {
4344  return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4345  free_allocator);
4346 }
4347 
4348 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4349  __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4350 }
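
// Illustrative usage sketch (user-level view, not runtime code): the wrappers
// above simply forward to the __kmpc_* allocation entry points with the
// caller's gtid. omp_default_mem_alloc is a predefined allocator from <omp.h>;
// the demo function itself is hypothetical.
static void demo_allocator_usage(void) {
  // allocate 1024 doubles from the default memory space
  double *buf =
      (double *)omp_alloc(1024 * sizeof(double), omp_default_mem_alloc);
  if (buf == NULL)
    return; // allocation can fail depending on the allocator's fallback traits
  buf[0] = 0.0;
  // grow the buffer; the second allocator argument is used to free the old one
  buf = (double *)omp_realloc(buf, 2048 * sizeof(double), omp_default_mem_alloc,
                              omp_default_mem_alloc);
  omp_free(buf, omp_default_mem_alloc);
}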
4351 
4352 int __kmpc_get_target_offload(void) {
4353  if (!__kmp_init_serial) {
4354  __kmp_serial_initialize();
4355  }
4356  return __kmp_target_offload;
4357 }
4358 
4359 int __kmpc_pause_resource(kmp_pause_status_t level) {
4360  if (!__kmp_init_serial) {
4361  return 1; // Can't pause if runtime is not initialized
4362  }
4363  return __kmp_pause_resource(level);
4364 }
4365 
4366 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4367  if (!__kmp_init_serial)
4368  __kmp_serial_initialize();
4369 
4370  KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4371 
4372 #if OMPT_SUPPORT
4373  if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4374  ompt_callbacks.ompt_callback(ompt_callback_error)(
4375  (ompt_severity_t)severity, message, KMP_STRLEN(message),
4376  OMPT_GET_RETURN_ADDRESS(0));
4377  }
4378 #endif // OMPT_SUPPORT
4379 
4380  char *src_loc;
4381  if (loc && loc->psource) {
4382  kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4383  src_loc =
4384  __kmp_str_format("%s:%s:%s", str_loc.file, str_loc.line, str_loc.col);
4385  __kmp_str_loc_free(&str_loc);
4386  } else {
4387  src_loc = __kmp_str_format("unknown");
4388  }
4389 
4390  if (severity == severity_warning)
4391  KMP_WARNING(UserDirectedWarning, src_loc, message);
4392  else
4393  KMP_FATAL(UserDirectedError, src_loc, message);
4394 
4395  __kmp_str_free(&src_loc);
4396 }
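
/* Sketch: the OpenMP 5.1 "error" directive is what typically lowers to the
 * entry point above, e.g.
 *
 *   #pragma omp error at(execution) severity(warning) message("check input")
 *
 * at(execution) produces a runtime call equivalent to
 * __kmpc_error(loc, severity_warning, "check input"), while at(compilation)
 * is diagnosed entirely by the compiler and emits no runtime call. */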
4397 
4398 #ifdef KMP_USE_VERSION_SYMBOLS
4399 // For GOMP compatibility there are two versions of each omp_* API.
4400 // One is the plain C symbol and one is the Fortran symbol with an appended
4401 // underscore. When we implement a specific ompc_* version of an omp_*
4402 // function, we want the plain GOMP versioned symbol to alias the ompc_* version
4403 // instead of the Fortran versions in kmp_ftn_entry.h.
4404 extern "C" {
4405 // Have to undef these from omp.h so they aren't translated into
4406 // their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
4407 #ifdef omp_set_affinity_format
4408 #undef omp_set_affinity_format
4409 #endif
4410 #ifdef omp_get_affinity_format
4411 #undef omp_get_affinity_format
4412 #endif
4413 #ifdef omp_display_affinity
4414 #undef omp_display_affinity
4415 #endif
4416 #ifdef omp_capture_affinity
4417 #undef omp_capture_affinity
4418 #endif
4419 KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
4420  "OMP_5.0");
4421 KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
4422  "OMP_5.0");
4423 KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
4424  "OMP_5.0");
4425 KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
4426  "OMP_5.0");
4427 } // extern "C"
4428 #endif