LLVM OpenMP* Runtime Library
kmp_affinity.h
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 
19 #if KMP_AFFINITY_SUPPORTED
20 #if KMP_USE_HWLOC
21 class KMPHwlocAffinity : public KMPAffinity {
22 public:
23  class Mask : public KMPAffinity::Mask {
24  hwloc_cpuset_t mask;
25 
26  public:
27  Mask() {
28  mask = hwloc_bitmap_alloc();
29  this->zero();
30  }
31  ~Mask() { hwloc_bitmap_free(mask); }
32  void set(int i) override { hwloc_bitmap_set(mask, i); }
33  bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
34  void clear(int i) override { hwloc_bitmap_clr(mask, i); }
35  void zero() override { hwloc_bitmap_zero(mask); }
36  void copy(const KMPAffinity::Mask *src) override {
37  const Mask *convert = static_cast<const Mask *>(src);
38  hwloc_bitmap_copy(mask, convert->mask);
39  }
40  void bitwise_and(const KMPAffinity::Mask *rhs) override {
41  const Mask *convert = static_cast<const Mask *>(rhs);
42  hwloc_bitmap_and(mask, mask, convert->mask);
43  }
44  void bitwise_or(const KMPAffinity::Mask *rhs) override {
45  const Mask *convert = static_cast<const Mask *>(rhs);
46  hwloc_bitmap_or(mask, mask, convert->mask);
47  }
48  void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
49  int begin() const override { return hwloc_bitmap_first(mask); }
50  int end() const override { return -1; }
51  int next(int previous) const override {
52  return hwloc_bitmap_next(mask, previous);
53  }
54  int get_system_affinity(bool abort_on_error) override {
55  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
56  "Illegal get affinity operation when not capable");
57  long retval =
58  hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
59  if (retval >= 0) {
60  return 0;
61  }
62  int error = errno;
63  if (abort_on_error) {
64  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
65  }
66  return error;
67  }
68  int set_system_affinity(bool abort_on_error) const override {
69  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
70  "Illegal set affinity operation when not capable");
71  long retval =
72  hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
73  if (retval >= 0) {
74  return 0;
75  }
76  int error = errno;
77  if (abort_on_error) {
78  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
79  }
80  return error;
81  }
82 #if KMP_OS_WINDOWS
83  int set_process_affinity(bool abort_on_error) const override {
84  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
85  "Illegal set process affinity operation when not capable");
86  int error = 0;
87  const hwloc_topology_support *support =
88  hwloc_topology_get_support(__kmp_hwloc_topology);
89  if (support->cpubind->set_proc_cpubind) {
90  int retval;
91  retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
92  HWLOC_CPUBIND_PROCESS);
93  if (retval >= 0)
94  return 0;
95  error = errno;
96  if (abort_on_error)
97  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
98  }
99  return error;
100  }
101 #endif
102  int get_proc_group() const override {
103  int group = -1;
104 #if KMP_OS_WINDOWS
105  if (__kmp_num_proc_groups == 1) {
106  return 1;
107  }
108  for (int i = 0; i < __kmp_num_proc_groups; i++) {
109  // On Windows, the long type is always 32 bits
110  unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
111  unsigned long second_32_bits =
112  hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
113  if (first_32_bits == 0 && second_32_bits == 0) {
114  continue;
115  }
116  if (group >= 0) {
117  return -1;
118  }
119  group = i;
120  }
121 #endif /* KMP_OS_WINDOWS */
122  return group;
123  }
124  };
125  void determine_capable(const char *var) override {
126  const hwloc_topology_support *topology_support;
127  if (__kmp_hwloc_topology == NULL) {
128  if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
129  __kmp_hwloc_error = TRUE;
130  if (__kmp_affinity_verbose)
131  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
132  }
133  if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
134  __kmp_hwloc_error = TRUE;
135  if (__kmp_affinity_verbose)
136  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
137  }
138  }
139  topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
140  // Is the system capable of setting/getting this thread's affinity?
141  // Also, is topology discovery possible? (pu indicates ability to discover
142  // processing units). And finally, were there no errors when calling any
143  // hwloc_* API functions?
144  if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
145  topology_support->cpubind->get_thisthread_cpubind &&
146  topology_support->discovery->pu && !__kmp_hwloc_error) {
147  // enables affinity according to KMP_AFFINITY_CAPABLE() macro
148  KMP_AFFINITY_ENABLE(TRUE);
149  } else {
150  // indicate that hwloc didn't work and disable affinity
151  __kmp_hwloc_error = TRUE;
152  KMP_AFFINITY_DISABLE();
153  }
154  }
155  void bind_thread(int which) override {
156  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
157  "Illegal set affinity operation when not capable");
158  KMPAffinity::Mask *mask;
159  KMP_CPU_ALLOC_ON_STACK(mask);
160  KMP_CPU_ZERO(mask);
161  KMP_CPU_SET(which, mask);
162  __kmp_set_system_affinity(mask, TRUE);
163  KMP_CPU_FREE_FROM_STACK(mask);
164  }
165  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
166  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
167  KMPAffinity::Mask *allocate_mask_array(int num) override {
168  return new Mask[num];
169  }
170  void deallocate_mask_array(KMPAffinity::Mask *array) override {
171  Mask *hwloc_array = static_cast<Mask *>(array);
172  delete[] hwloc_array;
173  }
174  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
175  int index) override {
176  Mask *hwloc_array = static_cast<Mask *>(array);
177  return &(hwloc_array[index]);
178  }
179  api_type get_api_type() const override { return HWLOC; }
180 };
181 #endif /* KMP_USE_HWLOC */
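/* Illustrative sketch (not part of the original header): both the hwloc mask
   above and the native masks below are meant to be walked with the
   begin()/next()/end() protocol, which the runtime wraps in iteration macros
   elsewhere. Assuming __kmp_affinity_dispatch (the runtime's active
   KMPAffinity implementation, declared elsewhere) has been initialized, a
   minimal loop over this thread's current affinity could look like:

     KMPAffinity::Mask *mask = __kmp_affinity_dispatch->allocate_mask();
     mask->get_system_affinity(false); // query this thread's affinity
     for (int i = mask->begin(); i != mask->end(); i = mask->next(i)) {
       // i is the OS processor id of a set bit
     }
     __kmp_affinity_dispatch->deallocate_mask(mask);
*/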
182 
183 #if KMP_OS_LINUX || KMP_OS_FREEBSD
184 #if KMP_OS_LINUX
185 /* On some of the older OSes that we build on, these constants aren't present
186  in <asm/unistd.h>, #included from <sys/syscall.h>. They must be the same on
187  all systems of the same arch where they are defined, and they cannot change:
188  they are set in stone forever. */
189 #include <sys/syscall.h>
190 #if KMP_ARCH_X86 || KMP_ARCH_ARM
191 #ifndef __NR_sched_setaffinity
192 #define __NR_sched_setaffinity 241
193 #elif __NR_sched_setaffinity != 241
194 #error Wrong code for setaffinity system call.
195 #endif /* __NR_sched_setaffinity */
196 #ifndef __NR_sched_getaffinity
197 #define __NR_sched_getaffinity 242
198 #elif __NR_sched_getaffinity != 242
199 #error Wrong code for getaffinity system call.
200 #endif /* __NR_sched_getaffinity */
201 #elif KMP_ARCH_AARCH64
202 #ifndef __NR_sched_setaffinity
203 #define __NR_sched_setaffinity 122
204 #elif __NR_sched_setaffinity != 122
205 #error Wrong code for setaffinity system call.
206 #endif /* __NR_sched_setaffinity */
207 #ifndef __NR_sched_getaffinity
208 #define __NR_sched_getaffinity 123
209 #elif __NR_sched_getaffinity != 123
210 #error Wrong code for getaffinity system call.
211 #endif /* __NR_sched_getaffinity */
212 #elif KMP_ARCH_RISCV64
213 #ifndef __NR_sched_setaffinity
214 #define __NR_sched_setaffinity 122
215 #elif __NR_sched_setaffinity != 122
216 #error Wrong code for setaffinity system call.
217 #endif /* __NR_sched_setaffinity */
218 #ifndef __NR_sched_getaffinity
219 #define __NR_sched_getaffinity 123
220 #elif __NR_sched_getaffinity != 123
221 #error Wrong code for getaffinity system call.
222 #endif /* __NR_sched_getaffinity */
223 #elif KMP_ARCH_X86_64
224 #ifndef __NR_sched_setaffinity
225 #define __NR_sched_setaffinity 203
226 #elif __NR_sched_setaffinity != 203
227 #error Wrong code for setaffinity system call.
228 #endif /* __NR_sched_setaffinity */
229 #ifndef __NR_sched_getaffinity
230 #define __NR_sched_getaffinity 204
231 #elif __NR_sched_getaffinity != 204
232 #error Wrong code for getaffinity system call.
233 #endif /* __NR_sched_getaffinity */
234 #elif KMP_ARCH_PPC64
235 #ifndef __NR_sched_setaffinity
236 #define __NR_sched_setaffinity 222
237 #elif __NR_sched_setaffinity != 222
238 #error Wrong code for setaffinity system call.
239 #endif /* __NR_sched_setaffinity */
240 #ifndef __NR_sched_getaffinity
241 #define __NR_sched_getaffinity 223
242 #elif __NR_sched_getaffinity != 223
243 #error Wrong code for getaffinity system call.
244 #endif /* __NR_sched_getaffinity */
245 #elif KMP_ARCH_MIPS
246 #ifndef __NR_sched_setaffinity
247 #define __NR_sched_setaffinity 4239
248 #elif __NR_sched_setaffinity != 4239
249 #error Wrong code for setaffinity system call.
250 #endif /* __NR_sched_setaffinity */
251 #ifndef __NR_sched_getaffinity
252 #define __NR_sched_getaffinity 4240
253 #elif __NR_sched_getaffinity != 4240
254 #error Wrong code for getaffinity system call.
255 #endif /* __NR_sched_getaffinity */
256 #elif KMP_ARCH_MIPS64
257 #ifndef __NR_sched_setaffinity
258 #define __NR_sched_setaffinity 5195
259 #elif __NR_sched_setaffinity != 5195
260 #error Wrong code for setaffinity system call.
261 #endif /* __NR_sched_setaffinity */
262 #ifndef __NR_sched_getaffinity
263 #define __NR_sched_getaffinity 5196
264 #elif __NR_sched_getaffinity != 5196
265 #error Wrong code for getaffinity system call.
266 #endif /* __NR_sched_getaffinity */
267 #else
268 #error Unknown or unsupported architecture
269 #endif /* KMP_ARCH_* */
270 #elif KMP_OS_FREEBSD
271 #include <pthread.h>
272 #include <pthread_np.h>
273 #endif
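/* Illustrative sketch (not part of the original header): on Linux the class
   below talks to the kernel directly through syscall(2) rather than a libc
   wrapper, which is why the __NR_sched_*affinity numbers above must match the
   kernel ABI of the target architecture. A standalone equivalent of the "get"
   path, using a fixed-size buffer sized for up to 1024 CPUs, could look like:

     #include <sys/syscall.h>
     #include <unistd.h>

     unsigned long buf[16] = {0}; // 16 * 64 bits = 1024 CPUs worth of mask
     long ret = syscall(__NR_sched_getaffinity, 0, sizeof(buf), buf);
     // pid 0 means the calling thread; ret < 0 means failure, check errno
*/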
274 class KMPNativeAffinity : public KMPAffinity {
275  class Mask : public KMPAffinity::Mask {
276  typedef unsigned long mask_t;
277  typedef decltype(__kmp_affin_mask_size) mask_size_type;
278  static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
279  static const mask_t ONE = 1;
280  mask_size_type get_num_mask_types() const {
281  return __kmp_affin_mask_size / sizeof(mask_t);
282  }
283 
284  public:
285  mask_t *mask;
286  Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
287  ~Mask() {
288  if (mask)
289  __kmp_free(mask);
290  }
291  void set(int i) override {
292  mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
293  }
294  bool is_set(int i) const override {
295  return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
296  }
297  void clear(int i) override {
298  mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
299  }
300  void zero() override {
301  mask_size_type e = get_num_mask_types();
302  for (mask_size_type i = 0; i < e; ++i)
303  mask[i] = (mask_t)0;
304  }
305  void copy(const KMPAffinity::Mask *src) override {
306  const Mask *convert = static_cast<const Mask *>(src);
307  mask_size_type e = get_num_mask_types();
308  for (mask_size_type i = 0; i < e; ++i)
309  mask[i] = convert->mask[i];
310  }
311  void bitwise_and(const KMPAffinity::Mask *rhs) override {
312  const Mask *convert = static_cast<const Mask *>(rhs);
313  mask_size_type e = get_num_mask_types();
314  for (mask_size_type i = 0; i < e; ++i)
315  mask[i] &= convert->mask[i];
316  }
317  void bitwise_or(const KMPAffinity::Mask *rhs) override {
318  const Mask *convert = static_cast<const Mask *>(rhs);
319  mask_size_type e = get_num_mask_types();
320  for (mask_size_type i = 0; i < e; ++i)
321  mask[i] |= convert->mask[i];
322  }
323  void bitwise_not() override {
324  mask_size_type e = get_num_mask_types();
325  for (mask_size_type i = 0; i < e; ++i)
326  mask[i] = ~(mask[i]);
327  }
328  int begin() const override {
329  int retval = 0;
330  while (retval < end() && !is_set(retval))
331  ++retval;
332  return retval;
333  }
334  int end() const override {
335  int e;
336  __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
337  return e;
338  }
339  int next(int previous) const override {
340  int retval = previous + 1;
341  while (retval < end() && !is_set(retval))
342  ++retval;
343  return retval;
344  }
345  int get_system_affinity(bool abort_on_error) override {
346  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
347  "Illegal get affinity operation when not capable");
348 #if KMP_OS_LINUX
349  long retval =
350  syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
351 #elif KMP_OS_FREEBSD
352  int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
353  reinterpret_cast<cpuset_t *>(mask));
354  int retval = (r == 0 ? 0 : -1);
355 #endif
356  if (retval >= 0) {
357  return 0;
358  }
359  int error = errno;
360  if (abort_on_error) {
361  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
362  }
363  return error;
364  }
365  int set_system_affinity(bool abort_on_error) const override {
366  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
367  "Illegal set affinity operation when not capable");
368 #if KMP_OS_LINUX
369  long retval =
370  syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
371 #elif KMP_OS_FREEBSD
372  int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
373  reinterpret_cast<cpuset_t *>(mask));
374  int retval = (r == 0 ? 0 : -1);
375 #endif
376  if (retval >= 0) {
377  return 0;
378  }
379  int error = errno;
380  if (abort_on_error) {
381  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
382  }
383  return error;
384  }
385  };
386  void determine_capable(const char *env_var) override {
387  __kmp_affinity_determine_capable(env_var);
388  }
389  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
390  KMPAffinity::Mask *allocate_mask() override {
391  KMPNativeAffinity::Mask *retval = new Mask();
392  return retval;
393  }
394  void deallocate_mask(KMPAffinity::Mask *m) override {
395  KMPNativeAffinity::Mask *native_mask =
396  static_cast<KMPNativeAffinity::Mask *>(m);
397  delete native_mask;
398  }
399  KMPAffinity::Mask *allocate_mask_array(int num) override {
400  return new Mask[num];
401  }
402  void deallocate_mask_array(KMPAffinity::Mask *array) override {
403  Mask *linux_array = static_cast<Mask *>(array);
404  delete[] linux_array;
405  }
406  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
407  int index) override {
408  Mask *linux_array = static_cast<Mask *>(array);
409  return &(linux_array[index]);
410  }
411  api_type get_api_type() const override { return NATIVE_OS; }
412 };
413 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
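/* Illustrative sketch (not part of the original header): the native Mask above
   stores the affinity set as an array of mask_t words, so a processor index i
   is split into a word index and a bit index. Assuming a 64-bit mask_t:

     const unsigned BITS = 64;              // BITS_PER_MASK_T
     unsigned word = i / BITS;              // which mask_t word holds bit i
     unsigned bit = i % BITS;               // position inside that word
     mask[word] |= (1ULL << bit);           // set   (cf. Mask::set)
     bool on = mask[word] & (1ULL << bit);  // test  (cf. Mask::is_set)
     mask[word] &= ~(1ULL << bit);          // clear (cf. Mask::clear)
*/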
414 
415 #if KMP_OS_WINDOWS
416 class KMPNativeAffinity : public KMPAffinity {
417  class Mask : public KMPAffinity::Mask {
418  typedef ULONG_PTR mask_t;
419  static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
420  mask_t *mask;
421 
422  public:
423  Mask() {
424  mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
425  }
426  ~Mask() {
427  if (mask)
428  __kmp_free(mask);
429  }
430  void set(int i) override {
431  mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
432  }
433  bool is_set(int i) const override {
434  return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
435  }
436  void clear(int i) override {
437  mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
438  }
439  void zero() override {
440  for (int i = 0; i < __kmp_num_proc_groups; ++i)
441  mask[i] = 0;
442  }
443  void copy(const KMPAffinity::Mask *src) override {
444  const Mask *convert = static_cast<const Mask *>(src);
445  for (int i = 0; i < __kmp_num_proc_groups; ++i)
446  mask[i] = convert->mask[i];
447  }
448  void bitwise_and(const KMPAffinity::Mask *rhs) override {
449  const Mask *convert = static_cast<const Mask *>(rhs);
450  for (int i = 0; i < __kmp_num_proc_groups; ++i)
451  mask[i] &= convert->mask[i];
452  }
453  void bitwise_or(const KMPAffinity::Mask *rhs) override {
454  const Mask *convert = static_cast<const Mask *>(rhs);
455  for (int i = 0; i < __kmp_num_proc_groups; ++i)
456  mask[i] |= convert->mask[i];
457  }
458  void bitwise_not() override {
459  for (int i = 0; i < __kmp_num_proc_groups; ++i)
460  mask[i] = ~(mask[i]);
461  }
462  int begin() const override {
463  int retval = 0;
464  while (retval < end() && !is_set(retval))
465  ++retval;
466  return retval;
467  }
468  int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
469  int next(int previous) const override {
470  int retval = previous + 1;
471  while (retval < end() && !is_set(retval))
472  ++retval;
473  return retval;
474  }
475  int set_process_affinity(bool abort_on_error) const override {
476  if (__kmp_num_proc_groups <= 1) {
477  if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
478  DWORD error = GetLastError();
479  if (abort_on_error) {
480  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
481  __kmp_msg_null);
482  }
483  return error;
484  }
485  }
486  return 0;
487  }
488  int set_system_affinity(bool abort_on_error) const override {
489  if (__kmp_num_proc_groups > 1) {
490  // Check for a valid mask.
491  GROUP_AFFINITY ga;
492  int group = get_proc_group();
493  if (group < 0) {
494  if (abort_on_error) {
495  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
496  }
497  return -1;
498  }
499  // Transform the bit vector into a GROUP_AFFINITY struct
500  // and make the system call to set affinity.
501  ga.Group = group;
502  ga.Mask = mask[group];
503  ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
504 
505  KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
506  if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
507  DWORD error = GetLastError();
508  if (abort_on_error) {
509  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
510  __kmp_msg_null);
511  }
512  return error;
513  }
514  } else {
515  if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
516  DWORD error = GetLastError();
517  if (abort_on_error) {
518  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
519  __kmp_msg_null);
520  }
521  return error;
522  }
523  }
524  return 0;
525  }
526  int get_system_affinity(bool abort_on_error) override {
527  if (__kmp_num_proc_groups > 1) {
528  this->zero();
529  GROUP_AFFINITY ga;
530  KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
531  if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
532  DWORD error = GetLastError();
533  if (abort_on_error) {
534  __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
535  KMP_ERR(error), __kmp_msg_null);
536  }
537  return error;
538  }
539  if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
540  (ga.Mask == 0)) {
541  return -1;
542  }
543  mask[ga.Group] = ga.Mask;
544  } else {
545  mask_t newMask, sysMask, retval;
546  if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
547  DWORD error = GetLastError();
548  if (abort_on_error) {
549  __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
550  KMP_ERR(error), __kmp_msg_null);
551  }
552  return error;
553  }
554  retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
555  if (!retval) {
556  DWORD error = GetLastError();
557  if (abort_on_error) {
558  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
559  KMP_ERR(error), __kmp_msg_null);
560  }
561  return error;
562  }
563  newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
564  if (!newMask) {
565  DWORD error = GetLastError();
566  if (abort_on_error) {
567  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
568  KMP_ERR(error), __kmp_msg_null);
569  }
570  }
571  *mask = retval;
572  }
573  return 0;
574  }
575  int get_proc_group() const override {
576  int group = -1;
577  if (__kmp_num_proc_groups == 1) {
578  return 1;
579  }
580  for (int i = 0; i < __kmp_num_proc_groups; i++) {
581  if (mask[i] == 0)
582  continue;
583  if (group >= 0)
584  return -1;
585  group = i;
586  }
587  return group;
588  }
589  };
590  void determine_capable(const char *env_var) override {
591  __kmp_affinity_determine_capable(env_var);
592  }
593  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
594  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
595  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
596  KMPAffinity::Mask *allocate_mask_array(int num) override {
597  return new Mask[num];
598  }
599  void deallocate_mask_array(KMPAffinity::Mask *array) override {
600  Mask *windows_array = static_cast<Mask *>(array);
601  delete[] windows_array;
602  }
603  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
604  int index) override {
605  Mask *windows_array = static_cast<Mask *>(array);
606  return &(windows_array[index]);
607  }
608  api_type get_api_type() const override { return NATIVE_OS; }
609 };
610 #endif /* KMP_OS_WINDOWS */
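/* Illustrative sketch (not part of the original header): on Windows with more
   than one processor group, each group contributes one mask_t word, so a
   global processor id maps to a (group, in-group bit) pair. This is the layout
   that get_proc_group() and set_system_affinity() above rely on; assuming
   64-bit masks on 64-bit Windows:

     int global_id = 70;            // example processor id
     int group = global_id / 64;    // -> group 1
     int bit = global_id % 64;      // -> bit 6 within that group
     GROUP_AFFINITY ga = {};        // zero-initializes the Reserved fields
     ga.Group = (WORD)group;
     ga.Mask = (KAFFINITY)1 << bit;
     // SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) would then bind
     // the thread to processor 6 of group 1; the runtime makes this call
     // through the dynamically resolved __kmp_SetThreadGroupAffinity pointer.
*/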
611 #endif /* KMP_AFFINITY_SUPPORTED */
612 
613 class kmp_hw_thread_t {
614 public:
615  static const int UNKNOWN_ID = -1;
616  static int compare_ids(const void *a, const void *b);
617  static int compare_compact(const void *a, const void *b);
618  int ids[KMP_HW_LAST];
619  int sub_ids[KMP_HW_LAST];
620  bool leader;
621  int os_id;
622  void print() const;
623  void clear() {
624  for (int i = 0; i < (int)KMP_HW_LAST; ++i)
625  ids[i] = UNKNOWN_ID;
626  leader = false;
627  }
628 };
629 
630 class kmp_topology_t {
631 
632  struct flags_t {
633  int uniform : 1;
634  int reserved : 31;
635  };
636 
637  int depth;
638 
639  // The following arrays are all 'depth' long
640 
641  // Ordered array of the types in the topology
642  kmp_hw_t *types;
643 
644  // Quick topology ratios; for non-uniform topologies, each entry holds
645  // the max number of itemAs per itemB,
646  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
647  int *ratio;
648 
649  // Storage containing the absolute number of each topology layer
650  int *count;
651 
652  // The hardware threads array
653  // hw_threads is num_hw_threads long
654  // Each hw_thread's ids and sub_ids are depth deep
655  int num_hw_threads;
656  kmp_hw_thread_t *hw_threads;
657 
658  // Equivalence hash where the key is the hardware topology item
659  // and the value is the equivalent hardware topology type in the
660  // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
661  // known equivalence for the topology type
662  kmp_hw_t equivalent[KMP_HW_LAST];
663 
664  // Flags describing the topology
665  flags_t flags;
666 
667  // Count each item & get the num x's per y
668  // e.g., get the number of cores and the number of threads per core
669  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
670  void _gather_enumeration_information();
671 
672  // Remove layers that don't add information to the topology.
673  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
674  void _remove_radix1_layers();
675 
676  // Find out if the topology is uniform
677  void _discover_uniformity();
678 
679  // Set all the sub_ids for each hardware thread
680  void _set_sub_ids();
681 
682  // Set global affinity variables describing the number of threads per
683  // core, the number of packages, the number of cores per package, and
684  // the number of cores.
685  void _set_globals();
686 
687  // Set the last level cache equivalent type
688  void _set_last_level_cache();
689 
690 public:
691  // Force use of allocate()/deallocate()
692  kmp_topology_t() = delete;
693  kmp_topology_t(const kmp_topology_t &t) = delete;
694  kmp_topology_t(kmp_topology_t &&t) = delete;
695  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
696  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
697 
698  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
699  static void deallocate(kmp_topology_t *);
700 
701  // Functions used in create_map() routines
702  kmp_hw_thread_t &at(int index) {
703  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
704  return hw_threads[index];
705  }
706  const kmp_hw_thread_t &at(int index) const {
707  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
708  return hw_threads[index];
709  }
710  int get_num_hw_threads() const { return num_hw_threads; }
711  void sort_ids() {
712  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
713  kmp_hw_thread_t::compare_ids);
714  }
715  // Check whether the hardware ids are unique; return true if they are,
716  // false otherwise
717  bool check_ids() const;
718 
719  // Function to call after the create_map() routine
720  void canonicalize();
721  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
722 
723  // Functions used after canonicalize() called
724  bool filter_hw_subset();
725  bool is_close(int hwt1, int hwt2, int level) const;
726  bool is_uniform() const { return flags.uniform; }
727  // Return the equivalent type in the topology for the given type;
728  // returns KMP_HW_UNKNOWN when there is no equivalent type
729  kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
730  // Set type1 = type2
731  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
732  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
733  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
734  kmp_hw_t real_type2 = equivalent[type2];
735  if (real_type2 == KMP_HW_UNKNOWN)
736  real_type2 = type2;
737  equivalent[type1] = real_type2;
738  // This loop is required since any of the types may have been set to
739  // be equivalent to type1. They all must be checked and reset to type2.
740  KMP_FOREACH_HW_TYPE(type) {
741  if (equivalent[type] == type1) {
742  equivalent[type] = real_type2;
743  }
744  }
745  }
746  // Calculate the number of topology items at level1 per item at level2
747  // (e.g., the number of threads per core)
748  int calculate_ratio(int level1, int level2) const {
749  KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
750  KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
751  int r = 1;
752  for (int level = level1; level > level2; --level)
753  r *= ratio[level];
754  return r;
755  }
756  int get_ratio(int level) const {
757  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
758  return ratio[level];
759  }
760  int get_depth() const { return depth; };
761  kmp_hw_t get_type(int level) const {
762  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
763  return types[level];
764  }
765  int get_level(kmp_hw_t type) const {
766  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
767  int eq_type = equivalent[type];
768  if (eq_type == KMP_HW_UNKNOWN)
769  return -1;
770  for (int i = 0; i < depth; ++i)
771  if (types[i] == eq_type)
772  return i;
773  return -1;
774  }
775  int get_count(int level) const {
776  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
777  return count[level];
778  }
779 #if KMP_AFFINITY_SUPPORTED
780  void sort_compact() {
781  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
782  kmp_hw_thread_t::compare_compact);
783  }
784 #endif
785  void print(const char *env_var = "KMP_AFFINITY") const;
786  void dump() const;
787 };
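/* Illustrative sketch (not part of the original header): calculate_ratio()
   multiplies the per-level ratios between two levels. For the example topology
   used in the comments above, ratio = { 4 packages, 6 cores/package,
   2 threads/core } with levels 0=package, 1=core, 2=thread:

     // threads per core:    ratio[2]            = 2
     // threads per package: ratio[2] * ratio[1] = 2 * 6 = 12
     // Assuming the usual kmp_hw_t enumerators from kmp.h:
     int tpp = __kmp_topology->calculate_ratio(
         __kmp_topology->get_level(KMP_HW_THREAD),
         __kmp_topology->get_level(KMP_HW_SOCKET));
     // tpp == 12 on the uniform example topology above
*/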
788 
789 class kmp_hw_subset_t {
790 public:
791  struct item_t {
792  int num;
793  kmp_hw_t type;
794  int offset;
795  };
796 
797 private:
798  int depth;
799  int capacity;
800  item_t *items;
801  kmp_uint64 set;
802  bool absolute;
803  // The set must be able to handle up to KMP_HW_LAST number of layers
804  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
805 
806 public:
807  // Force use of allocate()/deallocate()
808  kmp_hw_subset_t() = delete;
809  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
810  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
811  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
812  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
813 
814  static kmp_hw_subset_t *allocate() {
815  int initial_capacity = 5;
816  kmp_hw_subset_t *retval =
817  (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
818  retval->depth = 0;
819  retval->capacity = initial_capacity;
820  retval->set = 0ull;
821  retval->absolute = false;
822  retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
823  return retval;
824  }
825  static void deallocate(kmp_hw_subset_t *subset) {
826  __kmp_free(subset->items);
827  __kmp_free(subset);
828  }
829  void set_absolute() { absolute = true; }
830  bool is_absolute() const { return absolute; }
831  void push_back(int num, kmp_hw_t type, int offset) {
832  if (depth == capacity - 1) {
833  capacity *= 2;
834  item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
835  for (int i = 0; i < depth; ++i)
836  new_items[i] = items[i];
837  __kmp_free(items);
838  items = new_items;
839  }
840  items[depth].num = num;
841  items[depth].type = type;
842  items[depth].offset = offset;
843  depth++;
844  set |= (1ull << type);
845  }
846  int get_depth() const { return depth; }
847  const item_t &at(int index) const {
848  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
849  return items[index];
850  }
851  item_t &at(int index) {
852  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
853  return items[index];
854  }
855  void remove(int index) {
856  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
857  set &= ~(1ull << items[index].type);
858  for (int j = index + 1; j < depth; ++j) {
859  items[j - 1] = items[j];
860  }
861  depth--;
862  }
863  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
864  void dump() const {
865  printf("**********************\n");
866  printf("*** kmp_hw_subset: ***\n");
867  printf("* depth: %d\n", depth);
868  printf("* items:\n");
869  for (int i = 0; i < depth; ++i) {
870  printf("num: %d, type: %s, offset: %d\n", items[i].num,
871  __kmp_hw_get_keyword(items[i].type), items[i].offset);
872  }
873  printf("* set: 0x%llx\n", set);
874  printf("* absolute: %d\n", absolute);
875  printf("**********************\n");
876  }
877 };
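/* Illustrative sketch (not part of the original header): a KMP_HW_SUBSET-style
   specification such as "2s,4c,2t" ends up in this structure as one item per
   layer, with the 'set' bit mask recording which layer types were specified.
   The parsing itself lives elsewhere in the runtime; populated by hand, and
   assuming the usual kmp_hw_t enumerators from kmp.h, it would look roughly
   like:

     kmp_hw_subset_t *subset = kmp_hw_subset_t::allocate();
     subset->push_back(2, KMP_HW_SOCKET, 0); // 2 sockets, offset 0
     subset->push_back(4, KMP_HW_CORE, 0);   // 4 cores per socket
     subset->push_back(2, KMP_HW_THREAD, 0); // 2 threads per core
     // subset->specified(KMP_HW_CORE) is now true; subset->get_depth() == 3
     kmp_hw_subset_t::deallocate(subset);
*/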
878 
879 extern kmp_topology_t *__kmp_topology;
880 extern kmp_hw_subset_t *__kmp_hw_subset;
881 
882 /* A structure for holding machine-specific hierarchy info to be computed once
883  at init. This structure represents a mapping of threads to the actual machine
884  hierarchy, or to our best guess at what the hierarchy might be, for the
885  purpose of performing an efficient barrier. In the worst case, when there is
886  no machine hierarchy information, it produces a tree suitable for a barrier,
887  similar to the tree used in the hyper barrier. */
888 class hierarchy_info {
889 public:
890  /* Good default values for number of leaves and branching factor, given no
891  affinity information. Behaves a bit like hyper barrier. */
892  static const kmp_uint32 maxLeaves = 4;
893  static const kmp_uint32 minBranch = 4;
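894  /* Number of levels in the hierarchy. Typical levels are threads/core,
895  cores/package or socket, packages/node, nodes/machine, etc. When the
896  machine is oversubscribed, resize() adds levels to duplicate the
897  hierarchy, doubling the thread capacity of the hierarchy each time a
898  level is added. */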
899  kmp_uint32 maxLevels;
900 
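901  /* Depth of the machine configuration hierarchy, in terms of the number
902  of levels along the longest path from the root to any leaf. It
903  corresponds to the number of entries in numPerLevel if all but one
904  trailing 1 are excluded. */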
905  kmp_uint32 depth;
906  kmp_uint32 base_num_threads;
907  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
908  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
909  // 2=initialization in progress
910  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
911 
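912  /* Level 0 corresponds to leaves. numPerLevel[i] is the number of children
913  the parent of a node at level i has. For example, on a machine with
914  4 packages, 4 cores/package and 2 HT per core, numPerLevel = {2, 4, 4,
915  1, 1}. All empty levels are set to 1. */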
916  kmp_uint32 *numPerLevel;
917  kmp_uint32 *skipPerLevel;
918 
919  void deriveLevels() {
920  int hier_depth = __kmp_topology->get_depth();
921  for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
922  numPerLevel[level] = __kmp_topology->get_ratio(i);
923  }
924  }
925 
926  hierarchy_info()
927  : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
928 
929  void fini() {
930  if (!uninitialized && numPerLevel) {
931  __kmp_free(numPerLevel);
932  numPerLevel = NULL;
933  uninitialized = not_initialized;
934  }
935  }
936 
937  void init(int num_addrs) {
938  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
939  &uninitialized, not_initialized, initializing);
940  if (bool_result == 0) { // Wait for initialization
941  while (TCR_1(uninitialized) != initialized)
942  KMP_CPU_PAUSE();
943  return;
944  }
945  KMP_DEBUG_ASSERT(bool_result == 1);
946 
947  /* Explicitly initialize the data fields here to prevent use of dirty
948  values observed when the static library is re-initialized multiple times
949  (e.g., when a non-OpenMP thread repeatedly launches/joins a thread that
950  uses OpenMP). */
951  depth = 1;
952  resizing = 0;
953  maxLevels = 7;
954  numPerLevel =
955  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
956  skipPerLevel = &(numPerLevel[maxLevels]);
957  for (kmp_uint32 i = 0; i < maxLevels;
958  ++i) { // init numPerLevel[*] to 1 item per level
959  numPerLevel[i] = 1;
960  skipPerLevel[i] = 1;
961  }
962 
963  // Sort table by physical ID
964  if (__kmp_topology && __kmp_topology->get_depth() > 0) {
965  deriveLevels();
966  } else {
967  numPerLevel[0] = maxLeaves;
968  numPerLevel[1] = num_addrs / maxLeaves;
969  if (num_addrs % maxLeaves)
970  numPerLevel[1]++;
971  }
972 
973  base_num_threads = num_addrs;
974  for (int i = maxLevels - 1; i >= 0;
975  --i) // count non-empty levels to get depth
976  if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
977  depth++;
978 
979  kmp_uint32 branch = minBranch;
980  if (numPerLevel[0] == 1)
981  branch = num_addrs / maxLeaves;
982  if (branch < minBranch)
983  branch = minBranch;
984  for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
985  while (numPerLevel[d] > branch ||
986  (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
987  if (numPerLevel[d] & 1)
988  numPerLevel[d]++;
989  numPerLevel[d] = numPerLevel[d] >> 1;
990  if (numPerLevel[d + 1] == 1)
991  depth++;
992  numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
993  }
994  if (numPerLevel[0] == 1) {
995  branch = branch >> 1;
996  if (branch < 4)
997  branch = minBranch;
998  }
999  }
1000 
1001  for (kmp_uint32 i = 1; i < depth; ++i)
1002  skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1003  // Fill in hierarchy in the case of oversubscription
1004  for (kmp_uint32 i = depth; i < maxLevels; ++i)
1005  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1006 
1007  uninitialized = initialized; // One writer
1008  }
1009 
1010  // Resize the hierarchy if nproc changes to something larger than before
1011  void resize(kmp_uint32 nproc) {
1012  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1013  while (bool_result == 0) { // someone else is trying to resize
1014  KMP_CPU_PAUSE();
1015  if (nproc <= base_num_threads) // happy with other thread's resize
1016  return;
1017  else // try to resize
1018  bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1019  }
1020  KMP_DEBUG_ASSERT(bool_result != 0);
1021  if (nproc <= base_num_threads)
1022  return; // happy with other thread's resize
1023 
1024  // Calculate new maxLevels
1025  kmp_uint32 old_sz = skipPerLevel[depth - 1];
1026  kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1027  // First see if old maxLevels is enough to contain new size
1028  for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1029  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1030  numPerLevel[i - 1] *= 2;
1031  old_sz *= 2;
1032  depth++;
1033  }
1034  if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1035  while (nproc > old_sz) {
1036  old_sz *= 2;
1037  incs++;
1038  depth++;
1039  }
1040  maxLevels += incs;
1041 
1042  // Resize arrays
1043  kmp_uint32 *old_numPerLevel = numPerLevel;
1044  kmp_uint32 *old_skipPerLevel = skipPerLevel;
1045  numPerLevel = skipPerLevel = NULL;
1046  numPerLevel =
1047  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1048  skipPerLevel = &(numPerLevel[maxLevels]);
1049 
1050  // Copy old elements from old arrays
1051  for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1052  // init numPerLevel[*] to 1 item per level
1053  numPerLevel[i] = old_numPerLevel[i];
1054  skipPerLevel[i] = old_skipPerLevel[i];
1055  }
1056 
1057  // Init new elements in arrays to 1
1058  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1059  // init numPerLevel[*] to 1 item per level
1060  numPerLevel[i] = 1;
1061  skipPerLevel[i] = 1;
1062  }
1063 
1064  // Free old arrays
1065  __kmp_free(old_numPerLevel);
1066  }
1067 
1068  // Fill in oversubscription levels of hierarchy
1069  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1070  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1071 
1072  base_num_threads = nproc;
1073  resizing = 0; // One writer
1074  }
1075 };
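/* Illustrative sketch (not part of the original header): after init(),
   skipPerLevel[i] is the number of hardware threads spanned by one subtree
   rooted at level i, built up as
   skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1]. For a machine
   described as numPerLevel = { 2, 4, 4, 1, ... } (2 threads/core,
   4 cores/package, 4 packages):

     skipPerLevel[0] = 1
     skipPerLevel[1] = 2 * 1 = 2   // one core spans 2 threads
     skipPerLevel[2] = 4 * 2 = 8   // one package spans 8 threads
     skipPerLevel[3] = 4 * 8 = 32  // the whole machine
     // entries above 'depth' keep doubling to absorb oversubscription
*/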
1076 #endif // KMP_AFFINITY_H