LLVM OpenMP* Runtime Library
kmp_affinity.h
1/*
2 * kmp_affinity.h -- header for affinity management
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef KMP_AFFINITY_H
14#define KMP_AFFINITY_H
15
16#include "kmp.h"
17#include "kmp_os.h"
18#include <limits>
19
20#if KMP_AFFINITY_SUPPORTED
21#if KMP_USE_HWLOC
22class KMPHwlocAffinity : public KMPAffinity {
23public:
24 class Mask : public KMPAffinity::Mask {
25 hwloc_cpuset_t mask;
26
27 public:
28 Mask() {
29 mask = hwloc_bitmap_alloc();
30 this->zero();
31 }
32 Mask(const Mask &other) = delete;
33 Mask &operator=(const Mask &other) = delete;
34 ~Mask() { hwloc_bitmap_free(mask); }
35 void set(int i) override { hwloc_bitmap_set(mask, i); }
36 bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
37 void clear(int i) override { hwloc_bitmap_clr(mask, i); }
38 void zero() override { hwloc_bitmap_zero(mask); }
39 bool empty() const override { return hwloc_bitmap_iszero(mask); }
40 void copy(const KMPAffinity::Mask *src) override {
41 const Mask *convert = static_cast<const Mask *>(src);
42 hwloc_bitmap_copy(mask, convert->mask);
43 }
44 void bitwise_and(const KMPAffinity::Mask *rhs) override {
45 const Mask *convert = static_cast<const Mask *>(rhs);
46 hwloc_bitmap_and(mask, mask, convert->mask);
47 }
48 void bitwise_or(const KMPAffinity::Mask *rhs) override {
49 const Mask *convert = static_cast<const Mask *>(rhs);
50 hwloc_bitmap_or(mask, mask, convert->mask);
51 }
52 void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
53 bool is_equal(const KMPAffinity::Mask *rhs) const override {
54 const Mask *convert = static_cast<const Mask *>(rhs);
55 return hwloc_bitmap_isequal(mask, convert->mask);
56 }
57 int begin() const override { return hwloc_bitmap_first(mask); }
58 int end() const override { return -1; }
59 int next(int previous) const override {
60 return hwloc_bitmap_next(mask, previous);
61 }
62 int get_system_affinity(bool abort_on_error) override {
63 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
64 "Illegal get affinity operation when not capable");
65 long retval =
66 hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
67 if (retval >= 0) {
68 return 0;
69 }
70 int error = errno;
71 if (abort_on_error) {
72 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
73 KMP_ERR(error), __kmp_msg_null);
74 }
75 return error;
76 }
77 int set_system_affinity(bool abort_on_error) const override {
78 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
79 "Illegal set affinity operation when not capable");
80 long retval =
81 hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
82 if (retval >= 0) {
83 return 0;
84 }
85 int error = errno;
86 if (abort_on_error) {
87 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
88 KMP_ERR(error), __kmp_msg_null);
89 }
90 return error;
91 }
92#if KMP_OS_WINDOWS
93 int set_process_affinity(bool abort_on_error) const override {
94 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
95 "Illegal set process affinity operation when not capable");
96 int error = 0;
97 const hwloc_topology_support *support =
98 hwloc_topology_get_support(__kmp_hwloc_topology);
99 if (support->cpubind->set_proc_cpubind) {
100 int retval;
101 retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
102 HWLOC_CPUBIND_PROCESS);
103 if (retval >= 0)
104 return 0;
105 error = errno;
106 if (abort_on_error)
107 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
108 KMP_ERR(error), __kmp_msg_null);
109 }
110 return error;
111 }
112#endif
113 int get_proc_group() const override {
114 int group = -1;
115#if KMP_OS_WINDOWS
116 if (__kmp_num_proc_groups == 1) {
117 return 1;
118 }
119 for (int i = 0; i < __kmp_num_proc_groups; i++) {
120 // On Windows, the long type is always 32 bits, so each 64-bit group needs two ulongs
121 unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
122 unsigned long second_32_bits =
123 hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
124 if (first_32_bits == 0 && second_32_bits == 0) {
125 continue;
126 }
127 if (group >= 0) {
128 return -1;
129 }
130 group = i;
131 }
132#endif /* KMP_OS_WINDOWS */
133 return group;
134 }
135 };
136 void determine_capable(const char *var) override {
137 const hwloc_topology_support *topology_support;
138 if (__kmp_hwloc_topology == NULL) {
139 if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
140 __kmp_hwloc_error = TRUE;
141 if (__kmp_affinity.flags.verbose) {
142 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
143 }
144 }
145 if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
146 __kmp_hwloc_error = TRUE;
147 if (__kmp_affinity.flags.verbose) {
148 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
149 }
150 }
151 }
152 topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
153 // Is the system capable of setting/getting this thread's affinity?
154 // Also, is topology discovery possible? (pu indicates ability to discover
155 // processing units). And finally, were there no errors when calling any
156 // hwloc_* API functions?
157 if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
158 topology_support->cpubind->get_thisthread_cpubind &&
159 topology_support->discovery->pu && !__kmp_hwloc_error) {
160 // enables affinity according to KMP_AFFINITY_CAPABLE() macro
161 KMP_AFFINITY_ENABLE(TRUE);
162 } else {
163 // indicate that hwloc didn't work and disable affinity
164 __kmp_hwloc_error = TRUE;
165 KMP_AFFINITY_DISABLE();
166 }
167 }
168 void bind_thread(int which) override {
169 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
170 "Illegal set affinity operation when not capable");
171 KMPAffinity::Mask *mask;
172 KMP_CPU_ALLOC_ON_STACK(mask);
173 KMP_CPU_ZERO(mask);
174 KMP_CPU_SET(which, mask);
175 __kmp_set_system_affinity(mask, TRUE);
176 KMP_CPU_FREE_FROM_STACK(mask);
177 }
178 KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
179 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
180 KMPAffinity::Mask *allocate_mask_array(int num) override {
181 return new Mask[num];
182 }
183 void deallocate_mask_array(KMPAffinity::Mask *array) override {
184 Mask *hwloc_array = static_cast<Mask *>(array);
185 delete[] hwloc_array;
186 }
187 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
188 int index) override {
189 Mask *hwloc_array = static_cast<Mask *>(array);
190 return &(hwloc_array[index]);
191 }
192 api_type get_api_type() const override { return HWLOC; }
193};
194#endif /* KMP_USE_HWLOC */
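// Usage sketch (illustrative only, not part of this header): the hwloc Mask's
// begin()/next() return -1 when the bitmap is exhausted and end() is the
// constant -1, so the generic iteration idiom below works; the OS-native Masks
// defined later in this file use the same idiom with end() equal to the total
// bit count. Assuming 'm' is a populated KMPAffinity::Mask pointer:
//
//   for (int cpu = m->begin(); cpu != m->end(); cpu = m->next(cpu)) {
//     // 'cpu' visits each set bit, i.e. each OS processor id in the mask
//   }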
195
196#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
197 KMP_OS_AIX
198#if KMP_OS_LINUX
199/* On some of the older OSes that we build on, these constants aren't present
 200 in <asm/unistd.h>, which is #included from <sys/syscall.h>. They must be the
 201 same on all systems of the same arch where they are defined, and they cannot
 202 change; they are carved in stone forever. */
203#include <sys/syscall.h>
204#if KMP_ARCH_X86 || KMP_ARCH_ARM
205#ifndef __NR_sched_setaffinity
206#define __NR_sched_setaffinity 241
207#elif __NR_sched_setaffinity != 241
208#error Wrong code for setaffinity system call.
209#endif /* __NR_sched_setaffinity */
210#ifndef __NR_sched_getaffinity
211#define __NR_sched_getaffinity 242
212#elif __NR_sched_getaffinity != 242
213#error Wrong code for getaffinity system call.
214#endif /* __NR_sched_getaffinity */
215#elif KMP_ARCH_AARCH64
216#ifndef __NR_sched_setaffinity
217#define __NR_sched_setaffinity 122
218#elif __NR_sched_setaffinity != 122
219#error Wrong code for setaffinity system call.
220#endif /* __NR_sched_setaffinity */
221#ifndef __NR_sched_getaffinity
222#define __NR_sched_getaffinity 123
223#elif __NR_sched_getaffinity != 123
224#error Wrong code for getaffinity system call.
225#endif /* __NR_sched_getaffinity */
226#elif KMP_ARCH_RISCV64
227#ifndef __NR_sched_setaffinity
228#define __NR_sched_setaffinity 122
229#elif __NR_sched_setaffinity != 122
230#error Wrong code for setaffinity system call.
231#endif /* __NR_sched_setaffinity */
232#ifndef __NR_sched_getaffinity
233#define __NR_sched_getaffinity 123
234#elif __NR_sched_getaffinity != 123
235#error Wrong code for getaffinity system call.
236#endif /* __NR_sched_getaffinity */
237#elif KMP_ARCH_X86_64
238#ifndef __NR_sched_setaffinity
239#define __NR_sched_setaffinity 203
240#elif __NR_sched_setaffinity != 203
241#error Wrong code for setaffinity system call.
242#endif /* __NR_sched_setaffinity */
243#ifndef __NR_sched_getaffinity
244#define __NR_sched_getaffinity 204
245#elif __NR_sched_getaffinity != 204
246#error Wrong code for getaffinity system call.
247#endif /* __NR_sched_getaffinity */
248#elif KMP_ARCH_PPC64
249#ifndef __NR_sched_setaffinity
250#define __NR_sched_setaffinity 222
251#elif __NR_sched_setaffinity != 222
252#error Wrong code for setaffinity system call.
253#endif /* __NR_sched_setaffinity */
254#ifndef __NR_sched_getaffinity
255#define __NR_sched_getaffinity 223
256#elif __NR_sched_getaffinity != 223
257#error Wrong code for getaffinity system call.
258#endif /* __NR_sched_getaffinity */
259#elif KMP_ARCH_MIPS
260#ifndef __NR_sched_setaffinity
261#define __NR_sched_setaffinity 4239
262#elif __NR_sched_setaffinity != 4239
263#error Wrong code for setaffinity system call.
264#endif /* __NR_sched_setaffinity */
265#ifndef __NR_sched_getaffinity
266#define __NR_sched_getaffinity 4240
267#elif __NR_sched_getaffinity != 4240
268#error Wrong code for getaffinity system call.
269#endif /* __NR_sched_getaffinity */
270#elif KMP_ARCH_MIPS64
271#ifndef __NR_sched_setaffinity
272#define __NR_sched_setaffinity 5195
273#elif __NR_sched_setaffinity != 5195
274#error Wrong code for setaffinity system call.
275#endif /* __NR_sched_setaffinity */
276#ifndef __NR_sched_getaffinity
277#define __NR_sched_getaffinity 5196
278#elif __NR_sched_getaffinity != 5196
279#error Wrong code for getaffinity system call.
280#endif /* __NR_sched_getaffinity */
281#elif KMP_ARCH_LOONGARCH64
282#ifndef __NR_sched_setaffinity
283#define __NR_sched_setaffinity 122
284#elif __NR_sched_setaffinity != 122
285#error Wrong code for setaffinity system call.
286#endif /* __NR_sched_setaffinity */
287#ifndef __NR_sched_getaffinity
288#define __NR_sched_getaffinity 123
289#elif __NR_sched_getaffinity != 123
290#error Wrong code for getaffinity system call.
291#endif /* __NR_sched_getaffinity */
303#elif KMP_ARCH_VE
304#ifndef __NR_sched_setaffinity
305#define __NR_sched_setaffinity 203
306#elif __NR_sched_setaffinity != 203
307#error Wrong code for setaffinity system call.
308#endif /* __NR_sched_setaffinity */
309#ifndef __NR_sched_getaffinity
310#define __NR_sched_getaffinity 204
311#elif __NR_sched_getaffinity != 204
312#error Wrong code for getaffinity system call.
313#endif /* __NR_sched_getaffinity */
314#elif KMP_ARCH_S390X
315#ifndef __NR_sched_setaffinity
316#define __NR_sched_setaffinity 239
317#elif __NR_sched_setaffinity != 239
318#error Wrong code for setaffinity system call.
319#endif /* __NR_sched_setaffinity */
320#ifndef __NR_sched_getaffinity
321#define __NR_sched_getaffinity 240
322#elif __NR_sched_getaffinity != 240
323#error Wrong code for getaffinity system call.
324#endif /* __NR_sched_getaffinity */
325#elif KMP_ARCH_SPARC
326#ifndef __NR_sched_setaffinity
327#define __NR_sched_setaffinity 261
328#elif __NR_sched_setaffinity != 261
329#error Wrong code for setaffinity system call.
330#endif /* __NR_sched_setaffinity */
331#ifndef __NR_sched_getaffinity
332#define __NR_sched_getaffinity 260
333#elif __NR_sched_getaffinity != 260
334#error Wrong code for getaffinity system call.
335#endif /* __NR_sched_getaffinity */
336#else
337#error Unknown or unsupported architecture
338#endif /* KMP_ARCH_* */
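// Illustrative sketch (comments only; mirrors how KMPNativeAffinity::Mask
// below uses the constants defined above): querying the calling thread's
// affinity with the raw Linux syscall (syscall() is declared in <unistd.h>).
// The fixed buffer size is an assumption for the example; the runtime passes
// __kmp_affin_mask_size instead.
//
//   unsigned long bits[16] = {0};
//   long ret = syscall(__NR_sched_getaffinity, 0, sizeof(bits), bits);
//   if (ret < 0) {
//     // errno describes the failure, e.g. EINVAL if the buffer is too small
//   }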
339#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
340#include <pthread.h>
341#include <pthread_np.h>
342#elif KMP_OS_NETBSD
343#include <pthread.h>
344#include <sched.h>
345#elif KMP_OS_AIX
346#include <sys/dr.h>
347#include <sys/rset.h>
348#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
349#define GET_NUMBER_SMT_SETS 0x0004
350extern "C" int syssmt(int flags, int, int, int *);
351#endif
352class KMPNativeAffinity : public KMPAffinity {
353 class Mask : public KMPAffinity::Mask {
354 typedef unsigned long mask_t;
355 typedef decltype(__kmp_affin_mask_size) mask_size_type;
356 static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
357 static const mask_t ONE = 1;
358 mask_size_type get_num_mask_types() const {
359 return __kmp_affin_mask_size / sizeof(mask_t);
360 }
361
362 public:
363 mask_t *mask;
364 Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
365 ~Mask() {
366 if (mask)
367 __kmp_free(mask);
368 }
369 void set(int i) override {
370 mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
371 }
372 bool is_set(int i) const override {
373 return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
374 }
375 void clear(int i) override {
376 mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
377 }
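// Worked example of the index math above, assuming an LP64 target where
// mask_t (unsigned long) is 8 bytes, so BITS_PER_MASK_T == 64: for logical
// CPU 70, set(70) touches word 70 / 64 == 1 at bit 70 % 64 == 6, i.e.
// mask[1] |= (ONE << 6).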
378 void zero() override {
379 mask_size_type e = get_num_mask_types();
380 for (mask_size_type i = 0; i < e; ++i)
381 mask[i] = (mask_t)0;
382 }
383 bool empty() const override {
384 mask_size_type e = get_num_mask_types();
385 for (mask_size_type i = 0; i < e; ++i)
386 if (mask[i] != (mask_t)0)
387 return false;
388 return true;
389 }
390 void copy(const KMPAffinity::Mask *src) override {
391 const Mask *convert = static_cast<const Mask *>(src);
392 mask_size_type e = get_num_mask_types();
393 for (mask_size_type i = 0; i < e; ++i)
394 mask[i] = convert->mask[i];
395 }
396 void bitwise_and(const KMPAffinity::Mask *rhs) override {
397 const Mask *convert = static_cast<const Mask *>(rhs);
398 mask_size_type e = get_num_mask_types();
399 for (mask_size_type i = 0; i < e; ++i)
400 mask[i] &= convert->mask[i];
401 }
402 void bitwise_or(const KMPAffinity::Mask *rhs) override {
403 const Mask *convert = static_cast<const Mask *>(rhs);
404 mask_size_type e = get_num_mask_types();
405 for (mask_size_type i = 0; i < e; ++i)
406 mask[i] |= convert->mask[i];
407 }
408 void bitwise_not() override {
409 mask_size_type e = get_num_mask_types();
410 for (mask_size_type i = 0; i < e; ++i)
411 mask[i] = ~(mask[i]);
412 }
413 bool is_equal(const KMPAffinity::Mask *rhs) const override {
414 const Mask *convert = static_cast<const Mask *>(rhs);
415 mask_size_type e = get_num_mask_types();
416 for (mask_size_type i = 0; i < e; ++i)
417 if (mask[i] != convert->mask[i])
418 return false;
419 return true;
420 }
421 int begin() const override {
422 int retval = 0;
423 while (retval < end() && !is_set(retval))
424 ++retval;
425 return retval;
426 }
427 int end() const override {
428 int e;
429 __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
430 return e;
431 }
432 int next(int previous) const override {
433 int retval = previous + 1;
434 while (retval < end() && !is_set(retval))
435 ++retval;
436 return retval;
437 }
438#if KMP_OS_AIX
439 // On AIX, there is no way to query which CPU(s) a thread is bound to;
440 // this routine is only used to get the full mask.
441 int get_system_affinity(bool abort_on_error) override {
442 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
443 "Illegal get affinity operation when not capable");
444
445 (void)abort_on_error;
446
447 // Set the mask with all CPUs that are available.
448 for (int i = 0; i < __kmp_xproc; ++i)
449 KMP_CPU_SET(i, this);
450 return 0;
451 }
452 int set_system_affinity(bool abort_on_error) const override {
453 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
455 "Illegal set affinity operation when not capable");
456
457 int location;
458 int gtid = __kmp_entry_gtid();
459 int tid = thread_self();
460
461 // Unbind the thread if it was bound to any processors before, so that it
462 // can be bound only to the CPUs specified by the mask and no others.
463 int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);
464
465 // On AIX, we can only bind to one instead of a set of CPUs with the
466 // bindprocessor() system call.
467 KMP_CPU_SET_ITERATE(location, this) {
468 if (KMP_CPU_ISSET(location, this)) {
469 retval = bindprocessor(BINDTHREAD, tid, location);
470 if (retval == -1 && errno == 1) {
471 rsid_t rsid;
472 rsethandle_t rsh;
473 // Put something in rsh to prevent compiler warning
474 // about uninitialized use
475 rsh = rs_alloc(RS_EMPTY);
476 rsid.at_pid = getpid();
477 if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
478 retval = ra_detachrset(R_PROCESS, rsid, 0);
479 retval = bindprocessor(BINDTHREAD, tid, location);
480 }
481 }
482 if (retval == 0) {
483 KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
484 "T#%d to cpu=%d.\n",
485 gtid, location));
486 continue;
487 }
488 int error = errno;
489 if (abort_on_error) {
490 __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
491 KMP_ERR(error), __kmp_msg_null);
492 KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
493 "T#%d to cpu=%d, errno=%d.\n",
494 gtid, location, error));
495 return error;
496 }
497 }
498 }
499 return 0;
500 }
501#else // !KMP_OS_AIX
502 int get_system_affinity(bool abort_on_error) override {
503 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
504 "Illegal get affinity operation when not capable");
505#if KMP_OS_LINUX
506 long retval =
507 syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
508#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
509 int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
510 reinterpret_cast<cpuset_t *>(mask));
511 int retval = (r == 0 ? 0 : -1);
512#endif
513 if (retval >= 0) {
514 return 0;
515 }
516 int error = errno;
517 if (abort_on_error) {
518 __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
519 KMP_ERR(error), __kmp_msg_null);
520 }
521 return error;
522 }
523 int set_system_affinity(bool abort_on_error) const override {
524 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
525 "Illegal set affinity operation when not capable");
526#if KMP_OS_LINUX
527 long retval =
528 syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
529#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
530 int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
531 reinterpret_cast<cpuset_t *>(mask));
532 int retval = (r == 0 ? 0 : -1);
533#endif
534 if (retval >= 0) {
535 return 0;
536 }
537 int error = errno;
538 if (abort_on_error) {
539 __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
540 KMP_ERR(error), __kmp_msg_null);
541 }
542 return error;
543 }
544#endif // KMP_OS_AIX
545 };
546 void determine_capable(const char *env_var) override {
547 __kmp_affinity_determine_capable(env_var);
548 }
549 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
550 KMPAffinity::Mask *allocate_mask() override {
551 KMPNativeAffinity::Mask *retval = new Mask();
552 return retval;
553 }
554 void deallocate_mask(KMPAffinity::Mask *m) override {
555 KMPNativeAffinity::Mask *native_mask =
556 static_cast<KMPNativeAffinity::Mask *>(m);
557 delete native_mask;
558 }
559 KMPAffinity::Mask *allocate_mask_array(int num) override {
560 return new Mask[num];
561 }
562 void deallocate_mask_array(KMPAffinity::Mask *array) override {
563 Mask *linux_array = static_cast<Mask *>(array);
564 delete[] linux_array;
565 }
566 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
567 int index) override {
568 Mask *linux_array = static_cast<Mask *>(array);
569 return &(linux_array[index]);
570 }
571 api_type get_api_type() const override { return NATIVE_OS; }
572};
573#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
574 || KMP_OS_AIX */
575
576#if KMP_OS_WINDOWS
577class KMPNativeAffinity : public KMPAffinity {
578 class Mask : public KMPAffinity::Mask {
579 typedef ULONG_PTR mask_t;
580 static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
581 mask_t *mask;
582
583 public:
584 Mask() {
585 mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
586 }
587 ~Mask() {
588 if (mask)
589 __kmp_free(mask);
590 }
591 void set(int i) override {
592 mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
593 }
594 bool is_set(int i) const override {
595 return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
596 }
597 void clear(int i) override {
598 mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
599 }
600 void zero() override {
601 for (int i = 0; i < __kmp_num_proc_groups; ++i)
602 mask[i] = 0;
603 }
604 bool empty() const override {
605 for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
606 if (mask[i])
607 return false;
608 return true;
609 }
610 void copy(const KMPAffinity::Mask *src) override {
611 const Mask *convert = static_cast<const Mask *>(src);
612 for (int i = 0; i < __kmp_num_proc_groups; ++i)
613 mask[i] = convert->mask[i];
614 }
615 void bitwise_and(const KMPAffinity::Mask *rhs) override {
616 const Mask *convert = static_cast<const Mask *>(rhs);
617 for (int i = 0; i < __kmp_num_proc_groups; ++i)
618 mask[i] &= convert->mask[i];
619 }
620 void bitwise_or(const KMPAffinity::Mask *rhs) override {
621 const Mask *convert = static_cast<const Mask *>(rhs);
622 for (int i = 0; i < __kmp_num_proc_groups; ++i)
623 mask[i] |= convert->mask[i];
624 }
625 void bitwise_not() override {
626 for (int i = 0; i < __kmp_num_proc_groups; ++i)
627 mask[i] = ~(mask[i]);
628 }
629 bool is_equal(const KMPAffinity::Mask *rhs) const override {
630 const Mask *convert = static_cast<const Mask *>(rhs);
631 for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
632 if (mask[i] != convert->mask[i])
633 return false;
634 return true;
635 }
636 int begin() const override {
637 int retval = 0;
638 while (retval < end() && !is_set(retval))
639 ++retval;
640 return retval;
641 }
642 int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
643 int next(int previous) const override {
644 int retval = previous + 1;
645 while (retval < end() && !is_set(retval))
646 ++retval;
647 return retval;
648 }
649 int set_process_affinity(bool abort_on_error) const override {
650 if (__kmp_num_proc_groups <= 1) {
651 if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
652 DWORD error = GetLastError();
653 if (abort_on_error) {
654 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
655 __kmp_msg_null);
656 }
657 return error;
658 }
659 }
660 return 0;
661 }
662 int set_system_affinity(bool abort_on_error) const override {
663 if (__kmp_num_proc_groups > 1) {
664 // Check for a valid mask.
665 GROUP_AFFINITY ga;
666 int group = get_proc_group();
667 if (group < 0) {
668 if (abort_on_error) {
669 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
670 }
671 return -1;
672 }
673 // Transform the bit vector into a GROUP_AFFINITY struct
674 // and make the system call to set affinity.
675 ga.Group = group;
676 ga.Mask = mask[group];
677 ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
678
679 KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
680 if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
681 DWORD error = GetLastError();
682 if (abort_on_error) {
683 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
684 __kmp_msg_null);
685 }
686 return error;
687 }
688 } else {
689 if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
690 DWORD error = GetLastError();
691 if (abort_on_error) {
692 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
693 __kmp_msg_null);
694 }
695 return error;
696 }
697 }
698 return 0;
699 }
700 int get_system_affinity(bool abort_on_error) override {
701 if (__kmp_num_proc_groups > 1) {
702 this->zero();
703 GROUP_AFFINITY ga;
704 KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
705 if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
706 DWORD error = GetLastError();
707 if (abort_on_error) {
708 __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
709 KMP_ERR(error), __kmp_msg_null);
710 }
711 return error;
712 }
713 if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
714 (ga.Mask == 0)) {
715 return -1;
716 }
717 mask[ga.Group] = ga.Mask;
718 } else {
719 mask_t newMask, sysMask, retval;
720 if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
721 DWORD error = GetLastError();
722 if (abort_on_error) {
723 __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
724 KMP_ERR(error), __kmp_msg_null);
725 }
726 return error;
727 }
728 retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
729 if (!retval) {
730 DWORD error = GetLastError();
731 if (abort_on_error) {
732 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
733 KMP_ERR(error), __kmp_msg_null);
734 }
735 return error;
736 }
737 newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
738 if (!newMask) {
739 DWORD error = GetLastError();
740 if (abort_on_error) {
741 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
742 KMP_ERR(error), __kmp_msg_null);
743 }
744 }
745 *mask = retval;
746 }
747 return 0;
748 }
749 int get_proc_group() const override {
750 int group = -1;
751 if (__kmp_num_proc_groups == 1) {
752 return 1;
753 }
754 for (int i = 0; i < __kmp_num_proc_groups; i++) {
755 if (mask[i] == 0)
756 continue;
757 if (group >= 0)
758 return -1;
759 group = i;
760 }
761 return group;
762 }
763 };
764 void determine_capable(const char *env_var) override {
765 __kmp_affinity_determine_capable(env_var);
766 }
767 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
768 KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
769 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
770 KMPAffinity::Mask *allocate_mask_array(int num) override {
771 return new Mask[num];
772 }
773 void deallocate_mask_array(KMPAffinity::Mask *array) override {
774 Mask *windows_array = static_cast<Mask *>(array);
775 delete[] windows_array;
776 }
777 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
778 int index) override {
779 Mask *windows_array = static_cast<Mask *>(array);
780 return &(windows_array[index]);
781 }
782 api_type get_api_type() const override { return NATIVE_OS; }
783};
784#endif /* KMP_OS_WINDOWS */
785#endif /* KMP_AFFINITY_SUPPORTED */
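// Illustrative sketch (not the runtime's actual initialization code): a
// KMPAffinity backend is used through the abstract interface regardless of
// which concrete class above was compiled in. The '__kmp_affinity_dispatch'
// pointer is assumed here for illustration (it is declared elsewhere in the
// runtime).
//
//   #if KMP_AFFINITY_SUPPORTED
//   KMPAffinity *api = __kmp_affinity_dispatch;
//   api->determine_capable("KMP_AFFINITY"); // may enable or disable affinity
//   if (KMP_AFFINITY_CAPABLE()) {
//     KMPAffinity::Mask *m = api->allocate_mask();
//     m->zero();
//     m->set(0);                             // logical CPU 0
//     m->set_system_affinity(/*abort_on_error=*/false);
//     api->deallocate_mask(m);
//   }
//   #endif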
786
787// Describe an attribute for a level in the machine topology
788struct kmp_hw_attr_t {
789 int core_type : 8;
790 int core_eff : 8;
791 unsigned valid : 1;
792 unsigned reserved : 15;
793
794 static const int UNKNOWN_CORE_EFF = -1;
795
796 kmp_hw_attr_t()
797 : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
798 valid(0), reserved(0) {}
799 void set_core_type(kmp_hw_core_type_t type) {
800 valid = 1;
801 core_type = type;
802 }
803 void set_core_eff(int eff) {
804 valid = 1;
805 core_eff = eff;
806 }
807 kmp_hw_core_type_t get_core_type() const {
808 return (kmp_hw_core_type_t)core_type;
809 }
810 int get_core_eff() const { return core_eff; }
811 bool is_core_type_valid() const {
812 return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
813 }
814 bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
815 operator bool() const { return valid; }
816 void clear() {
817 core_type = KMP_HW_CORE_TYPE_UNKNOWN;
818 core_eff = UNKNOWN_CORE_EFF;
819 valid = 0;
820 }
821 bool contains(const kmp_hw_attr_t &other) const {
822 if (!valid && !other.valid)
823 return true;
824 if (valid && other.valid) {
825 if (other.is_core_type_valid()) {
826 if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
827 return false;
828 }
829 if (other.is_core_eff_valid()) {
830 if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
831 return false;
832 }
833 return true;
834 }
835 return false;
836 }
837#if KMP_AFFINITY_SUPPORTED
838 bool contains(const kmp_affinity_attrs_t &attr) const {
839 if (!valid && !attr.valid)
840 return true;
841 if (valid && attr.valid) {
842 if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
843 return (is_core_type_valid() &&
844 (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
845 if (attr.core_eff != UNKNOWN_CORE_EFF)
846 return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
847 return true;
848 }
849 return false;
850 }
851#endif // KMP_AFFINITY_SUPPORTED
852 bool operator==(const kmp_hw_attr_t &rhs) const {
853 return (rhs.valid == valid && rhs.core_eff == core_eff &&
854 rhs.core_type == core_type);
855 }
856 bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
857};
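// Example of the containment semantics above (illustrative only): every field
// that is set in the requested attribute must match this attribute.
//
//   kmp_hw_attr_t core;     // attribute recorded for some core
//   core.set_core_eff(1);
//   kmp_hw_attr_t request;  // constraint, e.g. from a KMP_HW_SUBSET item
//   request.set_core_eff(1);
//   bool ok = core.contains(request); // true: core_eff matches and no
//                                     // core_type constraint was requested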
858
859#if KMP_AFFINITY_SUPPORTED
860KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
861#endif
862
863class kmp_hw_thread_t {
864public:
865 static const int UNKNOWN_ID = -1;
866 static const int MULTIPLE_ID = -2;
867 static int compare_ids(const void *a, const void *b);
868 static int compare_compact(const void *a, const void *b);
869 int ids[KMP_HW_LAST];
870 int sub_ids[KMP_HW_LAST];
871 bool leader;
872 int os_id;
873 int original_idx;
874 kmp_hw_attr_t attrs;
875
876 void print() const;
877 void clear() {
878 for (int i = 0; i < (int)KMP_HW_LAST; ++i)
879 ids[i] = UNKNOWN_ID;
880 leader = false;
881 attrs.clear();
882 }
883};
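// Example (illustrative): ids[] holds one id per detected topology level, in
// the same order as kmp_topology_t::types[]. On a machine enumerated as
// socket/core/thread, a hardware thread on socket 1, core 5, SMT thread 0
// would carry ids = {1, 5, 0} for levels 0..2, with os_id holding the OS
// processor number.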
884
885class kmp_topology_t {
886
887 struct flags_t {
888 int uniform : 1;
889 int reserved : 31;
890 };
891
892 int depth;
893
894 // The following arrays are all 'depth' long, but are allocated to hold up
895 // to KMP_HW_LAST objects so that layers can be added later without
896 // reallocating any of them.
897
898 // Ordered array of the types in the topology
899 kmp_hw_t *types;
900
901 // Quick topology ratios; for non-uniform topologies, each entry holds the
902 // max number of itemAs per itemB,
903 // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
904 int *ratio;
905
906 // Storage containing the absolute number of each topology layer
907 int *count;
908
909 // The number of core efficiencies. This is only useful for hybrid
910 // topologies. Core efficiencies will range from 0 to num efficiencies - 1
911 int num_core_efficiencies;
912 int num_core_types;
913 kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
914
915 // The hardware threads array
916 // hw_threads is num_hw_threads long
917 // Each hw_thread's ids and sub_ids are depth deep
918 int num_hw_threads;
919 kmp_hw_thread_t *hw_threads;
920
921 // Equivalence hash where the key is the hardware topology item
922 // and the value is the equivalent hardware topology type in the
923 // types[] array; if the value is KMP_HW_UNKNOWN, then there is no
924 // known equivalence for the topology type.
925 kmp_hw_t equivalent[KMP_HW_LAST];
926
927 // Flags describing the topology
928 flags_t flags;
929
930 // Compact value used during sort_compact()
931 int compact;
932
933#if KMP_GROUP_AFFINITY
934 // Insert topology information about Windows Processor groups
935 void _insert_windows_proc_groups();
936#endif
937
938 // Count each item & get the num x's per y
939 // e.g., get the number of cores and the number of threads per core
940 // for each (x, y) in (KMP_HW_* , KMP_HW_*)
941 void _gather_enumeration_information();
942
943 // Remove layers that don't add information to the topology.
944 // This is done by having the layer take on the id = UNKNOWN_ID (-1)
945 void _remove_radix1_layers();
946
947 // Find out if the topology is uniform
948 void _discover_uniformity();
949
950 // Set all the sub_ids for each hardware thread
951 void _set_sub_ids();
952
953 // Set global affinity variables describing the number of threads per
954 // core, the number of packages, the number of cores per package, and
955 // the number of cores.
956 void _set_globals();
957
958 // Set the last level cache equivalent type
959 void _set_last_level_cache();
960
961 // Return the number of cores with a particular attribute, 'attr'.
962 // If 'find_all' is true, then find all cores on the machine, otherwise find
963 // all cores per the layer 'above'
964 int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
965 bool find_all = false) const;
966
967public:
968 // Force use of allocate()/deallocate()
969 kmp_topology_t() = delete;
970 kmp_topology_t(const kmp_topology_t &t) = delete;
971 kmp_topology_t(kmp_topology_t &&t) = delete;
972 kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
973 kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
974
975 static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
976 static void deallocate(kmp_topology_t *);
977
978 // Functions used in create_map() routines
979 kmp_hw_thread_t &at(int index) {
980 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
981 return hw_threads[index];
982 }
983 const kmp_hw_thread_t &at(int index) const {
984 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
985 return hw_threads[index];
986 }
987 int get_num_hw_threads() const { return num_hw_threads; }
988 void sort_ids() {
989 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
990 kmp_hw_thread_t::compare_ids);
991 }
992
993 // Insert a new topology layer after allocation
994 void insert_layer(kmp_hw_t type, const int *ids);
995
996 // Check if the hardware ids are unique; return true if they are,
997 // false otherwise.
998 bool check_ids() const;
999
1000 // Function to call after the create_map() routine
1001 void canonicalize();
1002 void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
1003
1004// Functions used after canonicalize() is called
1005
1006#if KMP_AFFINITY_SUPPORTED
1007 // Set the granularity for affinity settings
1008 void set_granularity(kmp_affinity_t &stgs) const;
1009 bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
1010 bool restrict_to_mask(const kmp_affin_mask_t *mask);
1011 bool filter_hw_subset();
1012#endif
1013 bool is_uniform() const { return flags.uniform; }
1014 // Return the topology's equivalent for a given type;
1015 // returns KMP_HW_UNKNOWN when there is no equivalent type.
1016 kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
1017 if (type == KMP_HW_UNKNOWN)
1018 return KMP_HW_UNKNOWN;
1019 return equivalent[type];
1020 }
1021 // Set type1 = type2
1022 void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
1023 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
1024 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
1025 kmp_hw_t real_type2 = equivalent[type2];
1026 if (real_type2 == KMP_HW_UNKNOWN)
1027 real_type2 = type2;
1028 equivalent[type1] = real_type2;
1029 // This loop is required since any of the types may have been set to
1030 // be equivalent to type1. They all must be checked and reset to type2.
1031 KMP_FOREACH_HW_TYPE(type) {
1032 if (equivalent[type] == type1) {
1033 equivalent[type] = real_type2;
1034 }
1035 }
1036 }
1037 // Calculate number of types corresponding to level1
1038 // per types corresponding to level2 (e.g., number of threads per core)
1039 int calculate_ratio(int level1, int level2) const {
1040 KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
1041 KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
1042 int r = 1;
1043 for (int level = level1; level > level2; --level)
1044 r *= ratio[level];
1045 return r;
1046 }
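// Worked example for calculate_ratio() (illustrative): with the ratio array
// from the comment above, ratio = {4 packages, 6 cores/package,
// 2 threads/core} and levels {0:socket, 1:core, 2:thread},
//   calculate_ratio(2, 1) == 2   // threads per core
//   calculate_ratio(2, 0) == 12  // threads per socket (2 * 6)
//   calculate_ratio(1, 0) == 6   // cores per socket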
1047 int get_ratio(int level) const {
1048 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1049 return ratio[level];
1050 }
1051 int get_depth() const { return depth; }
1052 kmp_hw_t get_type(int level) const {
1053 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1054 return types[level];
1055 }
1056 int get_level(kmp_hw_t type) const {
1057 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
1058 int eq_type = equivalent[type];
1059 if (eq_type == KMP_HW_UNKNOWN)
1060 return -1;
1061 for (int i = 0; i < depth; ++i)
1062 if (types[i] == eq_type)
1063 return i;
1064 return -1;
1065 }
1066 int get_count(int level) const {
1067 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1068 return count[level];
1069 }
1070 // Return the total number of cores with attribute 'attr'
1071 int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
1072 return _get_ncores_with_attr(attr, -1, true);
1073 }
1074 // Return the number of cores with attribute
1075 // 'attr' per topology level 'above'
1076 int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
1077 return _get_ncores_with_attr(attr, above, false);
1078 }
1079
1080#if KMP_AFFINITY_SUPPORTED
1081 friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
1082 void sort_compact(kmp_affinity_t &affinity) {
1083 compact = affinity.compact;
1084 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
1085 kmp_hw_thread_t::compare_compact);
1086 }
1087#endif
1088 void print(const char *env_var = "KMP_AFFINITY") const;
1089 void dump() const;
1090};
1091extern kmp_topology_t *__kmp_topology;
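// Usage sketch (illustrative; assumes the runtime has already built and
// canonicalized __kmp_topology):
//
//   int core_level = __kmp_topology->get_level(KMP_HW_CORE);
//   if (core_level >= 0) {
//     int ncores = __kmp_topology->get_count(core_level); // total cores
//     bool uniform = __kmp_topology->is_uniform();
//     int thr_per_core = __kmp_topology->calculate_ratio(
//         __kmp_topology->get_level(KMP_HW_THREAD), core_level);
//   }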
1092
1093class kmp_hw_subset_t {
1094 const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
1095
1096public:
1097 // Describe a machine topology item in KMP_HW_SUBSET
1098 struct item_t {
1099 kmp_hw_t type;
1100 int num_attrs;
1101 int num[MAX_ATTRS];
1102 int offset[MAX_ATTRS];
1103 kmp_hw_attr_t attr[MAX_ATTRS];
1104 };
1105 // Put parentheses around max to avoid accidental use of the Windows max macro.
1106 const static int USE_ALL = (std::numeric_limits<int>::max)();
1107
1108private:
1109 int depth;
1110 int capacity;
1111 item_t *items;
1112 kmp_uint64 set;
1113 bool absolute;
1114 // The set must be able to handle up to KMP_HW_LAST number of layers
1115 KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
1116 // Sorting the KMP_HW_SUBSET items to follow topology order
1117 // All unknown topology types will be at the beginning of the subset
1118 static int hw_subset_compare(const void *i1, const void *i2) {
1119 kmp_hw_t type1 = ((const item_t *)i1)->type;
1120 kmp_hw_t type2 = ((const item_t *)i2)->type;
1121 int level1 = __kmp_topology->get_level(type1);
1122 int level2 = __kmp_topology->get_level(type2);
1123 return level1 - level2;
1124 }
1125
1126public:
1127 // Force use of allocate()/deallocate()
1128 kmp_hw_subset_t() = delete;
1129 kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
1130 kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
1131 kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
1132 kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
1133
1134 static kmp_hw_subset_t *allocate() {
1135 int initial_capacity = 5;
1136 kmp_hw_subset_t *retval =
1137 (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
1138 retval->depth = 0;
1139 retval->capacity = initial_capacity;
1140 retval->set = 0ull;
1141 retval->absolute = false;
1142 retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
1143 return retval;
1144 }
1145 static void deallocate(kmp_hw_subset_t *subset) {
1146 __kmp_free(subset->items);
1147 __kmp_free(subset);
1148 }
1149 void set_absolute() { absolute = true; }
1150 bool is_absolute() const { return absolute; }
1151 void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
1152 for (int i = 0; i < depth; ++i) {
1153 // Found an existing item for this layer type
1154 // Add the num, offset, and attr to this item
1155 if (items[i].type == type) {
1156 int idx = items[i].num_attrs++;
1157 if ((size_t)idx >= MAX_ATTRS)
1158 return;
1159 items[i].num[idx] = num;
1160 items[i].offset[idx] = offset;
1161 items[i].attr[idx] = attr;
1162 return;
1163 }
1164 }
1165 if (depth == capacity - 1) {
1166 capacity *= 2;
1167 item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
1168 for (int i = 0; i < depth; ++i)
1169 new_items[i] = items[i];
1170 __kmp_free(items);
1171 items = new_items;
1172 }
1173 items[depth].num_attrs = 1;
1174 items[depth].type = type;
1175 items[depth].num[0] = num;
1176 items[depth].offset[0] = offset;
1177 items[depth].attr[0] = attr;
1178 depth++;
1179 set |= (1ull << type);
1180 }
1181 int get_depth() const { return depth; }
1182 const item_t &at(int index) const {
1183 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1184 return items[index];
1185 }
1186 item_t &at(int index) {
1187 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1188 return items[index];
1189 }
1190 void remove(int index) {
1191 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1192 set &= ~(1ull << items[index].type);
1193 for (int j = index + 1; j < depth; ++j) {
1194 items[j - 1] = items[j];
1195 }
1196 depth--;
1197 }
1198 void sort() {
1199 KMP_DEBUG_ASSERT(__kmp_topology);
1200 qsort(items, depth, sizeof(item_t), hw_subset_compare);
1201 }
1202 bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1203
1204 // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
1205 // This means putting each of {sockets, cores, threads} in the topology if
1206 // they are not specified:
1207 // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
1208 // e.g., 3module => *s,3module,*c,*t
1209 // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
1210 // are expecting the traditional sockets/cores/threads topology. For newer
1211 // hardware, there can be intervening layers like dies/tiles/modules
1212 // (usually corresponding to a cache level). So when a user asks for
1213 // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
1214 // should get 12 hardware threads across 6 cores and effectively ignore the
1215 // module layer.
1216 void canonicalize(const kmp_topology_t *top) {
1217 // Layers to target for KMP_HW_SUBSET canonicalization
1218 kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1219
1220 // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
1221 if (is_absolute())
1222 return;
1223
1224 // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
1225 // topology doesn't have these layers
1226 for (kmp_hw_t type : targeted)
1227 if (top->get_level(type) == KMP_HW_UNKNOWN)
1228 return;
1229
1230 // Put targeted layers in topology if they do not exist
1231 for (kmp_hw_t type : targeted) {
1232 bool found = false;
1233 for (int i = 0; i < get_depth(); ++i) {
1234 if (top->get_equivalent_type(items[i].type) == type) {
1235 found = true;
1236 break;
1237 }
1238 }
1239 if (!found) {
1240 push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
1241 }
1242 }
1243 sort();
1244 // Set as an absolute topology that only targets the targeted layers
1245 set_absolute();
1246 }
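// Example of the canonicalization above (illustrative): a subset equivalent
// to "2c" on a topology that has socket/core/thread layers.
//
//   kmp_hw_subset_t *s = kmp_hw_subset_t::allocate();
//   s->push_back(2, KMP_HW_CORE, 0, kmp_hw_attr_t{});   // "2c"
//   s->canonicalize(__kmp_topology);
//   // s now also contains USE_ALL sockets and USE_ALL threads, is sorted
//   // into topology order, and is marked absolute.
//   kmp_hw_subset_t::deallocate(s);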
1247 void dump() const {
1248 printf("**********************\n");
1249 printf("*** kmp_hw_subset: ***\n");
1250 printf("* depth: %d\n", depth);
1251 printf("* items:\n");
1252 for (int i = 0; i < depth; ++i) {
1253 printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1254 for (int j = 0; j < items[i].num_attrs; ++j) {
1255 printf(" num: %d, offset: %d, attr: ", items[i].num[j],
1256 items[i].offset[j]);
1257 if (!items[i].attr[j]) {
1258 printf(" (none)\n");
1259 } else {
1260 printf(
1261 " core_type = %s, core_eff = %d\n",
1262 __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1263 items[i].attr[j].get_core_eff());
1264 }
1265 }
1266 }
1267 printf("* set: 0x%llx\n", set);
1268 printf("* absolute: %d\n", absolute);
1269 printf("**********************\n");
1270 }
1271};
1272extern kmp_hw_subset_t *__kmp_hw_subset;
1273
1274/* A structure for holding machine-specific hierarchy info to be computed once
1275 at init. This structure represents a mapping of threads to the actual machine
1276 hierarchy, or to our best guess at what the hierarchy might be, for the
1277 purpose of performing an efficient barrier. In the worst case, when there is
1278 no machine hierarchy information, it produces a tree suitable for a barrier,
1279 similar to the tree used in the hyper barrier. */
1280class hierarchy_info {
1281public:
1282 /* Good default values for number of leaves and branching factor, given no
1283 affinity information. Behaves a bit like hyper barrier. */
1284 static const kmp_uint32 maxLeaves = 4;
1285 static const kmp_uint32 minBranch = 4;
1291 kmp_uint32 maxLevels; // Allocated capacity of the per-level arrays below
1292
1297 kmp_uint32 depth; // Number of hierarchy levels currently in use
1298 kmp_uint32 base_num_threads = 0;
1299 enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1300 volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1301 // 2=initialization in progress
1302 volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1303
1308 kmp_uint32 *numPerLevel = nullptr; // numPerLevel[i]: fan-out between level i and level i+1
1309 kmp_uint32 *skipPerLevel = nullptr; // skipPerLevel[i]: leaf threads under one level-i node
1310
1311 void deriveLevels() {
1312 int hier_depth = __kmp_topology->get_depth();
1313 for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1314 numPerLevel[level] = __kmp_topology->get_ratio(i);
1315 }
1316 }
1317
1318 hierarchy_info()
1319 : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1320
1321 void fini() {
1322 if (!uninitialized && numPerLevel) {
1323 __kmp_free(numPerLevel);
1324 numPerLevel = NULL;
1325 uninitialized = not_initialized;
1326 }
1327 }
1328
1329 void init(int num_addrs) {
1330 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1331 &uninitialized, not_initialized, initializing);
1332 if (bool_result == 0) { // Wait for initialization
1333 while (TCR_1(uninitialized) != initialized)
1334 KMP_CPU_PAUSE();
1335 return;
1336 }
1337 KMP_DEBUG_ASSERT(bool_result == 1);
1338
1339 /* Explicitly initialize the data fields here to prevent use of stale values
1340 observed when the static library is re-initialized multiple times (e.g.,
1341 when a non-OpenMP thread repeatedly launches/joins a thread that uses
1342 OpenMP). */
1343 depth = 1;
1344 resizing = 0;
1345 maxLevels = 7;
1346 numPerLevel =
1347 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1348 skipPerLevel = &(numPerLevel[maxLevels]);
1349 for (kmp_uint32 i = 0; i < maxLevels;
1350 ++i) { // init numPerLevel[*] to 1 item per level
1351 numPerLevel[i] = 1;
1352 skipPerLevel[i] = 1;
1353 }
1354
1355 // Derive the per-level fan-out from the machine topology, if one was detected
1356 if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1357 deriveLevels();
1358 } else {
1359 numPerLevel[0] = maxLeaves;
1360 numPerLevel[1] = num_addrs / maxLeaves;
1361 if (num_addrs % maxLeaves)
1362 numPerLevel[1]++;
1363 }
1364
1365 base_num_threads = num_addrs;
1366 for (int i = maxLevels - 1; i >= 0;
1367 --i) // count non-empty levels to get depth
1368 if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1369 depth++;
1370
1371 kmp_uint32 branch = minBranch;
1372 if (numPerLevel[0] == 1)
1373 branch = num_addrs / maxLeaves;
1374 if (branch < minBranch)
1375 branch = minBranch;
1376 for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1377 while (numPerLevel[d] > branch ||
1378 (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1379 if (numPerLevel[d] & 1)
1380 numPerLevel[d]++;
1381 numPerLevel[d] = numPerLevel[d] >> 1;
1382 if (numPerLevel[d + 1] == 1)
1383 depth++;
1384 numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1385 }
1386 if (numPerLevel[0] == 1) {
1387 branch = branch >> 1;
1388 if (branch < 4)
1389 branch = minBranch;
1390 }
1391 }
1392
1393 for (kmp_uint32 i = 1; i < depth; ++i)
1394 skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1395 // Fill in hierarchy in the case of oversubscription
1396 for (kmp_uint32 i = depth; i < maxLevels; ++i)
1397 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1398
1399 uninitialized = initialized; // One writer
1400 }
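// Worked example of init() (illustrative): with no machine topology and
// num_addrs == 16, numPerLevel starts as {4, 4, 1, ...} (maxLeaves == 4
// leaves per node, 16 / 4 == 4 nodes above them), depth becomes 3, and
// skipPerLevel is filled in as {1, 4, 16, 32, 64, 128, 256}, the entries past
// the depth doubling to cover oversubscription.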
1401
1402 // Resize the hierarchy if nproc changes to something larger than before
1403 void resize(kmp_uint32 nproc) {
1404 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1405 while (bool_result == 0) { // someone else is trying to resize
1406 KMP_CPU_PAUSE();
1407 if (nproc <= base_num_threads) // happy with other thread's resize
1408 return;
1409 else // try to resize
1410 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1411 }
1412 KMP_DEBUG_ASSERT(bool_result != 0);
1413 if (nproc <= base_num_threads)
1414 return; // happy with other thread's resize
1415
1416 // Calculate new maxLevels
1417 kmp_uint32 old_sz = skipPerLevel[depth - 1];
1418 kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1419 // First see if old maxLevels is enough to contain new size
1420 for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1421 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1422 numPerLevel[i - 1] *= 2;
1423 old_sz *= 2;
1424 depth++;
1425 }
1426 if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1427 while (nproc > old_sz) {
1428 old_sz *= 2;
1429 incs++;
1430 depth++;
1431 }
1432 maxLevels += incs;
1433
1434 // Resize arrays
1435 kmp_uint32 *old_numPerLevel = numPerLevel;
1436 kmp_uint32 *old_skipPerLevel = skipPerLevel;
1437 numPerLevel = skipPerLevel = NULL;
1438 numPerLevel =
1439 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1440 skipPerLevel = &(numPerLevel[maxLevels]);
1441
1442 // Copy old elements from old arrays
1443 for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1444 // init numPerLevel[*] to 1 item per level
1445 numPerLevel[i] = old_numPerLevel[i];
1446 skipPerLevel[i] = old_skipPerLevel[i];
1447 }
1448
1449 // Init new elements in arrays to 1
1450 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1451 // init numPerLevel[*] to 1 item per level
1452 numPerLevel[i] = 1;
1453 skipPerLevel[i] = 1;
1454 }
1455
1456 // Free old arrays
1457 __kmp_free(old_numPerLevel);
1458 }
1459
1460 // Fill in oversubscription levels of hierarchy
1461 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1462 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1463
1464 base_num_threads = nproc;
1465 resizing = 0; // One writer
1466 }
1467};
1468#endif // KMP_AFFINITY_H