#include "kmp_dispatch.h"
#include "kmp_dispatch_hier.h"
#include "ompt-specific.h"
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);
  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}
// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type.
  // Nonmonotonic is the default for dynamic schedules.
  int monotonicity = SCHEDULE_NONMONOTONIC;

  // Default to monotonic for executables compiled with OpenMP 4.5 or less.
  if (loc != NULL && loc->get_openmp_version() < 50)
    monotonicity = SCHEDULE_MONOTONIC;

  if (use_hier || __kmp_force_monotonic)
    monotonicity = SCHEDULE_MONOTONIC;
  else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;

  return monotonicity;
}
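// Note added for clarity (not from the original source): the modifiers map to
// OpenMP's schedule(modifier: kind) syntax, e.g.
//   #pragma omp for schedule(nonmonotonic: dynamic, 4)
// A nonmonotonic schedule allows a thread to receive chunks out of increasing
// iteration order (which is what permits the static_steal algorithm below),
// while a monotonic schedule guarantees each thread sees its chunks in
// ascending iteration order, as ordered loops require.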
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
// Return floating point number rounded to two decimal points
static inline float __kmp_round_2decimal_val(float num) {
  return (float)(static_cast<int>(num * 100 + 0.5)) / 100;
}
static inline int __kmp_get_round_val(float num) {
  return static_cast<int>(num < 0 ? num - 0.5 : num + 0.5);
}
#endif
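// Worked examples for the helpers above (added for illustration):
//   __kmp_round_2decimal_val(1.2345f) -> 1.23f  (1.2345*100+0.5 = 123.95 -> 123 -> 1.23)
//   __kmp_get_round_val(2.6f)  ->  3
//   __kmp_get_round_val(-2.6f) -> -3            (negative values round away from zero)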
template <typename T>
inline void
__kmp_initialize_self_buffer(kmp_team_t *team, T id,
                             dispatch_private_info_template<T> *pr,
                             typename traits_t<T>::unsigned_t nchunks, T nproc,
                             typename traits_t<T>::unsigned_t &init,
                             T &small_chunk, T &extras, T &p_extra) {

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
  if (pr->flags.use_hybrid) {
    kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
    kmp_hw_core_type_t type =
        (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
    T pchunks = pr->u.p.pchunks;
    T echunks = nchunks - pchunks;
    T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
    T num_procs_with_ecore = nproc - num_procs_with_pcore;
    T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
    T big_chunk = pchunks / num_procs_with_pcore; // chunks per p-core thread
    small_chunk = echunks / num_procs_with_ecore; // chunks per e-core thread

    extras =
        (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);

    p_extra = (big_chunk - small_chunk);

    if (type == KMP_HW_CORE_TYPE_CORE) {
      if (id < first_thread_with_ecore) {
        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
      } else {
        init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
               (id < extras ? id : extras);
      }
    } else {
      if (id == first_thread_with_ecore) {
        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
      } else {
        init = id * small_chunk + first_thread_with_ecore * p_extra +
               (id < extras ? id : extras);
      }
    }
    p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
    return;
  }
#endif

  small_chunk = nchunks / nproc; // chunks per thread
  extras = nchunks % nproc;
  p_extra = 0;
  init = id * small_chunk + (id < extras ? id : extras);
}
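// Illustrative example (not part of the original source): with nchunks = 10
// and nproc = 4 the uniform path above yields small_chunk = 2 and extras = 2,
// so threads 0..3 start at chunk indices 0, 3, 6, 8 and own 3, 3, 2, 2 chunks
// respectively, since init = id * small_chunk + min(id, extras).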
#if KMP_STATIC_STEAL_ENABLED
enum { // values for steal_flag (possible states of a private per-loop buffer)
  UNUSED = 0,
  CLAIMED = 1, // owner thread started initialization
  READY = 2, // available for stealing
  THIEF = 3 // finished by owner, or claimed by thief
};
#endif
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  int monotonicity;
  bool use_hier;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif

#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
#else
  use_hier = false;
#endif

  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
242 pr->flags.nomerge = TRUE;
246 pr->flags.nomerge = FALSE;
248 pr->type_size = traits_t<T>::type_size;
250 pr->flags.ordered = TRUE;
254 pr->flags.ordered = FALSE;
257 if (pr->flags.ordered) {
258 monotonicity = SCHEDULE_MONOTONIC;
262 schedule = __kmp_static;
264 if (schedule == kmp_sch_runtime) {
267 schedule = team->t.t_sched.r_sched_type;
268 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
269 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
270 if (pr->flags.ordered)
271 monotonicity = SCHEDULE_MONOTONIC;
275 schedule = __kmp_guided;
277 schedule = __kmp_static;
281 chunk = team->t.t_sched.chunk;
290 buff = __kmp_str_format(
"__kmp_dispatch_init_algorithm: T#%%d new: "
291 "schedule:%%d chunk:%%%s\n",
293 KD_TRACE(10, (buff, gtid, schedule, chunk));
294 __kmp_str_free(&buff);
299 schedule = __kmp_guided;
302 chunk = KMP_DEFAULT_CHUNK;
308 schedule = __kmp_auto;
313 buff = __kmp_str_format(
314 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
315 "schedule:%%d chunk:%%%s\n",
317 KD_TRACE(10, (buff, gtid, schedule, chunk));
318 __kmp_str_free(&buff);
322#if KMP_STATIC_STEAL_ENABLED
324 if (schedule == kmp_sch_dynamic_chunked) {
325 if (monotonicity == SCHEDULE_NONMONOTONIC)
326 schedule = kmp_sch_static_steal;
330 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
331 schedule = kmp_sch_guided_iterative_chunked;
332 KMP_WARNING(DispatchManyThreads);
336 schedule = team->t.t_sched.r_sched_type;
337 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
338 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
342 schedule == __kmp_static) {
343 schedule = kmp_sch_static_balanced_chunked;
348 chunk = team->t.t_sched.chunk * chunk;
358 buff = __kmp_str_format(
359 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
362 KD_TRACE(10, (buff, gtid, schedule, chunk));
363 __kmp_str_free(&buff);
367 pr->u.p.parm1 = chunk;
370 "unknown scheduling type");
374 if (__kmp_env_consistency_check) {
376 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
377 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
391 tc = (UT)(lb - ub) / (-st) + 1;
399 tc = (UT)(ub - lb) / st + 1;
406 if (KMP_MASTER_GTID(gtid)) {
417 pr->u.p.last_upper = ub + st;
423 if (pr->flags.ordered) {
424 pr->ordered_bumped = 0;
425 pr->u.p.ordered_lower = 1;
426 pr->u.p.ordered_upper = 0;
#if KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_steal: {
    T ntc, init = 0;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras, p_extra = 0;
      kmp_uint32 old = UNUSED;
      int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
      if (traits_t<T>::type_size > 4) {
        // Use a dynamically allocated per-buffer lock; freed in
        // __kmp_dispatch_next when status == 0.
        pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.steal_lock);
      }
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
      // Iterations are divided in a 60/40 skewed distribution among CORE and
      // ATOM processors for hybrid systems
      bool use_hybrid = false;
      kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
      T first_thread_with_ecore = 0;
      T num_procs_with_pcore = 0;
      T num_procs_with_ecore = 0;
      T p_ntc = 0, e_ntc = 0;
      if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
          __kmp_affinity.type != affinity_explicit) {
        use_hybrid = true;
        core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
        if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
            __kmp_first_osid_with_ecore > -1) {
          for (int i = 0; i < team->t.t_nproc; ++i) {
            kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
                                          ->th.th_topology_attrs.core_type;
            int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
            if (id == __kmp_first_osid_with_ecore) {
              first_thread_with_ecore =
                  team->t.t_threads[i]->th.th_info.ds.ds_tid;
            }
            if (type == KMP_HW_CORE_TYPE_CORE) {
              num_procs_with_pcore++;
            } else if (type == KMP_HW_CORE_TYPE_ATOM) {
              num_procs_with_ecore++;
            } else {
              use_hybrid = false;
              break;
            }
          }
        }
        if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
          float multiplier = 60.0 / 40.0;
          float p_ratio = (float)num_procs_with_pcore / nproc;
          float e_ratio = (float)num_procs_with_ecore / nproc;
          float e_multiplier =
              (float)1 /
              (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
          float p_multiplier = multiplier * e_multiplier;
          p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
          if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
            e_ntc =
                (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
          else
            e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
          KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);

          // Use regular static steal if not enough chunks for the skewed
          // distribution
          use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
                                       e_ntc >= num_procs_with_ecore)
                            ? true
                            : false);
        } else {
          use_hybrid = false;
        }
      }
      pr->flags.use_hybrid = use_hybrid;
      pr->u.p.pchunks = p_ntc;
      pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
      pr->u.p.first_thread_with_ecore = first_thread_with_ecore;

      if (use_hybrid) {
        KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
        T big_chunk = p_ntc / num_procs_with_pcore;
        small_chunk = e_ntc / num_procs_with_ecore;

        extras =
            (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);
        p_extra = (big_chunk - small_chunk);

        if (core_type == KMP_HW_CORE_TYPE_CORE) {
          if (id < first_thread_with_ecore) {
            init =
                id * small_chunk + id * p_extra + (id < extras ? id : extras);
          } else {
            init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
                   (id < extras ? id : extras);
          }
        } else {
          if (id == first_thread_with_ecore) {
            init =
                id * small_chunk + id * p_extra + (id < extras ? id : extras);
          } else {
            init = id * small_chunk + first_thread_with_ecore * p_extra +
                   (id < extras ? id : extras);
          }
        }
        p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
      } else
#endif
      {
        // chunks are distributed evenly among threads
        small_chunk = ntc / nproc;
        extras = ntc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        p_extra = 0;
      }
      pr->u.p.count = init;
      if (claimed) { // did we succeed in claiming our own buffer?
        pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
        // READY means other threads may steal from this thread from now on.
        KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
      } else {
        // another thread has stolen our whole range
        KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
        pr->u.p.ub = init; // mark that there are no iterations to work on
      }
      pr->u.p.parm2 = ntc; // save number of chunks
      // parm3 is the number of times to attempt stealing (a heuristic)
      pr->u.p.parm3 = nproc;
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      break;
    } else {
      /* too few chunks: switching to kmp_sch_dynamic_chunked */
      schedule = kmp_sch_dynamic_chunked;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
                     "kmp_sch_dynamic_chunked\n",
                     gtid));
      goto dynamic_init;
    } // if
  } // case
  break;
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;
      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound; "ub" is the user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so lastprivate matches exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to be a multiple of simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
        goto dynamic_init;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / (double)nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
        goto dynamic_init;
      } else {
711#if KMP_USE_X87CONTROL
721 unsigned int oldFpcw = _control87(0, 0);
722 _control87(_PC_64, _MCW_PC);
726 long double target = ((
long double)chunk * 2 + 1) * nproc / tc;
733 x = 1.0 - 0.5 / (double)nproc;
744 ptrdiff_t natural_alignment =
745 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
749 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
754 *(DBL *)&pr->u.p.parm3 = x;
767 p = __kmp_pow<UT>(x, right);
772 }
while (p > target && right < (1 << 27));
          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of the computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in the thread-private dispatch structure */
        pr->u.p.parm2 = cross;
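// Note added for clarity: "cross" is the first chunk index at which the
// analytically computed guided chunk size falls to the specified chunk; from
// that point on the loop behaves like dynamic scheduling with a fixed chunk,
// which is why the crossover index is saved in parm2.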
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
        /* use the 64-bit term stored in parm3 to work around x87 precision */
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc -
                        __kmp_dispatch_guided_remaining(
                            tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
  dynamic_init:
    if (pr->u.p.parm1 <= 0)
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    else if (pr->u.p.parm1 > tc)
      pr->u.p.parm1 = tc;
    // Store the total number of chunks to prevent integer overflow during
    // bounds calculations in the get-next-chunk routine.
    pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk size = parm1 */
    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;
    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));
    if (parm2 < 1) {
      parm2 = 1;
    }
    /* L : size of the last cycle; make sure it is not larger than the first */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }
    /* N : number of chunks */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;
    if (parm3 < 2) {
      parm3 = 2;
    }
    /* sigma : decreasing amount per chunk */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
    break;
  } // case
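// Note added for illustration (not from the original source): the trapezoid
// schedule hands out chunks whose sizes decrease linearly from parm2 (the
// first chunk) down to parm1 (the last), in parm3 steps that shrink by parm4.
// For example, tc = 1000 and nproc = 4 give parm2 = 125; with chunk = 1 this
// yields parm1 = 1, parm3 = 16 and parm4 = 8, i.e. chunk sizes
// 125, 117, 109, ... decreasing by 8 each time.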
  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null); // Variadic argument list terminator
  } break;
  } // switch
  pr->schedule = schedule;
}
#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // type does not matter here so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif
// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, else long double
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));
  __kmp_assert_valid_gtid(gtid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
1002 th = __kmp_threads[gtid];
1003 team = th->th.th_team;
1004 active = !team->t.t_serialized;
1005 th->th.th_ident = loc;
1010 if (schedule == __kmp_static) {
1016#if KMP_USE_HIER_SCHED
1022 my_buffer_index = th->th.th_dispatch->th_disp_index;
1023 pr =
reinterpret_cast<dispatch_private_info_template<T> *
>(
1025 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1026 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
1031 if (pr->flags.use_hier) {
1033 KD_TRACE(100, (
"__kmp_dispatch_init: T#%d ordered loop detected. "
1034 "Disabling hierarchical scheduling.\n",
1036 pr->flags.use_hier = FALSE;
1039 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
1042 if (!ordered && !pr->flags.use_hier)
1043 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
1048 kmp_uint64 cur_chunk = chunk;
1049 int itt_need_metadata_reporting =
1050 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
1051 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
1052 team->t.t_active_level == 1;
1055 pr =
reinterpret_cast<dispatch_private_info_template<T> *
>(
1056 th->th.th_dispatch->th_disp_buffer);
1058 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1059 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1061 my_buffer_index = th->th.th_dispatch->th_disp_index++;
1064 pr =
reinterpret_cast<dispatch_private_info_template<T> *
>(
1066 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1067 sh =
reinterpret_cast<dispatch_shared_info_template<T>
volatile *
>(
1068 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1069 KD_TRACE(10, (
"__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
1071 if (sh->buffer_index != my_buffer_index) {
1072 KD_TRACE(100, (
"__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
1073 " sh->buffer_index:%d\n",
1074 gtid, my_buffer_index, sh->buffer_index));
1075 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1076 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
1079 KD_TRACE(100, (
"__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1080 "sh->buffer_index:%d\n",
1081 gtid, my_buffer_index, sh->buffer_index));
1085 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
1089 chunk, (T)th->th.th_team_nproc,
1090 (T)th->th.th_info.ds.ds_tid);
1092 if (pr->flags.ordered == 0) {
1093 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
1094 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
1096 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
1097 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
1099 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1100 th->th.th_dispatch->th_dispatch_sh_current =
1101 CCAST(dispatch_shared_info_t *, (
volatile dispatch_shared_info_t *)sh);
1103 if (pr->flags.ordered) {
1104 __kmp_itt_ordered_init(gtid);
1107 if (itt_need_metadata_reporting) {
1109 kmp_uint64 schedtype = 0;
1111 case kmp_sch_static_chunked:
1112 case kmp_sch_static_balanced:
1114 case kmp_sch_static_greedy:
1115 cur_chunk = pr->u.p.parm1;
1117 case kmp_sch_dynamic_chunked:
1120 case kmp_sch_guided_iterative_chunked:
1121 case kmp_sch_guided_analytical_chunked:
1131 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
1133#if KMP_USE_HIER_SCHED
1134 if (pr->flags.use_hier) {
1136 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
1146 buff = __kmp_str_format(
1147 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1149 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1150 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1151 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1152 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1153 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1154 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1155 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
1156 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1157 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1158 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
1159 __kmp_str_free(&buff);
1162#if OMPT_SUPPORT && OMPT_OPTIONAL
1163 if (ompt_enabled.ompt_callback_work) {
1164 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1165 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1166 ompt_callbacks.ompt_callback(ompt_callback_work)(
1167 ompt_get_work_schedule(pr->schedule), ompt_scope_begin,
1168 &(team_info->parallel_data), &(task_info->task_data), pr->u.p.tc,
1169 OMPT_LOAD_RETURN_ADDRESS(gtid));
1172 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1182 typedef typename traits_t<UT>::signed_t ST;
1183 __kmp_assert_valid_gtid(gtid);
1184 kmp_info_t *th = __kmp_threads[gtid];
1186 KD_TRACE(100, (
"__kmp_dispatch_finish: T#%d called\n", gtid));
1187 if (!th->th.th_team->t.t_serialized) {
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
1195 KMP_DEBUG_ASSERT(pr);
1196 KMP_DEBUG_ASSERT(sh);
1197 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1198 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
1207 UT lower = pr->u.p.ordered_lower;
1213 buff = __kmp_str_format(
"__kmp_dispatch_finish: T#%%d before wait: "
1214 "ordered_iteration:%%%s lower:%%%s\n",
1215 traits_t<UT>::spec, traits_t<UT>::spec);
1216 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1217 __kmp_str_free(&buff);
1221 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1222 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1228 buff = __kmp_str_format(
"__kmp_dispatch_finish: T#%%d after wait: "
1229 "ordered_iteration:%%%s lower:%%%s\n",
1230 traits_t<UT>::spec, traits_t<UT>::spec);
1231 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1232 __kmp_str_free(&buff);
1236 test_then_inc<ST>((
volatile ST *)&sh->u.s.ordered_iteration);
1239 KD_TRACE(100, (
"__kmp_dispatch_finish: T#%d returned\n", gtid));
1242#ifdef KMP_GOMP_COMPAT
template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1246 typedef typename traits_t<UT>::signed_t ST;
1247 __kmp_assert_valid_gtid(gtid);
1248 kmp_info_t *th = __kmp_threads[gtid];
1250 KD_TRACE(100, (
"__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1251 if (!th->th.th_team->t.t_serialized) {
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
1258 KMP_DEBUG_ASSERT(pr);
1259 KMP_DEBUG_ASSERT(sh);
1260 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1261 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1263 UT lower = pr->u.p.ordered_lower;
1264 UT upper = pr->u.p.ordered_upper;
1265 UT inc = upper - lower + 1;
1267 if (pr->ordered_bumped == inc) {
1270 (
"__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1272 pr->ordered_bumped = 0;
1274 inc -= pr->ordered_bumped;
1280 buff = __kmp_str_format(
1281 "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1282 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1283 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1284 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1285 __kmp_str_free(&buff);
1289 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1290 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1293 KD_TRACE(1000, (
"__kmp_dispatch_finish_chunk: T#%d resetting "
1294 "ordered_bumped to zero\n",
1296 pr->ordered_bumped = 0;
1302 buff = __kmp_str_format(
1303 "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1304 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1305 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1306 traits_t<UT>::spec);
1308 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1309 __kmp_str_free(&buff);
1313 test_then_add<ST>((
volatile ST *)&sh->u.s.ordered_iteration, inc);
1317 KD_TRACE(100, (
"__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  bool last = false;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1350 __kmp_str_format(
"__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1351 "sh:%%p nproc:%%%s tid:%%%s\n",
1352 traits_t<T>::spec, traits_t<T>::spec);
1353 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1354 __kmp_str_free(&buff);
1359 if (pr->u.p.tc == 0) {
1361 (
"__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1367 switch (pr->schedule) {
1368#if KMP_STATIC_STEAL_ENABLED
1369 case kmp_sch_static_steal: {
1370 T chunk = pr->u.p.parm1;
1371 UT nchunks = pr->u.p.parm2;
1373 (
"__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1376 trip = pr->u.p.tc - 1;
1378 if (traits_t<T>::type_size > 4) {
1381 kmp_lock_t *lck = pr->u.p.steal_lock;
1382 KMP_DEBUG_ASSERT(lck != NULL);
1383 if (pr->u.p.count < (UT)pr->u.p.ub) {
1384 KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1385 __kmp_acquire_lock(lck, gtid);
1387 init = (pr->u.p.count)++;
1388 status = (init < (UT)pr->u.p.ub);
1389 __kmp_release_lock(lck, gtid);
1395 T while_limit = pr->u.p.parm3;
1397 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1398 __kmp_dispatch_num_buffers;
1400 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF);
1401 while ((!status) && (while_limit != ++while_index)) {
1402 dispatch_private_info_template<T> *v;
1404 T victimId = pr->u.p.parm4;
1405 T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1406 v =
reinterpret_cast<dispatch_private_info_template<T> *
>(
1407 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1408 KMP_DEBUG_ASSERT(v);
1409 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1410 oldVictimId != victimId) {
1411 victimId = (victimId + 1) % nproc;
1412 v =
reinterpret_cast<dispatch_private_info_template<T> *
>(
1413 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1414 KMP_DEBUG_ASSERT(v);
1416 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1419 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1420 kmp_uint32 old = UNUSED;
1422 status = v->steal_flag.compare_exchange_strong(old, THIEF);
1426 T small_chunk = 0, extras = 0, p_extra = 0;
1427 __kmp_initialize_self_buffer<T>(team,
id, pr, nchunks, nproc,
1428 init, small_chunk, extras,
1430 __kmp_acquire_lock(lck, gtid);
1431 pr->u.p.count = init + 1;
1432 pr->u.p.ub = init + small_chunk + p_extra + (
id < extras ? 1 : 0);
1433 __kmp_release_lock(lck, gtid);
1434 pr->u.p.parm4 = (
id + 1) % nproc;
1440 buff = __kmp_str_format(
"__kmp_dispatch_next_algorithm: T#%%d "
1441 "stolen chunks from T#%%d, "
1442 "count:%%%s ub:%%%s\n",
1443 traits_t<UT>::spec, traits_t<T>::spec);
1444 KD_TRACE(10, (buff, gtid,
id, pr->u.p.count, pr->u.p.ub));
1445 __kmp_str_free(&buff);
1449 if (pr->u.p.count < (UT)pr->u.p.ub)
1450 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1454 if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1455 v->u.p.count >= (UT)v->u.p.ub) {
1456 pr->u.p.parm4 = (victimId + 1) % nproc;
1459 lckv = v->u.p.steal_lock;
1460 KMP_ASSERT(lckv != NULL);
1461 __kmp_acquire_lock(lckv, gtid);
1463 if (v->u.p.count >= limit) {
1464 __kmp_release_lock(lckv, gtid);
1465 pr->u.p.parm4 = (victimId + 1) % nproc;
1471 remaining = limit - v->u.p.count;
1472 if (remaining > 7) {
1474 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1475 init = (v->u.p.ub -= (remaining >> 2));
1478 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1479 init = (v->u.p.ub -= 1);
1481 __kmp_release_lock(lckv, gtid);
1486 buff = __kmp_str_format(
1487 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1488 "count:%%%s ub:%%%s\n",
1489 traits_t<UT>::spec, traits_t<UT>::spec);
1490 KD_TRACE(10, (buff, gtid, victimId, init, limit));
1491 __kmp_str_free(&buff);
1494 KMP_DEBUG_ASSERT(init + 1 <= limit);
1495 pr->u.p.parm4 = victimId;
1498 __kmp_acquire_lock(lck, gtid);
1499 pr->u.p.count = init + 1;
1501 __kmp_release_lock(lck, gtid);
1503 if (init + 1 < limit)
1504 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
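// Note added for clarity (describing the stealing policy above, >4-byte case):
// once a thread exhausts its own buffer it marks itself THIEF and probes up to
// parm3 (== nproc) victims, starting from the neighbour stored in parm4. An
// UNUSED victim buffer is claimed wholesale; an active victim with more than 7
// remaining chunks gives up a quarter of them (remaining >> 2), otherwise a
// single chunk is taken, all under the victim's per-buffer lock.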
1517 union_i4 vold, vnew;
1518 if (pr->u.p.count < (UT)pr->u.p.ub) {
1519 KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1520 vold.b = *(
volatile kmp_int64 *)(&pr->u.p.count);
1523 while (!KMP_COMPARE_AND_STORE_REL64(
1524 (
volatile kmp_int64 *)&pr->u.p.count,
1525 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1526 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1528 vold.b = *(
volatile kmp_int64 *)(&pr->u.p.count);
1532 init = vold.p.count;
1533 status = (init < (UT)vold.p.ub);
1538 T while_limit = pr->u.p.parm3;
1540 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1541 __kmp_dispatch_num_buffers;
1543 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF);
1544 while ((!status) && (while_limit != ++while_index)) {
1545 dispatch_private_info_template<T> *v;
1547 T victimId = pr->u.p.parm4;
1548 T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1549 v =
reinterpret_cast<dispatch_private_info_template<T> *
>(
1550 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1551 KMP_DEBUG_ASSERT(v);
1552 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1553 oldVictimId != victimId) {
1554 victimId = (victimId + 1) % nproc;
1555 v =
reinterpret_cast<dispatch_private_info_template<T> *
>(
1556 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1557 KMP_DEBUG_ASSERT(v);
1559 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1562 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1563 kmp_uint32 old = UNUSED;
1565 status = v->steal_flag.compare_exchange_strong(old, THIEF);
1569 T small_chunk = 0, extras = 0, p_extra = 0;
1570 __kmp_initialize_self_buffer<T>(team,
id, pr, nchunks, nproc,
1571 init, small_chunk, extras,
1573 vnew.p.count = init + 1;
1574 vnew.p.ub = init + small_chunk + p_extra + (
id < extras ? 1 : 0);
1577 KMP_XCHG_FIXED64((
volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1579 *(
volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1581 pr->u.p.parm4 = (
id + 1) % nproc;
1587 buff = __kmp_str_format(
"__kmp_dispatch_next_algorithm: T#%%d "
1588 "stolen chunks from T#%%d, "
1589 "count:%%%s ub:%%%s\n",
1590 traits_t<UT>::spec, traits_t<T>::spec);
1591 KD_TRACE(10, (buff, gtid,
id, pr->u.p.count, pr->u.p.ub));
1592 __kmp_str_free(&buff);
1596 if (pr->u.p.count < (UT)pr->u.p.ub)
1597 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1603 vold.b = *(
volatile kmp_int64 *)(&v->u.p.count);
1604 if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1605 vold.p.count >= (UT)vold.p.ub) {
1606 pr->u.p.parm4 = (victimId + 1) % nproc;
1610 remaining = vold.p.ub - vold.p.count;
1613 if (remaining > 7) {
1614 vnew.p.ub -= remaining >> 2;
1618 KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1619 if (KMP_COMPARE_AND_STORE_REL64(
1620 (
volatile kmp_int64 *)&v->u.p.count,
1621 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1622 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1628 buff = __kmp_str_format(
1629 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1630 "count:%%%s ub:%%%s\n",
1631 traits_t<T>::spec, traits_t<T>::spec);
1632 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1633 __kmp_str_free(&buff);
1636 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1637 vold.p.ub - vnew.p.ub);
1639 pr->u.p.parm4 = victimId;
1642 vold.p.count = init + 1;
1644 KMP_XCHG_FIXED64((
volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1646 *(
volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1649 if (vold.p.count < (UT)vold.p.ub)
1650 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1666 limit = chunk + init - 1;
1668 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1670 KMP_DEBUG_ASSERT(init <= trip);
1674 if ((last = (limit >= trip)) != 0)
1680 *p_lb = start + init;
1681 *p_ub = start + limit;
1683 *p_lb = start + init * incr;
1684 *p_ub = start + limit * incr;
1690 case kmp_sch_static_balanced: {
1693 (
"__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1696 if ((status = !pr->u.p.count) != 0) {
1700 last = (pr->u.p.parm1 != 0);
1704 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1708 case kmp_sch_static_greedy:
1710 case kmp_sch_static_chunked: {
1713 KD_TRACE(100, (
"__kmp_dispatch_next_algorithm: T#%d "
1714 "kmp_sch_static_[affinity|chunked] case\n",
1716 parm1 = pr->u.p.parm1;
1718 trip = pr->u.p.tc - 1;
1719 init = parm1 * (pr->u.p.count + tid);
1721 if ((status = (init <= trip)) != 0) {
1724 limit = parm1 + init - 1;
1726 if ((last = (limit >= trip)) != 0)
1732 pr->u.p.count += nproc;
1735 *p_lb = start + init;
1736 *p_ub = start + limit;
1738 *p_lb = start + init * incr;
1739 *p_ub = start + limit * incr;
1742 if (pr->flags.ordered) {
1743 pr->u.p.ordered_lower = init;
1744 pr->u.p.ordered_upper = limit;
1750 case kmp_sch_dynamic_chunked: {
1752 UT chunk_size = pr->u.p.parm1;
1753 UT nchunks = pr->u.p.parm2;
1757 (
"__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1760 chunk_number = test_then_inc_acq<ST>((
volatile ST *)&sh->u.s.iteration);
1761 status = (chunk_number < nchunks);
1768 init = chunk_size * chunk_number;
1769 trip = pr->u.p.tc - 1;
1773 if ((last = (trip - init < (UT)chunk_size)))
1776 limit = chunk_size + init - 1;
1782 *p_lb = start + init;
1783 *p_ub = start + limit;
1785 *p_lb = start + init * incr;
1786 *p_ub = start + limit * incr;
1789 if (pr->flags.ordered) {
1790 pr->u.p.ordered_lower = init;
1791 pr->u.p.ordered_upper = limit;
1797 case kmp_sch_guided_iterative_chunked: {
1798 T chunkspec = pr->u.p.parm1;
1799 KD_TRACE(100, (
"__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1806 init = sh->u.s.iteration;
1807 remaining = trip - init;
1808 if (remaining <= 0) {
1817 init = test_then_add<ST>(RCAST(
volatile ST *, &sh->u.s.iteration),
1819 remaining = trip - init;
1820 if (remaining <= 0) {
1825 if ((T)remaining > chunkspec) {
1826 limit = init + chunkspec - 1;
1829 limit = init + remaining - 1;
1834 limit = init + (UT)((
double)remaining *
1835 *(
double *)&pr->u.p.parm3);
1836 if (compare_and_swap<ST>(RCAST(
volatile ST *, &sh->u.s.iteration),
1837 (ST)init, (ST)limit)) {
1849 *p_lb = start + init * incr;
1850 *p_ub = start + limit * incr;
1851 if (pr->flags.ordered) {
1852 pr->u.p.ordered_lower = init;
1853 pr->u.p.ordered_upper = limit;
1867 T chunk = pr->u.p.parm1;
1869 (
"__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1875 init = sh->u.s.iteration;
1876 remaining = trip - init;
1877 if (remaining <= 0) {
1881 KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
1883 if ((T)remaining < pr->u.p.parm2) {
1886 init = test_then_add<ST>(RCAST(
volatile ST *, &sh->u.s.iteration),
1888 remaining = trip - init;
1889 if (remaining <= 0) {
1894 if ((T)remaining > chunk) {
1895 limit = init + chunk - 1;
1898 limit = init + remaining - 1;
1905 __kmp_type_convert((
double)remaining * (*(
double *)&pr->u.p.parm3),
1907 UT rem = span % chunk;
1909 span += chunk - rem;
1910 limit = init + span;
1911 if (compare_and_swap<ST>(RCAST(
volatile ST *, &sh->u.s.iteration),
1912 (ST)init, (ST)limit)) {
1924 *p_lb = start + init * incr;
1925 *p_ub = start + limit * incr;
1926 if (pr->flags.ordered) {
1927 pr->u.p.ordered_lower = init;
1928 pr->u.p.ordered_upper = limit;
1939 case kmp_sch_guided_analytical_chunked: {
1940 T chunkspec = pr->u.p.parm1;
1942#if KMP_USE_X87CONTROL
1945 unsigned int oldFpcw;
1946 unsigned int fpcwSet = 0;
1948 KD_TRACE(100, (
"__kmp_dispatch_next_algorithm: T#%d "
1949 "kmp_sch_guided_analytical_chunked case\n",
1954 KMP_DEBUG_ASSERT(nproc > 1);
1955 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1959 chunkIdx = test_then_inc_acq<ST>((
volatile ST *)&sh->u.s.iteration);
1960 if (chunkIdx >= (UT)pr->u.p.parm2) {
1963 init = chunkIdx * chunkspec + pr->u.p.count;
1966 if ((status = (init > 0 && init <= trip)) != 0) {
1967 limit = init + chunkspec - 1;
1969 if ((last = (limit >= trip)) != 0)
1979#if KMP_USE_X87CONTROL
1984 oldFpcw = _control87(0, 0);
1985 _control87(_PC_64, _MCW_PC);
1990 init = __kmp_dispatch_guided_remaining<T>(
1991 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1992 KMP_DEBUG_ASSERT(init);
1996 limit = trip - __kmp_dispatch_guided_remaining<T>(
1997 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1998 KMP_ASSERT(init <= limit);
2000 KMP_DEBUG_ASSERT(limit <= trip);
2007#if KMP_USE_X87CONTROL
2011 if (fpcwSet && (oldFpcw & fpcwSet))
2012 _control87(oldFpcw, _MCW_PC);
2019 *p_lb = start + init * incr;
2020 *p_ub = start + limit * incr;
2021 if (pr->flags.ordered) {
2022 pr->u.p.ordered_lower = init;
2023 pr->u.p.ordered_upper = limit;
2034 case kmp_sch_trapezoidal: {
2036 T parm2 = pr->u.p.parm2;
2037 T parm3 = pr->u.p.parm3;
2038 T parm4 = pr->u.p.parm4;
2040 (
"__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
2043 index = test_then_inc<ST>((
volatile ST *)&sh->u.s.iteration);
2045 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2046 trip = pr->u.p.tc - 1;
2048 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2055 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2058 if ((last = (limit >= trip)) != 0)
2065 *p_lb = start + init;
2066 *p_ub = start + limit;
2068 *p_lb = start + init * incr;
2069 *p_ub = start + limit * incr;
2072 if (pr->flags.ordered) {
2073 pr->u.p.ordered_lower = init;
2074 pr->u.p.ordered_upper = limit;
2081 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected),
2082 KMP_HNT(GetNewerLibrary),
2090 if (pr->flags.ordered) {
2093 buff = __kmp_str_format(
"__kmp_dispatch_next_algorithm: T#%%d "
2094 "ordered_lower:%%%s ordered_upper:%%%s\n",
2095 traits_t<UT>::spec, traits_t<UT>::spec);
2096 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
2097 __kmp_str_free(&buff);
2102 buff = __kmp_str_format(
2103 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
2104 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
2105 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2106 KMP_DEBUG_ASSERT(p_last);
2107 KMP_DEBUG_ASSERT(p_st);
2108 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
2109 __kmp_str_free(&buff);
2118#if OMPT_SUPPORT && OMPT_OPTIONAL
2119#define OMPT_LOOP_END \
2120 if (status == 0) { \
2121 if (ompt_enabled.ompt_callback_work) { \
2122 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
2123 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
2124 ompt_callbacks.ompt_callback(ompt_callback_work)( \
2125 ompt_get_work_schedule(pr->schedule), ompt_scope_end, \
2126 &(team_info->parallel_data), &(task_info->task_data), 0, codeptr); \
2129#define OMPT_LOOP_DISPATCH(lb, ub, st, status) \
2130 if (ompt_enabled.ompt_callback_dispatch && status) { \
2131 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
2132 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
2133 ompt_dispatch_chunk_t chunk; \
2134 ompt_data_t instance = ompt_data_none; \
2135 OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st); \
2136 instance.ptr = &chunk; \
2137 ompt_callbacks.ompt_callback(ompt_callback_dispatch)( \
2138 &(team_info->parallel_data), &(task_info->task_data), \
2139 ompt_dispatch_ws_loop_chunk, instance); \
2143#define OMPT_LOOP_END
2144#define OMPT_LOOP_DISPATCH(lb, ub, st, status)
2147#if KMP_STATS_ENABLED
2148#define KMP_STATS_LOOP_END \
2150 kmp_int64 u, l, t, i; \
2151 l = (kmp_int64)(*p_lb); \
2152 u = (kmp_int64)(*p_ub); \
2153 i = (kmp_int64)(pr->u.p.st); \
2154 if (status == 0) { \
2156 KMP_POP_PARTITIONED_TIMER(); \
2157 } else if (i == 1) { \
2162 } else if (i < 0) { \
2164 t = (l - u) / (-i) + 1; \
2169 t = (u - l) / i + 1; \
2173 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
2176#define KMP_STATS_LOOP_END
2179template <
typename T>
2180static int __kmp_dispatch_next(
ident_t *loc,
int gtid, kmp_int32 *p_last,
2182 typename traits_t<T>::signed_t *p_st
2183#
if OMPT_SUPPORT && OMPT_OPTIONAL
2189 typedef typename traits_t<T>::unsigned_t UT;
2190 typedef typename traits_t<T>::signed_t ST;
2195 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2198 dispatch_private_info_template<T> *pr;
2199 __kmp_assert_valid_gtid(gtid);
2200 kmp_info_t *th = __kmp_threads[gtid];
2201 kmp_team_t *team = th->th.th_team;
2203 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st);
2206 (
"__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2207 gtid, p_lb, p_ub, p_st, p_last));
2209 if (team->t.t_serialized) {
2211 pr =
reinterpret_cast<dispatch_private_info_template<T> *
>(
2212 th->th.th_dispatch->th_disp_buffer);
2213 KMP_DEBUG_ASSERT(pr);
2215 if ((status = (pr->u.p.tc != 0)) == 0) {
2222 if (__kmp_env_consistency_check) {
2223 if (pr->pushed_ws != ct_none) {
2224 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2227 }
else if (pr->flags.nomerge) {
2230 UT limit, trip, init;
2232 T chunk = pr->u.p.parm1;
2234 KD_TRACE(100, (
"__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2237 init = chunk * pr->u.p.count++;
2238 trip = pr->u.p.tc - 1;
2240 if ((status = (init <= trip)) == 0) {
2247 if (__kmp_env_consistency_check) {
2248 if (pr->pushed_ws != ct_none) {
2249 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2254 limit = chunk + init - 1;
2257 if ((last = (limit >= trip)) != 0) {
2260 pr->u.p.last_upper = pr->u.p.ub;
2268 *p_lb = start + init;
2269 *p_ub = start + limit;
2271 *p_lb = start + init * incr;
2272 *p_ub = start + limit * incr;
2275 if (pr->flags.ordered) {
2276 pr->u.p.ordered_lower = init;
2277 pr->u.p.ordered_upper = limit;
2282 buff = __kmp_str_format(
"__kmp_dispatch_next: T#%%d "
2283 "ordered_lower:%%%s ordered_upper:%%%s\n",
2284 traits_t<UT>::spec, traits_t<UT>::spec);
2285 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2286 pr->u.p.ordered_upper));
2287 __kmp_str_free(&buff);
2297 pr->u.p.last_upper = *p_ub;
2308 buff = __kmp_str_format(
2309 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2310 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2311 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2312 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2313 (p_last ? *p_last : 0), status));
2314 __kmp_str_free(&buff);
2317#if INCLUDE_SSC_MARKS
2318 SSC_MARK_DISPATCH_NEXT();
2320 OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2326 dispatch_shared_info_template<T>
volatile *sh;
2328 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2329 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2331 pr =
reinterpret_cast<dispatch_private_info_template<T> *
>(
2332 th->th.th_dispatch->th_dispatch_pr_current);
2333 KMP_DEBUG_ASSERT(pr);
2334 sh =
reinterpret_cast<dispatch_shared_info_template<T>
volatile *
>(
2335 th->th.th_dispatch->th_dispatch_sh_current);
2336 KMP_DEBUG_ASSERT(sh);
2338#if KMP_USE_HIER_SCHED
2339 if (pr->flags.use_hier)
2340 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2343 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2344 p_st, th->th.th_team_nproc,
2345 th->th.th_info.ds.ds_tid);
2349 num_done = test_then_inc<ST>(&sh->u.s.num_done);
2354 buff = __kmp_str_format(
2355 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2356 traits_t<ST>::spec);
2357 KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2358 __kmp_str_free(&buff);
2362#if KMP_USE_HIER_SCHED
2363 pr->flags.use_hier = FALSE;
2365 if (num_done == th->th.th_team_nproc - 1) {
2366#if KMP_STATIC_STEAL_ENABLED
2367 if (pr->schedule == kmp_sch_static_steal) {
2369 int idx = (th->th.th_dispatch->th_disp_index - 1) %
2370 __kmp_dispatch_num_buffers;
2372 for (i = 0; i < th->th.th_team_nproc; ++i) {
2373 dispatch_private_info_template<T> *buf =
2374 reinterpret_cast<dispatch_private_info_template<T> *
>(
2375 &team->t.t_dispatch[i].th_disp_buffer[idx]);
2376 KMP_ASSERT(buf->steal_flag == THIEF);
2377 KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2378 if (traits_t<T>::type_size > 4) {
2380 kmp_lock_t *lck = buf->u.p.steal_lock;
2381 KMP_ASSERT(lck != NULL);
2382 __kmp_destroy_lock(lck);
2384 buf->u.p.steal_lock = NULL;
2393 sh->u.s.num_done = 0;
2394 sh->u.s.iteration = 0;
2397 if (pr->flags.ordered) {
2398 sh->u.s.ordered_iteration = 0;
2403 sh->buffer_index += __kmp_dispatch_num_buffers;
2404 KD_TRACE(100, (
"__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2405 gtid, sh->buffer_index));
2410 if (__kmp_env_consistency_check) {
2411 if (pr->pushed_ws != ct_none) {
2412 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2416 th->th.th_dispatch->th_deo_fcn = NULL;
2417 th->th.th_dispatch->th_dxo_fcn = NULL;
2418 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2419 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2423 pr->u.p.last_upper = pr->u.p.ub;
2426 if (p_last != NULL && status != 0)
2434 buff = __kmp_str_format(
2435 "__kmp_dispatch_next: T#%%d normal case: "
2436 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2437 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2438 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2439 (p_last ? *p_last : 0), status));
2440 __kmp_str_free(&buff);
2443#if INCLUDE_SSC_MARKS
2444 SSC_MARK_DISPATCH_NEXT();
2446 OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2472 kmp_uint32 my_buffer_index;
2473 dispatch_shared_info_template<kmp_int32>
volatile *sh;
2475 KMP_DEBUG_ASSERT(__kmp_init_serial);
2477 if (!TCR_4(__kmp_init_parallel))
2478 __kmp_parallel_initialize();
2479 __kmp_resume_if_soft_paused();
2482 th = __kmp_threads[gtid];
2483 team = th->th.th_team;
2484 active = !team->t.t_serialized;
2485 th->th.th_ident = loc;
2488 KD_TRACE(10, (
"__kmpc_sections: called by T#%d\n", gtid));
2495 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2496 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2498 my_buffer_index = th->th.th_dispatch->th_disp_index++;
2501 sh =
reinterpret_cast<dispatch_shared_info_template<kmp_int32>
volatile *
>(
2502 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
2503 KD_TRACE(10, (
"__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
2506 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
2507 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
2509 KD_TRACE(100, (
"__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
2510 "sh->buffer_index:%d\n",
2511 gtid, my_buffer_index, sh->buffer_index));
2512 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
2513 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
2517 KD_TRACE(100, (
"__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
2518 "sh->buffer_index:%d\n",
2519 gtid, my_buffer_index, sh->buffer_index));
2521 th->th.th_dispatch->th_dispatch_pr_current =
2523 th->th.th_dispatch->th_dispatch_sh_current =
2524 CCAST(dispatch_shared_info_t *, (
volatile dispatch_shared_info_t *)sh);
2527#if OMPT_SUPPORT && OMPT_OPTIONAL
2528 if (ompt_enabled.ompt_callback_work) {
2529 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2530 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2531 ompt_callbacks.ompt_callback(ompt_callback_work)(
2532 ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
2533 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2536 KMP_PUSH_PARTITIONED_TIMER(OMP_sections);
2552 kmp_int32 numberOfSections) {
2554 KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);
2556 kmp_info_t *th = __kmp_threads[gtid];
2558 kmp_team_t *team = th->th.th_team;
2561 KD_TRACE(1000, (
"__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
2565 KMP_DEBUG_ASSERT(!team->t.t_serialized);
2567 dispatch_shared_info_template<kmp_int32>
volatile *sh;
2569 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2570 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2572 KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
2573 sh =
reinterpret_cast<dispatch_shared_info_template<kmp_int32>
volatile *
>(
2574 th->th.th_dispatch->th_dispatch_sh_current);
2575 KMP_DEBUG_ASSERT(sh);
2577 kmp_int32 sectionIndex = 0;
2578 bool moreSectionsToExecute =
true;
2581 sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
2582 if (sectionIndex >= numberOfSections) {
2583 moreSectionsToExecute =
false;
2588 if (!moreSectionsToExecute) {
2591 num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));
2593 if (num_done == th->th.th_team_nproc - 1) {
2598 sh->u.s.num_done = 0;
2599 sh->u.s.iteration = 0;
2603 sh->buffer_index += __kmp_dispatch_num_buffers;
2604 KD_TRACE(100, (
"__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
2611 th->th.th_dispatch->th_deo_fcn = NULL;
2612 th->th.th_dispatch->th_dxo_fcn = NULL;
2613 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2614 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2616#if OMPT_SUPPORT && OMPT_OPTIONAL
2617 if (ompt_enabled.ompt_callback_dispatch) {
2618 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2619 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2620 ompt_data_t instance = ompt_data_none;
2621 instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
2622 ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
2623 &(team_info->parallel_data), &(task_info->task_data),
2624 ompt_dispatch_section, instance);
2629 return sectionIndex;
2642 kmp_info_t *th = __kmp_threads[gtid];
2643 int active = !th->th.th_team->t.t_serialized;
2645 KD_TRACE(100, (
"__kmpc_end_sections: T#%d called\n", gtid));
2649#if OMPT_SUPPORT && OMPT_OPTIONAL
2650 if (ompt_enabled.ompt_callback_work) {
2651 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2652 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2653 ompt_callbacks.ompt_callback(ompt_callback_work)(
2654 ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
2655 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2660 KMP_POP_PARTITIONED_TIMER();
2661 KD_TRACE(100, (
"__kmpc_end_sections: T#%d returned\n", gtid));
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif
  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal; zero-trip loops are normally caught by the
      // compiler, but run-time checks remain for the cases it cannot see.
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  __kmp_assert_valid_gtid(gtid);
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get single iteration, others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper;
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper;
      }
    }
  }
}
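// Note added for clarity: when the trip count exceeds the number of teams, the
// iteration space is split statically across teams, either into near-equal
// blocks (kmp_sch_static_balanced, with the remainder spread one extra
// iteration per low-numbered team) or into ceil(trip_count / nteams) blocks
// (kmp_sch_static_greedy, with the last team's upper bound clamped back to the
// user's bound).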
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}
kmp_uint32 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
                        kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
                        void *obj // Higher-level synchronization object, or NULL
) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;
  kmp_uint64 time;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    // if we are oversubscribed, or have waited a bit, then yield
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
                      void *obj // Higher-level synchronization object, or NULL
) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
  kmp_uint64 time;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    // if we are oversubscribed, or have waited a bit, then yield
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}
#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */