#include "kmp_collapse.h"
#include "ompt-specific.h"

template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }

// Returns -1, 0 or +1 depending on the sign of val:
template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}

// Scratch-array helper: stack storage for typical collapse depths,
// heap storage via __kmp_allocate() when more elements are needed.
template <typename T> class CollapseAllocator {
  typedef T *pT;

private:
  static const size_t allocaSize = 32; // size limit for stack allocations
                                       // (8 bytes x 4 nested loops)
  char stackAlloc[allocaSize];
  static constexpr size_t maxElemCount = allocaSize / sizeof(T);
  pT pTAlloc;

public:
  CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
    if (n > maxElemCount) {
      pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
    }
  }
  ~CollapseAllocator() {
    if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
      __kmp_free(pTAlloc);
    }
  }
  T &operator[](int index) { return pTAlloc[index]; }
  operator const pT() { return pTAlloc; }
};
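
// Usage sketch (illustrative, not part of the runtime): the allocator is a
// drop-in scratch array of size n, e.g.
//   CollapseAllocator<kmp_uint64> ivs(n); // on the stack if n fits, else heap
//   ivs[0] = 0;                           // indexed like a plain array
// The heap branch is taken only when n * sizeof(T) exceeds allocaSize, and
// the destructor frees the heap block automatically.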

//----------Canonicalize loop nest and calculate trip count-------------------

// Canonicalize one loop: convert to a form with only <= or >= comparisons
// (no !=, <, >) and verify that the step is non-zero.
template <typename T>
void kmp_canonicalize_one_loop_XX(
    ident_t *loc,
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (__kmp_env_consistency_check) {
    if (bounds->step == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
  }

  if (bounds->comparison == comparison_t::comp_not_eq) {
    // We can convert this to < or >, depending on the sign of the step:
    if (bounds->step > 0) {
      bounds->comparison = comparison_t::comp_less;
    } else {
      bounds->comparison = comparison_t::comp_greater;
    }
  }

  if (bounds->comparison == comparison_t::comp_less) {
    // iv < ub is equivalent to iv <= ub - 1:
    bounds->ub0 -= 1;
    bounds->comparison = comparison_t::comp_less_or_eq;
  } else if (bounds->comparison == comparison_t::comp_greater) {
    // iv > ub is equivalent to iv >= ub + 1:
    bounds->ub0 += 1;
    bounds->comparison = comparison_t::comp_greater_or_eq;
  }
}

// Canonicalize the whole loop nest, dispatching on each level's loop type:
void kmp_canonicalize_loop_nest(ident_t *loc,
                                /*in/out*/ bounds_info_t *original_bounds_nest,
                                kmp_index_t n) {

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    switch (bounds->loop_type) {
    case loop_type_t::loop_type_int32:
      kmp_canonicalize_one_loop_XX<kmp_int32>(
          loc, /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
      break;
    case loop_type_t::loop_type_uint32:
      kmp_canonicalize_one_loop_XX<kmp_uint32>(
          loc, /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
      break;
    case loop_type_t::loop_type_int64:
      kmp_canonicalize_one_loop_XX<kmp_int64>(
          loc, /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
      break;
    case loop_type_t::loop_type_uint64:
      kmp_canonicalize_one_loop_XX<kmp_uint64>(
          loc, /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
      break;
    default:
      KMP_ASSERT(false);
    }
  }
}

// Calculate the trip count on this loop level:
template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    if (bounds->ub0 < bounds->lb0) {
      // Empty range on this level; inner loops don't matter:
      bounds->trip_count = 0;
    } else {
      // ub - lb can exceed the range of the signed type, so compute it in
      // the unsigned kmp_loop_nest_iv_t:
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
    if (bounds->lb0 < bounds->ub0) {
      // Empty range on this level; inner loops don't matter:
      bounds->trip_count = 0;
    } else {
      // lb - ub can exceed the range of the signed type, so compute it in
      // the unsigned kmp_loop_nest_iv_t:
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else {
    KMP_ASSERT(false);
  }
  return bounds->trip_count;
}
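
// Worked example (illustrative): for a canonicalized loop with lb0 = 0,
// ub0 = 9, step = 3 and a <= comparison, the formula gives
// (9 - 0) / 3 + 1 = 4 trips, matching iv = 0, 3, 6, 9.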

// Calculate the trip count on this loop level, dispatching on the loop type:
kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {
  kmp_loop_nest_iv_t trip_count = 0;

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
    break;
  case loop_type_t::loop_type_uint32:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
    break;
  case loop_type_t::loop_type_int64:
    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
    break;
  case loop_type_t::loop_type_uint64:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
    break;
  default:
    KMP_ASSERT(false);
  }

  return trip_count;
}

//----------Trim original iv according to its type-----------------------------

// The iv is stored in a kmp_uint64; sign- or zero-extend it according to the
// original loop iv type so that later arithmetic and comparisons behave like
// the original type:
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
  kmp_uint64 res = 0;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

// Compare two ivs (stored as kmp_uint64) as values of the original type:
bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
                kmp_uint64 original_iv2) {
  bool res = false;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_int8>(original_iv1) ==
          static_cast<kmp_int8>(original_iv2);
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint8>(original_iv1) ==
          static_cast<kmp_uint8>(original_iv2);
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_int16>(original_iv1) ==
          static_cast<kmp_int16>(original_iv2);
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint16>(original_iv1) ==
          static_cast<kmp_uint16>(original_iv2);
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_int32>(original_iv1) ==
          static_cast<kmp_int32>(original_iv2);
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint32>(original_iv1) ==
          static_cast<kmp_uint32>(original_iv2);
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_int64>(original_iv1) ==
          static_cast<kmp_int64>(original_iv2);
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv1) ==
          static_cast<kmp_uint64>(original_iv2);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Calculating original ivs------------------------------------------

// Check if the point fits into the upper bound on this loop level:
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  T iv = static_cast<T>(original_ivs[ind]);
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // The calculated point is outside of the loop upper boundary:
    return false;
  }

  return true;
}

// Calculate one iv corresponding to the iteration on the level ind.
// Return true if it fits into the upper bound on this level
// (if not, the caller needs to re-calculate):
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
                        /*in/out*/ kmp_point_t original_ivs,
                        const kmp_iterations_t iterations, kmp_index_t ind,
                        bool start_with_lower_bound, bool checkBounds) {

  T temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // We moved to the next iteration on one of the outer loops, so start
    // from the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    auto iteration = iterations[ind];
    temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
  }

  // Now trim the original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (checkBounds) {
    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
  } else {
    return true;
  }
}

bool kmp_calc_one_iv(const bounds_info_t *bounds,
                     /*in/out*/ kmp_point_t original_ivs,
                     const kmp_iterations_t iterations, kmp_index_t ind,
                     bool start_with_lower_bound, bool checkBounds) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
  default:
    KMP_ASSERT(false);
    return false;
  }
}

// Calculate one iv for a rectangular loop nest (no bounds check needed: in a
// rectangular nest the computed point always fits):
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
                                /*in/out*/ kmp_uint64 *original_ivs,
                                const kmp_iterations_t iterations,
                                kmp_index_t ind) {

  auto iteration = iterations[ind];

  T temp = bounds->lb0 +
           bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
           iteration * bounds->step;

  // Now trim the original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
                             /*in/out*/ kmp_uint64 *original_ivs,
                             const kmp_iterations_t iterations,
                             kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_rectang_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_rectang_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}

//----------Rectangular loop nest API------------------------------------------

// Canonicalize the rectangular loop nest and return its overall trip count
// (the product of the per-level trip counts):
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n) {

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
    total *= trip_count;
  }

  return total;
}

// Calculate the original ivs corresponding to the collapsed iteration number
// new_iv (a mixed-radix decomposition over the per-level trip counts):
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                 const bounds_info_t *original_bounds_nest,
                                 /*out*/ kmp_uint64 *original_ivs,
                                 kmp_index_t n) {

  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);

  // First, calculate the iteration number on every level, starting from the
  // innermost loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto bounds = &(original_bounds_nest[ind]);

    // Note: trip counts can exceed the range of signed types, so divide in
    // kmp_loop_nest_iv_t:
    auto temp = new_iv / bounds->trip_count;
    auto iteration = new_iv % bounds->trip_count;

    new_iv = temp;
    iterations[ind] = iteration;
  }

  KMP_ASSERT(new_iv == 0);

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
  }
}
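
// Worked example of the decomposition (illustrative): with two levels whose
// trip counts are {3, 4}, new_iv = 7 splits innermost-first into
// 7 % 4 = 3 on level 1 and (7 / 4) % 3 = 1 on level 0, i.e. the point
// {1, 3}; each level's iv is then lb + iteration * step.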

//----------Calculate the span of the IV on one loop level---------------------

// Calculate the smallest and biggest values the IV can take on this loop
// level for the "<=" case (bounds may depend on the outer IV):
template <typename T>
void kmp_calc_span_lessoreq_XX(
    /*in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on the outer dimension:
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to the step, because
      // the lower bound might be off after adjustments:
      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    // Rectangular on this level:
    bounds->span_smallest = bbounds.lb0;
    bounds->span_biggest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the span to a multiple of the step:
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step;
  }
}

// Same as above for the ">=" case (the loop runs downwards, so the roles of
// the bounds are mirrored):
template <typename T>
void kmp_calc_span_greateroreq_XX(
    /*in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on the outer dimension:
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to the step, because
      // the lower bound might be off after adjustments:
      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    // Rectangular on this level (the loop runs from lb0 down to ub0):
    bounds->span_biggest = bbounds.lb0;
    bounds->span_smallest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the span to a multiple of the step:
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step;
  }
}

// Calculate the maximum possible span for the IV on this loop level:
template <typename T>
void kmp_calc_span_XX(
    /*in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
    kmp_calc_span_lessoreq_XX(/*in/out*/ bounds, /*in/out*/ bounds_nest);
  } else {
    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
    kmp_calc_span_greateroreq_XX(/*in/out*/ bounds, /*in/out*/ bounds_nest);
  }
}

//----------All initial processing of the loop nest----------------------------

// Calculate new bounds for this loop level: to work with the nest as a whole
// we need to bring it to a parallelepiped shape, staying within the original
// range of values so that nothing overflows:
template <typename T>
void kmp_calc_new_bounds_XX(
    /*in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  auto &bbounds = bounds->b;

  if (bbounds.lb1 == bbounds.ub1) {
    // Already parallel, no need to adjust:
    bounds->loop_bounds_adjusted = false;
  } else {
    bounds->loop_bounds_adjusted = true;

    T old_lb1 = bbounds.lb1;
    T old_ub1 = bbounds.ub1;

    if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
      // With this shape we can adjust to a rectangle:
      bbounds.lb1 = 0;
      bbounds.ub1 = 0;
    } else {
      // We can't adjust it to a rectangle, so make it a parallelogram,
      // picking the less steep of the two slopes:
      if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
          ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
        bbounds.lb1 = old_ub1;
      } else {
        bbounds.ub1 = old_lb1;
      }
    }

    // Need to adjust lb0 and ub0 as well, otherwise in some cases the space
    // will shrink. The idea: for this IV we should now get the same span
    // irrespective of the outer IV value:
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &bounds_nest[bbounds.outer_iv]);

    if (bbounds.comparison == comparison_t::comp_less_or_eq) {
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        // The length is good on the left side, adjust by the difference:
        T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
        bbounds.lb0 -= sub;
      } else if (old_lb1 > bbounds.lb1) {
        // Still need to move the lower bound:
        T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
        bbounds.ub0 -= sub;
      }
    } else {
      KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
        bbounds.lb0 -= sub;
      } else if (old_lb1 > bbounds.lb1) {
        T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
        bbounds.ub0 -= sub;
      }
    }
  }
}

// Do all processing for one canonicalized loop in the nest
// (assuming that the outer loops were already processed):
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
    /*in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  kmp_calc_new_bounds_XX(/*in/out*/ bounds, /*in/out*/ bounds_nest);
  kmp_calc_span_XX(/*in/out*/ bounds, /*in/out*/ bounds_nest);
  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
}

// Process the whole loop nest (already canonicalized): adjust each level to
// the parallelepiped shape and return the overall trip count:
kmp_loop_nest_iv_t
kmp_process_loop_nest(/*in/out*/ bounds_info_internal_t *bounds_nest,
                      kmp_index_t n) {

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind]);
    kmp_loop_nest_iv_t trip_count = 0;

    switch (bounds->b.loop_type) {
    case loop_type_t::loop_type_int32:
      trip_count = kmp_process_one_loop_XX<kmp_int32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint32:
      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_int64:
      trip_count = kmp_process_one_loop_XX<kmp_int64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint64:
      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    default:
      KMP_ASSERT(false);
    }
    total *= trip_count;
  }

  return total;
}

//----------Mapping between the collapsed IV and the original IVs--------------

// Calculate the number of iterations on this level that precede
// original_ivs[ind] (non-negative, not counting the initial iteration):
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  kmp_loop_nest_iv_t iterations = 0;

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    iterations =
        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
        __kmp_abs(bounds->step);
  } else {
    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
    iterations =
        (bounds->lb0 +
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
         static_cast<T>(original_ivs[ind])) /
        __kmp_abs(bounds->step);
  }

  return iterations;
}
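
// Example (illustrative): for a <= loop with lb0 = 0, lb1 = 0, step = 3 and
// the point iv = 6, the expression gives (6 - 0) / 3 = 2, i.e. two
// iterations precede iv = 6 (namely iv = 0 and iv = 3).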

kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
                                                 const kmp_point_t original_ivs,
                                                 kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_number_of_iterations_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
  case loop_type_t::loop_type_uint32:
    return kmp_calc_number_of_iterations_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
  case loop_type_t::loop_type_int64:
    return kmp_calc_number_of_iterations_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
  case loop_type_t::loop_type_uint64:
    return kmp_calc_number_of_iterations_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
  default:
    KMP_ASSERT(false);
    return 0;
  }
}

// Calculate the collapsed iteration number corresponding to a point in the
// original (or updated) loop space (the inverse of the decomposition above,
// accumulated Horner-style over the per-level trip counts):
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}
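
// Continuing the earlier example (illustrative): with trip counts {3, 4},
// the point {1, 3} folds back as (0 * 3 + 1) * 4 + 3 = 7 = new_iv, so the
// two mappings are inverses of each other.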

// Calculate the original ivs for the provided iterations (iterations are in
// the original or updated space). The loop nest is in canonical form:
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        // The space is empty:
        return false;
      } else {
        // Go to the next iteration on the outer loop and restart the inner
        // levels from their lower bounds:
        --ind;
        ++iterations[ind];
        lengthened_ind = ind;
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }
    ++ind;
  }

  return true;
}

// Calculate the original ivs for the beginning of the loop nest.
// Returns false if the space is empty:
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iteration numbers on each level, all starting at zero:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point itself:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  return b;
}

// Calculate the next point in the original loop space, i.e. the point that
// immediately follows original_ivs. Returns false if there is none:
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iteration numbers in the original space:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calculate the corresponding iteration on every level:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Advance one iteration on the innermost level; the helper walks up the
  // nest if that overflows the level:
  kmp_index_t ind = n - 1;
  ++iterations[ind];

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  return b;
}

//----------Calculate the chunk end in the original loop space-----------------

// Calculate one iv corresponding to the iteration on the level ind, for the
// end of a chunk. Returns true if it fits into the upper bound on this level
// (if not, the caller needs to re-calculate). Note: the nest is mostly
// traversed in the updated bounds, but the ind dimension uses the original
// bounds, because the chunk end must lie in the original space:
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  T temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // We moved to the next iteration on one of the outer loops, so we may as
    // well use the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in the expanded space, but:
    // - account for the offset between the original and updated lower
    //   bounds modulo step, so the original lower bound is hit exactly;
    // - stay in the original space, even if that means adding more
    //   iterations than planned;
    // - go past (or be equal to) the previous point, which is the chunk
    //   starting point.
    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In case of >= the sign of the step inverts the correction:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big for >=): we didn't reach the original lower
      // bound, so fall back to a heuristic:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {
      T start = static_cast<T>(original_ivs_start[ind]);

      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels the start of the chunk is the same as its
      // end, so we need to be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // The end of the chunk can't be smaller (bigger for >=) than its
        // start. Fall back to a heuristic:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=):
    return false;
  }

  return true;
}

bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind,
                                   bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
  default:
    KMP_ASSERT(false);
    return false;
  }
}

// Calculate the original ivs for the chunk end given the collapsed iteration
// number new_iv in the updated space. Returns false if the point lies
// outside the loop nest:
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iteration numbers in the expanded space:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calculate the corresponding iteration in every updated loop,
  // starting from the innermost level:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // Should be optimized by the compiler to a single division:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }

  KMP_DEBUG_ASSERT(new_iv == 0);

  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next calculate the point, but in the original loop nest:
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=):
      if (ind == 0) {
        // The point is outside the loop nest:
        return false;
      } else {
        // Go to the next iteration on the outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here, so the point can't
          // coincide with the chunk start on this level anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  return true;
}
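
// Note on the two trackers used above (as this editor reads the code):
// lengthened_ind remembers the outermost level whose iteration count was
// bumped, so levels inside it restart from their lower bounds; equal_ind
// tracks the deepest prefix on which the end point still coincides with the
// chunk start, which is what the compare_with_start flag keys off.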

//----------Calculate upper bounds of the loop nest----------------------------

// Calculate the last point on this loop level (its upper bound, which may
// depend on the outer iv):
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
                            /*in/out*/ kmp_point_t original_ivs,
                            kmp_index_t ind) {

  T temp = bounds->ub0 +
           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);

  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_end(const bounds_info_t *bounds,
                         /*in/out*/ kmp_point_t original_ivs,
                         kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}

// Calculate the last point of the whole loop nest, level by level:
void kmp_calc_original_ivs_for_end(
    const bounds_info_t *const original_bounds_nest, kmp_index_t n,
    /*out*/ kmp_point_t original_ivs) {
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
  }
}

/**************************************************************************
 * Identify the structure of a 2-level nested loop (loops arrive here in
 * canonical <= form):
 * Lower triangular matrix: i = 0; i <= N; i++        {0,0}:{N,0}
 *                          j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
 * Upper triangular matrix: i = 0;     i <= N; i++    {0,0}:{N,0}
 *                          j = 0+1*i; j <= N; j++    {0,1}:{N,0}
 **************************************************************************/
nested_loop_type_t
kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
                                   /*in*/ kmp_index_t n) {
  // Only 2-level nested loops are supported:
  if (n != 2) {
    return nested_loop_type_unkown;
  }
  // The loops must be canonical:
  KMP_ASSERT(
      (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
      (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
  // Check the outer loop bounds: for triangular loops they need to be
  // {0,0}:{N,0}:
  kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].lb0_u64);
  kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].ub0_u64);
  kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].lb1_u64);
  kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].ub1_u64);
  if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
    return nested_loop_type_unkown;
  }
  // Check the inner loop bounds:
  kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].lb0_u64);
  kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].ub0_u64);
  kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].lb1_u64);
  kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].ub1_u64);
  // Lower triangular: inner bounds are {0,0}:{0/-1,1}:
  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
      (inner_ub0_u64 == 0 || inner_ub0_u64 == (kmp_uint64)(-1)) &&
      inner_ub1_u64 == 1) {
    return nested_loop_type_lower_triangular_matrix;
  }
  // Upper triangular: inner bounds are {0,1}:{N,0}:
  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
      inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
    return nested_loop_type_upper_triangular_matrix;
  }
  // Not a recognized special case:
  return nested_loop_type_unkown;
}

// Approximate sqrt(x) by Newton's method:
#define level_of_precision 0.1
double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
  double sqrt_old = 0.;
  double sqrt_new = (double)x;
  do {
    sqrt_old = sqrt_new;
    sqrt_new = (sqrt_old + x / sqrt_old) / 2;
  } while ((sqrt_old - sqrt_new) > level_of_precision);
  return sqrt_new;
}
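
// This is the standard Newton step for f(s) = s*s - x, i.e. the recurrence
//   s_{k+1} = s_k - f(s_k)/f'(s_k) = (s_k + x/s_k) / 2,
// started from s_0 = x and stopped once successive estimates differ by less
// than level_of_precision. For x >= 1 the estimates decrease monotonically
// toward sqrt(x), so the loop terminates; the callers below always pass
// x >= 1 (x = 0 would compute 0./0. on the first step).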

/**************************************************************************
 * Handle a lower triangular matrix in the canonical form:
 * i = 0; i <= N; i++          {0,0}:{N,0}
 * j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
 **************************************************************************/
void kmp_handle_lower_triangle_matrix(
    /*in*/ kmp_uint32 nth, /*in*/ kmp_uint32 tid, /*in*/ kmp_index_t n,
    /*in*/ bounds_info_t *original_bounds_nest,
    /*out*/ bounds_info_t *chunk_bounds_nest) {

  // Transfer loop types etc. from the original bounds:
  for (kmp_index_t i = 0; i < n; ++i) {
    chunk_bounds_nest[i] = original_bounds_nest[i];
  }

  // Cleaned-up iv values:
  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].ub0_u64);
  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].lb0_u64);
  kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                    original_bounds_nest[1].ub0_u64);

  // The total number of iterations is the sum of the arithmetic progression
  // over the outer iterations (bounds are inclusive since the loop is
  // canonical); a less-than inner loop (inner_ub0 = -1) shifts the
  // progression down by one:
  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;

  // Each thread gets iter_total / nth iterations; the first iter_total % nth
  // threads get one extra iteration each:
  kmp_uint64 iter_current =
      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);

  // Cumulative number of iterations executed by all previous threads:
  // threads below the remainder execute iter_current iterations each, so the
  // cumulative count is tid * iter_current; threads at or above the
  // remainder additionally account for the one extra iteration done by each
  // of the remainder threads:
  kmp_uint64 iter_before_current =
      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));

  // Cumulative number of iterations up to and including the current thread:
  kmp_uint64 iter_with_current = iter_before_current + iter_current;

  // Invert the triangular-number formula to find the outer iv at which the
  // cumulative iteration count reaches iter_before_current (derivation
  // sketched below):
  kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
  kmp_uint64 lower_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
                                      8 * iter_before_current) +
                   inner_adjustment) /
          2 -
      inner_adjustment;
  // The inner lower bound is the remaining number of iterations needed to
  // reach iter_before_current, i.e. the starting point of this thread:
  kmp_uint64 lower_bound_inner =
      iter_before_current -
      ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
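
  // Derivation sketch (illustrative): with adj = inner_adjustment, the
  // cumulative iteration count before outer row o is
  //   C(o) = (o + adj) * o / 2,
  // so solving C(o) <= iters for the largest o via the quadratic formula
  // gives o = (sqrt(adj^2 + 8 * iters) + adj) / 2 - adj, which is exactly
  // the expression above; the inner bound is then the remainder
  // iters - C(o).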

  // The outer upper bound uses the same inversion with the cumulative count
  // that includes the current thread:
  kmp_uint64 upper_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
                                      8 * iter_with_current) +
                   inner_adjustment) /
          2 -
      inner_adjustment;
  // The inner upper bound is the remaining number of iterations needed to
  // reach iter_with_current, i.e. the ending point of this thread:
  kmp_uint64 upper_bound_inner =
      iter_with_current -
      ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;

  // Adjust the upper bounds down by one iteration so they point at the last
  // iteration of the current thread rather than the first of the next one:
  if (upper_bound_inner == 0) {
    // The chunk ends at the last iteration of the previous outer row:
    upper_bound_outer -= 1;
    upper_bound_inner = upper_bound_outer;
  } else {
    // Step one iteration back within the row:
    upper_bound_inner -= 1;
  }

  // Assign the values, zeroing out lb1 and ub1 since the chunk bounds no
  // longer depend on the outer iv:
  chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
  chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
  chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
  chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
  chunk_bounds_nest[0].lb1_u64 = 0;
  chunk_bounds_nest[0].ub1_u64 = 0;
  chunk_bounds_nest[1].lb1_u64 = 0;
  chunk_bounds_nest[1].ub1_u64 = 0;

#if 0
  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks "
         "%llu/%llu\n",
         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64,
         iter_current, iter_total);
#endif
}
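
// Worked example (illustrative): for i = 0..3, j = 0..i (inner_ub0 = 0,
// inner_adjustment = 1) there are 10 iterations. With nth = 2 each thread
// gets 5: tid 0 receives {0,0}..{2,1} (rows 0 and 1 plus the first two
// elements of row 2) and tid 1 receives {2,2}..{3,3} (the rest of row 2
// plus row 3).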

/**************************************************************************
 * Handle an upper triangular matrix in the canonical form:
 * i = 0;     i <= N; i++ {0,0}:{N,0}
 * j = 0+1*i; j <= N; j++ {0,1}:{N,0}
 **************************************************************************/
void kmp_handle_upper_triangle_matrix(
    /*in*/ kmp_uint32 nth, /*in*/ kmp_uint32 tid, /*in*/ kmp_index_t n,
    /*in*/ bounds_info_t *original_bounds_nest,
    /*out*/ bounds_info_t *chunk_bounds_nest) {

  // Transfer loop types etc. from the original bounds:
  for (kmp_index_t i = 0; i < n; ++i) {
    chunk_bounds_nest[i] = original_bounds_nest[i];
  }

  // Cleaned-up iv values:
  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].ub0_u64);
  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].lb0_u64);
  [[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv(
      original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);

  // The total number of iterations is again the sum of an arithmetic
  // progression over the outer iterations:
  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;

  // Each thread gets iter_total / nth iterations; the first iter_total % nth
  // threads get one extra iteration each:
  kmp_uint64 iter_current =
      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);

  // Cumulative number of iterations executed by all previous threads (same
  // reasoning as in the lower triangular case):
  kmp_uint64 iter_before_current =
      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));

  // Cumulative number of iterations up to and including the current thread:
  kmp_uint64 iter_with_current = iter_before_current + iter_current;

  // The chunk bounds are computed as if this were a lower triangular matrix
  // traversed in reverse (inner_adjustment = 1), and mirrored back at the
  // end:
  kmp_uint64 lower_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
  // The inner lower bound is the remaining number of iterations needed to
  // reach iter_before_current:
  kmp_uint64 lower_bound_inner =
      iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
  // The outer upper bound uses the cumulative count that includes the
  // current thread:
  kmp_uint64 upper_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
  // The inner upper bound is the remaining number of iterations needed to
  // reach iter_with_current:
  kmp_uint64 upper_bound_inner =
      iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;

  // Adjust the upper bounds down by one iteration so they point at the last
  // iteration of the current thread rather than the first of the next one:
  if (upper_bound_inner == 0) {
    // The chunk ends at the last iteration of the previous outer row:
    upper_bound_outer -= 1;
    upper_bound_inner = upper_bound_outer;
  } else {
    // Step one iteration back within the row:
    upper_bound_inner -= 1;
  }

  // Assign the values, mirroring the reversed traversal back into the
  // original orientation, and zero out lb1 and ub1 since the chunk bounds
  // no longer depend on the outer iv:
  chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
  chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
  chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
  chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
  chunk_bounds_nest[0].lb1_u64 = 0;
  chunk_bounds_nest[0].ub1_u64 = 0;
  chunk_bounds_nest[1].lb1_u64 = 0;
  chunk_bounds_nest[1].ub1_u64 = 0;

#if 0
  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks "
         "%llu/%llu\n",
         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64,
         iter_current, iter_total);
#endif
}

//----------Init API for non-rectangular loops---------------------------------

// Init API for collapsed loops (static scheduling, no chunks defined).
// "original_bounds_nest" is canonicalized in place (only <= and >= remain;
// trip counts are filled in). Internally the nest is expanded to a
// parallelepiped, the total is calculated, chunk bounds are computed in
// terms of the new IV and then re-calculated in terms of the original IVs,
// and the chunk for this thread is picked (chunks are computed up to the
// needed one). Returns TRUE if the thread got a chunk:
extern "C" kmp_int32
__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                          /*in/out*/ bounds_info_t *original_bounds_nest,
                          /*out*/ bounds_info_t *chunk_bounds_nest,
                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {

  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));

  if (__kmp_env_consistency_check) {
    __kmp_push_workshare(gtid, ct_pdo, loc);
  }

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);

  for (kmp_index_t i = 0; i < n; ++i) {
    updated_bounds_nest[i].b = original_bounds_nest[i]; // bulk copy
  }

  kmp_loop_nest_iv_t total =
      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);

  if (plastiter != NULL) {
    *plastiter = FALSE;
  }

  if (total == 0) {
    // The loop won't execute:
    return FALSE;
  }

  __kmp_assert_valid_gtid(gtid);
  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 nth = team->t.t_nproc; // Number of threads

  KMP_DEBUG_ASSERT(tid < nth);

  // Handle special cases with a closed-form chunking:
  nested_loop_type_t loop_type =
      kmp_identify_nested_loop_structure(original_bounds_nest, n);
  if (loop_type == nested_loop_type_lower_triangular_matrix) {
    kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
                                     chunk_bounds_nest);
    return TRUE;
  } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
    kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
                                     chunk_bounds_nest);
    return TRUE;
  }

  CollapseAllocator<kmp_uint64> original_ivs_start(n);

  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
                                       /*out*/ original_ivs_start)) {
    // The loop won't execute:
    return FALSE;
  }

  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
      updated_bounds_nest, original_ivs_start, n);

  bool last_iter = false;

  // Walk the chunks one by one: we could calculate the chunk size once, but
  // recomputing it per chunk compensates for the original space not being a
  // parallelepiped, which could otherwise leave some threads without work:
  for (;;) {
    KMP_DEBUG_ASSERT(total >= new_iv);

    kmp_loop_nest_iv_t total_left = total - new_iv;
    kmp_loop_nest_iv_t chunk_size = total_left / nth;
    kmp_loop_nest_iv_t remainder = total_left % nth;

    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;

    if (remainder > 0) {
      ++curr_chunk_size;
      --remainder;
    }
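
    // This is the usual balanced split of total_left over the remaining nth
    // threads: whenever total_left doesn't divide evenly, the chunk at the
    // front absorbs one extra iteration, so chunk sizes across threads
    // differ by at most one.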

#if defined(KMP_DEBUG)
    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
#endif

    if (curr_chunk_size > 1) {
      new_iv += curr_chunk_size - 1;
    }

    CollapseAllocator<kmp_uint64> original_ivs_end(n);
    if ((nth == 1) || (new_iv >= total - 1)) {
      // Process this one to the end - just in case we miscalculated and
      // either too much is left to process or new_iv is a bit too big:
      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                    /*out*/ original_ivs_end);

      last_iter = true;
    } else {
      // Note: here we make sure the end is past (or equal to) the previous
      // point:
      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
                                               updated_bounds_nest,
                                               original_ivs_start, new_iv,
                                               /*out*/ original_ivs_end)) {
        // Couldn't find the ending point; use the original upper bounds:
        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                      /*out*/ original_ivs_end);

        last_iter = true;
      }
    }

#if defined(KMP_DEBUG)
    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
        updated_bounds_nest, original_ivs_end, n);
    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
#endif

    if (last_iter && (tid != 0)) {
      // We are done: this was the last chunk, and it doesn't belong to the
      // current thread:
      return FALSE;
    }

    if (tid == 0) {
      // We found the chunk for this thread; check whether it is also the
      // last chunk:
      CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
      if (last_iter ||
          !kmp_calc_next_original_ivs(original_bounds_nest, n,
                                      original_ivs_end,
                                      /*out*/ original_ivs_next_start)) {
        // No more loop iterations left to process, so the chunk we just
        // found is the last chunk:
        if (plastiter != NULL) {
          *plastiter = TRUE;
        }
      }

      // Fill in the chunk bounds (the chunk runs from the start point to the
      // end point in the original space; lb1/ub1 are zeroed):
      for (kmp_index_t i = 0; i < n; ++i) {
        chunk_bounds_nest[i] =
            original_bounds_nest[i]; // To fill in types, etc.
        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
        chunk_bounds_nest[i].lb1_u64 = 0;

        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
        chunk_bounds_nest[i].ub1_u64 = 0;
      }

      return TRUE;
    }

    // Move on to the next chunk, on behalf of the next thread:
    --tid;
    --nth;

    bool next_chunk = kmp_calc_next_original_ivs(
        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
    if (!next_chunk) {
      // No more loop iterations to process: the previous chunk was the last
      // one:
      return FALSE;
    }

    // original_ivs_start now immediately follows the previous chunk's
    // original_ivs_end, so chunks follow one another without gaps or
    // overlaps:
    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
                                               original_ivs_start, n);
  }
}