#if defined(_WIN32) || defined(WIN32)
#define LIBDIVIDE_WINDOWS 1
#endif

#if defined(_MSC_VER)
#define LIBDIVIDE_VC 1
#endif

/* Needed for the assertion macro, abort(), and exit() used below. */
#include <stdio.h>
#include <stdlib.h>

#if ! LIBDIVIDE_HAS_STDINT_TYPES && ! LIBDIVIDE_VC
#include <stdint.h>
#define LIBDIVIDE_HAS_STDINT_TYPES 1
#endif

#if ! LIBDIVIDE_HAS_STDINT_TYPES
/* Fall back to the MSVC built-in integer types when <stdint.h> is unavailable. */
typedef __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef __int64 int64_t;
typedef unsigned __int64 uint64_t;
typedef __int8 int8_t;
typedef unsigned __int8 uint8_t;
#endif
#if LIBDIVIDE_USE_SSE2
#include <emmintrin.h>
#endif

#ifndef __has_builtin
#define __has_builtin(x) 0  // Compatibility with non-clang compilers.
#endif

#ifdef __ICC
#define HAS_INT128_T 0
#else
#define HAS_INT128_T __LP64__
#endif

#if defined(__x86_64__) || defined(_WIN64) || defined(_M_X64)
#define LIBDIVIDE_IS_X86_64 1
#endif

#if defined(__i386__)
#define LIBDIVIDE_IS_i386 1
#endif

#if __GNUC__ || __clang__
#define LIBDIVIDE_GCC_STYLE_ASM 1
#endif

#ifdef LIBDIVIDE_USE_SSE4_1
#include <smmintrin.h>
#endif

/* Flag and mask bits packed into the "more" byte of the divider structs. */
enum {
    LIBDIVIDE_32_SHIFT_MASK = 0x1F,
    LIBDIVIDE_64_SHIFT_MASK = 0x3F,
    LIBDIVIDE_ADD_MARKER = 0x40,
    LIBDIVIDE_U32_SHIFT_PATH = 0x80,
    LIBDIVIDE_U64_SHIFT_PATH = 0x80,
    LIBDIVIDE_S32_SHIFT_PATH = 0x20,
    LIBDIVIDE_NEGATIVE_DIVISOR = 0x80
};
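
/* Divider structs used throughout this header: each packs the precomputed
 * multiplicative "magic" constant with a "more" byte holding the shift amount
 * plus the flag bits defined above.  (Field layout reconstructed from how the
 * fields are accessed below; treat the exact declarations as assumed.) */
struct libdivide_u32_t {
    uint32_t magic;
    uint8_t more;
};

struct libdivide_s32_t {
    int32_t magic;
    uint8_t more;
};

struct libdivide_u64_t {
    uint64_t magic;
    uint8_t more;
};

struct libdivide_s64_t {
    int64_t magic;
    uint8_t more;
};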
#ifndef LIBDIVIDE_API
#ifdef LIBDIVIDE_HEADER_ONLY
#define LIBDIVIDE_API
#else
#define LIBDIVIDE_API static
#endif
#endif

/* Precompute a divider for a runtime divisor (d must not be 0). */
LIBDIVIDE_API struct libdivide_u32_t libdivide_u32_gen(uint32_t d);
LIBDIVIDE_API struct libdivide_s32_t libdivide_s32_gen(int32_t d);
LIBDIVIDE_API struct libdivide_u64_t libdivide_u64_gen(uint64_t d);
LIBDIVIDE_API struct libdivide_s64_t libdivide_s64_gen(int64_t d);

/* Divide a numerator by a precomputed divider. */
LIBDIVIDE_API int32_t  libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom);
LIBDIVIDE_API int64_t  libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom);
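
/* Example (illustrative sketch, not part of the original header): replace a
 * repeated division by a runtime divisor with one precomputation plus cheap
 * multiply/shift divisions.  `data`, `count`, and `d` are hypothetical.
 *
 *     struct libdivide_u32_t fast_d = libdivide_u32_gen(d);      // d != 0
 *     for (size_t i = 0; i < count; i++)
 *         data[i] = libdivide_u32_do(data[i], &fast_d);          // same as data[i] / d
 */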
LIBDIVIDE_API int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom);
LIBDIVIDE_API int libdivide_u64_get_algorithm(const struct libdivide_u64_t *denom);
LIBDIVIDE_API uint64_t libdivide_u64_do_alg0(uint64_t numer, const struct libdivide_u64_t *denom);
LIBDIVIDE_API uint64_t libdivide_u64_do_alg1(uint64_t numer, const struct libdivide_u64_t *denom);
LIBDIVIDE_API uint64_t libdivide_u64_do_alg2(uint64_t numer, const struct libdivide_u64_t *denom);
LIBDIVIDE_API int libdivide_s32_get_algorithm(const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg0(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg1(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg2(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg3(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg4(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API int libdivide_s64_get_algorithm(const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg0(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg1(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom);
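
/* Example (illustrative sketch): "unswitching" by dispatching once on the
 * algorithm index and then running a loop that calls the matching specialized
 * routine; the u32 family is shown, the other families are analogous.
 *
 *     struct libdivide_u32_t fast_d = libdivide_u32_gen(d);
 *     switch (libdivide_u32_get_algorithm(&fast_d)) {
 *     case 0: for (i = 0; i < n; i++) out[i] = libdivide_u32_do_alg0(in[i], &fast_d); break;
 *     case 1: for (i = 0; i < n; i++) out[i] = libdivide_u32_do_alg1(in[i], &fast_d); break;
 *     case 2: for (i = 0; i < n; i++) out[i] = libdivide_u32_do_alg2(in[i], &fast_d); break;
 *     }
 */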
#if LIBDIVIDE_USE_SSE2
LIBDIVIDE_API __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom);
LIBDIVIDE_API __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom);

LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom);
LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom);
LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom);

LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom);

LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t *denom);
LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t *denom);
LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom);

LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t *denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t *denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t *denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t *denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t *denom);
#endif
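
/* Example (illustrative sketch, SSE2 builds only): divide four packed uint32
 * lanes by the same runtime divisor.  `src` and `dst` are hypothetical.
 *
 *     struct libdivide_u32_t fast_d = libdivide_u32_gen(d);
 *     __m128i lanes = _mm_loadu_si128((const __m128i *)src);     // 4 x uint32_t
 *     __m128i quots = libdivide_u32_do_vector(lanes, &fast_d);   // each lane / d
 *     _mm_storeu_si128((__m128i *)dst, quots);
 */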
static inline uint32_t libdivide__mullhi_u32(uint32_t x, uint32_t y) {
    uint64_t xl = x, yl = y;
    uint64_t rl = xl * yl;
    return (uint32_t)(rl >> 32);
}
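
/* For instance (illustrative), libdivide__mullhi_u32(0x80000000u, 6) == 3:
 * the full 64-bit product is 0x300000000 and only the high 32 bits are kept. */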
static inline uint64_t libdivide__mullhi_u64(uint64_t x, uint64_t y) {
#if HAS_INT128_T
    __uint128_t xl = x, yl = y;
    __uint128_t rl = xl * yl;
    return (uint64_t)(rl >> 64);
#else
    /* Full 64x64->128 high multiply built from 32-bit pieces. */
    const uint32_t mask = 0xFFFFFFFF;
    const uint32_t x0 = (uint32_t)(x & mask), x1 = (uint32_t)(x >> 32);
    const uint32_t y0 = (uint32_t)(y & mask), y1 = (uint32_t)(y >> 32);
    const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0);
    const uint64_t x0y1 = x0 * (uint64_t)y1;
    const uint64_t x1y0 = x1 * (uint64_t)y0;
    const uint64_t x1y1 = x1 * (uint64_t)y1;

    uint64_t temp = x1y0 + x0y0_hi;
    uint64_t temp_lo = temp & mask, temp_hi = temp >> 32;
    return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);
#endif
}
static inline int64_t libdivide__mullhi_s64(int64_t x, int64_t y) {
#if HAS_INT128_T
    __int128_t xl = x, yl = y;
    __int128_t rl = xl * yl;
    return (int64_t)(rl >> 64);
#else
    /* Signed 64x64->128 high multiply built from 32-bit pieces. */
    const uint32_t mask = 0xFFFFFFFF;
    const uint32_t x0 = (uint32_t)(x & mask), y0 = (uint32_t)(y & mask);
    const int32_t x1 = (int32_t)(x >> 32), y1 = (int32_t)(y >> 32);
    const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0);
    const int64_t t = x1 * (int64_t)y0 + x0y0_hi;
    const int64_t w1 = x0 * (int64_t)y1 + (t & mask);
    return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32);
#endif
}
#if LIBDIVIDE_USE_SSE2

static inline __m128i libdivide__u64_to_m128(uint64_t x) {
#if LIBDIVIDE_VC
    /* Some compilers lack _mm_set1_epi64x; build the vector through an aligned array. */
    __declspec(align(16)) uint64_t temp[2] = {x, x};
    return _mm_load_si128((const __m128i *)temp);
#elif defined(__ICC)
    uint64_t __attribute__((aligned(16))) temp[2] = {x, x};
    return _mm_load_si128((const __m128i *)temp);
#elif defined(__clang__)
    return (__m128i){x, x};
#else
    return _mm_set1_epi64x(x);
#endif
}
static inline __m128i libdivide_get_FFFFFFFF00000000(void) {
    /* Mask with the high 32 bits of each 64-bit lane set. */
    __m128i result = _mm_set1_epi8(-1);
    return _mm_slli_epi64(result, 32);
}

static inline __m128i libdivide_get_00000000FFFFFFFF(void) {
    /* Mask with the low 32 bits of each 64-bit lane set. */
    __m128i result = _mm_set1_epi8(-1);
    result = _mm_srli_epi64(result, 32);
    return result;
}

static inline __m128i libdivide_get_0000FFFF(void) {
    /* Mask with the low 16 bits of each 32-bit lane set; comparing the
       (uninitialized) register with itself is the usual all-ones idiom. */
    __m128i result;
    result = _mm_cmpeq_epi8(result, result);
    result = _mm_srli_epi32(result, 16);
    return result;
}

static inline __m128i libdivide_s64_signbits(__m128i v) {
    /* Broadcast the sign bit of each 64-bit lane across that lane. */
    __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
    __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31);
    return signBits;
}

static inline __m128i libdivide_u32_to_m128i(uint32_t amt) {
    return _mm_set_epi32(0, 0, 0, amt);
}

/* Arithmetic right shift of each 64-bit lane (SSE2 has no psraq instruction). */
static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) {
    const int b = 64 - amt;
    __m128i m = libdivide__u64_to_m128(1ULL << (b - 1));
    __m128i x = _mm_srl_epi64(v, libdivide_u32_to_m128i(amt));
    __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m);
    return result;
}
static inline __m128i libdivide__mullhi_u32_flat_vector(__m128i a, __m128i b) {
    __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32);
    __m128i a1X3X = _mm_srli_epi64(a, 32);
    __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), libdivide_get_FFFFFFFF00000000());
    return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3);
}
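
/* Note on the lane shuffle above (editorial): _mm_mul_epu32 multiplies only the
 * even 32-bit lanes (0 and 2), producing two 64-bit products.  The first
 * product therefore yields the high halves for lanes 0 and 2; shifting `a`
 * right by 32 moves lanes 1 and 3 into the even positions so a second pmuludq
 * can produce their high halves, which are masked into the odd positions and
 * OR-ed back in.  This relies on `b` holding the same multiplier in every
 * lane, as it does at the call sites in this header. */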
/* Here, y is assumed to contain one 64-bit value repeated in both lanes. */
static inline __m128i libdivide_mullhi_u64_flat_vector(__m128i x, __m128i y) {
    const __m128i mask = libdivide_get_00000000FFFFFFFF();
    const __m128i x0 = _mm_and_si128(x, mask), x1 = _mm_srli_epi64(x, 32);
    const __m128i y0 = _mm_and_si128(y, mask), y1 = _mm_srli_epi64(y, 32);
    const __m128i x0y0_hi = _mm_srli_epi64(_mm_mul_epu32(x0, y0), 32);
    const __m128i x0y1 = _mm_mul_epu32(x0, y1);
    const __m128i x1y0 = _mm_mul_epu32(x1, y0);
    const __m128i x1y1 = _mm_mul_epu32(x1, y1);

    const __m128i temp = _mm_add_epi64(x1y0, x0y0_hi);
    __m128i temp_lo = _mm_and_si128(temp, mask), temp_hi = _mm_srli_epi64(temp, 32);
    temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32);
    temp_hi = _mm_add_epi64(x1y1, temp_hi);
    return _mm_add_epi64(temp_lo, temp_hi);
}
static inline __m128i libdivide_mullhi_s64_flat_vector(__m128i x, __m128i y) {
    /* Signed high multiply from the unsigned one: subtract (x < 0 ? y : 0) and (y < 0 ? x : 0). */
    __m128i p = libdivide_mullhi_u64_flat_vector(x, y);
    __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y);
    p = _mm_sub_epi64(p, t1);
    __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x);
    p = _mm_sub_epi64(p, t2);
    return p;
}
#ifdef LIBDIVIDE_USE_SSE4_1
/* SSE4.1 provides a signed 32x32->64 multiply (pmuldq), so the signed high
   multiply can be formed directly. */
static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) {
    __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epi32(a, b), 32);
    __m128i a1X3X = _mm_srli_epi64(a, 32);
    __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epi32(a1X3X, b), libdivide_get_FFFFFFFF00000000());
    return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3);
}
#else
/* Without SSE4.1, derive the signed high multiply from the unsigned one. */
static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) {
    __m128i p = libdivide__mullhi_u32_flat_vector(a, b);
    __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b);
    __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a);
    p = _mm_sub_epi32(p, t1);
    p = _mm_sub_epi32(p, t2);
    return p;
}
#endif

#endif /* LIBDIVIDE_USE_SSE2 */
static inline int32_t libdivide__count_trailing_zeros32(uint32_t val) {
#if __GNUC__ || __has_builtin(__builtin_ctz)
    return __builtin_ctz(val);
#else
    /* Portable fallback: turn the trailing zeros into ones, then count them. */
    int32_t result = 0;
    val = (val ^ (val - 1)) >> 1;
    while (val) {
        val >>= 1;
        result++;
    }
    return result;
#endif
}
static inline int32_t libdivide__count_trailing_zeros64(uint64_t val) {
#if __LP64__ && (__GNUC__ || __has_builtin(__builtin_ctzll))
    return __builtin_ctzll(val);
#else
    uint32_t lo = val & 0xFFFFFFFF;
    if (lo != 0) return libdivide__count_trailing_zeros32(lo);
    return 32 + libdivide__count_trailing_zeros32(val >> 32);
#endif
}
static inline int32_t libdivide__count_leading_zeros32(uint32_t val) {
#if __GNUC__ || __has_builtin(__builtin_clz)
    return __builtin_clz(val);
#else
    /* Portable fallback: shift left until the top bit is set. */
    int32_t result = 0;
    while (! (val & (1U << 31))) {
        val <<= 1;
        result++;
    }
    return result;
#endif
}
static inline int32_t libdivide__count_leading_zeros64(uint64_t val) {
#if __GNUC__ || __has_builtin(__builtin_clzll)
    return __builtin_clzll(val);
#else
    int32_t result = 0;
    while (! (val & (1ULL << 63))) {
        val <<= 1;
        result++;
    }
    return result;
#endif
}
/* Divides the 64-bit value {u1, u0} (u1 = high word) by the 32-bit v.
   The quotient must fit in 32 bits; the remainder is stored in *r. */
#if (LIBDIVIDE_IS_i386 || LIBDIVIDE_IS_X86_64) && LIBDIVIDE_GCC_STYLE_ASM
static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
    uint32_t result;
    __asm__("divl %[v]"
            : "=a"(result), "=d"(*r)
            : [v] "r"(v), "a"(u0), "d"(u1)
            );
    return result;
}
#else
static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
    uint64_t n = (((uint64_t)u1) << 32) | u0;
    uint32_t result = (uint32_t)(n / v);
    *r = (uint32_t)(n - result * (uint64_t)v);
    return result;
}
#endif
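
/* For instance (illustrative), libdivide_64_div_32_to_32(1, 0, 7, &r) divides
 * 2^32 by 7, returning 613566756 with the remainder 4 stored in r. */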
#if LIBDIVIDE_IS_X86_64 && LIBDIVIDE_GCC_STYLE_ASM
static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
    /* u0 -> rax, u1 -> rdx; divq leaves the quotient in rax and the remainder in rdx. */
    uint64_t result;
    __asm__("divq %[v]"
            : "=a"(result), "=d"(*r)
            : [v] "r"(v), "a"(u0), "d"(u1)
            );
    return result;
}
#else
/* 128/64 -> 64 division in the style of Hacker's Delight's divlu:
   normalize, then produce the quotient 32 bits at a time. */
static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
    const uint64_t b = (1ULL << 32);   /* number base (32 bits) */
    uint64_t un1, un0,                 /* normalized dividend, low digits */
             vn1, vn0,                 /* normalized divisor digits       */
             q1, q0,                   /* quotient digits                 */
             un64, un21, un10,         /* dividend digit pairs            */
             rhat;                     /* remainder estimate              */
    int s;                             /* normalization shift             */

    if (u1 >= v) {
        /* Overflow: the quotient cannot fit in 64 bits. */
        *r = (uint64_t)(-1);
        return (uint64_t)(-1);
    }

    s = libdivide__count_leading_zeros64(v);   /* 0 <= s <= 63 */
    v = v << s;                                /* normalize the divisor */
    vn1 = v >> 32;
    vn0 = v & 0xFFFFFFFF;

    un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31));
    un10 = u0 << s;

    un1 = un10 >> 32;
    un0 = un10 & 0xFFFFFFFF;

    q1 = un64 / vn1;
    rhat = un64 - q1 * vn1;
again1:
    if (q1 >= b || q1 * vn0 > b * rhat + un1) {
        q1 = q1 - 1;
        rhat = rhat + vn1;
        if (rhat < b) goto again1;
    }

    un21 = un64 * b + un1 - q1 * v;

    q0 = un21 / vn1;
    rhat = un21 - q0 * vn1;
again2:
    if (q0 >= b || q0 * vn0 > b * rhat + un0) {
        q0 = q0 - 1;
        rhat = rhat + vn1;
        if (rhat < b) goto again2;
    }

    *r = (un21 * b + un0 - q0 * v) >> s;
    return q1 * b + q0;
}
#endif
#if LIBDIVIDE_ASSERTIONS_ON
#define LIBDIVIDE_ASSERT(x) \
    do { \
        if (! (x)) { \
            fprintf(stderr, "Assertion failure on line %ld: %s\n", (long)__LINE__, #x); \
            exit(-1); \
        } \
    } while (0)
#else
#define LIBDIVIDE_ASSERT(x)
#endif

#ifndef LIBDIVIDE_HEADER_ONLY

struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
    struct libdivide_u32_t result;
    if ((d & (d - 1)) == 0) {
        /* Power of 2: pure shift path. */
        result.magic = 0;
        result.more = libdivide__count_trailing_zeros32(d) | LIBDIVIDE_U32_SHIFT_PATH;
    }
    else {
        const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(d);

        uint8_t more;
        uint32_t rem, proposed_m;
        proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem);

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint32_t e = d - rem;

        /* This power works if e < 2**floor_log_2_d. */
        if (e < (1U << floor_log_2_d)) {
            more = floor_log_2_d;
        }
        else {
            /* Use the general 33-bit algorithm: double the quotient and
               remainder of (2**power) / d, correcting for overflow. */
            proposed_m += proposed_m;
            const uint32_t twice_rem = rem + rem;
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
        }
        result.magic = 1 + proposed_m;
        result.more = more;
    }
    return result;
}
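
/* Worked example (editorial illustration): for d = 7, floor_log_2_d = 2 and
 * (1 << 34) / 7 = 2454267026 remainder 2, so e = 5 is not less than 4 and the
 * general path runs: proposed_m doubles to 0x24924924 (mod 2^32), the doubled
 * remainder 4 triggers no correction, and the result is magic = 0x24924925
 * with more = 2 | LIBDIVIDE_ADD_MARKER.  libdivide_u32_do() then evaluates
 * q = mullhi(magic, n); t = ((n - q) >> 1) + q; t >> 2, which equals n / 7. */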
uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U32_SHIFT_PATH) {
        return numer >> (more & LIBDIVIDE_32_SHIFT_MASK);
    }
    else {
        uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            uint32_t t = ((numer - q) >> 1) + q;
            return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
        }
        else {
            return q >> more;  /* all upper bits of more are 0; no mask needed */
        }
    }
}
int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U32_SHIFT_PATH) return 0;
    else if (! (more & LIBDIVIDE_ADD_MARKER)) return 1;
    else return 2;
}
uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom) {
    return numer >> (denom->more & LIBDIVIDE_32_SHIFT_MASK);
}

uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom) {
    uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
    return q >> denom->more;
}

uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom) {
    uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
    uint32_t t = ((numer - q) >> 1) + q;
    return t >> (denom->more & LIBDIVIDE_32_SHIFT_MASK);
}
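
/* Editorial note: alg0, alg1 and alg2 are the three branches of
 * libdivide_u32_do() (shift only, multiply + shift, multiply + add + shift)
 * exposed as separate entry points, so callers that have already dispatched
 * via libdivide_u32_get_algorithm() can skip the per-division flag tests. */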
#if LIBDIVIDE_USE_SSE2
__m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U32_SHIFT_PATH) {
        return _mm_srl_epi32(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK));
    }
    else {
        __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            /* Vector form of: t = ((numer - q) >> 1) + q; return t >> shift. */
            __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
            return _mm_srl_epi32(t, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK));
        }
        else {
            return _mm_srl_epi32(q, libdivide_u32_to_m128i(more));
        }
    }
}
__m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom) {
    return _mm_srl_epi32(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
}

__m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom) {
    __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    return _mm_srl_epi32(q, libdivide_u32_to_m128i(denom->more));
}

__m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom) {
    __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
    return _mm_srl_epi32(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
}
#endif
struct libdivide_u64_t libdivide_u64_gen(uint64_t d) {
    struct libdivide_u64_t result;
    if ((d & (d - 1)) == 0) {
        result.magic = 0;
        result.more = libdivide__count_trailing_zeros64(d) | LIBDIVIDE_U64_SHIFT_PATH;
    }
    else {
        const uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(d);

        uint8_t more;
        uint64_t proposed_m, rem;
        proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem);

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint64_t e = d - rem;

        /* This power works if e < 2**floor_log_2_d. */
        if (e < (1ULL << floor_log_2_d)) {
            more = floor_log_2_d;
        }
        else {
            /* Use the general 65-bit algorithm: double the quotient and
               remainder of (2**power) / d, correcting for overflow. */
            proposed_m += proposed_m;
            const uint64_t twice_rem = rem + rem;
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
        }
        result.magic = 1 + proposed_m;
        result.more = more;
    }
    return result;
}
uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U64_SHIFT_PATH) {
        return numer >> (more & LIBDIVIDE_64_SHIFT_MASK);
    }
    else {
        uint64_t q = libdivide__mullhi_u64(denom->magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            uint64_t t = ((numer - q) >> 1) + q;
            return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
        }
        else {
            return q >> more;  /* all upper bits of more are 0; no mask needed */
        }
    }
}
int libdivide_u64_get_algorithm(const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U64_SHIFT_PATH) return 0;
    else if (! (more & LIBDIVIDE_ADD_MARKER)) return 1;
    else return 2;
}
uint64_t libdivide_u64_do_alg0(uint64_t numer, const struct libdivide_u64_t *denom) {
    return numer >> (denom->more & LIBDIVIDE_64_SHIFT_MASK);
}

uint64_t libdivide_u64_do_alg1(uint64_t numer, const struct libdivide_u64_t *denom) {
    uint64_t q = libdivide__mullhi_u64(denom->magic, numer);
    return q >> denom->more;
}

uint64_t libdivide_u64_do_alg2(uint64_t numer, const struct libdivide_u64_t *denom) {
    uint64_t q = libdivide__mullhi_u64(denom->magic, numer);
    uint64_t t = ((numer - q) >> 1) + q;
    return t >> (denom->more & LIBDIVIDE_64_SHIFT_MASK);
}
#if LIBDIVIDE_USE_SSE2
__m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U64_SHIFT_PATH) {
        return _mm_srl_epi64(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK));
    }
    else {
        __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            /* Vector form of: t = ((numer - q) >> 1) + q; return t >> shift. */
            __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
            return _mm_srl_epi64(t, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK));
        }
        else {
            return _mm_srl_epi64(q, libdivide_u32_to_m128i(more));
        }
    }
}
__m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t *denom) {
    return _mm_srl_epi64(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK));
}

__m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t *denom) {
    __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    return _mm_srl_epi64(q, libdivide_u32_to_m128i(denom->more));
}

__m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom) {
    __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
    return _mm_srl_epi64(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK));
}
#endif
static inline int32_t libdivide__mullhi_s32(int32_t x, int32_t y) {
    int64_t xl = x, yl = y;
    int64_t rl = xl * yl;
    return (int32_t)(rl >> 32);
}
struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
    struct libdivide_s32_t result;

    /* If d is a power of 2 (or its negation), use a shift; the magic-number
       algorithm cannot handle -1, and checking whether |d| has exactly one
       bit set covers that case (including INT_MIN). */
    uint32_t absD = (uint32_t)(d < 0 ? -d : d);
    if ((absD & (absD - 1)) == 0) {
        result.magic = 0;
        result.more = libdivide__count_trailing_zeros32(absD) | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0) | LIBDIVIDE_S32_SHIFT_PATH;
    }
    else {
        const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(absD);
        LIBDIVIDE_ASSERT(floor_log_2_d >= 1);

        uint8_t more;
        uint32_t rem, proposed_m;
        proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem);
        const uint32_t e = absD - rem;

        /* Start with the power floor_log_2_d - 1, which works if e < 2**floor_log_2_d. */
        if (e < (1U << floor_log_2_d)) {
            more = floor_log_2_d - 1;
        }
        else {
            /* Go one power higher; this makes proposed_m negative when read as int32_t. */
            proposed_m += proposed_m;
            const uint32_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0);
        }
        proposed_m += 1;
        result.magic = (d < 0 ? -(int32_t)proposed_m : (int32_t)proposed_m);
        result.more = more;
    }
    return result;
}
int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_S32_SHIFT_PATH) {
        uint8_t shifter = more & LIBDIVIDE_32_SHIFT_MASK;
        int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1));  /* round toward zero */
        q = q >> shifter;
        int32_t shiftMask = (int8_t)more >> 7;  /* arithmetic shift; all 1s if the divisor is negative */
        q = (q ^ shiftMask) - shiftMask;        /* negate if needed */
        return q;
    }
    else {
        int32_t q = libdivide__mullhi_s32(denom->magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            int32_t sign = (int8_t)more >> 7;   /* arithmetic shift; all 1s if the divisor is negative */
            q += ((numer ^ sign) - sign);
        }
        q >>= more & LIBDIVIDE_32_SHIFT_MASK;
        q += (q < 0);
        return q;
    }
}
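
/* Editorial note: both signed paths reproduce C's truncation toward zero.
 * The (numer >> 31) & ((1 << shifter) - 1) adjustment makes the arithmetic
 * shift truncate rather than floor, and the final q += (q < 0) applies the
 * corresponding correction on the multiply path. */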
int libdivide_s32_get_algorithm(const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    int positiveDivisor = ! (more & LIBDIVIDE_NEGATIVE_DIVISOR);
    if (more & LIBDIVIDE_S32_SHIFT_PATH) return (positiveDivisor ? 0 : 1);
    else if (more & LIBDIVIDE_ADD_MARKER) return (positiveDivisor ? 2 : 3);
    else return 4;
}
int32_t libdivide_s32_do_alg0(int32_t numer, const struct libdivide_s32_t *denom) {
    uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
    int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1));
    return q >> shifter;
}

int32_t libdivide_s32_do_alg1(int32_t numer, const struct libdivide_s32_t *denom) {
    uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
    int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1));
    return - (q >> shifter);
}

int32_t libdivide_s32_do_alg2(int32_t numer, const struct libdivide_s32_t *denom) {
    int32_t q = libdivide__mullhi_s32(denom->magic, numer);
    q += numer;
    q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK;
    q += (q < 0);
    return q;
}

int32_t libdivide_s32_do_alg3(int32_t numer, const struct libdivide_s32_t *denom) {
    int32_t q = libdivide__mullhi_s32(denom->magic, numer);
    q -= numer;
    q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK;
    q += (q < 0);
    return q;
}

int32_t libdivide_s32_do_alg4(int32_t numer, const struct libdivide_s32_t *denom) {
    int32_t q = libdivide__mullhi_s32(denom->magic, numer);
    q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK;
    q += (q < 0);
    return q;
}
#if LIBDIVIDE_USE_SSE2
__m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_S32_SHIFT_PATH) {
        uint32_t shifter = more & LIBDIVIDE_32_SHIFT_MASK;
        __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1);
        __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
        q = _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter));
        __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7));
        q = _mm_sub_epi32(_mm_xor_si128(q, shiftMask), shiftMask);
        return q;
    }
    else {
        __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            __m128i sign = _mm_set1_epi32((int32_t)((int8_t)more >> 7));
            q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign));
        }
        q = _mm_sra_epi32(q, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK));
        q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));  /* q += (q < 0) */
        return q;
    }
}
__m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom) {
    uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
    __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1);
    __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
    return _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter));
}

__m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom) {
    uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
    __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1);
    __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
    return _mm_sub_epi32(_mm_setzero_si128(), _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)));
}

__m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom) {
    __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    q = _mm_add_epi32(q, numers);
    q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
    q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));
    return q;
}

__m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom) {
    __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    q = _mm_sub_epi32(q, numers);
    q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
    q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));
    return q;
}

__m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom) {
    __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more));
    q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));
    return q;
}
#endif
struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
    struct libdivide_s64_t result;

    /* If d is a power of 2 (or its negation), use a shift; the magic-number
       algorithm cannot handle -1. */
    const uint64_t absD = (uint64_t)(d < 0 ? -d : d);
    if ((absD & (absD - 1)) == 0) {
        result.more = libdivide__count_trailing_zeros64(absD) | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0);
        result.magic = 0;
    }
    else {
        const uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(absD);

        uint8_t more;
        uint64_t rem, proposed_m;
        proposed_m = libdivide_128_div_64_to_64(1ULL << (floor_log_2_d - 1), 0, absD, &rem);
        const uint64_t e = absD - rem;

        /* Start with the power floor_log_2_d - 1, which works if e < 2**floor_log_2_d. */
        if (e < (1ULL << floor_log_2_d)) {
            more = floor_log_2_d - 1;
        }
        else {
            /* Go one power higher; this makes proposed_m negative when read as int64_t. */
            proposed_m += proposed_m;
            const uint64_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0);
        }
        proposed_m += 1;
        result.more = more;
        result.magic = (d < 0 ? -(int64_t)proposed_m : (int64_t)proposed_m);
    }
    return result;
}
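
/* Editorial note: unlike the signed 32-bit case, there is no dedicated shift
 * path flag for signed 64-bit dividers; magic == 0 is what marks the
 * power-of-two path, which is why libdivide_s64_do() and
 * libdivide_s64_get_algorithm() below branch on denom->magic. */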
int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) {  /* shift path */
        uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK;
        int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1));
        q = q >> shifter;
        int64_t shiftMask = (int8_t)more >> 7;  /* arithmetic shift; all 1s if the divisor is negative */
        q = (q ^ shiftMask) - shiftMask;        /* negate if needed */
        return q;
    }
    else {
        int64_t q = libdivide__mullhi_s64(magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            int64_t sign = (int8_t)more >> 7;
            q += ((numer ^ sign) - sign);
        }
        q >>= more & LIBDIVIDE_64_SHIFT_MASK;
        q += (q < 0);
        return q;
    }
}
int libdivide_s64_get_algorithm(const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int positiveDivisor = ! (more & LIBDIVIDE_NEGATIVE_DIVISOR);
    if (denom->magic == 0) return (positiveDivisor ? 0 : 1);  /* shift path */
    else if (more & LIBDIVIDE_ADD_MARKER) return (positiveDivisor ? 2 : 3);
    else return 4;
}
int64_t libdivide_s64_do_alg0(int64_t numer, const struct libdivide_s64_t *denom) {
    uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
    int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1));
    return q >> shifter;
}

int64_t libdivide_s64_do_alg1(int64_t numer, const struct libdivide_s64_t *denom) {
    uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
    int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1));
    return - (q >> shifter);
}

int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivide_s64_t *denom) {
    int64_t q = libdivide__mullhi_s64(denom->magic, numer);
    q += numer;
    q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK;
    q += (q < 0);
    return q;
}

int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom) {
    int64_t q = libdivide__mullhi_s64(denom->magic, numer);
    q -= numer;
    q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK;
    q += (q < 0);
    return q;
}

int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom) {
    int64_t q = libdivide__mullhi_s64(denom->magic, numer);
    q >>= denom->more;
    q += (q < 0);
    return q;
}
#if LIBDIVIDE_USE_SSE2
__m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) {  /* shift path */
        uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK;
        __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1);
        __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
        q = libdivide_s64_shift_right_vector(q, shifter);
        __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7));
        q = _mm_sub_epi64(_mm_xor_si128(q, shiftMask), shiftMask);
        return q;
    }
    else {
        __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            __m128i sign = _mm_set1_epi32((int32_t)((int8_t)more >> 7));
            q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign));
        }
        q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK);
        q = _mm_add_epi64(q, _mm_srli_epi64(q, 63));  /* q += (q < 0) */
        return q;
    }
}
__m128i libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t *denom) {
    uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
    __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1);
    __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
    q = libdivide_s64_shift_right_vector(q, shifter);
    return q;
}

__m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t *denom) {
    uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
    __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1);
    __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
    q = libdivide_s64_shift_right_vector(q, shifter);
    return _mm_sub_epi64(_mm_setzero_si128(), q);
}

__m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t *denom) {
    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    q = _mm_add_epi64(q, numers);
    q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK);
    q = _mm_add_epi64(q, _mm_srli_epi64(q, 63));
    return q;
}

__m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t *denom) {
    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    q = _mm_sub_epi64(q, numers);
    q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK);
    q = _mm_add_epi64(q, _mm_srli_epi64(q, 63));
    return q;
}

__m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t *denom) {
    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    q = libdivide_s64_shift_right_vector(q, denom->more);
    q = _mm_add_epi64(q, _mm_srli_epi64(q, 63));
    return q;
}
#endif
#ifdef __cplusplus

/* C++ wrapper machinery. */
namespace libdivide_internal {

#if LIBDIVIDE_USE_SSE2
#define MAYBE_VECTOR(x) x
#define MAYBE_VECTOR_PARAM __m128i vector_func(__m128i, const DenomType *)
#else
#define MAYBE_VECTOR(x) 0
#define MAYBE_VECTOR_PARAM int vector_func
#endif

    /* Stand-in "do" functions for algorithm indices that do not exist for a
       given type (unsigned types have no algorithms 3 and 4); they must never
       be called. */
    uint32_t crash_u32(uint32_t, const libdivide_u32_t *) { abort(); return *(uint32_t *)NULL; }
    uint64_t crash_u64(uint64_t, const libdivide_u64_t *) { abort(); return *(uint64_t *)NULL; }
#if LIBDIVIDE_USE_SSE2
    __m128i crash_u32_vector(__m128i, const libdivide_u32_t *) { abort(); return *(__m128i *)NULL; }
    __m128i crash_u64_vector(__m128i, const libdivide_u64_t *) { abort(); return *(__m128i *)NULL; }
#endif
    template<typename IntType, typename DenomType,
             DenomType gen_func(IntType),
             int get_algo(const DenomType *),
             IntType do_func(IntType, const DenomType *),
             MAYBE_VECTOR_PARAM>
    class divider_base {
    public:
        DenomType denom;
        divider_base(IntType d) : denom(gen_func(d)) { }
        divider_base(const DenomType& d) : denom(d) { }

        IntType perform_divide(IntType val) const { return do_func(val, &denom); }
#if LIBDIVIDE_USE_SSE2
        __m128i perform_divide_vector(__m128i val) const { return vector_func(val, &denom); }
#endif

        int get_algorithm() const { return get_algo(&denom); }
    };
    /* divider_mid maps an integer type to the matching C structs and
       functions; the nested algo<ALGO> template selects the do/vector
       routines for a fixed algorithm index (-1 means "dispatch at runtime"). */
    template<class T> struct divider_mid { };

    template<> struct divider_mid<uint32_t> {
        typedef uint32_t IntType;
        typedef struct libdivide_u32_t DenomType;
        template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
            typedef divider_base<IntType, DenomType, libdivide_u32_gen, libdivide_u32_get_algorithm, do_func, vector_func> divider;
        };

        template<int ALGO, int J = 0> struct algo { };
        template<int J> struct algo<-1, J> { typedef denom<libdivide_u32_do,      MAYBE_VECTOR(libdivide_u32_do_vector)>::divider divider; };
        template<int J> struct algo<0, J>  { typedef denom<libdivide_u32_do_alg0, MAYBE_VECTOR(libdivide_u32_do_vector_alg0)>::divider divider; };
        template<int J> struct algo<1, J>  { typedef denom<libdivide_u32_do_alg1, MAYBE_VECTOR(libdivide_u32_do_vector_alg1)>::divider divider; };
        template<int J> struct algo<2, J>  { typedef denom<libdivide_u32_do_alg2, MAYBE_VECTOR(libdivide_u32_do_vector_alg2)>::divider divider; };

        /* Unsigned dividers have no algorithms 3 or 4; map them to the crash
           functions so the templates still instantiate. */
        template<int J> struct algo<3, J>  { typedef denom<crash_u32, MAYBE_VECTOR(crash_u32_vector)>::divider divider; };
        template<int J> struct algo<4, J>  { typedef denom<crash_u32, MAYBE_VECTOR(crash_u32_vector)>::divider divider; };
    };
    template<> struct divider_mid<int32_t> {
        typedef int32_t IntType;
        typedef struct libdivide_s32_t DenomType;
        template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
            typedef divider_base<IntType, DenomType, libdivide_s32_gen, libdivide_s32_get_algorithm, do_func, vector_func> divider;
        };

        template<int ALGO, int J = 0> struct algo { };
        template<int J> struct algo<-1, J> { typedef denom<libdivide_s32_do,      MAYBE_VECTOR(libdivide_s32_do_vector)>::divider divider; };
        template<int J> struct algo<0, J>  { typedef denom<libdivide_s32_do_alg0, MAYBE_VECTOR(libdivide_s32_do_vector_alg0)>::divider divider; };
        template<int J> struct algo<1, J>  { typedef denom<libdivide_s32_do_alg1, MAYBE_VECTOR(libdivide_s32_do_vector_alg1)>::divider divider; };
        template<int J> struct algo<2, J>  { typedef denom<libdivide_s32_do_alg2, MAYBE_VECTOR(libdivide_s32_do_vector_alg2)>::divider divider; };
        template<int J> struct algo<3, J>  { typedef denom<libdivide_s32_do_alg3, MAYBE_VECTOR(libdivide_s32_do_vector_alg3)>::divider divider; };
        template<int J> struct algo<4, J>  { typedef denom<libdivide_s32_do_alg4, MAYBE_VECTOR(libdivide_s32_do_vector_alg4)>::divider divider; };
    };
    template<> struct divider_mid<uint64_t> {
        typedef uint64_t IntType;
        typedef struct libdivide_u64_t DenomType;
        template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
            typedef divider_base<IntType, DenomType, libdivide_u64_gen, libdivide_u64_get_algorithm, do_func, vector_func> divider;
        };

        template<int ALGO, int J = 0> struct algo { };
        template<int J> struct algo<-1, J> { typedef denom<libdivide_u64_do,      MAYBE_VECTOR(libdivide_u64_do_vector)>::divider divider; };
        template<int J> struct algo<0, J>  { typedef denom<libdivide_u64_do_alg0, MAYBE_VECTOR(libdivide_u64_do_vector_alg0)>::divider divider; };
        template<int J> struct algo<1, J>  { typedef denom<libdivide_u64_do_alg1, MAYBE_VECTOR(libdivide_u64_do_vector_alg1)>::divider divider; };
        template<int J> struct algo<2, J>  { typedef denom<libdivide_u64_do_alg2, MAYBE_VECTOR(libdivide_u64_do_vector_alg2)>::divider divider; };

        /* Unsigned dividers have no algorithms 3 or 4. */
        template<int J> struct algo<3, J>  { typedef denom<crash_u64, MAYBE_VECTOR(crash_u64_vector)>::divider divider; };
        template<int J> struct algo<4, J>  { typedef denom<crash_u64, MAYBE_VECTOR(crash_u64_vector)>::divider divider; };
    };
    template<> struct divider_mid<int64_t> {
        typedef int64_t IntType;
        typedef struct libdivide_s64_t DenomType;
        template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
            typedef divider_base<IntType, DenomType, libdivide_s64_gen, libdivide_s64_get_algorithm, do_func, vector_func> divider;
        };

        template<int ALGO, int J = 0> struct algo { };
        template<int J> struct algo<-1, J> { typedef denom<libdivide_s64_do,      MAYBE_VECTOR(libdivide_s64_do_vector)>::divider divider; };
        template<int J> struct algo<0, J>  { typedef denom<libdivide_s64_do_alg0, MAYBE_VECTOR(libdivide_s64_do_vector_alg0)>::divider divider; };
        template<int J> struct algo<1, J>  { typedef denom<libdivide_s64_do_alg1, MAYBE_VECTOR(libdivide_s64_do_vector_alg1)>::divider divider; };
        template<int J> struct algo<2, J>  { typedef denom<libdivide_s64_do_alg2, MAYBE_VECTOR(libdivide_s64_do_vector_alg2)>::divider divider; };
        template<int J> struct algo<3, J>  { typedef denom<libdivide_s64_do_alg3, MAYBE_VECTOR(libdivide_s64_do_vector_alg3)>::divider divider; };
        template<int J> struct algo<4, J>  { typedef denom<libdivide_s64_do_alg4, MAYBE_VECTOR(libdivide_s64_do_vector_alg4)>::divider divider; };
    };

}  /* namespace libdivide_internal */
template<typename T, int ALGO = -1>
class divider {
private:
    typename libdivide_internal::divider_mid<T>::template algo<ALGO>::divider sub;
    template<int NEW_ALGO, typename S> friend divider<S, NEW_ALGO> unswitch(const divider<S, -1> & d);
    divider(const typename libdivide_internal::divider_mid<T>::DenomType& denom) : sub(denom) { }

public:
    /* Constructs a divider for the given divisor. */
    divider(T n) : sub(n) { }

    /* Default constructor: divides by 1. */
    divider() : sub(1) { }

    /* Divides val by the stored divisor, returning the quotient. */
    T perform_divide(T val) const { return sub.perform_divide(val); }

#if LIBDIVIDE_USE_SSE2
    /* Divides each packed element of val by the stored divisor. */
    __m128i perform_divide_vector(__m128i val) const { return sub.perform_divide_vector(val); }
#endif

    /* Returns the algorithm index, for use with unswitch(). */
    int get_algorithm() const { return sub.get_algorithm(); }

    bool operator==(const divider<T, ALGO>& him) const {
        return sub.denom.magic == him.sub.denom.magic && sub.denom.more == him.sub.denom.more;
    }

    bool operator!=(const divider<T, ALGO>& him) const { return ! (*this == him); }
};
/* Converts a runtime-dispatched divider into one fixed to NEW_ALGO, which
   should be the value returned by get_algorithm(). */
template<int NEW_ALGO, typename S>
divider<S, NEW_ALGO> unswitch(const divider<S, -1> & d) { return divider<S, NEW_ALGO>(d.sub.denom); }
template<typename int_type, int ALGO>
int_type operator/(int_type numer, const divider<int_type, ALGO>& denom) {
    return denom.perform_divide(numer);
}
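
/* Example (illustrative sketch of the C++ wrapper; names are hypothetical):
 *
 *     divider<int64_t> fast_d(divisor);            // runtime-dispatched divider
 *     for (size_t i = 0; i < n; i++)
 *         out[i] = in[i] / fast_d;                 // operator/ calls perform_divide()
 *
 *     // Optionally hoist the algorithm dispatch out of the loop:
 *     switch (fast_d.get_algorithm()) {
 *     case 2: { divider<int64_t, 2> fd = unswitch<2>(fast_d);
 *               for (size_t i = 0; i < n; i++) out[i] = in[i] / fd; } break;
 *     // ... the remaining cases follow the same pattern ...
 *     default: for (size_t i = 0; i < n; i++) out[i] = in[i] / fast_d; break;
 *     }
 */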
#if LIBDIVIDE_USE_SSE2
template<typename int_type, int ALGO>
__m128i operator/(__m128i numer, const divider<int_type, ALGO>& denom) {
    return denom.perform_divide_vector(numer);
}
#endif
#endif //__cplusplus

#endif //LIBDIVIDE_HEADER_ONLY