libdivide.h
1 /* libdivide.h
2  Copyright 2010 ridiculous_fish
3 */
4 
5 #if defined(_WIN32) || defined(WIN32)
6 #define LIBDIVIDE_WINDOWS 1
7 #endif
8 
9 #if defined(_MSC_VER)
10 #define LIBDIVIDE_VC 1
11 #endif
12 
13 #ifdef __cplusplus
14 #include <cstdlib>
15 #include <cstdio>
16 #include <cassert>
17 #else
18 #include <stdlib.h>
19 #include <stdio.h>
20 #include <assert.h>
21 #endif
22 
23 #if ! LIBDIVIDE_HAS_STDINT_TYPES && ! LIBDIVIDE_VC
24 /* Visual C++ still doesn't ship with stdint.h (!) */
25 #include <stdint.h>
26 #define LIBDIVIDE_HAS_STDINT_TYPES 1
27 #endif
28 
29 #if ! LIBDIVIDE_HAS_STDINT_TYPES
30 typedef __int32 int32_t;
31 typedef unsigned __int32 uint32_t;
32 typedef __int64 int64_t;
33 typedef unsigned __int64 uint64_t;
34 typedef __int8 int8_t;
35 typedef unsigned __int8 uint8_t;
36 #endif
37 
38 #if LIBDIVIDE_USE_SSE2
39 #if LIBDIVIDE_VC
40 #include <mmintrin.h>
41 #endif
42 #include <emmintrin.h>
43 #endif
44 
45 #ifndef __has_builtin
46 #define __has_builtin(x) 0 // Compatibility with non-clang compilers.
47 #endif
48 
49 #ifdef __ICC
50 #define HAS_INT128_T 0
51 #else
52 #define HAS_INT128_T __LP64__
53 #endif
54 
55 #if defined(__x86_64__) || defined(_WIN64) || defined(_M_X64)
56 #define LIBDIVIDE_IS_X86_64 1
57 #endif
58 
59 #if defined(__i386__)
60 #define LIBDIVIDE_IS_i386 1
61 #endif
62 
63 #if __GNUC__ || __clang__
64 #define LIBDIVIDE_GCC_STYLE_ASM 1
65 #endif
66 
67 
68 /* libdivide may use the pmuldq (vector signed 32x32->64 mult instruction) which is in SSE 4.1. However, signed multiplication can be emulated efficiently with unsigned multiplication, and SSE 4.1 is currently rare, so it is OK to not turn this on */
69 #ifdef LIBDIVIDE_USE_SSE4_1
70 #include <smmintrin.h>
71 #endif
72 
73 #ifdef __cplusplus
74 /* We place libdivide within the libdivide namespace, and that goes in an anonymous namespace so that the functions are only visible to files that #include this header and don't get external linkage. At least that's the theory. */
75 namespace {
76 namespace libdivide {
77 #endif
78 
79 /* Explanation of "more" field: bits 0-4 hold the shift value in the 32 bit case (bits 0-5 in the 64 bit case). Bit 6 is the "add" indicator. Bit 7 flags the shift path in the unsigned case and a negative divisor in the signed case; for s32 the shift path is flagged by bit 5 instead, and for s64 a magic number of 0 indicates the shift path. We use bit 7 as the "negative divisor indicator" so that we can use sign extension to efficiently go to a full-width -1.
80 
81 
82 u32: [0-4] shift value
83  [5] ignored
84  [6] add indicator
85  [7] shift path
86 
87 s32: [0-4] shift value
88  [5] shift path
89  [6] add indicator
90  [7] indicates negative divisor
91 
92 u64: [0-5] shift value
93  [6] add indicator
94  [7] shift path
95 
96 s64: [0-5] shift value
97  [6] add indicator
98  [7] indicates negative divisor
99  magic number of 0 indicates shift path (we ran out of bits!)
100 */
101 
102 enum {
103  LIBDIVIDE_32_SHIFT_MASK = 0x1F,
104  LIBDIVIDE_64_SHIFT_MASK = 0x3F,
105  LIBDIVIDE_ADD_MARKER = 0x40,
106  LIBDIVIDE_U32_SHIFT_PATH = 0x80,
107  LIBDIVIDE_U64_SHIFT_PATH = 0x80,
108  LIBDIVIDE_S32_SHIFT_PATH = 0x20,
109  LIBDIVIDE_NEGATIVE_DIVISOR = 0x80
110 };
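/* Illustrative sketch (not part of the library): pulling a u32 divider's "more" byte
   apart under the layout described above. The example_* names are made up for
   illustration only. */
struct example_u32_more_fields { uint8_t shift; uint8_t add_indicator; uint8_t shift_path; };
static inline struct example_u32_more_fields example_decode_u32_more(uint8_t more) {
    struct example_u32_more_fields f;
    f.shift = more & LIBDIVIDE_32_SHIFT_MASK;                  /* bits 0-4 */
    f.add_indicator = (more & LIBDIVIDE_ADD_MARKER) ? 1 : 0;   /* bit 6 */
    f.shift_path = (more & LIBDIVIDE_U32_SHIFT_PATH) ? 1 : 0;  /* bit 7 */
    return f;
}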
111 
112 
113 struct libdivide_u32_t {
114  uint32_t magic;
115  uint8_t more;
116 };
117 
118 struct libdivide_s32_t {
119  int32_t magic;
120  uint8_t more;
121 };
122 
123 struct libdivide_u64_t {
124  uint64_t magic;
125  uint8_t more;
126 };
127 
128 struct libdivide_s64_t {
129  int64_t magic;
130  uint8_t more;
131 };
132 
133 
134 
135 #ifndef LIBDIVIDE_API
136 #ifdef __cplusplus
137 /* In C++, we don't want our public functions to be static, because they are arguments to templates and static functions can't do that. They get internal linkage through virtue of the anonymous namespace. In C, they should be static. */
138 #define LIBDIVIDE_API
139 #else
140 #define LIBDIVIDE_API static
141 #endif
142 #endif
143 
144 
145 LIBDIVIDE_API struct libdivide_s32_t libdivide_s32_gen(int32_t y);
146 LIBDIVIDE_API struct libdivide_u32_t libdivide_u32_gen(uint32_t y);
147 LIBDIVIDE_API struct libdivide_s64_t libdivide_s64_gen(int64_t y);
148 LIBDIVIDE_API struct libdivide_u64_t libdivide_u64_gen(uint64_t y);
149 
150 LIBDIVIDE_API int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom);
151 LIBDIVIDE_API uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom);
152 LIBDIVIDE_API int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom);
153 LIBDIVIDE_API uint64_t libdivide_u64_do(uint64_t y, const struct libdivide_u64_t *denom);
154 
155 LIBDIVIDE_API int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom);
156 LIBDIVIDE_API uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom);
157 LIBDIVIDE_API uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom);
158 LIBDIVIDE_API uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom);
159 
160 LIBDIVIDE_API int libdivide_u64_get_algorithm(const struct libdivide_u64_t *denom);
161 LIBDIVIDE_API uint64_t libdivide_u64_do_alg0(uint64_t numer, const struct libdivide_u64_t *denom);
162 LIBDIVIDE_API uint64_t libdivide_u64_do_alg1(uint64_t numer, const struct libdivide_u64_t *denom);
163 LIBDIVIDE_API uint64_t libdivide_u64_do_alg2(uint64_t numer, const struct libdivide_u64_t *denom);
164 
165 LIBDIVIDE_API int libdivide_s32_get_algorithm(const struct libdivide_s32_t *denom);
166 LIBDIVIDE_API int32_t libdivide_s32_do_alg0(int32_t numer, const struct libdivide_s32_t *denom);
167 LIBDIVIDE_API int32_t libdivide_s32_do_alg1(int32_t numer, const struct libdivide_s32_t *denom);
168 LIBDIVIDE_API int32_t libdivide_s32_do_alg2(int32_t numer, const struct libdivide_s32_t *denom);
169 LIBDIVIDE_API int32_t libdivide_s32_do_alg3(int32_t numer, const struct libdivide_s32_t *denom);
170 LIBDIVIDE_API int32_t libdivide_s32_do_alg4(int32_t numer, const struct libdivide_s32_t *denom);
171 
172 LIBDIVIDE_API int libdivide_s64_get_algorithm(const struct libdivide_s64_t *denom);
173 LIBDIVIDE_API int64_t libdivide_s64_do_alg0(int64_t numer, const struct libdivide_s64_t *denom);
174 LIBDIVIDE_API int64_t libdivide_s64_do_alg1(int64_t numer, const struct libdivide_s64_t *denom);
175 LIBDIVIDE_API int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivide_s64_t *denom);
176 LIBDIVIDE_API int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom);
177 LIBDIVIDE_API int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom);
178 
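/* Usage sketch (illustrative, not part of the library): precompute a divider once
   with the gen function, then replace each hardware division in a hot loop with the
   matching do function. d must be nonzero. The example_* name is made up. */
static inline void example_divide_array_u32(uint32_t *data, size_t count, uint32_t d) {
    size_t i;
    struct libdivide_u32_t fast_d = libdivide_u32_gen(d);   /* relatively expensive: do it once */
    for (i = 0; i < count; i++) {
        data[i] = libdivide_u32_do(data[i], &fast_d);        /* same result as data[i] / d */
    }
}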
179 #if LIBDIVIDE_USE_SSE2
180 LIBDIVIDE_API __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom);
181 LIBDIVIDE_API __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom);
182 LIBDIVIDE_API __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom);
183 LIBDIVIDE_API __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom);
184 
185 LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom);
186 LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom);
187 LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom);
188 
189 LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom);
190 LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom);
191 LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom);
192 LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom);
193 LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom);
194 
195 LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t *denom);
196 LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t *denom);
197 LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom);
198 
199 LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t *denom);
200 LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t *denom);
201 LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t *denom);
202 LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t *denom);
203 LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t *denom);
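/* Vector usage sketch (illustrative, not part of the library): divide four packed
   uint32_t lanes per iteration with the same precomputed divider used by the scalar
   path. Any trailing count % 4 elements are left untouched in this sketch. */
static inline void example_divide_array_u32_sse2(uint32_t *data, size_t count, const struct libdivide_u32_t *fast_d) {
    size_t i;
    for (i = 0; i + 4 <= count; i += 4) {
        __m128i n = _mm_loadu_si128((const __m128i *)(data + i));
        _mm_storeu_si128((__m128i *)(data + i), libdivide_u32_do_vector(n, fast_d));
    }
}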
204 #endif
205 
206 
207 
208 ////////// Internal utility functions
209 
210 static inline uint32_t libdivide__mullhi_u32(uint32_t x, uint32_t y) {
211  uint64_t xl = x, yl = y;
212  uint64_t rl = xl * yl;
213  return (uint32_t)(rl >> 32);
214 }
215 
216 static uint64_t libdivide__mullhi_u64(uint64_t x, uint64_t y) {
217 #if HAS_INT128_T
218  __uint128_t xl = x, yl = y;
219  __uint128_t rl = xl * yl;
220  return (uint64_t)(rl >> 64);
221 #else
222  //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
223  const uint32_t mask = 0xFFFFFFFF;
224  const uint32_t x0 = (uint32_t)(x & mask), x1 = (uint32_t)(x >> 32);
225  const uint32_t y0 = (uint32_t)(y & mask), y1 = (uint32_t)(y >> 32);
226  const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0);
227  const uint64_t x0y1 = x0 * (uint64_t)y1;
228  const uint64_t x1y0 = x1 * (uint64_t)y0;
229  const uint64_t x1y1 = x1 * (uint64_t)y1;
230 
231  uint64_t temp = x1y0 + x0y0_hi;
232  uint64_t temp_lo = temp & mask, temp_hi = temp >> 32;
233  return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);
234 #endif
235 }
236 
237 static inline int64_t libdivide__mullhi_s64(int64_t x, int64_t y) {
238 #if HAS_INT128_T
239  __int128_t xl = x, yl = y;
240  __int128_t rl = xl * yl;
241  return (int64_t)(rl >> 64);
242 #else
243  //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
244  const uint32_t mask = 0xFFFFFFFF;
245  const uint32_t x0 = (uint32_t)(x & mask), y0 = (uint32_t)(y & mask);
246  const int32_t x1 = (int32_t)(x >> 32), y1 = (int32_t)(y >> 32);
247  const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0);
248  const int64_t t = x1*(int64_t)y0 + x0y0_hi;
249  const int64_t w1 = x0*(int64_t)y1 + (t & mask);
250  return x1*(int64_t)y1 + (t >> 32) + (w1 >> 32);
251 #endif
252 }
253 
254 #if LIBDIVIDE_USE_SSE2
255 
256 static inline __m128i libdivide__u64_to_m128(uint64_t x) {
257 #if LIBDIVIDE_VC
258  //64 bit windows doesn't seem to have an implementation of any of these load intrinsics, and 32 bit Visual C++ crashes
259  __declspec(align(16)) uint64_t temp[2] = {x, x};
260  return _mm_load_si128((const __m128i *)temp);
261 #elif defined(__ICC)
262  uint64_t __attribute__((aligned(16))) temp[2] = {x,x};
263  return _mm_load_si128((const __m128i *)temp);
264 #elif __clang__
265  // clang does not provide this intrinsic either
266  return (__m128i) {x, x};
267 #else
268  // everyone else gets it right
269  return _mm_set1_epi64x(x);
270 #endif
271 }
272 
273 static inline __m128i libdivide_get_FFFFFFFF00000000(void) {
274  //returns the same as _mm_set1_epi64(0xFFFFFFFF00000000ULL) without touching memory
275  __m128i result = _mm_set1_epi8(-1); //optimizes to pcmpeqd on OS X
276  return _mm_slli_epi64(result, 32);
277 }
278 
279 static inline __m128i libdivide_get_00000000FFFFFFFF(void) {
280  //returns the same as _mm_set1_epi64(0x00000000FFFFFFFFULL) without touching memory
281  __m128i result = _mm_set1_epi8(-1); //optimizes to pcmpeqd on OS X
282  result = _mm_srli_epi64(result, 32);
283  return result;
284 }
285 
286 static inline __m128i libdivide_get_0000FFFF(void) {
287  //returns the same as _mm_set1_epi32(0x0000FFFFULL) without touching memory
288  __m128i result; //we don't care what its contents are
289  result = _mm_cmpeq_epi8(result, result); //all 1s
290  result = _mm_srli_epi32(result, 16);
291  return result;
292 }
293 
294 static inline __m128i libdivide_s64_signbits(__m128i v) {
295  //we want to compute v >> 63, that is, _mm_srai_epi64(v, 63). But there is no 64 bit shift right arithmetic instruction in SSE2. So we have to fake it by first duplicating the high 32 bit values, and then using a 32 bit shift. Another option would be to use _mm_srli_epi64(v, 63) and then subtract that from 0, but that approach appears to be substantially slower for unknown reasons
296  __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
297  __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31);
298  return signBits;
299 }
300 
301 /* Returns an __m128i whose low 32 bits are equal to amt and has zero elsewhere. */
302 static inline __m128i libdivide_u32_to_m128i(uint32_t amt) {
303  return _mm_set_epi32(0, 0, 0, amt);
304 }
305 
306 static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) {
307  //implementation of _mm_sra_epi64. Here we have two 64 bit values which are shifted right to logically become (64 - amt) bit values, and are then sign extended from a (64 - amt) bit number.
308  const int b = 64 - amt;
309  __m128i m = libdivide__u64_to_m128(1ULL << (b - 1));
310  __m128i x = _mm_srl_epi64(v, libdivide_u32_to_m128i(amt));
311  __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); //result = x^m - m
312  return result;
313 }
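/* Scalar analogue of the trick above (illustrative sketch, not part of the library):
   building an arithmetic right shift out of a logical shift with the identity
   (x ^ m) - m, where m is the sign bit of the (64 - amt) bit intermediate result.
   Assumes 0 <= amt < 64. */
static inline int64_t example_sra64_from_srl64(int64_t v, int amt) {
    const uint64_t m = 1ULL << (63 - amt);      /* sign bit position after the shift */
    uint64_t x = (uint64_t)v >> amt;            /* logical shift right */
    return (int64_t)((x ^ m) - m);              /* xor/subtract sign-extends the top bits */
}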
314 
315 /* Here, b is assumed to contain one 32 bit value repeated four times. If it did not, the function would not work. */
316 static inline __m128i libdivide__mullhi_u32_flat_vector(__m128i a, __m128i b) {
317  __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32);
318  __m128i a1X3X = _mm_srli_epi64(a, 32);
319  __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), libdivide_get_FFFFFFFF00000000());
320  return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); // = hi_product_0123
321 }
322 
323 
324 /* Here, y is assumed to contain one 64 bit value repeated twice. */
325 static inline __m128i libdivide_mullhi_u64_flat_vector(__m128i x, __m128i y) {
326  //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
327  const __m128i mask = libdivide_get_00000000FFFFFFFF();
328  const __m128i x0 = _mm_and_si128(x, mask), x1 = _mm_srli_epi64(x, 32); //x0 is low half of 2 64 bit values, x1 is high half in low slots
329  const __m128i y0 = _mm_and_si128(y, mask), y1 = _mm_srli_epi64(y, 32);
330  const __m128i x0y0_hi = _mm_srli_epi64(_mm_mul_epu32(x0, y0), 32); //x0 happens to have the low half of the two 64 bit values in 32 bit slots 0 and 2, so _mm_mul_epu32 computes their full product, and then we shift right by 32 to get just the high values
331  const __m128i x0y1 = _mm_mul_epu32(x0, y1);
332  const __m128i x1y0 = _mm_mul_epu32(x1, y0);
333  const __m128i x1y1 = _mm_mul_epu32(x1, y1);
334 
335  const __m128i temp = _mm_add_epi64(x1y0, x0y0_hi);
336  __m128i temp_lo = _mm_and_si128(temp, mask), temp_hi = _mm_srli_epi64(temp, 32);
337  temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32);
338  temp_hi = _mm_add_epi64(x1y1, temp_hi);
339 
340  return _mm_add_epi64(temp_lo, temp_hi);
341 }
342 
343 /* y is one 64 bit value repeated twice */
344 static inline __m128i libdivide_mullhi_s64_flat_vector(__m128i x, __m128i y) {
345  __m128i p = libdivide_mullhi_u64_flat_vector(x, y);
346  __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y);
347  p = _mm_sub_epi64(p, t1);
348  __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x);
349  p = _mm_sub_epi64(p, t2);
350  return p;
351 }
352 
353 #ifdef LIBDIVIDE_USE_SSE4_1
354 
355 /* b is one 32 bit value repeated four times. */
356 static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) {
357  __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epi32(a, b), 32);
358  __m128i a1X3X = _mm_srli_epi64(a, 32);
359  __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epi32(a1X3X, b), libdivide_get_FFFFFFFF00000000());
360  return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); // = hi_product_0123
361 }
362 
363 #else
364 
365 /* SSE2 does not have a signed multiplication instruction, but we can convert unsigned to signed pretty efficiently. Again, b is just a 32 bit value repeated four times. */
366 static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) {
367  __m128i p = libdivide__mullhi_u32_flat_vector(a, b);
368  __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); //t1 = (a >> 31) & y, arithmetic shift
369  __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a);
370  p = _mm_sub_epi32(p, t1);
371  p = _mm_sub_epi32(p, t2);
372  return p;
373 }
374 #endif
375 #endif
376 
377 static inline int32_t libdivide__count_trailing_zeros32(uint32_t val) {
378 #if __GNUC__ || __has_builtin(__builtin_ctz)
379  /* Fast way to count trailing zeros */
380  return __builtin_ctz(val);
381 #else
382  /* Dorky way to count trailing zeros. Note that this hangs for val = 0! */
383  int32_t result = 0;
384  val = (val ^ (val - 1)) >> 1; // Set v's trailing 0s to 1s and zero rest
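 /* e.g. val = 20 (10100b) becomes 011b here; the loop below then takes 2 shifts to reach zero, matching ctz(20) == 2 */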
385  while (val) {
386  val >>= 1;
387  result++;
388  }
389  return result;
390 #endif
391 }
392 
393 static inline int32_t libdivide__count_trailing_zeros64(uint64_t val) {
394 #if __LP64__ && (__GNUC__ || __has_builtin(__builtin_ctzll))
395  /* Fast way to count trailing zeros. Note that we disable this in 32 bit because gcc does something horrible - it calls through to a dynamically bound function. */
396  return __builtin_ctzll(val);
397 #else
398  /* Pretty good way to count trailing zeros. Note that this hangs for val = 0! */
399  uint32_t lo = val & 0xFFFFFFFF;
400  if (lo != 0) { return libdivide__count_trailing_zeros32(lo); }
401  return 32 + libdivide__count_trailing_zeros32(val >> 32);
402 #endif
403 }
404 
405 static inline int32_t libdivide__count_leading_zeros32(uint32_t val) {
406 #if __GNUC__ || __has_builtin(__builtin_clz)
407  /* Fast way to count leading zeros */
408  return __builtin_clz(val);
409 #else
410  /* Dorky way to count leading zeros. Note that this hangs for val = 0! */
411  int32_t result = 0;
412  while (! (val & (1U << 31))) {
413  val <<= 1;
414  result++;
415  }
416  return result;
417 #endif
418 }
419 
420 static inline int32_t libdivide__count_leading_zeros64(uint64_t val) {
421 #if __GNUC__ || __has_builtin(__builtin_clzll)
422  /* Fast way to count leading zeros */
423  return __builtin_clzll(val);
424 #else
425  /* Dorky way to count leading zeros. Note that this hangs for val = 0! */
426  int32_t result = 0;
427  while (! (val & (1ULL << 63))) {
428  val <<= 1;
429  result++;
430  }
431  return result;
432 #endif
433 }
434 
435 //libdivide_64_div_32_to_32: divides a 64 bit uint {u1, u0} by a 32 bit uint {v}. The result must fit in 32 bits. Returns the quotient directly and the remainder in *r
436 #if (LIBDIVIDE_IS_i386 || LIBDIVIDE_IS_X86_64) && LIBDIVIDE_GCC_STYLE_ASM
437 static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
438  uint32_t result;
439  __asm__("divl %[v]"
440  : "=a"(result), "=d"(*r)
441  : [v] "r"(v), "a"(u0), "d"(u1)
442  );
443  return result;
444 }
445 #else
446 static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
447  uint64_t n = (((uint64_t)u1) << 32) | u0;
448  uint32_t result = (uint32_t)(n / v);
449  *r = (uint32_t)(n - result * (uint64_t)v);
450  return result;
451 }
452 #endif
453 
454 #if LIBDIVIDE_IS_X86_64 && LIBDIVIDE_GCC_STYLE_ASM
455 static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
456  //u0 -> rax
457  //u1 -> rdx
458  //divq
459  uint64_t result;
460  __asm__("divq %[v]"
461  : "=a"(result), "=d"(*r)
462  : [v] "r"(v), "a"(u0), "d"(u1)
463  );
464  return result;
465 
466 }
467 #else
468 
469 /* Code taken from Hacker's Delight, http://www.hackersdelight.org/HDcode/divlu.c . License permits inclusion here per http://www.hackersdelight.org/permissions.htm
470  */
471 static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
472  const uint64_t b = (1ULL << 32); // Number base (2**32).
473  uint64_t un1, un0, // Norm. dividend LSD's.
474  vn1, vn0, // Norm. divisor digits.
475  q1, q0, // Quotient digits.
476  un64, un21, un10,// Dividend digit pairs.
477  rhat; // A remainder.
478  int s; // Shift amount for norm.
479 
480  if (u1 >= v) { // If overflow, set rem.
481  if (r != NULL) { // to an impossible value,
482  *r = (uint64_t)(-1); // and return the largest
483  }
484  return (uint64_t)(-1);
485  } // possible quotient.
486 
487  /* count leading zeros */
488  s = libdivide__count_leading_zeros64(v); // 0 <= s <= 63.
489 
490  v = v << s; // Normalize divisor.
491  vn1 = v >> 32; // Break divisor up into
492  vn0 = v & 0xFFFFFFFF; // two 32-bit digits.
493 
494  un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31));
495  un10 = u0 << s; // Shift dividend left.
496 
497  un1 = un10 >> 32; // Break right half of
498  un0 = un10 & 0xFFFFFFFF; // dividend into two digits.
499 
500  q1 = un64/vn1; // Compute the first
501  rhat = un64 - q1*vn1; // quotient digit, q1.
502 again1:
503  if (q1 >= b || q1*vn0 > b*rhat + un1) {
504  q1 = q1 - 1;
505  rhat = rhat + vn1;
506  if (rhat < b) { goto again1; }
507  }
508 
509  un21 = un64*b + un1 - q1*v; // Multiply and subtract.
510 
511  q0 = un21/vn1; // Compute the second
512  rhat = un21 - q0*vn1; // quotient digit, q0.
513 again2:
514  if (q0 >= b || q0*vn0 > b*rhat + un0) {
515  q0 = q0 - 1;
516  rhat = rhat + vn1;
517  if (rhat < b) { goto again2; }
518  }
519 
520  if (r != NULL) { // If remainder is wanted,
521  *r = (un21*b + un0 - q0*v) >> s; // return it.
522  }
523  return q1*b + q0;
524 }
525 #endif
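/* Cross-check sketch (illustrative, not part of the library): when a native 128 bit
   type is available, the routine above should agree with __int128 division whenever
   the quotient fits in 64 bits (i.e. u1 < v). */
#if HAS_INT128_T
static inline int example_check_128_div_64(uint64_t u1, uint64_t u0, uint64_t v) {
    uint64_t r, q = libdivide_128_div_64_to_64(u1, u0, v, &r);
    __uint128_t n = (((__uint128_t)u1) << 64) | u0;
    return u1 < v && q == (uint64_t)(n / v) && r == (uint64_t)(n % v);
}
#endif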
526 
527 #if LIBDIVIDE_ASSERTIONS_ON
528 #define LIBDIVIDE_ASSERT(x) do { if (! (x)) { fprintf(stderr, "Assertion failure on line %ld: %s\n", (long)__LINE__, #x); exit(-1); } } while (0)
529 #else
530 #define LIBDIVIDE_ASSERT(x)
531 #endif
532 
533 #ifndef LIBDIVIDE_HEADER_ONLY
534 
535 ////////// UINT32
536 
537 struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
538  struct libdivide_u32_t result;
539  if ((d & (d - 1)) == 0) {
540  result.magic = 0;
541  result.more = libdivide__count_trailing_zeros32(d) | LIBDIVIDE_U32_SHIFT_PATH;
542  } else {
543  const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(d);
544 
545  uint8_t more;
546  uint32_t rem, proposed_m;
547  proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem);
548 
549  LIBDIVIDE_ASSERT(rem > 0 && rem < d);
550  const uint32_t e = d - rem;
551 
552  /* This power works if e < 2**floor_log_2_d. */
553  if (e < (1U << floor_log_2_d)) {
554  /* This power works */
555  more = floor_log_2_d;
556  } else {
557  /* We have to use the general 33-bit algorithm. We need to compute (2**power) / d. However, we already have (2**(power-1))/d and its remainder. By doubling both, and then correcting the remainder, we can compute the larger division. */
558  proposed_m += proposed_m; //don't care about overflow here - in fact, we expect it
559  const uint32_t twice_rem = rem + rem;
560  if (twice_rem >= d || twice_rem < rem) { proposed_m += 1; }
561  more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
562  }
563  result.magic = 1 + proposed_m;
564  result.more = more;
565  //result.more's shift should in general be ceil_log_2_d. But if we used the smaller power, we subtract one from the shift because we're using the smaller power. If we're using the larger power, we subtract one from the shift because it's taken care of by the add indicator. So floor_log_2_d happens to be correct in both cases.
566 
567  }
568  return result;
569 }
570 
571 uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
572  uint8_t more = denom->more;
573  if (more & LIBDIVIDE_U32_SHIFT_PATH) {
574  return numer >> (more & LIBDIVIDE_32_SHIFT_MASK);
575  } else {
576  uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
577  if (more & LIBDIVIDE_ADD_MARKER) {
578  uint32_t t = ((numer - q) >> 1) + q;
579  return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
580  } else {
581  return q >> more; //all upper bits are 0 - don't need to mask them off
582  }
583  }
584 }
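/* Worked sketch (hand-computed, for illustration only): for d = 7 the generator above
   takes the general 33-bit branch and produces magic = 0x24924925 and more = 0x42
   (shift 2 with LIBDIVIDE_ADD_MARKER set). libdivide_u32_do then evaluates 21/7 as
   q = mullhi(0x24924925, 21) = 3, t = ((21 - 3) >> 1) + 3 = 12, and 12 >> 2 = 3. */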
585 
586 
587 int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom) {
588  uint8_t more = denom->more;
589  if (more & LIBDIVIDE_U32_SHIFT_PATH) { return 0; }
590  else if (! (more & LIBDIVIDE_ADD_MARKER)) { return 1; }
591  else { return 2; }
592 }
593 
594 uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom) {
595  return numer >> (denom->more & LIBDIVIDE_32_SHIFT_MASK);
596 }
597 
598 uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom) {
599  uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
600  return q >> denom->more;
601 }
602 
603 uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom) {
604  // the divider was generated with the add indicator (LIBDIVIDE_ADD_MARKER) set
605  uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
606  uint32_t t = ((numer - q) >> 1) + q;
607  return t >> (denom->more & LIBDIVIDE_32_SHIFT_MASK);
608 }
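/* Unswitching sketch (illustrative, not part of the library): hoist the algorithm
   dispatch out of a hot loop by branching once on libdivide_u32_get_algorithm() and
   then calling the matching *_do_algN function, which skips the per-element checks. */
static inline void example_divide_array_u32_unswitched(uint32_t *data, size_t count, const struct libdivide_u32_t *fast_d) {
    size_t i;
    switch (libdivide_u32_get_algorithm(fast_d)) {
        case 0:  for (i = 0; i < count; i++) data[i] = libdivide_u32_do_alg0(data[i], fast_d); break;
        case 1:  for (i = 0; i < count; i++) data[i] = libdivide_u32_do_alg1(data[i], fast_d); break;
        default: for (i = 0; i < count; i++) data[i] = libdivide_u32_do_alg2(data[i], fast_d); break;
    }
}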
609 
610 
611 
612 
613 #if LIBDIVIDE_USE_SSE2
614 __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) {
615  uint8_t more = denom->more;
616  if (more & LIBDIVIDE_U32_SHIFT_PATH) {
617  return _mm_srl_epi32(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK));
618  } else {
619  __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic));
620  if (more & LIBDIVIDE_ADD_MARKER) {
621  //uint32_t t = ((numer - q) >> 1) + q;
622  //return t >> denom->shift;
623  __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
624  return _mm_srl_epi32(t, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK));
625 
626  } else {
627  //q >> denom->shift
628  return _mm_srl_epi32(q, libdivide_u32_to_m128i(more));
629  }
630  }
631 }
632 
633 __m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom) {
634  return _mm_srl_epi32(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
635 }
636 
637 __m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom) {
638  __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic));
639  return _mm_srl_epi32(q, libdivide_u32_to_m128i(denom->more));
640 }
641 
642 __m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom) {
643  __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic));
644  __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
645  return _mm_srl_epi32(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
646 }
647 
648 #endif
649 
650 ////////// UINT64
651 
652 struct libdivide_u64_t libdivide_u64_gen(uint64_t d) {
653  struct libdivide_u64_t result;
654  if ((d & (d - 1)) == 0) {
655  result.more = libdivide__count_trailing_zeros64(d) | LIBDIVIDE_U64_SHIFT_PATH;
656  result.magic = 0;
657  } else {
658  const uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(d);
659 
660  uint64_t proposed_m, rem;
661  uint8_t more;
662  proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem); //== (1 << (64 + floor_log_2_d)) / d
663 
664  LIBDIVIDE_ASSERT(rem > 0 && rem < d);
665  const uint64_t e = d - rem;
666 
667  /* This power works if e < 2**floor_log_2_d. */
668  if (e < (1ULL << floor_log_2_d)) {
669  /* This power works */
670  more = floor_log_2_d;
671  } else {
672  /* We have to use the general 65-bit algorithm. We need to compute (2**power) / d. However, we already have (2**(power-1))/d and its remainder. By doubling both, and then correcting the remainder, we can compute the larger division. */
673  proposed_m += proposed_m; //don't care about overflow here - in fact, we expect it
674  const uint64_t twice_rem = rem + rem;
675  if (twice_rem >= d || twice_rem < rem) { proposed_m += 1; }
676  more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
677  }
678  result.magic = 1 + proposed_m;
679  result.more = more;
680  //result.more's shift should in general be ceil_log_2_d. But if we used the smaller power, we subtract one from the shift because we're using the smaller power. If we're using the larger power, we subtract one from the shift because it's taken care of by the add indicator. So floor_log_2_d happens to be correct in both cases, which is why we do it outside of the if statement.
681  }
682  return result;
683 }
684 
685 uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
686  uint8_t more = denom->more;
687  if (more & LIBDIVIDE_U64_SHIFT_PATH) {
688  return numer >> (more & LIBDIVIDE_64_SHIFT_MASK);
689  } else {
690  uint64_t q = libdivide__mullhi_u64(denom->magic, numer);
691  if (more & LIBDIVIDE_ADD_MARKER) {
692  uint64_t t = ((numer - q) >> 1) + q;
693  return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
694  } else {
695  return q >> more; //all upper bits are 0 - don't need to mask them off
696  }
697  }
698 }
699 
700 
701 int libdivide_u64_get_algorithm(const struct libdivide_u64_t *denom) {
702  uint8_t more = denom->more;
703  if (more & LIBDIVIDE_U64_SHIFT_PATH) { return 0; }
704  else if (! (more & LIBDIVIDE_ADD_MARKER)) { return 1; }
705  else { return 2; }
706 }
707 
708 uint64_t libdivide_u64_do_alg0(uint64_t numer, const struct libdivide_u64_t *denom) {
709  return numer >> (denom->more & LIBDIVIDE_64_SHIFT_MASK);
710 }
711 
712 uint64_t libdivide_u64_do_alg1(uint64_t numer, const struct libdivide_u64_t *denom) {
713  uint64_t q = libdivide__mullhi_u64(denom->magic, numer);
714  return q >> denom->more;
715 }
716 
717 uint64_t libdivide_u64_do_alg2(uint64_t numer, const struct libdivide_u64_t *denom) {
718  uint64_t q = libdivide__mullhi_u64(denom->magic, numer);
719  uint64_t t = ((numer - q) >> 1) + q;
720  return t >> (denom->more & LIBDIVIDE_64_SHIFT_MASK);
721 }
722 
723 #if LIBDIVIDE_USE_SSE2
724 __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom) {
725  uint8_t more = denom->more;
726  if (more & LIBDIVIDE_U64_SHIFT_PATH) {
727  return _mm_srl_epi64(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK));
728  } else {
729  __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
730  if (more & LIBDIVIDE_ADD_MARKER) {
731  //uint32_t t = ((numer - q) >> 1) + q;
732  //return t >> denom->shift;
733  __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
734  return _mm_srl_epi64(t, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK));
735  } else {
736  //q >> denom->shift
737  return _mm_srl_epi64(q, libdivide_u32_to_m128i(more));
738  }
739  }
740 }
741 
742 __m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t *denom) {
743  return _mm_srl_epi64(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK));
744 }
745 
746 __m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t *denom) {
747  __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
748  return _mm_srl_epi64(q, libdivide_u32_to_m128i(denom->more));
749 }
750 
751 __m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom) {
752  __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
753  __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
754  return _mm_srl_epi64(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK));
755 }
756 
757 
758 #endif
759 
760 ////////// SINT32
761 
762 
763 static inline int32_t libdivide__mullhi_s32(int32_t x, int32_t y) {
764  int64_t xl = x, yl = y;
765  int64_t rl = xl * yl;
766  return (int32_t)(rl >> 32); //needs to be arithmetic shift
767 }
768 
769 struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
770  struct libdivide_s32_t result;
771 
772  /* If d is a power of 2, or the negative of a power of 2, we have to use a shift. This is especially important because the magic algorithm fails for -1. To check whether d is a power of 2 or its negative, it suffices to check whether its absolute value has exactly one bit set. This works even for INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has exactly one bit set and is a power of 2. */
773  uint32_t absD = (uint32_t)(d < 0 ? -d : d); //gcc optimizes this to the fast abs trick
774  if ((absD & (absD - 1)) == 0) { //check if exactly one bit is set, don't care if absD is 0 since that's divide by zero
775  result.magic = 0;
776  result.more = libdivide__count_trailing_zeros32(absD) | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0) | LIBDIVIDE_S32_SHIFT_PATH;
777  } else {
778  const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(absD);
779  LIBDIVIDE_ASSERT(floor_log_2_d >= 1);
780 
781  uint8_t more;
782  //the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word is 0 and the high word is 2**(floor_log_2_d - 1)
783  uint32_t rem, proposed_m;
784  proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem);
785  const uint32_t e = absD - rem;
786 
787  /* We are going to start with a power of floor_log_2_d - 1. This works if e < 2**floor_log_2_d. */
788  if (e < (1U << floor_log_2_d)) {
789  /* This power works */
790  more = floor_log_2_d - 1;
791  } else {
792  /* We need to go one higher. This should not make proposed_m overflow, but it will make it negative when interpreted as an int32_t. */
793  proposed_m += proposed_m;
794  const uint32_t twice_rem = rem + rem;
795  if (twice_rem >= absD || twice_rem < rem) { proposed_m += 1; }
796  more = floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); //use the general algorithm
797  }
798  proposed_m += 1;
799  result.magic = (d < 0 ? -(int32_t)proposed_m : (int32_t)proposed_m);
800  result.more = more;
801 
802  }
803  return result;
804 }
805 
806 int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
807  uint8_t more = denom->more;
808  if (more & LIBDIVIDE_S32_SHIFT_PATH) {
809  uint8_t shifter = more & LIBDIVIDE_32_SHIFT_MASK;
810  int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1));
811  q = q >> shifter;
812  int32_t shiftMask = (int8_t)more >> 7; //must be arithmetic shift and then sign-extend
813  q = (q ^ shiftMask) - shiftMask;
814  return q;
815  } else {
816  int32_t q = libdivide__mullhi_s32(denom->magic, numer);
817  if (more & LIBDIVIDE_ADD_MARKER) {
818  int32_t sign = (int8_t)more >> 7; //must be arithmetic shift and then sign extend
819  q += ((numer ^ sign) - sign);
820  }
821  q >>= more & LIBDIVIDE_32_SHIFT_MASK;
822  q += (q < 0);
823  return q;
824  }
825 }
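/* Sketch (illustrative, not part of the library): signed dividers truncate toward
   zero exactly like C's / operator, for positive and negative numerators and divisors
   alike (excluding d == 0 and the overflowing INT32_MIN / -1 case). */
static inline int example_check_s32(int32_t n, int32_t d) {
    struct libdivide_s32_t fast_d = libdivide_s32_gen(d);
    return libdivide_s32_do(n, &fast_d) == n / d;
}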
826 
827 int libdivide_s32_get_algorithm(const struct libdivide_s32_t *denom) {
828  uint8_t more = denom->more;
829  int positiveDivisor = ! (more & LIBDIVIDE_NEGATIVE_DIVISOR);
830  if (more & LIBDIVIDE_S32_SHIFT_PATH) { return (positiveDivisor ? 0 : 1); }
831  else if (more & LIBDIVIDE_ADD_MARKER) { return (positiveDivisor ? 2 : 3); }
832  else { return 4; }
833 }
834 
835 int32_t libdivide_s32_do_alg0(int32_t numer, const struct libdivide_s32_t *denom) {
836  uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
837  int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1));
838  return q >> shifter;
839 }
840 
841 int32_t libdivide_s32_do_alg1(int32_t numer, const struct libdivide_s32_t *denom) {
842  uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
843  int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1));
844  return - (q >> shifter);
845 }
846 
847 int32_t libdivide_s32_do_alg2(int32_t numer, const struct libdivide_s32_t *denom) {
848  int32_t q = libdivide__mullhi_s32(denom->magic, numer);
849  q += numer;
850  q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK;
851  q += (q < 0);
852  return q;
853 }
854 
855 int32_t libdivide_s32_do_alg3(int32_t numer, const struct libdivide_s32_t *denom) {
856  int32_t q = libdivide__mullhi_s32(denom->magic, numer);
857  q -= numer;
858  q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK;
859  q += (q < 0);
860  return q;
861 }
862 
863 int32_t libdivide_s32_do_alg4(int32_t numer, const struct libdivide_s32_t *denom) {
864  int32_t q = libdivide__mullhi_s32(denom->magic, numer);
865  q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK;
866  q += (q < 0);
867  return q;
868 }
869 
870 #if LIBDIVIDE_USE_SSE2
871 __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom) {
872  uint8_t more = denom->more;
873  if (more & LIBDIVIDE_S32_SHIFT_PATH) {
874  uint32_t shifter = more & LIBDIVIDE_32_SHIFT_MASK;
875  __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1); //could use _mm_srli_epi32 with an all -1 register
876  __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); //q = numer + ((numer >> 31) & roundToZeroTweak);
877  q = _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)); // q = q >> shifter
878  __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); //set all bits of shift mask = to the sign bit of more
879  q = _mm_sub_epi32(_mm_xor_si128(q, shiftMask), shiftMask); //q = (q ^ shiftMask) - shiftMask;
880  return q;
881  } else {
882  __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
883  if (more & LIBDIVIDE_ADD_MARKER) {
884  __m128i sign = _mm_set1_epi32((int32_t)(int8_t)more >> 7); //must be arithmetic shift
885  q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); // q += ((numer ^ sign) - sign);
886  }
887  q = _mm_sra_epi32(q, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); //q >>= shift
888  q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0)
889  return q;
890  }
891 }
892 
893 __m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom) {
894  uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
895  __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1);
896  __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
897  return _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter));
898 }
899 
900 __m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom) {
901  uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
902  __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1);
903  __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
904  return _mm_sub_epi32(_mm_setzero_si128(), _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)));
905 }
906 
907 __m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom) {
908  __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
909  q = _mm_add_epi32(q, numers);
910  q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
911  q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));
912  return q;
913 }
914 
915 __m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom) {
916  __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
917  q = _mm_sub_epi32(q, numers);
918  q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
919  q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));
920  return q;
921 }
922 
923 __m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom) {
924  __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
925  q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more)); //q >>= shift
926  q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0)
927  return q;
928 }
929 #endif
930 
931 ////////// SINT64
932 
933 
934 struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
935  struct libdivide_s64_t result;
936 
937  /* If d is a power of 2, or the negative of a power of 2, we have to use a shift. This is especially important because the magic algorithm fails for -1. To check whether d is a power of 2 or its negative, it suffices to check whether its absolute value has exactly one bit set. This works even for INT64_MIN, because abs(INT64_MIN) == INT64_MIN, and INT64_MIN has exactly one bit set and is a power of 2. */
938  const uint64_t absD = (uint64_t)(d < 0 ? -d : d); //gcc optimizes this to the fast abs trick
939  if ((absD & (absD - 1)) == 0) { //check if exactly one bit is set, don't care if absD is 0 since that's divide by zero
940  result.more = libdivide__count_trailing_zeros64(absD) | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0);
941  result.magic = 0;
942  } else {
943  const uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(absD);
944 
945  //the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word is 0 and the high word is 2**(floor_log_2_d - 1)
946  uint8_t more;
947  uint64_t rem, proposed_m;
948  proposed_m = libdivide_128_div_64_to_64(1ULL << (floor_log_2_d - 1), 0, absD, &rem);
949  const uint64_t e = absD - rem;
950 
951  /* We are going to start with a power of floor_log_2_d - 1. This works if e < 2**floor_log_2_d. */
952  if (e < (1ULL << floor_log_2_d)) {
953  /* This power works */
954  more = floor_log_2_d - 1;
955  } else {
956  /* We need to go one higher. This should not make proposed_m overflow, but it will make it negative when interpreted as an int64_t. */
957  proposed_m += proposed_m;
958  const uint64_t twice_rem = rem + rem;
959  if (twice_rem >= absD || twice_rem < rem) { proposed_m += 1; }
960  more = floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0);
961  }
962  proposed_m += 1;
963  result.more = more;
964  result.magic = (d < 0 ? -(int64_t)proposed_m : (int64_t)proposed_m);
965  }
966  return result;
967 }
968 
969 int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
970  uint8_t more = denom->more;
971  int64_t magic = denom->magic;
972  if (magic == 0) { //shift path
973  uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK;
974  int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1));
975  q = q >> shifter;
976  int64_t shiftMask = (int8_t)more >> 7; //must be arithmetic shift and then sign-extend
977  q = (q ^ shiftMask) - shiftMask;
978  return q;
979  } else {
980  int64_t q = libdivide__mullhi_s64(magic, numer);
981  if (more & LIBDIVIDE_ADD_MARKER) {
982  int64_t sign = (int8_t)more >> 7; //must be arithmetic shift and then sign extend
983  q += ((numer ^ sign) - sign);
984  }
985  q >>= more & LIBDIVIDE_64_SHIFT_MASK;
986  q += (q < 0);
987  return q;
988  }
989 }
990 
991 
992 int libdivide_s64_get_algorithm(const struct libdivide_s64_t *denom) {
993  uint8_t more = denom->more;
994  int positiveDivisor = ! (more & LIBDIVIDE_NEGATIVE_DIVISOR);
995  if (denom->magic == 0) { return (positiveDivisor ? 0 : 1); } //shift path
996  else if (more & LIBDIVIDE_ADD_MARKER) { return (positiveDivisor ? 2 : 3); }
997  else { return 4; }
998 }
999 
1000 int64_t libdivide_s64_do_alg0(int64_t numer, const struct libdivide_s64_t *denom) {
1001  uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
1002  int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1));
1003  return q >> shifter;
1004 }
1005 
1006 int64_t libdivide_s64_do_alg1(int64_t numer, const struct libdivide_s64_t *denom) {
1007  //shift path (magic == 0) with a negative divisor
1008  uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
1009  int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1));
1010  return - (q >> shifter);
1011 }
1012 
1013 int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivide_s64_t *denom) {
1014  int64_t q = libdivide__mullhi_s64(denom->magic, numer);
1015  q += numer;
1016  q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK;
1017  q += (q < 0);
1018  return q;
1019 }
1020 
1021 int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom) {
1022  int64_t q = libdivide__mullhi_s64(denom->magic, numer);
1023  q -= numer;
1024  q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK;
1025  q += (q < 0);
1026  return q;
1027 }
1028 
1029 int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom) {
1030  int64_t q = libdivide__mullhi_s64(denom->magic, numer);
1031  q >>= denom->more;
1032  q += (q < 0);
1033  return q;
1034 }
1035 
1036 
1037 #if LIBDIVIDE_USE_SSE2
1038 __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom) {
1039  uint8_t more = denom->more;
1040  int64_t magic = denom->magic;
1041  if (magic == 0) { //shift path
1042  uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK;
1043  __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1);
1044  __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); //q = numer + ((numer >> 63) & roundToZeroTweak);
1045  q = libdivide_s64_shift_right_vector(q, shifter); // q = q >> shifter
1046  __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7));
1047  q = _mm_sub_epi64(_mm_xor_si128(q, shiftMask), shiftMask); //q = (q ^ shiftMask) - shiftMask;
1048  return q;
1049  } else {
1050  __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(magic));
1051  if (more & LIBDIVIDE_ADD_MARKER) {
1052  __m128i sign = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); //must be arithmetic shift
1053  q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); // q += ((numer ^ sign) - sign);
1054  }
1055  q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); //q >>= denom->mult_path.shift
1056  q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
1057  return q;
1058  }
1059 }
1060 
1061 __m128i libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t *denom) {
1062  uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
1063  __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1);
1064  __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
1065  q = libdivide_s64_shift_right_vector(q, shifter);
1066  return q;
1067 }
1068 
1069 __m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t *denom) {
1070  uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
1071  __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1);
1072  __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
1073  q = libdivide_s64_shift_right_vector(q, shifter);
1074  return _mm_sub_epi64(_mm_setzero_si128(), q);
1075 }
1076 
1077 __m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t *denom) {
1078  __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
1079  q = _mm_add_epi64(q, numers);
1080  q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK);
1081  q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
1082  return q;
1083 }
1084 
1085 __m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t *denom) {
1086  __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
1087  q = _mm_sub_epi64(q, numers);
1088  q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK);
1089  q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
1090  return q;
1091 }
1092 
1093 __m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t *denom) {
1094  __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
1095  q = libdivide_s64_shift_right_vector(q, denom->more);
1096  q = _mm_add_epi64(q, _mm_srli_epi64(q, 63));
1097  return q;
1098 }
1099 
1100 #endif
1101 
1102 ////////// C++
1103 
1104 #ifdef __cplusplus
1105 
1106 /* The C++ template design here is a total mess. This needs to be fixed by someone better at templates than I. The current design is:
1107 
1108 - The base is a template divider_base that takes the integer type, the libdivide struct, a generating function, a get algorithm function, a do function, and either a do vector function or a dummy int.
1109 - The base has storage for the libdivide struct. This is the only storage (so the C++ class should be no larger than the libdivide struct).
1110 
1111 - Above that, there's divider_mid. This is an empty struct by default, but it is specialized against our four int types. divider_mid contains a template struct algo, that contains a typedef for a specialization of divider_base. struct algo is specialized to take an "algorithm number," where -1 means to use the general algorithm.
1112 
1113 - Publicly we have class divider, which holds a divider_mid::algo::divider as its only member. It also takes an algorithm number, which defaults to -1 (the general algorithm).
1114 - There is a non-member operator / which allows you to use a divider as the divisor in a quotient expression.
1115 
1116 */
1117 
1118 namespace libdivide_internal {
1119 
1120 #if LIBDIVIDE_USE_SSE2
1121 #define MAYBE_VECTOR(x) x
1122 #define MAYBE_VECTOR_PARAM __m128i vector_func(__m128i, const DenomType *)
1123 #else
1124 #define MAYBE_VECTOR(x) 0
1125 #define MAYBE_VECTOR_PARAM int vector_func
1126 #endif
1127 
1128 /* Some bogus unswitch functions for unsigned types so the same (presumably templated) code can work for both signed and unsigned. */
1129 uint32_t crash_u32(uint32_t, const libdivide_u32_t *) { abort(); return *(uint32_t *)NULL; }
1130 uint64_t crash_u64(uint64_t, const libdivide_u64_t *) { abort(); return *(uint64_t *)NULL; }
1131 #if LIBDIVIDE_USE_SSE2
1132 __m128i crash_u32_vector(__m128i, const libdivide_u32_t *) { abort(); return *(__m128i *)NULL; }
1133 __m128i crash_u64_vector(__m128i, const libdivide_u64_t *) { abort(); return *(__m128i *)NULL; }
1134 #endif
1135 
1136 template<typename IntType, typename DenomType, DenomType gen_func(IntType), int get_algo(const DenomType *), IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM>
1137 class divider_base {
1138  public:
1139  DenomType denom;
1140  divider_base(IntType d) : denom(gen_func(d)) { }
1141  divider_base(const DenomType& d) : denom(d) { }
1142 
1143  IntType perform_divide(IntType val) const { return do_func(val, &denom); }
1144 #if LIBDIVIDE_USE_SSE2
1145  __m128i perform_divide_vector(__m128i val) const { return vector_func(val, &denom); }
1146 #endif
1147 
1148  int get_algorithm() const { return get_algo(&denom); }
1149 };
1150 
1151 
1152 template<class T> struct divider_mid { };
1153 
1154 template<> struct divider_mid<uint32_t> {
1155  typedef uint32_t IntType;
1156  typedef struct libdivide_u32_t DenomType;
1157  template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
1158  typedef divider_base<IntType, DenomType, libdivide_u32_gen, libdivide_u32_get_algorithm, do_func, vector_func> divider;
1159  };
1160 
1161  template<int ALGO, int J = 0> struct algo { };
1162  template<int J> struct algo<-1, J> { typedef denom<libdivide_u32_do, MAYBE_VECTOR(libdivide_u32_do_vector)>::divider divider; };
1163  template<int J> struct algo<0, J> { typedef denom<libdivide_u32_do_alg0, MAYBE_VECTOR(libdivide_u32_do_vector_alg0)>::divider divider; };
1164  template<int J> struct algo<1, J> { typedef denom<libdivide_u32_do_alg1, MAYBE_VECTOR(libdivide_u32_do_vector_alg1)>::divider divider; };
1165  template<int J> struct algo<2, J> { typedef denom<libdivide_u32_do_alg2, MAYBE_VECTOR(libdivide_u32_do_vector_alg2)>::divider divider; };
1166 
1167  /* Define two more bogus ones so that the same (templated, presumably) code can handle both signed and unsigned */
1168  template<int J> struct algo<3, J> { typedef denom<crash_u32, MAYBE_VECTOR(crash_u32_vector)>::divider divider; };
1169  template<int J> struct algo<4, J> { typedef denom<crash_u32, MAYBE_VECTOR(crash_u32_vector)>::divider divider; };
1170 
1171 };
1172 
1173 template<> struct divider_mid<int32_t> {
1174  typedef int32_t IntType;
1175  typedef struct libdivide_s32_t DenomType;
1176  template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
1177  typedef divider_base<IntType, DenomType, libdivide_s32_gen, libdivide_s32_get_algorithm, do_func, vector_func> divider;
1178  };
1179 
1180 
1181  template<int ALGO, int J = 0> struct algo { };
1182  template<int J> struct algo<-1, J> { typedef denom<libdivide_s32_do, MAYBE_VECTOR(libdivide_s32_do_vector)>::divider divider; };
1183  template<int J> struct algo<0, J> { typedef denom<libdivide_s32_do_alg0, MAYBE_VECTOR(libdivide_s32_do_vector_alg0)>::divider divider; };
1184  template<int J> struct algo<1, J> { typedef denom<libdivide_s32_do_alg1, MAYBE_VECTOR(libdivide_s32_do_vector_alg1)>::divider divider; };
1185  template<int J> struct algo<2, J> { typedef denom<libdivide_s32_do_alg2, MAYBE_VECTOR(libdivide_s32_do_vector_alg2)>::divider divider; };
1186  template<int J> struct algo<3, J> { typedef denom<libdivide_s32_do_alg3, MAYBE_VECTOR(libdivide_s32_do_vector_alg3)>::divider divider; };
1187  template<int J> struct algo<4, J> { typedef denom<libdivide_s32_do_alg4, MAYBE_VECTOR(libdivide_s32_do_vector_alg4)>::divider divider; };
1188 
1189 };
1190 
1191 template<> struct divider_mid<uint64_t> {
1192  typedef uint64_t IntType;
1193  typedef struct libdivide_u64_t DenomType;
1194  template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
1195  typedef divider_base<IntType, DenomType, libdivide_u64_gen, libdivide_u64_get_algorithm, do_func, vector_func> divider;
1196  };
1197 
1198  template<int ALGO, int J = 0> struct algo { };
1199  template<int J> struct algo<-1, J> { typedef denom<libdivide_u64_do, MAYBE_VECTOR(libdivide_u64_do_vector)>::divider divider; };
1200  template<int J> struct algo<0, J> { typedef denom<libdivide_u64_do_alg0, MAYBE_VECTOR(libdivide_u64_do_vector_alg0)>::divider divider; };
1201  template<int J> struct algo<1, J> { typedef denom<libdivide_u64_do_alg1, MAYBE_VECTOR(libdivide_u64_do_vector_alg1)>::divider divider; };
1202  template<int J> struct algo<2, J> { typedef denom<libdivide_u64_do_alg2, MAYBE_VECTOR(libdivide_u64_do_vector_alg2)>::divider divider; };
1203 
1204  /* Define two more bogus ones so that the same (templated, presumably) code can handle both signed and unsigned */
1205  template<int J> struct algo<3, J> { typedef denom<crash_u64, MAYBE_VECTOR(crash_u64_vector)>::divider divider; };
1206  template<int J> struct algo<4, J> { typedef denom<crash_u64, MAYBE_VECTOR(crash_u64_vector)>::divider divider; };
1207 
1208 
1209 };
1210 
1211 template<> struct divider_mid<int64_t> {
1212  typedef int64_t IntType;
1213  typedef struct libdivide_s64_t DenomType;
1214  template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
1215  typedef divider_base<IntType, DenomType, libdivide_s64_gen, libdivide_s64_get_algorithm, do_func, vector_func> divider;
1216  };
1217 
1218  template<int ALGO, int J = 0> struct algo { };
1219  template<int J> struct algo<-1, J> { typedef denom<libdivide_s64_do, MAYBE_VECTOR(libdivide_s64_do_vector)>::divider divider; };
1220  template<int J> struct algo<0, J> { typedef denom<libdivide_s64_do_alg0, MAYBE_VECTOR(libdivide_s64_do_vector_alg0)>::divider divider; };
1221  template<int J> struct algo<1, J> { typedef denom<libdivide_s64_do_alg1, MAYBE_VECTOR(libdivide_s64_do_vector_alg1)>::divider divider; };
1222  template<int J> struct algo<2, J> { typedef denom<libdivide_s64_do_alg2, MAYBE_VECTOR(libdivide_s64_do_vector_alg2)>::divider divider; };
1223  template<int J> struct algo<3, J> { typedef denom<libdivide_s64_do_alg3, MAYBE_VECTOR(libdivide_s64_do_vector_alg3)>::divider divider; };
1224  template<int J> struct algo<4, J> { typedef denom<libdivide_s64_do_alg4, MAYBE_VECTOR(libdivide_s64_do_vector_alg4)>::divider divider; };
1225 };
1226 
1227 }
1228 
1229 template<typename T, int ALGO = -1>
1230 class divider {
1231  private:
1232  typename libdivide_internal::divider_mid<T>::template algo<ALGO>::divider sub;
1233  template<int NEW_ALGO, typename S> friend divider<S, NEW_ALGO> unswitch(const divider<S, -1> & d);
1234  divider(const typename libdivide_internal::divider_mid<T>::DenomType& denom) : sub(denom) { }
1235 
1236  public:
1237 
1238  /* Ordinary constructor, that takes the divisor as a parameter. */
1239  divider(T n) : sub(n) { }
1240 
1241  /* Default constructor, that divides by 1 */
1242  divider() : sub(1) { }
1243 
1244  /* Divides the parameter by the divisor, returning the quotient */
1245  T perform_divide(T val) const { return sub.perform_divide(val); }
1246 
1247 #if LIBDIVIDE_USE_SSE2
1248  /* Treats the vector as either two or four packed values (depending on the size), and divides each of them by the divisor, returning the packed quotients. */
1249  __m128i perform_divide_vector(__m128i val) const { return sub.perform_divide_vector(val); }
1250 #endif
1251 
1252  /* Returns the index of algorithm, for use in the unswitch function */
1253  int get_algorithm() const { return sub.get_algorithm(); } // returns the algorithm for unswitching
1254 
1255  /* operator== */
1256  bool operator==(const divider<T, ALGO>& him) const { return sub.denom.magic == him.sub.denom.magic && sub.denom.more == him.sub.denom.more; }
1257 
1258  bool operator!=(const divider<T, ALGO>& him) const { return ! (*this == him); }
1259 };
1260 
1261 /* Returns a divider specialized for the given algorithm. */
1262 template<int NEW_ALGO, typename S>
1263 divider<S, NEW_ALGO> unswitch(const divider<S, -1> & d) { return divider<S, NEW_ALGO>(d.sub.denom); }
1264 
1265 /* Overload of the / operator for scalar division. */
1266 template<typename int_type, int ALGO>
1267 int_type operator/(int_type numer, const divider<int_type, ALGO>& denom) {
1268  return denom.perform_divide(numer);
1269 }
1270 
1271 #if LIBDIVIDE_USE_SSE2
1272 /* Overload of the / operator for vector division. */
1273 template<typename int_type, int ALGO>
1274 __m128i operator/(__m128i numer, const divider<int_type, ALGO>& denom) {
1275  return denom.perform_divide_vector(numer);
1276 }
1277 #endif
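// Usage sketch (illustrative, not part of the library): construct a divider once,
// then use the overloaded / operator in place of hardware division. unswitch<> can
// further specialize on the algorithm reported by get_algorithm(). The example_*
// name is made up for illustration.
inline void example_divide_all(uint32_t *data, size_t count, uint32_t d) {
    const divider<uint32_t> fast_d(d);          // precomputes the magic numbers once
    for (size_t i = 0; i < count; i++) {
        data[i] = data[i] / fast_d;             // same result as data[i] / d
    }
}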
1278 
1279 
1280 #endif //__cplusplus
1281 
1282 #endif //LIBDIVIDE_HEADER_ONLY
1283 #ifdef __cplusplus
1284 } //close namespace libdivide
1285 } //close anonymous namespace
1286 #endif