|
8 | 8 | *
|
9 | 9 | * src/include/port/simd.h
|
10 | 10 | *
|
| 11 | + * NOTES |
| 12 | + * - VectorN in this file refers to a register where the element operands |
| 13 | + * are N bits wide. The vector width is platform-specific, so users that care |
| 14 | + * about that will need to inspect "sizeof(VectorN)". |
| 15 | + * |
11 | 16 | *-------------------------------------------------------------------------
|
12 | 17 | */
|
13 | 18 | #ifndef SIMD_H
|
14 | 19 | #define SIMD_H
|
15 | 20 |
|
| 21 | +#if (defined(__x86_64__) || defined(_M_AMD64)) |
16 | 22 | /*
|
17 | 23 | * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
|
18 | 24 | * that compilers targeting this architecture understand SSE2 intrinsics.
|
|
22 | 28 | * will allow the use of intrinsics that haven't been enabled at compile
|
23 | 29 | * time.
|
24 | 30 | */
|
25 |
| -#if (defined(__x86_64__) || defined(_M_AMD64)) |
26 | 31 | #include <emmintrin.h>
|
27 | 32 | #define USE_SSE2
|
| 33 | +typedef __m128i Vector8; |
| 34 | + |
| 35 | +#else |
| 36 | +/* |
| 37 | + * If no SIMD instructions are available, we can in some cases emulate vector |
| 38 | + * operations using bitwise operations on unsigned integers. |
| 39 | + */ |
| 40 | +#define USE_NO_SIMD |
| 41 | +typedef uint64 Vector8; |
| 42 | +#endif |
| 43 | + |
| 44 | + |
| 45 | +/* load/store operations */ |
| 46 | +static inline void vector8_load(Vector8 *v, const uint8 *s); |
| 47 | + |
| 48 | +/* assignment operations */ |
| 49 | +static inline Vector8 vector8_broadcast(const uint8 c); |
| 50 | + |
| 51 | +/* element-wise comparisons to a scalar */ |
| 52 | +static inline bool vector8_has(const Vector8 v, const uint8 c); |
| 53 | +static inline bool vector8_has_zero(const Vector8 v); |
| 54 | +static inline bool vector8_has_le(const Vector8 v, const uint8 c); |
| 55 | + |
| 56 | + |
| 57 | +/* |
| 58 | + * Load a chunk of memory into the given vector. |
| 59 | + */ |
| 60 | +static inline void |
| 61 | +vector8_load(Vector8 *v, const uint8 *s) |
| 62 | +{ |
| 63 | +#if defined(USE_SSE2) |
| 64 | + *v = _mm_loadu_si128((const __m128i *) s); |
| 65 | +#else |
| 66 | + memcpy(v, s, sizeof(Vector8)); |
28 | 67 | #endif
|
| 68 | +} |
| 69 | + |
| 70 | + |
| 71 | +/* |
| 72 | + * Create a vector with all elements set to the same value. |
| 73 | + */ |
| 74 | +static inline Vector8 |
| 75 | +vector8_broadcast(const uint8 c) |
| 76 | +{ |
| 77 | +#if defined(USE_SSE2) |
| 78 | + return _mm_set1_epi8(c); |
| 79 | +#else |
| 80 | + return ~UINT64CONST(0) / 0xFF * c; |
| 81 | +#endif |
| 82 | +} |
| 83 | + |
| 84 | +/* |
| 85 | + * Return true if any elements in the vector are equal to the given scalar. |
| 86 | + */ |
| 87 | +static inline bool |
| 88 | +vector8_has(const Vector8 v, const uint8 c) |
| 89 | +{ |
| 90 | + bool result; |
| 91 | + |
| 92 | + /* pre-compute the result for assert checking */ |
| 93 | +#ifdef USE_ASSERT_CHECKING |
| 94 | + bool assert_result = false; |
| 95 | + |
| 96 | + for (int i = 0; i < sizeof(Vector8); i++) |
| 97 | + { |
| 98 | + if (((const uint8 *) &v)[i] == c) |
| 99 | + { |
| 100 | + assert_result = true; |
| 101 | + break; |
| 102 | + } |
| 103 | + } |
| 104 | +#endif /* USE_ASSERT_CHECKING */ |
| 105 | + |
| 106 | +#if defined(USE_NO_SIMD) |
| 107 | + /* any bytes in v equal to c will evaluate to zero via XOR */ |
| 108 | + result = vector8_has_zero(v ^ vector8_broadcast(c)); |
| 109 | +#elif defined(USE_SSE2) |
| 110 | + result = _mm_movemask_epi8(_mm_cmpeq_epi8(v, vector8_broadcast(c))); |
| 111 | +#endif |
| 112 | + |
| 113 | + Assert(assert_result == result); |
| 114 | + return result; |
| 115 | +} |
| 116 | + |
| 117 | +/* |
| 118 | + * Convenience function equivalent to vector8_has(v, 0) |
| 119 | + */ |
| 120 | +static inline bool |
| 121 | +vector8_has_zero(const Vector8 v) |
| 122 | +{ |
| 123 | +#if defined(USE_NO_SIMD) |
| 124 | + /* |
| 125 | + * We cannot call vector8_has() here, because that would lead to a circular |
| 126 | + * definition. |
| 127 | + */ |
| 128 | + return vector8_has_le(v, 0); |
| 129 | +#elif defined(USE_SSE2) |
| 130 | + return vector8_has(v, 0); |
| 131 | +#endif |
| 132 | +} |
| 133 | + |
| 134 | +/* |
| 135 | + * Return true if any elements in the vector are less than or equal to the |
| 136 | + * given scalar. |
| 137 | + */ |
| 138 | +static inline bool |
| 139 | +vector8_has_le(const Vector8 v, const uint8 c) |
| 140 | +{ |
| 141 | + bool result = false; |
| 142 | +#if defined(USE_SSE2) |
| 143 | + __m128i sub; |
| 144 | +#endif |
| 145 | + |
| 146 | + /* pre-compute the result for assert checking */ |
| 147 | +#ifdef USE_ASSERT_CHECKING |
| 148 | + bool assert_result = false; |
| 149 | + |
| 150 | + for (int i = 0; i < sizeof(Vector8); i++) |
| 151 | + { |
| 152 | + if (((const uint8 *) &v)[i] <= c) |
| 153 | + { |
| 154 | + assert_result = true; |
| 155 | + break; |
| 156 | + } |
| 157 | + } |
| 158 | +#endif /* USE_ASSERT_CHECKING */ |
| 159 | + |
| 160 | +#if defined(USE_NO_SIMD) |
| 161 | + |
| 162 | + /* |
| 163 | + * To find bytes <= c, we can use bitwise operations to find bytes < c+1, |
| 164 | + * but it only works if c+1 <= 128 and if the highest bit in v is not set. |
| 165 | + * Adapted from |
| 166 | + * https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord |
| 167 | + */ |
| 168 | + if ((int64) v >= 0 && c < 0x80) |
| 169 | + result = (v - vector8_broadcast(c + 1)) & ~v & vector8_broadcast(0x80); |
| 170 | + else |
| 171 | + { |
| 172 | + /* one byte at a time */ |
| 173 | + for (int i = 0; i < sizeof(Vector8); i++) |
| 174 | + { |
| 175 | + if (((const uint8 *) &v)[i] <= c) |
| 176 | + { |
| 177 | + result = true; |
| 178 | + break; |
| 179 | + } |
| 180 | + } |
| 181 | + } |
| 182 | +#elif defined(USE_SSE2) |
| 183 | + |
| 184 | + /* |
| 185 | + * Use saturating subtraction to find bytes <= c, which will present as |
| 186 | + * NUL bytes in 'sub'. |
| 187 | + */ |
| 188 | + sub = _mm_subs_epu8(v, vector8_broadcast(c)); |
| 189 | + result = vector8_has_zero(sub); |
| 190 | +#endif |
| 191 | + |
| 192 | + Assert(assert_result == result); |
| 193 | + return result; |
| 194 | +} |
29 | 195 |
|
30 | 196 | #endif /* SIMD_H */
|
0 commit comments