33
33
typedef __m128i Vector8 ;
34
34
typedef __m128i Vector32 ;
35
35
36
+ #elif defined(__aarch64__ ) && defined(__ARM_NEON )
37
+ /*
38
+ * We use the Neon instructions if the compiler provides access to them (as
39
+ * indicated by __ARM_NEON) and we are on aarch64. While Neon support is
40
+ * technically optional for aarch64, it appears that all available 64-bit
41
+ * hardware does have it. Neon exists in some 32-bit hardware too, but we
42
+ * could not realistically use it there without a run-time check, which seems
43
+ * not worth the trouble for now.
44
+ */
45
+ #include <arm_neon.h>
46
+ #define USE_NEON
47
+ typedef uint8x16_t Vector8 ;
48
+ typedef uint32x4_t Vector32 ;
49
+
36
50
#else
37
51
/*
38
52
* If no SIMD instructions are available, we can in some cases emulate vector
@@ -90,6 +104,8 @@ vector8_load(Vector8 *v, const uint8 *s)
90
104
{
91
105
#if defined(USE_SSE2 )
92
106
* v = _mm_loadu_si128 ((const __m128i * ) s );
107
+ #elif defined(USE_NEON )
108
+ * v = vld1q_u8 (s );
93
109
#else
94
110
memcpy (v , s , sizeof (Vector8 ));
95
111
#endif
@@ -101,6 +117,8 @@ vector32_load(Vector32 *v, const uint32 *s)
101
117
{
102
118
#ifdef USE_SSE2
103
119
* v = _mm_loadu_si128 ((const __m128i * ) s );
120
+ #elif defined(USE_NEON )
121
+ * v = vld1q_u32 (s );
104
122
#endif
105
123
}
106
124
#endif /* ! USE_NO_SIMD */
@@ -113,6 +131,8 @@ vector8_broadcast(const uint8 c)
113
131
{
114
132
#if defined(USE_SSE2 )
115
133
return _mm_set1_epi8 (c );
134
+ #elif defined(USE_NEON )
135
+ return vdupq_n_u8 (c );
116
136
#else
117
137
return ~UINT64CONST (0 ) / 0xFF * c ;
118
138
#endif
@@ -124,6 +144,8 @@ vector32_broadcast(const uint32 c)
124
144
{
125
145
#ifdef USE_SSE2
126
146
return _mm_set1_epi32 (c );
147
+ #elif defined(USE_NEON )
148
+ return vdupq_n_u32 (c );
127
149
#endif
128
150
}
129
151
#endif /* ! USE_NO_SIMD */
@@ -153,7 +175,7 @@ vector8_has(const Vector8 v, const uint8 c)
153
175
#if defined(USE_NO_SIMD )
154
176
/* any bytes in v equal to c will evaluate to zero via XOR */
155
177
result = vector8_has_zero (v ^ vector8_broadcast (c ));
156
- #elif defined( USE_SSE2 )
178
+ #else
157
179
result = vector8_is_highbit_set (vector8_eq (v , vector8_broadcast (c )));
158
180
#endif
159
181
@@ -173,7 +195,7 @@ vector8_has_zero(const Vector8 v)
173
195
* circular definition.
174
196
*/
175
197
return vector8_has_le (v , 0 );
176
- #elif defined( USE_SSE2 )
198
+ #else
177
199
return vector8_has (v , 0 );
178
200
#endif
179
201
}
@@ -223,7 +245,7 @@ vector8_has_le(const Vector8 v, const uint8 c)
223
245
}
224
246
}
225
247
}
226
- #elif defined( USE_SSE2 )
248
+ #else
227
249
228
250
/*
229
251
* Use saturating subtraction to find bytes <= c, which will present as
@@ -245,6 +267,8 @@ vector8_is_highbit_set(const Vector8 v)
245
267
{
246
268
#ifdef USE_SSE2
247
269
return _mm_movemask_epi8 (v ) != 0 ;
270
+ #elif defined(USE_NEON )
271
+ return vmaxvq_u8 (v ) > 0x7F ;
248
272
#else
249
273
return v & vector8_broadcast (0x80 );
250
274
#endif
@@ -258,6 +282,8 @@ vector8_or(const Vector8 v1, const Vector8 v2)
258
282
{
259
283
#ifdef USE_SSE2
260
284
return _mm_or_si128 (v1 , v2 );
285
+ #elif defined(USE_NEON )
286
+ return vorrq_u8 (v1 , v2 );
261
287
#else
262
288
return v1 | v2 ;
263
289
#endif
@@ -269,6 +295,8 @@ vector32_or(const Vector32 v1, const Vector32 v2)
269
295
{
270
296
#ifdef USE_SSE2
271
297
return _mm_or_si128 (v1 , v2 );
298
+ #elif defined(USE_NEON )
299
+ return vorrq_u32 (v1 , v2 );
272
300
#endif
273
301
}
274
302
#endif /* ! USE_NO_SIMD */
@@ -285,6 +313,8 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
285
313
{
286
314
#ifdef USE_SSE2
287
315
return _mm_subs_epu8 (v1 , v2 );
316
+ #elif defined(USE_NEON )
317
+ return vqsubq_u8 (v1 , v2 );
288
318
#endif
289
319
}
290
320
#endif /* ! USE_NO_SIMD */
@@ -299,6 +329,8 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
299
329
{
300
330
#ifdef USE_SSE2
301
331
return _mm_cmpeq_epi8 (v1 , v2 );
332
+ #elif defined(USE_NEON )
333
+ return vceqq_u8 (v1 , v2 );
302
334
#endif
303
335
}
304
336
#endif /* ! USE_NO_SIMD */
@@ -309,6 +341,8 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
309
341
{
310
342
#ifdef USE_SSE2
311
343
return _mm_cmpeq_epi32 (v1 , v2 );
344
+ #elif defined(USE_NEON )
345
+ return vceqq_u32 (v1 , v2 );
312
346
#endif
313
347
}
314
348
#endif /* ! USE_NO_SIMD */
0 commit comments