@@ -79,6 +79,7 @@ static inline bool vector8_has_le(const Vector8 v, const uint8 c);
 static inline bool vector8_is_highbit_set(const Vector8 v);
 #ifndef USE_NO_SIMD
 static inline bool vector32_is_highbit_set(const Vector32 v);
+static inline uint32 vector8_highbit_mask(const Vector8 v);
 #endif
 
 /* arithmetic operations */
@@ -96,6 +97,7 @@ static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
  */
 #ifndef USE_NO_SIMD
 static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
+static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2);
 static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
 #endif
 
@@ -299,6 +301,36 @@ vector32_is_highbit_set(const Vector32 v)
 }
 #endif /* ! USE_NO_SIMD */
 
+/*
+ * Return a bitmask formed from the high-bit of each element.
+ */
+#ifndef USE_NO_SIMD
+static inline uint32
+vector8_highbit_mask(const Vector8 v)
+{
+#ifdef USE_SSE2
+	return (uint32) _mm_movemask_epi8(v);
+#elif defined(USE_NEON)
+	/*
+	 * Note: It would be faster to use vget_lane_u64 and vshrn_n_u16, but that
+	 * returns a uint64, making it inconvenient to combine mask values from
+	 * multiple vectors.
+	 */
+	static const uint8 mask[16] = {
+		1 << 0, 1 << 1, 1 << 2, 1 << 3,
+		1 << 4, 1 << 5, 1 << 6, 1 << 7,
+		1 << 0, 1 << 1, 1 << 2, 1 << 3,
+		1 << 4, 1 << 5, 1 << 6, 1 << 7,
+	};
+
+	uint8x16_t	masked = vandq_u8(vld1q_u8(mask), (uint8x16_t) vshrq_n_s8(v, 7));
+	uint8x16_t	maskedhi = vextq_u8(masked, masked, 8);
+
+	return (uint32) vaddvq_u16((uint16x8_t) vzip1q_u8(masked, maskedhi));
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
 /*
  * Return the bitwise OR of the inputs
  */
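The note in the NEON branch above refers to a known AArch64 shortcut: vshrn_n_u16 narrows a byte-wise comparison result into a 64-bit mask carrying four bits per byte. A minimal sketch of that alternative follows, assuming every input byte is 0x00 or 0xFF (as produced by vector8_eq); the name vector8_highbit_mask64 is hypothetical and not part of this commit:

#ifdef USE_NEON
/* Hypothetical sketch only; assumes each byte of v is 0x00 or 0xFF. */
static inline uint64
vector8_highbit_mask64(const Vector8 v)
{
	/*
	 * Shift each 16-bit lane right by 4 and narrow it to 8 bits, turning
	 * every 0x00/0xFF input byte into a 0x0/0xF nibble of the result.
	 */
	uint8x8_t	nibbles = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);

	return vget_lane_u64(vreinterpret_u64_u8(nibbles), 0);
}
#endif

This is the trade-off the in-tree comment describes: a uint32 with one bit per byte is easy to combine across several vectors, whereas the four-bits-per-byte uint64 is not.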
@@ -372,4 +404,19 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
 }
 #endif /* ! USE_NO_SIMD */
 
+/*
+ * Given two vectors, return a vector with the minimum element of each.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_min(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+	return _mm_min_epu8(v1, v2);
+#elif defined(USE_NEON)
+	return vminq_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
 #endif /* SIMD_H */
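As a hedged illustration of how the two new primitives compose (not part of this commit): since min(x, c) == x exactly when x <= c, vector8_min can be paired with vector8_eq and vector8_highbit_mask to test whether any byte of a vector is less than or equal to a constant. vector8_broadcast is assumed to be the broadcast helper already present in this header; the name any_byte_le is hypothetical:

#ifndef USE_NO_SIMD
/* Hypothetical sketch only. */
static inline bool
any_byte_le(const Vector8 v, const uint8 c)
{
	/* bytes where v <= c satisfy min(v, c) == v */
	Vector8		cmp = vector8_eq(vector8_min(v, vector8_broadcast(c)), v);

	return vector8_highbit_mask(cmp) != 0;
}
#endif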