@@ -80,6 +80,51 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
80
80
return false;
81
81
}
82
82
83
+ #ifndef USE_NO_SIMD
84
+ /*
85
+ * pg_lfind32_simd_helper
86
+ *
87
+ * Searches one 4-register-block of integers. The caller is responsible for
88
+ * ensuring that there are at least 4-registers-worth of integers remaining.
89
+ */
90
+ static inline bool
91
+ pg_lfind32_simd_helper (const Vector32 keys , uint32 * base )
92
+ {
93
+ const uint32 nelem_per_vector = sizeof (Vector32 ) / sizeof (uint32 );
94
+ Vector32 vals1 ,
95
+ vals2 ,
96
+ vals3 ,
97
+ vals4 ,
98
+ result1 ,
99
+ result2 ,
100
+ result3 ,
101
+ result4 ,
102
+ tmp1 ,
103
+ tmp2 ,
104
+ result ;
105
+
106
+ /* load the next block into 4 registers */
107
+ vector32_load (& vals1 , base );
108
+ vector32_load (& vals2 , & base [nelem_per_vector ]);
109
+ vector32_load (& vals3 , & base [nelem_per_vector * 2 ]);
110
+ vector32_load (& vals4 , & base [nelem_per_vector * 3 ]);
111
+
112
+ /* compare each value to the key */
113
+ result1 = vector32_eq (keys , vals1 );
114
+ result2 = vector32_eq (keys , vals2 );
115
+ result3 = vector32_eq (keys , vals3 );
116
+ result4 = vector32_eq (keys , vals4 );
117
+
118
+ /* combine the results into a single variable */
119
+ tmp1 = vector32_or (result1 , result2 );
120
+ tmp2 = vector32_or (result3 , result4 );
121
+ result = vector32_or (tmp1 , tmp2 );
122
+
123
+ /* return whether there was a match */
124
+ return vector32_is_highbit_set (result );
125
+ }
126
+ #endif /* ! USE_NO_SIMD */
127
+
83
128
/*
84
129
* pg_lfind32
85
130
*
@@ -95,8 +140,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
95
140
96
141
/*
97
142
* For better instruction-level parallelism, each loop iteration operates
98
- * on a block of four registers. Testing for SSE2 has showed this is ~40%
99
- * faster than using a block of two registers.
143
+ * on a block of four registers.
100
144
*/
101
145
const Vector32 keys = vector32_broadcast (key ); /* load copies of key */
102
146
const uint32 nelem_per_vector = sizeof (Vector32 ) / sizeof (uint32 );
@@ -109,57 +153,51 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
109
153
bool assert_result = false;
110
154
111
155
/* pre-compute the result for assert checking */
112
- for (i = 0 ; i < nelem ; i ++ )
156
+ for (int j = 0 ; j < nelem ; j ++ )
113
157
{
114
- if (key == base [i ])
158
+ if (key == base [j ])
115
159
{
116
160
assert_result = true;
117
161
break ;
118
162
}
119
163
}
120
164
#endif
121
165
122
- for (i = 0 ; i < tail_idx ; i += nelem_per_iteration )
166
+ /*
167
+ * If there aren't enough elements for the SIMD code, jump to the standard
168
+ * one-by-one linear search code.
169
+ */
170
+ if (nelem < nelem_per_iteration )
171
+ goto one_by_one ;
172
+
173
+ /*
174
+ * Process as many elements as possible with a block of 4 registers.
175
+ */
176
+ do
123
177
{
124
- Vector32 vals1 ,
125
- vals2 ,
126
- vals3 ,
127
- vals4 ,
128
- result1 ,
129
- result2 ,
130
- result3 ,
131
- result4 ,
132
- tmp1 ,
133
- tmp2 ,
134
- result ;
135
-
136
- /* load the next block into 4 registers */
137
- vector32_load (& vals1 , & base [i ]);
138
- vector32_load (& vals2 , & base [i + nelem_per_vector ]);
139
- vector32_load (& vals3 , & base [i + nelem_per_vector * 2 ]);
140
- vector32_load (& vals4 , & base [i + nelem_per_vector * 3 ]);
141
-
142
- /* compare each value to the key */
143
- result1 = vector32_eq (keys , vals1 );
144
- result2 = vector32_eq (keys , vals2 );
145
- result3 = vector32_eq (keys , vals3 );
146
- result4 = vector32_eq (keys , vals4 );
147
-
148
- /* combine the results into a single variable */
149
- tmp1 = vector32_or (result1 , result2 );
150
- tmp2 = vector32_or (result3 , result4 );
151
- result = vector32_or (tmp1 , tmp2 );
152
-
153
- /* see if there was a match */
154
- if (vector32_is_highbit_set (result ))
178
+ if (pg_lfind32_simd_helper (keys , & base [i ]))
155
179
{
156
180
Assert (assert_result == true);
157
181
return true;
158
182
}
159
- }
183
+
184
+ i += nelem_per_iteration ;
185
+
186
+ } while (i < tail_idx );
187
+
188
+ /*
189
+ * Process the last 'nelem_per_iteration' elements in the array with a
190
+ * 4-register block. This will cause us to check a subset of the elements
191
+ * more than once, but that won't affect correctness, and testing has
192
+ * demonstrated that this helps more cases than it harms.
193
+ */
194
+ Assert (assert_result == pg_lfind32_simd_helper (keys , & base [nelem - nelem_per_iteration ]));
195
+ return pg_lfind32_simd_helper (keys , & base [nelem - nelem_per_iteration ]);
196
+
160
197
#endif /* ! USE_NO_SIMD */
161
198
162
- /* Process the remaining elements one at a time. */
199
+ one_by_one :
200
+ /* Process the elements one at a time. */
163
201
for (; i < nelem ; i ++ )
164
202
{
165
203
if (key == base [i ])
0 commit comments