// Fragment: the enclosing signature, braces and any return statement are not
// visible here. NOTE(review): presumably the body of sse2_find_max_val32 --
// confirm against the full file.
//
// Folds the four 32-bit lanes stored at `address` together with bitwise OR.
// After the final store, the first 32-bit element at `address` holds the OR of
// all four original elements. This is an OR, not an arithmetic maximum.
48 __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
// 0xEE selects lanes [2,3,2,3]: lanes 2 and 3 are moved onto lanes 0 and 1.
49 x1 = _mm_shuffle_epi32(x0, 0xEE);
// lane 0 = e0|e2, lane 1 = e1|e3
50 x0 = _mm_or_si128(x0, x1);
// 0x55 broadcasts lane 1 (e1|e3) to every lane.
51 x1 = _mm_shuffle_epi32(x0, 0x55);
// lane 0 = e0|e1|e2|e3
52 x0 = _mm_or_si128(x0, x1);
// All 16 bytes are written back, so the other three elements at `address`
// are overwritten with partial OR results as well.
53 _mm_storeu_si128((__m128i*)address, x0);
// Fragment: the enclosing signature, braces and any return statement are not
// visible here. NOTE(review): presumably the body of sse2_find_max_val64 --
// confirm against the full file.
//
// ORs the upper 64 bits of the 16 bytes at `address` into the lower 64 bits.
// After the store, the first 64-bit element holds the OR of both original
// 64-bit elements; the second element is left with its original value.
65 __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
// 0xEE selects 32-bit lanes [2,3,2,3], i.e. copies the upper 64-bit half
// into both halves.
66 x1 = _mm_shuffle_epi32(x0, 0xEE);
67 x0 = _mm_or_si128(x0, x1);
68 _mm_storeu_si128((__m128i*)address, x0);
79 float delta_inv,
ui32 count,
ui32* max_val)
// Fragment: the start of the parameter list (which must supply sp, dp and
// K_max, all used below) and the braces are not visible here.
// NOTE(review): presumably sse2_rev_tx_to_cb32 -- confirm.
//
// Converts 32-bit two's-complement integers read from `sp` into a
// sign-magnitude form written to `dp`: bit 31 carries the sign and the
// magnitude is shifted left by (31 - K_max). The shifted magnitudes (without
// the sign bit) are OR-ed into the 16 bytes at `max_val`.
// `delta_inv` is not referenced by any visible line.
//
// Each iteration loads and stores a full 16 bytes, so when `count` is not a
// multiple of 4 the last iteration touches elements beyond `count`.
// NOTE(review): presumably the buffers are padded for this -- confirm.
84 ui32 shift = 31 - K_max;
// m0: only bit 31 set in every 32-bit lane (sign-bit mask).
85 __m128i m0 = _mm_set1_epi32(INT_MIN);
86 __m128i zero = _mm_setzero_si128();
87 __m128i one = _mm_set1_epi32(1);
// Start from the accumulator already stored at max_val (16 bytes are read).
88 __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
89 __m128i *p = (__m128i*)sp;
// Four 32-bit samples per iteration.
90 for (
ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
92 __m128i v = _mm_loadu_si128(p);
// sign: all-ones in lanes where v < 0, zero elsewhere.
93 __m128i sign = _mm_cmplt_epi32(v, zero);
// (v ^ sign) + (sign & 1) negates the negative lanes, yielding |v|.
94 __m128i val = _mm_xor_si128(v, sign);
95 __m128i ones = _mm_and_si128(sign, one);
96 val = _mm_add_epi32(val, ones);
// Reduce the mask to just the sign bit.
97 sign = _mm_and_si128(sign, m0);
98 val = _mm_slli_epi32(val, (
int)shift);
// Accumulate before the sign bit is merged, so tmax holds magnitudes only.
99 tmax = _mm_or_si128(tmax, val);
100 val = _mm_or_si128(val, sign);
101 _mm_storeu_si128((__m128i*)dp, val);
// Write the per-lane accumulator back; all 16 bytes at max_val are updated.
103 _mm_storeu_si128((__m128i*)max_val, tmax);
108 float delta_inv,
ui32 count,
ui32* max_val)
// Fragment: the start of the parameter list (which must supply sp and dp,
// both used below) and the braces are not visible here.
// NOTE(review): presumably sse2_irv_tx_to_cb32 -- confirm.
//
// Multiplies each float read from `sp` by `delta_inv`, converts it to a
// 32-bit integer, and writes it to `dp` in sign-magnitude form (sign in
// bit 31, magnitude unshifted). The magnitudes are OR-ed into the 16 bytes
// at `max_val`.
//
// Each iteration loads and stores a full 16 bytes, so when `count` is not a
// multiple of 4 the last iteration touches elements beyond `count`.
// NOTE(review): presumably the buffers are padded for this -- confirm.
114 __m128 d = _mm_set1_ps(delta_inv);
115 __m128i zero = _mm_setzero_si128();
116 __m128i one = _mm_set1_epi32(1);
// Start from the accumulator already stored at max_val (16 bytes are read).
117 __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
118 float *p = (
float*)sp;
// Four floats per iteration.
119 for (
ui32 i = 0; i < count; i += 4, p += 4, dp += 4)
121 __m128 vf = _mm_loadu_ps(p);
122 vf = _mm_mul_ps(vf, d);
// _mm_cvtps_epi32 rounds according to the current MXCSR rounding mode.
123 __m128i val = _mm_cvtps_epi32(vf);
// sign: all-ones in lanes where the converted value is negative.
124 __m128i sign = _mm_cmplt_epi32(val, zero);
// (val ^ sign) + (sign & 1) negates the negative lanes, yielding |val|.
125 val = _mm_xor_si128(val, sign);
126 __m128i ones = _mm_and_si128(sign, one);
127 val = _mm_add_epi32(val, ones);
// Accumulate before the sign bit is merged, so tmax holds magnitudes only.
128 tmax = _mm_or_si128(tmax, val);
// Shifting the all-ones mask left by 31 leaves only bit 31 set.
129 sign = _mm_slli_epi32(sign, 31);
130 val = _mm_or_si128(val, sign);
131 _mm_storeu_si128((__m128i*)dp, val);
// Write the per-lane accumulator back; all 16 bytes at max_val are updated.
133 _mm_storeu_si128((__m128i*)max_val, tmax);
138 float delta,
ui32 count)
// Fragment: the start of the parameter list (which must supply sp and K_max,
// both used below), the declaration of `p`, and the braces are not visible
// here. NOTE(review): presumably sse2_rev_tx_from_cb32, with `p` derived
// from the destination pointer -- confirm.
//
// Converts sign-magnitude 32-bit values read from `sp` (sign in bit 31,
// magnitude in the lower bits) into two's-complement integers written to
// `p`, after shifting the magnitude right by (31 - K_max).
// `delta` is not referenced by any visible line.
//
// Each iteration loads and stores a full 16 bytes, so when `count` is not a
// multiple of 4 the last iteration touches elements beyond `count`.
// NOTE(review): presumably the buffers are padded for this -- confirm.
141 ui32 shift = 31 - K_max;
// m1: every bit except bit 31 set in each 32-bit lane (magnitude mask).
142 __m128i m1 = _mm_set1_epi32(INT_MAX);
143 __m128i zero = _mm_setzero_si128();
144 __m128i one = _mm_set1_epi32(1);
// Four 32-bit samples per iteration.
146 for (
ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
// Aligned load: _mm_load_si128 requires `sp` to be 16-byte aligned.
148 __m128i v = _mm_load_si128((__m128i*)sp);
// Drop the sign bit, then undo the left shift applied on the forward path.
149 __m128i val = _mm_and_si128(v, m1);
150 val = _mm_srli_epi32(val, (
int)shift);
// Bit 31 set makes the lane negative as a signed int, so this compare
// yields an all-ones mask exactly for lanes whose sign bit is set.
151 __m128i sign = _mm_cmplt_epi32(v, zero);
// (val ^ sign) + (sign & 1) negates the lanes flagged as negative.
152 val = _mm_xor_si128(val, sign);
153 __m128i ones = _mm_and_si128(sign, one);
154 val = _mm_add_epi32(val, ones);
// Unaligned store to the destination.
155 _mm_storeu_si128((__m128i*)p, val);
161 float delta,
ui32 count)
// Fragment: the start of the parameter list (which must supply sp and dp,
// both used below) and the braces are not visible here.
// NOTE(review): presumably sse2_irv_tx_from_cb32 -- confirm.
//
// Converts sign-magnitude 32-bit values read from `sp` (sign in bit 31) into
// floats written through `dp`: the magnitude is converted to float, scaled
// by `delta`, and the original sign bit is then copied into the float's
// sign bit.
//
// Each iteration loads and stores a full 16 bytes, so when `count` is not a
// multiple of 4 the last iteration touches elements beyond `count`.
// NOTE(review): presumably the buffers are padded for this -- confirm.
// m1: every bit except bit 31 set in each 32-bit lane (magnitude mask).
164 __m128i m1 = _mm_set1_epi32(INT_MAX);
165 __m128 d = _mm_set1_ps(delta);
166 float *p = (
float*)dp;
// Four samples per iteration.
167 for (
ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
// Aligned load: _mm_load_si128 requires `sp` to be 16-byte aligned.
169 __m128i v = _mm_load_si128((__m128i*)sp);
// Magnitude only; non-negative once bit 31 is cleared.
170 __m128i vali = _mm_and_si128(v, m1);
171 __m128 valf = _mm_cvtepi32_ps(vali);
172 valf = _mm_mul_ps(valf, d);
// (~m1) & v keeps only bit 31 of the input, i.e. the sign bit.
173 __m128i sign = _mm_andnot_si128(m1, v);
// OR-ing bit 31 into the float sets its sign bit without touching the rest.
174 valf = _mm_or_ps(valf, _mm_castsi128_ps(sign));
175 _mm_storeu_ps(p, valf);
181 float delta_inv,
ui32 count,
ui64* max_val)
// Fragment: the start of the parameter list (which must supply sp, dp and
// K_max, all used below) and the braces are not visible here.
// NOTE(review): presumably sse2_rev_tx_to_cb64 -- confirm.
//
// 64-bit counterpart of the 32-bit sign-magnitude conversion: reads 64-bit
// two's-complement integers from `sp` and writes them to `dp` with the sign
// in bit 63 and the magnitude shifted left by (63 - K_max). The shifted
// magnitudes (without the sign bit) are OR-ed into the 16 bytes at `max_val`.
// `delta_inv` is not referenced by any visible line.
//
// Each iteration loads and stores a full 16 bytes (two 64-bit elements), so
// when `count` is odd the last iteration touches one element beyond `count`.
// NOTE(review): presumably the buffers are padded for this -- confirm.
186 ui32 shift = 63 - K_max;
// m0: only bit 63 set in each 64-bit lane (sign-bit mask).
187 __m128i m0 = _mm_set1_epi64x(LLONG_MIN);
188 __m128i zero = _mm_setzero_si128();
189 __m128i one = _mm_set1_epi64x(1);
// Start from the accumulator already stored at max_val (16 bytes are read).
190 __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
191 __m128i *p = (__m128i*)sp;
// Two 64-bit samples per iteration.
192 for (
ui32 i = 0; i < count; i += 2, p += 1, dp += 2)
194 __m128i v = _mm_loadu_si128(p);
// 64-bit "less than zero" built from a 32-bit compare: the compare result
// of each upper dword (which holds the sign) is copied over the whole
// 64-bit lane. 0xF5 selects 32-bit lanes [1,1,3,3].
195 __m128i sign = _mm_cmplt_epi32(v, zero);
196 sign = _mm_shuffle_epi32(sign, 0xF5);
// (v ^ sign) + (sign & 1) negates the negative lanes, yielding |v|.
197 __m128i val = _mm_xor_si128(v, sign);
198 __m128i ones = _mm_and_si128(sign, one);
199 val = _mm_add_epi64(val, ones);
// Reduce the mask to just the sign bit.
200 sign = _mm_and_si128(sign, m0);
201 val = _mm_slli_epi64(val, (
int)shift);
// Accumulate before the sign bit is merged, so tmax holds magnitudes only.
202 tmax = _mm_or_si128(tmax, val);
203 val = _mm_or_si128(val, sign);
204 _mm_storeu_si128((__m128i*)dp, val);
// Write the per-lane accumulator back; all 16 bytes at max_val are updated.
206 _mm_storeu_si128((__m128i*)max_val, tmax);
211 float delta,
ui32 count)
// Fragment: the start of the parameter list (which must supply sp and K_max,
// both used below), the declaration of `p`, and the braces are not visible
// here. NOTE(review): presumably sse2_rev_tx_from_cb64, with `p` derived
// from the destination pointer -- confirm.
//
// Converts sign-magnitude 64-bit values read from `sp` (sign in bit 63) into
// two's-complement integers written to `p`, after shifting the magnitude
// right by (63 - K_max).
// `delta` is not referenced by any visible line.
//
// Each iteration loads and stores a full 16 bytes (two 64-bit elements), so
// when `count` is odd the last iteration touches one element beyond `count`.
// NOTE(review): presumably the buffers are padded for this -- confirm.
214 ui32 shift = 63 - K_max;
// m1: every bit except bit 63 set in each 64-bit lane (magnitude mask).
215 __m128i m1 = _mm_set1_epi64x(LLONG_MAX);
216 __m128i zero = _mm_setzero_si128();
217 __m128i one = _mm_set1_epi64x(1);
// Two 64-bit samples per iteration.
219 for (
ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
// Aligned load: _mm_load_si128 requires `sp` to be 16-byte aligned.
221 __m128i v = _mm_load_si128((__m128i*)sp);
// Drop the sign bit, then undo the left shift applied on the forward path.
222 __m128i val = _mm_and_si128(v, m1);
223 val = _mm_srli_epi64(val, (
int)shift);
// 64-bit sign test built from a 32-bit compare: the result for each upper
// dword (which holds bit 63) is copied over the whole 64-bit lane.
// 0xF5 selects 32-bit lanes [1,1,3,3].
224 __m128i sign = _mm_cmplt_epi32(v, zero);
225 sign = _mm_shuffle_epi32(sign, 0xF5);
// (val ^ sign) + (sign & 1) negates the lanes flagged as negative.
226 val = _mm_xor_si128(val, sign);
227 __m128i ones = _mm_and_si128(sign, one);
228 val = _mm_add_epi64(val, ones);
// Unaligned store to the destination.
229 _mm_storeu_si128((__m128i*)p, val);
void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
ui32 sse2_find_max_val32(ui32 *address)
ui64 sse2_find_max_val64(ui64 *address)
void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, float delta_inv, ui32 count, ui64 *max_val)
void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)