__m128i x = _mm_srli_epi64(a, amt);
x = _mm_xor_si128(x, m);
__m128i result = _mm_sub_epi64(x, m);
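// Note (assumption, consistent with v2 = _mm_set1_epi64x(1ULL << (63 - 2))
// used by the callers below): m is expected to be
// _mm_set1_epi64x(1ULL << (63 - amt)).  SSE2 has no 64-bit arithmetic
// right shift, so the logical shift followed by xor/subtract against the
// shifted sign-bit mask restores the sign bits.  This excerpt also assumes
// <emmintrin.h>, <climits> (ULLONG_MAX) and OpenJPH's si32/si64/ui32/
// line_buf types are in scope.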
t = _mm_cmplt_epi32(a, zero);
t = _mm_unpacklo_epi32(a, t);
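// Sign-extends the low two 32-bit lanes of a to 64 bits: the compare
// yields an all-ones word for negative lanes (zero otherwise), and the
// interleave pairs each value with its sign word.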
t = _mm_cmplt_epi32(a, zero);
t = _mm_unpackhi_epi32(a, t);
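// Same idea as above, but for the high two 32-bit lanes.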
const ui32 src_line_offset,
const ui32 dst_line_offset,
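// The fragments below appear to belong to sse2_rev_convert (its full
// signature is listed at the end of this excerpt).  First path: 32-bit
// source to 32-bit destination; each sample just receives the constant
// shift.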
const si32 *sp = src_line->i32 + src_line_offset;
si32 *dp = dst_line->i32 + dst_line_offset;
__m128i sh = _mm_set1_epi32((si32)shift);
for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
{
  __m128i s = _mm_loadu_si128((__m128i*)sp);
  s = _mm_add_epi32(s, sh);
  _mm_storeu_si128((__m128i*)dp, s);
}
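// Second path (32-bit source, 64-bit destination): each group of four
// 32-bit samples is sign-extended to 64 bits before the shift is added;
// the low pair is stored at dp and the high pair at dp + 1.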
const si32 *sp = src_line->i32 + src_line_offset;
si64 *dp = dst_line->i64 + dst_line_offset;
__m128i zero = _mm_setzero_si128();
__m128i sh = _mm_set1_epi64x(shift);
for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
{
  __m128i s, t;
  s = _mm_loadu_si128((__m128i*)sp);

  t = sse2_cvtlo_epi32_epi64(s, zero);
  t = _mm_add_epi64(t, sh);
  _mm_storeu_si128((__m128i*)dp, t);

  t = sse2_cvthi_epi32_epi64(s, zero);
  t = _mm_add_epi64(t, sh);
  _mm_storeu_si128((__m128i*)dp + 1, t);
}
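// Third path (64-bit source, 32-bit destination): the shift is added in
// 64 bits, then the low 32 bits of each lane are packed back down; the
// shuffle/and keeps the first pair in the lower half of the result and
// the shuffle/andnot places the second pair in the upper half.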
const si64 *sp = src_line->i64 + src_line_offset;
si32 *dp = dst_line->i32 + dst_line_offset;
__m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
__m128i sh = _mm_set1_epi64x(shift);
for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
{
  __m128i s, t;
  s = _mm_loadu_si128((__m128i*)sp);
  s = _mm_add_epi64(s, sh);

  t = _mm_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0));
  t = _mm_and_si128(low_bits, t);

  s = _mm_loadu_si128((__m128i*)sp + 1);
  s = _mm_add_epi64(s, sh);

  s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0));
  s = _mm_andnot_si128(low_bits, s);

  t = _mm_or_si128(s, t);
  _mm_storeu_si128((__m128i*)dp, t);
}
const ui32 src_line_offset,
const ui32 dst_line_offset,
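// The fragments below appear to belong to sse2_rev_convert_nlt_type3
// (nonlinearity type 3): negative samples s are mapped to (-shift) - s
// while non-negative samples pass through.  First path: 32-bit source to
// 32-bit destination.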
const si32 *sp = src_line->i32 + src_line_offset;
si32 *dp = dst_line->i32 + dst_line_offset;
__m128i sh = _mm_set1_epi32((si32)(-shift));
__m128i zero = _mm_setzero_si128();
for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
{
  __m128i s = _mm_loadu_si128((__m128i*)sp);
  __m128i c = _mm_cmplt_epi32(s, zero);
  __m128i v_m_sh = _mm_sub_epi32(sh, s);
  v_m_sh = _mm_and_si128(c, v_m_sh);
  s = _mm_andnot_si128(c, s);
  s = _mm_or_si128(s, v_m_sh);
  _mm_storeu_si128((__m128i*)dp, s);
}
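// Second path (32-bit source, 64-bit destination): the samples are
// sign-extended with the unpack trick; unpacking the comparison result
// with itself widens the negative mask to 64 bits per lane, so the same
// and/andnot/or blend works on the widened values.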
const si32 *sp = src_line->i32 + src_line_offset;
si64 *dp = dst_line->i64 + dst_line_offset;
__m128i sh = _mm_set1_epi64x(-shift);
__m128i zero = _mm_setzero_si128();
for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
{
  __m128i s, t, u, c, v_m_sh;
  s = _mm_loadu_si128((__m128i*)sp);

  t = _mm_cmplt_epi32(s, zero);
  u = _mm_unpacklo_epi32(s, t);
  c = _mm_unpacklo_epi32(t, t);

  v_m_sh = _mm_sub_epi64(sh, u);
  v_m_sh = _mm_and_si128(c, v_m_sh);
  u = _mm_andnot_si128(c, u);
  u = _mm_or_si128(u, v_m_sh);

  _mm_storeu_si128((__m128i*)dp, u);

  u = _mm_unpackhi_epi32(s, t);
  c = _mm_unpackhi_epi32(t, t);

  v_m_sh = _mm_sub_epi64(sh, u);
  v_m_sh = _mm_and_si128(c, v_m_sh);
  u = _mm_andnot_si128(c, u);
  u = _mm_or_si128(u, v_m_sh);

  _mm_storeu_si128((__m128i*)dp + 1, u);
}
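// Third path (64-bit source, 32-bit destination): the compare is done on
// 32-bit lanes, then _MM_SHUFFLE(3, 3, 1, 1) replicates the sign word of
// each 64-bit value across its lane to form the negative mask; after the
// conditional negation, the results are packed back to 32 bits with
// half_mask, as in sse2_rev_convert above.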
const si64 *sp = src_line->i64 + src_line_offset;
si32 *dp = dst_line->i32 + dst_line_offset;
__m128i sh = _mm_set1_epi64x(-shift);
__m128i zero = _mm_setzero_si128();
__m128i half_mask = _mm_set_epi64x(0, (si64)ULLONG_MAX);
for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
{
  __m128i s, t, p, n, m, tm;
  s = _mm_loadu_si128((__m128i*)sp);

  tm = _mm_cmplt_epi32(s, zero);
  m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1));
  tm = _mm_sub_epi64(sh, s);
  n = _mm_and_si128(m, tm);
  p = _mm_andnot_si128(m, s);
  tm = _mm_or_si128(n, p);
  tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0));
  t = _mm_and_si128(half_mask, tm);

  s = _mm_loadu_si128((__m128i*)sp + 1);
  tm = _mm_cmplt_epi32(s, zero);
  m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1));
  tm = _mm_sub_epi64(sh, s);
  n = _mm_and_si128(m, tm);
  p = _mm_andnot_si128(m, s);
  tm = _mm_or_si128(n, p);
  tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0));
  tm = _mm_andnot_si128(half_mask, tm);

  t = _mm_or_si128(t, tm);
  _mm_storeu_si128((__m128i*)dp, t);
}
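// The next fragment is presumably sse2_cnvrt_float_to_si32_shftd (the
// 0.5 offset matches the "_shftd" variant): it forces the MXCSR rounding
// mode to nearest so _mm_cvtps_epi32 rounds to the nearest integer
// regardless of the caller's setting, converts (sample + 0.5) * mul, and
// then restores the caller's rounding mode.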
uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
__m128 shift = _mm_set1_ps(0.5f);
__m128 m = _mm_set1_ps(mul);
for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
{
  __m128 t = _mm_loadu_ps(sp);
  __m128 s = _mm_add_ps(t, shift);
  s = _mm_mul_ps(s, m);
  _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
}
_MM_SET_ROUNDING_MODE(rounding_mode);
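// The unshifted variant (presumably sse2_cnvrt_float_to_si32) is the
// same except that no 0.5 offset is added before the multiply.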
uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
__m128 m = _mm_set1_ps(mul);
for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
{
  __m128 t = _mm_loadu_ps(sp);
  __m128 s = _mm_mul_ps(t, m);
  _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
}
_MM_SET_ROUNDING_MODE(rounding_mode);
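// Forward reversible color transform (sse2_rct_forward), 32-bit path:
// Y = (R + 2G + B) >> 2, Cb = B - G, Cr = R - G, four samples at a time.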
for (int i = (repeat + 3) >> 2; i > 0; --i)
{
  __m128i mr = _mm_load_si128((__m128i*)rp);
  __m128i mg = _mm_load_si128((__m128i*)gp);
  __m128i mb = _mm_load_si128((__m128i*)bp);
  __m128i t = _mm_add_epi32(mr, mb);
  t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1));
  _mm_store_si128((__m128i*)yp, _mm_srai_epi32(t, 2));
  t = _mm_sub_epi32(mb, mg);
  _mm_store_si128((__m128i*)cbp, t);
  t = _mm_sub_epi32(mr, mg);
  _mm_store_si128((__m128i*)crp, t);

  rp += 4; gp += 4; bp += 4;
  yp += 4; cbp += 4; crp += 4;
}
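// Forward RCT, 64-bit output path: the 32-bit RGB samples are widened to
// 64 bits (low half first, then the high half), and the >> 2 for Y uses
// the emulated 64-bit arithmetic shift with mask v2 = 1ULL << (63 - 2).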
__m128i zero = _mm_setzero_si128();
__m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2));
for (int i = (repeat + 3) >> 2; i > 0; --i)
{
  __m128i mr32 = _mm_load_si128((__m128i*)rp);
  __m128i mg32 = _mm_load_si128((__m128i*)gp);
  __m128i mb32 = _mm_load_si128((__m128i*)bp);
  __m128i mr, mg, mb, t;

  mr = sse2_cvtlo_epi32_epi64(mr32, zero);
  mg = sse2_cvtlo_epi32_epi64(mg32, zero);
  mb = sse2_cvtlo_epi32_epi64(mb32, zero);

  t = _mm_add_epi64(mr, mb);
  t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));
  _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2));
  t = _mm_sub_epi64(mb, mg);
  _mm_store_si128((__m128i*)cbp, t);
  t = _mm_sub_epi64(mr, mg);
  _mm_store_si128((__m128i*)crp, t);

  yp += 2; cbp += 2; crp += 2;

  mr = sse2_cvthi_epi32_epi64(mr32, zero);
  mg = sse2_cvthi_epi32_epi64(mg32, zero);
  mb = sse2_cvthi_epi32_epi64(mb32, zero);

  t = _mm_add_epi64(mr, mb);
  t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));
  _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2));
  t = _mm_sub_epi64(mb, mg);
  _mm_store_si128((__m128i*)cbp, t);
  t = _mm_sub_epi64(mr, mg);
  _mm_store_si128((__m128i*)crp, t);

  rp += 4; gp += 4; bp += 4;
  yp += 2; cbp += 2; crp += 2;
}
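// Inverse reversible color transform (sse2_rct_backward), 32-bit path:
// G = Y - ((Cb + Cr) >> 2), B = Cb + G, R = Cr + G.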
for (int i = (repeat + 3) >> 2; i > 0; --i)
{
  __m128i my = _mm_load_si128((__m128i*)yp);
  __m128i mcb = _mm_load_si128((__m128i*)cbp);
  __m128i mcr = _mm_load_si128((__m128i*)crp);

  __m128i t = _mm_add_epi32(mcb, mcr);
  t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2));
  _mm_store_si128((__m128i*)gp, t);
  __m128i u = _mm_add_epi32(mcb, t);
  _mm_store_si128((__m128i*)bp, u);
  u = _mm_add_epi32(mcr, t);
  _mm_store_si128((__m128i*)rp, u);

  yp += 4; cbp += 4; crp += 4;
  rp += 4; gp += 4; bp += 4;
}
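// Inverse RCT, 64-bit input path: two pairs of 64-bit Y/Cb/Cr values are
// processed per iteration, the G computation reuses the emulated 64-bit
// arithmetic shift (mask v2), and the 64-bit results are packed back to
// four 32-bit samples with low_bits, as in sse2_rev_convert.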
__m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2));
__m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
for (int i = (repeat + 3) >> 2; i > 0; --i)
{
  __m128i my, mcb, mcr, tr, tg, tb;
  __m128i mr, mg, mb;
  my = _mm_load_si128((__m128i*)yp);
  mcb = _mm_load_si128((__m128i*)cbp);
  mcr = _mm_load_si128((__m128i*)crp);

  tg = _mm_add_epi64(mcb, mcr);
  tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2));
  tb = _mm_add_epi64(mcb, tg);
  tr = _mm_add_epi64(mcr, tg);

  mr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0));
  mr = _mm_and_si128(low_bits, mr);
  mg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0));
  mg = _mm_and_si128(low_bits, mg);
  mb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0));
  mb = _mm_and_si128(low_bits, mb);

  yp += 2; cbp += 2; crp += 2;

  my = _mm_load_si128((__m128i*)yp);
  mcb = _mm_load_si128((__m128i*)cbp);
  mcr = _mm_load_si128((__m128i*)crp);

  tg = _mm_add_epi64(mcb, mcr);
  tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2));
  tb = _mm_add_epi64(mcb, tg);
  tr = _mm_add_epi64(mcr, tg);

  tr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0));
  tr = _mm_andnot_si128(low_bits, tr);
  mr = _mm_or_si128(mr, tr);
  tg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0));
  tg = _mm_andnot_si128(low_bits, tg);
  mg = _mm_or_si128(mg, tg);
  tb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0));
  tb = _mm_andnot_si128(low_bits, tb);
  mb = _mm_or_si128(mb, tb);

  _mm_store_si128((__m128i*)rp, mr);
  _mm_store_si128((__m128i*)gp, mg);
  _mm_store_si128((__m128i*)bp, mb);

  yp += 2; cbp += 2; crp += 2;
  rp += 4; gp += 4; bp += 4;
}
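// Full signatures of the functions excerpted above: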
static __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m);
static __m128i sse2_cvtlo_epi32_epi64(__m128i a, __m128i zero);
static __m128i sse2_cvthi_epi32_epi64(__m128i a, __m128i zero);
void sse2_rev_convert(const line_buf *src_line, const ui32 src_line_offset,
                      line_buf *dst_line, const ui32 dst_line_offset,
                      si64 shift, ui32 width);
void sse2_rev_convert_nlt_type3(const line_buf *src_line,
                                const ui32 src_line_offset,
                                line_buf *dst_line,
                                const ui32 dst_line_offset,
                                si64 shift, ui32 width);
void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
                                    ui32 width);
void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
                              ui32 width);
void sse2_rct_forward(const line_buf *r, const line_buf *g, const line_buf *b,
                      line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat);
void sse2_rct_backward(const line_buf *y, const line_buf *cb,
                       const line_buf *cr, line_buf *r, line_buf *g,
                       line_buf *b, ui32 repeat);