#include <wasm_simd128.h>
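// WASM 128-bit SIMD implementations of OpenJPH's colour transforms and sample
// conversions: the reversible (RCT) and irreversible (ICT) transforms in both
// directions, plus integer <-> float conversions.  si32, si64, ui32, line_buf
// and the ALPHA_*/BETA_*/GAMMA_* weights come from OpenJPH's own headers.
// Every loop rounds the sample count up to a multiple of four
// ((width + 3) >> 2 or (repeat + 3) >> 2) and issues full 128-bit loads and
// stores, so the line buffers are assumed to have capacity for a multiple of
// four samples.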
// Reversible-path sample copy: adds a constant shift while copying a line,
// widening or narrowing between 32- and 64-bit samples as dictated by the
// source and destination line_buf types.
void wasm_rev_convert(const line_buf *src_line,
                      const ui32 src_line_offset,
                      line_buf *dst_line,
                      const ui32 dst_line_offset,
                      si64 shift, ui32 width)
{
  if (src_line->flags & line_buf::LFT_32BIT)
  {
    if (dst_line->flags & line_buf::LFT_32BIT)
    {
      // 32-bit -> 32-bit: four samples per iteration
      const si32 *sp = src_line->i32 + src_line_offset;
      si32 *dp = dst_line->i32 + dst_line_offset;
      v128_t sh = wasm_i32x4_splat((si32)shift);
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        v128_t s = wasm_v128_load(sp);
        s = wasm_i32x4_add(s, sh);
        wasm_v128_store(dp, s);
      }
    }
    else
    {
      // 32-bit -> 64-bit: sign-extend the low and high halves separately
      const si32 *sp = src_line->i32 + src_line_offset;
      si64 *dp = dst_line->i64 + dst_line_offset;
      v128_t sh = wasm_i64x2_splat(shift);
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        v128_t s, t;
        s = wasm_v128_load(sp);

        t = wasm_i64x2_extend_low_i32x4(s);
        t = wasm_i64x2_add(t, sh);
        wasm_v128_store(dp, t);

        t = wasm_i64x2_extend_high_i32x4(s);
        t = wasm_i64x2_add(t, sh);
        wasm_v128_store(dp + 2, t);
      }
    }
  }
  else
  {
    // 64-bit -> 32-bit: add the shift, then pack the low 32 bits of each
    // 64-bit lane back into one i32x4 vector
    const si64 *sp = src_line->i64 + src_line_offset;
    si32 *dp = dst_line->i32 + dst_line_offset;
    v128_t sh = wasm_i64x2_splat(shift);
    for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
    {
      v128_t s0, s1;
      s0 = wasm_v128_load(sp);
      s0 = wasm_i64x2_add(s0, sh);
      s1 = wasm_v128_load(sp + 2);
      s1 = wasm_i64x2_add(s1, sh);
      s0 = wasm_i32x4_shuffle(s0, s1, 0, 2, 4 + 0, 4 + 2);
      wasm_v128_store(dp, s0);
    }
  }
}
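// The next routine implements the reversible NLT type-3 (sign-magnitude style)
// conversion: non-negative samples pass through unchanged, while a negative
// sample x is replaced by (-shift - x).  The loops do this branchlessly:
// build an all-ones mask for the negative lanes, compute both candidates, and
// blend them with and/andnot/or.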
void wasm_rev_convert_nlt_type3(const line_buf *src_line,
                                const ui32 src_line_offset,
                                line_buf *dst_line,
                                const ui32 dst_line_offset,
                                si64 shift, ui32 width)
{
  if (src_line->flags & line_buf::LFT_32BIT)
  {
    if (dst_line->flags & line_buf::LFT_32BIT)
    {
      const si32 *sp = src_line->i32 + src_line_offset;
      si32 *dp = dst_line->i32 + dst_line_offset;
      v128_t sh = wasm_i32x4_splat((si32)(-shift));
      v128_t zero = wasm_i32x4_splat(0);
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        v128_t s = wasm_v128_load(sp);
        v128_t c = wasm_i32x4_lt(s, zero);      // all-ones for negative lanes
        v128_t v_m_sh = wasm_i32x4_sub(sh, s);  // -shift - value
        v_m_sh = wasm_v128_and(c, v_m_sh);      // keep it in negative lanes only
        s = wasm_v128_andnot(s, c);             // keep value in non-negative lanes
        s = wasm_v128_or(s, v_m_sh);            // blend the two halves
        wasm_v128_store(dp, s);
      }
    }
    else
    {
      const si32 *sp = src_line->i32 + src_line_offset;
      si64 *dp = dst_line->i64 + dst_line_offset;
      v128_t sh = wasm_i64x2_splat(-shift);
      v128_t zero = wasm_i32x4_splat(0);
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        v128_t s, u, c, v_m_sh;
        s = wasm_v128_load(sp);

        // low two samples, widened to 64 bits
        u = wasm_i64x2_extend_low_i32x4(s);
        c = wasm_i64x2_lt(u, zero);
        v_m_sh = wasm_i64x2_sub(sh, u);
        v_m_sh = wasm_v128_and(c, v_m_sh);
        u = wasm_v128_andnot(u, c);
        u = wasm_v128_or(u, v_m_sh);
        wasm_v128_store(dp, u);

        // high two samples
        u = wasm_i64x2_extend_high_i32x4(s);
        c = wasm_i64x2_lt(u, zero);
        v_m_sh = wasm_i64x2_sub(sh, u);
        v_m_sh = wasm_v128_and(c, v_m_sh);
        u = wasm_v128_andnot(u, c);
        u = wasm_v128_or(u, v_m_sh);
        wasm_v128_store(dp + 2, u);
      }
    }
  }
  else
  {
    const si64 *sp = src_line->i64 + src_line_offset;
    si32 *dp = dst_line->i32 + dst_line_offset;
    v128_t sh = wasm_i64x2_splat(-shift);
    v128_t zero = wasm_i32x4_splat(0);
    for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
    {
      v128_t s, t0, t1, p, n, m, tm;
      s = wasm_v128_load(sp);
      m = wasm_i64x2_lt(s, zero);    // mask of negative lanes
      tm = wasm_i64x2_sub(sh, s);    // -shift - value
      n = wasm_v128_and(m, tm);      // negative lanes
      p = wasm_v128_andnot(s, m);    // non-negative lanes
      t0 = wasm_v128_or(n, p);

      s = wasm_v128_load(sp + 2);
      m = wasm_i64x2_lt(s, zero);
      tm = wasm_i64x2_sub(sh, s);
      n = wasm_v128_and(m, tm);
      p = wasm_v128_andnot(s, m);
      t1 = wasm_v128_or(n, p);

      // narrow back to 32 bits by picking the low word of each 64-bit lane
      t0 = wasm_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2);
      wasm_v128_store(dp, t0);
    }
  }
}
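// The four helpers below convert between si32 samples and floats for the
// irreversible path.  The "_shftd" variants also handle the 0.5 DC offset:
// si32 -> float subtracts 0.5 after scaling by mul, and float -> si32 adds it
// back before scaling.  Both float -> si32 variants add a 0.5 bias before the
// saturating truncation (wasm_i32x4_trunc_sat_f32x4).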
void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
                                    ui32 width)
{
  v128_t shift = wasm_f32x4_splat(0.5f);
  v128_t m = wasm_f32x4_splat(mul);
  for (ui32 i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    v128_t t = wasm_v128_load(sp);
    v128_t s = wasm_f32x4_convert_i32x4(t);
    s = wasm_f32x4_mul(s, m);
    s = wasm_f32x4_sub(s, shift);   // remove the 0.5 DC offset
    wasm_v128_store(dp, s);
  }
}
void wasm_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, ui32 width)
{
  v128_t m = wasm_f32x4_splat(mul);
  for (ui32 i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    v128_t t = wasm_v128_load(sp);
    v128_t s = wasm_f32x4_convert_i32x4(t);
    s = wasm_f32x4_mul(s, m);
    wasm_v128_store(dp, s);
  }
}
void wasm_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
                                    ui32 width)
{
  v128_t shift = wasm_f32x4_splat(0.5f);
  v128_t m = wasm_f32x4_splat(mul);
  for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    v128_t t = wasm_v128_load(sp);
    v128_t s = wasm_f32x4_add(t, shift);   // restore the 0.5 DC offset
    s = wasm_f32x4_mul(s, m);
    s = wasm_f32x4_add(s, shift);          // bias before truncation
    wasm_v128_store(dp, wasm_i32x4_trunc_sat_f32x4(s));
  }
}
void wasm_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width)
{
  v128_t shift = wasm_f32x4_splat(0.5f);
  v128_t m = wasm_f32x4_splat(mul);
  for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    v128_t t = wasm_v128_load(sp);
    v128_t s = wasm_f32x4_mul(t, m);
    s = wasm_f32x4_add(s, shift);          // bias before truncation
    wasm_v128_store(dp, wasm_i32x4_trunc_sat_f32x4(s));
  }
}
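// Reversible colour transform (RCT), forward direction, four samples per
// iteration:
//   Y = (R + 2G + B) >> 2,  Cb = B - G,  Cr = R - G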
void wasm_rct_forward(const line_buf *r, const line_buf *g, const line_buf *b,
                      line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat)
{
  if (y->flags & line_buf::LFT_32BIT)
  {
    // 32-bit output samples
    const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
    si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
    for (int i = (repeat + 3) >> 2; i > 0; --i)
    {
      v128_t mr = wasm_v128_load(rp);
      v128_t mg = wasm_v128_load(gp);
      v128_t mb = wasm_v128_load(bp);
      v128_t t = wasm_i32x4_add(mr, mb);
      t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1));
      wasm_v128_store(yp, wasm_i32x4_shr(t, 2));   // Y  = (R + 2G + B) >> 2
      t = wasm_i32x4_sub(mb, mg);
      wasm_v128_store(cbp, t);                     // Cb = B - G
      t = wasm_i32x4_sub(mr, mg);
      wasm_v128_store(crp, t);                     // Cr = R - G

      rp += 4; gp += 4; bp += 4;
      yp += 4; cbp += 4; crp += 4;
    }
  }
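  // When the output lines are 64-bit, the 32-bit colour samples are widened
  // with extend_low/extend_high and the same arithmetic runs in i64x2 lanes,
  // two samples per store.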
  else
  {
    // 64-bit output samples
    const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
    si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
    for (int i = (repeat + 3) >> 2; i > 0; --i)
    {
      v128_t mr32 = wasm_v128_load(rp);
      v128_t mg32 = wasm_v128_load(gp);
      v128_t mb32 = wasm_v128_load(bp);
      v128_t mr, mg, mb, t;

      // low two samples
      mr = wasm_i64x2_extend_low_i32x4(mr32);
      mg = wasm_i64x2_extend_low_i32x4(mg32);
      mb = wasm_i64x2_extend_low_i32x4(mb32);
      t = wasm_i64x2_add(mr, mb);
      t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1));
      wasm_v128_store(yp, wasm_i64x2_shr(t, 2));
      t = wasm_i64x2_sub(mb, mg);
      wasm_v128_store(cbp, t);
      t = wasm_i64x2_sub(mr, mg);
      wasm_v128_store(crp, t);

      yp += 2; cbp += 2; crp += 2;

      // high two samples
      mr = wasm_i64x2_extend_high_i32x4(mr32);
      mg = wasm_i64x2_extend_high_i32x4(mg32);
      mb = wasm_i64x2_extend_high_i32x4(mb32);
      t = wasm_i64x2_add(mr, mb);
      t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1));
      wasm_v128_store(yp, wasm_i64x2_shr(t, 2));
      t = wasm_i64x2_sub(mb, mg);
      wasm_v128_store(cbp, t);
      t = wasm_i64x2_sub(mr, mg);
      wasm_v128_store(crp, t);

      rp += 4; gp += 4; bp += 4;
      yp += 2; cbp += 2; crp += 2;
    }
  }
}
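// Reversible colour transform, backward direction (exact integer inverse):
//   G = Y - ((Cb + Cr) >> 2),  B = Cb + G,  R = Cr + G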
void wasm_rct_backward(const line_buf *y, const line_buf *cb,
                       const line_buf *cr, line_buf *r, line_buf *g,
                       line_buf *b, ui32 repeat)
{
  if (y->flags & line_buf::LFT_32BIT)
  {
    // 32-bit input samples
    const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
    si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
    for (int i = (repeat + 3) >> 2; i > 0; --i)
    {
      v128_t my = wasm_v128_load(yp);
      v128_t mcb = wasm_v128_load(cbp);
      v128_t mcr = wasm_v128_load(crp);

      v128_t t = wasm_i32x4_add(mcb, mcr);
      t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2));   // G = Y - ((Cb + Cr) >> 2)
      wasm_v128_store(gp, t);
      v128_t u = wasm_i32x4_add(mcb, t);              // B = Cb + G
      wasm_v128_store(bp, u);
      u = wasm_i32x4_add(mcr, t);                     // R = Cr + G
      wasm_v128_store(rp, u);

      yp += 4; cbp += 4; crp += 4;
      rp += 4; gp += 4; bp += 4;
    }
  }
  else
  {
    // 64-bit input samples, narrowed back to 32-bit outputs
    const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
    si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
    for (int i = (repeat + 3) >> 2; i > 0; --i)
    {
      v128_t my, mcb, mcr, tr0, tg0, tb0, tr1, tg1, tb1;
      my  = wasm_v128_load(yp);
      mcb = wasm_v128_load(cbp);
      mcr = wasm_v128_load(crp);

      tg0 = wasm_i64x2_add(mcb, mcr);
      tg0 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg0, 2));
      tb0 = wasm_i64x2_add(mcb, tg0);
      tr0 = wasm_i64x2_add(mcr, tg0);

      yp += 2; cbp += 2; crp += 2;

      my  = wasm_v128_load(yp);
      mcb = wasm_v128_load(cbp);
      mcr = wasm_v128_load(crp);

      tg1 = wasm_i64x2_add(mcb, mcr);
      tg1 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg1, 2));
      tb1 = wasm_i64x2_add(mcb, tg1);
      tr1 = wasm_i64x2_add(mcr, tg1);

      // pack the low 32 bits of each 64-bit lane into one i32x4 vector
      tr0 = wasm_i32x4_shuffle(tr0, tr1, 0, 2, 4 + 0, 4 + 2);
      tg0 = wasm_i32x4_shuffle(tg0, tg1, 0, 2, 4 + 0, 4 + 2);
      tb0 = wasm_i32x4_shuffle(tb0, tb1, 0, 2, 4 + 0, 4 + 2);

      wasm_v128_store(rp, tr0);
      wasm_v128_store(gp, tg0);
      wasm_v128_store(bp, tb0);

      yp += 2; cbp += 2; crp += 2;
      rp += 4; gp += 4; bp += 4;
    }
  }
}
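// For reference, a scalar sketch of the reversible transform pair that the two
// functions above vectorize (hypothetical helper names, plain int32_t in place
// of OpenJPH's si32; not part of the library):
#include <cstdint>

static inline void rct_forward_scalar(int32_t r, int32_t g, int32_t b,
                                      int32_t &y, int32_t &cb, int32_t &cr)
{
  y  = (r + 2 * g + b) >> 2;   // arithmetic shift, matching wasm_i32x4_shr
  cb = b - g;
  cr = r - g;
}

static inline void rct_backward_scalar(int32_t y, int32_t cb, int32_t cr,
                                       int32_t &r, int32_t &g, int32_t &b)
{
  g = y - ((cb + cr) >> 2);
  b = cb + g;
  r = cr + g;
}

// Irreversible colour transform (ICT), forward direction, on float samples:
//   Y  = ALPHA_RF*R + ALPHA_GF*G + ALPHA_BF*B
//   Cb = BETA_CbF * (B - Y),  Cr = BETA_CrF * (R - Y)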
void wasm_ict_forward(const float *r, const float *g, const float *b,
                      float *y, float *cb, float *cr, ui32 repeat)
{
  // ALPHA_RF/GF/BF and BETA_CbF/CrF are static const float weights declared
  // elsewhere in OpenJPH; they are broadcast once before the loop.
  v128_t alpha_rf = wasm_f32x4_splat(ALPHA_RF);
  v128_t alpha_gf = wasm_f32x4_splat(ALPHA_GF);
  v128_t alpha_bf = wasm_f32x4_splat(ALPHA_BF);
  v128_t beta_cbf = wasm_f32x4_splat(BETA_CbF);
  v128_t beta_crf = wasm_f32x4_splat(BETA_CrF);
  for (ui32 i = (repeat + 3) >> 2; i > 0; --i)
  {
    v128_t mr = wasm_v128_load(r);
    v128_t mb = wasm_v128_load(b);
    v128_t my = wasm_f32x4_mul(alpha_rf, mr);
    my = wasm_f32x4_add(my, wasm_f32x4_mul(alpha_gf, wasm_v128_load(g)));
    my = wasm_f32x4_add(my, wasm_f32x4_mul(alpha_bf, mb));
    wasm_v128_store(y, my);
    wasm_v128_store(cb, wasm_f32x4_mul(beta_cbf, wasm_f32x4_sub(mb, my)));
    wasm_v128_store(cr, wasm_f32x4_mul(beta_crf, wasm_f32x4_sub(mr, my)));

    r += 4; g += 4; b += 4;
    y += 4; cb += 4; cr += 4;
  }
}
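// Irreversible colour transform, backward direction:
//   G = Y - GAMMA_CR2G*Cr - GAMMA_CB2G*Cb
//   R = Y + GAMMA_CR2R*Cr,  B = Y + GAMMA_CB2B*Cb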
void wasm_ict_backward(const float *y, const float *cb, const float *cr,
                       float *r, float *g, float *b, ui32 repeat)
{
  // GAMMA_* are static const float weights declared elsewhere in OpenJPH.
  v128_t gamma_cr2r = wasm_f32x4_splat(GAMMA_CR2R);
  v128_t gamma_cb2b = wasm_f32x4_splat(GAMMA_CB2B);
  v128_t gamma_cr2g = wasm_f32x4_splat(GAMMA_CR2G);
  v128_t gamma_cb2g = wasm_f32x4_splat(GAMMA_CB2G);
  for (ui32 i = (repeat + 3) >> 2; i > 0; --i)
  {
    v128_t my  = wasm_v128_load(y);
    v128_t mcr = wasm_v128_load(cr);
    v128_t mcb = wasm_v128_load(cb);
    v128_t mg = wasm_f32x4_sub(my, wasm_f32x4_mul(gamma_cr2g, mcr));
    wasm_v128_store(g, wasm_f32x4_sub(mg, wasm_f32x4_mul(gamma_cb2g, mcb)));
    wasm_v128_store(r, wasm_f32x4_add(my, wasm_f32x4_mul(gamma_cr2r, mcr)));
    wasm_v128_store(b, wasm_f32x4_add(my, wasm_f32x4_mul(gamma_cb2b, mcb)));

    y += 4; cb += 4; cr += 4;
    r += 4; g += 4; b += 4;
  }
}
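// The ALPHA_*, BETA_*, and GAMMA_* factors used above are the standard JPEG
// 2000 irreversible colour transform weights; OpenJPH declares them as static
// const float values outside this file.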