OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_colour_wasm.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2021, Aous Naman
6// Copyright (c) 2021, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2021, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_colour_wasm.cpp
34// Author: Aous Naman
35// Date: 9 February 2021
36//***************************************************************************/
37
38#include <cmath>
39#include <wasm_simd128.h>
40
41#include "ojph_defs.h"
42#include "ojph_mem.h"
43#include "ojph_colour.h"
44#include "ojph_colour_local.h"
45
46namespace ojph {
47 namespace local {
48
50 void wasm_rev_convert(const line_buf *src_line,
51 const ui32 src_line_offset,
52 line_buf *dst_line,
53 const ui32 dst_line_offset,
54 si64 shift, ui32 width)
55 {
56 if (src_line->flags & line_buf::LFT_32BIT)
57 {
58 if (dst_line->flags & line_buf::LFT_32BIT)
59 {
60 const si32 *sp = src_line->i32 + src_line_offset;
61 si32 *dp = dst_line->i32 + dst_line_offset;
62 v128_t sh = wasm_i32x4_splat((si32)shift);
63 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
64 {
65 v128_t s = wasm_v128_load(sp);
66 s = wasm_i32x4_add(s, sh);
67 wasm_v128_store(dp, s);
68 }
69 }
70 else
71 {
72 const si32 *sp = src_line->i32 + src_line_offset;
73 si64 *dp = dst_line->i64 + dst_line_offset;
74 v128_t sh = wasm_i64x2_splat(shift);
75 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
76 {
77 v128_t s, t;
78 s = wasm_v128_load(sp);
79
80 t = wasm_i64x2_extend_low_i32x4(s);
81 t = wasm_i64x2_add(t, sh);
82 wasm_v128_store(dp, t);
83
84 t = wasm_i64x2_extend_high_i32x4(s);
85 t = wasm_i64x2_add(t, sh);
86 wasm_v128_store(dp + 2, t);
87 }
88 }
89 }
90 else
91 {
92 assert(src_line->flags | line_buf::LFT_64BIT);
93 assert(dst_line->flags | line_buf::LFT_32BIT);
94 const si64 *sp = src_line->i64 + src_line_offset;
95 si32 *dp = dst_line->i32 + dst_line_offset;
96 v128_t sh = wasm_i64x2_splat(shift);
97 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
98 {
99 v128_t s0, s1;
100 s0 = wasm_v128_load(sp);
101 s0 = wasm_i64x2_add(s0, sh);
102 s1 = wasm_v128_load(sp + 2);
103 s1 = wasm_i64x2_add(s1, sh);
104 s0 = wasm_i32x4_shuffle(s0, s1, 0, 2, 4 + 0, 4 + 2);
105 wasm_v128_store(dp, s0);
106 }
107 }
108 }
109
112 const ui32 src_line_offset,
113 line_buf *dst_line,
114 const ui32 dst_line_offset,
115 si64 shift, ui32 width)
116 {
117 if (src_line->flags & line_buf::LFT_32BIT)
118 {
119 if (dst_line->flags & line_buf::LFT_32BIT)
120 {
121 const si32 *sp = src_line->i32 + src_line_offset;
122 si32 *dp = dst_line->i32 + dst_line_offset;
123 v128_t sh = wasm_i32x4_splat((si32)(-shift));
124 v128_t zero = wasm_i32x4_splat(0);
125 for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
126 {
127 v128_t s = wasm_v128_load(sp);
128 v128_t c = wasm_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value
129 v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value
130 v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value
131 s = wasm_v128_andnot(c, s); // keep only +ve or 0
132 s = wasm_v128_or(s, v_m_sh); // combine
133 wasm_v128_store(dp, s);
134 }
135 }
136 else
137 {
138 const si32 *sp = src_line->i32 + src_line_offset;
139 si64 *dp = dst_line->i64 + dst_line_offset;
140 v128_t sh = wasm_i64x2_splat(-shift);
141 v128_t zero = wasm_i32x4_splat(0);
142 for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
143 {
144 v128_t s, u, c, v_m_sh;
145 s = wasm_v128_load(sp);
146
147 u = wasm_i64x2_extend_low_i32x4(s);
148 c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value
149 v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value
150 v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value
151 u = wasm_v128_andnot(c, u); // keep only +ve or 0
152 u = wasm_v128_or(u, v_m_sh); // combine
153
154 wasm_v128_store(dp, u);
155
156 u = wasm_i64x2_extend_high_i32x4(s);
157 c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value
158 v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value
159 v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value
160 u = wasm_v128_andnot(c, u); // keep only +ve or 0
161 u = wasm_v128_or(u, v_m_sh); // combine
162
163 wasm_v128_store(dp + 2, u);
164 }
165 }
166 }
167 else
168 {
169 assert(src_line->flags | line_buf::LFT_64BIT);
170 assert(dst_line->flags | line_buf::LFT_32BIT);
171 const si64 *sp = src_line->i64 + src_line_offset;
172 si32 *dp = dst_line->i32 + dst_line_offset;
173 v128_t sh = wasm_i64x2_splat(-shift);
174 v128_t zero = wasm_i32x4_splat(0);
175 for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
176 {
177 // s for source, t for target, p for positive, n for negative,
178 // m for mask, and tm for temp
179 v128_t s, t0, t1, p, n, m, tm;
180 s = wasm_v128_load(sp);
181 m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value
182 tm = wasm_i64x2_sub(sh, s); // - shift - value
183 n = wasm_v128_and(m, tm); // -ve
184 p = wasm_v128_andnot(m, s); // +ve
185 t0 = wasm_v128_or(n, p);
186
187 s = wasm_v128_load(sp + 2);
188 m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value
189 tm = wasm_i64x2_sub(sh, s); // - shift - value
190 n = wasm_v128_and(m, tm); // -ve
191 p = wasm_v128_andnot(m, s); // +ve
192 t1 = wasm_v128_or(n, p);
193
194 t0 = wasm_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2);
195 wasm_v128_store(dp, t0);
196 }
197 }
198 }
199
201 void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
202 ui32 width)
203 {
204 v128_t shift = wasm_f32x4_splat(0.5f);
205 v128_t m = wasm_f32x4_splat(mul);
206 for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
207 {
208 v128_t t = wasm_v128_load(sp);
209 v128_t s = wasm_f32x4_convert_i32x4(t);
210 s = wasm_f32x4_mul(s, m);
211 s = wasm_f32x4_sub(s, shift);
212 wasm_v128_store(dp, s);
213 }
214 }
215
217 void wasm_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
218 ui32 width)
219 {
220 v128_t m = wasm_f32x4_splat(mul);
221 for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
222 {
223 v128_t t = wasm_v128_load(sp);
224 v128_t s = wasm_f32x4_convert_i32x4(t);
225 s = wasm_f32x4_mul(s, m);
226 wasm_v128_store(dp, s);
227 }
228 }
229
231 void wasm_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
232 ui32 width)
233 {
234 // rounding mode is always set to _MM_ROUND_NEAREST
235 v128_t shift = wasm_f32x4_splat(0.5f);
236 v128_t m = wasm_f32x4_splat(mul);
237 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
238 {
239 v128_t t = wasm_v128_load(sp);
240 v128_t s = wasm_f32x4_add(t, shift);
241 s = wasm_f32x4_mul(s, m);
242 s = wasm_f32x4_add(s, shift); // + 0.5 and followed by floor next
243 wasm_v128_store(dp, wasm_i32x4_trunc_sat_f32x4(s));
244 }
245 }
246
248 void wasm_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
249 ui32 width)
250 {
251 // rounding mode is always set to _MM_ROUND_NEAREST
252 v128_t shift = wasm_f32x4_splat(0.5f);
253 v128_t m = wasm_f32x4_splat(mul);
254 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
255 {
256 v128_t t = wasm_v128_load(sp);
257 v128_t s = wasm_f32x4_mul(t, m);
258 s = wasm_f32x4_add(s, shift); // + 0.5 and followed by floor next
259 wasm_v128_store(dp, wasm_i32x4_trunc_sat_f32x4(s));
260 }
261 }
262
// wasm_rct_forward (name and full signature appear in the member index at
// the end of this listing).  For every sample it computes
//   y  = (r + 2 * g + b) >> 2   (arithmetic shift)
//   cb = b - g
//   cr = r - g
// reading r, g, b as 32-bit integers and writing y, cb, cr either as 32-bit
// or as 64-bit integers, depending on whether y has LFT_32BIT set.
// Samples are handled four at a time: (repeat + 3) >> 2 iterations, so a
// repeat that is not a multiple of four is rounded up to whole vectors.
// NOTE(review): the embedded listing numbers jump from 262 to 265, so the
// opening line of the signature (with the `r` parameter) is missing here.
265 const line_buf *g,
266 const line_buf *b,
267 line_buf *y, line_buf *cb, line_buf *cr,
268 ui32 repeat)
269 {
270 assert((y->flags & line_buf::LFT_REVERSIBLE) &&
// NOTE(review): listing lines 271-275 (the remainder of the assert above)
// are missing from this listing.
276
// 32-bit path: inputs and outputs are all 32-bit samples
277 if (y->flags & line_buf::LFT_32BIT)
278 {
279 assert((y->flags & line_buf::LFT_32BIT) &&
280 (cb->flags & line_buf::LFT_32BIT) &&
281 (cr->flags & line_buf::LFT_32BIT) &&
282 (r->flags & line_buf::LFT_32BIT) &&
283 (g->flags & line_buf::LFT_32BIT) &&
284 (b->flags & line_buf::LFT_32BIT));
285 const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32;
286 si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32;
287
288 for (int i = (repeat + 3) >> 2; i > 0; --i)
289 {
290 v128_t mr = wasm_v128_load(rp);
291 v128_t mg = wasm_v128_load(gp);
292 v128_t mb = wasm_v128_load(bp);
// t = r + b + 2 * g, then y = t >> 2
293 v128_t t = wasm_i32x4_add(mr, mb);
294 t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1));
295 wasm_v128_store(yp, wasm_i32x4_shr(t, 2));
// cb = b - g
296 t = wasm_i32x4_sub(mb, mg);
297 wasm_v128_store(cbp, t);
// cr = r - g
298 t = wasm_i32x4_sub(mr, mg);
299 wasm_v128_store(crp, t);
300
301 rp += 4; gp += 4; bp += 4;
302 yp += 4; cbp += 4; crp += 4;
303 }
304 }
// 64-bit path: 32-bit inputs are sign-extended and the outputs are stored
// as 64-bit samples
305 else
306 {
307 assert((y->flags & line_buf::LFT_64BIT) &&
308 (cb->flags & line_buf::LFT_64BIT) &&
309 (cr->flags & line_buf::LFT_64BIT) &&
310 (r->flags & line_buf::LFT_32BIT) &&
311 (g->flags & line_buf::LFT_32BIT) &&
// NOTE(review): listing line 312 (the end of the assert above) is missing
// from this listing.
313 const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
314 si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
315 for (int i = (repeat + 3) >> 2; i > 0; --i)
316 {
317 v128_t mr32 = wasm_v128_load(rp);
318 v128_t mg32 = wasm_v128_load(gp);
319 v128_t mb32 = wasm_v128_load(bp);
320 v128_t mr, mg, mb, t;
// first two samples of the group: low halves widened to 64 bits
321 mr = wasm_i64x2_extend_low_i32x4(mr32);
322 mg = wasm_i64x2_extend_low_i32x4(mg32);
323 mb = wasm_i64x2_extend_low_i32x4(mb32);
324
325 t = wasm_i64x2_add(mr, mb);
326 t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1));
327 wasm_v128_store(yp, wasm_i64x2_shr(t, 2));
328 t = wasm_i64x2_sub(mb, mg);
329 wasm_v128_store(cbp, t);
330 t = wasm_i64x2_sub(mr, mg);
331 wasm_v128_store(crp, t);
332
333 yp += 2; cbp += 2; crp += 2;
334
// last two samples of the group: high halves widened to 64 bits
335 mr = wasm_i64x2_extend_high_i32x4(mr32);
336 mg = wasm_i64x2_extend_high_i32x4(mg32);
337 mb = wasm_i64x2_extend_high_i32x4(mb32);
338
339 t = wasm_i64x2_add(mr, mb);
340 t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1));
341 wasm_v128_store(yp, wasm_i64x2_shr(t, 2));
342 t = wasm_i64x2_sub(mb, mg);
343 wasm_v128_store(cbp, t);
344 t = wasm_i64x2_sub(mr, mg);
345 wasm_v128_store(crp, t);
346
347 rp += 4; gp += 4; bp += 4;
348 yp += 2; cbp += 2; crp += 2;
349 }
350 }
351 }
352
// wasm_rct_backward (name and full signature appear in the member index at
// the end of this listing).  For every sample it computes
//   g = y - ((cb + cr) >> 2)   (arithmetic shift)
//   b = cb + g
//   r = cr + g
// writing r, g, b as 32-bit integers and reading y, cb, cr either as 32-bit
// or as 64-bit integers, depending on whether y has LFT_32BIT set.
// Samples are handled four at a time: (repeat + 3) >> 2 iterations, so a
// repeat that is not a multiple of four is rounded up to whole vectors.
// NOTE(review): the embedded listing numbers jump from 352 to 355, so the
// opening line of the signature (with the `y` parameter) is missing here.
355 const line_buf *cb,
356 const line_buf *cr,
357 line_buf *r, line_buf *g, line_buf *b,
358 ui32 repeat)
359 {
360 assert((y->flags & line_buf::LFT_REVERSIBLE) &&
// NOTE(review): listing lines 361-365 (the remainder of the assert above)
// are missing from this listing.
366
// 32-bit path: inputs and outputs are all 32-bit samples
367 if (y->flags & line_buf::LFT_32BIT)
368 {
369 assert((y->flags & line_buf::LFT_32BIT) &&
370 (cb->flags & line_buf::LFT_32BIT) &&
371 (cr->flags & line_buf::LFT_32BIT) &&
372 (r->flags & line_buf::LFT_32BIT) &&
373 (g->flags & line_buf::LFT_32BIT) &&
// NOTE(review): listing line 374 (the end of the assert above) is missing
// from this listing.
375 const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
376 si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
377 for (int i = (repeat + 3) >> 2; i > 0; --i)
378 {
379 v128_t my = wasm_v128_load(yp);
380 v128_t mcb = wasm_v128_load(cbp);
381 v128_t mcr = wasm_v128_load(crp);
382
// g = y - ((cb + cr) >> 2)
383 v128_t t = wasm_i32x4_add(mcb, mcr);
384 t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2));
385 wasm_v128_store(gp, t);
// b = cb + g
386 v128_t u = wasm_i32x4_add(mcb, t);
387 wasm_v128_store(bp, u);
// r = cr + g
388 u = wasm_i32x4_add(mcr, t);
389 wasm_v128_store(rp, u);
390
391 yp += 4; cbp += 4; crp += 4;
392 rp += 4; gp += 4; bp += 4;
393 }
394 }
// 64-bit path: y, cb, cr are 64-bit samples; results are narrowed to 32 bits
395 else
396 {
397 assert((y->flags & line_buf::LFT_64BIT) &&
398 (cb->flags & line_buf::LFT_64BIT) &&
399 (cr->flags & line_buf::LFT_64BIT) &&
400 (r->flags & line_buf::LFT_32BIT) &&
401 (g->flags & line_buf::LFT_32BIT) &&
// NOTE(review): listing line 402 (the end of the assert above) is missing
// from this listing.
403 const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
404 si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
405 for (int i = (repeat + 3) >> 2; i > 0; --i)
406 {
407 v128_t my, mcb, mcr, tr0, tg0, tb0, tr1, tg1, tb1;
// first two samples of the group
408 my = wasm_v128_load(yp);
409 mcb = wasm_v128_load(cbp);
410 mcr = wasm_v128_load(crp);
411
412 tg0 = wasm_i64x2_add(mcb, mcr);
413 tg0 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg0, 2));
414 tb0 = wasm_i64x2_add(mcb, tg0);
415 tr0 = wasm_i64x2_add(mcr, tg0);
416
417 yp += 2; cbp += 2; crp += 2;
418
// last two samples of the group
419 my = wasm_v128_load(yp);
420 mcb = wasm_v128_load(cbp);
421 mcr = wasm_v128_load(crp);
422
423 tg1 = wasm_i64x2_add(mcb, mcr);
424 tg1 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg1, 2));
425 tb1 = wasm_i64x2_add(mcb, tg1);
426 tr1 = wasm_i64x2_add(mcr, tg1);
427
// pack 32-bit lanes 0 and 2 of each pair of vectors into one vector of four
// 32-bit results
428 tr0 = wasm_i32x4_shuffle(tr0, tr1, 0, 2, 4 + 0, 4 + 2);
429 tg0 = wasm_i32x4_shuffle(tg0, tg1, 0, 2, 4 + 0, 4 + 2);
430 tb0 = wasm_i32x4_shuffle(tb0, tb1, 0, 2, 4 + 0, 4 + 2);
431
432 wasm_v128_store(rp, tr0);
433 wasm_v128_store(gp, tg0);
434 wasm_v128_store(bp, tb0);
435
436 yp += 2; cbp += 2; crp += 2;
437 rp += 4; gp += 4; bp += 4;
438 }
439 }
440 }
441
443 void wasm_ict_forward(const float *r, const float *g, const float *b,
444 float *y, float *cb, float *cr, ui32 repeat)
445 {
446 v128_t alpha_rf = wasm_f32x4_splat(CT_CNST::ALPHA_RF);
447 v128_t alpha_gf = wasm_f32x4_splat(CT_CNST::ALPHA_GF);
448 v128_t alpha_bf = wasm_f32x4_splat(CT_CNST::ALPHA_BF);
449 v128_t beta_cbf = wasm_f32x4_splat(CT_CNST::BETA_CbF);
450 v128_t beta_crf = wasm_f32x4_splat(CT_CNST::BETA_CrF);
451 for (ui32 i = (repeat + 3) >> 2; i > 0; --i)
452 {
453 v128_t mr = wasm_v128_load(r);
454 v128_t mb = wasm_v128_load(b);
455 v128_t my = wasm_f32x4_mul(alpha_rf, mr);
456 my = wasm_f32x4_add(my, wasm_f32x4_mul(alpha_gf, wasm_v128_load(g)));
457 my = wasm_f32x4_add(my, wasm_f32x4_mul(alpha_bf, mb));
458 wasm_v128_store(y, my);
459 wasm_v128_store(cb, wasm_f32x4_mul(beta_cbf, wasm_f32x4_sub(mb, my)));
460 wasm_v128_store(cr, wasm_f32x4_mul(beta_crf, wasm_f32x4_sub(mr, my)));
461
462 r += 4; g += 4; b += 4;
463 y += 4; cb += 4; cr += 4;
464 }
465 }
466
468 void wasm_ict_backward(const float *y, const float *cb, const float *cr,
469 float *r, float *g, float *b, ui32 repeat)
470 {
471 v128_t gamma_cr2g = wasm_f32x4_splat(CT_CNST::GAMMA_CR2G);
472 v128_t gamma_cb2g = wasm_f32x4_splat(CT_CNST::GAMMA_CB2G);
473 v128_t gamma_cr2r = wasm_f32x4_splat(CT_CNST::GAMMA_CR2R);
474 v128_t gamma_cb2b = wasm_f32x4_splat(CT_CNST::GAMMA_CB2B);
475 for (ui32 i = (repeat + 3) >> 2; i > 0; --i)
476 {
477 v128_t my = wasm_v128_load(y);
478 v128_t mcr = wasm_v128_load(cr);
479 v128_t mcb = wasm_v128_load(cb);
480 v128_t mg = wasm_f32x4_sub(my, wasm_f32x4_mul(gamma_cr2g, mcr));
481 wasm_v128_store(g, wasm_f32x4_sub(mg, wasm_f32x4_mul(gamma_cb2g, mcb)));
482 wasm_v128_store(r, wasm_f32x4_add(my, wasm_f32x4_mul(gamma_cr2r, mcr)));
483 wasm_v128_store(b, wasm_f32x4_add(my, wasm_f32x4_mul(gamma_cb2b, mcb)));
484
485 y += 4; cb += 4; cr += 4;
486 r += 4; g += 4; b += 4;
487 }
488 }
489
490 }
491}
si64 * i64
Definition: ojph_mem.h:173
si32 * i32
Definition: ojph_mem.h:172
void wasm_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width)
void wasm_ict_backward(const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat)
void wasm_rev_convert_nlt_type3(const line_buf *src_line, const ui32 src_line_offset, line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width)
void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width)
void wasm_rev_convert(const line_buf *src_line, const ui32 src_line_offset, line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width)
void wasm_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width)
void wasm_rct_backward(const line_buf *y, const line_buf *cb, const line_buf *cr, line_buf *r, line_buf *g, line_buf *b, ui32 repeat)
void wasm_rct_forward(const line_buf *r, const line_buf *g, const line_buf *b, line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat)
void wasm_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat)
void wasm_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, ui32 width)
int64_t si64
Definition: ojph_defs.h:57
int32_t si32
Definition: ojph_defs.h:55
uint32_t ui32
Definition: ojph_defs.h:54
static const float GAMMA_CR2R
static const float BETA_CbF
static const float GAMMA_CB2B
static const float ALPHA_RF
static const float GAMMA_CB2G
static const float GAMMA_CR2G
static const float ALPHA_BF
static const float BETA_CrF
static const float ALPHA_GF