OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_colour_sse2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_colour_sse2.cpp
34// Author: Aous Naman
35// Date: 11 October 2019
36//***************************************************************************/
37
38#include <climits>
39#include <cmath>
40
41#include "ojph_defs.h"
42#include "ojph_arch.h"
43#include "ojph_mem.h"
44#include "ojph_colour.h"
45
46#include <emmintrin.h>
47
48namespace ojph {
49 namespace local {
50
52 // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
53 static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
54 {
55 // note than m must be obtained using
56 // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
57 __m128i x = _mm_srli_epi64(a, amt);
58 x = _mm_xor_si128(x, m);
59 __m128i result = _mm_sub_epi64(x, m);
60 return result;
61 }
62
64 static inline __m128i sse2_cvtlo_epi32_epi64(__m128i a, __m128i zero)
65 {
66 __m128i t;
67 t = _mm_cmplt_epi32(a, zero); // get -ve
68 t = _mm_unpacklo_epi32(a, t);
69 return t;
70 }
71
73 static inline __m128i sse2_cvthi_epi32_epi64(__m128i a, __m128i zero)
74 {
75 __m128i t;
76 t = _mm_cmplt_epi32(a, zero); // get -ve
77 t = _mm_unpackhi_epi32(a, t);
78 return t;
79 }
80
82 void sse2_rev_convert(const line_buf *src_line,
83 const ui32 src_line_offset,
84 line_buf *dst_line,
85 const ui32 dst_line_offset,
86 si64 shift, ui32 width)
87 {
88 if (src_line->flags & line_buf::LFT_32BIT)
89 {
90 if (dst_line->flags & line_buf::LFT_32BIT)
91 {
92 const si32 *sp = src_line->i32 + src_line_offset;
93 si32 *dp = dst_line->i32 + dst_line_offset;
94 __m128i sh = _mm_set1_epi32((si32)shift);
95 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
96 {
97 __m128i s = _mm_loadu_si128((__m128i*)sp);
98 s = _mm_add_epi32(s, sh);
99 _mm_storeu_si128((__m128i*)dp, s);
100 }
101 }
102 else
103 {
104 const si32 *sp = src_line->i32 + src_line_offset;
105 si64 *dp = dst_line->i64 + dst_line_offset;
106 __m128i zero = _mm_setzero_si128();
107 __m128i sh = _mm_set1_epi64x(shift);
108 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
109 {
110 __m128i s, t;
111 s = _mm_loadu_si128((__m128i*)sp);
112
113 t = sse2_cvtlo_epi32_epi64(s, zero);
114 t = _mm_add_epi64(t, sh);
115 _mm_storeu_si128((__m128i*)dp, t);
116
117 t = sse2_cvthi_epi32_epi64(s, zero);
118 t = _mm_add_epi64(t, sh);
119 _mm_storeu_si128((__m128i*)dp + 1, t);
120 }
121 }
122 }
123 else
124 {
125 assert(src_line->flags | line_buf::LFT_64BIT);
126 assert(dst_line->flags | line_buf::LFT_32BIT);
127 const si64 *sp = src_line->i64 + src_line_offset;
128 si32 *dp = dst_line->i32 + dst_line_offset;
129 __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
130 __m128i sh = _mm_set1_epi64x(shift);
131 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
132 {
133 __m128i s, t;
134 s = _mm_loadu_si128((__m128i*)sp);
135 s = _mm_add_epi64(s, sh);
136
137 t = _mm_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0));
138 t = _mm_and_si128(low_bits, t);
139
140 s = _mm_loadu_si128((__m128i*)sp + 1);
141 s = _mm_add_epi64(s, sh);
142
143 s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0));
144 s = _mm_andnot_si128(low_bits, s);
145
146 t = _mm_or_si128(s, t);
147 _mm_storeu_si128((__m128i*)dp, t);
148 }
149 }
150 }
151
154 const ui32 src_line_offset,
155 line_buf *dst_line,
156 const ui32 dst_line_offset,
157 si64 shift, ui32 width)
158 {
159 if (src_line->flags & line_buf::LFT_32BIT)
160 {
161 if (dst_line->flags & line_buf::LFT_32BIT)
162 {
163 const si32 *sp = src_line->i32 + src_line_offset;
164 si32 *dp = dst_line->i32 + dst_line_offset;
165 __m128i sh = _mm_set1_epi32((si32)(-shift));
166 __m128i zero = _mm_setzero_si128();
167 for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
168 {
169 __m128i s = _mm_loadu_si128((__m128i*)sp);
170 __m128i c = _mm_cmplt_epi32(s, zero); // 0xFFFFFFFF for -ve value
171 __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value
172 v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value
173 s = _mm_andnot_si128(c, s); // keep only +ve or 0
174 s = _mm_or_si128(s, v_m_sh); // combine
175 _mm_storeu_si128((__m128i*)dp, s);
176 }
177 }
178 else
179 {
180 const si32 *sp = src_line->i32 + src_line_offset;
181 si64 *dp = dst_line->i64 + dst_line_offset;
182 __m128i sh = _mm_set1_epi64x(-shift);
183 __m128i zero = _mm_setzero_si128();
184 for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
185 {
186 __m128i s, t, u, c, v_m_sh;
187 s = _mm_loadu_si128((__m128i*)sp);
188
189 t = _mm_cmplt_epi32(s, zero); // find -ve 32bit -1
190 u = _mm_unpacklo_epi32(s, t); // correct 64bit data
191 c = _mm_unpacklo_epi32(t, t); // 64bit -1 for -ve value
192
193 v_m_sh = _mm_sub_epi64(sh, u); // - shift - value
194 v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value
195 u = _mm_andnot_si128(c, u); // keep only +ve or 0
196 u = _mm_or_si128(u, v_m_sh); // combine
197
198 _mm_storeu_si128((__m128i*)dp, u);
199 u = _mm_unpackhi_epi32(s, t); // correct 64bit data
200 c = _mm_unpackhi_epi32(t, t); // 64bit -1 for -ve value
201
202 v_m_sh = _mm_sub_epi64(sh, u); // - shift - value
203 v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value
204 u = _mm_andnot_si128(c, u); // keep only +ve or 0
205 u = _mm_or_si128(u, v_m_sh); // combine
206
207 _mm_storeu_si128((__m128i*)dp + 1, u);
208 }
209 }
210 }
211 else
212 {
213 assert(src_line->flags | line_buf::LFT_64BIT);
214 assert(dst_line->flags | line_buf::LFT_32BIT);
215 const si64 *sp = src_line->i64 + src_line_offset;
216 si32 *dp = dst_line->i32 + dst_line_offset;
217 __m128i sh = _mm_set1_epi64x(-shift);
218 __m128i zero = _mm_setzero_si128();
219 __m128i half_mask = _mm_set_epi64x(0, (si64)ULLONG_MAX);
220 for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
221 {
222 // s for source, t for target, p for positive, n for negative,
223 // m for mask, and tm for temp
224 __m128i s, t, p, n, m, tm;
225 s = _mm_loadu_si128((__m128i*)sp);
226
227 tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value
228 m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b
229 tm = _mm_sub_epi64(sh, s); // - shift - value
230 n = _mm_and_si128(m, tm); // -ve
231 p = _mm_andnot_si128(m, s); // +ve
232 tm = _mm_or_si128(n, p);
233 tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0));
234 t = _mm_and_si128(half_mask, tm);
235
236 s = _mm_loadu_si128((__m128i*)sp + 1);
237 tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value
238 m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b
239 tm = _mm_sub_epi64(sh, s); // - shift - value
240 n = _mm_and_si128(m, tm); // -ve
241 p = _mm_andnot_si128(m, s); // +ve
242 tm = _mm_or_si128(n, p);
243 tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0));
244 tm = _mm_andnot_si128(half_mask, tm);
245
246 t = _mm_or_si128(t, tm);
247 _mm_storeu_si128((__m128i*)dp, t);
248 }
249 }
250 }
251
253 void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
254 ui32 width)
255 {
256 uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
257 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
258 __m128 shift = _mm_set1_ps(0.5f);
259 __m128 m = _mm_set1_ps(mul);
260 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
261 {
262 __m128 t = _mm_loadu_ps(sp);
263 __m128 s = _mm_add_ps(t, shift);
264 s = _mm_mul_ps(s, m);
265 _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
266 }
267 _MM_SET_ROUNDING_MODE(rounding_mode);
268 }
269
271 void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
272 ui32 width)
273 {
274 uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
275 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
276 __m128 m = _mm_set1_ps(mul);
277 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
278 {
279 __m128 t = _mm_loadu_ps(sp);
280 __m128 s = _mm_mul_ps(t, m);
281 _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
282 }
283 _MM_SET_ROUNDING_MODE(rounding_mode);
284 }
285
288 const line_buf *g,
289 const line_buf *b,
290 line_buf *y, line_buf *cb, line_buf *cr,
291 ui32 repeat)
292 {
293 assert((y->flags & line_buf::LFT_REVERSIBLE) &&
299
300 if (y->flags & line_buf::LFT_32BIT)
301 {
302 assert((y->flags & line_buf::LFT_32BIT) &&
303 (cb->flags & line_buf::LFT_32BIT) &&
304 (cr->flags & line_buf::LFT_32BIT) &&
305 (r->flags & line_buf::LFT_32BIT) &&
306 (g->flags & line_buf::LFT_32BIT) &&
307 (b->flags & line_buf::LFT_32BIT));
308 const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32;
309 si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32;
310 for (int i = (repeat + 3) >> 2; i > 0; --i)
311 {
312 __m128i mr = _mm_load_si128((__m128i*)rp);
313 __m128i mg = _mm_load_si128((__m128i*)gp);
314 __m128i mb = _mm_load_si128((__m128i*)bp);
315 __m128i t = _mm_add_epi32(mr, mb);
316 t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1));
317 _mm_store_si128((__m128i*)yp, _mm_srai_epi32(t, 2));
318 t = _mm_sub_epi32(mb, mg);
319 _mm_store_si128((__m128i*)cbp, t);
320 t = _mm_sub_epi32(mr, mg);
321 _mm_store_si128((__m128i*)crp, t);
322
323 rp += 4; gp += 4; bp += 4;
324 yp += 4; cbp += 4; crp += 4;
325 }
326 }
327 else
328 {
329 assert((y->flags & line_buf::LFT_64BIT) &&
330 (cb->flags & line_buf::LFT_64BIT) &&
331 (cr->flags & line_buf::LFT_64BIT) &&
332 (r->flags & line_buf::LFT_32BIT) &&
333 (g->flags & line_buf::LFT_32BIT) &&
335 __m128i zero = _mm_setzero_si128();
336 __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2));
337 const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
338 si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
339 for (int i = (repeat + 3) >> 2; i > 0; --i)
340 {
341 __m128i mr32 = _mm_load_si128((__m128i*)rp);
342 __m128i mg32 = _mm_load_si128((__m128i*)gp);
343 __m128i mb32 = _mm_load_si128((__m128i*)bp);
344 __m128i mr, mg, mb, t;
345 mr = sse2_cvtlo_epi32_epi64(mr32, zero);
346 mg = sse2_cvtlo_epi32_epi64(mg32, zero);
347 mb = sse2_cvtlo_epi32_epi64(mb32, zero);
348
349 t = _mm_add_epi64(mr, mb);
350 t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));
351 _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2));
352 t = _mm_sub_epi64(mb, mg);
353 _mm_store_si128((__m128i*)cbp, t);
354 t = _mm_sub_epi64(mr, mg);
355 _mm_store_si128((__m128i*)crp, t);
356
357 yp += 2; cbp += 2; crp += 2;
358
359 mr = sse2_cvthi_epi32_epi64(mr32, zero);
360 mg = sse2_cvthi_epi32_epi64(mg32, zero);
361 mb = sse2_cvthi_epi32_epi64(mb32, zero);
362
363 t = _mm_add_epi64(mr, mb);
364 t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));
365 _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2));
366 t = _mm_sub_epi64(mb, mg);
367 _mm_store_si128((__m128i*)cbp, t);
368 t = _mm_sub_epi64(mr, mg);
369 _mm_store_si128((__m128i*)crp, t);
370
371 rp += 4; gp += 4; bp += 4;
372 yp += 2; cbp += 2; crp += 2;
373 }
374 }
375 }
376
379 const line_buf *cb,
380 const line_buf *cr,
381 line_buf *r, line_buf *g, line_buf *b,
382 ui32 repeat)
383 {
384 assert((y->flags & line_buf::LFT_REVERSIBLE) &&
390
391 if (y->flags & line_buf::LFT_32BIT)
392 {
393 assert((y->flags & line_buf::LFT_32BIT) &&
394 (cb->flags & line_buf::LFT_32BIT) &&
395 (cr->flags & line_buf::LFT_32BIT) &&
396 (r->flags & line_buf::LFT_32BIT) &&
397 (g->flags & line_buf::LFT_32BIT) &&
399 const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
400 si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
401 for (int i = (repeat + 3) >> 2; i > 0; --i)
402 {
403 __m128i my = _mm_load_si128((__m128i*)yp);
404 __m128i mcb = _mm_load_si128((__m128i*)cbp);
405 __m128i mcr = _mm_load_si128((__m128i*)crp);
406
407 __m128i t = _mm_add_epi32(mcb, mcr);
408 t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2));
409 _mm_store_si128((__m128i*)gp, t);
410 __m128i u = _mm_add_epi32(mcb, t);
411 _mm_store_si128((__m128i*)bp, u);
412 u = _mm_add_epi32(mcr, t);
413 _mm_store_si128((__m128i*)rp, u);
414
415 yp += 4; cbp += 4; crp += 4;
416 rp += 4; gp += 4; bp += 4;
417 }
418 }
419 else
420 {
421 assert((y->flags & line_buf::LFT_64BIT) &&
422 (cb->flags & line_buf::LFT_64BIT) &&
423 (cr->flags & line_buf::LFT_64BIT) &&
424 (r->flags & line_buf::LFT_32BIT) &&
425 (g->flags & line_buf::LFT_32BIT) &&
427 __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2));
428 __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
429 const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
430 si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
431 for (int i = (repeat + 3) >> 2; i > 0; --i)
432 {
433 __m128i my, mcb, mcr, tr, tg, tb;
434 my = _mm_load_si128((__m128i*)yp);
435 mcb = _mm_load_si128((__m128i*)cbp);
436 mcr = _mm_load_si128((__m128i*)crp);
437
438 tg = _mm_add_epi64(mcb, mcr);
439 tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2));
440 tb = _mm_add_epi64(mcb, tg);
441 tr = _mm_add_epi64(mcr, tg);
442
443 __m128i mr, mg, mb;
444 mr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0));
445 mr = _mm_and_si128(low_bits, mr);
446 mg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0));
447 mg = _mm_and_si128(low_bits, mg);
448 mb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0));
449 mb = _mm_and_si128(low_bits, mb);
450
451 yp += 2; cbp += 2; crp += 2;
452
453 my = _mm_load_si128((__m128i*)yp);
454 mcb = _mm_load_si128((__m128i*)cbp);
455 mcr = _mm_load_si128((__m128i*)crp);
456
457 tg = _mm_add_epi64(mcb, mcr);
458 tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2));
459 tb = _mm_add_epi64(mcb, tg);
460 tr = _mm_add_epi64(mcr, tg);
461
462 tr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0));
463 tr = _mm_andnot_si128(low_bits, tr);
464 mr = _mm_or_si128(mr, tr);
465 tg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0));
466 tg = _mm_andnot_si128(low_bits, tg);
467 mg = _mm_or_si128(mg, tg);
468 tb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0));
469 tb = _mm_andnot_si128(low_bits, tb);
470 mb = _mm_or_si128(mb, tb);
471
472 _mm_store_si128((__m128i*)rp, mr);
473 _mm_store_si128((__m128i*)gp, mg);
474 _mm_store_si128((__m128i*)bp, mb);
475
476 yp += 2; cbp += 2; crp += 2;
477 rp += 4; gp += 4; bp += 4;
478 }
479 }
480 }
481 }
482}
si64 * i64
Definition: ojph_mem.h:173
si32 * i32
Definition: ojph_mem.h:172
void sse2_rct_backward(const line_buf *y, const line_buf *cb, const line_buf *cr, line_buf *r, line_buf *g, line_buf *b, ui32 repeat)
static __m128i sse2_cvtlo_epi32_epi64(__m128i a, __m128i zero)
void sse2_rev_convert(const line_buf *src_line, const ui32 src_line_offset, line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width)
void sse2_rev_convert_nlt_type3(const line_buf *src_line, const ui32 src_line_offset, line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width)
void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width)
static __m128i sse2_cvthi_epi32_epi64(__m128i a, __m128i zero)
static __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
void sse2_rct_forward(const line_buf *r, const line_buf *g, const line_buf *b, line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat)
void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width)
int64_t si64
Definition: ojph_defs.h:57
int32_t si32
Definition: ojph_defs.h:55
uint32_t ui32
Definition: ojph_defs.h:54