OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_block_decoder_avx2.cpp
//***************************************************************************/
// This software is released under the 2-Clause BSD license, included
// below.
//
// Copyright (c) 2022, Aous Naman
// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
// Copyright (c) 2022, The University of New South Wales, Australia
// Copyright (c) 2024, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//***************************************************************************/
// This file is part of the OpenJPH software implementation.
// File: ojph_block_decoder_avx2.cpp
//***************************************************************************/

//***************************************************************************/

#include <string>
#include <iostream>

#include <cassert>
#include <cstring>
#include "ojph_block_common.h"
#include "ojph_block_decoder.h"
#include "ojph_arch.h"
#include "ojph_message.h"

#include <immintrin.h>

namespace ojph {
  namespace local {

    //************************************************************************/
    // MEL decoder state
    struct dec_mel_st {
      dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
        k(0), num_runs(0), runs(0)
      {}
      // data decoding machinery
      ui8* data;    // pointer to the MEL bitstream
      ui64 tmp;     // temporary buffer for read data
      int bits;     // number of bits stored in tmp
      int size;     // number of bytes in the MEL segment
      bool unstuff; // true if the next bit needs to be unstuffed
      int k;        // state of the MEL decoder

      // queue of decoded runs
      int num_runs; // number of decoded runs left in runs (maximum 8)
      ui64 runs;    // decoded runs, stored 7 bits per run
    };

    //************************************************************************/
    // Reads and unstuffs data from the MEL bitstream
    static inline
    void mel_read(dec_mel_st *melp)
    {
      if (melp->bits > 32) // there are enough bits in the tmp variable
        return;            // return without reading new data

      ui32 val = 0xFFFFFFFF;      // feed in 0xFF if buffer is exhausted
      if (melp->size > 4) {       // if there is data in the MEL segment
        val = *(ui32*)melp->data; // read 32 bits from MEL data
        melp->data += 4;          // advance pointer
        melp->size -= 4;          // reduce counter
      }
      else if (melp->size > 0)
      { // 4 or fewer bytes remain
        int i = 0;
        while (melp->size > 1) {
          ui32 v = *melp->data++;     // read one byte at a time
          ui32 m = ~(0xFFu << i);     // mask of location
          val = (val & m) | (v << i); // put one byte in its correct location
          --melp->size;
          i += 8;
        }
        // size equal to 1
        ui32 v = *melp->data++; // the one before the last is different
        v |= 0xF;               // MEL and VLC segments can overlap
        ui32 m = ~(0xFFu << i);
        val = (val & m) | (v << i);
        --melp->size;
      }

      // next we unstuff the bytes before adding them to the buffer
      int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
                                     // the previously read byte requires
                                     // unstuffing

      // data is unstuffed and accumulated in t
      // bits has the number of bits in t
      ui32 t = val & 0xFF;
      bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
      bits -= unstuff; // there is one less bit in t if unstuffing is needed
      t = t << (8 - unstuff); // move up to make room for the next byte

      // this is a repeat of the above
      t |= (val >> 8) & 0xFF;
      unstuff = (((val >> 8) & 0xFF) == 0xFF);
      bits -= unstuff;
      t = t << (8 - unstuff);

      t |= (val >> 16) & 0xFF;
      unstuff = (((val >> 16) & 0xFF) == 0xFF);
      bits -= unstuff;
      t = t << (8 - unstuff);

      t |= (val >> 24) & 0xFF;
      melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);

      // move t to tmp, and push the result all the way up, so we read from
      // the MSB
      melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
      melp->bits += bits; // increment the number of bits in tmp
    }
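
    //************************************************************************/
    // Illustrative note (not from the original source): MEL bit unstuffing.
    // A 0xFF byte is followed by a stuffed 0 bit in the MSB of the next
    // byte. In the accumulation above, after a 0xFF byte we shift t left
    // by only 7 (8 - unstuff) before OR-ing in the full next byte; this is
    // safe because that byte's MSB is guaranteed to be the stuffed 0. For
    // example, the byte pair 0xFF 0x6C contributes the 15 bits
    // 11111111 1101100 rather than 16.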

    //************************************************************************/
    // Decodes MEL codewords into runs of 0 events, filling the run queue
    static inline
    void mel_decode(dec_mel_st *melp)
    {
      static const int mel_exp[13] = { // MEL exponents
        0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
      };

      if (melp->bits < 6) // if there are fewer than 6 bits in tmp
        mel_read(melp);   // then read from the MEL bitstream
                          // 6 bits is the largest decodable MEL cwd

      // repeat as long as there are enough decodable bits in tmp,
      // and the runs store is not full (num_runs < 8)
      while (melp->bits >= 6 && melp->num_runs < 8)
      {
        int eval = mel_exp[melp->k]; // number of bits associated with state
        int run = 0;
        if (melp->tmp & (1ull << 63)) // the next bit to decode (stored in MSB)
        { // one is found
          run = 1 << eval;
          run--; // consecutive runs of 0 events - 1
          melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12; // increment, max is 12
          melp->tmp <<= 1; // consume one bit from tmp
          melp->bits -= 1;
          run = run << 1;  // a stretch of zeros not terminating in one
        }
        else
        { // 0 is found
          run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
          melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; // decrement, min is 0
          melp->tmp <<= eval + 1; // consume eval + 1 bits (max is 6)
          melp->bits -= eval + 1;
          run = (run << 1) + 1;   // a stretch of zeros terminating with one
        }
        eval = melp->num_runs * 7;           // 7 bits per run
        melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
        melp->runs |= ((ui64)run) << eval;   // store the value in runs
        melp->num_runs++;                    // increment count
      }
    }
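
    //************************************************************************/
    // Illustrative note (not from the original source): run encoding.
    // Each stored run is (number of zero events) * 2, with the LSB set
    // when the run ends in a 1 event. For example, with k = 6 we have
    // eval = mel_exp[6] = 2; reading the bit 1 stores (4 - 1) * 2 = 6
    // (three zero events, no terminating one, k becomes 7), while reading
    // the bits 0 then 10 stores 2 * 2 + 1 = 5 (two zero events followed
    // by a one event, k becomes 5).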

    //************************************************************************/
    // Initializes the MEL decoder
    static inline
    void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
    {
      melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
      melp->bits = 0;                  // 0 bits in tmp
      melp->tmp = 0;                   //
      melp->unstuff = false;           // no unstuffing
      melp->size = scup - 1;           // size is the length of MEL+VLC-1
      melp->k = 0;                     // 0 for state
      melp->num_runs = 0;              // num_runs is 0
      melp->runs = 0;                  //

      // This code is borrowed; the original is for a different architecture.
      // These few lines take care of the case where data is not at a multiple
      // of 4 boundary. They read 1, 2, 3, or up to 4 bytes from the MEL segment.
      int num = 4 - (int)(intptr_t(melp->data) & 0x3);
      for (int i = 0; i < num; ++i) { // this code is similar to mel_read
        assert(melp->unstuff == false || melp->data[0] <= 0x8F);
        ui64 d = (melp->size > 0) ? *melp->data : 0xFF; // if buffer is consumed
                                                        // set data to 0xFF
        if (melp->size == 1) d |= 0xF; // if this is MEL+VLC-1, set LSBs to 0xF
                                       // see the standard
        melp->data += melp->size-- > 0; // increment if the end is not reached
        int d_bits = 8 - melp->unstuff; // if unstuffing is needed, reduce by 1
        melp->tmp = (melp->tmp << d_bits) | d; // store bits in tmp
        melp->bits += d_bits;                  // increment bits by d_bits
        melp->unstuff = ((d & 0xFF) == 0xFF);  // true if the next byte needs
                                               // unstuffing
      }
      melp->tmp <<= (64 - melp->bits); // push all the way up so the first bit
                                       // is the MSB
    }
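
    //************************************************************************/
    // Illustrative note (not from the original source): melp->size is set
    // to scup - 1 because the MEL and VLC segments may overlap in their
    // final byte. The lower 4 bits of the byte at lcup - 2 carry part of
    // scup (see how scup is assembled in ojph_decode_codeblock_avx2), so
    // both mel_read and this initializer substitute 1s for them (d |= 0xF)
    // rather than treating them as MEL data.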

    //************************************************************************/
    // Retrieves one run from the queue, decoding more if the queue is empty
    static inline
    int mel_get_run(dec_mel_st *melp)
    {
      if (melp->num_runs == 0) // if no runs, decode more bits from MEL segment
        mel_decode(melp);

      int t = melp->runs & 0x7F; // retrieve one run
      melp->runs >>= 7;          // remove the retrieved run
      melp->num_runs--;
      return t;                  // return run
    }
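
    //************************************************************************/
    // Illustrative note (not from the original source): callers consume a
    // run two units at a time (run -= 2 per MEL event). Since a run that
    // ends in a 1 event is stored as 2n + 1, the caller's test
    // (run == -1) is true exactly at that terminating 1 event; an even
    // (unterminated) run reaches -2 instead. For example, run = 5 yields
    // 3, 1, -1: two 0 events followed by a 1 event.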

    //************************************************************************/
    // State structure for reading the VLC (and MRP) bitstreams backwards
    struct rev_struct {
      rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
      {}
      // storage
      ui8* data;    // pointer to where to read data
      ui64 tmp;     // temporary buffer of accumulated bits
      ui32 bits;    // number of bits stored in tmp
      int size;     // number of bytes left
      bool unstuff; // true if the last read byte is greater than 0x8F
    };

    //************************************************************************/
    // Reads and unstuffs the VLC bitstream, which is read backwards
    static inline
    void rev_read(rev_struct *vlcp)
    {
      // process 4 bytes at a time
      if (vlcp->bits > 32) // if there are more than 32 bits in tmp, then
        return;            // reading 32 bits can overflow vlcp->tmp
      ui32 val = 0;
      // the next line (the if statement) needs to be tested first
      if (vlcp->size > 3) // if there are more than 3 bytes left in VLC
      {
        // (vlcp->data - 3) moves the pointer back to read 32 bits at once
        val = *(ui32*)(vlcp->data - 3); // then read 32 bits
        vlcp->data -= 4;                // move data pointer back by 4
        vlcp->size -= 4;                // reduce available bytes by 4
      }
      else if (vlcp->size > 0)
      { // 3 or fewer bytes remain
        int i = 24;
        while (vlcp->size > 0) {
          ui32 v = *vlcp->data--; // read one byte at a time
          val |= (v << i);        // put byte in its correct location
          --vlcp->size;
          i -= 8;
        }
      }

      // split the four bytes into separate 32-bit lanes
      __m128i tmp_vec = _mm_set1_epi32((int32_t)val);
      tmp_vec = _mm_srlv_epi32(tmp_vec, _mm_setr_epi32(24, 16, 8, 0));
      tmp_vec = _mm_and_si128(tmp_vec, _mm_set1_epi32(0xff));

      // a byte greater than 0x8F means the next byte may need unstuffing
      __m128i unstuff_vec = _mm_cmpgt_epi32(tmp_vec, _mm_set1_epi32(0x8F));
      bool unstuff_next = _mm_extract_epi32(unstuff_vec, 3);
      unstuff_vec = _mm_slli_si128(unstuff_vec, 4);
      unstuff_vec = _mm_insert_epi32(unstuff_vec, vlcp->unstuff * 0xffffffff, 0);

      // unstuff only when the current byte's low 7 bits are all 1s
      __m128i val_7f = _mm_set1_epi32(0x7F);
      __m128i this_byte_7f =
        _mm_cmpeq_epi32(_mm_and_si128(tmp_vec, val_7f), val_7f);
      unstuff_vec = _mm_and_si128(unstuff_vec, this_byte_7f);
      unstuff_vec = _mm_srli_epi32(unstuff_vec, 31);

      // per-byte bit counts (8, or 7 when unstuffed) and their
      // inclusive prefix sums
      __m128i inc_sum = _mm_sub_epi32(_mm_set1_epi32(8), unstuff_vec);
      inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 4));
      inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 8));
      ui32 total_bits = (ui32)_mm_extract_epi32(inc_sum, 3);

      // shift each byte to its position in the packed word and combine
      __m128i final_shift = _mm_slli_si128(inc_sum, 4); // exclusive scan
      tmp_vec = _mm_sllv_epi32(tmp_vec, final_shift);
      tmp_vec = _mm_or_si128(tmp_vec, _mm_bsrli_si128(tmp_vec, 8));

      ui64 tmp = (ui32)_mm_cvtsi128_si32(tmp_vec)
               | (ui32)_mm_extract_epi32(tmp_vec, 1);

      vlcp->unstuff = unstuff_next;
      vlcp->tmp |= tmp << vlcp->bits;
      vlcp->bits += total_bits;
    }
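
    //************************************************************************/
    // Illustrative note (not from the original source): the VLC segment
    // sits at the end of the CUP segment and is read backwards, toward
    // the MEL segment. The unstuffing rule mirrors the forward one: a
    // byte contributes only 7 bits when its low 7 bits are all 1s and the
    // previously read byte was greater than 0x8F; the dropped bit is a
    // stuffed 0 in the MSB, so OR-ing the full byte remains safe.
    // Unstuffing is rare (on average about once every 256 bytes), so the
    // scalar byte-by-byte fallbacks here are not a bottleneck.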

    //************************************************************************/
    // Initializes the VLC reader; the first byte contributes only 4 bits
    static inline
    void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup)
    {
      // the first byte has only the upper 4 bits
      vlcp->data = data + lcup - 2;

      // size cannot be larger than this; in fact it should be smaller
      vlcp->size = scup - 2;

      ui32 d = *vlcp->data--; // read one byte (this is a half byte)
      vlcp->tmp = d >> 4;     // both initialize and set
      vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); // check the standard
      vlcp->unstuff = (d | 0xF) > 0x8F;        // this is useful for the next byte

      // This code is designed for an architecture where the read address
      // should be aligned to the read size (a multiple of 4 for 32-bit reads).
      // These few lines take care of the case where data is not at a multiple
      // of 4 boundary. They read 1, 2, 3, or up to 4 bytes from the VLC
      // bitstream. To read 32 bits, read from (vlcp->data - 3).
      int num = 1 + (int)(intptr_t(vlcp->data) & 0x3);
      int tnum = num < vlcp->size ? num : vlcp->size;
      for (int i = 0; i < tnum; ++i) {
        ui64 d;
        d = *vlcp->data--; // read one byte and move the read pointer
        // check if the last byte was > 0x8F (unstuff == true) and this is 0x7F
        ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
        vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
        vlcp->bits += d_bits;
        vlcp->unstuff = d > 0x8F;     // for the next byte
      }
      vlcp->size -= tnum;
      rev_read(vlcp); // read another 32 bits
    }

    //************************************************************************/
    // Returns the head of the VLC bitstream, reading more bits if needed
    static inline
    ui32 rev_fetch(rev_struct *vlcp)
    {
      if (vlcp->bits < 32) // if there are fewer than 32 bits, read more
      {
        rev_read(vlcp);      // read 32 bits, but unstuffing might reduce this
        if (vlcp->bits < 32) // if there is still space in vlcp->tmp for 32 bits
          rev_read(vlcp);    // read another 32
      }
      return (ui32)vlcp->tmp; // return the head (bottom-most bits) of vlcp->tmp
    }

    //************************************************************************/
    // Consumes num_bits from the head of the VLC bitstream
    static inline
    ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
    {
      assert(num_bits <= vlcp->bits); // vlcp->tmp must have at least num_bits
      vlcp->tmp >>= num_bits;         // remove bits
      vlcp->bits -= num_bits;         // decrement the number of bits
      return (ui32)vlcp->tmp;
    }

    //************************************************************************/
    // Reads and unstuffs the MRP (MagRef pass) bitstream, read backwards
    static inline
    void rev_read_mrp(rev_struct *mrp)
    {
      // process 4 bytes at a time
      if (mrp->bits > 32)
        return;
      ui32 val = 0;
      if (mrp->size > 3) // if there are more than 3 bytes left
      { // (mrp->data - 3) moves the pointer back to read 32 bits at once
        val = *(ui32*)(mrp->data - 3); // read 32 bits
        mrp->data -= 4;                // move back pointer
        mrp->size -= 4;                // reduce count
      }
      else if (mrp->size > 0)
      {
        int i = 24;
        while (mrp->size > 0) {
          ui32 v = *mrp->data--; // read one byte at a time
          val |= (v << i);       // put byte in its correct location
          --mrp->size;
          i -= 8;
        }
      }

      // accumulate in tmp, and keep count in bits
      ui32 bits, tmp = val >> 24;

      // test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
      bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
      bool unstuff = (val >> 24) > 0x8F;

      // process the next byte
      tmp |= ((val >> 16) & 0xFF) << bits;
      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = ((val >> 16) & 0xFF) > 0x8F;

      tmp |= ((val >> 8) & 0xFF) << bits;
      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = ((val >> 8) & 0xFF) > 0x8F;

      tmp |= (val & 0xFF) << bits;
      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = (val & 0xFF) > 0x8F;

      mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp->tmp
      mrp->bits += bits;
      mrp->unstuff = unstuff; // for the next byte
    }

    //************************************************************************/
    // Initializes the MRP reader
    static inline
    void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
    {
      mrp->data = data + lcup + len2 - 1;
      mrp->size = len2;
      mrp->unstuff = true;
      mrp->bits = 0;
      mrp->tmp = 0;

      // This code is designed for an architecture where the read address
      // should be aligned to the read size (a multiple of 4 for 32-bit reads).
      // These few lines take care of the case where data is not at a multiple
      // of 4 boundary. They read 1, 2, 3, or up to 4 bytes from the MRP stream.
      int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
      for (int i = 0; i < num; ++i) {
        ui64 d;
        // read a byte, 0 if no more data
        d = (mrp->size-- > 0) ? *mrp->data-- : 0;
        // check if unstuffing is needed
        ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
        mrp->tmp |= d << mrp->bits; // move data to mrp->tmp
        mrp->bits += d_bits;
        mrp->unstuff = d > 0x8F;    // for the next byte
      }
      rev_read_mrp(mrp);
    }

    //************************************************************************/
    // Returns the head of the MRP bitstream, reading more bits if needed
    static inline
    ui32 rev_fetch_mrp(rev_struct *mrp)
    {
      if (mrp->bits < 32)    // if there are fewer than 32 bits in mrp->tmp
      {
        rev_read_mrp(mrp);   // read 30-32 bits from mrp
        if (mrp->bits < 32)  // if there is still space for 32 bits
          rev_read_mrp(mrp); // read more
      }
      return (ui32)mrp->tmp; // return the head of mrp->tmp
    }

    //************************************************************************/
    // Consumes num_bits from the head of the MRP bitstream
    inline ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
    {
      assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
      mrp->tmp >>= num_bits;         // discard the lowest num_bits bits
      mrp->bits -= num_bits;
      return (ui32)mrp->tmp;         // return data after consumption
    }

    //************************************************************************/
    // State structure for reading the MagSgn and SigProp bitstreams forward
    struct frwd_struct_avx2 {
      const ui8* data; // pointer to the bitstream
      ui8 tmp[48];     // temporary buffer of read data, plus extra space
      ui32 bits;       // number of bits stored in tmp
      ui32 unstuff;    // 1 if a bit needs to be unstuffed from the next byte
      int size;        // number of bytes remaining
    };
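
    //************************************************************************/
    // Illustrative usage sketch (not from the original source). The
    // forward-growing streams are templated on the fill value X: 0xFF for
    // MagSgn (exhausted data reads as 1s) and 0 for SigProp. A consumer
    // follows the init/fetch/advance pattern used later in this file:
    //
    //   frwd_struct_avx2 magsgn;
    //   frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
    //   __m128i bits = frwd_fetch<0xFF>(&magsgn); // peek at >= 128 bits
    //   frwd_advance(&magsgn, num_consumed_bits); // then discard them
    //
    // Here num_consumed_bits is a placeholder for however many bits the
    // caller actually decoded (it must be less than 128).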

    //************************************************************************/
    // Reads and unstuffs 16 bytes from the bitstream; the template
    // parameter X is the byte used to fill the stream once it is exhausted
    template<int X>
    static inline
    void frwd_read(frwd_struct_avx2 *msp)
    {
      assert(msp->bits <= 128);

      __m128i offset, val, validity, all_xff;
      val = _mm_loadu_si128((__m128i*)msp->data);
      int bytes = msp->size >= 16 ? 16 : msp->size;
      validity = _mm_set1_epi8((char)bytes);
      msp->data += bytes;
      msp->size -= bytes;
      int bits = 128;
      offset = _mm_set_epi64x(0x0F0E0D0C0B0A0908, 0x0706050403020100);
      validity = _mm_cmpgt_epi8(validity, offset);
      all_xff = _mm_set1_epi8(-1);
      if (X == 0xFF) // the compiler should remove this if statement
      {
        __m128i t = _mm_xor_si128(validity, all_xff); // complement
        val = _mm_or_si128(t, val);                   // fill with 0xFF
      }
      else if (X == 0)
        val = _mm_and_si128(validity, val);           // fill with zeros
      else
        assert(0);

      __m128i ff_bytes;
      ff_bytes = _mm_cmpeq_epi8(val, all_xff);
      ff_bytes = _mm_and_si128(ff_bytes, validity);
      ui32 flags = (ui32)_mm_movemask_epi8(ff_bytes);
      flags <<= 1; // unstuff the byte following each 0xFF
      ui32 next_unstuff = flags >> 16;
      flags |= msp->unstuff;
      flags &= 0xFFFF;
      while (flags)
      { // bit unstuffing occurs on average once every 256 bytes,
        // so it is not an issue if this loop is a bit slow;
        // here we process 16 bytes
        --bits; // consuming one stuffing bit

        ui32 loc = 31 - count_leading_zeros(flags);
        flags ^= 1 << loc;

        __m128i m, t, c;
        t = _mm_set1_epi8((char)loc);
        m = _mm_cmpgt_epi8(offset, t);

        t = _mm_and_si128(m, val); // keep bits at locations larger than loc
        c = _mm_srli_epi64(t, 1);  // shift the 64-bit lanes right by 1 bit
        t = _mm_srli_si128(t, 8);  // bring the upper 8 bytes down
        t = _mm_slli_epi64(t, 63); // keep the MSB only
        t = _mm_or_si128(t, c);    // combine the above 3 steps

        val = _mm_or_si128(t, _mm_andnot_si128(m, val));
      }

      // combine with earlier data
      assert(msp->bits >= 0 && msp->bits <= 128);
      int cur_bytes = msp->bits >> 3;
      int cur_bits = msp->bits & 7;
      __m128i b1, b2;
      b1 = _mm_sll_epi64(val, _mm_set1_epi64x(cur_bits));
      b2 = _mm_slli_si128(val, 8); // shift the register left by 8 bytes
      b2 = _mm_srl_epi64(b2, _mm_set1_epi64x(64 - cur_bits));
      b1 = _mm_or_si128(b1, b2);
      b2 = _mm_loadu_si128((__m128i*)(msp->tmp + cur_bytes));
      b2 = _mm_or_si128(b1, b2);
      _mm_storeu_si128((__m128i*)(msp->tmp + cur_bytes), b2);

      int consumed_bits = bits < 128 - cur_bits ? bits : 128 - cur_bits;
      cur_bytes = (msp->bits + (ui32)consumed_bits + 7) >> 3; // round up
      int upper = _mm_extract_epi16(val, 7);
      upper >>= consumed_bits - 128 + 16;
      msp->tmp[cur_bytes] = (ui8)upper; // copy byte

      msp->bits += (ui32)bits;
      msp->unstuff = next_unstuff; // for the next read
      assert(msp->unstuff == 0 || msp->unstuff == 1);
    }
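
    //************************************************************************/
    // Illustrative note (not from the original source): _mm_movemask_epi8
    // collects the MSB of each of the 16 bytes, so after the comparison
    // against 0xFF, bit i of flags is set when byte i equals 0xFF.
    // Shifting flags left by 1 marks the byte *after* each 0xFF as the
    // one carrying a stuffed bit, and bit 16, which falls outside this
    // read, is carried into the next call through next_unstuff.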

    //************************************************************************/
    // Initializes the forward reading structure; X is the fill byte
    template<int X>
    static inline
    void frwd_init(frwd_struct_avx2 *msp, const ui8* data, int size)
    {
      msp->data = data;
      _mm_storeu_si128((__m128i *)msp->tmp, _mm_setzero_si128());
      _mm_storeu_si128((__m128i *)msp->tmp + 1, _mm_setzero_si128());
      _mm_storeu_si128((__m128i *)msp->tmp + 2, _mm_setzero_si128());

      msp->bits = 0;
      msp->unstuff = 0;
      msp->size = size;

      frwd_read<X>(msp); // read 128 bits more
    }

    //************************************************************************/
    // Consumes num_bits from the head of the forward bitstream
    static inline
    void frwd_advance(frwd_struct_avx2 *msp, ui32 num_bits)
    {
      assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
      msp->bits -= num_bits;

      __m128i *p = (__m128i*)(msp->tmp + ((num_bits >> 3) & 0x18));
      num_bits &= 63;

      __m128i v0, v1, c0, c1, t;
      v0 = _mm_loadu_si128(p);
      v1 = _mm_loadu_si128(p + 1);

      // shift right by num_bits
      c0 = _mm_srl_epi64(v0, _mm_set1_epi64x(num_bits));
      t = _mm_srli_si128(v0, 8);
      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
      c0 = _mm_or_si128(c0, t);
      t = _mm_slli_si128(v1, 8);
      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
      c0 = _mm_or_si128(c0, t);

      _mm_storeu_si128((__m128i*)msp->tmp, c0);

      c1 = _mm_srl_epi64(v1, _mm_set1_epi64x(num_bits));
      t = _mm_srli_si128(v1, 8);
      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
      c1 = _mm_or_si128(c1, t);

      _mm_storeu_si128((__m128i*)msp->tmp + 1, c1);
    }

    //************************************************************************/
    // Fetches 128 bits from the head of the forward bitstream,
    // reading more data if needed
    template<int X>
    static inline
    __m128i frwd_fetch(frwd_struct_avx2 *msp)
    {
      if (msp->bits <= 128)
      {
        frwd_read<X>(msp);
        if (msp->bits <= 128) // need to test
          frwd_read<X>(msp);
      }
      __m128i t = _mm_loadu_si128((__m128i*)msp->tmp);
      return t;
    }

    //************************************************************************/
    // Decodes a pair of horizontally adjacent quads (8 samples)
    // at 32 bits per sample
    static inline
    __m256i decode_two_quad32_avx2(__m256i inf_u_q, __m256i U_q,
                                   frwd_struct_avx2* magsgn, ui32 p,
                                   __m128i& vn)
    {
      __m256i row = _mm256_setzero_si256();

      // we keep e_k, e_1, and rho in flags
      __m256i flags = _mm256_and_si256(inf_u_q,
        _mm256_set_epi32(0x8880, 0x4440, 0x2220, 0x1110,
                         0x8880, 0x4440, 0x2220, 0x1110));
      __m256i insig = _mm256_cmpeq_epi32(flags, _mm256_setzero_si256());

      // are all samples insignificant?
      if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF)
      {
        flags = _mm256_mullo_epi16(flags,
          _mm256_set_epi16(1, 1, 2, 2, 4, 4, 8, 8, 1, 1, 2, 2, 4, 4, 8, 8));

        // U_q holds U_q for this quad
        // flags has e_k, e_1, and rho such that e_k is sitting in
        // 0x8000, e_1 in 0x800, and rho in 0x80

        // next e_k and m_n
        __m256i m_n;
        __m256i w0 = _mm256_srli_epi32(flags, 15); // e_k
        m_n = _mm256_sub_epi32(U_q, w0);
        m_n = _mm256_andnot_si256(insig, m_n);

        // find cumulative sums
        // to find at which bit in ms_vec the sample starts
        __m256i inc_sum = m_n; // inclusive scan
        inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
        inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
        int total_mn1 = _mm256_extract_epi16(inc_sum, 6);
        int total_mn2 = _mm256_extract_epi16(inc_sum, 14);

        __m128i ms_vec0 = _mm_setzero_si128();
        __m128i ms_vec1 = _mm_setzero_si128();
        if (total_mn1) {
          ms_vec0 = frwd_fetch<0xFF>(magsgn);
          frwd_advance(magsgn, (ui32)total_mn1);
        }
        if (total_mn2) {
          ms_vec1 = frwd_fetch<0xFF>(magsgn);
          frwd_advance(magsgn, (ui32)total_mn2);
        }

        __m256i ms_vec =
          _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);

        __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 4); // exclusive scan

        // find the starting byte and starting bit
        __m256i byte_idx = _mm256_srli_epi32(ex_sum, 3);
        __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi32(7));
        byte_idx = _mm256_shuffle_epi8(byte_idx,
          _mm256_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000,
                           0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000));
        byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x03020100));
        __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
        byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x01010101));
        __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);

        // shift sample values to their correct locations
        bit_idx = _mm256_or_si256(bit_idx, _mm256_slli_epi32(bit_idx, 16));

        __m128i a = _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
                                 1, 3, 7, 15, 31, 63, 127, -1);
        __m256i aa = _mm256_inserti128_si256(_mm256_castsi128_si256(a), a, 0x1);

        __m256i bit_shift = _mm256_shuffle_epi8(aa, bit_idx);
        bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
        d0 = _mm256_mullo_epi16(d0, bit_shift);
        d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB
        d1 = _mm256_mullo_epi16(d1, bit_shift);
        d1 = _mm256_and_si256(d1, _mm256_set1_epi32((si32)0xFF00FF00)); // 8 in MSB
        d0 = _mm256_or_si256(d0, d1);

        // find the location of e_k and mask
        __m256i shift;
        __m256i ones = _mm256_set1_epi32(1);
        __m256i twos = _mm256_set1_epi32(2);
        __m256i U_q_m1 = _mm256_sub_epi32(U_q, ones);
        U_q_m1 = _mm256_and_si256(U_q_m1,
          _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F));
        U_q_m1 = _mm256_shuffle_epi32(U_q_m1, 0);
        w0 = _mm256_sub_epi32(twos, w0);
        shift = _mm256_sllv_epi32(w0, U_q_m1); // U_q_m1 must be no more than 31
        ms_vec = _mm256_and_si256(d0, _mm256_sub_epi32(shift, ones));

        // next e_1
        w0 = _mm256_and_si256(flags, _mm256_set1_epi32(0x800));
        w0 = _mm256_cmpeq_epi32(w0, _mm256_setzero_si256());
        w0 = _mm256_andnot_si256(w0, shift);    // e_1 in correct position
        ms_vec = _mm256_or_si256(ms_vec, w0);   // e_1
        w0 = _mm256_slli_epi32(ms_vec, 31);     // sign
        ms_vec = _mm256_or_si256(ms_vec, ones); // bin center
        __m256i tvn = ms_vec;
        ms_vec = _mm256_add_epi32(ms_vec, twos);        // + 2
        ms_vec = _mm256_slli_epi32(ms_vec, (si32)p - 1);
        ms_vec = _mm256_or_si256(ms_vec, w0);           // sign
        row = _mm256_andnot_si256(insig, ms_vec);       // significant only

        ms_vec = _mm256_andnot_si256(insig, tvn);       // significant only

        tvn = _mm256_shuffle_epi8(ms_vec,
          _mm256_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1,
                           -1, -1, 0x0F0E0D0C, 0x07060504));

        vn = _mm_or_si128(vn, _mm256_castsi256_si128(tvn));
        vn = _mm_or_si128(vn, _mm256_extracti128_si256(tvn, 0x1));
      }
      return row;
    }
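
    //************************************************************************/
    // Illustrative note (not from the original source): the prefix-sum
    // trick above. For 32-bit lanes [a, b, c, d] (low to high) within each
    // 128-bit half, adding a copy shifted by 4 bytes and then by 8 bytes
    // yields the inclusive scan [a, a+b, a+b+c, a+b+c+d]; shifting that by
    // one more lane gives the exclusive scan [0, a, a+b, a+b+c], i.e.,
    // the bit offset at which each sample's codeword starts in ms_vec.
    // For example, m_n = [3, 2, 4, 1] gives offsets [0, 3, 5, 9] and a
    // total of 10 bits to consume with frwd_advance.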

    //************************************************************************/
    // Decodes four horizontally adjacent quads (16 samples)
    // at 16 bits per sample
    static inline
    __m256i decode_four_quad16(const __m128i inf_u_q, __m128i U_q,
                               frwd_struct_avx2* magsgn, ui32 p, __m128i& vn)
    {
      __m256i w0;    // workers
      __m256i insig; // lanes hold FFs if samples are insignificant
      __m256i flags; // lanes hold e_k, e_1, and rho

      __m256i row = _mm256_setzero_si256();
      __m128i ddd = _mm_shuffle_epi8(inf_u_q,
        _mm_set_epi16(0x0D0C, 0x0D0C, 0x0908, 0x0908,
                      0x0504, 0x0504, 0x0100, 0x0100));
      w0 = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
        _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
      // we keep e_k, e_1, and rho in flags
      flags = _mm256_and_si256(w0,
        _mm256_set_epi16((si16)0x8880, 0x4440, 0x2220, 0x1110,
                         (si16)0x8880, 0x4440, 0x2220, 0x1110,
                         (si16)0x8880, 0x4440, 0x2220, 0x1110,
                         (si16)0x8880, 0x4440, 0x2220, 0x1110));
      insig = _mm256_cmpeq_epi16(flags, _mm256_setzero_si256());
      // are all samples insignificant?
      if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF)
      {
        ddd = _mm_or_si128(_mm_bslli_si128(U_q, 2), U_q);
        __m256i U_q_avx =
          _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
            _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
        flags = _mm256_mullo_epi16(flags,
          _mm256_set_epi16(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8));

        // U_q holds U_q for this quad
        // flags has e_k, e_1, and rho such that e_k is sitting in
        // 0x8000, e_1 in 0x800, and rho in 0x80

        // next e_k and m_n
        __m256i m_n;
        w0 = _mm256_srli_epi16(flags, 15); // e_k
        m_n = _mm256_sub_epi16(U_q_avx, w0);
        m_n = _mm256_andnot_si256(insig, m_n);

        // find cumulative sums
        // to find at which bit in ms_vec the sample starts
        __m256i inc_sum = m_n; // inclusive scan
        inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 2));
        inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
        inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
        int total_mn1 = _mm256_extract_epi16(inc_sum, 7);
        int total_mn2 = _mm256_extract_epi16(inc_sum, 15);
        __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 2); // exclusive scan

        __m128i ms_vec0 = _mm_setzero_si128();
        __m128i ms_vec1 = _mm_setzero_si128();
        if (total_mn1) {
          ms_vec0 = frwd_fetch<0xFF>(magsgn);
          frwd_advance(magsgn, (ui32)total_mn1);
        }
        if (total_mn2) {
          ms_vec1 = frwd_fetch<0xFF>(magsgn);
          frwd_advance(magsgn, (ui32)total_mn2);
        }

        __m256i ms_vec =
          _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);

        // find the starting byte and starting bit
        __m256i byte_idx = _mm256_srli_epi16(ex_sum, 3);
        __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi16(7));
        byte_idx = _mm256_shuffle_epi8(byte_idx,
          _mm256_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
                           0x0606, 0x0404, 0x0202, 0x0000,
                           0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
                           0x0606, 0x0404, 0x0202, 0x0000));
        byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0100));
        __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
        byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0101));
        __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);

        // shift sample values to their correct locations
        __m256i bit_shift = _mm256_shuffle_epi8(
          _mm256_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
                          1, 3, 7, 15, 31, 63, 127, -1,
                          1, 3, 7, 15, 31, 63, 127, -1,
                          1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
        bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
        d0 = _mm256_mullo_epi16(d0, bit_shift);
        d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB
        d1 = _mm256_mullo_epi16(d1, bit_shift);
        d1 = _mm256_and_si256(d1, _mm256_set1_epi16((si16)0xFF00)); // 8 in MSB
        d0 = _mm256_or_si256(d0, d1);

        // find the location of e_k and mask
        __m256i shift, t0, t1, Uq0, Uq1;
        __m256i ones = _mm256_set1_epi16(1);
        __m256i twos = _mm256_set1_epi16(2);
        __m256i U_q_m1 = _mm256_sub_epi32(U_q_avx, ones);
        Uq0 = _mm256_and_si256(U_q_m1,
          _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F));
        Uq1 = _mm256_bsrli_epi128(U_q_m1, 14);
        w0 = _mm256_sub_epi16(twos, w0);
        t0 = _mm256_and_si256(w0, _mm256_set_epi64x(0, -1, 0, -1));
        t1 = _mm256_and_si256(w0, _mm256_set_epi64x(-1, 0, -1, 0));
        { // there is no _mm256_sllv_epi16 in AVX2
          __m128i t_0_sse = _mm256_castsi256_si128(t0);
          t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq0));
          __m128i t_1_sse = _mm256_extracti128_si256(t0, 0x1);
          t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq0, 0x1));
          t0 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse),
                                       t_1_sse, 0x1);

          t_0_sse = _mm256_castsi256_si128(t1);
          t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq1));
          t_1_sse = _mm256_extracti128_si256(t1, 0x1);
          t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq1, 0x1));
          t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse),
                                       t_1_sse, 0x1);
        }
        shift = _mm256_or_si256(t0, t1);
        ms_vec = _mm256_and_si256(d0, _mm256_sub_epi16(shift, ones));

        // next e_1
        w0 = _mm256_and_si256(flags, _mm256_set1_epi16(0x800));
        w0 = _mm256_cmpeq_epi16(w0, _mm256_setzero_si256());
        w0 = _mm256_andnot_si256(w0, shift);    // e_1 in correct position
        ms_vec = _mm256_or_si256(ms_vec, w0);   // e_1
        w0 = _mm256_slli_epi16(ms_vec, 15);     // sign
        ms_vec = _mm256_or_si256(ms_vec, ones); // bin center
        __m256i tvn = ms_vec;
        ms_vec = _mm256_add_epi16(ms_vec, twos);        // + 2
        ms_vec = _mm256_slli_epi16(ms_vec, (si32)p - 1);
        ms_vec = _mm256_or_si256(ms_vec, w0);           // sign
        row = _mm256_andnot_si256(insig, ms_vec);       // significant only

        ms_vec = _mm256_andnot_si256(insig, tvn);       // significant only

        __m256i ms_vec_shuffle1 = _mm256_shuffle_epi8(ms_vec,
          _mm256_set_epi16(-1, -1, -1, -1, 0x0706, 0x0302, -1, -1,
                           -1, -1, -1, -1, -1, -1, 0x0706, 0x0302));
        __m256i ms_vec_shuffle2 = _mm256_shuffle_epi8(ms_vec,
          _mm256_set_epi16(-1, -1, -1, 0x0F0E, 0x0B0A, -1, -1, -1,
                           -1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1));
        ms_vec = _mm256_or_si256(ms_vec_shuffle1, ms_vec_shuffle2);

        vn = _mm_or_si128(vn, _mm256_castsi256_si128(ms_vec));
        vn = _mm_or_si128(vn, _mm256_extracti128_si256(ms_vec, 0x1));
      }
      return row;
    }

    //************************************************************************/
    // Counts the leading zeros of each 32-bit lane
    // https://stackoverflow.com/a/58827596
    inline __m256i avx2_lzcnt_epi32(__m256i v) {
      // prevent the value from being rounded up to the next power of two
      v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v); // keep the 8 MSBs

      v = _mm256_castps_si256(_mm256_cvtepi32_ps(v));   // convert int to float
      v = _mm256_srli_epi32(v, 23);                     // shift down the exponent
      v = _mm256_subs_epu16(_mm256_set1_epi32(158), v); // undo bias
      v = _mm256_min_epi16(v, _mm256_set1_epi32(32));   // clamp at 32

      return v;
    }
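
    //************************************************************************/
    // Illustrative note (not from the original source): the float trick.
    // Converting v to float stores floor(log2(v)) + 127 in the exponent
    // field, so 158 - (v >> 23) equals 31 - floor(log2(v)), the leading
    // zero count. For example, v = 1 converts to exponent 127, giving
    // 158 - 127 = 31; v = 4 gives exponent 129 and a count of 29; and
    // v = 0 converts to 0.0, where the saturating subtraction and the
    // final clamp produce 32.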

    //************************************************************************/
    // Decodes one codeblock; this is the AVX2 entry point
    bool ojph_decode_codeblock_avx2(ui8* coded_data, ui32* decoded_data,
                                    ui32 missing_msbs, ui32 num_passes,
                                    ui32 lengths1, ui32 lengths2,
                                    ui32 width, ui32 height, ui32 stride,
                                    bool stripe_causal)
    {
      static bool insufficient_precision = false;
      static bool modify_code = false;
      static bool truncate_spp_mrp = false;

      if (num_passes > 1 && lengths2 == 0)
      {
        OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
                              "one coding pass, but zero length for "
                              "2nd and potential 3rd pass.");
        num_passes = 1;
      }

      if (num_passes > 3)
      {
        OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
                              "this codeblock has %d passes.",
                              num_passes);
        return false;
      }

      if (missing_msbs > 30) // p < 0
      {
        if (insufficient_precision == false)
        {
          insufficient_precision = true;
          OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
                                "codeblock. This message will not be "
                                "displayed again.");
        }
        return false;
      }
      else if (missing_msbs == 30) // p == 0
      { // not enough precision to decode and set the bin center to 1
        if (modify_code == false) {
          modify_code = true;
          OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
                                "pass. The code can be modified to support "
                                "this case. This message will not be "
                                "displayed again.");
        }
        return false; // 32 bits are not enough to decode this
      }
      else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
      {
        if (num_passes > 1) {
          num_passes = 1;
          if (truncate_spp_mrp == false) {
            truncate_spp_mrp = true;
            OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
                                  "nor MagRef passes; both will be skipped. "
                                  "This message will not be displayed "
                                  "again.");
          }
        }
      }
      ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP
      // There is a way to handle the case of p == 0, but a different path
      // is required

      if (lengths1 < 2)
      {
        OJPH_WARN(0x00010006, "Wrong codeblock length.");
        return false;
      }

      // read scup and fix the bytes there
      int lcup, scup;
      lcup = (int)lengths1; // length of CUP
      // scup is the length of MEL + VLC
      scup = (((int)coded_data[lcup - 1]) << 4) + (coded_data[lcup - 2] & 0xF);
      if (scup < 2 || scup > lcup || scup > 4079) // something is wrong
        return false;

      // The temporary storage scratch holds two types of data in an
      // interleaved fashion; the interleaving allows us to use one
      // memory pointer.
      // We have one entry for a decoded VLC code, and one entry for UVLC.
      // Entries are 16 bits each, corresponding to one quad,
      // but since we want to use XMM registers of the SSE family
      // of SIMD, we allocate 16 bytes or more per quad row; that is,
      // the width is no smaller than 16 bytes (or 8 entries), and the
      // height is 512 quads.
      // Each VLC entry contains, in the following order, starting
      // from MSB
      // e_k (4 bits), e_1 (4 bits), rho (4 bits), unused for step 2 (4 bits)
      // Each entry in UVLC contains u_q.
      // One extra row to handle the case of SPP propagating downwards
      // when the codeblock width is 4.
      ui16 scratch[8 * 513] = {0}; // 8+ kB
1161
1162 // We need an extra two entries (one inf and one u_q) beyond
1163 // the last column.
1164 // If the block width is 4 (2 quads), then we use sstr of 8
1165 // (enough for 4 quads). If width is 8 (4 quads) we use
1166 // sstr is 16 (enough for 8 quads). For a width of 16 (8
1167 // quads), we use 24 (enough for 12 quads).
1168 ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
1169
1170 assert((stride & 0x3) == 0);
1171
1172 ui32 mmsbp2 = missing_msbs + 2;
1173
1174 // The cleanup pass is decoded in two steps; in step one,
1175 // the VLC and MEL segments are decoded, generating a record that
1176 // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k.
1177 // This information should be sufficient for the next step.
1178 // In step 2, we decode the MagSgn segment.
1179
1180 // step 1 decoding VLC and MEL segments
1181 {
1182 // init structures
1183 dec_mel_st mel;
1184 mel_init(&mel, coded_data, lcup, scup);
1185 rev_struct vlc;
1186 rev_init(&vlc, coded_data, lcup, scup);
1187
1188 int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
1189 // data represented as runs of 0 events
1190 // See mel_decode description
1191
1192 ui32 vlc_val;
1193 ui32 c_q = 0;
1194 ui16 *sp = scratch;
1195 //initial quad row
1196 for (ui32 x = 0; x < width; sp += 4)
1197 {
1198 // decode VLC
1200
1201 // first quad
1202 vlc_val = rev_fetch(&vlc);
1203
1204 //decode VLC using the context c_q and the head of VLC bitstream
1205 ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
1206
1207 // if context is zero, use one MEL event
1208 if (c_q == 0) //zero context
1209 {
1210 run -= 2; //subtract 2, since events number if multiplied by 2
1211
1212 // Is the run terminated in 1? if so, use decoded VLC code,
1213 // otherwise, discard decoded data, since we will decoded again
1214 // using a different context
1215 t0 = (run == -1) ? t0 : 0;
1216
1217 // is run -1 or -2? this means a run has been consumed
1218 if (run < 0)
1219 run = mel_get_run(&mel); // get another run
1220 }
1221 //run -= (c_q == 0) ? 2 : 0;
1222 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1223 //if (run < 0)
1224 // run = mel_get_run(&mel); // get another run
1225 sp[0] = t0;
1226 x += 2;
1227
1228 // prepare context for the next quad; eqn. 1 in ITU T.814
1229 c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
1230
1231 //remove data from vlc stream (0 bits are removed if vlc is not used)
1232 vlc_val = rev_advance(&vlc, t0 & 0x7);
1233
1234 //second quad
1235 ui16 t1 = 0;
1236
1237 //decode VLC using the context c_q and the head of VLC bitstream
1238 t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
1239
1240 // if context is zero, use one MEL event
1241 if (c_q == 0 && x < width) //zero context
1242 {
1243 run -= 2; //subtract 2, since events number if multiplied by 2
1244
1245 // if event is 0, discard decoded t1
1246 t1 = (run == -1) ? t1 : 0;
1247
1248 if (run < 0) // have we consumed all events in a run
1249 run = mel_get_run(&mel); // if yes, then get another run
1250 }
1251 t1 = x < width ? t1 : 0;
1252 //run -= (c_q == 0 && x < width) ? 2 : 0;
1253 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1254 //if (run < 0)
1255 // run = mel_get_run(&mel); // get another run
1256 sp[2] = t1;
1257 x += 2;
1258
1259 //prepare context for the next quad, eqn. 1 in ITU T.814
1260 c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
1261
1262 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1263 vlc_val = rev_advance(&vlc, t1 & 0x7);
1264
1265 // decode u
1267 // uvlc_mode is made up of u_offset bits from the quad pair
1268 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1269 if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
1270 { // the MEL run of events
1271 run -= 2; //subtract 2, since events number if multiplied by 2
1272
1273 uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
1274 // is 0x40
1275
1276 if (run < 0)//if run is consumed (run is -1 or -2), get another run
1277 run = mel_get_run(&mel);
1278 }
1279 //run -= (uvlc_mode == 0xc0) ? 2 : 0;
1280 //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
1281 //if (run < 0)
1282 // run = mel_get_run(&mel); // get another run
1283
1284 //decode uvlc_mode to get u for both quads
1285 ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
1286 //remove total prefix length
1287 vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1288 uvlc_entry >>= 3;
1289 //extract suffixes for quad 0 and 1
1290 ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1291 ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1292 vlc_val = rev_advance(&vlc, len);
1293 ojph_unused(vlc_val); //static code analysis: unused value
1294 uvlc_entry >>= 4;
1295 // quad 0 length
1296 len = uvlc_entry & 0x7; // quad 0 suffix length
1297 uvlc_entry >>= 3;
1298 ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kap. 1
1299 sp[1] = u_q;
1300 u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len)); //kappa == 1
1301 sp[3] = u_q;
1302 }
1303 sp[0] = sp[1] = 0;
1304
1305 //non initial quad rows
1306 for (ui32 y = 2; y < height; y += 2)
1307 {
1308 c_q = 0; // context
1309 ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
1310
1311 for (ui32 x = 0; x < width; sp += 4)
1312 {
1313 // decode VLC
1315
1316 // sigma_q (n, ne, nf)
1317 c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
1318 c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
1319
1320 // first quad
1321 vlc_val = rev_fetch(&vlc);
1322
1323 //decode VLC using the context c_q and the head of VLC bitstream
1324 ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
1325
1326 // if context is zero, use one MEL event
1327 if (c_q == 0) //zero context
1328 {
1329 run -= 2; //subtract 2, since events number is multiplied by 2
1330
1331 // Is the run terminated in 1? if so, use decoded VLC code,
1332 // otherwise, discard decoded data, since we will decoded again
1333 // using a different context
1334 t0 = (run == -1) ? t0 : 0;
1335
1336 // is run -1 or -2? this means a run has been consumed
1337 if (run < 0)
1338 run = mel_get_run(&mel); // get another run
1339 }
1340 //run -= (c_q == 0) ? 2 : 0;
1341 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1342 //if (run < 0)
1343 // run = mel_get_run(&mel); // get another run
1344 sp[0] = t0;
1345 x += 2;
1346
1347 // prepare context for the next quad; eqn. 2 in ITU T.814
1348 // sigma_q (w, sw)
1349 c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
1350 // sigma_q (nw)
1351 c_q |= sp[0 - (si32)sstr] & 0x80;
1352 // sigma_q (n, ne, nf)
1353 c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
1354 c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
1355
1356 //remove data from vlc stream (0 bits are removed if vlc is unused)
1357 vlc_val = rev_advance(&vlc, t0 & 0x7);
1358
1359 //second quad
1360 ui16 t1 = 0;
1361
1362 //decode VLC using the context c_q and the head of VLC bitstream
1363 t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
1364
1365 // if context is zero, use one MEL event
1366 if (c_q == 0 && x < width) //zero context
1367 {
1368 run -= 2; //subtract 2, since events number if multiplied by 2
1369
1370 // if event is 0, discard decoded t1
1371 t1 = (run == -1) ? t1 : 0;
1372
1373 if (run < 0) // have we consumed all events in a run
1374 run = mel_get_run(&mel); // if yes, then get another run
1375 }
1376 t1 = x < width ? t1 : 0;
1377 //run -= (c_q == 0 && x < width) ? 2 : 0;
1378 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1379 //if (run < 0)
1380 // run = mel_get_run(&mel); // get another run
1381 sp[2] = t1;
1382 x += 2;
1383
1384 // partial c_q, will be completed when we process the next quad
1385 // sigma_q (w, sw)
1386 c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
1387 // sigma_q (nw)
1388 c_q |= sp[2 - (si32)sstr] & 0x80;
1389
1390 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1391 vlc_val = rev_advance(&vlc, t1 & 0x7);
1392
1393 // decode u
1395 // uvlc_mode is made up of u_offset bits from the quad pair
1396 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1397 ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
1398 //remove total prefix length
1399 vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1400 uvlc_entry >>= 3;
1401 //extract suffixes for quad 0 and 1
1402 ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1403 ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1404 vlc_val = rev_advance(&vlc, len);
1405 ojph_unused(vlc_val); //static code analysis: unused value
1406 uvlc_entry >>= 4;
1407 // quad 0 length
1408 len = uvlc_entry & 0x7; // quad 0 suffix length
1409 uvlc_entry >>= 3;
1410 ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
1411 sp[1] = u_q;
1412 u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
1413 sp[3] = u_q;
1414 }
1415 sp[0] = sp[1] = 0;
1416 }
1417 }

      // step 2: we decode the MagSgn segment
      // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit)
      // The 32-bit path can decode 16-bit data, for which one would think
      // 16 bits are enough, because we want to put a bit in the center of
      // the bin.
      // If mmsbp2 equals 16 bits, with reversible coding and no missing
      // bitplanes, we could decode using the 16-bit path, but we are not
      // doing that here.
      if (mmsbp2 >= 16)
      {
        // We allocate a scratch row for storing v_n values.
        // We have 512 quads horizontally.
        // We may go beyond the last entry by up to 4 entries;
        // here we allocate an additional 8 entries.
        // There are two rows in this structure; the bottom
        // row is used to store processed entries.
        const int v_n_size = 512 + 16;
        ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB

        frwd_struct_avx2 magsgn;
        frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);

        const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2);

        {
          ui16 *sp = scratch;
          ui32 *vp = v_n_scratch;
          ui32 *dp = decoded_data;
          vp[0] = 2; // for easy calculation of emax

          for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
          {
            __m128i vn = _mm_set1_epi32(2);

            __m256i inf_u_q =
              _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
            inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q,
              _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));

            __m256i U_q = _mm256_srli_epi32(inf_u_q, 16);
            __m256i w = _mm256_cmpgt_epi32(U_q, avx_mmsbp2);
            if (!_mm256_testz_si256(w, w)) {
              return false;
            }

            __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, &magsgn, p, vn);
            row = _mm256_permutevar8x32_epi32(row,
              _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
            _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
            _mm_store_si128((__m128i*)(dp + stride),
                            _mm256_extracti128_si256(row, 0x1));

            __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp);
            w0 = _mm_or_si128(w0, vn);
            _mm_storeu_si128((__m128i*)vp, w0);
          }
        }

        for (ui32 y = 2; y < height; y += 2)
        {
          {
            // perform 31 - count_leading_zeros(*vp) here
            ui32 *vp = v_n_scratch;
            ui16* sp = scratch + (y >> 1) * sstr;

            const __m256i avx_31 = _mm256_set1_epi32(31);
            const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
            const __m256i avx_1 = _mm256_set1_epi32(1);
            const __m256i avx_0 = _mm256_setzero_si256();

            for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16) {
              __m256i v = _mm256_loadu_si256((__m256i*)vp);
              __m256i v_p1 = _mm256_loadu_si256((__m256i*)(vp + 1));
              v = _mm256_or_si256(v, v_p1);
              v = avx2_lzcnt_epi32(v);
              v = _mm256_sub_epi32(avx_31, v);

              __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
              __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
              __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
              gamma = _mm256_and_si256(gamma, w0);
              gamma = _mm256_cmpeq_epi32(gamma, avx_0);

              v = _mm256_andnot_si256(gamma, v);
              v = _mm256_max_epi32(v, avx_1);

              inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
              v = _mm256_add_epi32(inf_u_q, v);

              w0 = _mm256_cmpgt_epi32(v, avx_mmsbp2);
              if (!_mm256_testz_si256(w0, w0)) {
                return false;
              }

              _mm256_storeu_si256((__m256i*)(vp + v_n_size), v);
            }
          }

          ui32 *vp = v_n_scratch;
          ui16 *sp = scratch + (y >> 1) * sstr;
          ui32 *dp = decoded_data + y * stride;
          vp[0] = 2; // for easy calculation of emax

          for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) {
            // process two quads
            __m128i vn = _mm_set1_epi32(2);

            __m256i inf_u_q =
              _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
            inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q,
              _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));

            __m256i U_q =
              _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)(vp + v_n_size)));
            U_q = _mm256_permutevar8x32_epi32(U_q,
              _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));

            __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, &magsgn, p, vn);
            row = _mm256_permutevar8x32_epi32(row,
              _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
            _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
            _mm_store_si128((__m128i*)(dp + stride),
                            _mm256_extracti128_si256(row, 0x1));

            __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp);
            w0 = _mm_or_si128(w0, vn);
            _mm_storeu_si128((__m128i*)vp, w0);
          }
        }
      }
      else {

        // reduce the bitplane by 16 because we now have 16 bits instead of 32
        p -= 16;

        // We allocate a scratch row for storing v_n values.
        // We have 512 quads horizontally.
        // We may go beyond the last entry by up to 8 entries;
        // therefore we allocate an additional 8 entries.
        // There are two rows in this structure; the bottom
        // row is used to store processed entries.
        const int v_n_size = 512 + 16;
        ui16 v_n_scratch[v_n_size] = {0};    // 1+ kB
        ui32 v_n_scratch_32[v_n_size] = {0}; // 2+ kB

        frwd_struct_avx2 magsgn;
        frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);

        {
          ui16 *sp = scratch;
          ui16 *vp = v_n_scratch;
          ui32 *dp = decoded_data;
          vp[0] = 2; // for easy calculation of emax

          for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8) {
            __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
            __m128i U_q = _mm_srli_epi32(inf_u_q, 16);
            __m128i w = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
            if (!_mm_testz_si128(w, w)) {
              return false;
            }

            __m128i vn = _mm_set1_epi16(2);
            __m256i row = decode_four_quad16(inf_u_q, U_q, &magsgn, p, vn);

            w = _mm_cvtsi32_si128(*(unsigned short const*)(vp));
            _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));

            __m256i w0 = _mm256_shuffle_epi8(row,
              _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1,
                               0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
            __m256i w1 = _mm256_shuffle_epi8(row,
              _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1,
                               0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));

            _mm256_storeu_si256((__m256i*)dp, w0);
            _mm256_storeu_si256((__m256i*)(dp + stride), w1);
          }
        }

        for (ui32 y = 2; y < height; y += 2) {
          {
            // perform 15 - count_leading_zeros(*vp) here
            ui16 *vp = v_n_scratch;
            ui32 *vp_32 = v_n_scratch_32;

            ui16* sp = scratch + (y >> 1) * sstr;
            const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2);
            const __m256i avx_31 = _mm256_set1_epi32(31);
            const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
            const __m256i avx_1 = _mm256_set1_epi32(1);
            const __m256i avx_0 = _mm256_setzero_si256();

            for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16, vp_32 += 8) {
              __m128i v = _mm_loadu_si128((__m128i*)vp);
              __m128i v_p1 = _mm_loadu_si128((__m128i*)(vp + 1));
              v = _mm_or_si128(v, v_p1);

              __m256i v_avx = _mm256_cvtepu16_epi32(v);
              v_avx = avx2_lzcnt_epi32(v_avx);
              v_avx = _mm256_sub_epi32(avx_31, v_avx);

              __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
              __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
              __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
              gamma = _mm256_and_si256(gamma, w0);
              gamma = _mm256_cmpeq_epi32(gamma, avx_0);

              v_avx = _mm256_andnot_si256(gamma, v_avx);
              v_avx = _mm256_max_epi32(v_avx, avx_1);

              inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
              v_avx = _mm256_add_epi32(inf_u_q, v_avx);

              w0 = _mm256_cmpgt_epi32(v_avx, avx_mmsbp2);
              if (!_mm256_testz_si256(w0, w0)) {
                return false;
              }

              _mm256_storeu_si256((__m256i*)vp_32, v_avx);
            }
          }

          ui16 *vp = v_n_scratch;
          ui32* vp_32 = v_n_scratch_32;
          ui16 *sp = scratch + (y >> 1) * sstr;
          ui32 *dp = decoded_data + y * stride;
          vp[0] = 2; // for easy calculation of emax

          for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8, vp_32 += 4) {
            __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
            __m128i U_q = _mm_loadu_si128((__m128i*)vp_32);

            __m128i vn = _mm_set1_epi16(2);
            __m256i row = decode_four_quad16(inf_u_q, U_q, &magsgn, p, vn);

            __m128i w = _mm_cvtsi32_si128(*(unsigned short const*)(vp));
            _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));

            __m256i w0 = _mm256_shuffle_epi8(row,
              _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1,
                               0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
            __m256i w1 = _mm256_shuffle_epi8(row,
              _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1,
                               0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));

            _mm256_storeu_si256((__m256i*)dp, w0);
            _mm256_storeu_si256((__m256i*)(dp + stride), w1);
          }
        }

        // increase the bitplane back by 16 because we need to process 32 bits
        p += 16;
      }

      if (num_passes > 1)
      {
        // We use scratch again; we can divide it into multiple regions.
        // sigma holds all the significant samples, and it cannot
        // be modified after it is set. It will be used during the
        // Magnitude Refinement Pass.
        ui16* const sigma = scratch;

        ui32 mstr = (width + 3u) >> 2;   // divide by 4, since each
                                         // ui16 contains 4 columns
        mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8

        // We re-arrange quad significance, where each 4 consecutive
        // bits represent one quad, into column significance, where
        // each 4 consecutive bits represent one column of 4 rows.
        {
          ui32 y;

          const __m128i mask_3 = _mm_set1_epi32(0x30);
          const __m128i mask_C = _mm_set1_epi32(0xC0);
          const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
          for (y = 0; y < height; y += 4)
          {
            ui16* sp = scratch + (y >> 1) * sstr;
            ui16* dp = sigma + (y >> 2) * mstr;
            for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
            {
              __m128i s0, s1, u3, uC, t0, t1;

              s0 = _mm_loadu_si128((__m128i*)(sp));
              u3 = _mm_and_si128(s0, mask_3);
              u3 = _mm_srli_epi32(u3, 4);
              uC = _mm_and_si128(s0, mask_C);
              uC = _mm_srli_epi32(uC, 2);
              t0 = _mm_or_si128(u3, uC);

              s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
              u3 = _mm_and_si128(s1, mask_3);
              u3 = _mm_srli_epi32(u3, 2);
              uC = _mm_and_si128(s1, mask_C);
              t1 = _mm_or_si128(u3, uC);

              __m128i r = _mm_or_si128(t0, t1);
              r = _mm_shuffle_epi8(r, shuffle_mask);

              // _mm_storeu_si32 is not defined, so we use this workaround
              _mm_store_ss((float*)dp, _mm_castsi128_ps(r));
            }
            dp[0] = 0; // set an extra entry on the right to 0
          }
          {
            // reset one row after the codeblock
            ui16* dp = sigma + (y >> 2) * mstr;
            __m128i zero = _mm_setzero_si128();
            for (ui32 x = 0; x < width; x += 32, dp += 8)
              _mm_store_si128((__m128i*)dp, zero);
            dp[0] = 0; // set an extra entry on the right to 0
          }
        }
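
        // Illustrative note (not from the original source): within each
        // 32-bit lane above, the low ui16 is a quad's inf entry (rho in
        // bits 4-7) and the high ui16 is its u_q entry, so the 0x30/0xC0
        // masks pick the rho bits of one quad only. The two rho bits from
        // the upper quad row land in bits 0-1 and 4-5, those from the
        // lower row in bits 2-3 and 6-7, producing one byte per lane that
        // holds two 4-row columns; the shuffle then packs the 4 lanes'
        // bytes into 2 ui16 sigma entries (4 columns each).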
1717
1718 // We perform Significance Propagation Pass here
1719 {
1720 // This stores significance information of the previous
1721 // 4 rows. Significance information in this array includes
1722 // all significant samples in bitplane p - 1; that is,
1723 // significant samples for bitplane p (discovered during the
1724 // cleanup pass and stored in sigma) and samples that have recently
1725 // become significant (during the SPP) in bitplane p - 1.
1726 // We store enough for the widest row, containing 1024 columns,
1727 // which is equivalent to 256 of ui16, since each stores 4 columns.
1728 // We add an extra 8 entries, just in case we need more
1729 ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
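// Sizing check (illustrative): 1024 columns / 4 columns per ui16
// = 256 entries; with the 8 spare entries, (256 + 8) * 2 bytes
// = 528 bytes, matching the comment above.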
1730
1731 frwd_struct_avx2 sigprop;
1732 frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
1733
1734 for (ui32 y = 0; y < height; y += 4)
1735 {
1736 ui32 pattern = 0xFFFFu; // a pattern of needed samples
1737 if (height - y < 4) {
1738 pattern = 0x7777u;
1739 if (height - y < 3) {
1740 pattern = 0x3333u;
1741 if (height - y < 2)
1742 pattern = 0x1111u;
1743 }
1744 }
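// Example (illustrative): each hex digit of "pattern" is one column,
// and bit k of a digit is row k of the 4-row stripe. With
// height - y == 2, only rows 0 and 1 remain inside the codeblock,
// so every digit keeps bits 0-1 and pattern becomes 0x3333.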
1745
1746 // prev holds sign. info. for the previous quad, together
1747 // with the rows on top of it and below it.
1748 ui32 prev = 0;
1749 ui16 *prev_sig = prev_row_sig;
1750 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1751 ui32 *dpp = decoded_data + y * stride;
1752 for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
1753 {
1754 // only rows and columns inside the stripe are included
1755 si32 s = (si32)x + 4 - (si32)width;
1756 s = ojph_max(s, 0);
1757 pattern = pattern >> (s * 4);
1758
1759 // We first find locations that need to be tested (potential
1760 // SPP members); these locations will end up in mbr
1761 // In each iteration, we produce 16 bits because cwd can have
1762 // up to 16 bits of significance information, followed by the
1763 // corresponding 16 bits of sign information; therefore, it is
1764 // sufficient to fetch 32 bit data per loop.
1765
1766 // Although we are interested in 16 bits only, we load 32 bits.
1767 // For the 16 bits we are producing, we need the next 4 bits --
1768 // we need data for at least 5 columns out of 8.
1769 // Therefore loading 32 bits is easier than loading 16 bits
1770 // twice.
1771 ui32 ps = *(ui32*)prev_sig;
1772 ui32 ns = *(ui32*)(cur_sig + mstr);
1773 ui32 u = (ps & 0x88888888) >> 3; // the row on top
1774 if (!stripe_causal)
1775 u |= (ns & 0x11111111) << 3; // the row below
1776
1777 ui32 cs = *(ui32*)cur_sig;
1778 // vertical integration
1779 ui32 mbr = cs; // this sig. info.
1780 mbr |= (cs & 0x77777777) << 1; //above neighbors
1781 mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors
1782 mbr |= u;
1783 // horizontal integration
1784 ui32 t = mbr;
1785 mbr |= t << 4; // neighbors on the left
1786 mbr |= t >> 4; // neighbors on the right
1787 mbr |= prev >> 12; // significance of previous group
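// Worked example (illustrative): if only bit 5 of cs is set
// (column 1, row 1), the vertical ORs set bits 4-6 (rows 0-2 of
// column 1), and the 4-bit shifts copy those bits into columns 0
// and 2; mbr then covers the 3x3 neighborhood of the significant
// sample, plus whatever u and prev contribute.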
1788
1789 // remove outside samples, and already significant samples
1790 mbr &= pattern;
1791 mbr &= ~cs;
1792
1793 // find samples that become significant during the SPP
1794 ui32 new_sig = mbr;
1795 if (new_sig)
1796 {
1797 __m128i cwd_vec = frwd_fetch<0>(&sigprop);
1798 ui32 cwd = (ui32)_mm_extract_epi16(cwd_vec, 0);
1799
1800 ui32 cnt = 0;
1801 ui32 col_mask = 0xFu;
1802 ui32 inv_sig = ~cs & pattern;
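// An illustrative note on the constants below: 0x33, 0x76, 0xEC and
// 0xC8 (before shifting by i) mark, for a sample that just became
// significant at row 0, 1, 2 or 3 of the current column, the sample
// itself plus its not-yet-scanned neighbors -- the samples below it
// in the same column and the adjacent ones in the next column;
// ANDing with inv_sig keeps only positions not already significant.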
1803 for (int i = 0; i < 16; i += 4, col_mask <<= 4)
1804 {
1805 if ((col_mask & new_sig) == 0)
1806 continue;
1807
1808 //scan one column
1809 ui32 sample_mask = 0x1111u & col_mask;
1810 if (new_sig & sample_mask)
1811 {
1812 new_sig &= ~sample_mask;
1813 if (cwd & 1)
1814 {
1815 ui32 t = 0x33u << i;
1816 new_sig |= t & inv_sig;
1817 }
1818 cwd >>= 1; ++cnt;
1819 }
1820
1821 sample_mask <<= 1;
1822 if (new_sig & sample_mask)
1823 {
1824 new_sig &= ~sample_mask;
1825 if (cwd & 1)
1826 {
1827 ui32 t = 0x76u << i;
1828 new_sig |= t & inv_sig;
1829 }
1830 cwd >>= 1; ++cnt;
1831 }
1832
1833 sample_mask <<= 1;
1834 if (new_sig & sample_mask)
1835 {
1836 new_sig &= ~sample_mask;
1837 if (cwd & 1)
1838 {
1839 ui32 t = 0xECu << i;
1840 new_sig |= t & inv_sig;
1841 }
1842 cwd >>= 1; ++cnt;
1843 }
1844
1845 sample_mask <<= 1;
1846 if (new_sig & sample_mask)
1847 {
1848 new_sig &= ~sample_mask;
1849 if (cwd & 1)
1850 {
1851 ui32 t = 0xC8u << i;
1852 new_sig |= t & inv_sig;
1853 }
1854 cwd >>= 1; ++cnt;
1855 }
1856 }
1857
1858 if (new_sig)
1859 {
1860 cwd |= (ui32)_mm_extract_epi16(cwd_vec, 1) << (16 - cnt);
1861
1862 // Spread new_sig, such that each bit is in one byte with a
1863 // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1
1864 __m128i new_sig_vec = _mm_set1_epi16((si16)new_sig);
1865 new_sig_vec = _mm_shuffle_epi8(new_sig_vec,
1866 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1867 new_sig_vec = _mm_and_si128(new_sig_vec,
1868 _mm_set1_epi64x((si64)0x8040201008040201));
1869 new_sig_vec = _mm_cmpeq_epi8(new_sig_vec,
1870 _mm_set1_epi64x((si64)0x8040201008040201));
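// Scalar sketch of the spread above (illustrative only):
//   for (int b = 0; b < 16; ++b)
//     bytes[b] = ((new_sig >> b) & 1) ? 0xFF : 0x00;
// The shuffle replicates each half of new_sig across 8 bytes, the
// AND isolates a different bit in each byte, and the CMPEQ turns a
// set bit into 0xFF.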
1871
1872 // find cumulative sums
1873 // to find which bit in cwd we should extract
1874 __m128i inc_sum = new_sig_vec; // inclusive scan
1875 inc_sum = _mm_abs_epi8(inc_sum); // convert to 0 or 1
1876 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1877 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1878 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1879 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1880 cnt += (ui32)_mm_extract_epi16(inc_sum, 7) >> 8;
1881 // exclusive scan
1882 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
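// Worked example (illustrative): if the 0/1 bytes start as
// 1,0,1,1,..., the four shift-and-add steps leave the running
// totals 1,1,2,3,... in inc_sum (an inclusive scan over all 16
// bytes); shifting by one more byte turns it into the exclusive
// scan ex_sum, so byte b holds how many bits were consumed before
// position b.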
1883
1884 // Spread cwd, such that each bit is in one byte
1885 // with a value of 0 or 1.
1886 cwd_vec = _mm_set1_epi16((si16)cwd);
1887 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1888 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1889 cwd_vec = _mm_and_si128(cwd_vec,
1890 _mm_set1_epi64x((si64)0x8040201008040201));
1891 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1892 _mm_set1_epi64x((si64)0x8040201008040201));
1893 cwd_vec = _mm_abs_epi8(cwd_vec);
1894
1895 // Obtain the bit from cwd_vec corresponding to ex_sum;
1896 // basically, collect the needed bits from cwd_vec
1897 __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum);
1898
1899 // load data and set spp coefficients
1900 __m128i m =
1901 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
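// An illustrative reading of m (inferred from the bit layout
// above, where a sample sits at bit (4 * column + row)): it
// gathers bytes 0, 4, 8 and 12 -- row 0 of each of the four
// columns -- and the _mm_add_epi32(m, 1) at the end of the loop
// advances the gather to rows 1, 2 and 3.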
1902 __m128i val = _mm_set1_epi32(3 << (p - 2));
1903 ui32 *dp = dpp;
1904 for (int c = 0; c < 4; ++ c) {
1905 __m128i s0, s0_ns, s0_val;
1906 // load coefficients
1907 s0 = _mm_load_si128((__m128i*)dp);
1908
1909 // epi32 is -1 only for coefficients that
1910 // are changed during the SPP
1911 s0_ns = _mm_shuffle_epi8(new_sig_vec, m);
1912 s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF));
1913
1914 // obtain sign for coefficients in SPP
1915 s0_val = _mm_shuffle_epi8(v, m);
1916 s0_val = _mm_slli_epi32(s0_val, 31);
1917 s0_val = _mm_or_si128(s0_val, val);
1918 s0_val = _mm_and_si128(s0_val, s0_ns);
1919
1920 // update vector
1921 s0 = _mm_or_si128(s0, s0_val);
1922 // store coefficients
1923 _mm_store_si128((__m128i*)dp, s0);
1924 // prepare for next row
1925 dp += stride;
1926 m = _mm_add_epi32(m, _mm_set1_epi32(1));
1927 }
1928 }
1929 frwd_advance(&sigprop, cnt);
1930 }
1931
1932 new_sig |= cs;
1933 *prev_sig = (ui16)(new_sig);
1934
1935 // vertical integration for the new sig. info.
1936 t = new_sig;
1937 new_sig |= (t & 0x7777) << 1; //above neighbors
1938 new_sig |= (t & 0xEEEE) >> 1; //below neighbors
1939 // add sig. info. from the row on top and below
1940 prev = new_sig | u;
1941 // we need only the bits in 0xF000
1942 prev &= 0xF000;
1943 }
1944 }
1945 }
1946
1947 // We perform Magnitude Refinement Pass here
1948 if (num_passes > 2)
1949 {
1950 rev_struct magref;
1951 rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
1952
1953 for (ui32 y = 0; y < height; y += 4)
1954 {
1955 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1956 ui32 *dpp = decoded_data + y * stride;
1957 for (ui32 i = 0; i < width; i += 4, dpp += 4)
1958 {
1959 // Process one entry from the sigma array at a time.
1960 // Each nibble (4 bits) in the sigma array represents one column of 4 rows.
1961 ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
1962 ui16 sig = *cur_sig++; // 16 bits that will be processed now
1963 int total_bits = 0;
1964 if (sig) // if any of the 16 bits are set
1965 {
1966 // We work on 4 rows, with 4 samples each, since
1967 // data is 32 bit (4 bytes)
1968
1969 // spread the 16 bits in sig to bytes of 0 or 1 in sig_vec
1970 __m128i sig_vec = _mm_set1_epi16((si16)sig);
1971 sig_vec = _mm_shuffle_epi8(sig_vec,
1972 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1973 sig_vec = _mm_and_si128(sig_vec,
1974 _mm_set1_epi64x((si64)0x8040201008040201));
1975 sig_vec = _mm_cmpeq_epi8(sig_vec,
1976 _mm_set1_epi64x((si64)0x8040201008040201));
1977 sig_vec = _mm_abs_epi8(sig_vec);
1978
1979 // find cumulative sums
1980 // to find which bit in cwd we should extract
1981 __m128i inc_sum = sig_vec; // inclusive scan
1982 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1983 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1984 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1985 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1986 total_bits = _mm_extract_epi16(inc_sum, 7) >> 8;
1987 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1); // exclusive scan
1988
1989 // Spread the 16 bits in cwd to inverted 0 or 1 bytes in
1990 // cwd_vec. Then, convert these to a form suitable
1991 // for coefficient modifications; in particular, a value
1992 // of 0 is represented as binary 11, and a value of 1 is
1993 // represented as binary 01
1994 __m128i cwd_vec = _mm_set1_epi16((si16)cwd);
1995 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1996 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1997 cwd_vec = _mm_and_si128(cwd_vec,
1998 _mm_set1_epi64x((si64)0x8040201008040201));
1999 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
2000 _mm_set1_epi64x((si64)0x8040201008040201));
2001 cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1));
2002 cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec);
2003 cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1));
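// Byte-wise trace (illustrative): a set cwd bit gives 0xFF from the
// CMPEQ, then 0xFF + 1 = 0x00, doubled = 0x00, OR 1 = 0x01
// (binary 01); a clear bit gives 0x00 -> 0x01 -> 0x02 -> 0x03
// (binary 11), matching the mapping described above.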
2004
2005 // load data and insert the mrp bit
2006 __m128i m =
2007 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
2008 ui32 *dp = dpp;
2009 for (int c = 0; c < 4; ++c) {
2010 __m128i s0, s0_sig, s0_idx, s0_val;
2011 // load coefficients
2012 s0 = _mm_load_si128((__m128i*)dp);
2013 // find significant samples in this row
2014 s0_sig = _mm_shuffle_epi8(sig_vec, m);
2015 s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128());
2016 // get MRP bit index, and MRP pattern
2017 s0_idx = _mm_shuffle_epi8(ex_sum, m);
2018 s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx);
2019 // keep data from significant samples only
2020 s0_val = _mm_andnot_si128(s0_sig, s0_val);
2021 // move the MRP bits to their correct position, and apply them
2022 s0_val = _mm_slli_epi32(s0_val, (si32)p - 2);
2023 s0 = _mm_xor_si128(s0, s0_val);
2024 // store coefficients
2025 _mm_store_si128((__m128i*)dp, s0);
2026 // prepare for next row
2027 dp += stride;
2028 m = _mm_add_epi32(m, _mm_set1_epi32(1));
2029 }
2030 }
2031 // consume data according to the number of bits set
2032 rev_advance_mrp(&magref, (ui32)total_bits);
2033 }
2034 }
2035 }
2036 }
2037
2038 return true;
2039 }
2040 }
2041}
ui16 uvlc_tbl0[256+64]
uvlc_tbl0 contains decoding information for initial row of quads
ui16 uvlc_tbl1[256]
uvlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
static ui32 rev_fetch(rev_struct *vlcp)
Retrieves 32 bits from the head of a rev_struct structure.
static void rev_init_mrp(rev_struct *mrp, ui8 *data, int lcup, int len2)
Initializes rev_struct structure for the MRP segment, and reads a number of bytes such that the next 32 b...
static void mel_read(dec_mel_st *melp)
Reads and unstuffs the MEL bitstream.
static void frwd_advance(frwd_struct32 *msp, ui32 num_bits)
Consume num_bits bits from the bitstream of frwd_struct32.
static void rev_read_mrp(rev_struct *mrp)
Reads and unstuffs from rev_struct.
static ui32 rev_fetch_mrp(rev_struct *mrp)
Retrieves 32 bits from the head of a rev_struct structure.
static void frwd_read(frwd_struct32 *msp)
Reads and unstuffs 32 bits from a forward-growing bitstream.
static void rev_read(rev_struct *vlcp)
Reads and unstuffs data from a backward-growing segment.
static int mel_get_run(dec_mel_st *melp)
Retrieves one run from dec_mel_st; if there are no runs stored, the MEL segment is decoded.
static void rev_init(rev_struct *vlcp, ui8 *data, int lcup, int scup)
Initiates the rev_struct structure and reads a few bytes to move the read address to a multiple of 4.
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static __m256i decode_two_quad32_avx2(__m256i inf_u_q, __m256i U_q, frwd_struct_avx2 *magsgn, ui32 p, __m128i &vn)
decodes two consecutive quads (one octet), using 32 bit data
static ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
bool ojph_decode_codeblock_avx2(ui8 *coded_data, ui32 *decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal)
Decodes one codeblock, processing the cleanup, significance propagation, and magnitude refinement pa...
__m256i avx2_lzcnt_epi32(__m256i v)
static __m256i decode_four_quad16(const __m128i inf_u_q, __m128i U_q, frwd_struct_avx2 *magsgn, ui32 p, __m128i &vn)
decodes four consecutive quads (two octets), using 16 bit data
static ui32 frwd_fetch(frwd_struct32 *msp)
Fetches 32 bits from the frwd_struct32 bitstream.
static ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static void frwd_init(frwd_struct32 *msp, const ui8 *data, int size)
Initializes the frwd_struct32 struct and reads some bytes.
static void mel_decode(dec_mel_st *melp)
Decodes unstuffed MEL segment bits stored in tmp to runs.
int64_t si64
Definition: ojph_defs.h:57
uint64_t ui64
Definition: ojph_defs.h:56
uint16_t ui16
Definition: ojph_defs.h:52
static ui32 count_leading_zeros(ui32 val)
Definition: ojph_arch.h:199
int32_t si32
Definition: ojph_defs.h:55
int16_t si16
Definition: ojph_defs.h:53
uint32_t ui32
Definition: ojph_defs.h:54
uint8_t ui8
Definition: ojph_defs.h:50
#define ojph_max(a, b)
Definition: ojph_defs.h:73
#define ojph_unused(x)
Definition: ojph_defs.h:78
#define OJPH_WARN(t,...)
Definition: ojph_message.h:285
MEL state structure for reading and decoding the MEL bitstream.
bool unstuff
true if the next bit needs to be unstuffed
int num_runs
number of decoded runs left in runs (maximum 8)
int size
number of bytes in MEL code
ui8 * data
the address of data (or bitstream)
int k
state of MEL decoder
int bits
number of bits stored in tmp
ui64 tmp
temporary buffer for read data
ui64 runs
runs of decoded MEL codewords (7 bits/run)
State structure for reading and unstuffing of forward-growing bitstreams; these are: MagSgn and SPP b...
const ui8 * data
pointer to bitstream
ui8 tmp[48]
temporary buffer of read data + 16 extra
ui32 bits
number of bits stored in tmp
ui32 unstuff
1 if a bit needs to be unstuffed from next byte
A structure for reading and unstuffing a segment that grows backward, such as VLC and MRP.
ui32 bits
number of bits stored in tmp
int size
number of bytes left
ui8 * data
pointer to where to read data
ui64 tmp
temporary buffer of read data