99 ui32 val = 0xFFFFFFFF;
100 if (melp->size > 4) {
101 val = *(ui32*)melp->data;
105 else if (melp->size > 0)
108 while (melp->size > 1) {
109 ui32 v = *melp->data++;
110 ui32 m = ~(0xFFu << i);
111 val = (val & m) | (v << i);
116 ui32 v = *melp->data++;
118 ui32 m = ~(0xFFu << i);
119 val = (val & m) | (v << i);
124 int bits = 32 - melp->unstuff;
131 bool unstuff = ((val & 0xFF) == 0xFF);
133 t = t << (8 - unstuff);
136 t |= (val>>8) & 0xFF;
137 unstuff = (((val >> 8) & 0xFF) == 0xFF);
139 t = t << (8 - unstuff);
141 t |= (val>>16) & 0xFF;
142 unstuff = (((val >> 16) & 0xFF) == 0xFF);
144 t = t << (8 - unstuff);
146 t |= (val>>24) & 0xFF;
147 melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
151 melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
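// A minimal scalar sketch of the unstuffing rule applied above (illustrative,
// not the decoder's code path): a byte that follows a 0xFF byte carries a
// stuffed 0 in its most significant bit, so only 7 of its bits are kept.
#include <cstdint>
#include <cstddef>
static uint64_t unstuff_mel_bytes(const uint8_t *src, size_t len, int &bit_count)
{
  uint64_t acc = 0;
  bool unstuff = false;                    // was the previous byte 0xFF?
  bit_count = 0;
  for (size_t i = 0; i < len; ++i) {
    int d_bits = unstuff ? 7 : 8;          // drop the stuffed MSB if needed
    acc = (acc << d_bits) | (src[i] & ((1u << d_bits) - 1));
    bit_count += d_bits;
    unstuff = (src[i] == 0xFF);
  }
  return acc;
}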
173 static const int mel_exp[13] = {
174 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
183 while (melp->bits >= 6 && melp->num_runs < 8)
185 int eval = mel_exp[melp->k];
187 if (melp->tmp & (1ull<<63))
191 melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;
198 run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
199 melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0;
200 melp->tmp <<= eval + 1;
201 melp->bits -= eval + 1;
202 run = (run << 1) + 1;
204 eval = melp->num_runs * 7;
205 melp->runs &= ~((ui64)0x3F << eval);
206 melp->runs |= ((ui64)run) << eval;
224 melp->data = bbuf + lcup - scup;
227 melp->unstuff = false;
228 melp->size = scup - 1;
236 int num = 4 - (int)(intptr_t(melp->data) & 0x3);
237 for (int i = 0; i < num; ++i) {
238 assert(melp->unstuff == false || melp->data[0] <= 0x8F);
239 ui64 d = (melp->size > 0) ? *melp->data : 0xFF;
241 if (melp->size == 1) d |= 0xF;
243 melp->data += melp->size-- > 0;
244 int d_bits = 8 - melp->unstuff;
245 melp->tmp = (melp->tmp << d_bits) | d;
246 melp->bits += d_bits;
247 melp->unstuff = ((d & 0xFF) == 0xFF);
250 melp->tmp <<= (64 - melp->bits);
263 if (melp->num_runs == 0)
266 int t = melp->runs & 0x7F;
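// The runs FIFO holds up to 8 runs at 7 bits each; popping one presumably
// continues along these lines (a sketch, not the rest of the function):
//   melp->runs >>= 7;      // discard the run just read from the low bits
//   melp->num_runs--;      // one fewer stored run
//   return t;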
319 val = *(ui32*)(vlcp->data - 3);
323 else if (vlcp->size > 0)
326 while (vlcp->size > 0) {
327 ui32 v = *vlcp->data--;
334 __m128i tmp_vec = _mm_set1_epi32((int32_t)val);
335 tmp_vec = _mm_srlv_epi32(tmp_vec, _mm_setr_epi32(24, 16, 8, 0));
336 tmp_vec = _mm_and_si128(tmp_vec, _mm_set1_epi32(0xff));
338 __m128i unstuff_vec = _mm_cmpgt_epi32(tmp_vec, _mm_set1_epi32(0x8F));
339 bool unstuff_next = _mm_extract_epi32(unstuff_vec, 3);
340 unstuff_vec = _mm_slli_si128(unstuff_vec, 4);
341 unstuff_vec = _mm_insert_epi32(unstuff_vec, vlcp->unstuff * 0xffffffff, 0);
343 __m128i val_7f = _mm_set1_epi32(0x7F);
344 __m128i this_byte_7f = _mm_cmpeq_epi32(_mm_and_si128(tmp_vec, val_7f), val_7f);
345 unstuff_vec = _mm_and_si128(unstuff_vec, this_byte_7f);
346 unstuff_vec = _mm_srli_epi32(unstuff_vec, 31);
348 __m128i inc_sum = _mm_sub_epi32(_mm_set1_epi32(8), unstuff_vec);
349 inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 4));
350 inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 8));
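// note: the two shift-and-add steps above form an inclusive prefix sum of the
// per-byte bit counts across the four lanes, so lane 3 holds the total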
351 ui32 total_bits = (ui32)_mm_extract_epi32(inc_sum, 3);
353 __m128i final_shift = _mm_slli_si128(inc_sum, 4);
354 tmp_vec = _mm_sllv_epi32(tmp_vec, final_shift);
355 tmp_vec = _mm_or_si128(tmp_vec, _mm_bsrli_si128(tmp_vec, 8));
357 ui64 tmp = (ui32)_mm_cvtsi128_si32(tmp_vec) | (ui32)_mm_extract_epi32(tmp_vec, 1);
359 vlcp->unstuff = unstuff_next;
360 vlcp->tmp |= tmp << vlcp->bits;
361 vlcp->bits += total_bits;
382 vlcp->data = data + lcup - 2;
385 vlcp->size = scup - 2;
387 ui32 d = *vlcp->data--;
389 vlcp->bits = 4 - ((vlcp->tmp & 7) == 7);
390 vlcp->unstuff = (d | 0xF) > 0x8F;
397 int num = 1 + (int)(intptr_t(vlcp->data) & 0x3);
398 int tnum = num < vlcp->size ? num : vlcp->size;
399 for (
int i = 0; i < tnum; ++i) {
403 ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
404 vlcp->tmp |= d << vlcp->bits;
405 vlcp->bits += d_bits;
406 vlcp->unstuff = d > 0x8F;
428 return (ui32)vlcp->tmp;
440 assert(num_bits <= vlcp->bits);
441 vlcp->tmp >>= num_bits;
442 vlcp->bits -= num_bits;
443 return (ui32)vlcp->tmp;
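// For the backward-growing VLC and MRP segments, a bit is unstuffed only when
// the byte consumed just before (at the higher address) was greater than 0x8F
// and the current byte's low 7 bits are all ones. A scalar restatement of the
// test used in the fragments above (ui8 is the file's 8-bit unsigned typedef):
static inline int rev_bits_in_byte(bool prev_unstuff, ui8 d)
{
  return 8 - ((prev_unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
}
// the flag carried to the next (lower-address) byte is simply: d > 0x8F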
466 val = *(ui32*)(mrp->data - 3);
470 else if (mrp->size > 0)
473 while (mrp->size > 0) {
474 ui32 v = *mrp->data--;
482 ui32 bits, tmp = val >> 24;
485 bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
486 bool unstuff = (val >> 24) > 0x8F;
489 tmp |= ((val >> 16) & 0xFF) << bits;
490 bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
491 unstuff = ((val >> 16) & 0xFF) > 0x8F;
493 tmp |= ((val >> 8) & 0xFF) << bits;
494 bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
495 unstuff = ((val >> 8) & 0xFF) > 0x8F;
497 tmp |= (val & 0xFF) << bits;
498 bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
499 unstuff = (val & 0xFF) > 0x8F;
501 mrp->tmp |= (ui64)tmp << mrp->bits;
503 mrp->unstuff = unstuff;
524 mrp->data = data + lcup + len2 - 1;
534 int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
535 for (int i = 0; i < num; ++i) {
538 d = (mrp->size-- > 0) ? *mrp->data-- : 0;
540 ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
541 mrp->tmp |= d << mrp->bits;
543 mrp->unstuff = d > 0x8F;
564 return (ui32)mrp->tmp;
575 assert(num_bits <= mrp->bits);
576 mrp->tmp >>= num_bits;
577 mrp->bits -= num_bits;
578 return (ui32)mrp->tmp;
615 assert(msp->bits <= 128);
617 __m128i offset, val, validity, all_xff;
618 val = _mm_loadu_si128((__m128i*)msp->data);
619 int bytes = msp->size >= 16 ? 16 : msp->size;
620 validity = _mm_set1_epi8((char)bytes);
624 offset = _mm_set_epi64x(0x0F0E0D0C0B0A0908,0x0706050403020100);
625 validity = _mm_cmpgt_epi8(validity, offset);
626 all_xff = _mm_set1_epi8(-1);
629 __m128i t = _mm_xor_si128(validity, all_xff);
630 val = _mm_or_si128(t, val);
633 val = _mm_and_si128(validity, val);
638 ff_bytes = _mm_cmpeq_epi8(val, all_xff);
639 ff_bytes = _mm_and_si128(ff_bytes, validity);
640 ui32 flags = (ui32)_mm_movemask_epi8(ff_bytes);
642 ui32 next_unstuff = flags >> 16;
655 t = _mm_set1_epi8((char)loc);
656 m = _mm_cmpgt_epi8(offset, t);
658 t = _mm_and_si128(m, val);
659 c = _mm_srli_epi64(t, 1);
660 t = _mm_srli_si128(t, 8);
661 t = _mm_slli_epi64(t, 63);
662 t = _mm_or_si128(t, c);
664 val = _mm_or_si128(t, _mm_andnot_si128(m, val));
668 assert(msp->bits >= 0 && msp->bits <= 128);
669 int cur_bytes = msp->bits >> 3;
670 int cur_bits = msp->bits & 7;
672 b1 = _mm_sll_epi64(val, _mm_set1_epi64x(cur_bits));
673 b2 = _mm_slli_si128(val, 8);
674 b2 = _mm_srl_epi64(b2, _mm_set1_epi64x(64-cur_bits));
675 b1 = _mm_or_si128(b1, b2);
676 b2 = _mm_loadu_si128((__m128i*)(msp->tmp + cur_bytes));
677 b2 = _mm_or_si128(b1, b2);
678 _mm_storeu_si128((__m128i*)(msp->tmp + cur_bytes), b2);
680 int consumed_bits = bits < 128 - cur_bits ? bits : 128 - cur_bits;
681 cur_bytes = (msp->bits + (ui32)consumed_bits + 7) >> 3;
682 int upper = _mm_extract_epi16(val, 7);
683 upper >>= consumed_bits - 128 + 16;
684 msp->tmp[cur_bytes] = (ui8)upper;
705 _mm_storeu_si128((__m128i *)msp->tmp, _mm_setzero_si128());
706 _mm_storeu_si128((__m128i *)msp->tmp + 1, _mm_setzero_si128());
707 _mm_storeu_si128((__m128i *)msp->tmp + 2, _mm_setzero_si128());
725 assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
726 msp->bits -= num_bits;
728 __m128i *p = (__m128i*)(msp->tmp + ((num_bits >> 3) & 0x18));
731 __m128i v0, v1, c0, c1, t;
732 v0 = _mm_loadu_si128(p);
733 v1 = _mm_loadu_si128(p + 1);
736 c0 = _mm_srl_epi64(v0, _mm_set1_epi64x(num_bits));
737 t = _mm_srli_si128(v0, 8);
738 t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
739 c0 = _mm_or_si128(c0, t);
740 t = _mm_slli_si128(v1, 8);
741 t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
742 c0 = _mm_or_si128(c0, t);
744 _mm_storeu_si128((__m128i*)msp->tmp, c0);
746 c1 = _mm_srl_epi64(v1, _mm_set1_epi64x(num_bits));
747 t = _mm_srli_si128(v1, 8);
748 t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
749 c1 = _mm_or_si128(c1, t);
751 _mm_storeu_si128((__m128i*)msp->tmp + 1, c1);
765 if (msp->bits <= 128)
768 if (msp->bits <= 128)
771 __m128i t = _mm_loadu_si128((__m128i*)msp->tmp);
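// The template argument of frwd_init<0xFF> and frwd_init<0> below selects the
// byte value fed in once the segment is exhausted (0xFF for MagSgn, 0 for
// SigProp); the validity masking above implements exactly that. A scalar
// sketch of the convention, with a name chosen here for illustration:
template <int X>
static ui8 next_byte_or_fill(const ui8 *&p, int &size)
{
  return (size-- > 0) ? *p++ : (ui8)X;  // past the end, the stream reads as X
}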
786 __m256i row = _mm256_setzero_si256();
789 __m256i flags = _mm256_and_si256(inf_u_q, _mm256_set_epi32(0x8880, 0x4440, 0x2220, 0x1110, 0x8880, 0x4440, 0x2220, 0x1110));
790 __m256i insig = _mm256_cmpeq_epi32(flags, _mm256_setzero_si256());
792 if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF)
794 flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 1, 2, 2, 4, 4, 8, 8, 1, 1, 2, 2, 4, 4, 8, 8));
802 __m256i w0 = _mm256_srli_epi32(flags, 15);
803 m_n = _mm256_sub_epi32(U_q, w0);
804 m_n = _mm256_andnot_si256(insig, m_n);
808 __m256i inc_sum = m_n;
809 inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
810 inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
811 int total_mn1 = _mm256_extract_epi16(inc_sum, 6);
812 int total_mn2 = _mm256_extract_epi16(inc_sum, 14);
814 __m128i ms_vec0 = _mm_setzero_si128();
815 __m128i ms_vec1 = _mm_setzero_si128();
817 ms_vec0 = frwd_fetch<0xFF>(magsgn);
821 ms_vec1 = frwd_fetch<0xFF>(magsgn);
825 __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
827 __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 4);
830 __m256i byte_idx = _mm256_srli_epi32(ex_sum, 3);
831 __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi32(7));
832 byte_idx = _mm256_shuffle_epi8(byte_idx,
833 _mm256_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000, 0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000));
834 byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x03020100));
835 __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
836 byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x01010101));
837 __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
840 bit_idx = _mm256_or_si256(bit_idx, _mm256_slli_epi32(bit_idx, 16));
842 __m128i a = _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1);
843 __m256i aa = _mm256_inserti128_si256(_mm256_castsi128_si256(a), a, 0x1);
845 __m256i bit_shift = _mm256_shuffle_epi8(aa, bit_idx);
846 bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
847 d0 = _mm256_mullo_epi16(d0, bit_shift);
848 d0 = _mm256_srli_epi16(d0, 8);
849 d1 = _mm256_mullo_epi16(d1, bit_shift);
850 d1 = _mm256_and_si256(d1, _mm256_set1_epi32((si32)0xFF00FF00));
851 d0 = _mm256_or_si256(d0, d1);
855 __m256i ones = _mm256_set1_epi32(1);
856 __m256i twos = _mm256_set1_epi32(2);
857 __m256i U_q_m1 = _mm256_sub_epi32(U_q, ones);
858 U_q_m1 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F));
859 U_q_m1 = _mm256_shuffle_epi32(U_q_m1, 0);
860 w0 = _mm256_sub_epi32(twos, w0);
861 shift = _mm256_sllv_epi32(w0, U_q_m1);
862 ms_vec = _mm256_and_si256(d0, _mm256_sub_epi32(shift, ones));
865 w0 = _mm256_and_si256(flags, _mm256_set1_epi32(0x800));
866 w0 = _mm256_cmpeq_epi32(w0, _mm256_setzero_si256());
867 w0 = _mm256_andnot_si256(w0, shift);
868 ms_vec = _mm256_or_si256(ms_vec, w0);
869 w0 = _mm256_slli_epi32(ms_vec, 31);
870 ms_vec = _mm256_or_si256(ms_vec, ones);
871 __m256i tvn = ms_vec;
872 ms_vec = _mm256_add_epi32(ms_vec, twos);
873 ms_vec = _mm256_slli_epi32(ms_vec, (si32)p - 1);
874 ms_vec = _mm256_or_si256(ms_vec, w0);
875 row = _mm256_andnot_si256(insig, ms_vec);
877 ms_vec = _mm256_andnot_si256(insig, tvn);
879 tvn = _mm256_shuffle_epi8(ms_vec, _mm256_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1, -1, -1, 0x0F0E0D0C, 0x07060504));
881 vn = _mm_or_si128(vn, _mm256_castsi256_si128(tvn));
882 vn = _mm_or_si128(vn, _mm256_extracti128_si256(tvn, 0x1));
905 __m256i row = _mm256_setzero_si256();
906 __m128i ddd = _mm_shuffle_epi8(inf_u_q,
907 _mm_set_epi16(0x0d0c, 0x0d0c, 0x0908, 0x0908, 0x0504, 0x0504, 0x0100, 0x0100));
908 w0 = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
909 _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
911 flags = _mm256_and_si256(w0,
912 _mm256_set_epi16((si16)0x8880, 0x4440, 0x2220, 0x1110,
913 (si16)0x8880, 0x4440, 0x2220, 0x1110,
914 (si16)0x8880, 0x4440, 0x2220, 0x1110,
915 (si16)0x8880, 0x4440, 0x2220, 0x1110));
916 insig = _mm256_cmpeq_epi16(flags, _mm256_setzero_si256());
917 if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF)
919 ddd = _mm_or_si128(_mm_bslli_si128(U_q, 2), U_q);
920 __m256i U_q_avx = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
921 _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
922 flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8));
930 w0 = _mm256_srli_epi16(flags, 15);
931 m_n = _mm256_sub_epi16(U_q_avx, w0);
932 m_n = _mm256_andnot_si256(insig, m_n);
936 __m256i inc_sum = m_n;
937 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 2));
938 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
939 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
940 int total_mn1 = _mm256_extract_epi16(inc_sum, 7);
941 int total_mn2 = _mm256_extract_epi16(inc_sum, 15);
942 __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 2);
944 __m128i ms_vec0 = _mm_setzero_si128();
945 __m128i ms_vec1 = _mm_setzero_si128();
947 ms_vec0 = frwd_fetch<0xFF>(magsgn);
951 ms_vec1 = frwd_fetch<0xFF>(magsgn);
955 __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
958 __m256i byte_idx = _mm256_srli_epi16(ex_sum, 3);
959 __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi16(7));
960 byte_idx = _mm256_shuffle_epi8(byte_idx,
961 _mm256_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
962 0x0606, 0x0404, 0x0202, 0x0000, 0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
963 0x0606, 0x0404, 0x0202, 0x0000));
964 byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0100));
965 __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
966 byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0101));
967 __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
970 __m256i bit_shift = _mm256_shuffle_epi8(
971 _mm256_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
972 1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1,
973 1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
974 bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
975 d0 = _mm256_mullo_epi16(d0, bit_shift);
976 d0 = _mm256_srli_epi16(d0, 8);
977 d1 = _mm256_mullo_epi16(d1, bit_shift);
978 d1 = _mm256_and_si256(d1, _mm256_set1_epi16((si16)0xFF00));
979 d0 = _mm256_or_si256(d0, d1);
982 __m256i shift, t0, t1, Uq0, Uq1;
983 __m256i ones = _mm256_set1_epi16(1);
984 __m256i twos = _mm256_set1_epi16(2);
985 __m256i U_q_m1 = _mm256_sub_epi32(U_q_avx, ones);
986 Uq0 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F));
987 Uq1 = _mm256_bsrli_epi128(U_q_m1, 14);
988 w0 = _mm256_sub_epi16(twos, w0);
989 t0 = _mm256_and_si256(w0, _mm256_set_epi64x(0, -1, 0, -1));
990 t1 = _mm256_and_si256(w0, _mm256_set_epi64x(-1, 0, -1, 0));
992 __m128i t_0_sse = _mm256_castsi256_si128(t0);
993 t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq0));
994 __m128i t_1_sse = _mm256_extracti128_si256(t0 , 0x1);
995 t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq0, 0x1));
996 t0 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse), t_1_sse, 0x1);
998 t_0_sse = _mm256_castsi256_si128(t1);
999 t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq1));
1000 t_1_sse = _mm256_extracti128_si256(t1, 0x1);
1001 t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq1, 0x1));
1002 t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse), t_1_sse, 0x1);
1004 shift = _mm256_or_si256(t0, t1);
1005 ms_vec = _mm256_and_si256(d0, _mm256_sub_epi16(shift, ones));
1008 w0 = _mm256_and_si256(flags, _mm256_set1_epi16(0x800));
1009 w0 = _mm256_cmpeq_epi16(w0, _mm256_setzero_si256());
1010 w0 = _mm256_andnot_si256(w0, shift);
1011 ms_vec = _mm256_or_si256(ms_vec, w0);
1012 w0 = _mm256_slli_epi16(ms_vec, 15);
1013 ms_vec = _mm256_or_si256(ms_vec, ones);
1014 __m256i tvn = ms_vec;
1015 ms_vec = _mm256_add_epi16(ms_vec, twos);
1016 ms_vec = _mm256_slli_epi16(ms_vec, (si32)p - 1);
1017 ms_vec = _mm256_or_si256(ms_vec, w0);
1018 row = _mm256_andnot_si256(insig, ms_vec);
1020 ms_vec = _mm256_andnot_si256(insig, tvn);
1022 __m256i ms_vec_shuffle1 = _mm256_shuffle_epi8(ms_vec,
1023 _mm256_set_epi16(-1, -1, -1, -1, 0x0706, 0x0302, -1, -1,
1024 -1, -1, -1, -1, -1, -1, 0x0706, 0x0302));
1025 __m256i ms_vec_shuffle2 = _mm256_shuffle_epi8(ms_vec,
1026 _mm256_set_epi16(-1, -1, -1, 0x0F0E, 0x0B0A, -1, -1, -1,
1027 -1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1));
1028 ms_vec = _mm256_or_si256(ms_vec_shuffle1, ms_vec_shuffle2);
1030 vn = _mm_or_si128(vn, _mm256_castsi256_si128(ms_vec));
1031 vn = _mm_or_si128(vn, _mm256_extracti128_si256(ms_vec, 0x1));
1039 v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v);
1041 v = _mm256_castps_si256(_mm256_cvtepi32_ps(v));
1042 v = _mm256_srli_epi32(v, 23);
1043 v = _mm256_subs_epu16(_mm256_set1_epi32(158), v);
1044 v = _mm256_min_epi16(v, _mm256_set1_epi32(32));
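// avx2_lzcnt_epi32 above uses the float-exponent trick; a scalar equivalent
// for one lane (a sketch, assuming IEEE-754 single precision):
#include <cstdint>
#include <cstring>
static uint32_t lzcnt32_via_float(uint32_t x)
{
  if (x == 0) return 32;         // matches the subs_epu16/min_epi16 clamp above
  x &= ~(x >> 8);                // keep the float conversion from rounding up
  float f = (float)x;            // exponent field now holds floor(log2(x))+127
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return 158u - (bits >> 23);    // 158 - exponent = 31 - floor(log2(x))
}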
1067 ui32 missing_msbs, ui32 num_passes,
1072 static bool insufficient_precision = false;
1073 static bool modify_code = false;
1074 static bool truncate_spp_mrp = false;
1076 if (num_passes > 1 && lengths2 == 0)
1078 OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
1079 "one coding pass, but zero length for "
1080 "2nd and potential 3rd pass.");
1086 OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
1087 "This codeblock has %d passes.",
1092 if (missing_msbs > 30)
1094 if (insufficient_precision == false)
1096 insufficient_precision = true;
1097 OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
1098 "codeblock. This message will not be "
1099 "displayed again.");
1103 else if (missing_msbs == 30)
1105 if (modify_code == false) {
1107 OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
1108 "pass. The code can be modified to support "
1109 "this case. This message will not be "
1110 "displayed again.");
1114 else if (missing_msbs == 29)
1116 if (num_passes > 1) {
1118 if (truncate_spp_mrp == false) {
1119 truncate_spp_mrp = true;
1120 OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
1121 "nor MagRef passes; both will be skipped. "
1122 "This message will not be displayed "
1127 ui32 p = 30 - missing_msbs;
1133 OJPH_WARN(0x00010006, "Wrong codeblock length.");
1139 lcup = (int)lengths1;
1141 scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
1142 if (scup < 2 || scup > lcup || scup > 4079)
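// Scup occupies the last two bytes of the cleanup segment: the high 8 bits
// come from coded_data[lcup-1] and the low 4 bits from coded_data[lcup-2];
// values outside [2, 4079] indicate a malformed codeblock, hence the check.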
1160 ui16 scratch[8 * 513] = {0};
1168 ui32 sstr = ((width + 2u) + 7u) & ~7u;
1170 assert((stride & 0x3) == 0);
1172 ui32 mmsbp2 = missing_msbs + 2;
1184 mel_init(&mel, coded_data, lcup, scup);
1186 rev_init(&vlc, coded_data, lcup, scup);
1196 for (ui32 x = 0; x < width; sp += 4)
1215 t0 = (run == -1) ? t0 : 0;
1229 c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
1238 t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
1241 if (c_q == 0 && x < width)
1246 t1 = (run == -1) ? t1 : 0;
1251 t1 = x < width ? t1 : 0;
1260 c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
1268 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1269 if (uvlc_mode == 0xc0)
1273 uvlc_mode += (run == -1) ? 0x40 : 0;
1290 ui32 len = uvlc_entry & 0xF;
1291 ui32 tmp = vlc_val & ((1 << len) - 1);
1296 len = uvlc_entry & 0x7;
1298 ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len)));
1300 u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len));
1306 for (ui32 y = 2; y < height; y += 2)
1309 ui16 *sp = scratch + (y >> 1) * sstr;
1311 for (ui32 x = 0; x < width; sp += 4)
1317 c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
1318 c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
1334 t0 = (run == -1) ? t0 : 0;
1349 c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
1351 c_q |= sp[0 - (si32)sstr] & 0x80;
1353 c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
1354 c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
1363 t1 = vlc_tbl1[c_q + (vlc_val & 0x7F)];
1366 if (c_q == 0 && x < width)
1371 t1 = (run == -1) ? t1 : 0;
1376 t1 = x < width ? t1 : 0;
1386 c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
1388 c_q |= sp[2 - (si32)sstr] & 0x80;
1396 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1402 ui32 len = uvlc_entry & 0xF;
1403 ui32 tmp = vlc_val & ((1 << len) - 1);
1408 len = uvlc_entry & 0x7;
1410 ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
1412 u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len));
1435 const int v_n_size = 512 + 16;
1436 ui32 v_n_scratch[2 * v_n_size] = {0};
1439 frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1441 const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2);
1445 ui32 *vp = v_n_scratch;
1446 ui32 *dp = decoded_data;
1449 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1451 __m128i vn = _mm_set1_epi32(2);
1453 __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
1454 inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1456 __m256i U_q = _mm256_srli_epi32(inf_u_q, 16);
1457 __m256i w = _mm256_cmpgt_epi32(U_q, avx_mmsbp2);
1458 if (!_mm256_testz_si256(w, w)) {
1463 row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1464 _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
1465 _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
1467 __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp);
1468 w0 = _mm_or_si128(w0, vn);
1469 _mm_storeu_si128((__m128i*)vp, w0);
1473 for (ui32 y = 2; y < height; y += 2)
1477 ui32 *vp = v_n_scratch;
1478 ui16* sp = scratch + (y >> 1) * sstr;
1480 const __m256i avx_31 = _mm256_set1_epi32(31);
1481 const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
1482 const __m256i avx_1 = _mm256_set1_epi32(1);
1483 const __m256i avx_0 = _mm256_setzero_si256();
1485 for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16) {
1486 __m256i v = _mm256_loadu_si256((__m256i*)vp);
1487 __m256i v_p1 = _mm256_loadu_si256((__m256i*)(vp + 1));
1488 v = _mm256_or_si256(v, v_p1);
1490 v = _mm256_sub_epi32(avx_31, v);
1492 __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
1493 __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
1494 __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
1495 gamma = _mm256_and_si256(gamma, w0);
1496 gamma = _mm256_cmpeq_epi32(gamma, avx_0);
1498 v = _mm256_andnot_si256(gamma, v);
1499 v = _mm256_max_epi32(v, avx_1);
1501 inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
1502 v = _mm256_add_epi32(inf_u_q, v);
1504 w0 = _mm256_cmpgt_epi32(v, avx_mmsbp2);
1505 if (!_mm256_testz_si256(w0, w0)) {
1509 _mm256_storeu_si256((__m256i*)(vp + v_n_size), v);
1513 ui32 *vp = v_n_scratch;
1514 ui16 *sp = scratch + (y >> 1) * sstr;
1515 ui32 *dp = decoded_data + y * stride;
1518 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) {
1520 __m128i vn = _mm_set1_epi32(2);
1522 __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
1523 inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1525 __m256i U_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)(vp + v_n_size)));
1526 U_q = _mm256_permutevar8x32_epi32(U_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1529 row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1530 _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
1531 _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
1533 __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp);
1534 w0 = _mm_or_si128(w0, vn);
1535 _mm_storeu_si128((__m128i*)vp, w0);
1550 const int v_n_size = 512 + 16;
1551 ui16 v_n_scratch[v_n_size] = {0};
1552 ui32 v_n_scratch_32[v_n_size] = {0};
1555 frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1559 ui16 *vp = v_n_scratch;
1560 ui32 *dp = decoded_data;
1563 for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8) {
1565 __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
1566 __m128i U_q = _mm_srli_epi32(inf_u_q, 16);
1567 __m128i w = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1568 if (!_mm_testz_si128(w, w)) {
1572 __m128i vn = _mm_set1_epi16(2);
1575 w = _mm_cvtsi32_si128(*(unsigned short const*)(vp));
1576 _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
1578 __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
1579 __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
1581 _mm256_storeu_si256((__m256i*)dp, w0);
1582 _mm256_storeu_si256((__m256i*)(dp + stride), w1);
1586 for (ui32 y = 2; y < height; y += 2) {
1589 ui16 *vp = v_n_scratch;
1590 ui32 *vp_32 = v_n_scratch_32;
1592 ui16* sp = scratch + (y >> 1) * sstr;
1593 const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2);
1594 const __m256i avx_31 = _mm256_set1_epi32(31);
1595 const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
1596 const __m256i avx_1 = _mm256_set1_epi32(1);
1597 const __m256i avx_0 = _mm256_setzero_si256();
1599 for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16, vp_32 += 8) {
1600 __m128i v = _mm_loadu_si128((__m128i*)vp);
1601 __m128i v_p1 = _mm_loadu_si128((__m128i*)(vp + 1));
1602 v = _mm_or_si128(v, v_p1);
1604 __m256i v_avx = _mm256_cvtepu16_epi32(v);
1606 v_avx = _mm256_sub_epi32(avx_31, v_avx);
1608 __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
1609 __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
1610 __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
1611 gamma = _mm256_and_si256(gamma, w0);
1612 gamma = _mm256_cmpeq_epi32(gamma, avx_0);
1614 v_avx = _mm256_andnot_si256(gamma, v_avx);
1615 v_avx = _mm256_max_epi32(v_avx, avx_1);
1617 inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
1618 v_avx = _mm256_add_epi32(inf_u_q, v_avx);
1620 w0 = _mm256_cmpgt_epi32(v_avx, avx_mmsbp2);
1621 if (!_mm256_testz_si256(w0, w0)) {
1625 _mm256_storeu_si256((__m256i*)vp_32, v_avx);
1629 ui16 *vp = v_n_scratch;
1630 ui32* vp_32 = v_n_scratch_32;
1631 ui16 *sp = scratch + (y >> 1) * sstr;
1632 ui32 *dp = decoded_data + y * stride;
1635 for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8, vp_32 += 4) {
1637 __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
1638 __m128i U_q = _mm_loadu_si128((__m128i*)vp_32);
1640 __m128i vn = _mm_set1_epi16(2);
1643 __m128i w = _mm_cvtsi32_si128(*(unsigned short const*)(vp));
1644 _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
1646 __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
1647 __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
1649 _mm256_storeu_si256((__m256i*)dp, w0);
1650 _mm256_storeu_si256((__m256i*)(dp + stride), w1);
1664 ui16* const sigma = scratch;
1666 ui32 mstr = (width + 3u) >> 2;
1668 mstr = ((mstr + 2u) + 7u) & ~7u;
1676 const __m128i mask_3 = _mm_set1_epi32(0x30);
1677 const __m128i mask_C = _mm_set1_epi32(0xC0);
1678 const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
1679 for (y = 0; y < height; y += 4)
1681 ui16* sp = scratch + (y >> 1) * sstr;
1682 ui16* dp = sigma + (y >> 2) * mstr;
1683 for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
1685 __m128i s0, s1, u3, uC, t0, t1;
1687 s0 = _mm_loadu_si128((__m128i*)(sp));
1688 u3 = _mm_and_si128(s0, mask_3);
1689 u3 = _mm_srli_epi32(u3, 4);
1690 uC = _mm_and_si128(s0, mask_C);
1691 uC = _mm_srli_epi32(uC, 2);
1692 t0 = _mm_or_si128(u3, uC);
1694 s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
1695 u3 = _mm_and_si128(s1, mask_3);
1696 u3 = _mm_srli_epi32(u3, 2);
1697 uC = _mm_and_si128(s1, mask_C);
1698 t1 = _mm_or_si128(u3, uC);
1700 __m128i r = _mm_or_si128(t0, t1);
1701 r = _mm_shuffle_epi8(r, shuffle_mask);
1704 _mm_store_ss((float*)dp, _mm_castsi128_ps(r));
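// _mm_store_ss writes only the low 32 bits, i.e. the two ui16 sigma entries
// packed by the shuffle above, which is why dp advances by 2 per iteration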
1710 ui16* dp = sigma + (y >> 2) * mstr;
1711 __m128i zero = _mm_setzero_si128();
1712 for (ui32 x = 0; x < width; x += 32, dp += 8)
1713 _mm_store_si128((__m128i*)dp, zero);
1729 ui16 prev_row_sig[256 + 8] = {0};
1732 frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
1734 for (ui32 y = 0; y < height; y += 4)
1736 ui32 pattern = 0xFFFFu;
1737 if (height - y < 4) {
1739 if (height - y < 3) {
1749 ui16 *prev_sig = prev_row_sig;
1750 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1751 ui32 *dpp = decoded_data + y * stride;
1752 for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
1757 pattern = pattern >> (s * 4);
1772 ui32 ns = *(ui32*)(cur_sig + mstr);
1773 ui32 u = (ps & 0x88888888) >> 3;
1775 u |= (ns & 0x11111111) << 3;
1780 mbr |= (cs & 0x77777777) << 1;
1781 mbr |= (cs & 0xEEEEEEEE) >> 1;
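// each nibble of these 32-bit words covers one 4-row column of the stripe
// (one significance bit per row), so the masks gather the bottom row of the
// stripe above (ps), the top row of the stripe below (ns), and the vertical
// neighbors within the current stripe (cs)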
1797 __m128i cwd_vec = frwd_fetch<0>(&sigprop);
1798 ui32 cwd = (ui32)_mm_extract_epi16(cwd_vec, 0);
1801 ui32 col_mask = 0xFu;
1802 ui32 inv_sig = ~cs & pattern;
1803 for (int i = 0; i < 16; i += 4, col_mask <<= 4)
1805 if ((col_mask & new_sig) == 0)
1809 ui32 sample_mask = 0x1111u & col_mask;
1810 if (new_sig & sample_mask)
1812 new_sig &= ~sample_mask;
1815 ui32 t = 0x33u << i;
1816 new_sig |= t & inv_sig;
1822 if (new_sig & sample_mask)
1824 new_sig &= ~sample_mask;
1827 ui32 t = 0x76u << i;
1828 new_sig |= t & inv_sig;
1834 if (new_sig & sample_mask)
1836 new_sig &= ~sample_mask;
1839 ui32 t = 0xECu << i;
1840 new_sig |= t & inv_sig;
1846 if (new_sig & sample_mask)
1848 new_sig &= ~sample_mask;
1851 ui32 t = 0xC8u << i;
1852 new_sig |= t & inv_sig;
1860 cwd |= (ui32)_mm_extract_epi16(cwd_vec, 1) << (16 - cnt);
1864 __m128i new_sig_vec = _mm_set1_epi16((si16)new_sig);
1865 new_sig_vec = _mm_shuffle_epi8(new_sig_vec,
1866 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1867 new_sig_vec = _mm_and_si128(new_sig_vec,
1868 _mm_set1_epi64x((si64)0x8040201008040201));
1869 new_sig_vec = _mm_cmpeq_epi8(new_sig_vec,
1870 _mm_set1_epi64x((si64)0x8040201008040201));
1874 __m128i inc_sum = new_sig_vec;
1875 inc_sum = _mm_abs_epi8(inc_sum);
1876 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1877 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1878 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1879 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1880 cnt += (ui32)_mm_extract_epi16(inc_sum, 7) >> 8;
1882 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
1886 cwd_vec = _mm_set1_epi16((si16)cwd);
1887 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1888 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1889 cwd_vec = _mm_and_si128(cwd_vec,
1890 _mm_set1_epi64x((si64)0x8040201008040201));
1891 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1892 _mm_set1_epi64x((si64)0x8040201008040201));
1893 cwd_vec = _mm_abs_epi8(cwd_vec);
1897 __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum);
1901 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
1902 __m128i val = _mm_set1_epi32(3 << (p - 2));
1904 for (int c = 0; c < 4; ++c) {
1905 __m128i s0, s0_ns, s0_val;
1907 s0 = _mm_load_si128((__m128i*)dp);
1911 s0_ns = _mm_shuffle_epi8(new_sig_vec, m);
1912 s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF));
1915 s0_val = _mm_shuffle_epi8(v, m);
1916 s0_val = _mm_slli_epi32(s0_val, 31);
1917 s0_val = _mm_or_si128(s0_val, val);
1918 s0_val = _mm_and_si128(s0_val, s0_ns);
1921 s0 = _mm_or_si128(s0, s0_val);
1923 _mm_store_si128((__m128i*)dp, s0);
1926 m = _mm_add_epi32(m, _mm_set1_epi32(1));
1933 *prev_sig = (ui16)(new_sig);
1937 new_sig |= (t & 0x7777) << 1;
1938 new_sig |= (t & 0xEEEE) >> 1;
1951 rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
1953 for (ui32 y = 0; y < height; y += 4)
1955 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1956 ui32 *dpp = decoded_data + y * stride;
1957 for (ui32 i = 0; i < width; i += 4, dpp += 4)
1962 ui16 sig = *cur_sig++;
1970 __m128i sig_vec = _mm_set1_epi16((si16)sig);
1971 sig_vec = _mm_shuffle_epi8(sig_vec,
1972 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1973 sig_vec = _mm_and_si128(sig_vec,
1974 _mm_set1_epi64x((si64)0x8040201008040201));
1975 sig_vec = _mm_cmpeq_epi8(sig_vec,
1976 _mm_set1_epi64x((si64)0x8040201008040201));
1977 sig_vec = _mm_abs_epi8(sig_vec);
1981 __m128i inc_sum = sig_vec;
1982 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1983 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1984 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1985 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1986 total_bits = _mm_extract_epi16(inc_sum, 7) >> 8;
1987 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
1994 __m128i cwd_vec = _mm_set1_epi16((si16)cwd);
1995 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1996 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1997 cwd_vec = _mm_and_si128(cwd_vec,
1998 _mm_set1_epi64x((si64)0x8040201008040201));
1999 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
2000 _mm_set1_epi64x((si64)0x8040201008040201));
2001 cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1));
2002 cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec);
2003 cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1));
2007 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
2009 for (int c = 0; c < 4; ++c) {
2010 __m128i s0, s0_sig, s0_idx, s0_val;
2012 s0 = _mm_load_si128((__m128i*)dp);
2014 s0_sig = _mm_shuffle_epi8(sig_vec, m);
2015 s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128());
2017 s0_idx = _mm_shuffle_epi8(ex_sum, m);
2018 s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx);
2020 s0_val = _mm_andnot_si128(s0_sig, s0_val);
2022 s0_val = _mm_slli_epi32(s0_val, (si32)p - 2);
2023 s0 = _mm_xor_si128(s0, s0_val);
2025 _mm_store_si128((__m128i*)dp, s0);
2028 m = _mm_add_epi32(m, _mm_set1_epi32(1));
ui16 uvlc_tbl0[256+64]
uvlc_tbl0 contains decoding information for the initial row of quads
ui16 uvlc_tbl1[256]
uvlc_tbl1 contains decoding information for non-initial rows of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for the initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial rows of quads
static ui32 rev_fetch(rev_struct *vlcp)
Retrieves 32 bits from the head of a rev_struct structure.
static void rev_init_mrp(rev_struct *mrp, ui8 *data, int lcup, int len2)
Initializes the rev_struct structure for the MRP segment, and reads a number of bytes such that the next 32 b...
static void mel_read(dec_mel_st *melp)
Reads and unstuffs the MEL bitstream.
static void frwd_advance(frwd_struct32 *msp, ui32 num_bits)
Consumes num_bits bits from the bitstream of frwd_struct32.
static void rev_read_mrp(rev_struct *mrp)
Reads and unstuffs from rev_struct.
static ui32 rev_fetch_mrp(rev_struct *mrp)
Retrieves 32 bits from the head of a rev_struct structure.
static void frwd_read(frwd_struct32 *msp)
Reads and unstuffs 32 bits from a forward-growing bitstream.
static void rev_read(rev_struct *vlcp)
Reads and unstuffs data from a backward-growing segment.
static int mel_get_run(dec_mel_st *melp)
Retrieves one run from dec_mel_st; if no runs are stored, the MEL segment is decoded.
static void rev_init(rev_struct *vlcp, ui8 *data, int lcup, int scup)
Initializes the rev_struct structure and reads a few bytes to move the read address to a multiple of 4.
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initializes a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static __m256i decode_two_quad32_avx2(__m256i inf_u_q, __m256i U_q, frwd_struct_avx2 *magsgn, ui32 p, __m128i &vn)
Decodes two consecutive quads (one octet), using 32-bit data.
static ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
bool ojph_decode_codeblock_avx2(ui8 *coded_data, ui32 *decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal)
Decodes one codeblock, processing the cleanup, significance propagation, and magnitude refinement pa...
__m256i avx2_lzcnt_epi32(__m256i v)
static __m256i decode_four_quad16(const __m128i inf_u_q, __m128i U_q, frwd_struct_avx2 *magsgn, ui32 p, __m128i &vn)
Decodes four consecutive quads (two octets), using 16-bit data.
static ui32 frwd_fetch(frwd_struct32 *msp)
Fetches 32 bits from the frwd_struct32 bitstream.
static ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static void frwd_init(frwd_struct32 *msp, const ui8 *data, int size)
Initializes the frwd_struct32 struct and reads some bytes.
static void mel_decode(dec_mel_st *melp)
Decodes unstuffed MEL segment bits stored in tmp to runs.
static ui32 count_leading_zeros(ui32 val)
MEL state structure for reading and decoding the MEL bitstream.
bool unstuff
true if the next bit needs to be unstuffed
int num_runs
number of decoded runs left in runs (maximum 8)
int size
number of bytes in MEL code
ui8 * data
the address of data (or bitstream)
int k
state of MEL decoder
int bits
number of bits stored in tmp
ui64 tmp
temporary buffer for read data
ui64 runs
runs of decoded MEL codewords (7 bits/run)
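Collecting the members listed above, the MEL state presumably has the following shape (field order here is illustrative; the names, types, and meanings are those given in the list):

struct dec_mel_st {
  ui8 *data;      // the address of data (or bitstream)
  ui64 tmp;       // temporary buffer for read data
  int  bits;      // number of bits stored in tmp
  int  size;      // number of bytes in MEL code
  bool unstuff;   // true if the next bit needs to be unstuffed
  int  k;         // state of MEL decoder
  int  num_runs;  // number of decoded runs left in runs (maximum 8)
  ui64 runs;      // runs of decoded MEL codewords (7 bits/run)
};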
State structure for reading and unstuffing of forward-growing bitstreams; these are: MagSgn and SPP b...
const ui8 * data
pointer to bitstream
ui8 tmp[48]
temporary buffer of read data + 16 extra
ui32 bits
number of bits stored in tmp
ui32 unstuff
1 if a bit needs to be unstuffed from next byte
A structure for reading and unstuffing a segment that grows backward, such as VLC and MRP.
ui32 bits
number of bits stored in tmp
int size
number of bytes left
ui8 * data
pointer to where to read data
ui64 tmp
temporary buffer of read data