OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_block_encoder_avx512.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8// Copyright (c) 2023, Intel Corporation
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32//***************************************************************************/
33// This file is part of the OpenJPH software implementation.
34// File: ojph_block_encoder_avx512.cpp
35//***************************************************************************/
36
37#include <cassert>
38#include <cstring>
39#include <cstdint>
40#include <climits>
41#include <immintrin.h>
42
43#include "ojph_mem.h"
44#include "ojph_arch.h"
45#include "ojph_block_encoder.h"
46#include "ojph_message.h"
47
48#ifdef OJPH_COMPILER_MSVC
49 #define likely(x) (x)
50 #define unlikely(x) (x)
51#else
52 #define likely(x) __builtin_expect((x), 1)
53 #define unlikely(x) __builtin_expect((x), 0)
54#endif
55
56namespace ojph {
57 namespace local {
58
60 // tables
62
63 // VLC encoding tables; both are filled by vlc_init_tables()
64 // index is (c_q << 8) + (rho << 4) + eps
65 // data is (cwd << 8) + (cwd_len << 4) + e_k
66 // table 0 is for the initial line of quads
67 static ui32 vlc_tbl0[2048];
68 static ui32 vlc_tbl1[2048];
69
70 // UVLC encoding tables; filled by uvlc_init_tables().
// Each array holds one entry per value 0..32: the prefix codeword, the
// prefix length in bits, the suffix codeword and the suffix length in bits.
71 static ui32 ulvc_cwd_pre[33];
72 static int ulvc_cwd_pre_len[33];
73 static ui32 ulvc_cwd_suf[33];
74 static int ulvc_cwd_suf_len[33];
75
77 static bool vlc_init_tables()
78 {
79 struct vlc_src_table { int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
80 vlc_src_table tbl0[] = {
81 #include "table0.h"
82 };
83 size_t tbl0_size = sizeof(tbl0) / sizeof(vlc_src_table);
84
85 si32 pattern_popcnt[16];
86 for (ui32 i = 0; i < 16; ++i)
87 pattern_popcnt[i] = (si32)population_count(i);
88
89 vlc_src_table* src_tbl = tbl0;
90 ui32 *tgt_tbl = vlc_tbl0;
91 size_t tbl_size = tbl0_size;
92 for (int i = 0; i < 2048; ++i)
93 {
94 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
95 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
96 tgt_tbl[i] = 0;
97 else
98 {
99 vlc_src_table *best_entry = NULL;
100 if (emb) // u_off = 1
101 {
102 int best_e_k = -1;
103 for (size_t j = 0; j < tbl_size; ++j)
104 {
105 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
106 if (src_tbl[j].u_off == 1)
107 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
108 {
109 //now we need to find the smallest cwd with the highest
110 // number of bits set in e_k
111 int ones_count = pattern_popcnt[src_tbl[j].e_k];
112 if (ones_count >= best_e_k)
113 {
114 best_entry = src_tbl + j;
115 best_e_k = ones_count;
116 }
117 }
118 }
119 }
120 else // u_off = 0
121 {
122 for (size_t j = 0; j < tbl_size; ++j)
123 {
124 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
125 if (src_tbl[j].u_off == 0)
126 {
127 best_entry = src_tbl + j;
128 break;
129 }
130 }
131 }
132 assert(best_entry);
133 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
134 + best_entry->e_k);
135 }
136 }
137
138 vlc_src_table tbl1[] = {
139 #include "table1.h"
140 };
141 size_t tbl1_size = sizeof(tbl1) / sizeof(vlc_src_table);
142
143 src_tbl = tbl1;
144 tgt_tbl = vlc_tbl1;
145 tbl_size = tbl1_size;
146 for (int i = 0; i < 2048; ++i)
147 {
148 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
149 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
150 tgt_tbl[i] = 0;
151 else
152 {
153 vlc_src_table *best_entry = NULL;
154 if (emb) // u_off = 1
155 {
156 int best_e_k = -1;
157 for (size_t j = 0; j < tbl_size; ++j)
158 {
159 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
160 if (src_tbl[j].u_off == 1)
161 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
162 {
163 //now we need to find the smallest cwd with the highest
164 // number of bits set in e_k
165 int ones_count = pattern_popcnt[src_tbl[j].e_k];
166 if (ones_count >= best_e_k)
167 {
168 best_entry = src_tbl + j;
169 best_e_k = ones_count;
170 }
171 }
172 }
173 }
174 else // u_off = 0
175 {
176 for (size_t j = 0; j < tbl_size; ++j)
177 {
178 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
179 if (src_tbl[j].u_off == 0)
180 {
181 best_entry = src_tbl + j;
182 break;
183 }
184 }
185 }
186 assert(best_entry);
187 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
188 + best_entry->e_k);
189 }
190 }
191
192
193 return true;
194 }
195
197 static bool uvlc_init_tables()
198 {
199 //code goes from 0 to 31, extension and 32 are not supported here
200 ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2;
201 ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4;
202 ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1;
203 ulvc_cwd_pre_len[2] = 2;
204 ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3;
205 ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0;
206 ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1;
207 ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0;
208 ulvc_cwd_suf_len[2] = 0;
209 ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1;
210 for (int i = 5; i < 33; ++i)
211 {
212 ulvc_cwd_pre[i] = 0;
213 ulvc_cwd_pre_len[i] = 3;
214 ulvc_cwd_suf[i] = (ui32)(i-5);
215 ulvc_cwd_suf_len[i] = 5;
216 }
217 return true;
218 }
219
221 static bool tables_initialized = false;
222
225 if (!tables_initialized) {
226 memset(vlc_tbl0, 0, 2048 * sizeof(ui32));
227 memset(vlc_tbl1, 0, 2048 * sizeof(ui32));
230 }
231 return tables_initialized;
232 }
233
235 //
237 struct mel_struct {
238 //storage
239 ui8* buf; //pointer to data buffer
240 ui32 pos; //position of next writing within buf
241 ui32 buf_size; //size of buffer, which we must not exceed
242
243 // all these can be replaced by bytes
244 int remaining_bits; //number of empty bits in tmp
245 int tmp; //temporary storage of coded bits
246 int run; //number of 0 run
247 int k; //state
248 int threshold; //threshold where one bit must be coded
249 };
250
252 static inline void
253 mel_init(mel_struct* melp, ui32 buffer_size, ui8* data)
254 {
255 melp->buf = data;
256 melp->pos = 0;
257 melp->buf_size = buffer_size;
258 melp->remaining_bits = 8;
259 melp->tmp = 0;
260 melp->run = 0;
261 melp->k = 0;
262 melp->threshold = 1; // this is 1 << mel_exp[melp->k];
263 }
264
266 static inline void
268 {
269 melp->tmp = (melp->tmp << 1) + v;
270 melp->remaining_bits--;
271 if (melp->remaining_bits == 0) {
272 melp->buf[melp->pos++] = (ui8)melp->tmp;
273 melp->remaining_bits = (melp->tmp == 0xFF ? 7 : 8);
274 melp->tmp = 0;
275 }
276 }
277
279 static inline void
280 mel_encode(mel_struct* melp, bool bit)
281 {
282 //MEL exponent
283 static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5};
284
285 if (bit == false) {
286 ++melp->run;
287 if (melp->run >= melp->threshold) {
288 mel_emit_bit(melp, 1);
289 melp->run = 0;
290 melp->k = ojph_min(12, melp->k + 1);
291 melp->threshold = 1 << mel_exp[melp->k];
292 }
293 } else {
294 mel_emit_bit(melp, 0);
295 int t = mel_exp[melp->k];
296 while (t > 0) {
297 mel_emit_bit(melp, (melp->run >> --t) & 1);
298 }
299 melp->run = 0;
300 melp->k = ojph_max(0, melp->k - 1);
301 melp->threshold = 1 << mel_exp[melp->k];
302 }
303 }
304
306 //
309 //storage
310 ui8* buf; //pointer to data buffer
311 ui32 pos; //position of next writing within buf
312 ui32 buf_size; //size of buffer, which we must not exceed
313
314 int used_bits; //number of occupied bits in tmp
315 ui64 tmp; //temporary storage of coded bits
316 bool last_greater_than_8F; //true if last byte us greater than 0x8F
317 };
318
320 static inline void
321 vlc_init(vlc_struct_avx512* vlcp, ui32 buffer_size, ui8* data)
322 {
323 vlcp->buf = data + buffer_size - 1; //points to last byte
324 vlcp->pos = 1; //locations will be all -pos
325 vlcp->buf_size = buffer_size;
326
327 vlcp->buf[0] = 0xFF;
328 vlcp->used_bits = 4;
329 vlcp->tmp = 0xF;
330 vlcp->last_greater_than_8F = true;
331 }
332
334 static inline void
335 vlc_encode(vlc_struct_avx512* vlcp, ui32 cwd, int cwd_len)
336 {
337 vlcp->tmp |= (ui64)cwd << vlcp->used_bits;
338 vlcp->used_bits += cwd_len;
339
340 while (vlcp->used_bits >= 8) {
341 ui8 tmp;
342
343 if (unlikely(vlcp->last_greater_than_8F)) {
344 tmp = vlcp->tmp & 0x7F;
345
346 if (likely(tmp != 0x7F)) {
347 tmp = vlcp->tmp & 0xFF;
348 *(vlcp->buf - vlcp->pos) = tmp;
349 vlcp->last_greater_than_8F = tmp > 0x8F;
350 vlcp->tmp >>= 8;
351 vlcp->used_bits -= 8;
352 } else {
353 *(vlcp->buf - vlcp->pos) = tmp;
354 vlcp->last_greater_than_8F = false;
355 vlcp->tmp >>= 7;
356 vlcp->used_bits -= 7;
357 }
358
359 } else {
360 tmp = vlcp->tmp & 0xFF;
361 *(vlcp->buf - vlcp->pos) = tmp;
362 vlcp->last_greater_than_8F = tmp > 0x8F;
363 vlcp->tmp >>= 8;
364 vlcp->used_bits -= 8;
365 }
366
367 vlcp->pos++;
368 }
369 }
370
372 //
374 static inline void
376 {
377 if (melp->run > 0)
378 mel_emit_bit(melp, 1);
379
380 if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) {
381 *(vlcp->buf - vlcp->pos) = 0x7f;
382 vlcp->pos++;
383 vlcp->tmp >>= 7;
384 vlcp->used_bits -= 7;
385 }
386
387 melp->tmp = melp->tmp << melp->remaining_bits;
388 int mel_mask = (0xFF << melp->remaining_bits) & 0xFF;
389 int vlc_mask = 0xFF >> (8 - vlcp->used_bits);
390 if ((mel_mask | vlc_mask) == 0)
391 return; //last mel byte cannot be 0xFF, since then
392 //melp->remaining_bits would be < 8
393 if (melp->pos >= melp->buf_size)
394 OJPH_ERROR(0x00020003, "mel encoder's buffer is full");
395 ui8 vlcp_tmp = (ui8)vlcp->tmp;
396 int fuse = melp->tmp | vlcp_tmp;
397 if ( ( ((fuse ^ melp->tmp) & mel_mask)
398 | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0
399 && (fuse != 0xFF) && vlcp->pos > 1)
400 {
401 melp->buf[melp->pos++] = (ui8)fuse;
402 }
403 else
404 {
405 if (vlcp->pos >= vlcp->buf_size)
406 OJPH_ERROR(0x00020004, "vlc encoder's buffer is full");
407 melp->buf[melp->pos++] = (ui8)melp->tmp; //melp->tmp cannot be 0xFF
408 *(vlcp->buf - vlcp->pos) = (ui8)vlcp_tmp;
409 vlcp->pos++;
410 }
411 }
412
414//
416 struct ms_struct {
417 //storage
418 ui8* buf; //pointer to data buffer
419 ui32 pos; //position of next writing within buf
420 ui32 buf_size; //size of buffer, which we must not exceed
421
422 int max_bits; //maximum number of bits that can be store in tmp
423 int used_bits; //number of occupied bits in tmp
424 ui32 tmp; //temporary storage of coded bits
425 };
426
428 static inline void
429 ms_init(ms_struct* msp, ui32 buffer_size, ui8* data)
430 {
431 msp->buf = data;
432 msp->pos = 0;
433 msp->buf_size = buffer_size;
434 msp->max_bits = 8;
435 msp->used_bits = 0;
436 msp->tmp = 0;
437 }
438
440 static inline void
441 ms_encode(ms_struct* msp, ui64 cwd, int cwd_len)
442 {
443 while (cwd_len > 0)
444 {
445 if (msp->pos >= msp->buf_size)
446 OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full");
447 int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len);
448 msp->tmp |= ((ui32)(cwd & ((1U << t) - 1))) << msp->used_bits;
449 msp->used_bits += t;
450 cwd >>= t;
451 cwd_len -= t;
452 if (msp->used_bits >= msp->max_bits)
453 {
454 msp->buf[msp->pos++] = (ui8)msp->tmp;
455 msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8;
456 msp->tmp = 0;
457 msp->used_bits = 0;
458 }
459 }
460 }
461
463 static inline void
465 {
466 if (msp->used_bits)
467 {
468 int t = msp->max_bits - msp->used_bits; //unused bits
469 msp->tmp |= (0xFF & ((1U << t) - 1)) << msp->used_bits;
470 msp->used_bits += t;
471 if (msp->tmp != 0xFF)
472 {
473 if (msp->pos >= msp->buf_size)
474 OJPH_ERROR(0x00020006, "magnitude sign encoder's buffer is full");
475 msp->buf[msp->pos++] = (ui8)msp->tmp;
476 }
477 }
478 else if (msp->max_bits == 7)
479 msp->pos--;
480 }
481
482#define ZERO _mm512_setzero_epi32()
483#define ONE _mm512_set1_epi32(1)
484
#if 0
// Debug helper (compiled out): prints the sixteen 32-bit lanes of `val` in
// hexadecimal, preceded by `msg`.  Needs <cstdio> when enabled.
static void print_epi32(const char *msg, __m512i &val)
{
  uint32_t A[16] = {0};

  // unaligned store: A is not guaranteed to be 64-byte aligned
  _mm512_storeu_si512(A, val);

  printf("%s: ", msg);
  for (int i = 0; i < 16; ++i) {
    printf("%X ", A[i]);
  }
  printf("\n");
}
#endif
499
500static void proc_pixel(__m512i *src_vec, ui32 p,
501 __m512i *eq_vec, __m512i *s_vec,
502 __m512i &rho_vec, __m512i &e_qmax_vec)
503{
504 __m512i val_vec[4];
505 __m512i _eq_vec[4];
506 __m512i _s_vec[4];
507 __m512i _rho_vec[4];
508 ui16 val_mask[4];
509
510 for (ui32 i = 0; i < 4; ++i) {
511 /* val = t + t; //multiply by 2 and get rid of sign */
512 val_vec[i] = _mm512_add_epi32(src_vec[i], src_vec[i]);
513
514 /* val >>= p; // 2 \mu_p + x */
515 val_vec[i] = _mm512_srli_epi32(val_vec[i], p);
516
517 /* val &= ~1u; // 2 \mu_p */
518 val_vec[i] = _mm512_and_epi32(val_vec[i], _mm512_set1_epi32((int)~1u));
519
520 /* if (val) { */
521 val_mask[i] = _mm512_cmpneq_epi32_mask(val_vec[i], ZERO);
522
523 /* rho[i] = 1 << i;
524 * rho is processed below.
525 */
526
527 /* e_q[i] = 32 - (int)count_leading_ZEROs(--val); //2\mu_p - 1 */
528 val_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i], val_vec[i], ONE);
529 _eq_vec[i] = _mm512_mask_lzcnt_epi32(ZERO, val_mask[i], val_vec[i]);
530 _eq_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i],
531 _mm512_set1_epi32(32), _eq_vec[i]);
532
533 /* e_qmax[i] = ojph_max(e_qmax[i], e_q[j]);
534 * e_qmax is processed below
535 */
536
537 /* s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */
538 val_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i], val_vec[i], ONE);
539 _s_vec[i] = _mm512_mask_srli_epi32(ZERO, val_mask[i], src_vec[i], 31);
540 _s_vec[i] =
541 _mm512_mask_add_epi32(ZERO, val_mask[i], _s_vec[i], val_vec[i]);
542 /* } */
543 }
544
545 val_vec[0] = _mm512_mask_mov_epi32(ZERO, val_mask[0], ONE);
546 val_vec[1] = _mm512_mask_mov_epi32(ZERO, val_mask[1], ONE);
547 val_vec[2] = _mm512_mask_mov_epi32(ZERO, val_mask[2], ONE);
548 val_vec[3] = _mm512_mask_mov_epi32(ZERO, val_mask[3], ONE);
549 e_qmax_vec = ZERO;
550
551 const __m512i idx[2] = {
552 _mm512_set_epi32(14, 12, 10, 8, 6, 4, 2, 0, 14, 12, 10, 8, 6, 4, 2, 0),
553 _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 15, 13, 11, 9, 7, 5, 3, 1),
554 };
555
556 /* Reorder from
557 * *_vec[0]:[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5]...[0,14], [0,15]
558 * *_vec[1]:[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5]...[1,14], [1,15]
559 * *_vec[2]:[0,16], [0,17], [0,18], [0,19], [0,20], [0,21]...[0,30], [0,31]
560 * *_vec[3]:[1,16], [1,17], [1,18], [1,19], [1,20], [1,21]...[1,30], [1,31]
561 * to
562 * *_vec[0]:[0, 0], [0, 2] ... [0,14], [0,16], [0,18] ... [0,30]
563 * *_vec[1]:[1, 0], [1, 2] ... [1,14], [1,16], [1,18] ... [1,30]
564 * *_vec[2]:[0, 1], [0, 3] ... [0,15], [0,17], [0,19] ... [0,31]
565 * *_vec[3]:[1, 1], [1, 3] ... [1,15], [1,17], [1,19] ... [1,31]
566 */
567 for (ui32 i = 0; i < 4; ++i) {
568 ui32 e_idx = i >> 1;
569 ui32 o_idx = i & 0x1;
570
571 eq_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _eq_vec[o_idx]);
572 eq_vec[i] = _mm512_mask_permutexvar_epi32(eq_vec[i], 0xFF00,
573 idx[e_idx],
574 _eq_vec[o_idx + 2]);
575
576 s_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _s_vec[o_idx]);
577 s_vec[i] = _mm512_mask_permutexvar_epi32(s_vec[i], 0xFF00,
578 idx[e_idx],
579 _s_vec[o_idx + 2]);
580
581 _rho_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], val_vec[o_idx]);
582 _rho_vec[i] = _mm512_mask_permutexvar_epi32(_rho_vec[i], 0xFF00,
583 idx[e_idx],
584 val_vec[o_idx + 2]);
585 _rho_vec[i] = _mm512_slli_epi32(_rho_vec[i], i);
586
587 e_qmax_vec = _mm512_max_epi32(e_qmax_vec, eq_vec[i]);
588 }
589
590 rho_vec = _mm512_or_epi32(_rho_vec[0], _rho_vec[1]);
591 rho_vec = _mm512_or_epi32(rho_vec, _rho_vec[2]);
592 rho_vec = _mm512_or_epi32(rho_vec, _rho_vec[3]);
593}
594
/* Transposes a 4 x 16 matrix of 32-bit values held in matrix[0..3], in
 * place, so that each group of four consecutive output lanes holds one
 * column of the input.
 *
 * from [0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, ...]
 *      [0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, ...]
 *      [0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, ...]
 *      [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, ...]
 *
 * to   [0x00, 0x10, 0x20, 0x30, 0x01, 0x11, 0x21, 0x31,
 *       0x02, 0x12, 0x22, 0x32, 0x03, 0x13, 0x23, 0x33]
 *
 *      [0x04, 0x14, 0x24, 0x34, 0x05, 0x15, 0x25, 0x35,
 *       0x06, 0x16, 0x26, 0x36, 0x07, 0x17, 0x27, 0x37]
 *
 *      [..]
 */
static void rotate_matrix(__m512i *matrix)
{
  // step 1: interleave 32-bit elements of rows (0,1) and rows (2,3)
  const __m512i lo01 = _mm512_unpacklo_epi32(matrix[0], matrix[1]);
  const __m512i hi01 = _mm512_unpackhi_epi32(matrix[0], matrix[1]);
  const __m512i lo23 = _mm512_unpacklo_epi32(matrix[2], matrix[3]);
  const __m512i hi23 = _mm512_unpackhi_epi32(matrix[2], matrix[3]);

  // step 2: interleave 64-bit pairs
  const __m512i q0 = _mm512_unpacklo_epi64(lo01, lo23);
  const __m512i q1 = _mm512_unpackhi_epi64(lo01, lo23);
  const __m512i q2 = _mm512_unpacklo_epi64(hi01, hi23);
  const __m512i q3 = _mm512_unpackhi_epi64(hi01, hi23);

  // step 3: gather 128-bit blocks 0 and 2 (0x88) or 1 and 3 (0xDD) of each
  // operand
  const __m512i even01 = _mm512_shuffle_i32x4(q0, q1, 0x88);
  const __m512i even23 = _mm512_shuffle_i32x4(q2, q3, 0x88);
  const __m512i odd01  = _mm512_shuffle_i32x4(q0, q1, 0xDD);
  const __m512i odd23  = _mm512_shuffle_i32x4(q2, q3, 0xDD);

  // step 4: same block selection once more gives the final order
  matrix[0] = _mm512_shuffle_i32x4(even01, even23, 0x88);
  matrix[1] = _mm512_shuffle_i32x4(odd01, odd23, 0x88);
  matrix[2] = _mm512_shuffle_i32x4(even01, even23, 0xDD);
  matrix[3] = _mm512_shuffle_i32x4(odd01, odd23, 0xDD);
}
631
632static void proc_ms_encode(ms_struct *msp,
633 __m512i &tuple_vec,
634 __m512i &uq_vec,
635 __m512i &rho_vec,
636 __m512i *s_vec)
637{
638 __m512i m_vec[4];
639
640 /* Prepare parameters for ms_encode */
641 /* m = (rho[i] & 1) ? Uq[i] - ((tuple[i] & 1) >> 0) : 0; */
642 auto tmp = _mm512_and_epi32(tuple_vec, ONE);
643 tmp = _mm512_sub_epi32(uq_vec, tmp);
644 auto tmp1 = _mm512_and_epi32(rho_vec, ONE);
645 auto mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
646 m_vec[0] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
647
648 /* m = (rho[i] & 2) ? Uq[i] - ((tuple[i] & 2) >> 1) : 0; */
649 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(2));
650 tmp = _mm512_srli_epi32(tmp, 1);
651 tmp = _mm512_sub_epi32(uq_vec, tmp);
652 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(2));
653 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
654 m_vec[1] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
655
656 /* m = (rho[i] & 4) ? Uq[i] - ((tuple[i] & 4) >> 2) : 0; */
657 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(4));
658 tmp = _mm512_srli_epi32(tmp, 2);
659 tmp = _mm512_sub_epi32(uq_vec, tmp);
660 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(4));
661 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
662 m_vec[2] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
663
664 /* m = (rho[i] & 8) ? Uq[i] - ((tuple[i] & 8) >> 3) : 0; */
665 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(8));
666 tmp = _mm512_srli_epi32(tmp, 3);
667 tmp = _mm512_sub_epi32(uq_vec, tmp);
668 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(8));
669 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
670 m_vec[3] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
671
672 rotate_matrix(m_vec);
673 /* s_vec from
674 * s_vec[0]:[0, 0], [0, 2] ... [0,14], [0, 16], [0, 18] ... [0,30]
675 * s_vec[1]:[1, 0], [1, 2] ... [1,14], [1, 16], [1, 18] ... [1,30]
676 * s_vec[2]:[0, 1], [0, 3] ... [0,15], [0, 17], [0, 19] ... [0,31]
677 * s_vec[3]:[1, 1], [1, 3] ... [1,15], [1, 17], [1, 19] ... [1,31]
678 * to
679 * s_vec[0]:[0, 0], [1, 0], [0, 1], [1, 1], [0, 2], [1, 2]...[0, 7], [1, 7]
680 * s_vec[1]:[0, 8], [1, 8], [0, 9], [1, 9], [0,10], [1,10]...[0,15], [1,15]
681 * s_vec[2]:[0,16], [1,16], [0,17], [1,17], [0,18], [1,18]...[0,23], [1,23]
682 * s_vec[3]:[0,24], [1,24], [0,25], [1,25], [0,26], [1,26]...[0,31], [1,31]
683 */
684 rotate_matrix(s_vec);
685
686 ui32 cwd[16];
687 int cwd_len[16];
688 ui64 _cwd = 0;
689 int _cwd_len = 0;
690
691 /* Each iteration process 8 bytes * 2 lines */
692 for (ui32 i = 0; i < 4; ++i) {
693 /* cwd = s[i * 4 + 0] & ((1U << m) - 1)
694 * cwd_len = m
695 */
696 _mm512_store_epi32(cwd_len, m_vec[i]);
697 tmp = _mm512_sllv_epi32(ONE, m_vec[i]);
698 tmp = _mm512_sub_epi32(tmp, ONE);
699 tmp = _mm512_and_epi32(tmp, s_vec[i]);
700 _mm512_store_epi32(cwd, tmp);
701
702 for (ui32 j = 0; j < 8; ++j) {
703 ui32 idx = j * 2;
704 _cwd = cwd[idx];
705 _cwd_len = cwd_len[idx];
706 _cwd |= ((ui64)cwd[idx + 1]) << _cwd_len;
707 _cwd_len += cwd_len[idx + 1];
708 ms_encode(msp, _cwd, _cwd_len);
709 }
710 }
711}
712
713static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec,
714 __m512i &e_qmax_vec)
715{
716 /* if (u_q[i] > 0) {
717 * eps[i] |= (e_q[i * 4 + 0] == e_qmax[i]);
718 * eps[i] |= (e_q[i * 4 + 1] == e_qmax[i]) << 1;
719 * eps[i] |= (e_q[i * 4 + 2] == e_qmax[i]) << 2;
720 * eps[i] |= (e_q[i * 4 + 3] == e_qmax[i]) << 3;
721 * }
722 */
723 auto u_q_mask = _mm512_cmpgt_epi32_mask(u_q_vec, ZERO);
724
725 auto mask = _mm512_cmpeq_epi32_mask(eq_vec[0], e_qmax_vec);
726 auto tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
727 auto eps_vec = _mm512_mask_mov_epi32(ZERO, u_q_mask, tmp);
728
729 mask = _mm512_cmpeq_epi32_mask(eq_vec[1], e_qmax_vec);
730 tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
731 tmp = _mm512_slli_epi32(tmp, 1);
732 eps_vec = _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);
733
734 mask = _mm512_cmpeq_epi32_mask(eq_vec[2], e_qmax_vec);
735 tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
736 tmp = _mm512_slli_epi32(tmp, 2);
737 eps_vec = _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);
738
739 mask = _mm512_cmpeq_epi32_mask(eq_vec[3], e_qmax_vec);
740 tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
741 tmp = _mm512_slli_epi32(tmp, 3);
742
743 return _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);
744}
745
746static void update_lep(ui32 x, __m512i &prev_e_val_vec,
747 __m512i *eq_vec, __m512i *e_val_vec,
748 const __m512i left_shift)
749{
750 /* lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++;
751 * lep[0] = (ui8)e_q[3];
752 * Compare e_q[1] with e_q[3] of the prevous round.
753 */
754 auto tmp = _mm512_mask_permutexvar_epi32(prev_e_val_vec, 0xFFFE,
755 left_shift, eq_vec[3]);
756 prev_e_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
757 eq_vec[3]);
758 e_val_vec[x] = _mm512_max_epi32(eq_vec[1], tmp);
759}
760
761
762static void update_lcxp(ui32 x, __m512i &prev_cx_val_vec,
763 __m512i &rho_vec, __m512i *cx_val_vec,
764 const __m512i left_shift)
765{
766 /* lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++;
767 * lcxp[0] = (ui8)((rho[0] & 8) >> 3);
768 * Or (rho[0] & 2) and (rho[0] of the previous round & 8).
769 */
770 auto tmp = _mm512_mask_permutexvar_epi32(prev_cx_val_vec, 0xFFFE,
771 left_shift, rho_vec);
772 prev_cx_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
773 rho_vec);
774
775 tmp = _mm512_and_epi32(tmp, _mm512_set1_epi32(8));
776 tmp = _mm512_srli_epi32(tmp, 3);
777
778 auto tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(2));
779 tmp1 = _mm512_srli_epi32(tmp1, 1);
780 cx_val_vec[x] = _mm512_or_epi32(tmp, tmp1);
781}
782
783static __m512i cal_tuple(__m512i &cq_vec, __m512i &rho_vec,
784 __m512i &eps_vec, ui32 *vlc_tbl)
785{
786 /* tuple[i] = vlc_tbl1[(c_q[i] << 8) + (rho[i] << 4) + eps[i]]; */
787 auto tmp = _mm512_slli_epi32(cq_vec, 8);
788 auto tmp1 = _mm512_slli_epi32(rho_vec, 4);
789 tmp = _mm512_add_epi32(tmp, tmp1);
790 tmp = _mm512_add_epi32(tmp, eps_vec);
791 return _mm512_i32gather_epi32(tmp, vlc_tbl, 4);
792}
793
794static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
795 const __m512i right_shift)
796{
797 ojph_unused(x);
798 ojph_unused(cx_val_vec);
799 ojph_unused(right_shift);
800
801 /* c_q[i + 1] = (rho[i] >> 1) | (rho[i] & 1); */
802 auto tmp = _mm512_srli_epi32(rho_vec, 1);
803 auto tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(1));
804 return _mm512_or_epi32(tmp, tmp1);
805}
806
807static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
808 const __m512i right_shift)
809{
810 // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
811 // | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
812 auto lcxp1_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x]);
813 auto lcxp2_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x + 1]);
814 auto tmp = _mm512_permutexvar_epi32(right_shift, lcxp1_vec);
815 tmp = _mm512_mask_permutexvar_epi32(tmp, 0xC000, right_shift, lcxp2_vec);
816 tmp = _mm512_slli_epi32(tmp, 2);
817 auto tmp1 = _mm512_mask_mov_epi32(lcxp1_vec, 0x8000, lcxp2_vec);
818 tmp = _mm512_add_epi32(tmp1, tmp);
819
820 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(4));
821 tmp1 = _mm512_srli_epi32(tmp1, 1);
822 tmp = _mm512_or_epi32(tmp, tmp1);
823
824 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(8));
825 tmp1 = _mm512_srli_epi32(tmp1, 2);
826
827 return _mm512_or_epi32(tmp, tmp1);
828}
829
830using fn_proc_cq = __m512i (*)(ui32, __m512i *, __m512i &, const __m512i);
831
832static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
833 __m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
834 const __m512i right_shift)
835{
836 /* Prepare mel_encode params */
837 /* if (c_q[i] == 0) { */
838 auto mel_need_encode = _mm512_cmpeq_epi32_mask(cq_vec, ZERO);
839 /* mel_encode(&mel, rho[i] != 0); */
840 auto mel_bit = _mm512_cmpneq_epi32_mask(rho_vec, ZERO);
841 /* } */
842
843 /* mel_encode(&mel, ojph_min(u_q[i], u_q[i + 1]) > 2); */
844 auto tmp = _mm512_permutexvar_epi32(right_shift, u_q_vec);
845 auto tmp1 = _mm512_min_epi32(u_q_vec, tmp);
846 auto mel_bit2 = (ui16)_mm512_cmpgt_epi32_mask(tmp1, _mm512_set1_epi32(2));
847
848 /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */
849 auto mel_need_encode2 = (ui16)_mm512_cmpgt_epi32_mask(u_q_vec, ZERO);
850 mel_need_encode2 =
851 mel_need_encode2 & (ui16)_mm512_cmpgt_epi32_mask(tmp, ZERO);
852
853 ui32 i_max = 16 - (ignore / 2);
854
855 for (ui32 i = 0; i < i_max; i += 2) {
856 auto mask = 1 << i;
857 if (0 != (mel_need_encode & mask)) {
858 mel_encode(melp, mel_bit & mask);
859 }
860
861 if (i + 1 < i_max) {
862 auto mask = 1 << (i + 1);
863 if (0 != (mel_need_encode & mask)) {
864 mel_encode(melp, mel_bit & mask);
865 }
866 }
867
868 if (0 != (mel_need_encode2 & mask)) {
869 mel_encode(melp, mel_bit2 & mask);
870 }
871 }
872}
873
874static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
875 __m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
876 const __m512i right_shift)
877{
878 ojph_unused(u_q_vec);
879 ojph_unused(right_shift);
880
881 /* Prepare mel_encode params */
882 /* if (c_q[i] == 0) { */
883 auto mel_need_encode = _mm512_cmpeq_epi32_mask(cq_vec, ZERO);
884 /* mel_encode(&mel, rho[i] != 0); */
885 auto mel_bit = _mm512_cmpneq_epi32_mask(rho_vec, ZERO);
886 /* } */
887
888 ui32 i_max = 16 - (ignore / 2);
889
890 for (ui32 i = 0; i < i_max; ++i) {
891 auto mask = 1 << i;
892 if (0 != (mel_need_encode & mask)) {
893 mel_encode(melp, mel_bit & mask);
894 }
895 }
896}
897
898using fn_proc_mel_encode = void (*)(mel_struct *, __m512i &, __m512i &,
899 __m512i, ui32, const __m512i);
900
901static void proc_vlc_encode1(vlc_struct_avx512 *vlcp, ui32 *tuple,
902 ui32 *u_q, ui32 ignore)
903{
904 ui32 i_max = 16 - (ignore / 2);
905
906 for (ui32 i = 0; i < i_max; i += 2) {
907 /* 7 bits */
908 ui32 val = tuple[i + 0] >> 4;
909 int size = tuple[i + 0] & 7;
910
911 if (i + 1 < i_max) {
912 /* 7 bits */
913 val |= (tuple[i + 1] >> 4) << size;
914 size += tuple[i + 1] & 7;
915 }
916
917 if (u_q[i] > 2 && u_q[i + 1] > 2) {
918 /* 3 bits */
919 val |= (ulvc_cwd_pre[u_q[i] - 2]) << size;
920 size += ulvc_cwd_pre_len[u_q[i] - 2];
921
922 /* 3 bits */
923 val |= (ulvc_cwd_pre[u_q[i + 1] - 2]) << size;
924 size += ulvc_cwd_pre_len[u_q[i + 1] - 2];
925
926 /* 5 bits */
927 val |= (ulvc_cwd_suf[u_q[i] - 2]) << size;
928 size += ulvc_cwd_suf_len[u_q[i] - 2];
929
930 /* 5 bits */
931 val |= (ulvc_cwd_suf[u_q[i + 1] - 2]) << size;
932 size += ulvc_cwd_suf_len[u_q[i + 1] - 2];
933
934 } else if (u_q[i] > 2 && u_q[i + 1] > 0) {
935 /* 3 bits */
936 val |= (ulvc_cwd_pre[u_q[i]]) << size;
937 size += ulvc_cwd_pre_len[u_q[i]];
938
939 /* 1 bit */
940 val |= (u_q[i + 1] - 1) << size;
941 size += 1;
942
943 /* 5 bits */
944 val |= (ulvc_cwd_suf[u_q[i]]) << size;
945 size += ulvc_cwd_suf_len[u_q[i]];
946
947 } else {
948 /* 3 bits */
949 val |= (ulvc_cwd_pre[u_q[i]]) << size;
950 size += ulvc_cwd_pre_len[u_q[i]];
951
952 /* 3 bits */
953 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
954 size += ulvc_cwd_pre_len[u_q[i + 1]];
955
956 /* 5 bits */
957 val |= (ulvc_cwd_suf[u_q[i]]) << size;
958 size += ulvc_cwd_suf_len[u_q[i]];
959
960 /* 5 bits */
961 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
962 size += ulvc_cwd_suf_len[u_q[i + 1]];
963 }
964
965 vlc_encode(vlcp, val, size);
966 }
967}
968
969static void proc_vlc_encode2(vlc_struct_avx512 *vlcp, ui32 *tuple,
970 ui32 *u_q, ui32 ignore)
971{
972 ui32 i_max = 16 - (ignore / 2);
973
974 for (ui32 i = 0; i < i_max; i += 2) {
975 /* 7 bits */
976 ui32 val = tuple[i + 0] >> 4;
977 int size = tuple[i + 0] & 7;
978
979 if (i + 1 < i_max) {
980 /* 7 bits */
981 val |= (tuple[i + 1] >> 4) << size;
982 size += tuple[i + 1] & 7;
983 }
984
985 /* 3 bits */
986 val |= ulvc_cwd_pre[u_q[i]] << size;
987 size += ulvc_cwd_pre_len[u_q[i]];
988
989 /* 3 bits */
990 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
991 size += ulvc_cwd_pre_len[u_q[i + 1]];
992
993 /* 5 bits */
994 val |= (ulvc_cwd_suf[u_q[i + 0]]) << size;
995 size += ulvc_cwd_suf_len[u_q[i + 0]];
996
997 /* 5 bits */
998 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
999 size += ulvc_cwd_suf_len[u_q[i + 1]];
1000
1001 vlc_encode(vlcp, val, size);
1002 }
1003}
1004
1005using fn_proc_vlc_encode = void (*)(vlc_struct_avx512 *, ui32 *, ui32 *, ui32);
1006
1007void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
1008 ui32 num_passes, ui32 _width, ui32 height,
1009 ui32 stride, ui32* lengths,
1011 ojph::coded_lists *& coded)
1012{
1013 ojph_unused(num_passes); //currently not used
1014
1015 ui32 width = (_width + 31) & ~31u;
1016 ui32 ignore = width - _width;
1017 const int ms_size = (16384 * 16 + 14) / 15; //more than enough
1018 const int mel_vlc_size = 3072; //more than enough
1019 const int mel_size = 192;
1020 const int vlc_size = mel_vlc_size - mel_size;
1021
1022 ui8 ms_buf[ms_size];
1023 ui8 mel_vlc_buf[mel_vlc_size];
1024 ui8 *mel_buf = mel_vlc_buf;
1025 ui8 *vlc_buf = mel_vlc_buf + mel_size;
1026
1027 mel_struct mel;
1028 mel_init(&mel, mel_size, mel_buf);
1030 vlc_init(&vlc, vlc_size, vlc_buf);
1031 ms_struct ms;
1032 ms_init(&ms, ms_size, ms_buf);
1033
1034 ui32 p = 30 - missing_msbs;
1035
1036 //e_val: E values for a line (these are the highest set bit)
1037 //cx_val: is the context values
1038 //Each byte stores the info for the 2 sample. For E, it is maximum
1039 // of the two samples, while for cx, it is the OR of these two samples.
1040 //The maximum is between the pixel at the bottom left of one quad
1041 // and the bottom right of the earlier quad. The same is true for cx.
1042 //For a 1024 pixels, we need 512 bytes, the 2 extra,
1043 // one for the non-existing earlier quad, and one for beyond the
1044 // the end
1045 const __m512i right_shift = _mm512_set_epi32(
1046 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
1047 );
1048
1049 const __m512i left_shift = _mm512_set_epi32(
1050 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15
1051 );
1052
1053 __m512i e_val_vec[33];
1054 for (ui32 i = 0; i < 32; ++i) {
1055 e_val_vec[i] = ZERO;
1056 }
1057 __m512i prev_e_val_vec = ZERO;
1058
1059 __m512i cx_val_vec[33];
1060 __m512i prev_cx_val_vec = ZERO;
1061
1062 __m512i prev_cq_vec = ZERO;
1063
1064 __m512i tmp;
1065 __m512i tmp1;
1066
1067 __m512i eq_vec[4];
1068 __m512i s_vec[4];
1069 __m512i src_vec[4];
1070 __m512i rho_vec;
1071 __m512i e_qmax_vec;
1072 __m512i kappa_vec;
1073
1074 ui32 n_loop = (width + 31) / 32;
1075
1076 ui32 *vlc_tbl = vlc_tbl0;
1077 fn_proc_cq proc_cq = proc_cq1;
1078 fn_proc_mel_encode proc_mel_encode = proc_mel_encode1;
1079 fn_proc_vlc_encode proc_vlc_encode = proc_vlc_encode1;
1080
1081 /* 2 lines per iteration */
1082 for (ui32 y = 0; y < height; y += 2)
1083 {
1084 e_val_vec[n_loop] = prev_e_val_vec;
1085 /* lcxp[0] = (ui8)((rho[0] & 8) >> 3); */
1086 tmp = _mm512_and_epi32(prev_cx_val_vec, _mm512_set1_epi32(8));
1087 tmp = _mm512_srli_epi32(tmp, 3);
1088 cx_val_vec[n_loop] = tmp;
1089
1090 prev_e_val_vec = ZERO;
1091 prev_cx_val_vec = ZERO;
1092
1093 ui32 *sp = buf + y * stride;
1094
1095 /* 32 bytes per iteration */
1096 for (ui32 x = 0; x < n_loop; ++x) {
1097
1098 // mask to stop loading unnecessary data
1099 si32 true_x = (si32)x << 5;
1100 ui32 mask32 = 0xFFFFFFFFu;
1101 si32 entries = true_x + 32 - (si32)_width;
1102 mask32 >>= ((entries >= 0) ? entries : 0);
1103 __mmask16 load_mask0 = _cvtu32_mask16(mask32);
1104 __mmask16 load_mask1 = _cvtu32_mask16(mask32 >> 16);
1105
1106 /* t = sp[i]; */
1107 src_vec[0] = _mm512_maskz_loadu_epi32(load_mask0, sp);
1108 src_vec[2] = _mm512_maskz_loadu_epi32(load_mask1, sp + 16);
1109
1110 if (y + 1 < height) {
1111 src_vec[1] = _mm512_maskz_loadu_epi32(load_mask0, sp + stride);
1112 src_vec[3] =
1113 _mm512_maskz_loadu_epi32(load_mask1, sp + 16 + stride);
1114 } else {
1115 src_vec[1] = ZERO;
1116 src_vec[3] = ZERO;
1117 }
1118 sp += 32;
1119
1120 /* src_vec layout:
1121 * src_vec[0]:[0, 0],[0, 1],[0, 2],[0, 3],[0, 4],[0, 5]...[0,15]
1122 * src_vec[1]:[1, 0],[1, 1],[1, 2],[1, 3],[1, 4],[1, 5]...[1,15]
1123 * src_vec[2]:[0,16],[0,17],[0,18],[0,19],[0,20],[0,21]...[0,31]
1124 * src_vec[3]:[1,16],[1,17],[1,18],[1,19],[1,20],[1,21]...[1,31]
1125 */
1126 proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);
1127
1128 // max_e[(i + 1) % num] = ojph_max(lep[i + 1], lep[i + 2]) - 1;
1129 tmp = _mm512_permutexvar_epi32(right_shift, e_val_vec[x]);
1130 tmp = _mm512_mask_permutexvar_epi32(tmp, 0x8000, right_shift,
1131 e_val_vec[x + 1]);
1132 auto mask = _mm512_cmpgt_epi32_mask(e_val_vec[x], tmp);
1133 auto max_e_vec = _mm512_mask_mov_epi32(tmp, mask, e_val_vec[x]);
1134 max_e_vec = _mm512_sub_epi32(max_e_vec, ONE);
1135
1136 // kappa[i] = (rho[i] & (rho[i] - 1)) ? ojph_max(1, max_e[i]) : 1;
1137 tmp = _mm512_max_epi32(max_e_vec, ONE);
1138 tmp1 = _mm512_sub_epi32(rho_vec, ONE);
1139 tmp1 = _mm512_and_epi32(rho_vec, tmp1);
1140 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
1141 kappa_vec = _mm512_mask_mov_epi32(ONE, mask, tmp);
1142
1143 /* cq[1 - 16] = cq_vec
1144 * cq[0] = prev_cq_vec[0]
1145 */
1146 tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
1147 auto cq_vec = _mm512_mask_permutexvar_epi32(prev_cq_vec, 0xFFFE,
1148 left_shift, tmp);
1149 prev_cq_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
1150 tmp);
1151
1152 update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
1153 update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);
1154
1155 /* Uq[i] = ojph_max(e_qmax[i], kappa[i]); */
1156 /* u_q[i] = Uq[i] - kappa[i]; */
1157 auto uq_vec = _mm512_max_epi32(kappa_vec, e_qmax_vec);
1158 auto u_q_vec = _mm512_sub_epi32(uq_vec, kappa_vec);
1159
1160 auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
1161 __m512i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
1162 ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
1163
1164 proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1165 right_shift);
1166
1167 proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
1168
1169 // vlc_encode(&vlc, tuple[i*2+0] >> 8, (tuple[i*2+0] >> 4) & 7);
1170 // vlc_encode(&vlc, tuple[i*2+1] >> 8, (tuple[i*2+1] >> 4) & 7);
1171 ui32 u_q[16];
1172 ui32 tuple[16];
1173 /* The tuple is scaled by 4 due to:
1174 * vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7, true);
1175 * So in the vlc_encode, the tuple will only be scaled by 2.
1176 */
1177 tuple_vec = _mm512_srli_epi32(tuple_vec, 4);
1178 _mm512_store_epi32(tuple, tuple_vec);
1179 _mm512_store_epi32(u_q, u_q_vec);
1180 proc_vlc_encode(&vlc, tuple, u_q, _ignore);
1181 }
1182
1183 tmp = _mm512_permutexvar_epi32(right_shift, cx_val_vec[0]);
1184 tmp = _mm512_slli_epi32(tmp, 2);
1185 prev_cq_vec = _mm512_maskz_add_epi32(0x1, tmp, cx_val_vec[0]);
1186
1187 proc_cq = proc_cq2;
1188 vlc_tbl = vlc_tbl1;
1189 proc_mel_encode = proc_mel_encode2;
1190 proc_vlc_encode = proc_vlc_encode2;
1191 }
1192
1193 ms_terminate(&ms);
1194 terminate_mel_vlc(&mel, &vlc);
1195
1196 //copy to elastic
1197 lengths[0] = mel.pos + vlc.pos + ms.pos;
1198 elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded);
1199 memcpy(coded->buf, ms.buf, ms.pos);
1200 memcpy(coded->buf + ms.pos, mel.buf, mel.pos);
1201 memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
1202
1203 // put in the interface locator word
1204 ui32 num_bytes = mel.pos + vlc.pos;
1205 coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4);
1206 coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0;
1207 coded->buf[lengths[0]-2] =
1208 (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF));
1209
1210 coded->avail_size -= lengths[0];
1211}
1212
1213} /* namespace local */
1214} /* namespace ojph */
1215
void get_buffer(ui32 needed_bytes, coded_lists *&p)
Definition: ojph_mem.cpp:115
static bool uvlc_init_tables()
Initializes uvlc_tbl0 and uvlc_tbl1 tables.
static bool vlc_init_tables()
Initializes vlc_tbl0 and vlc_tbl1 tables, from table0.h and table1.h.
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
void(*)(vlc_struct_avx2 *, ui32 *, ui32 *, ui32) fn_proc_vlc_encode
static void ms_terminate(ms_struct *msp)
static int ulvc_cwd_suf_len[33]
bool initialize_block_encoder_tables_avx512()
void(*)(mel_struct *, __m256i &, __m256i &, __m256i, ui32, const __m256i) fn_proc_mel_encode
static __m256i proc_cq1(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec, const __m256i right_shift)
static void vlc_encode(vlc_struct *vlcp, int cwd, int cwd_len)
static void proc_pixel(__m256i *src_vec, ui32 p, __m256i *eq_vec, __m256i *s_vec, __m256i &rho_vec, __m256i &e_qmax_vec)
static ui32 ulvc_cwd_suf[33]
static void proc_ms_encode(ms_struct *msp, __m256i &tuple_vec, __m256i &uq_vec, __m256i &rho_vec, __m256i *s_vec)
static void terminate_mel_vlc(mel_struct *melp, vlc_struct *vlcp)
static void update_lep(ui32 x, __m256i &prev_e_val_vec, __m256i *eq_vec, __m256i *e_val_vec, const __m256i left_shift)
static __m256i proc_cq2(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec, const __m256i right_shift)
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static __m256i cal_eps_vec(__m256i *eq_vec, __m256i &u_q_vec, __m256i &e_qmax_vec)
static void rotate_matrix(__m256i *matrix)
static ui32 ulvc_cwd_pre[33]
static void ms_init(ms_struct *msp, ui32 buffer_size, ui8 *data)
static void ms_encode(ms_struct *msp, ui32 cwd, int cwd_len)
__m256i(*)(ui32, __m256i *, __m256i &, const __m256i) fn_proc_cq
static int ulvc_cwd_pre_len[33]
static void proc_mel_encode1(mel_struct *melp, __m256i &cq_vec, __m256i &rho_vec, __m256i u_q_vec, ui32 ignore, const __m256i right_shift)
static void proc_vlc_encode2(vlc_struct_avx2 *vlcp, ui32 *tuple, ui32 *u_q, ui32 ignore)
static void mel_encode(mel_struct *melp, bool bit)
static void mel_emit_bit(mel_struct *melp, int v)
static void update_lcxp(ui32 x, __m256i &prev_cx_val_vec, __m256i &rho_vec, __m256i *cx_val_vec, const __m256i left_shift)
static bool tables_initialized
static void vlc_init(vlc_struct *vlcp, ui32 buffer_size, ui8 *data)
static __m256i cal_tuple(__m256i &cq_vec, __m256i &rho_vec, __m256i &eps_vec, ui32 *vlc_tbl)
void ojph_encode_codeblock_avx512(ui32 *buf, ui32 missing_msbs, ui32 num_passes, ui32 width, ui32 height, ui32 stride, ui32 *lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *&coded)
static void proc_mel_encode2(mel_struct *melp, __m256i &cq_vec, __m256i &rho_vec, __m256i u_q_vec, ui32 ignore, const __m256i right_shift)
static void proc_vlc_encode1(vlc_struct_avx2 *vlcp, ui32 *tuple, ui32 *u_q, ui32 ignore)
uint64_t ui64
Definition: ojph_defs.h:56
uint16_t ui16
Definition: ojph_defs.h:52
static ui32 population_count(ui32 val)
Definition: ojph_arch.h:152
int32_t si32
Definition: ojph_defs.h:55
uint32_t ui32
Definition: ojph_defs.h:54
uint8_t ui8
Definition: ojph_defs.h:50
#define likely(x)
#define unlikely(x)
#define ojph_max(a, b)
Definition: ojph_defs.h:73
#define ojph_min(a, b)
Definition: ojph_defs.h:76
#define ojph_unused(x)
Definition: ojph_defs.h:78
#define OJPH_ERROR(t,...)
Definition: ojph_message.h:287