diff options
Diffstat (limited to 'media/libjpeg/jcphuff.c')
-rw-r--r-- | media/libjpeg/jcphuff.c | 688 |
1 files changed, 484 insertions, 204 deletions
diff --git a/media/libjpeg/jcphuff.c b/media/libjpeg/jcphuff.c index 046e2e18d4..872e570bff 100644 --- a/media/libjpeg/jcphuff.c +++ b/media/libjpeg/jcphuff.c @@ -4,7 +4,10 @@ * This file was part of the Independent JPEG Group's software: * Copyright (C) 1995-1997, Thomas G. Lane. * libjpeg-turbo Modifications: - * Copyright (C) 2015, D. R. Commander. + * Copyright (C) 2011, 2015, 2018, 2021-2022, D. R. Commander. + * Copyright (C) 2016, 2018, Matthieu Darbois. + * Copyright (C) 2020, Arm Limited. + * Copyright (C) 2021, Alex Richardson. * For conditions of distribution and use, see the accompanying README.ijg * file. * @@ -18,15 +21,74 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jchuff.h" /* Declarations shared with jchuff.c */ +#include "jsimd.h" +#include "jconfigint.h" +#include <limits.h> + +#ifdef HAVE_INTRIN_H +#include <intrin.h> +#ifdef _MSC_VER +#ifdef HAVE_BITSCANFORWARD64 +#pragma intrinsic(_BitScanForward64) +#endif +#ifdef HAVE_BITSCANFORWARD +#pragma intrinsic(_BitScanForward) +#endif +#endif +#endif #ifdef C_PROGRESSIVE_SUPPORTED +/* + * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be + * used for bit counting rather than the lookup table. This will reduce the + * memory footprint by 64k, which is important for some mobile applications + * that create many isolated instances of libjpeg-turbo (web browsers, for + * instance.) This may improve performance on some mobile platforms as well. + * This feature is enabled by default only on Arm processors, because some x86 + * chips have a slow implementation of bsr, and the use of clz/bsr cannot be + * shown to have a significant performance impact even on the x86 chips that + * have a fast implementation of it. When building for Armv6, you can + * explicitly disable the use of clz/bsr by adding -mthumb to the compiler + * flags (this defines __thumb__). + */ + +/* NOTE: Both GCC and Clang define __GNUC__ */ +#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \ + defined(_M_ARM) || defined(_M_ARM64) +#if !defined(__thumb__) || defined(__thumb2__) +#define USE_CLZ_INTRINSIC +#endif +#endif + +#ifdef USE_CLZ_INTRINSIC +#if defined(_MSC_VER) && !defined(__clang__) +#define JPEG_NBITS_NONZERO(x) (32 - _CountLeadingZeros(x)) +#else +#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x)) +#endif +#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0) +#else +#include "jpeg_nbits_table.h" +#define JPEG_NBITS(x) (jpeg_nbits_table[x]) +#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x) +#endif + + /* Expanded entropy encoder object for progressive Huffman encoding. */ typedef struct { struct jpeg_entropy_encoder pub; /* public fields */ + /* Pointer to routine to prepare data for encode_mcu_AC_first() */ + void (*AC_first_prepare) (const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits); + /* Pointer to routine to prepare data for encode_mcu_AC_refine() */ + int (*AC_refine_prepare) (const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits); + /* Mode flag: TRUE for optimization, FALSE for actual data output */ boolean gather_statistics; @@ -79,26 +141,62 @@ typedef phuff_entropy_encoder *phuff_entropy_ptr; #ifdef RIGHT_SHIFT_IS_UNSIGNED #define ISHIFT_TEMPS int ishift_temp; -#define IRIGHT_SHIFT(x,shft) \ - ((ishift_temp = (x)) < 0 ? \ - (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \ - (ishift_temp >> (shft))) +#define IRIGHT_SHIFT(x, shft) \ + ((ishift_temp = (x)) < 0 ? \ + (ishift_temp >> (shft)) | ((~0) << (16 - (shft))) : \ + (ishift_temp >> (shft))) #else #define ISHIFT_TEMPS -#define IRIGHT_SHIFT(x,shft) ((x) >> (shft)) +#define IRIGHT_SHIFT(x, shft) ((x) >> (shft)) #endif +#define PAD(v, p) ((v + (p) - 1) & (~((p) - 1))) + /* Forward declarations */ -METHODDEF(boolean) encode_mcu_DC_first (j_compress_ptr cinfo, +METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo, + JBLOCKROW *MCU_data); +METHODDEF(void) encode_mcu_AC_first_prepare + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *values, size_t *zerobits); +METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo, + JBLOCKROW *MCU_data); +METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data); -METHODDEF(boolean) encode_mcu_AC_first (j_compress_ptr cinfo, +METHODDEF(int) encode_mcu_AC_refine_prepare + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *absvalues, size_t *bits); +METHODDEF(boolean) encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data); -METHODDEF(boolean) encode_mcu_DC_refine (j_compress_ptr cinfo, - JBLOCKROW *MCU_data); -METHODDEF(boolean) encode_mcu_AC_refine (j_compress_ptr cinfo, - JBLOCKROW *MCU_data); -METHODDEF(void) finish_pass_phuff (j_compress_ptr cinfo); -METHODDEF(void) finish_pass_gather_phuff (j_compress_ptr cinfo); +METHODDEF(void) finish_pass_phuff(j_compress_ptr cinfo); +METHODDEF(void) finish_pass_gather_phuff(j_compress_ptr cinfo); + + +/* Count bit loop zeroes */ +INLINE +METHODDEF(int) +count_zeroes(size_t *x) +{ +#if defined(HAVE_BUILTIN_CTZL) + int result; + result = __builtin_ctzl(*x); + *x >>= result; +#elif defined(HAVE_BITSCANFORWARD64) + unsigned long result; + _BitScanForward64(&result, *x); + *x >>= result; +#elif defined(HAVE_BITSCANFORWARD) + unsigned long result; + _BitScanForward(&result, *x); + *x >>= result; +#else + int result = 0; + while ((*x & 1) == 0) { + ++result; + *x >>= 1; + } +#endif + return (int)result; +} /* @@ -106,9 +204,9 @@ METHODDEF(void) finish_pass_gather_phuff (j_compress_ptr cinfo); */ METHODDEF(void) -start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics) +start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics) { - phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy; boolean is_DC_band; int ci, tbl; jpeg_component_info *compptr; @@ -126,15 +224,23 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics) entropy->pub.encode_mcu = encode_mcu_DC_first; else entropy->pub.encode_mcu = encode_mcu_AC_first; + if (jsimd_can_encode_mcu_AC_first_prepare()) + entropy->AC_first_prepare = jsimd_encode_mcu_AC_first_prepare; + else + entropy->AC_first_prepare = encode_mcu_AC_first_prepare; } else { if (is_DC_band) entropy->pub.encode_mcu = encode_mcu_DC_refine; else { entropy->pub.encode_mcu = encode_mcu_AC_refine; + if (jsimd_can_encode_mcu_AC_refine_prepare()) + entropy->AC_refine_prepare = jsimd_encode_mcu_AC_refine_prepare; + else + entropy->AC_refine_prepare = encode_mcu_AC_refine_prepare; /* AC refinement needs a correction bit buffer */ if (entropy->bit_buffer == NULL) entropy->bit_buffer = (char *) - (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, MAX_CORR_BITS * sizeof(char)); } } @@ -167,14 +273,14 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics) /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */ if (entropy->count_ptrs[tbl] == NULL) entropy->count_ptrs[tbl] = (long *) - (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, 257 * sizeof(long)); - MEMZERO(entropy->count_ptrs[tbl], 257 * sizeof(long)); + memset(entropy->count_ptrs[tbl], 0, 257 * sizeof(long)); } else { /* Compute derived values for Huffman table */ /* We may do this more than once for a table, but it's not expensive */ jpeg_make_c_derived_tbl(cinfo, is_DC_band, tbl, - & entropy->derived_tbls[tbl]); + &entropy->derived_tbls[tbl]); } } @@ -198,19 +304,20 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics) */ /* Emit a byte */ -#define emit_byte(entropy,val) \ - { *(entropy)->next_output_byte++ = (JOCTET) (val); \ - if (--(entropy)->free_in_buffer == 0) \ - dump_buffer(entropy); } +#define emit_byte(entropy, val) { \ + *(entropy)->next_output_byte++ = (JOCTET)(val); \ + if (--(entropy)->free_in_buffer == 0) \ + dump_buffer(entropy); \ +} LOCAL(void) -dump_buffer (phuff_entropy_ptr entropy) +dump_buffer(phuff_entropy_ptr entropy) /* Empty the output buffer; we do not support suspension in this module. */ { struct jpeg_destination_mgr *dest = entropy->cinfo->dest; - if (! (*dest->empty_output_buffer) (entropy->cinfo)) + if (!(*dest->empty_output_buffer) (entropy->cinfo)) ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND); /* After a successful buffer dump, must reset buffer pointers */ entropy->next_output_byte = dest->next_output_byte; @@ -227,11 +334,11 @@ dump_buffer (phuff_entropy_ptr entropy) */ LOCAL(void) -emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size) +emit_bits(phuff_entropy_ptr entropy, unsigned int code, int size) /* Emit some bits, unless we are in gather mode */ { /* This routine is heavily used, so it's worth coding tightly. */ - register size_t put_buffer = (size_t) code; + register size_t put_buffer = (size_t)code; register int put_bits = entropy->put_bits; /* if size is 0, caller used an invalid Huffman table entry */ @@ -241,7 +348,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size) if (entropy->gather_statistics) return; /* do nothing if we're only getting stats */ - put_buffer &= (((size_t) 1)<<size) - 1; /* mask off any extra bits in code */ + put_buffer &= (((size_t)1) << size) - 1; /* mask off any extra bits in code */ put_bits += size; /* new number of bits in buffer */ @@ -250,7 +357,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size) put_buffer |= entropy->put_buffer; /* and merge with old buffer contents */ while (put_bits >= 8) { - int c = (int) ((put_buffer >> 16) & 0xFF); + int c = (int)((put_buffer >> 16) & 0xFF); emit_byte(entropy, c); if (c == 0xFF) { /* need to stuff a zero byte? */ @@ -266,7 +373,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size) LOCAL(void) -flush_bits (phuff_entropy_ptr entropy) +flush_bits(phuff_entropy_ptr entropy) { emit_bits(entropy, 0x7F, 7); /* fill any partial byte with ones */ entropy->put_buffer = 0; /* and reset bit-buffer to empty */ @@ -279,7 +386,7 @@ flush_bits (phuff_entropy_ptr entropy) */ LOCAL(void) -emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol) +emit_symbol(phuff_entropy_ptr entropy, int tbl_no, int symbol) { if (entropy->gather_statistics) entropy->count_ptrs[tbl_no][symbol]++; @@ -295,14 +402,14 @@ emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol) */ LOCAL(void) -emit_buffered_bits (phuff_entropy_ptr entropy, char *bufstart, - unsigned int nbits) +emit_buffered_bits(phuff_entropy_ptr entropy, char *bufstart, + unsigned int nbits) { if (entropy->gather_statistics) return; /* no real work */ while (nbits > 0) { - emit_bits(entropy, (unsigned int) (*bufstart), 1); + emit_bits(entropy, (unsigned int)(*bufstart), 1); bufstart++; nbits--; } @@ -314,15 +421,13 @@ emit_buffered_bits (phuff_entropy_ptr entropy, char *bufstart, */ LOCAL(void) -emit_eobrun (phuff_entropy_ptr entropy) +emit_eobrun(phuff_entropy_ptr entropy) { register int temp, nbits; if (entropy->EOBRUN > 0) { /* if there is any pending EOBRUN */ temp = entropy->EOBRUN; - nbits = 0; - while ((temp >>= 1)) - nbits++; + nbits = JPEG_NBITS_NONZERO(temp) - 1; /* safety check: shouldn't happen given limited correction-bit buffer */ if (nbits > 14) ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE); @@ -345,13 +450,13 @@ emit_eobrun (phuff_entropy_ptr entropy) */ LOCAL(void) -emit_restart (phuff_entropy_ptr entropy, int restart_num) +emit_restart(phuff_entropy_ptr entropy, int restart_num) { int ci; emit_eobrun(entropy); - if (! entropy->gather_statistics) { + if (!entropy->gather_statistics) { flush_bits(entropy); emit_byte(entropy, 0xFF); emit_byte(entropy, JPEG_RST0 + restart_num); @@ -375,10 +480,10 @@ emit_restart (phuff_entropy_ptr entropy, int restart_num) */ METHODDEF(boolean) -encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data) { - phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; - register int temp, temp2; + phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy; + register int temp, temp2, temp3; register int nbits; int blkn, ci; int Al = cinfo->Al; @@ -403,31 +508,31 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) /* Compute the DC value after the required point transform by Al. * This is simply an arithmetic right shift. */ - temp2 = IRIGHT_SHIFT((int) ((*block)[0]), Al); + temp2 = IRIGHT_SHIFT((int)((*block)[0]), Al); /* DC differences are figured on the point-transformed values. */ temp = temp2 - entropy->last_dc_val[ci]; entropy->last_dc_val[ci] = temp2; /* Encode the DC coefficient difference per section G.1.2.1 */ - temp2 = temp; - if (temp < 0) { - temp = -temp; /* temp is abs value of input */ - /* For a negative input, want temp2 = bitwise complement of abs(input) */ - /* This code assumes we are on a two's complement machine */ - temp2--; - } + + /* This is a well-known technique for obtaining the absolute value without + * a branch. It is derived from an assembly language technique presented + * in "How to Optimize for the Pentium Processors", Copyright (c) 1996, + * 1997 by Agner Fog. + */ + temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); + temp ^= temp3; + temp -= temp3; /* temp is abs value of input */ + /* For a negative input, want temp2 = bitwise complement of abs(input) */ + temp2 = temp ^ temp3; /* Find the number of bits needed for the magnitude of the coefficient */ - nbits = 0; - while (temp) { - nbits++; - temp >>= 1; - } + nbits = JPEG_NBITS(temp); /* Check for out-of-range coefficient values. * Since we're encoding a difference, the range limit is twice as much. */ - if (nbits > MAX_COEF_BITS+1) + if (nbits > MAX_COEF_BITS + 1) ERREXIT(cinfo, JERR_BAD_DCT_COEF); /* Count/emit the Huffman-coded symbol for the number of bits */ @@ -436,7 +541,7 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) /* Emit that number of bits of the value, if positive, */ /* or the complement of its magnitude, if negative. */ if (nbits) /* emit_bits rejects calls with size 0 */ - emit_bits(entropy, (unsigned int) temp2, nbits); + emit_bits(entropy, (unsigned int)temp2, nbits); } cinfo->dest->next_output_byte = entropy->next_output_byte; @@ -457,20 +562,115 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) /* + * Data preparation for encode_mcu_AC_first(). + */ + +#define COMPUTE_ABSVALUES_AC_FIRST(Sl) { \ + for (k = 0; k < Sl; k++) { \ + temp = block[jpeg_natural_order_start[k]]; \ + if (temp == 0) \ + continue; \ + /* We must apply the point transform by Al. For AC coefficients this \ + * is an integer division with rounding towards 0. To do this portably \ + * in C, we shift after obtaining the absolute value; so the code is \ + * interwoven with finding the abs value (temp) and output bits (temp2). \ + */ \ + temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \ + temp ^= temp2; \ + temp -= temp2; /* temp is abs value of input */ \ + temp >>= Al; /* apply the point transform */ \ + /* Watch out for case that nonzero coef is zero after point transform */ \ + if (temp == 0) \ + continue; \ + /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \ + temp2 ^= temp; \ + values[k] = (JCOEF)temp; \ + values[k + DCTSIZE2] = (JCOEF)temp2; \ + zerobits |= ((size_t)1U) << k; \ + } \ +} + +METHODDEF(void) +encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *bits) +{ + register int k, temp, temp2; + size_t zerobits = 0U; + int Sl0 = Sl; + +#if SIZEOF_SIZE_T == 4 + if (Sl0 > 32) + Sl0 = 32; +#endif + + COMPUTE_ABSVALUES_AC_FIRST(Sl0); + + bits[0] = zerobits; +#if SIZEOF_SIZE_T == 4 + zerobits = 0U; + + if (Sl > 32) { + Sl -= 32; + jpeg_natural_order_start += 32; + values += 32; + + COMPUTE_ABSVALUES_AC_FIRST(Sl); + } + bits[1] = zerobits; +#endif +} + +/* * MCU encoding for AC initial scan (either spectral selection, * or first pass of successive approximation). */ +#define ENCODE_COEFS_AC_FIRST(label) { \ + while (zerobits) { \ + r = count_zeroes(&zerobits); \ + cvalue += r; \ +label \ + temp = cvalue[0]; \ + temp2 = cvalue[DCTSIZE2]; \ + \ + /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \ + while (r > 15) { \ + emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \ + r -= 16; \ + } \ + \ + /* Find the number of bits needed for the magnitude of the coefficient */ \ + nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */ \ + /* Check for out-of-range coefficient values */ \ + if (nbits > MAX_COEF_BITS) \ + ERREXIT(cinfo, JERR_BAD_DCT_COEF); \ + \ + /* Count/emit Huffman symbol for run length / number of bits */ \ + emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); \ + \ + /* Emit that number of bits of the value, if positive, */ \ + /* or the complement of its magnitude, if negative. */ \ + emit_bits(entropy, (unsigned int)temp2, nbits); \ + \ + cvalue++; \ + zerobits >>= 1; \ + } \ +} + METHODDEF(boolean) -encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data) { - phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy; register int temp, temp2; - register int nbits; - register int r, k; - int Se = cinfo->Se; + register int nbits, r; + int Sl = cinfo->Se - cinfo->Ss + 1; int Al = cinfo->Al; - JBLOCKROW block; + JCOEF values_unaligned[2 * DCTSIZE2 + 15]; + JCOEF *values; + const JCOEF *cvalue; + size_t zerobits; + size_t bits[8 / SIZEOF_SIZE_T]; entropy->next_output_byte = cinfo->dest->next_output_byte; entropy->free_in_buffer = cinfo->dest->free_in_buffer; @@ -480,66 +680,48 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) if (entropy->restarts_to_go == 0) emit_restart(entropy, entropy->next_restart_num); - /* Encode the MCU data block */ - block = MCU_data[0]; - - /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */ +#ifdef WITH_SIMD + cvalue = values = (JCOEF *)PAD((JUINTPTR)values_unaligned, 16); +#else + /* Not using SIMD, so alignment is not needed */ + cvalue = values = values_unaligned; +#endif - r = 0; /* r = run length of zeros */ + /* Prepare data */ + entropy->AC_first_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss, + Sl, Al, values, bits); - for (k = cinfo->Ss; k <= Se; k++) { - if ((temp = (*block)[jpeg_natural_order[k]]) == 0) { - r++; - continue; - } - /* We must apply the point transform by Al. For AC coefficients this - * is an integer division with rounding towards 0. To do this portably - * in C, we shift after obtaining the absolute value; so the code is - * interwoven with finding the abs value (temp) and output bits (temp2). - */ - if (temp < 0) { - temp = -temp; /* temp is abs value of input */ - temp >>= Al; /* apply the point transform */ - /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ - temp2 = ~temp; - } else { - temp >>= Al; /* apply the point transform */ - temp2 = temp; - } - /* Watch out for case that nonzero coef is zero after point transform */ - if (temp == 0) { - r++; - continue; - } + zerobits = bits[0]; +#if SIZEOF_SIZE_T == 4 + zerobits |= bits[1]; +#endif - /* Emit any pending EOBRUN */ - if (entropy->EOBRUN > 0) - emit_eobrun(entropy); - /* if run length > 15, must emit special run-length-16 codes (0xF0) */ - while (r > 15) { - emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); - r -= 16; - } + /* Emit any pending EOBRUN */ + if (zerobits && (entropy->EOBRUN > 0)) + emit_eobrun(entropy); - /* Find the number of bits needed for the magnitude of the coefficient */ - nbits = 1; /* there must be at least one 1 bit */ - while ((temp >>= 1)) - nbits++; - /* Check for out-of-range coefficient values */ - if (nbits > MAX_COEF_BITS) - ERREXIT(cinfo, JERR_BAD_DCT_COEF); +#if SIZEOF_SIZE_T == 4 + zerobits = bits[0]; +#endif - /* Count/emit Huffman symbol for run length / number of bits */ - emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); + /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */ - /* Emit that number of bits of the value, if positive, */ - /* or the complement of its magnitude, if negative. */ - emit_bits(entropy, (unsigned int) temp2, nbits); + ENCODE_COEFS_AC_FIRST((void)0;); - r = 0; /* reset zero run length */ +#if SIZEOF_SIZE_T == 4 + zerobits = bits[1]; + if (zerobits) { + int diff = ((values + DCTSIZE2 / 2) - cvalue); + r = count_zeroes(&zerobits); + r += diff; + cvalue += r; + goto first_iter_ac_first; } - if (r > 0) { /* If there are trailing zeroes, */ + ENCODE_COEFS_AC_FIRST(first_iter_ac_first:); +#endif + + if (cvalue < (values + Sl)) { /* If there are trailing zeroes, */ entropy->EOBRUN++; /* count an EOB */ if (entropy->EOBRUN == 0x7FFF) emit_eobrun(entropy); /* force it out to avoid overflow */ @@ -569,9 +751,9 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) */ METHODDEF(boolean) -encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data) { - phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy; register int temp; int blkn; int Al = cinfo->Al; @@ -591,7 +773,7 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) /* We simply emit the Al'th bit of the DC coefficient value. */ temp = (*block)[0]; - emit_bits(entropy, (unsigned int) (temp >> Al), 1); + emit_bits(entropy, (unsigned int)(temp >> Al), 1); } cinfo->dest->next_output_byte = entropy->next_output_byte; @@ -612,22 +794,148 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) /* + * Data preparation for encode_mcu_AC_refine(). + */ + +#define COMPUTE_ABSVALUES_AC_REFINE(Sl, koffset) { \ + /* It is convenient to make a pre-pass to determine the transformed \ + * coefficients' absolute values and the EOB position. \ + */ \ + for (k = 0; k < Sl; k++) { \ + temp = block[jpeg_natural_order_start[k]]; \ + /* We must apply the point transform by Al. For AC coefficients this \ + * is an integer division with rounding towards 0. To do this portably \ + * in C, we shift after obtaining the absolute value. \ + */ \ + temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \ + temp ^= temp2; \ + temp -= temp2; /* temp is abs value of input */ \ + temp >>= Al; /* apply the point transform */ \ + if (temp != 0) { \ + zerobits |= ((size_t)1U) << k; \ + signbits |= ((size_t)(temp2 + 1)) << k; \ + } \ + absvalues[k] = (JCOEF)temp; /* save abs value for main pass */ \ + if (temp == 1) \ + EOB = k + koffset; /* EOB = index of last newly-nonzero coef */ \ + } \ +} + +METHODDEF(int) +encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + register int k, temp, temp2; + int EOB = 0; + size_t zerobits = 0U, signbits = 0U; + int Sl0 = Sl; + +#if SIZEOF_SIZE_T == 4 + if (Sl0 > 32) + Sl0 = 32; +#endif + + COMPUTE_ABSVALUES_AC_REFINE(Sl0, 0); + + bits[0] = zerobits; +#if SIZEOF_SIZE_T == 8 + bits[1] = signbits; +#else + bits[2] = signbits; + + zerobits = 0U; + signbits = 0U; + + if (Sl > 32) { + Sl -= 32; + jpeg_natural_order_start += 32; + absvalues += 32; + + COMPUTE_ABSVALUES_AC_REFINE(Sl, 32); + } + + bits[1] = zerobits; + bits[3] = signbits; +#endif + + return EOB; +} + + +/* * MCU encoding for AC successive approximation refinement scan. */ +#define ENCODE_COEFS_AC_REFINE(label) { \ + while (zerobits) { \ + idx = count_zeroes(&zerobits); \ + r += idx; \ + cabsvalue += idx; \ + signbits >>= idx; \ +label \ + /* Emit any required ZRLs, but not if they can be folded into EOB */ \ + while (r > 15 && (cabsvalue <= EOBPTR)) { \ + /* emit any pending EOBRUN and the BE correction bits */ \ + emit_eobrun(entropy); \ + /* Emit ZRL */ \ + emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \ + r -= 16; \ + /* Emit buffered correction bits that must be associated with ZRL */ \ + emit_buffered_bits(entropy, BR_buffer, BR); \ + BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \ + BR = 0; \ + } \ + \ + temp = *cabsvalue++; \ + \ + /* If the coef was previously nonzero, it only needs a correction bit. \ + * NOTE: a straight translation of the spec's figure G.7 would suggest \ + * that we also need to test r > 15. But if r > 15, we can only get here \ + * if k > EOB, which implies that this coefficient is not 1. \ + */ \ + if (temp > 1) { \ + /* The correction bit is the next bit of the absolute value. */ \ + BR_buffer[BR++] = (char)(temp & 1); \ + signbits >>= 1; \ + zerobits >>= 1; \ + continue; \ + } \ + \ + /* Emit any pending EOBRUN and the BE correction bits */ \ + emit_eobrun(entropy); \ + \ + /* Count/emit Huffman symbol for run length / number of bits */ \ + emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1); \ + \ + /* Emit output bit for newly-nonzero coef */ \ + temp = signbits & 1; /* ((*block)[jpeg_natural_order_start[k]] < 0) ? 0 : 1 */ \ + emit_bits(entropy, (unsigned int)temp, 1); \ + \ + /* Emit buffered correction bits that must be associated with this code */ \ + emit_buffered_bits(entropy, BR_buffer, BR); \ + BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \ + BR = 0; \ + r = 0; /* reset zero run length */ \ + signbits >>= 1; \ + zerobits >>= 1; \ + } \ +} + METHODDEF(boolean) -encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data) { - phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; - register int temp; - register int r, k; - int EOB; + phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy; + register int temp, r, idx; char *BR_buffer; unsigned int BR; - int Se = cinfo->Se; + int Sl = cinfo->Se - cinfo->Ss + 1; int Al = cinfo->Al; - JBLOCKROW block; - int absvalues[DCTSIZE2]; + JCOEF absvalues_unaligned[DCTSIZE2 + 15]; + JCOEF *absvalues; + const JCOEF *cabsvalue, *EOBPTR; + size_t zerobits, signbits; + size_t bits[16 / SIZEOF_SIZE_T]; entropy->next_output_byte = cinfo->dest->next_output_byte; entropy->free_in_buffer = cinfo->dest->free_in_buffer; @@ -637,26 +945,17 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) if (entropy->restarts_to_go == 0) emit_restart(entropy, entropy->next_restart_num); - /* Encode the MCU data block */ - block = MCU_data[0]; +#ifdef WITH_SIMD + cabsvalue = absvalues = (JCOEF *)PAD((JUINTPTR)absvalues_unaligned, 16); +#else + /* Not using SIMD, so alignment is not needed */ + cabsvalue = absvalues = absvalues_unaligned; +#endif - /* It is convenient to make a pre-pass to determine the transformed - * coefficients' absolute values and the EOB position. - */ - EOB = 0; - for (k = cinfo->Ss; k <= Se; k++) { - temp = (*block)[jpeg_natural_order[k]]; - /* We must apply the point transform by Al. For AC coefficients this - * is an integer division with rounding towards 0. To do this portably - * in C, we shift after obtaining the absolute value. - */ - if (temp < 0) - temp = -temp; /* temp is abs value of input */ - temp >>= Al; /* apply the point transform */ - absvalues[k] = temp; /* save abs value for main pass */ - if (temp == 1) - EOB = k; /* EOB = index of last newly-nonzero coef */ - } + /* Prepare data */ + EOBPTR = absvalues + + entropy->AC_refine_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss, + Sl, Al, absvalues, bits); /* Encode the AC coefficients per section G.1.2.3, fig. G.7 */ @@ -664,52 +963,32 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) BR = 0; /* BR = count of buffered bits added now */ BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */ - for (k = cinfo->Ss; k <= Se; k++) { - if ((temp = absvalues[k]) == 0) { - r++; - continue; - } - - /* Emit any required ZRLs, but not if they can be folded into EOB */ - while (r > 15 && k <= EOB) { - /* emit any pending EOBRUN and the BE correction bits */ - emit_eobrun(entropy); - /* Emit ZRL */ - emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); - r -= 16; - /* Emit buffered correction bits that must be associated with ZRL */ - emit_buffered_bits(entropy, BR_buffer, BR); - BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ - BR = 0; - } - - /* If the coef was previously nonzero, it only needs a correction bit. - * NOTE: a straight translation of the spec's figure G.7 would suggest - * that we also need to test r > 15. But if r > 15, we can only get here - * if k > EOB, which implies that this coefficient is not 1. - */ - if (temp > 1) { - /* The correction bit is the next bit of the absolute value. */ - BR_buffer[BR++] = (char) (temp & 1); - continue; - } - - /* Emit any pending EOBRUN and the BE correction bits */ - emit_eobrun(entropy); - - /* Count/emit Huffman symbol for run length / number of bits */ - emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1); + zerobits = bits[0]; +#if SIZEOF_SIZE_T == 8 + signbits = bits[1]; +#else + signbits = bits[2]; +#endif + ENCODE_COEFS_AC_REFINE((void)0;); + +#if SIZEOF_SIZE_T == 4 + zerobits = bits[1]; + signbits = bits[3]; + + if (zerobits) { + int diff = ((absvalues + DCTSIZE2 / 2) - cabsvalue); + idx = count_zeroes(&zerobits); + signbits >>= idx; + idx += diff; + r += idx; + cabsvalue += idx; + goto first_iter_ac_refine; + } - /* Emit output bit for newly-nonzero coef */ - temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1; - emit_bits(entropy, (unsigned int) temp, 1); + ENCODE_COEFS_AC_REFINE(first_iter_ac_refine:); +#endif - /* Emit buffered correction bits that must be associated with this code */ - emit_buffered_bits(entropy, BR_buffer, BR); - BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ - BR = 0; - r = 0; /* reset zero run length */ - } + r |= (int)((absvalues + Sl) - cabsvalue); if (r > 0 || BR > 0) { /* If there are trailing zeroes, */ entropy->EOBRUN++; /* count an EOB */ @@ -718,7 +997,8 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) * 1. overflow of the EOB counter; * 2. overflow of the correction bit buffer during the next MCU. */ - if (entropy->EOBRUN == 0x7FFF || entropy->BE > (MAX_CORR_BITS-DCTSIZE2+1)) + if (entropy->EOBRUN == 0x7FFF || + entropy->BE > (MAX_CORR_BITS - DCTSIZE2 + 1)) emit_eobrun(entropy); } @@ -744,9 +1024,9 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) */ METHODDEF(void) -finish_pass_phuff (j_compress_ptr cinfo) +finish_pass_phuff(j_compress_ptr cinfo) { - phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy; entropy->next_output_byte = cinfo->dest->next_output_byte; entropy->free_in_buffer = cinfo->dest->free_in_buffer; @@ -765,9 +1045,9 @@ finish_pass_phuff (j_compress_ptr cinfo) */ METHODDEF(void) -finish_pass_gather_phuff (j_compress_ptr cinfo) +finish_pass_gather_phuff(j_compress_ptr cinfo) { - phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy; boolean is_DC_band; int ci, tbl; jpeg_component_info *compptr; @@ -782,7 +1062,7 @@ finish_pass_gather_phuff (j_compress_ptr cinfo) /* It's important not to apply jpeg_gen_optimal_table more than once * per table, because it clobbers the input frequency counts! */ - MEMZERO(did, sizeof(did)); + memset(did, 0, sizeof(did)); for (ci = 0; ci < cinfo->comps_in_scan; ci++) { compptr = cinfo->cur_comp_info[ci]; @@ -793,13 +1073,13 @@ finish_pass_gather_phuff (j_compress_ptr cinfo) } else { tbl = compptr->ac_tbl_no; } - if (! did[tbl]) { + if (!did[tbl]) { if (is_DC_band) - htblptr = & cinfo->dc_huff_tbl_ptrs[tbl]; + htblptr = &cinfo->dc_huff_tbl_ptrs[tbl]; else - htblptr = & cinfo->ac_huff_tbl_ptrs[tbl]; + htblptr = &cinfo->ac_huff_tbl_ptrs[tbl]; if (*htblptr == NULL) - *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); + *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo); jpeg_gen_optimal_table(cinfo, *htblptr, entropy->count_ptrs[tbl]); did[tbl] = TRUE; } @@ -812,15 +1092,15 @@ finish_pass_gather_phuff (j_compress_ptr cinfo) */ GLOBAL(void) -jinit_phuff_encoder (j_compress_ptr cinfo) +jinit_phuff_encoder(j_compress_ptr cinfo) { phuff_entropy_ptr entropy; int i; entropy = (phuff_entropy_ptr) - (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, sizeof(phuff_entropy_encoder)); - cinfo->entropy = (struct jpeg_entropy_encoder *) entropy; + cinfo->entropy = (struct jpeg_entropy_encoder *)entropy; entropy->pub.start_pass = start_pass_phuff; /* Mark tables unallocated */ |