Diffstat (limited to 'gfx/ycbcr/convert.patch')

 gfx/ycbcr/convert.patch | 3143 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 3143 insertions(+), 0 deletions(-)
diff --git a/gfx/ycbcr/convert.patch b/gfx/ycbcr/convert.patch new file mode 100644 index 0000000000..e39f923b3c --- /dev/null +++ b/gfx/ycbcr/convert.patch @@ -0,0 +1,3143 @@ +diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp +--- a/gfx/ycbcr/yuv_convert.cpp ++++ b/gfx/ycbcr/yuv_convert.cpp +@@ -6,145 +6,102 @@ + // http://www.fourcc.org/yuv.php + // The actual conversion is best described here + // http://en.wikipedia.org/wiki/YUV + // An article on optimizing YUV conversion using tables instead of multiplies + // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf + // + // YV12 is a full plane of Y and a half height, half width chroma planes + // YV16 is a full plane of Y and a full height, half width chroma planes ++// YV24 is a full plane of Y and a full height, full width chroma planes + // + // ARGB pixel format is output, which on little endian is stored as BGRA. + // The alpha is set to 255, allowing the application to use RGBA or RGB32. + +-#include "media/base/yuv_convert.h" ++#include "yuv_convert.h" + + // Header for low level row functions. +-#include "media/base/yuv_row.h" +- +-#if USE_MMX +-#if defined(_MSC_VER) +-#include <intrin.h> +-#else +-#include <mmintrin.h> +-#endif +-#endif +- +-#if USE_SSE2 +-#include <emmintrin.h> +-#endif +- +-namespace media { +- ++#include "yuv_row.h" ++#include "mozilla/SSE.h" ++ ++namespace mozilla { ++ ++namespace gfx { ++ + // 16.16 fixed point arithmetic + const int kFractionBits = 16; + const int kFractionMax = 1 << kFractionBits; + const int kFractionMask = ((1 << kFractionBits) - 1); + + // Convert a frame of YUV to 32 bit ARGB. +-void ConvertYUVToRGB32(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int height, +- int y_pitch, +- int uv_pitch, +- int rgb_pitch, +- YUVType yuv_type) { +- unsigned int y_shift = yuv_type; +- for (int y = 0; y < height; ++y) { +- uint8* rgb_row = rgb_buf + y * rgb_pitch; +- const uint8* y_ptr = y_buf + y * y_pitch; +- const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch; +- const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch; +- +- FastConvertYUVToRGB32Row(y_ptr, +- u_ptr, +- v_ptr, +- rgb_row, +- width); +- } ++void ConvertYCbCrToRGB32(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int pic_x, ++ int pic_y, ++ int pic_width, ++ int pic_height, ++ int y_pitch, ++ int uv_pitch, ++ int rgb_pitch, ++ YUVType yuv_type) { ++ unsigned int y_shift = yuv_type == YV12 ? 1 : 0; ++ unsigned int x_shift = yuv_type == YV24 ? 0 : 1; ++ // Test for SSE because the optimized code uses movntq, which is not part of MMX. ++ bool has_sse = supports_mmx() && supports_sse(); ++ // There is no optimized YV24 SSE routine so we check for this and ++ // fall back to the C code. ++ has_sse &= yuv_type != YV24; ++ bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0; ++ int x_width = odd_pic_x ? pic_width - 1 : pic_width; ++ ++ for (int y = pic_y; y < pic_height + pic_y; ++y) { ++ uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch; ++ const uint8* y_ptr = y_buf + y * y_pitch + pic_x; ++ const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift); ++ const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift); ++ ++ if (odd_pic_x) { ++ // Handle the single odd pixel manually and use the ++ // fast routines for the remaining. 
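An aside on why this special case exists: the SIMD row functions pair luma pixels (0,1), (2,3), ... with one chroma sample each, which is only correct when the row starts at an even source x. Converting the first pixel with the C routine and advancing all three pointers restores that even pairing. A minimal sketch of the index math, with hypothetical values:

#include <cstdio>

int main() {
  int pic_x = 5, pic_width = 11;        // hypothetical odd-aligned picture rect
  bool odd_pic_x = (pic_x % 2) != 0;
  int x_width = odd_pic_x ? pic_width - 1 : pic_width;

  // The lone first pixel: luma 5 pairs with chroma 5 >> 1 = 2 (C routine).
  // After y_ptr++/u_ptr++/v_ptr++, the fast routine pairs luma (6,7) with
  // chroma 3, luma (8,9) with chroma 4, and so on: exactly the x >> 1 mapping.
  printf("first chroma sample = %d, pixels left for the fast path = %d\n",
         pic_x >> 1, x_width);
  return 0;
}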
++ FastConvertYUVToRGB32Row_C(y_ptr++, ++ u_ptr++, ++ v_ptr++, ++ rgb_row, ++ 1, ++ x_shift); ++ rgb_row += 4; ++ } ++ ++ if (has_sse) { ++ FastConvertYUVToRGB32Row(y_ptr, ++ u_ptr, ++ v_ptr, ++ rgb_row, ++ x_width); ++ } ++ else { ++ FastConvertYUVToRGB32Row_C(y_ptr, ++ u_ptr, ++ v_ptr, ++ rgb_row, ++ x_width, ++ x_shift); ++ } ++ } + + // MMX used for FastConvertYUVToRGB32Row requires emms instruction. +- EMMS(); +-} +- +-#if USE_SSE2 +-// FilterRows combines two rows of the image using linear interpolation. +-// SSE2 version does 16 pixels at a time +- +-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, +- int source_width, int source_y_fraction) { +- __m128i zero = _mm_setzero_si128(); +- __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); +- __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); +- +- const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); +- const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); +- __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); +- __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); +- +- do { +- __m128i y0 = _mm_loadu_si128(y0_ptr128); +- __m128i y1 = _mm_loadu_si128(y1_ptr128); +- __m128i y2 = _mm_unpackhi_epi8(y0, zero); +- __m128i y3 = _mm_unpackhi_epi8(y1, zero); +- y0 = _mm_unpacklo_epi8(y0, zero); +- y1 = _mm_unpacklo_epi8(y1, zero); +- y0 = _mm_mullo_epi16(y0, y0_fraction); +- y1 = _mm_mullo_epi16(y1, y1_fraction); +- y2 = _mm_mullo_epi16(y2, y0_fraction); +- y3 = _mm_mullo_epi16(y3, y1_fraction); +- y0 = _mm_add_epi16(y0, y1); +- y2 = _mm_add_epi16(y2, y3); +- y0 = _mm_srli_epi16(y0, 8); +- y2 = _mm_srli_epi16(y2, 8); +- y0 = _mm_packus_epi16(y0, y2); +- *dest128++ = y0; +- ++y0_ptr128; +- ++y1_ptr128; +- } while (dest128 < end128); +-} +-#elif USE_MMX +-// MMX version does 8 pixels at a time +-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, +- int source_width, int source_y_fraction) { +- __m64 zero = _mm_setzero_si64(); +- __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); +- __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); +- +- const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); +- const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); +- __m64* dest64 = reinterpret_cast<__m64*>(ybuf); +- __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); +- +- do { +- __m64 y0 = *y0_ptr64++; +- __m64 y1 = *y1_ptr64++; +- __m64 y2 = _mm_unpackhi_pi8(y0, zero); +- __m64 y3 = _mm_unpackhi_pi8(y1, zero); +- y0 = _mm_unpacklo_pi8(y0, zero); +- y1 = _mm_unpacklo_pi8(y1, zero); +- y0 = _mm_mullo_pi16(y0, y0_fraction); +- y1 = _mm_mullo_pi16(y1, y1_fraction); +- y2 = _mm_mullo_pi16(y2, y0_fraction); +- y3 = _mm_mullo_pi16(y3, y1_fraction); +- y0 = _mm_add_pi16(y0, y1); +- y2 = _mm_add_pi16(y2, y3); +- y0 = _mm_srli_pi16(y0, 8); +- y2 = _mm_srli_pi16(y2, 8); +- y0 = _mm_packs_pu16(y0, y2); +- *dest64++ = y0; +- } while (dest64 < end64); +-} +-#else // no MMX or SSE2 ++ if (has_sse) ++ EMMS(); ++} ++ + // C version does 8 at a time to mimic MMX code +-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, +- int source_width, int source_y_fraction) { ++static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, ++ int source_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + uint8* end = ybuf + source_width; + do { + ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8; + ybuf[1] = 
(y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8; + ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8; + ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8; +@@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons + ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8; + ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8; + ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8; + y0_ptr += 8; + y1_ptr += 8; + ybuf += 8; + } while (ybuf < end); + } +-#endif ++ ++#ifdef MOZILLA_MAY_SUPPORT_MMX ++void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, ++ int source_width, int source_y_fraction); ++#endif ++ ++#ifdef MOZILLA_MAY_SUPPORT_SSE2 ++void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, ++ int source_width, int source_y_fraction); ++#endif ++ ++static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr, ++ const uint8* y1_ptr, int source_width, ++ int source_y_fraction) { ++#ifdef MOZILLA_MAY_SUPPORT_SSE2 ++ if (mozilla::supports_sse2()) { ++ FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); ++ return; ++ } ++#endif ++ ++#ifdef MOZILLA_MAY_SUPPORT_MMX ++ if (mozilla::supports_mmx()) { ++ FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); ++ return; ++ } ++#endif ++ ++ FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); ++} + + + // Scale a frame of YUV to 32 bit ARGB. +-void ScaleYUVToRGB32(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int source_width, +- int source_height, +- int width, +- int height, +- int y_pitch, +- int uv_pitch, +- int rgb_pitch, +- YUVType yuv_type, +- Rotate view_rotate, +- ScaleFilter filter) { ++void ScaleYCbCrToRGB32(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int source_width, ++ int source_height, ++ int width, ++ int height, ++ int y_pitch, ++ int uv_pitch, ++ int rgb_pitch, ++ YUVType yuv_type, ++ Rotate view_rotate, ++ ScaleFilter filter) { ++ bool has_mmx = supports_mmx(); ++ + // 4096 allows 3 buffers to fit in 12k. + // Helps performance on CPU with 16K L1 cache. + // Large enough for 3830x2160 and 30" displays which are 2560x1600. + const int kFilterBufferSize = 4096; + // Disable filtering if the screen is too big (to avoid buffer overflows). + // This should never happen to regular users: they don't have monitors + // wider than 4096 pixels. + // TODO(fbarchard): Allow rotated videos to filter. + if (source_width > kFilterBufferSize || view_rotate) + filter = FILTER_NONE; + +- unsigned int y_shift = yuv_type; ++ unsigned int y_shift = yuv_type == YV12 ? 1 : 0; + // Diagram showing origin and direction of source sampling. + // ->0 4<- + // 7 3 + // + // 6 5 + // ->1 2<- + // Rotations that start at right side of image. + if ((view_rotate == ROTATE_180) || +@@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf, + int source_uv_fraction = + ((source_y_subpixel >> y_shift) & kFractionMask) >> 8; + + const uint8* y_ptr = y0_ptr; + const uint8* u_ptr = u0_ptr; + const uint8* v_ptr = v0_ptr; + // Apply vertical filtering if necessary. + // TODO(fbarchard): Remove memcpy when not necessary. 
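The FilterRows_C / FilterRows_MMX / FilterRows_SSE2 split above is the dispatch pattern this patch applies throughout: each SIMD variant is compiled only when the corresponding MOZILLA_MAY_SUPPORT_* macro is defined, and a wrapper probes the CPU at runtime before falling back to portable C. A self-contained sketch of the same pattern; the supports_sse2() probe is stubbed here (the real one lives behind mozilla/SSE.h) and MAY_SUPPORT_SSE2 is a hypothetical build flag:

#include <cstdint>
#include <cstdio>

// Stand-in for mozilla::supports_sse2(); the real probe queries CPUID.
static bool supports_sse2_stub() { return false; }

// Portable fallback: dst = (a*(256 - frac) + b*frac) >> 8 per byte.
static void Blend_C(uint8_t* dst, const uint8_t* a, const uint8_t* b,
                    int n, int frac) {
  for (int i = 0; i < n; ++i)
    dst[i] = (a[i] * (256 - frac) + b[i] * frac) >> 8;
}

#ifdef MAY_SUPPORT_SSE2  // mirrors MOZILLA_MAY_SUPPORT_SSE2
void Blend_SSE2(uint8_t* dst, const uint8_t* a, const uint8_t* b, int n, int frac);
#endif

static void Blend(uint8_t* dst, const uint8_t* a, const uint8_t* b,
                  int n, int frac) {
#ifdef MAY_SUPPORT_SSE2
  if (supports_sse2_stub()) { Blend_SSE2(dst, a, b, n, frac); return; }
#endif
  Blend_C(dst, a, b, n, frac);  // always available
}

int main() {
  const uint8_t a[4] = {0, 64, 128, 255}, b[4] = {255, 255, 255, 255};
  uint8_t d[4];
  Blend(d, a, b, 4, 64);  // blend 25% of b into a
  printf("%d %d %d %d\n", d[0], d[1], d[2], d[3]);  // 63 111 159 255
}

Note the early return after the SIMD call: the wrapper must not fall through into the C version as well.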
+- if (filter & media::FILTER_BILINEAR_V) { ++ if (filter & mozilla::gfx::FILTER_BILINEAR_V) { + if (yscale_fixed != kFractionMax && + source_y_fraction && ((source_y + 1) < source_height)) { + FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); + } else { + memcpy(ybuf, y0_ptr, source_width); + } + y_ptr = ybuf; + ybuf[source_width] = ybuf[source_width-1]; +@@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf, + u_ptr = ubuf; + v_ptr = vbuf; + ubuf[uv_source_width] = ubuf[uv_source_width - 1]; + vbuf[uv_source_width] = vbuf[uv_source_width - 1]; + } + if (source_dx == kFractionMax) { // Not scaled + FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, width); +- } else { +- if (filter & FILTER_BILINEAR_H) { ++ } else if (filter & FILTER_BILINEAR_H) { + LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, width, source_dx); + } else { + // Specialized scalers and rotation. +-#if USE_MMX && defined(_MSC_VER) ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86) ++ if(mozilla::supports_sse()) { + if (width == (source_width * 2)) { +- DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, +- dest_pixel, width); ++ DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, ++ dest_pixel, width); + } else if ((source_dx & kFractionMask) == 0) { + // Scaling by integer scale factor. ie half. +- ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, +- dest_pixel, width, +- source_dx >> kFractionBits); ++ ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, ++ dest_pixel, width, ++ source_dx >> kFractionBits); + } else if (source_dx_uv == source_dx) { // Not rotated. + ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, width, source_dx); + } else { +- RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, +- dest_pixel, width, +- source_dx >> kFractionBits, +- source_dx_uv >> kFractionBits); ++ RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, ++ dest_pixel, width, ++ source_dx >> kFractionBits, ++ source_dx_uv >> kFractionBits); + } ++ } ++ else { ++ ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr, ++ dest_pixel, width, source_dx); ++ } + #else +- ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, +- dest_pixel, width, source_dx); +-#endif +- } ++ ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, ++ dest_pixel, width, source_dx); ++#endif + } + } + // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms. +- EMMS(); +-} +- +-} // namespace media ++ if (has_mmx) ++ EMMS(); ++} ++ ++} // namespace gfx ++} // namespace mozilla +diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h +--- a/gfx/ycbcr/yuv_convert.h ++++ b/gfx/ycbcr/yuv_convert.h +@@ -1,72 +1,79 @@ + // Copyright (c) 2010 The Chromium Authors. All rights reserved. + // Use of this source code is governed by a BSD-style license that can be + // found in the LICENSE file. + + #ifndef MEDIA_BASE_YUV_CONVERT_H_ + #define MEDIA_BASE_YUV_CONVERT_H_ + +-#include "base/basictypes.h" +- +-namespace media { +- ++#include "chromium_types.h" ++#include "gfxCore.h" ++ ++namespace mozilla { ++ ++namespace gfx { ++ + // Type of YUV surface. + // The value of these enums matter as they are used to shift vertical indices. + enum YUVType { +- YV16 = 0, // YV16 is half width and full height chroma channels. +- YV12 = 1, // YV12 is half width and half height chroma channels. ++ YV12 = 0, // YV12 is half width and half height chroma channels. ++ YV16 = 1, // YV16 is half width and full height chroma channels. ++ YV24 = 2 // YV24 is full width and full height chroma channels. 
+ }; + + // Mirror means flip the image horizontally, as in looking in a mirror. + // Rotate happens after mirroring. + enum Rotate { + ROTATE_0, // Rotation off. + ROTATE_90, // Rotate clockwise. + ROTATE_180, // Rotate upside down. + ROTATE_270, // Rotate counter clockwise. + MIRROR_ROTATE_0, // Mirror horizontally. + MIRROR_ROTATE_90, // Mirror then Rotate clockwise. + MIRROR_ROTATE_180, // Mirror vertically. +- MIRROR_ROTATE_270, // Transpose. ++ MIRROR_ROTATE_270 // Transpose. + }; + + // Filter affects how scaling looks. + enum ScaleFilter { + FILTER_NONE = 0, // No filter (point sampled). + FILTER_BILINEAR_H = 1, // Bilinear horizontal filter. + FILTER_BILINEAR_V = 2, // Bilinear vertical filter. +- FILTER_BILINEAR = 3, // Bilinear filter. ++ FILTER_BILINEAR = 3 // Bilinear filter. + }; + + // Convert a frame of YUV to 32 bit ARGB. + // Pass in YV16/YV12 depending on source format +-void ConvertYUVToRGB32(const uint8* yplane, +- const uint8* uplane, +- const uint8* vplane, +- uint8* rgbframe, +- int width, +- int height, +- int ystride, +- int uvstride, +- int rgbstride, +- YUVType yuv_type); ++void ConvertYCbCrToRGB32(const uint8* yplane, ++ const uint8* uplane, ++ const uint8* vplane, ++ uint8* rgbframe, ++ int pic_x, ++ int pic_y, ++ int pic_width, ++ int pic_height, ++ int ystride, ++ int uvstride, ++ int rgbstride, ++ YUVType yuv_type); + + // Scale a frame of YUV to 32 bit ARGB. + // Supports rotation and mirroring. +-void ScaleYUVToRGB32(const uint8* yplane, +- const uint8* uplane, +- const uint8* vplane, +- uint8* rgbframe, +- int source_width, +- int source_height, +- int width, +- int height, +- int ystride, +- int uvstride, +- int rgbstride, +- YUVType yuv_type, +- Rotate view_rotate, +- ScaleFilter filter); +- +-} // namespace media +- ++void ScaleYCbCrToRGB32(const uint8* yplane, ++ const uint8* uplane, ++ const uint8* vplane, ++ uint8* rgbframe, ++ int source_width, ++ int source_height, ++ int width, ++ int height, ++ int ystride, ++ int uvstride, ++ int rgbstride, ++ YUVType yuv_type, ++ Rotate view_rotate, ++ ScaleFilter filter); ++ ++} // namespace gfx ++} // namespace mozilla ++ + #endif // MEDIA_BASE_YUV_CONVERT_H_ +diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp +new file mode 100644 +--- /dev/null ++++ b/gfx/ycbcr/yuv_convert_mmx.cpp +@@ -0,0 +1,45 @@ ++// Copyright (c) 2010 The Chromium Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style license that can be ++// found in the LICENSE file. ++ ++#include <mmintrin.h> ++#include "yuv_row.h" ++ ++namespace mozilla { ++namespace gfx { ++ ++// FilterRows combines two rows of the image using linear interpolation. ++// MMX version does 8 pixels at a time. 
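Before the new SIMD files, a note on the YUVType change above: the enum values no longer double as the vertical shift amount, so yuv_convert.cpp derives y_shift and x_shift explicitly. A small sketch of the resulting chroma addressing; the pitches are hypothetical:

#include <cstdio>

enum YUVType { YV12 = 0, YV16 = 1, YV24 = 2 };

// Offset into a chroma plane that samples luma pixel (x, y). YV12 chroma is
// half width and half height, YV16 half width only, YV24 full resolution.
static int ChromaOffset(YUVType t, int x, int y, int uv_pitch) {
  const unsigned y_shift = (t == YV12) ? 1 : 0;  // halve rows for YV12
  const unsigned x_shift = (t == YV24) ? 0 : 1;  // halve cols for YV12/YV16
  return (y >> y_shift) * uv_pitch + (x >> x_shift);
}

int main() {
  // 640x480 image: chroma planes are 320x240 (YV12), 320x480 (YV16),
  // 640x480 (YV24). Chroma offset for luma pixel (101, 57):
  printf("YV12: %d\n", ChromaOffset(YV12, 101, 57, 320));  // 28*320 + 50
  printf("YV16: %d\n", ChromaOffset(YV16, 101, 57, 320));  // 57*320 + 50
  printf("YV24: %d\n", ChromaOffset(YV24, 101, 57, 640));  // 57*640 + 101
}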
++void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, ++ int source_width, int source_y_fraction) { ++ __m64 zero = _mm_setzero_si64(); ++ __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); ++ __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); ++ ++ const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); ++ const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); ++ __m64* dest64 = reinterpret_cast<__m64*>(ybuf); ++ __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); ++ ++ do { ++ __m64 y0 = *y0_ptr64++; ++ __m64 y1 = *y1_ptr64++; ++ __m64 y2 = _mm_unpackhi_pi8(y0, zero); ++ __m64 y3 = _mm_unpackhi_pi8(y1, zero); ++ y0 = _mm_unpacklo_pi8(y0, zero); ++ y1 = _mm_unpacklo_pi8(y1, zero); ++ y0 = _mm_mullo_pi16(y0, y0_fraction); ++ y1 = _mm_mullo_pi16(y1, y1_fraction); ++ y2 = _mm_mullo_pi16(y2, y0_fraction); ++ y3 = _mm_mullo_pi16(y3, y1_fraction); ++ y0 = _mm_add_pi16(y0, y1); ++ y2 = _mm_add_pi16(y2, y3); ++ y0 = _mm_srli_pi16(y0, 8); ++ y2 = _mm_srli_pi16(y2, 8); ++ y0 = _mm_packs_pu16(y0, y2); ++ *dest64++ = y0; ++ } while (dest64 < end64); ++} ++ ++} ++} +diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp +new file mode 100644 +--- /dev/null ++++ b/gfx/ycbcr/yuv_convert_sse2.cpp +@@ -0,0 +1,47 @@ ++// Copyright (c) 2010 The Chromium Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style license that can be ++// found in the LICENSE file. ++ ++#include <emmintrin.h> ++#include "yuv_row.h" ++ ++namespace mozilla { ++namespace gfx { ++ ++// FilterRows combines two rows of the image using linear interpolation. ++// SSE2 version does 16 pixels at a time. ++void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, ++ int source_width, int source_y_fraction) { ++ __m128i zero = _mm_setzero_si128(); ++ __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); ++ __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); ++ ++ const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); ++ const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); ++ __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); ++ __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); ++ ++ do { ++ __m128i y0 = _mm_loadu_si128(y0_ptr128); ++ __m128i y1 = _mm_loadu_si128(y1_ptr128); ++ __m128i y2 = _mm_unpackhi_epi8(y0, zero); ++ __m128i y3 = _mm_unpackhi_epi8(y1, zero); ++ y0 = _mm_unpacklo_epi8(y0, zero); ++ y1 = _mm_unpacklo_epi8(y1, zero); ++ y0 = _mm_mullo_epi16(y0, y0_fraction); ++ y1 = _mm_mullo_epi16(y1, y1_fraction); ++ y2 = _mm_mullo_epi16(y2, y0_fraction); ++ y3 = _mm_mullo_epi16(y3, y1_fraction); ++ y0 = _mm_add_epi16(y0, y1); ++ y2 = _mm_add_epi16(y2, y3); ++ y0 = _mm_srli_epi16(y0, 8); ++ y2 = _mm_srli_epi16(y2, 8); ++ y0 = _mm_packus_epi16(y0, y2); ++ *dest128++ = y0; ++ ++y0_ptr128; ++ ++y1_ptr128; ++ } while (dest128 < end128); ++} ++ ++} ++} +diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h +--- a/gfx/ycbcr/yuv_row.h ++++ b/gfx/ycbcr/yuv_row.h +@@ -5,109 +5,133 @@ + // yuv_row internal functions to handle YUV conversion and scaling to RGB. + // These functions are used from both yuv_convert.cc and yuv_scale.cc. + + // TODO(fbarchard): Write function that can handle rotation and scaling. + + #ifndef MEDIA_BASE_YUV_ROW_H_ + #define MEDIA_BASE_YUV_ROW_H_ + +-#include "base/basictypes.h" ++#include "chromium_types.h" + + extern "C" { + // Can only do 1x. + // This is the second fastest of the scalers. 
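A note on the two FilterRows implementations in the new files above: both compute the same blend as FilterRows_C, but first widen each byte to 16 bits (the unpack-against-zero steps) because the intermediate product can reach 255 * 256, which overflows 8 bits but fits 16. One lane traced in scalar form:

#include <cstdint>
#include <cstdio>

int main() {
  // One 8-bit lane through FilterRows with source_y_fraction = 192 (0.75).
  uint8_t y0 = 200, y1 = 40;
  uint16_t w0 = y0, w1 = y1;       // _mm_unpacklo_epi8(x, zero): widen to 16 bits
  uint16_t p0 = w0 * (256 - 192);  // _mm_mullo_epi16 with y0_fraction
  uint16_t p1 = w1 * 192;          // _mm_mullo_epi16 with y1_fraction
  uint16_t sum = p0 + p1;          // _mm_add_epi16; at most 255*256 = 65280
  uint8_t out = sum >> 8;          // _mm_srli_epi16, then packus narrows
  printf("%u\n", out);             // 80: three quarters of the way from 200 to 40
}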
+ void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +-// Can do 1x, half size or any scale down by an integer amount. +-// Step can be negative (mirroring, rotate 180). +-// This is the third fastest of the scalers. +-void ConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int step); +- +-// Rotate is like Convert, but applies different step to Y versus U and V. +-// This allows rotation by 90 or 270, by stepping by stride. +-// This is the forth fastest of the scalers. +-void RotateConvertYUVToRGB32Row(const uint8* y_buf, ++void FastConvertYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, +- int ystep, +- int uvstep); ++ unsigned int x_shift); ++ ++void FastConvertYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width); ++ ++// Can do 1x, half size or any scale down by an integer amount. ++// Step can be negative (mirroring, rotate 180). ++// This is the third fastest of the scalers. ++// Only defined on Windows x86-32. ++void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int step); ++ ++// Rotate is like Convert, but applies different step to Y versus U and V. ++// This allows rotation by 90 or 270, by stepping by stride. ++// This is the forth fastest of the scalers. ++// Only defined on Windows x86-32. ++void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int ystep, ++ int uvstep); + + // Doubler does 4 pixels at a time. Each pixel is replicated. + // This is the fastest of the scalers. +-void DoubleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width); ++// Only defined on Windows x86-32. ++void DoubleYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width); + + // Handles arbitrary scaling up or down. + // Mirroring is supported, but not 90 or 270 degree rotation. + // Chroma is under sampled every 2 pixels for performance. + void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + ++void ScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx); ++ ++void ScaleYUVToRGB32Row_C(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx); ++ + // Handles arbitrary scaling up or down with bilinear filtering. + // Mirroring is supported, but not 90 or 270 degree rotation. + // Chroma is under sampled every 2 pixels for performance. + // This is the slowest of the scalers. 
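On the step parameters documented above: ConvertYUVToRGB32Row_SSE samples every step-th source pixel, and a negative step walks the row backwards, which is how mirroring and 180-degree rotation reuse the same row function. A scalar sketch of step-based sampling over a hypothetical row:

#include <cstdio>

// Read 'width' samples starting at 'src', advancing 'step' pixels each time.
// step = 1 copies, step = 2 halves the row, step = -1 mirrors it (with src
// pointing at the last pixel).
static void SampleRow(const unsigned char* src, unsigned char* out,
                      int width, int step) {
  for (int i = 0; i < width; ++i)
    out[i] = src[i * step];
}

int main() {
  const unsigned char row[8] = {10, 20, 30, 40, 50, 60, 70, 80};
  unsigned char out[8];
  SampleRow(row + 7, out, 8, -1);  // mirrored: 80 70 60 50 40 30 20 10
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]);
  printf("\n");
  SampleRow(row, out, 4, 2);       // half size: 10 30 50 70
  for (int i = 0; i < 4; ++i) printf("%d ", out[i]);
  printf("\n");
}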
+ void LinearScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + ++void LinearScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx); ++ ++void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx); ++ ++ + #if defined(_MSC_VER) + #define SIMD_ALIGNED(var) __declspec(align(16)) var + #else + #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) + #endif + extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]); + +-// Method to force C version. +-//#define USE_MMX 0 +-//#define USE_SSE2 0 +- +-#if !defined(USE_MMX) +-// Windows, Mac and Linux/BSD use MMX +-#if defined(__MMX__) || defined(_MSC_VER) +-#define USE_MMX 1 +-#else +-#define USE_MMX 0 +-#endif +-#endif +- +-#if !defined(USE_SSE2) +-#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2 +-#define USE_SSE2 1 +-#else +-#define USE_SSE2 0 +-#endif +-#endif +- + // x64 uses MMX2 (SSE) so emms is not required. + // Warning C4799: function has no EMMS instruction. + // EMMS() is slow and should be called by the calling function once per image. +-#if USE_MMX && !defined(ARCH_CPU_X86_64) ++#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64) + #if defined(_MSC_VER) + #define EMMS() __asm emms + #pragma warning(disable: 4799) + #else + #define EMMS() asm("emms") + #endif + #else + #define EMMS() +diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp +--- a/gfx/ycbcr/yuv_row_c.cpp ++++ b/gfx/ycbcr/yuv_row_c.cpp +@@ -1,812 +1,18 @@ + // Copyright (c) 2010 The Chromium Authors. All rights reserved. + // Use of this source code is governed by a BSD-style license that can be + // found in the LICENSE file. + +-#include "media/base/yuv_row.h" +- +-#ifdef _DEBUG +-#include "base/logging.h" +-#else ++#include "yuv_row.h" ++ + #define DCHECK(a) +-#endif + + extern "C" { + +-#if USE_SSE2 && defined(ARCH_CPU_X86_64) +- +-// AMD64 ABI uses register paremters. 
+-void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi +- const uint8* u_buf, // rsi +- const uint8* v_buf, // rdx +- uint8* rgb_buf, // rcx +- int width) { // r8 +- asm( +- "jmp convertend\n" +-"convertloop:" +- "movzb (%1),%%r10\n" +- "add $0x1,%1\n" +- "movzb (%2),%%r11\n" +- "add $0x1,%2\n" +- "movq 2048(%5,%%r10,8),%%xmm0\n" +- "movzb (%0),%%r10\n" +- "movq 4096(%5,%%r11,8),%%xmm1\n" +- "movzb 0x1(%0),%%r11\n" +- "paddsw %%xmm1,%%xmm0\n" +- "movq (%5,%%r10,8),%%xmm2\n" +- "add $0x2,%0\n" +- "movq (%5,%%r11,8),%%xmm3\n" +- "paddsw %%xmm0,%%xmm2\n" +- "paddsw %%xmm0,%%xmm3\n" +- "shufps $0x44,%%xmm3,%%xmm2\n" +- "psraw $0x6,%%xmm2\n" +- "packuswb %%xmm2,%%xmm2\n" +- "movq %%xmm2,0x0(%3)\n" +- "add $0x8,%3\n" +-"convertend:" +- "sub $0x2,%4\n" +- "jns convertloop\n" +- +-"convertnext:" +- "add $0x1,%4\n" +- "js convertdone\n" +- +- "movzb (%1),%%r10\n" +- "movq 2048(%5,%%r10,8),%%xmm0\n" +- "movzb (%2),%%r10\n" +- "movq 4096(%5,%%r10,8),%%xmm1\n" +- "paddsw %%xmm1,%%xmm0\n" +- "movzb (%0),%%r10\n" +- "movq (%5,%%r10,8),%%xmm1\n" +- "paddsw %%xmm0,%%xmm1\n" +- "psraw $0x6,%%xmm1\n" +- "packuswb %%xmm1,%%xmm1\n" +- "movd %%xmm1,0x0(%3)\n" +-"convertdone:" +- : +- : "r"(y_buf), // %0 +- "r"(u_buf), // %1 +- "r"(v_buf), // %2 +- "r"(rgb_buf), // %3 +- "r"(width), // %4 +- "r" (kCoefficientsRgbY) // %5 +- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" +-); +-} +- +-void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi +- const uint8* u_buf, // rsi +- const uint8* v_buf, // rdx +- uint8* rgb_buf, // rcx +- int width, // r8 +- int source_dx) { // r9 +- asm( +- "xor %%r11,%%r11\n" +- "sub $0x2,%4\n" +- "js scalenext\n" +- +-"scaleloop:" +- "mov %%r11,%%r10\n" +- "sar $0x11,%%r10\n" +- "movzb (%1,%%r10,1),%%rax\n" +- "movq 2048(%5,%%rax,8),%%xmm0\n" +- "movzb (%2,%%r10,1),%%rax\n" +- "movq 4096(%5,%%rax,8),%%xmm1\n" +- "lea (%%r11,%6),%%r10\n" +- "sar $0x10,%%r11\n" +- "movzb (%0,%%r11,1),%%rax\n" +- "paddsw %%xmm1,%%xmm0\n" +- "movq (%5,%%rax,8),%%xmm1\n" +- "lea (%%r10,%6),%%r11\n" +- "sar $0x10,%%r10\n" +- "movzb (%0,%%r10,1),%%rax\n" +- "movq (%5,%%rax,8),%%xmm2\n" +- "paddsw %%xmm0,%%xmm1\n" +- "paddsw %%xmm0,%%xmm2\n" +- "shufps $0x44,%%xmm2,%%xmm1\n" +- "psraw $0x6,%%xmm1\n" +- "packuswb %%xmm1,%%xmm1\n" +- "movq %%xmm1,0x0(%3)\n" +- "add $0x8,%3\n" +- "sub $0x2,%4\n" +- "jns scaleloop\n" +- +-"scalenext:" +- "add $0x1,%4\n" +- "js scaledone\n" +- +- "mov %%r11,%%r10\n" +- "sar $0x11,%%r10\n" +- "movzb (%1,%%r10,1),%%rax\n" +- "movq 2048(%5,%%rax,8),%%xmm0\n" +- "movzb (%2,%%r10,1),%%rax\n" +- "movq 4096(%5,%%rax,8),%%xmm1\n" +- "paddsw %%xmm1,%%xmm0\n" +- "sar $0x10,%%r11\n" +- "movzb (%0,%%r11,1),%%rax\n" +- "movq (%5,%%rax,8),%%xmm1\n" +- "paddsw %%xmm0,%%xmm1\n" +- "psraw $0x6,%%xmm1\n" +- "packuswb %%xmm1,%%xmm1\n" +- "movd %%xmm1,0x0(%3)\n" +- +-"scaledone:" +- : +- : "r"(y_buf), // %0 +- "r"(u_buf), // %1 +- "r"(v_buf), // %2 +- "r"(rgb_buf), // %3 +- "r"(width), // %4 +- "r" (kCoefficientsRgbY), // %5 +- "r"(static_cast<long>(source_dx)) // %6 +- : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2" +-); +-} +- +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx) { +- asm( +- "xor %%r11,%%r11\n" // x = 0 +- "sub $0x2,%4\n" +- "js .lscalenext\n" +- "cmp $0x20000,%6\n" // if source_dx >= 2.0 +- "jl .lscalehalf\n" +- "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less +-".lscalehalf:" +- +-".lscaleloop:" +- "mov %%r11,%%r10\n" +- "sar $0x11,%%r10\n" +- +- "movzb (%1, %%r10, 1), 
%%r13 \n" +- "movzb 1(%1, %%r10, 1), %%r14 \n" +- "mov %%r11, %%rax \n" +- "and $0x1fffe, %%rax \n" +- "imul %%rax, %%r14 \n" +- "xor $0x1fffe, %%rax \n" +- "imul %%rax, %%r13 \n" +- "add %%r14, %%r13 \n" +- "shr $17, %%r13 \n" +- "movq 2048(%5,%%r13,8), %%xmm0\n" +- +- "movzb (%2, %%r10, 1), %%r13 \n" +- "movzb 1(%2, %%r10, 1), %%r14 \n" +- "mov %%r11, %%rax \n" +- "and $0x1fffe, %%rax \n" +- "imul %%rax, %%r14 \n" +- "xor $0x1fffe, %%rax \n" +- "imul %%rax, %%r13 \n" +- "add %%r14, %%r13 \n" +- "shr $17, %%r13 \n" +- "movq 4096(%5,%%r13,8), %%xmm1\n" +- +- "mov %%r11, %%rax \n" +- "lea (%%r11,%6),%%r10\n" +- "sar $0x10,%%r11\n" +- "paddsw %%xmm1,%%xmm0\n" +- +- "movzb (%0, %%r11, 1), %%r13 \n" +- "movzb 1(%0, %%r11, 1), %%r14 \n" +- "and $0xffff, %%rax \n" +- "imul %%rax, %%r14 \n" +- "xor $0xffff, %%rax \n" +- "imul %%rax, %%r13 \n" +- "add %%r14, %%r13 \n" +- "shr $16, %%r13 \n" +- "movq (%5,%%r13,8),%%xmm1\n" +- +- "mov %%r10, %%rax \n" +- "lea (%%r10,%6),%%r11\n" +- "sar $0x10,%%r10\n" +- +- "movzb (%0,%%r10,1), %%r13 \n" +- "movzb 1(%0,%%r10,1), %%r14 \n" +- "and $0xffff, %%rax \n" +- "imul %%rax, %%r14 \n" +- "xor $0xffff, %%rax \n" +- "imul %%rax, %%r13 \n" +- "add %%r14, %%r13 \n" +- "shr $16, %%r13 \n" +- "movq (%5,%%r13,8),%%xmm2\n" +- +- "paddsw %%xmm0,%%xmm1\n" +- "paddsw %%xmm0,%%xmm2\n" +- "shufps $0x44,%%xmm2,%%xmm1\n" +- "psraw $0x6,%%xmm1\n" +- "packuswb %%xmm1,%%xmm1\n" +- "movq %%xmm1,0x0(%3)\n" +- "add $0x8,%3\n" +- "sub $0x2,%4\n" +- "jns .lscaleloop\n" +- +-".lscalenext:" +- "add $0x1,%4\n" +- "js .lscaledone\n" +- +- "mov %%r11,%%r10\n" +- "sar $0x11,%%r10\n" +- +- "movzb (%1,%%r10,1), %%r13 \n" +- "movq 2048(%5,%%r13,8),%%xmm0\n" +- +- "movzb (%2,%%r10,1), %%r13 \n" +- "movq 4096(%5,%%r13,8),%%xmm1\n" +- +- "paddsw %%xmm1,%%xmm0\n" +- "sar $0x10,%%r11\n" +- +- "movzb (%0,%%r11,1), %%r13 \n" +- "movq (%5,%%r13,8),%%xmm1\n" +- +- "paddsw %%xmm0,%%xmm1\n" +- "psraw $0x6,%%xmm1\n" +- "packuswb %%xmm1,%%xmm1\n" +- "movd %%xmm1,0x0(%3)\n" +- +-".lscaledone:" +- : +- : "r"(y_buf), // %0 +- "r"(u_buf), // %1 +- "r"(v_buf), // %2 +- "r"(rgb_buf), // %3 +- "r"(width), // %4 +- "r" (kCoefficientsRgbY), // %5 +- "r"(static_cast<long>(source_dx)) // %6 +- : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" +-); +-} +- +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__) +- +-// PIC version is slower because less registers are available, so +-// non-PIC is used on platforms where it is possible. 
+- +-void FastConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width); +- asm( +- ".text\n" +- ".global FastConvertYUVToRGB32Row\n" +-"FastConvertYUVToRGB32Row:\n" +- "pusha\n" +- "mov 0x24(%esp),%edx\n" +- "mov 0x28(%esp),%edi\n" +- "mov 0x2c(%esp),%esi\n" +- "mov 0x30(%esp),%ebp\n" +- "mov 0x34(%esp),%ecx\n" +- "jmp convertend\n" +- +-"convertloop:" +- "movzbl (%edi),%eax\n" +- "add $0x1,%edi\n" +- "movzbl (%esi),%ebx\n" +- "add $0x1,%esi\n" +- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" +- "movzbl (%edx),%eax\n" +- "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" +- "movzbl 0x1(%edx),%ebx\n" +- "movq kCoefficientsRgbY(,%eax,8),%mm1\n" +- "add $0x2,%edx\n" +- "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" +- "paddsw %mm0,%mm1\n" +- "paddsw %mm0,%mm2\n" +- "psraw $0x6,%mm1\n" +- "psraw $0x6,%mm2\n" +- "packuswb %mm2,%mm1\n" +- "movntq %mm1,0x0(%ebp)\n" +- "add $0x8,%ebp\n" +-"convertend:" +- "sub $0x2,%ecx\n" +- "jns convertloop\n" +- +- "and $0x1,%ecx\n" +- "je convertdone\n" +- +- "movzbl (%edi),%eax\n" +- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" +- "movzbl (%esi),%eax\n" +- "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" +- "movzbl (%edx),%eax\n" +- "movq kCoefficientsRgbY(,%eax,8),%mm1\n" +- "paddsw %mm0,%mm1\n" +- "psraw $0x6,%mm1\n" +- "packuswb %mm1,%mm1\n" +- "movd %mm1,0x0(%ebp)\n" +-"convertdone:" +- "popa\n" +- "ret\n" +-); +- +- +-void ScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx); +- asm( +- ".text\n" +- ".global ScaleYUVToRGB32Row\n" +-"ScaleYUVToRGB32Row:\n" +- "pusha\n" +- "mov 0x24(%esp),%edx\n" +- "mov 0x28(%esp),%edi\n" +- "mov 0x2c(%esp),%esi\n" +- "mov 0x30(%esp),%ebp\n" +- "mov 0x34(%esp),%ecx\n" +- "xor %ebx,%ebx\n" +- "jmp scaleend\n" +- +-"scaleloop:" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%edi,%eax,1),%eax\n" +- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%esi,%eax,1),%eax\n" +- "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%eax\n" +- "movq kCoefficientsRgbY(,%eax,8),%mm1\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%eax\n" +- "movq kCoefficientsRgbY(,%eax,8),%mm2\n" +- "paddsw %mm0,%mm1\n" +- "paddsw %mm0,%mm2\n" +- "psraw $0x6,%mm1\n" +- "psraw $0x6,%mm2\n" +- "packuswb %mm2,%mm1\n" +- "movntq %mm1,0x0(%ebp)\n" +- "add $0x8,%ebp\n" +-"scaleend:" +- "sub $0x2,%ecx\n" +- "jns scaleloop\n" +- +- "and $0x1,%ecx\n" +- "je scaledone\n" +- +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%edi,%eax,1),%eax\n" +- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%esi,%eax,1),%eax\n" +- "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%eax\n" +- "movq kCoefficientsRgbY(,%eax,8),%mm1\n" +- "paddsw %mm0,%mm1\n" +- "psraw $0x6,%mm1\n" +- "packuswb %mm1,%mm1\n" +- "movd %mm1,0x0(%ebp)\n" +- +-"scaledone:" +- "popa\n" +- "ret\n" +-); +- +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx); +- asm( +- ".text\n" +- ".global LinearScaleYUVToRGB32Row\n" +-"LinearScaleYUVToRGB32Row:\n" +- "pusha\n" +- "mov 0x24(%esp),%edx\n" +- "mov 0x28(%esp),%edi\n" +- "mov 0x30(%esp),%ebp\n" +- +- // 
source_width = width * source_dx + ebx +- "mov 0x34(%esp), %ecx\n" +- "imull 0x38(%esp), %ecx\n" +- "mov %ecx, 0x34(%esp)\n" +- +- "mov 0x38(%esp), %ecx\n" +- "xor %ebx,%ebx\n" // x = 0 +- "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 +- "jl .lscaleend\n" +- "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less +- "jmp .lscaleend\n" +- +-".lscaleloop:" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- +- "movzbl (%edi,%eax,1),%ecx\n" +- "movzbl 1(%edi,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "andl $0x1fffe, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0x1fffe, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, %ecx \n" +- "shrl $17, %ecx \n" +- "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n" +- +- "mov 0x2c(%esp),%esi\n" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- +- "movzbl (%esi,%eax,1),%ecx\n" +- "movzbl 1(%esi,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "andl $0x1fffe, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0x1fffe, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, %ecx \n" +- "shrl $17, %ecx \n" +- "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n" +- +- "mov %ebx,%eax\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%ecx\n" +- "movzbl 1(%edx,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "andl $0xffff, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0xffff, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, %ecx \n" +- "shrl $16, %ecx \n" +- "movq kCoefficientsRgbY(,%ecx,8),%mm1\n" +- +- "cmp 0x34(%esp), %ebx\n" +- "jge .lscalelastpixel\n" +- +- "mov %ebx,%eax\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%ecx\n" +- "movzbl 1(%edx,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "andl $0xffff, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0xffff, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, %ecx \n" +- "shrl $16, %ecx \n" +- "movq kCoefficientsRgbY(,%ecx,8),%mm2\n" +- +- "paddsw %mm0,%mm1\n" +- "paddsw %mm0,%mm2\n" +- "psraw $0x6,%mm1\n" +- "psraw $0x6,%mm2\n" +- "packuswb %mm2,%mm1\n" +- "movntq %mm1,0x0(%ebp)\n" +- "add $0x8,%ebp\n" +- +-".lscaleend:" +- "cmp 0x34(%esp), %ebx\n" +- "jl .lscaleloop\n" +- "popa\n" +- "ret\n" +- +-".lscalelastpixel:" +- "paddsw %mm0, %mm1\n" +- "psraw $6, %mm1\n" +- "packuswb %mm1, %mm1\n" +- "movd %mm1, (%ebp)\n" +- "popa\n" +- "ret\n" +-); +- +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__) +- +-extern void PICConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int16 *kCoefficientsRgbY); +- asm( +- ".text\n" +-#if defined(OS_MACOSX) +-"_PICConvertYUVToRGB32Row:\n" +-#else +-"PICConvertYUVToRGB32Row:\n" +-#endif +- "pusha\n" +- "mov 0x24(%esp),%edx\n" +- "mov 0x28(%esp),%edi\n" +- "mov 0x2c(%esp),%esi\n" +- "mov 0x30(%esp),%ebp\n" +- "mov 0x38(%esp),%ecx\n" +- +- "jmp .Lconvertend\n" +- +-".Lconvertloop:" +- "movzbl (%edi),%eax\n" +- "add $0x1,%edi\n" +- "movzbl (%esi),%ebx\n" +- "add $0x1,%esi\n" +- "movq 2048(%ecx,%eax,8),%mm0\n" +- "movzbl (%edx),%eax\n" +- "paddsw 4096(%ecx,%ebx,8),%mm0\n" +- "movzbl 0x1(%edx),%ebx\n" +- "movq 0(%ecx,%eax,8),%mm1\n" +- "add $0x2,%edx\n" +- "movq 0(%ecx,%ebx,8),%mm2\n" +- "paddsw %mm0,%mm1\n" +- "paddsw %mm0,%mm2\n" +- "psraw $0x6,%mm1\n" +- "psraw $0x6,%mm2\n" +- "packuswb %mm2,%mm1\n" +- "movntq %mm1,0x0(%ebp)\n" +- "add $0x8,%ebp\n" +-".Lconvertend:" +- "subl $0x2,0x34(%esp)\n" +- "jns .Lconvertloop\n" +- +- "andl $0x1,0x34(%esp)\n" +- "je .Lconvertdone\n" +- +- "movzbl (%edi),%eax\n" +- "movq 2048(%ecx,%eax,8),%mm0\n" +- "movzbl (%esi),%eax\n" +- "paddsw 4096(%ecx,%eax,8),%mm0\n" +- "movzbl (%edx),%eax\n" +- 
"movq 0(%ecx,%eax,8),%mm1\n" +- "paddsw %mm0,%mm1\n" +- "psraw $0x6,%mm1\n" +- "packuswb %mm1,%mm1\n" +- "movd %mm1,0x0(%ebp)\n" +-".Lconvertdone:\n" +- "popa\n" +- "ret\n" +-); +- +-void FastConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width) { +- PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, +- &kCoefficientsRgbY[0][0]); +-} +- +-extern void PICScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx, +- int16 *kCoefficientsRgbY); +- +- asm( +- ".text\n" +-#if defined(OS_MACOSX) +-"_PICScaleYUVToRGB32Row:\n" +-#else +-"PICScaleYUVToRGB32Row:\n" +-#endif +- "pusha\n" +- "mov 0x24(%esp),%edx\n" +- "mov 0x28(%esp),%edi\n" +- "mov 0x2c(%esp),%esi\n" +- "mov 0x30(%esp),%ebp\n" +- "mov 0x3c(%esp),%ecx\n" +- "xor %ebx,%ebx\n" +- "jmp Lscaleend\n" +- +-"Lscaleloop:" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%edi,%eax,1),%eax\n" +- "movq 2048(%ecx,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%esi,%eax,1),%eax\n" +- "paddsw 4096(%ecx,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%eax\n" +- "movq 0(%ecx,%eax,8),%mm1\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%eax\n" +- "movq 0(%ecx,%eax,8),%mm2\n" +- "paddsw %mm0,%mm1\n" +- "paddsw %mm0,%mm2\n" +- "psraw $0x6,%mm1\n" +- "psraw $0x6,%mm2\n" +- "packuswb %mm2,%mm1\n" +- "movntq %mm1,0x0(%ebp)\n" +- "add $0x8,%ebp\n" +-"Lscaleend:" +- "subl $0x2,0x34(%esp)\n" +- "jns Lscaleloop\n" +- +- "andl $0x1,0x34(%esp)\n" +- "je Lscaledone\n" +- +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%edi,%eax,1),%eax\n" +- "movq 2048(%ecx,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%esi,%eax,1),%eax\n" +- "paddsw 4096(%ecx,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%eax\n" +- "movq 0(%ecx,%eax,8),%mm1\n" +- "paddsw %mm0,%mm1\n" +- "psraw $0x6,%mm1\n" +- "packuswb %mm1,%mm1\n" +- "movd %mm1,0x0(%ebp)\n" +- +-"Lscaledone:" +- "popa\n" +- "ret\n" +-); +- +- +-void ScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx) { +- PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, +- &kCoefficientsRgbY[0][0]); +-} +- +-void PICLinearScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx, +- int16 *kCoefficientsRgbY); +- asm( +- ".text\n" +-#if defined(OS_MACOSX) +-"_PICLinearScaleYUVToRGB32Row:\n" +-#else +-"PICLinearScaleYUVToRGB32Row:\n" +-#endif +- "pusha\n" +- "mov 0x24(%esp),%edx\n" +- "mov 0x30(%esp),%ebp\n" +- "mov 0x34(%esp),%ecx\n" +- "mov 0x3c(%esp),%edi\n" +- "xor %ebx,%ebx\n" +- +- // source_width = width * source_dx + ebx +- "mov 0x34(%esp), %ecx\n" +- "imull 0x38(%esp), %ecx\n" +- "mov %ecx, 0x34(%esp)\n" +- +- "mov 0x38(%esp), %ecx\n" +- "xor %ebx,%ebx\n" // x = 0 +- "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 +- "jl .lscaleend\n" +- "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less +- "jmp .lscaleend\n" +- +-".lscaleloop:" +- "mov 0x28(%esp),%esi\n" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- +- "movzbl (%esi,%eax,1),%ecx\n" +- "movzbl 1(%esi,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "andl $0x1fffe, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0x1fffe, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, 
%ecx \n" +- "shrl $17, %ecx \n" +- "movq 2048(%edi,%ecx,8),%mm0\n" +- +- "mov 0x2c(%esp),%esi\n" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- +- "movzbl (%esi,%eax,1),%ecx\n" +- "movzbl 1(%esi,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "andl $0x1fffe, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0x1fffe, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, %ecx \n" +- "shrl $17, %ecx \n" +- "paddsw 4096(%edi,%ecx,8),%mm0\n" +- +- "mov %ebx,%eax\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%ecx\n" +- "movzbl 1(%edx,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "andl $0xffff, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0xffff, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, %ecx \n" +- "shrl $16, %ecx \n" +- "movq (%edi,%ecx,8),%mm1\n" +- +- "cmp 0x34(%esp), %ebx\n" +- "jge .lscalelastpixel\n" +- +- "mov %ebx,%eax\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%ecx\n" +- "movzbl 1(%edx,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "andl $0xffff, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0xffff, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, %ecx \n" +- "shrl $16, %ecx \n" +- "movq (%edi,%ecx,8),%mm2\n" +- +- "paddsw %mm0,%mm1\n" +- "paddsw %mm0,%mm2\n" +- "psraw $0x6,%mm1\n" +- "psraw $0x6,%mm2\n" +- "packuswb %mm2,%mm1\n" +- "movntq %mm1,0x0(%ebp)\n" +- "add $0x8,%ebp\n" +- +-".lscaleend:" +- "cmp %ebx, 0x34(%esp)\n" +- "jg .lscaleloop\n" +- "popa\n" +- "ret\n" +- +-".lscalelastpixel:" +- "paddsw %mm0, %mm1\n" +- "psraw $6, %mm1\n" +- "packuswb %mm1, %mm1\n" +- "movd %mm1, (%ebp)\n" +- "popa\n" +- "ret\n" +-); +- +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx) { +- PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, +- &kCoefficientsRgbY[0][0]); +-} +- +-#else // USE_MMX +- + // C reference code that mimic the YUV assembly. + #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) + #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ + (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) + + static inline void YuvPixel(uint8 y, + uint8 u, + uint8 v, +@@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y, + a >>= 6; + + *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | + (packuswb(g) << 8) | + (packuswb(r) << 16) | + (packuswb(a) << 24); + } + +-void FastConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width) { ++void FastConvertYUVToRGB32Row_C(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ unsigned int x_shift) { + for (int x = 0; x < width; x += 2) { +- uint8 u = u_buf[x >> 1]; +- uint8 v = v_buf[x >> 1]; ++ uint8 u = u_buf[x >> x_shift]; ++ uint8 v = v_buf[x >> x_shift]; + uint8 y0 = y_buf[x]; + YuvPixel(y0, u, v, rgb_buf); + if ((x + 1) < width) { + uint8 y1 = y_buf[x + 1]; ++ if (x_shift == 0) { ++ u = u_buf[x + 1]; ++ v = v_buf[x + 1]; ++ } + YuvPixel(y1, u, v, rgb_buf + 4); + } + rgb_buf += 8; // Advance 2 pixels. + } + } + + // 16.16 fixed point is used. A shift by 16 isolates the integer. + // A shift by 17 is used to further subsample the chrominence channels. + // & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits, + // for 1/65536 pixel accurate interpolation. 
+-void ScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx) { ++void ScaleYUVToRGB32Row_C(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx) { + int x = 0; + for (int i = 0; i < width; i += 2) { + int y = y_buf[x >> 16]; + int u = u_buf[(x >> 17)]; + int v = v_buf[(x >> 17)]; + YuvPixel(y, u, v, rgb_buf); + x += source_dx; + if ((i + 1) < width) { + y = y_buf[x >> 16]; + YuvPixel(y, u, v, rgb_buf+4); + x += source_dx; + } + rgb_buf += 8; + } + } + +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx) { ++void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx) { + int x = 0; + if (source_dx >= 0x20000) { + x = 32768; + } + for (int i = 0; i < width; i += 2) { + int y0 = y_buf[x >> 16]; + int y1 = y_buf[(x >> 16) + 1]; + int u0 = u_buf[(x >> 17)]; +@@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint + y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; + YuvPixel(y, u, v, rgb_buf+4); + x += source_dx; + } + rgb_buf += 8; + } + } + +-#endif // USE_MMX + } // extern "C" + +diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp +--- a/gfx/ycbcr/yuv_row_posix.cpp ++++ b/gfx/ycbcr/yuv_row_posix.cpp +@@ -1,33 +1,32 @@ + // Copyright (c) 2010 The Chromium Authors. All rights reserved. + // Use of this source code is governed by a BSD-style license that can be + // found in the LICENSE file. + +-#include "media/base/yuv_row.h" +- +-#ifdef _DEBUG +-#include "base/logging.h" +-#else ++#include "yuv_row.h" ++#include "mozilla/SSE.h" ++ + #define DCHECK(a) +-#endif + + extern "C" { + +-#if USE_SSE2 && defined(ARCH_CPU_X86_64) ++#if defined(ARCH_CPU_X86_64) ++ ++// We don't need CPUID guards here, since x86-64 implies SSE2. + + // AMD64 ABI uses register paremters. 
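One more note before the assembly: the 2048(...) and 4096(...) displacements used throughout these row functions index a single table, int16 kCoefficientsRgbY[768][4]. Rows 0-255 hold the Y contributions to B, G, R, A; rows 256-511 (byte offset 256*8 = 2048) the U contributions; rows 512-767 (offset 4096) the V contributions. paddsw sums one row from each band and psraw $6 removes the 64x scaling. The table contents are not part of this patch; the sketch below rebuilds an equivalent with standard BT.601 studio-swing coefficients, which is an assumption about how the real table was generated:

#include <cstdint>
#include <cstdio>

static int16_t kY[256][4], kU[256][4], kV[256][4];  // the three 256-row bands

static uint8_t Clamp(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

int main() {
  // BT.601: R = 1.164(Y-16) + 1.596(V-128), B = 1.164(Y-16) + 2.018(U-128),
  // G = 1.164(Y-16) - 0.391(U-128) - 0.813(V-128). Scaled by 64 (hence >> 6).
  for (int i = 0; i < 256; ++i) {
    int16_t luma = (int16_t)(1.164 * 64 * (i - 16));
    kY[i][0] = kY[i][1] = kY[i][2] = luma;
    kY[i][3] = 255 * 64;                             // constant alpha
    kU[i][0] = (int16_t)(2.018 * 64 * (i - 128));    // U -> B
    kU[i][1] = (int16_t)(-0.391 * 64 * (i - 128));   // U -> G
    kU[i][2] = 0; kU[i][3] = 0;
    kV[i][0] = 0; kV[i][3] = 0;
    kV[i][1] = (int16_t)(-0.813 * 64 * (i - 128));   // V -> G
    kV[i][2] = (int16_t)(1.596 * 64 * (i - 128));    // V -> R
  }
  uint8_t y = 120, u = 90, v = 200;
  int b = kY[y][0] + kU[u][0] + kV[v][0];
  int g = kY[y][1] + kU[u][1] + kV[v][1];
  int r = kY[y][2] + kU[u][2] + kV[v][2];
  printf("B=%u G=%u R=%u A=%u\n", Clamp(b >> 6), Clamp(g >> 6), Clamp(r >> 6),
         Clamp((kY[y][3] + kU[u][3] + kV[v][3]) >> 6));
}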
+ void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 + asm( +- "jmp convertend\n" +-"convertloop:" ++ "jmp 1f\n" ++"0:" + "movzb (%1),%%r10\n" + "add $0x1,%1\n" + "movzb (%2),%%r11\n" + "add $0x1,%2\n" + "movq 2048(%5,%%r10,8),%%xmm0\n" + "movzb (%0),%%r10\n" + "movq 4096(%5,%%r11,8),%%xmm1\n" + "movzb 0x1(%0),%%r11\n" +@@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint + "movq (%5,%%r11,8),%%xmm3\n" + "paddsw %%xmm0,%%xmm2\n" + "paddsw %%xmm0,%%xmm3\n" + "shufps $0x44,%%xmm3,%%xmm2\n" + "psraw $0x6,%%xmm2\n" + "packuswb %%xmm2,%%xmm2\n" + "movq %%xmm2,0x0(%3)\n" + "add $0x8,%3\n" +-"convertend:" ++"1:" + "sub $0x2,%4\n" +- "jns convertloop\n" +- +-"convertnext:" ++ "jns 0b\n" ++ ++"2:" + "add $0x1,%4\n" +- "js convertdone\n" ++ "js 3f\n" + + "movzb (%1),%%r10\n" + "movq 2048(%5,%%r10,8),%%xmm0\n" + "movzb (%2),%%r10\n" + "movq 4096(%5,%%r10,8),%%xmm1\n" + "paddsw %%xmm1,%%xmm0\n" + "movzb (%0),%%r10\n" + "movq (%5,%%r10,8),%%xmm1\n" + "paddsw %%xmm0,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movd %%xmm1,0x0(%3)\n" +-"convertdone:" ++"3:" + : + : "r"(y_buf), // %0 + "r"(u_buf), // %1 + "r"(v_buf), // %2 + "r"(rgb_buf), // %3 + "r"(width), // %4 + "r" (kCoefficientsRgbY) // %5 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" +@@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width, // r8 + int source_dx) { // r9 + asm( + "xor %%r11,%%r11\n" + "sub $0x2,%4\n" +- "js scalenext\n" +- +-"scaleloop:" ++ "js 1f\n" ++ ++"0:" + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + "movzb (%1,%%r10,1),%%rax\n" + "movq 2048(%5,%%rax,8),%%xmm0\n" + "movzb (%2,%%r10,1),%%rax\n" + "movq 4096(%5,%%rax,8),%%xmm1\n" + "lea (%%r11,%6),%%r10\n" + "sar $0x10,%%r11\n" +@@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b + "paddsw %%xmm0,%%xmm1\n" + "paddsw %%xmm0,%%xmm2\n" + "shufps $0x44,%%xmm2,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,0x0(%3)\n" + "add $0x8,%3\n" + "sub $0x2,%4\n" +- "jns scaleloop\n" +- +-"scalenext:" ++ "jns 0b\n" ++ ++"1:" + "add $0x1,%4\n" +- "js scaledone\n" ++ "js 2f\n" + + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + "movzb (%1,%%r10,1),%%rax\n" + "movq 2048(%5,%%rax,8),%%xmm0\n" + "movzb (%2,%%r10,1),%%rax\n" + "movq 4096(%5,%%rax,8),%%xmm1\n" + "paddsw %%xmm1,%%xmm0\n" + "sar $0x10,%%r11\n" + "movzb (%0,%%r11,1),%%rax\n" + "movq (%5,%%rax,8),%%xmm1\n" + "paddsw %%xmm0,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movd %%xmm1,0x0(%3)\n" + +-"scaledone:" ++"2:" + : + : "r"(y_buf), // %0 + "r"(u_buf), // %1 + "r"(v_buf), // %2 + "r"(rgb_buf), // %3 + "r"(width), // %4 + "r" (kCoefficientsRgbY), // %5 + "r"(static_cast<long>(source_dx)) // %6 +@@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + asm( + "xor %%r11,%%r11\n" // x = 0 + "sub $0x2,%4\n" +- "js .lscalenext\n" ++ "js 2f\n" + "cmp $0x20000,%6\n" // if source_dx >= 2.0 +- "jl .lscalehalf\n" ++ "jl 0f\n" + "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less +-".lscalehalf:" +- +-".lscaleloop:" ++"0:" ++ ++"1:" + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + + "movzb (%1, %%r10, 1), %%r13 \n" + "movzb 1(%1, %%r10, 1), %%r14 \n" + "mov %%r11, %%rax \n" + "and $0x1fffe, %%rax \n" + "imul %%rax, %%r14 \n" +@@ -215,21 +214,21 @@ 
void LinearScaleYUVToRGB32Row(const uint + "paddsw %%xmm0,%%xmm1\n" + "paddsw %%xmm0,%%xmm2\n" + "shufps $0x44,%%xmm2,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,0x0(%3)\n" + "add $0x8,%3\n" + "sub $0x2,%4\n" +- "jns .lscaleloop\n" +- +-".lscalenext:" ++ "jns 1b\n" ++ ++"2:" + "add $0x1,%4\n" +- "js .lscaledone\n" ++ "js 3f\n" + + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + + "movzb (%1,%%r10,1), %%r13 \n" + "movq 2048(%5,%%r13,8),%%xmm0\n" + + "movzb (%2,%%r10,1), %%r13 \n" +@@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint + "movzb (%0,%%r11,1), %%r13 \n" + "movq (%5,%%r13,8),%%xmm1\n" + + "paddsw %%xmm0,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movd %%xmm1,0x0(%3)\n" + +-".lscaledone:" ++"3:" + : + : "r"(y_buf), // %0 + "r"(u_buf), // %1 + "r"(v_buf), // %2 + "r"(rgb_buf), // %3 + "r"(width), // %4 + "r" (kCoefficientsRgbY), // %5 + "r"(static_cast<long>(source_dx)) // %6 + : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" + ); + } + +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__) ++#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__) + + // PIC version is slower because less registers are available, so + // non-PIC is used on platforms where it is possible. +- +-void FastConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width); ++void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width); + asm( + ".text\n" +- ".global FastConvertYUVToRGB32Row\n" +-"FastConvertYUVToRGB32Row:\n" ++ ".global FastConvertYUVToRGB32Row_SSE\n" ++ ".type FastConvertYUVToRGB32Row_SSE, @function\n" ++"FastConvertYUVToRGB32Row_SSE:\n" + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" +- "jmp convertend\n" +- +-"convertloop:" ++ "jmp 1f\n" ++ ++"0:" + "movzbl (%edi),%eax\n" + "add $0x1,%edi\n" + "movzbl (%esi),%ebx\n" + "add $0x1,%esi\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" + "movzbl 0x1(%edx),%ebx\n" +@@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint + "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +-"convertend:" ++"1:" + "sub $0x2,%ecx\n" +- "jns convertloop\n" ++ "jns 0b\n" + + "and $0x1,%ecx\n" +- "je convertdone\n" ++ "je 2f\n" + + "movzbl (%edi),%eax\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "movzbl (%esi),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" +-"convertdone:" ++"2:" + "popa\n" + "ret\n" ++#if !defined(XP_MACOSX) ++ ".previous\n" ++#endif + ); + +- +-void ScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx); ++void FastConvertYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width) ++{ ++ if (mozilla::supports_sse()) { ++ FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); ++ return; ++ } ++ ++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, 
rgb_buf, width, 1); ++} ++ ++ ++void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx); + asm( + ".text\n" +- ".global ScaleYUVToRGB32Row\n" +-"ScaleYUVToRGB32Row:\n" ++ ".global ScaleYUVToRGB32Row_SSE\n" ++ ".type ScaleYUVToRGB32Row_SSE, @function\n" ++"ScaleYUVToRGB32Row_SSE:\n" + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + "xor %ebx,%ebx\n" +- "jmp scaleend\n" +- +-"scaleloop:" ++ "jmp 1f\n" ++ ++"0:" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" +@@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b + "movq kCoefficientsRgbY(,%eax,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +-"scaleend:" ++"1:" + "sub $0x2,%ecx\n" +- "jns scaleloop\n" ++ "jns 0b\n" + + "and $0x1,%ecx\n" +- "je scaledone\n" ++ "je 2f\n" + + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" +@@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" + +-"scaledone:" ++"2:" + "popa\n" + "ret\n" ++#if !defined(XP_MACOSX) ++ ".previous\n" ++#endif + ); + +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx); ++void ScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx) ++{ ++ if (mozilla::supports_sse()) { ++ ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, ++ width, source_dx); ++ } ++ ++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, ++ width, source_dx); ++} ++ ++void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx); + asm( + ".text\n" +- ".global LinearScaleYUVToRGB32Row\n" +-"LinearScaleYUVToRGB32Row:\n" ++ ".global LinearScaleYUVToRGB32Row_SSE\n" ++ ".type LinearScaleYUVToRGB32Row_SSE, @function\n" ++"LinearScaleYUVToRGB32Row_SSE:\n" + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x30(%esp),%ebp\n" + + // source_width = width * source_dx + ebx + "mov 0x34(%esp), %ecx\n" + "imull 0x38(%esp), %ecx\n" + "mov %ecx, 0x34(%esp)\n" + + "mov 0x38(%esp), %ecx\n" + "xor %ebx,%ebx\n" // x = 0 + "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 +- "jl .lscaleend\n" ++ "jl 1f\n" + "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less +- "jmp .lscaleend\n" +- +-".lscaleloop:" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" ++ "jmp 1f\n" ++ ++"0:" ++ "mov %ebx,%eax\n" ++ "sar $0x11,%eax\n" + + "movzbl (%edi,%eax,1),%ecx\n" + "movzbl 1(%edi,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "andl $0x1fffe, %eax \n" + "imul %eax, %esi \n" + "xorl $0x1fffe, %eax \n" + "imul %eax, %ecx \n" +@@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint + "imul %eax, %esi \n" + "xorl $0xffff, %eax \n" + "imul %eax, %ecx 
\n" + "addl %esi, %ecx \n" + "shrl $16, %ecx \n" + "movq kCoefficientsRgbY(,%ecx,8),%mm1\n" + + "cmp 0x34(%esp), %ebx\n" +- "jge .lscalelastpixel\n" ++ "jge 2f\n" + + "mov %ebx,%eax\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%ecx\n" + "movzbl 1(%edx,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "andl $0xffff, %eax \n" +@@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" + +-".lscaleend:" ++"1:" + "cmp 0x34(%esp), %ebx\n" +- "jl .lscaleloop\n" ++ "jl 0b\n" + "popa\n" + "ret\n" + +-".lscalelastpixel:" ++"2:" + "paddsw %mm0, %mm1\n" + "psraw $6, %mm1\n" + "packuswb %mm1, %mm1\n" + "movd %mm1, (%ebp)\n" + "popa\n" + "ret\n" ++#if !defined(XP_MACOSX) ++ ".previous\n" ++#endif + ); + +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__) +- +-extern void PICConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int16 *kCoefficientsRgbY); ++void LinearScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx) ++{ ++ if (mozilla::supports_sse()) { ++ LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, ++ width, source_dx); ++ } ++ ++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, ++ width, source_dx); ++} ++ ++#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__) ++ ++void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int16 *kCoefficientsRgbY); ++ + asm( + ".text\n" +-#if defined(OS_MACOSX) +-"_PICConvertYUVToRGB32Row:\n" ++#if defined(XP_MACOSX) ++"_PICConvertYUVToRGB32Row_SSE:\n" + #else +-"PICConvertYUVToRGB32Row:\n" ++"PICConvertYUVToRGB32Row_SSE:\n" + #endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x38(%esp),%ecx\n" + +- "jmp .Lconvertend\n" +- +-".Lconvertloop:" ++ "jmp 1f\n" ++ ++"0:" + "movzbl (%edi),%eax\n" + "add $0x1,%edi\n" + "movzbl (%esi),%ebx\n" + "add $0x1,%esi\n" + "movq 2048(%ecx,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw 4096(%ecx,%ebx,8),%mm0\n" + "movzbl 0x1(%edx),%ebx\n" +@@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons + "movq 0(%ecx,%ebx,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +-".Lconvertend:" ++"1:" + "subl $0x2,0x34(%esp)\n" +- "jns .Lconvertloop\n" ++ "jns 0b\n" + + "andl $0x1,0x34(%esp)\n" +- "je .Lconvertdone\n" ++ "je 2f\n" + + "movzbl (%edi),%eax\n" + "movq 2048(%ecx,%eax,8),%mm0\n" + "movzbl (%esi),%eax\n" + "paddsw 4096(%ecx,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "movq 0(%ecx,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" +-".Lconvertdone:\n" ++"2:" + "popa\n" + "ret\n" ++#if !defined(XP_MACOSX) ++ ".previous\n" ++#endif + ); + + void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, +- int width) { +- PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, +- &kCoefficientsRgbY[0][0]); +-} +- +-extern void PICScaleYUVToRGB32Row(const uint8* y_buf, ++ int width) ++{ ++ if (mozilla::supports_sse()) { ++ 
++#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
++
++void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int16 *kCoefficientsRgbY);
++
+ asm(
+ ".text\n"
+-#if defined(OS_MACOSX)
+-"_PICConvertYUVToRGB32Row:\n"
++#if defined(XP_MACOSX)
++"_PICConvertYUVToRGB32Row_SSE:\n"
+ #else
+-"PICConvertYUVToRGB32Row:\n"
++"PICConvertYUVToRGB32Row_SSE:\n"
+ #endif
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x2c(%esp),%esi\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x38(%esp),%ecx\n"
+
+- "jmp .Lconvertend\n"
+-
+-".Lconvertloop:"
++ "jmp 1f\n"
++
++"0:"
+ "movzbl (%edi),%eax\n"
+ "add $0x1,%edi\n"
+ "movzbl (%esi),%ebx\n"
+ "add $0x1,%esi\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "paddsw 4096(%ecx,%ebx,8),%mm0\n"
+ "movzbl 0x1(%edx),%ebx\n"
+@@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
+ "movq 0(%ecx,%ebx,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+-".Lconvertend:"
++"1:"
+ "subl $0x2,0x34(%esp)\n"
+- "jns .Lconvertloop\n"
++ "jns 0b\n"
+
+ "andl $0x1,0x34(%esp)\n"
+- "je .Lconvertdone\n"
++ "je 2f\n"
+
+ "movzbl (%edi),%eax\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"
+ "movzbl (%esi),%eax\n"
+ "paddsw 4096(%ecx,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "movq 0(%ecx,%eax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n"
+ "movd %mm1,0x0(%ebp)\n"
+-".Lconvertdone:\n"
++"2:"
+ "popa\n"
+ "ret\n"
++#if !defined(XP_MACOSX)
++ ".previous\n"
++#endif
+ );
+
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+- int width) {
+- PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
+- &kCoefficientsRgbY[0][0]);
+-}
+-
+-extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
++ int width)
++{
++ if (mozilla::supports_sse()) {
++ PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
++ &kCoefficientsRgbY[0][0]);
++ return;
++ }
++
++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
++}
++
++void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx,
+ int16 *kCoefficientsRgbY);
+
+ asm(
+ ".text\n"
+-#if defined(OS_MACOSX)
+-"_PICScaleYUVToRGB32Row:\n"
++#if defined(XP_MACOSX)
++"_PICScaleYUVToRGB32Row_SSE:\n"
+ #else
+-"PICScaleYUVToRGB32Row:\n"
++"PICScaleYUVToRGB32Row_SSE:\n"
+ #endif
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x2c(%esp),%esi\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x3c(%esp),%ecx\n"
+ "xor %ebx,%ebx\n"
+- "jmp Lscaleend\n"
+-
+-"Lscaleloop:"
++ "jmp 1f\n"
++
++"0:"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%edi,%eax,1),%eax\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%esi,%eax,1),%eax\n"
+ "paddsw 4096(%ecx,%eax,8),%mm0\n"
+@@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const
+ "movq 0(%ecx,%eax,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+-"Lscaleend:"
++"1:"
+ "subl $0x2,0x34(%esp)\n"
+- "jns Lscaleloop\n"
++ "jns 0b\n"
+
+ "andl $0x1,0x34(%esp)\n"
+- "je Lscaledone\n"
++ "je 2f\n"
+
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%edi,%eax,1),%eax\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%esi,%eax,1),%eax\n"
+@@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq 0(%ecx,%eax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n"
+ "movd %mm1,0x0(%ebp)\n"
+
+-"Lscaledone:"
++"2:"
+ "popa\n"
+ "ret\n"
++#if !defined(XP_MACOSX)
++ ".previous\n"
++#endif
+ );
+
+-
+ void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+- int source_dx) {
+- PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+- &kCoefficientsRgbY[0][0]);
+-}
+-
+-void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx,
+- int16 *kCoefficientsRgbY);
++ int source_dx)
++{
++ if (mozilla::supports_sse()) {
++ PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
++ &kCoefficientsRgbY[0][0]);
++ return;
++ }
++
++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
++
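The PIC variants differ from the non-PIC routines above only in taking kCoefficientsRgbY as an explicit argument: position-independent code cannot cheaply name the global table from hand-written asm, so the wrapper passes its address and the asm indexes off a register. The offsets match the table layout used throughout — 8 bytes per entry, the U sub-table 2048 bytes in, the V sub-table 4096 bytes in (the same arithmetic as the kCoefficientsRgbU/V defines in yuv_row_win.cpp further down). A C-level sketch of that addressing, with hypothetical helper names:

    typedef short int16;
    typedef unsigned char uint8;
    // Hypothetical helpers mirroring the asm addressing: each entry is four
    // int16s (8 bytes), so "movq 2048(%ecx,%eax,8)" is UEntry(table, u).
    static inline const int16* YEntry(const int16* t, uint8 y) { return t + 4 * y; }
    static inline const int16* UEntry(const int16* t, uint8 u) { return t + 4 * (256 + u); } // +2048 bytes
    static inline const int16* VEntry(const int16* t, uint8 v) { return t + 4 * (512 + v); } // +4096 bytes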
++void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx,
++ int16 *kCoefficientsRgbY);
++
+ asm(
+ ".text\n"
+-#if defined(OS_MACOSX)
+-"_PICLinearScaleYUVToRGB32Row:\n"
++#if defined(XP_MACOSX)
++"_PICLinearScaleYUVToRGB32Row_SSE:\n"
+ #else
+-"PICLinearScaleYUVToRGB32Row:\n"
++"PICLinearScaleYUVToRGB32Row_SSE:\n"
+ #endif
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x34(%esp),%ecx\n"
+ "mov 0x3c(%esp),%edi\n"
+ "xor %ebx,%ebx\n"
+
+ // source_width = width * source_dx + ebx
+ "mov 0x34(%esp), %ecx\n"
+ "imull 0x38(%esp), %ecx\n"
+ "mov %ecx, 0x34(%esp)\n"
+
+ "mov 0x38(%esp), %ecx\n"
+ "xor %ebx,%ebx\n" // x = 0
+ "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
+- "jl .lscaleend\n"
++ "jl 1f\n"
+ "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
+- "jmp .lscaleend\n"
+-
+-".lscaleloop:"
++ "jmp 1f\n"
++
++"0:"
+ "mov 0x28(%esp),%esi\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+
+ "movzbl (%esi,%eax,1),%ecx\n"
+ "movzbl 1(%esi,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "andl $0x1fffe, %eax \n"
+@@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u
+ "imul %eax, %esi \n"
+ "xorl $0xffff, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $16, %ecx \n"
+ "movq (%edi,%ecx,8),%mm1\n"
+
+ "cmp 0x34(%esp), %ebx\n"
+- "jge .lscalelastpixel\n"
++ "jge 2f\n"
+
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%ecx\n"
+ "movzbl 1(%edx,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"
+ "andl $0xffff, %eax \n"
+@@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+
+-".lscaleend:"
++"1:"
+ "cmp %ebx, 0x34(%esp)\n"
+- "jg .lscaleloop\n"
++ "jg 0b\n"
+ "popa\n"
+ "ret\n"
+
+-".lscalelastpixel:"
++"2:"
+ "paddsw %mm0, %mm1\n"
+ "psraw $6, %mm1\n"
+ "packuswb %mm1, %mm1\n"
+ "movd %mm1, (%ebp)\n"
+ "popa\n"
+ "ret\n"
++#if !defined(XP_MACOSX)
++ ".previous\n"
++#endif
+ );
+
++
+ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx) {
+- PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+- &kCoefficientsRgbY[0][0]);
+-}
+-
+-#else // USE_MMX
+-
+-// C reference code that mimic the YUV assembly.
+-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
+-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
+- (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
+-
+-static inline void YuvPixel(uint8 y,
+- uint8 u,
+- uint8 v,
+- uint8* rgb_buf) {
+-
+- int b = kCoefficientsRgbY[256+u][0];
+- int g = kCoefficientsRgbY[256+u][1];
+- int r = kCoefficientsRgbY[256+u][2];
+- int a = kCoefficientsRgbY[256+u][3];
+-
+- b = paddsw(b, kCoefficientsRgbY[512+v][0]);
+- g = paddsw(g, kCoefficientsRgbY[512+v][1]);
+- r = paddsw(r, kCoefficientsRgbY[512+v][2]);
+- a = paddsw(a, kCoefficientsRgbY[512+v][3]);
+-
+- b = paddsw(b, kCoefficientsRgbY[y][0]);
+- g = paddsw(g, kCoefficientsRgbY[y][1]);
+- r = paddsw(r, kCoefficientsRgbY[y][2]);
+- a = paddsw(a, kCoefficientsRgbY[y][3]);
+-
+- b >>= 6;
+- g >>= 6;
+- r >>= 6;
+- a >>= 6;
+-
+- *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
+- (packuswb(g) << 8) |
+- (packuswb(r) << 16) |
+- (packuswb(a) << 24);
+-}
+-
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx)
++{
++ if (mozilla::supports_sse()) {
++ PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
++ source_dx, &kCoefficientsRgbY[0][0]);
++ return;
++ }
++
++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
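The C reference code removed above remains a useful key for reading the asm: its packuswb/paddsw macros are scalar stand-ins for the MMX instructions of the same names — saturating 16-bit addition of the three table contributions, an arithmetic shift by 6 to drop the fixed-point scale, then an unsigned clamp to 0..255 per channel. A self-contained outline of one channel, following the removed YuvPixel():

    #include <algorithm>
    // Scalar outline of one colour channel: paddsw is a saturating 16-bit
    // add, psraw $6 drops the 6 fraction bits, packuswb clamps to a byte.
    inline int YuvChannel(int y_term, int u_term, int v_term) {
      int sum = std::max(-32768, std::min(32767, y_term + u_term));  // paddsw
      sum = std::max(-32768, std::min(32767, sum + v_term));         // paddsw
      return std::max(0, std::min(255, sum >> 6));                   // psraw + packuswb
    }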
++#else
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+- for (int x = 0; x < width; x += 2) {
+- uint8 u = u_buf[x >> 1];
+- uint8 v = v_buf[x >> 1];
+- uint8 y0 = y_buf[x];
+- YuvPixel(y0, u, v, rgb_buf);
+- if ((x + 1) < width) {
+- uint8 y1 = y_buf[x + 1];
+- YuvPixel(y1, u, v, rgb_buf + 4);
+- }
+- rgb_buf += 8; // Advance 2 pixels.
+- }
+-}
+-
+-// 16.16 fixed point is used. A shift by 16 isolates the integer.
+-// A shift by 17 is used to further subsample the chrominence channels.
+-// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
+-// for 1/65536 pixel accurate interpolation.
++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
++}
++
+ void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+- int x = 0;
+- for (int i = 0; i < width; i += 2) {
+- int y = y_buf[x >> 16];
+- int u = u_buf[(x >> 17)];
+- int v = v_buf[(x >> 17)];
+- YuvPixel(y, u, v, rgb_buf);
+- x += source_dx;
+- if ((i + 1) < width) {
+- y = y_buf[x >> 16];
+- YuvPixel(y, u, v, rgb_buf+4);
+- x += source_dx;
+- }
+- rgb_buf += 8;
+- }
+-}
++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
+
+ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+- int x = 0;
+- if (source_dx >= 0x20000) {
+- x = 32768;
+- }
+- for (int i = 0; i < width; i += 2) {
+- int y0 = y_buf[x >> 16];
+- int y1 = y_buf[(x >> 16) + 1];
+- int u0 = u_buf[(x >> 17)];
+- int u1 = u_buf[(x >> 17) + 1];
+- int v0 = v_buf[(x >> 17)];
+- int v1 = v_buf[(x >> 17) + 1];
+- int y_frac = (x & 65535);
+- int uv_frac = ((x >> 1) & 65535);
+- int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+- int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
+- int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
+- YuvPixel(y, u, v, rgb_buf);
+- x += source_dx;
+- if ((i + 1) < width) {
+- y0 = y_buf[x >> 16];
+- y1 = y_buf[(x >> 16) + 1];
+- y_frac = (x & 65535);
+- y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+- YuvPixel(y, u, v, rgb_buf+4);
+- x += source_dx;
+- }
+- rgb_buf += 8;
+- }
+-}
+-
+-#endif // USE_MMX
+-} // extern "C"
+-
++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
++#endif
++
++}
+diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
+--- a/gfx/ycbcr/yuv_row_table.cpp
++++ b/gfx/ycbcr/yuv_row_table.cpp
+@@ -1,13 +1,13 @@
+ // Copyright (c) 2010 The Chromium Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style license that can be
+ // found in the LICENSE file.
+
+-#include "media/base/yuv_row.h"
++#include "yuv_row.h"
+
+ extern "C" {
+
+ #define RGBY(i) { \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ 0 \
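The RGBY macro bakes the BT.601 luma term 1.164·(Y − 16) into the table, pre-scaled by 64 (6 fraction bits) so the row functions can finish with a single psraw $6 — the ">> 6" of the removed reference code. A quick check of the endpoints, assuming the standard 16–235 video range (illustrative program, not part of the patch):

    #include <cstdint>
    #include <cstdio>
    // Reproduces single RGBY() entries: 1.164*(Y-16) scaled by 64.
    // Y=16 (video black) yields 0; Y=235 (video white) yields 16315,
    // i.e. 254 after the shift -- within one LSB of full scale.
    int main() {
      for (int y : {16, 128, 235}) {
        int16_t e = static_cast<int16_t>(1.164 * 64 * (y - 16) + 0.5);
        std::printf("Y=%3d entry=%6d entry>>6=%4d\n", y, e, e >> 6);
      }
      return 0;
    }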
+
+-#include "media/base/yuv_row.h"
++#include "yuv_row.h"
++#include "mozilla/SSE.h"
+
+ #define kCoefficientsRgbU kCoefficientsRgbY + 2048
+ #define kCoefficientsRgbV kCoefficientsRgbY + 4096
+
+ extern "C" {
+
+-#if USE_MMX
+-__declspec(naked)
+-void FastConvertYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width) {
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
++__declspec(naked)
++void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ jmp convertend
+@@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
+ convertdone :
+
+ popad
+ ret
+ }
+ }
+
+ __declspec(naked)
+-void ConvertYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int step) {
++void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int step) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ mov ebx, [esp + 32 + 24] // step
+@@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
+ wdone :
+
+ popad
+ ret
+ }
+ }
+
+ __declspec(naked)
+-void RotateConvertYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int ystep,
+- int uvstep) {
++void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int ystep,
++ int uvstep) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ jmp wend
+@@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
+ wdone :
+
+ popad
+ ret
+ }
+ }
+
+ __declspec(naked)
+-void DoubleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width) {
++void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ jmp wend
+@@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
+ jns wloop1
+ wdone :
+ popad
+ ret
+ }
+ }
+
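All of these __asm routines are __declspec(naked), so the compiler emits no prologue: after the initial pushad (eight 32-bit registers, 32 bytes) the return address sits at [esp + 32] and the first argument at [esp + 32 + 4], which is exactly the offset pattern in the mov lines above. A minimal sketch of the convention (hypothetical function, MSVC 32-bit x86 only):

    // Illustrative only: why the routines read arguments at [esp + 32 + N].
    __declspec(naked) int __cdecl FirstArg(int a) {
      __asm {
        pushad                   // 8 registers * 4 bytes = 32 bytes
        mov eax, [esp + 32 + 4]  // a; return address occupies [esp + 32]
        mov [esp + 28], eax      // overwrite saved EAX so popad keeps the result
        popad
        ret
      }
    }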
+ // This version does general purpose scaling by any amount, up or down.
+-// The only thing it can not do it rotation by 90 or 270.
+-// For performance the chroma is under sampled, reducing cost of a 3x
++// The only thing it cannot do is rotation by 90 or 270.
++// For performance the chroma is under-sampled, reducing cost of a 3x
+ // 1080p scale from 8.4 ms to 5.4 ms.
+ __declspec(naked)
+-void ScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx) {
++void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ xor ebx, ebx // x
+@@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
+
+ scaledone :
+ popad
+ ret
+ }
+ }
+
+ __declspec(naked)
+-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx) {
++void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ // [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ imul ecx, [esp + 32 + 24] // source_dx
+@@ -438,152 +439,60 @@ lscalelastpixel:
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ebp], mm1
+ popad
+ ret
+ };
+ }
+-#else // USE_MMX
+-
+-// C reference code that mimic the YUV assembly.
+-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
+-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
+- (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
+-
+-static inline void YuvPixel(uint8 y,
+- uint8 u,
+- uint8 v,
+- uint8* rgb_buf) {
+-
+- int b = kCoefficientsRgbY[256+u][0];
+- int g = kCoefficientsRgbY[256+u][1];
+- int r = kCoefficientsRgbY[256+u][2];
+- int a = kCoefficientsRgbY[256+u][3];
+-
+- b = paddsw(b, kCoefficientsRgbY[512+v][0]);
+- g = paddsw(g, kCoefficientsRgbY[512+v][1]);
+- r = paddsw(r, kCoefficientsRgbY[512+v][2]);
+- a = paddsw(a, kCoefficientsRgbY[512+v][3]);
+-
+- b = paddsw(b, kCoefficientsRgbY[y][0]);
+- g = paddsw(g, kCoefficientsRgbY[y][1]);
+- r = paddsw(r, kCoefficientsRgbY[y][2]);
+- a = paddsw(a, kCoefficientsRgbY[y][3]);
+-
+- b >>= 6;
+- g >>= 6;
+- r >>= 6;
+- a >>= 6;
+-
+- *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
+- (packuswb(g) << 8) |
+- (packuswb(r) << 16) |
+- (packuswb(a) << 24);
+-}
+-
+-#if TEST_MMX_YUV
+-static inline void YuvPixel(uint8 y,
+- uint8 u,
+- uint8 v,
+- uint8* rgb_buf) {
+-
+- __asm {
+- movzx eax, u
+- movq mm0, [kCoefficientsRgbY+2048 + 8 * eax]
+- movzx eax, v
+- paddsw mm0, [kCoefficientsRgbY+4096 + 8 * eax]
+- movzx eax, y
+- movq mm1, [kCoefficientsRgbY + 8 * eax]
+- paddsw mm1, mm0
+- psraw mm1, 6
+- packuswb mm1, mm1
+- mov eax, rgb_buf
+- movd [eax], mm1
+- emms
+- }
+-}
+-#endif
++#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+
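Both scale routines consume source_dx, the per-destination-pixel step through the source row in 16.16 fixed point ("if source_dx >= 2.0" in the comments above is the 0x20000 test). The patch does not show the caller, but under that convention a plausible computation is simply the width ratio — a hypothetical helper, not part of the patch:

    // Hypothetical caller-side helper: a 2:1 downscale gives
    // source_dx == 0x20000, the threshold the linear-scale entry tests.
    inline int ComputeSourceDx(int source_width, int dest_width) {
      return (source_width << 16) / dest_width;  // 16.16 fixed point
    }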
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+- for (int x = 0; x < width; x += 2) {
+- uint8 u = u_buf[x >> 1];
+- uint8 v = v_buf[x >> 1];
+- uint8 y0 = y_buf[x];
+- YuvPixel(y0, u, v, rgb_buf);
+- if ((x + 1) < width) {
+- uint8 y1 = y_buf[x + 1];
+- YuvPixel(y1, u, v, rgb_buf + 4);
+- }
+- rgb_buf += 8; // Advance 2 pixels.
+- }
+-}
+-
+-// 16.16 fixed point is used. A shift by 16 isolates the integer.
+-// A shift by 17 is used to further subsample the chrominence channels.
+-// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
+-// for 1/65536 pixel accurate interpolation.
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
++ if (mozilla::supports_sse()) {
++ FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
++ return;
++ }
++#endif
++
++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
++}
++
+ void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+- int x = 0;
+- for (int i = 0; i < width; i += 2) {
+- int y = y_buf[x >> 16];
+- int u = u_buf[(x >> 17)];
+- int v = v_buf[(x >> 17)];
+- YuvPixel(y, u, v, rgb_buf);
+- x += source_dx;
+- if ((i + 1) < width) {
+- y = y_buf[x >> 16];
+- YuvPixel(y, u, v, rgb_buf+4);
+- x += source_dx;
+- }
+- rgb_buf += 8;
+- }
+-}
++
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
++ if (mozilla::supports_sse()) {
++ ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++ return;
++ }
++#endif
++
++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
+
+ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+- int x = 0;
+- if (source_dx >= 0x20000) {
+- x = 32768;
+- }
+- for (int i = 0; i < width; i += 2) {
+- int y0 = y_buf[x >> 16];
+- int y1 = y_buf[(x >> 16) + 1];
+- int u0 = u_buf[(x >> 17)];
+- int u1 = u_buf[(x >> 17) + 1];
+- int v0 = v_buf[(x >> 17)];
+- int v1 = v_buf[(x >> 17) + 1];
+- int y_frac = (x & 65535);
+- int uv_frac = ((x >> 1) & 65535);
+- int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+- int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
+- int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
+- YuvPixel(y, u, v, rgb_buf);
+- x += source_dx;
+- if ((i + 1) < width) {
+- y0 = y_buf[x >> 16];
+- y1 = y_buf[(x >> 16) + 1];
+- y_frac = (x & 65535);
+- y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+- YuvPixel(y, u, v, rgb_buf+4);
+- x += source_dx;
+- }
+- rgb_buf += 8;
+- }
+-}
+-
+-#endif // USE_MMX
+-} // extern "C"
+-
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
++ if (mozilla::supports_sse()) {
++ LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
++ source_dx);
++ return;
++ }
++#endif
++
++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
++
++} // extern "C"
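For completeness, a hedged sketch of driving one of the dispatching row functions directly — the buffer handling, the helper name, and the uint8 typedef standing in for the project's own are assumptions, not part of the patch:

    #include <vector>
    typedef unsigned char uint8;
    extern "C" void FastConvertYUVToRGB32Row(const uint8* y_buf,
                                             const uint8* u_buf,
                                             const uint8* v_buf,
                                             uint8* rgb_buf,
                                             int width);
    // Convert one row of 2x-subsampled YUV to 32-bit ARGB; the row picks
    // the SSE or C path at runtime via mozilla::supports_sse().
    void ConvertOneRow(const uint8* y, const uint8* u, const uint8* v,
                       int width) {
      std::vector<uint8> rgb(width * 4);  // 4 bytes per output pixel
      FastConvertYUVToRGB32Row(y, u, v, rgb.data(), width);
    }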