summary refs log tree commit diff
path: root/gfx/cairo/libpixman/src/pixman-vmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'gfx/cairo/libpixman/src/pixman-vmx.c')
-rw-r--r-- gfx/cairo/libpixman/src/pixman-vmx.c 1704
1 files changed, 1608 insertions, 96 deletions
diff --git a/gfx/cairo/libpixman/src/pixman-vmx.c b/gfx/cairo/libpixman/src/pixman-vmx.c
index 6868704a87..52de37e69e 100644
--- a/gfx/cairo/libpixman/src/pixman-vmx.c
+++ b/gfx/cairo/libpixman/src/pixman-vmx.c
@@ -25,20 +25,46 @@
* Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
*/
+#ifdef HAVE_CONFIG_H
#include <config.h>
+#endif
+#include <string.h>
#include "pixman-private.h"
#include "pixman-combine32.h"
+#include "pixman-inlines.h"
#include <altivec.h>
#define AVV(x...) {x}
+/* Constant vectors shared by the helpers in this file; unpack_565_to_8888
+ * reads mask_red, mask_green, mask_blue, mask_565_fix_rb and mask_565_fix_g.
+ * None of them has an initializer here, so they must be assigned before
+ * first use -- presumably when the VMX implementation is set up; TODO
+ * confirm where.
+ */
+static vector unsigned int mask_ff000000;
+static vector unsigned int mask_red;
+static vector unsigned int mask_green;
+static vector unsigned int mask_blue;
+static vector unsigned int mask_565_fix_rb;
+static vector unsigned int mask_565_fix_g;
+
+/* Broadcast one byte of every 32-bit lane of PIX across that whole lane.
+ * The permute pattern selects byte 0 of each lane when WORDS_BIGENDIAN is
+ * defined and byte 3 of each lane otherwise.  Presumably that is the alpha
+ * byte of a packed pixel in either byte order, as the name suggests --
+ * confirm against ALPHA_8 in pixman-combine32.h.
+ */
static force_inline vector unsigned int
splat_alpha (vector unsigned int pix)
{
+#ifdef WORDS_BIGENDIAN
return vec_perm (pix, pix,
(vector unsigned char)AVV (
0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
+#else
+ return vec_perm (pix, pix,
+ (vector unsigned char)AVV (
+ 0x03, 0x03, 0x03, 0x03, 0x07, 0x07, 0x07, 0x07,
+ 0x0B, 0x0B, 0x0B, 0x0B, 0x0F, 0x0F, 0x0F, 0x0F));
+#endif
+}
+
+/* Spread the first four bytes of PIX over the four 32-bit lanes: lane N of
+ * the result holds byte N of PIX repeated four times.
+ */
+static force_inline vector unsigned int
+splat_pixel (vector unsigned int pix)
+{
+    const vector unsigned char spread_first_four = (vector unsigned char) AVV (
+        0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
+        0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03);
+
+    return vec_perm (pix, pix, spread_first_four);
}
static force_inline vector unsigned int
@@ -48,12 +74,22 @@ pix_multiply (vector unsigned int p, vector unsigned int a)
/* unpack to short */
hi = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
vec_mergeh ((vector unsigned char)AVV (0),
(vector unsigned char)p);
+#else
+ vec_mergeh ((vector unsigned char) p,
+ (vector unsigned char) AVV (0));
+#endif
mod = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
vec_mergeh ((vector unsigned char)AVV (0),
(vector unsigned char)a);
+#else
+ vec_mergeh ((vector unsigned char) a,
+ (vector unsigned char) AVV (0));
+#endif
hi = vec_mladd (hi, mod, (vector unsigned short)
AVV (0x0080, 0x0080, 0x0080, 0x0080,
@@ -65,11 +101,22 @@ pix_multiply (vector unsigned int p, vector unsigned int a)
/* unpack to short */
lo = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
vec_mergel ((vector unsigned char)AVV (0),
(vector unsigned char)p);
+#else
+ vec_mergel ((vector unsigned char) p,
+ (vector unsigned char) AVV (0));
+#endif
+
mod = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
vec_mergel ((vector unsigned char)AVV (0),
(vector unsigned char)a);
+#else
+ vec_mergel ((vector unsigned char) a,
+ (vector unsigned char) AVV (0));
+#endif
lo = vec_mladd (lo, mod, (vector unsigned short)
AVV (0x0080, 0x0080, 0x0080, 0x0080,
@@ -127,60 +174,316 @@ over (vector unsigned int src,
over (pix_multiply (src, mask), \
pix_multiply (srca, mask), dest)
+#ifdef WORDS_BIGENDIAN
#define COMPUTE_SHIFT_MASK(source) \
source ## _mask = vec_lvsl (0, source);
#define COMPUTE_SHIFT_MASKS(dest, source) \
- dest ## _mask = vec_lvsl (0, dest); \
- source ## _mask = vec_lvsl (0, source); \
- store_mask = vec_lvsr (0, dest);
+ source ## _mask = vec_lvsl (0, source);
#define COMPUTE_SHIFT_MASKC(dest, source, mask) \
mask ## _mask = vec_lvsl (0, mask); \
- dest ## _mask = vec_lvsl (0, dest); \
- source ## _mask = vec_lvsl (0, source); \
- store_mask = vec_lvsr (0, dest);
-
-/* notice you have to declare temp vars...
- * Note: tmp3 and tmp4 must remain untouched!
- */
+ source ## _mask = vec_lvsl (0, source);
-#define LOAD_VECTORS(dest, source) \
+#define LOAD_VECTOR(source) \
+do \
+{ \
+ vector unsigned char tmp1, tmp2; \
tmp1 = (typeof(tmp1))vec_ld (0, source); \
tmp2 = (typeof(tmp2))vec_ld (15, source); \
- tmp3 = (typeof(tmp3))vec_ld (0, dest); \
- v ## source = (typeof(v ## source)) \
+ v ## source = (typeof(v ## source)) \
vec_perm (tmp1, tmp2, source ## _mask); \
- tmp4 = (typeof(tmp4))vec_ld (15, dest); \
- v ## dest = (typeof(v ## dest)) \
- vec_perm (tmp3, tmp4, dest ## _mask);
+} while (0)
+
+#define LOAD_VECTORS(dest, source) \
+do \
+{ \
+ LOAD_VECTOR(source); \
+ v ## dest = (typeof(v ## dest))vec_ld (0, dest); \
+} while (0)
#define LOAD_VECTORSC(dest, source, mask) \
- tmp1 = (typeof(tmp1))vec_ld (0, source); \
- tmp2 = (typeof(tmp2))vec_ld (15, source); \
- tmp3 = (typeof(tmp3))vec_ld (0, dest); \
- v ## source = (typeof(v ## source)) \
- vec_perm (tmp1, tmp2, source ## _mask); \
- tmp4 = (typeof(tmp4))vec_ld (15, dest); \
- tmp1 = (typeof(tmp1))vec_ld (0, mask); \
- v ## dest = (typeof(v ## dest)) \
- vec_perm (tmp3, tmp4, dest ## _mask); \
- tmp2 = (typeof(tmp2))vec_ld (15, mask); \
- v ## mask = (typeof(v ## mask)) \
- vec_perm (tmp1, tmp2, mask ## _mask);
+do \
+{ \
+ LOAD_VECTORS(dest, source); \
+ LOAD_VECTOR(mask); \
+} while (0)
+
+#define DECLARE_SRC_MASK_VAR vector unsigned char src_mask
+#define DECLARE_MASK_MASK_VAR vector unsigned char mask_mask
+
+#else
+
+/* On little-endian builds COMPUTE_SHIFT_MASK, COMPUTE_SHIFT_MASKS and
+ * COMPUTE_SHIFT_MASKC are no-ops: little-endian AltiVec can perform
+ * unaligned reads natively, so there is no need to construct the
+ * permutation pattern variables.
+ */
+#define COMPUTE_SHIFT_MASK(source)
+
+#define COMPUTE_SHIFT_MASKS(dest, source)
+
+#define COMPUTE_SHIFT_MASKC(dest, source, mask)
+
+/* Little-endian loaders.  Each one fills the variable(s) named v<arg> with
+ * a 16-byte vec_xl load from the pointer named <arg>.  They are wrapped in
+ * do { } while (0), like the big-endian versions, so that a use followed by
+ * ';' is exactly one statement, and no definition ends in a line
+ * continuation that silently depends on the next line being blank.
+ */
+# define LOAD_VECTOR(source) \
+do \
+{ \
+    v ## source = (typeof(v ## source)) vec_xl (0, source); \
+} while (0)
+
+# define LOAD_VECTORS(dest, source) \
+do \
+{ \
+    LOAD_VECTOR(source); \
+    LOAD_VECTOR(dest); \
+} while (0)
+
+# define LOAD_VECTORSC(dest, source, mask) \
+do \
+{ \
+    LOAD_VECTORS(dest, source); \
+    LOAD_VECTOR(mask); \
+} while (0)
+
+#define DECLARE_SRC_MASK_VAR
+#define DECLARE_MASK_MASK_VAR
+
+#endif /* WORDS_BIGENDIAN */
#define LOAD_VECTORSM(dest, source, mask) \
- LOAD_VECTORSC (dest, source, mask) \
+ LOAD_VECTORSC (dest, source, mask); \
v ## source = pix_multiply (v ## source, \
splat_alpha (v ## mask));
#define STORE_VECTOR(dest) \
- edges = vec_perm (tmp4, tmp3, dest ## _mask); \
- tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \
- tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \
- vec_st ((vector unsigned int) tmp3, 15, dest); \
- vec_st ((vector unsigned int) tmp1, 0, dest);
+ vec_st ((vector unsigned int) v ## dest, 0, dest);
+
+/* Read four pixels through SRC reinterpreted as a vector pointer; the
+ * caller is expected to pass a 16-byte aligned address.
+ */
+static force_inline vector unsigned int
+load_128_aligned (const uint32_t* src)
+{
+    const vector unsigned int *as_vector = (vector unsigned int *) src;
+
+    return *as_vector;
+}
+
+/* Load four pixels (one 16-byte vector) starting at SRC, which does not
+ * have to be aligned.  The local names are significant: COMPUTE_SHIFT_MASK
+ * and LOAD_VECTOR paste tokens to form src_mask and vsrc, so the pointer
+ * must be called src and the result vsrc.
+ */
+static force_inline vector unsigned int
+load_128_unaligned (const uint32_t* src)
+{
+ vector unsigned int vsrc;
+ DECLARE_SRC_MASK_VAR; /* declares src_mask on big-endian, expands to nothing otherwise */
+
+ COMPUTE_SHIFT_MASK (src); /* big-endian: permute control from vec_lvsl; little-endian: no-op */
+ LOAD_VECTOR (src); /* assigns vsrc */
+
+ return vsrc;
+}
+
+/* Write the four pixels in VDATA to DATA with vec_st; DATA is expected to
+ * sit on a 16-byte boundary.
+ */
+static force_inline void
+save_128_aligned (uint32_t* data,
+                  vector unsigned int vdata)
+{
+    vec_st (vdata, 0, data);
+}
+
+/* Return a vector whose four 32-bit lanes all equal the word at SRC.
+ * LOAD_VECTOR fetches a whole 16-byte vector starting at SRC before lane 0
+ * is replicated, so more memory than the single word is read.  The names
+ * src and vsrc are required by the token-pasting macros.
+ */
+static force_inline vector unsigned int
+create_mask_1x32_128 (const uint32_t *src)
+{
+ vector unsigned int vsrc;
+ DECLARE_SRC_MASK_VAR; /* src_mask on big-endian only */
+
+ COMPUTE_SHIFT_MASK (src);
+ LOAD_VECTOR (src); /* assigns vsrc */
+ return vec_splat(vsrc, 0); /* copy lane 0 into every lane */
+}
+
+/* Return a vector with MASK in each of its four 32-bit lanes.
+ *
+ * The vector is built directly from the value.  Going through
+ * create_mask_1x32_128 (&mask) would make the vector load read a full
+ * 16 bytes around a 4-byte parameter, i.e. past the end of the object.
+ */
+static force_inline vector unsigned int
+create_mask_32_128 (uint32_t mask)
+{
+    return (vector unsigned int) {mask, mask, mask, mask};
+}
+
+/* Byte-interleave DATA1 and DATA2 with vec_mergel.  The operand order is
+ * swapped when WORDS_BIGENDIAN is defined.
+ */
+static force_inline vector unsigned int
+unpacklo_128_16x8 (vector unsigned int data1, vector unsigned int data2)
+{
+#ifdef WORDS_BIGENDIAN
+    const vector unsigned char first = (vector unsigned char) data2;
+    const vector unsigned char second = (vector unsigned char) data1;
+#else
+    const vector unsigned char first = (vector unsigned char) data1;
+    const vector unsigned char second = (vector unsigned char) data2;
+#endif
+
+    return (vector unsigned int) vec_mergel (first, second);
+}
+
+/* Byte-interleave DATA1 and DATA2 with vec_mergeh.  The operand order is
+ * swapped when WORDS_BIGENDIAN is defined.
+ */
+static force_inline vector unsigned int
+unpackhi_128_16x8 (vector unsigned int data1, vector unsigned int data2)
+{
+#ifdef WORDS_BIGENDIAN
+    const vector unsigned char first = (vector unsigned char) data2;
+    const vector unsigned char second = (vector unsigned char) data1;
+#else
+    const vector unsigned char first = (vector unsigned char) data1;
+    const vector unsigned char second = (vector unsigned char) data2;
+#endif
+
+    return (vector unsigned int) vec_mergeh (first, second);
+}
+
+/* Interleave the 16-bit elements of DATA1 and DATA2 with vec_mergel.  The
+ * operand order is swapped when WORDS_BIGENDIAN is defined.
+ */
+static force_inline vector unsigned int
+unpacklo_128_8x16 (vector unsigned int data1, vector unsigned int data2)
+{
+#ifdef WORDS_BIGENDIAN
+    const vector unsigned short first = (vector unsigned short) data2;
+    const vector unsigned short second = (vector unsigned short) data1;
+#else
+    const vector unsigned short first = (vector unsigned short) data1;
+    const vector unsigned short second = (vector unsigned short) data2;
+#endif
+
+    return (vector unsigned int) vec_mergel (first, second);
+}
+
+/* Interleave the 16-bit elements of DATA1 and DATA2 with vec_mergeh.  The
+ * operand order is swapped when WORDS_BIGENDIAN is defined.
+ */
+static force_inline vector unsigned int
+unpackhi_128_8x16 (vector unsigned int data1, vector unsigned int data2)
+{
+#ifdef WORDS_BIGENDIAN
+    const vector unsigned short first = (vector unsigned short) data2;
+    const vector unsigned short second = (vector unsigned short) data1;
+#else
+    const vector unsigned short first = (vector unsigned short) data1;
+    const vector unsigned short second = (vector unsigned short) data2;
+#endif
+
+    return (vector unsigned int) vec_mergeh (first, second);
+}
+
+/* Byte-interleave DATA1 with DATA2 and hand back both halves of the result:
+ * *DATA_LO receives unpacklo_128_16x8, *DATA_HI receives unpackhi_128_16x8.
+ */
+static force_inline void
+unpack_128_2x128 (vector unsigned int data1, vector unsigned int data2,
+                  vector unsigned int* data_lo, vector unsigned int* data_hi)
+{
+    const vector unsigned int merged_lo = unpacklo_128_16x8 (data1, data2);
+    const vector unsigned int merged_hi = unpackhi_128_16x8 (data1, data2);
+
+    *data_lo = merged_lo;
+    *data_hi = merged_hi;
+}
+
+/* 16-bit counterpart of unpack_128_2x128: *DATA_LO receives
+ * unpacklo_128_8x16 and *DATA_HI receives unpackhi_128_8x16 of the inputs.
+ */
+static force_inline void
+unpack_128_2x128_16 (vector unsigned int data1, vector unsigned int data2,
+                     vector unsigned int* data_lo, vector unsigned int* data_hi)
+{
+    const vector unsigned int merged_lo = unpacklo_128_8x16 (data1, data2);
+    const vector unsigned int merged_hi = unpackhi_128_8x16 (data1, data2);
+
+    *data_lo = merged_lo;
+    *data_hi = merged_hi;
+}
+
+/* Widen the 565 pixel held in each 32-bit lane of LO to 8 bits per channel.
+ * Each field is shifted into place and masked (left by 8 with mask_red, by
+ * 5 with mask_green, by 3 with mask_blue).  The bits selected by
+ * mask_565_fix_rb / mask_565_fix_g are then shifted right (by 5 and 6) and
+ * OR-ed back in to fill the low bits of each channel.
+ */
+static force_inline vector unsigned int
+unpack_565_to_8888 (vector unsigned int lo)
+{
+    const vector unsigned int by3 = create_mask_32_128 (3);
+    const vector unsigned int by5 = create_mask_32_128 (5);
+    const vector unsigned int by6 = create_mask_32_128 (6);
+    const vector unsigned int by8 = create_mask_32_128 (8);
+    vector unsigned int red_blue, green;
+
+    red_blue = vec_or (vec_and (vec_sl (lo, by8), mask_red),
+                       vec_and (vec_sl (lo, by3), mask_blue));
+    green = vec_and (vec_sl (lo, by5), mask_green);
+
+    red_blue = vec_or (red_blue,
+                       vec_sr (vec_and (red_blue, mask_565_fix_rb), by5));
+    green = vec_or (green,
+                    vec_sr (vec_and (green, mask_565_fix_g), by6));
+
+    return vec_or (red_blue, green);
+}
+
+/* Return non-zero when all four pixels in X are fully opaque, i.e. bits
+ * 24-31 of every 32-bit lane (the byte splat_alpha replicates) are 0xff.
+ *
+ * vec_all_eq is a predicate that yields 0 or 1, not a per-byte bit mask, so
+ * its result cannot be tested against 0x8888.  Only the alpha bits are
+ * compared; the colour channels are masked off first.
+ */
+static force_inline int
+is_opaque (vector unsigned int x)
+{
+    const vector unsigned int alpha_bits = (vector unsigned int)
+        AVV (0xff000000, 0xff000000, 0xff000000, 0xff000000);
+
+    return vec_all_eq (vec_and (x, alpha_bits), alpha_bits);
+}
+
+/* Return non-zero when every bit of X is zero.
+ *
+ * vec_all_eq already answers "are all lanes equal" with 0 or 1; comparing
+ * that result with 0xffff could never succeed.
+ */
+static force_inline int
+is_zero (vector unsigned int x)
+{
+    return vec_all_eq (x, (vector unsigned int) AVV (0));
+}
+
+/* Return non-zero when all four pixels in X are fully transparent, i.e.
+ * bits 24-31 of every 32-bit lane (the byte splat_alpha replicates) are 0.
+ *
+ * vec_all_eq yields 0 or 1, so the result must not be masked with 0x8888.
+ * The colour channels are masked off so that only alpha decides.
+ */
+static force_inline int
+is_transparent (vector unsigned int x)
+{
+    const vector unsigned int alpha_bits = (vector unsigned int)
+        AVV (0xff000000, 0xff000000, 0xff000000, 0xff000000);
+
+    return vec_all_eq (vec_and (x, alpha_bits), (vector unsigned int) AVV (0));
+}
+
+/* Scalar OVER for a single pixel.  Returns SRC unchanged when its alpha is
+ * 0xff, DST unchanged when SRC is 0, and otherwise DST scaled by the
+ * inverted source alpha plus SRC.
+ */
+static force_inline uint32_t
+core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
+{
+    const uint32_t alpha = ALPHA_8 (src);
+
+    if (alpha == 0xff)
+        return src;
+
+    if (src == 0)
+        return dst;
+
+    UN8x4_MUL_UN8_ADD_UN8x4 (dst, (~alpha & MASK), src);
+
+    return dst;
+}
+
+/* Fetch one source pixel from PS.  When a mask pointer PM is supplied the
+ * pixel is multiplied by the alpha of *PM; a null PM means "no mask".
+ */
+static force_inline uint32_t
+combine1 (const uint32_t *ps, const uint32_t *pm)
+{
+    uint32_t pixel = *ps;
+
+    if (pm)
+    {
+        const uint32_t mask_alpha = ALPHA_8 (*pm);
+
+        UN8x4_MUL_UN8 (pixel, mask_alpha);
+    }
+
+    return pixel;
+}
+
+/* Four-pixel counterpart of combine1: load four source pixels from PS and,
+ * when a mask pointer PM is supplied, multiply each of them by the alpha of
+ * the corresponding mask pixel.  Neither pointer needs to be aligned.
+ *
+ * The mask is reduced to its alpha with splat_alpha before the multiply, as
+ * combine1 (ALPHA_8 (*pm)) and LOAD_VECTORSM do.  Multiplying by the raw
+ * mask pixel would scale every channel by a different mask channel.
+ *
+ * Returns the (possibly masked) source vector; all zeros when the four mask
+ * pixels are reported transparent by is_transparent.
+ */
+static force_inline vector unsigned int
+combine4 (const uint32_t* ps, const uint32_t* pm)
+{
+    vector unsigned int msk;
+
+    if (!pm)
+        return load_128_unaligned (ps);
+
+    msk = load_128_unaligned (pm);
+
+    /* nothing of the source survives a fully transparent mask */
+    if (is_transparent (msk))
+        return (vector unsigned int) AVV (0);
+
+    return pix_multiply (load_128_unaligned (ps), splat_alpha (msk));
+}
static void
vmx_combine_over_u_no_mask (uint32_t * dest,
@@ -189,8 +492,19 @@ vmx_combine_over_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t ia = ALPHA_8 (~s);
+
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+
+ *dest++ = d;
+ width--;
+ }
COMPUTE_SHIFT_MASKS (dest, src);
@@ -228,8 +542,24 @@ vmx_combine_over_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, mask_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t m = ALPHA_8 (*mask++);
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t ia;
+
+ UN8x4_MUL_UN8 (s, m);
+
+ ia = ALPHA_8 (~s);
+
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+ *dest++ = d;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -284,8 +614,18 @@ vmx_combine_over_reverse_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t ia = ALPHA_8 (~d);
+
+ UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKS (dest, src);
@@ -322,8 +662,22 @@ vmx_combine_over_reverse_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, mask_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t m = ALPHA_8 (*mask++);
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t ia = ALPHA_8 (~d);
+
+ UN8x4_MUL_UN8 (s, m);
+
+ UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -377,8 +731,17 @@ vmx_combine_in_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t s = *src++;
+ uint32_t a = ALPHA_8 (*dest);
+
+ UN8x4_MUL_UN8 (s, a);
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKS (dest, src);
@@ -413,8 +776,21 @@ vmx_combine_in_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, mask_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t m = ALPHA_8 (*mask++);
+ uint32_t s = *src++;
+ uint32_t a = ALPHA_8 (*dest);
+
+ UN8x4_MUL_UN8 (s, m);
+ UN8x4_MUL_UN8 (s, a);
+
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -466,8 +842,18 @@ vmx_combine_in_reverse_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t d = *dest;
+ uint32_t a = ALPHA_8 (*src++);
+
+ UN8x4_MUL_UN8 (d, a);
+
+ *dest++ = d;
+ width--;
+ }
COMPUTE_SHIFT_MASKS (dest, src);
@@ -503,8 +889,22 @@ vmx_combine_in_reverse_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, mask_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t m = ALPHA_8 (*mask++);
+ uint32_t d = *dest;
+ uint32_t a = *src++;
+
+ UN8x4_MUL_UN8 (a, m);
+ a = ALPHA_8 (a);
+ UN8x4_MUL_UN8 (d, a);
+
+ *dest++ = d;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -557,8 +957,18 @@ vmx_combine_out_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t s = *src++;
+ uint32_t a = ALPHA_8 (~(*dest));
+
+ UN8x4_MUL_UN8 (s, a);
+
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKS (dest, src);
@@ -594,8 +1004,21 @@ vmx_combine_out_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, mask_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t m = ALPHA_8 (*mask++);
+ uint32_t s = *src++;
+ uint32_t a = ALPHA_8 (~(*dest));
+
+ UN8x4_MUL_UN8 (s, m);
+ UN8x4_MUL_UN8 (s, a);
+
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -647,8 +1070,18 @@ vmx_combine_out_reverse_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t d = *dest;
+ uint32_t a = ALPHA_8 (~(*src++));
+
+ UN8x4_MUL_UN8 (d, a);
+
+ *dest++ = d;
+ width--;
+ }
COMPUTE_SHIFT_MASKS (dest, src);
@@ -685,8 +1118,22 @@ vmx_combine_out_reverse_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, mask_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t m = ALPHA_8 (*mask++);
+ uint32_t d = *dest;
+ uint32_t a = *src++;
+
+ UN8x4_MUL_UN8 (a, m);
+ a = ALPHA_8 (~a);
+ UN8x4_MUL_UN8 (d, a);
+
+ *dest++ = d;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -739,8 +1186,20 @@ vmx_combine_atop_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t dest_a = ALPHA_8 (d);
+ uint32_t src_ia = ALPHA_8 (~s);
+
+ UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKS (dest, src);
@@ -779,8 +1238,26 @@ vmx_combine_atop_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, mask_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t m = ALPHA_8 (*mask++);
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t dest_a = ALPHA_8 (d);
+ uint32_t src_ia;
+
+ UN8x4_MUL_UN8 (s, m);
+
+ src_ia = ALPHA_8 (~s);
+
+ UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -838,8 +1315,20 @@ vmx_combine_atop_reverse_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t src_a = ALPHA_8 (s);
+ uint32_t dest_ia = ALPHA_8 (~d);
+
+ UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKS (dest, src);
@@ -878,8 +1367,26 @@ vmx_combine_atop_reverse_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, mask_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t m = ALPHA_8 (*mask++);
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t src_a;
+ uint32_t dest_ia = ALPHA_8 (~d);
+
+ UN8x4_MUL_UN8 (s, m);
+
+ src_a = ALPHA_8 (s);
+
+ UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -937,8 +1444,20 @@ vmx_combine_xor_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t src_ia = ALPHA_8 (~s);
+ uint32_t dest_ia = ALPHA_8 (~d);
+
+ UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKS (dest, src);
@@ -977,8 +1496,26 @@ vmx_combine_xor_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, mask_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t m = ALPHA_8 (*mask++);
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t src_ia;
+ uint32_t dest_ia = ALPHA_8 (~d);
+
+ UN8x4_MUL_UN8 (s, m);
+
+ src_ia = ALPHA_8 (~s);
+
+ UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -1036,8 +1573,18 @@ vmx_combine_add_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+
+ UN8x4_ADD_UN8x4 (d, s);
+
+ *dest++ = d;
+ width--;
+ }
COMPUTE_SHIFT_MASKS (dest, src);
/* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1072,8 +1619,21 @@ vmx_combine_add_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, src_mask, mask_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t m = ALPHA_8 (*mask++);
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+
+ UN8x4_MUL_UN8 (s, m);
+ UN8x4_ADD_UN8x4 (d, s);
+
+ *dest++ = d;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -1128,8 +1688,19 @@ vmx_combine_src_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, mask_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t a = *mask++;
+ uint32_t s = *src++;
+
+ UN8x4_MUL_UN8x4 (s, a);
+
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -1168,8 +1739,23 @@ vmx_combine_over_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, mask_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t a = *mask++;
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t sa = ALPHA_8 (s);
+
+ UN8x4_MUL_UN8x4 (s, a);
+ UN8x4_MUL_UN8 (a, sa);
+ UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
+
+ *dest++ = d;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -1212,8 +1798,22 @@ vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, mask_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t a = *mask++;
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t ida = ALPHA_8 (~d);
+
+ UN8x4_MUL_UN8x4 (s, a);
+ UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
+
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -1255,8 +1855,21 @@ vmx_combine_in_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, mask_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t a = *mask++;
+ uint32_t s = *src++;
+ uint32_t da = ALPHA_8 (*dest);
+
+ UN8x4_MUL_UN8x4 (s, a);
+ UN8x4_MUL_UN8 (s, da);
+
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -1297,8 +1910,21 @@ vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, mask_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t a = *mask++;
+ uint32_t d = *dest;
+ uint32_t sa = ALPHA_8 (*src++);
+
+ UN8x4_MUL_UN8 (a, sa);
+ UN8x4_MUL_UN8x4 (d, a);
+
+ *dest++ = d;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -1340,8 +1966,22 @@ vmx_combine_out_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, mask_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t a = *mask++;
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t da = ALPHA_8 (~d);
+
+ UN8x4_MUL_UN8x4 (s, a);
+ UN8x4_MUL_UN8 (s, da);
+
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -1384,8 +2024,22 @@ vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, mask_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t a = *mask++;
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t sa = ALPHA_8 (s);
+
+ UN8x4_MUL_UN8 (a, sa);
+ UN8x4_MUL_UN8x4 (d, ~a);
+
+ *dest++ = d;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -1428,8 +2082,24 @@ vmx_combine_atop_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask, vsrca;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, mask_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t a = *mask++;
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t sa = ALPHA_8 (s);
+ uint32_t da = ALPHA_8 (d);
+
+ UN8x4_MUL_UN8x4 (s, a);
+ UN8x4_MUL_UN8 (a, sa);
+ UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+ *dest++ = d;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -1479,8 +2149,24 @@ vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, mask_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t a = *mask++;
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t sa = ALPHA_8 (s);
+ uint32_t da = ALPHA_8 (~d);
+
+ UN8x4_MUL_UN8x4 (s, a);
+ UN8x4_MUL_UN8 (a, sa);
+ UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
+
+ *dest++ = d;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -1527,8 +2213,24 @@ vmx_combine_xor_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, mask_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t a = *mask++;
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+ uint32_t sa = ALPHA_8 (s);
+ uint32_t da = ALPHA_8 (~d);
+
+ UN8x4_MUL_UN8x4 (s, a);
+ UN8x4_MUL_UN8 (a, sa);
+ UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+ *dest++ = d;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -1575,8 +2277,21 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
- dest_mask, mask_mask, src_mask, store_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
+
+ while (width && ((uintptr_t)dest & 15))
+ {
+ uint32_t a = *mask++;
+ uint32_t s = *src++;
+ uint32_t d = *dest;
+
+ UN8x4_MUL_UN8x4 (s, a);
+ UN8x4_ADD_UN8x4 (s, d);
+
+ *dest++ = s;
+ width--;
+ }
COMPUTE_SHIFT_MASKC (dest, src, mask);
@@ -1607,16 +2322,809 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
}
}
+/*
+ * OVER-composite a solid source through an a8 mask onto 32-bit pixels:
+ * every destination pixel becomes (src IN mask) OVER dest.
+ *
+ * Each scanline runs in three phases: a scalar head until the destination
+ * pointer reaches 16-byte alignment, a vector body that handles four
+ * destination pixels (and four mask bytes) per step, and a scalar tail
+ * for whatever is left.
+ */
+static void
+vmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t solid, solid_alpha;
+    uint32_t *dst_row, *dst;
+    uint8_t *mask_row;
+    int dst_stride, mask_stride;
+    int32_t left;
+    uint32_t m, d, s, inv_alpha;
+    vector unsigned int vsolid, valpha, vmask, vdst;
+
+    solid = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    solid_alpha = ALPHA_8 (solid);
+
+    /* A fully transparent source cannot change the destination. */
+    if (solid == 0)
+        return;
+
+    PIXMAN_IMAGE_GET_LINE (
+        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (
+        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_row, 1);
+
+    vsolid = (vector unsigned int) {solid, solid, solid, solid};
+    valpha = splat_alpha (vsolid);
+
+    while (height--)
+    {
+        const uint8_t *pm = mask_row;
+
+        dst = dst_row;
+        dst_row += dst_stride;
+        mask_row += mask_stride;
+
+        /* Scalar head: one pixel at a time until dst is 16-byte aligned. */
+        for (left = width; left && ((uintptr_t) dst & 15); left--, dst++)
+        {
+            m = *pm++;
+            if (m == 0)
+                continue;
+
+            s = solid;
+            d = *dst;
+            UN8x4_MUL_UN8 (s, m);
+            inv_alpha = ALPHA_8 (~s);
+            UN8x4_MUL_UN8_ADD_UN8x4 (d, inv_alpha, s);
+            *dst = d;
+        }
+
+        /* Vector body: four mask bytes are read as one 32-bit word. */
+        for (; left >= 4; left -= 4, dst += 4, pm += 4)
+        {
+            m = *(const uint32_t *) pm;
+
+            /* All four mask bytes zero: nothing to draw. */
+            if (m == 0)
+                continue;
+
+            if (solid_alpha == 0xff && m == 0xffffffff)
+            {
+                /* Opaque source under a full mask simply replaces dest. */
+                save_128_aligned (dst, vsolid);
+            }
+            else
+            {
+                vmask = splat_pixel ((vector unsigned int) {m, m, m, m});
+
+                /* dst is 16-byte aligned at this point */
+                vdst = in_over (vsolid, valpha, vmask, load_128_aligned (dst));
+                save_128_aligned (dst, vdst);
+            }
+        }
+
+        /* Scalar tail: fewer than four pixels remain. */
+        for (; left; left--, dst++)
+        {
+            m = *pm++;
+            if (m == 0)
+                continue;
+
+            s = solid;
+            d = *dst;
+            UN8x4_MUL_UN8 (s, m);
+            inv_alpha = ALPHA_8 (~s);
+            UN8x4_MUL_UN8_ADD_UN8x4 (d, inv_alpha, s);
+            *dst = d;
+        }
+    }
+}
+
+/*
+ * Fill a width x height rectangle whose top-left corner is (x, y) with a
+ * constant pixel value.
+ *
+ * bits    start of the image
+ * stride  row pitch, counted in uint32_t units
+ * bpp     bits per pixel; only 8, 16 and 32 are handled
+ * filler  pixel value; for 8 and 16 bpp its low bits are replicated
+ *         across a whole 32-bit word before filling
+ *
+ * Returns TRUE when the fill was performed, FALSE for an unsupported bpp.
+ */
+static pixman_bool_t
+vmx_fill (pixman_implementation_t *imp,
+          uint32_t *               bits,
+          int                      stride,
+          int                      bpp,
+          int                      x,
+          int                      y,
+          int                      width,
+          int                      height,
+          uint32_t                 filler)
+{
+    uint32_t byte_width;
+    uint8_t *byte_line;
+
+    vector unsigned int vfiller;
+
+    if (bpp == 8)
+    {
+        stride = stride * (int) sizeof (uint32_t) / 1;
+        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+        byte_width = width;
+        stride *= 1;
+
+        /* Replicate the byte into all four lanes with unsigned arithmetic.
+         * Shifting a promoted 16-bit value left by 16 would overflow int
+         * whenever the byte is 0x80 or larger.
+         */
+        filler = (filler & 0xff) * 0x01010101;
+    }
+    else if (bpp == 16)
+    {
+        stride = stride * (int) sizeof (uint32_t) / 2;
+        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+        byte_width = 2 * width;
+        stride *= 2;
+
+        filler = (filler & 0xffff) * 0x00010001;
+    }
+    else if (bpp == 32)
+    {
+        stride = stride * (int) sizeof (uint32_t) / 4;
+        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+        byte_width = 4 * width;
+        stride *= 4;
+    }
+    else
+    {
+        return FALSE;
+    }
+
+    /* From here on stride and byte_width are both measured in bytes. */
+    vfiller = create_mask_1x32_128 (&filler);
+
+    while (height--)
+    {
+        int w;
+        uint8_t *d = byte_line;
+        byte_line += stride;
+        w = byte_width;
+
+        /* Step up to 2-, then 4-, then 16-byte alignment with
+         * progressively wider scalar stores.
+         */
+        if (w >= 1 && ((uintptr_t)d & 1))
+        {
+            *(uint8_t *)d = filler;
+            w -= 1;
+            d += 1;
+        }
+
+        while (w >= 2 && ((uintptr_t)d & 3))
+        {
+            *(uint16_t *)d = filler;
+            w -= 2;
+            d += 2;
+        }
+
+        while (w >= 4 && ((uintptr_t)d & 15))
+        {
+            *(uint32_t *)d = filler;
+
+            w -= 4;
+            d += 4;
+        }
+
+        /* Whenever 16 or more bytes remain, d is 16-byte aligned here:
+         * the loops above only stop early when fewer than 4 bytes are left.
+         */
+        while (w >= 128)
+        {
+            vec_st (vfiller, 0, (uint32_t *) d);
+            vec_st (vfiller, 0, (uint32_t *) d + 4);
+            vec_st (vfiller, 0, (uint32_t *) d + 8);
+            vec_st (vfiller, 0, (uint32_t *) d + 12);
+            vec_st (vfiller, 0, (uint32_t *) d + 16);
+            vec_st (vfiller, 0, (uint32_t *) d + 20);
+            vec_st (vfiller, 0, (uint32_t *) d + 24);
+            vec_st (vfiller, 0, (uint32_t *) d + 28);
+
+            d += 128;
+            w -= 128;
+        }
+
+        if (w >= 64)
+        {
+            vec_st (vfiller, 0, (uint32_t *) d);
+            vec_st (vfiller, 0, (uint32_t *) d + 4);
+            vec_st (vfiller, 0, (uint32_t *) d + 8);
+            vec_st (vfiller, 0, (uint32_t *) d + 12);
+
+            d += 64;
+            w -= 64;
+        }
+
+        if (w >= 32)
+        {
+            vec_st (vfiller, 0, (uint32_t *) d);
+            vec_st (vfiller, 0, (uint32_t *) d + 4);
+
+            d += 32;
+            w -= 32;
+        }
+
+        if (w >= 16)
+        {
+            vec_st (vfiller, 0, (uint32_t *) d);
+
+            d += 16;
+            w -= 16;
+        }
+
+        /* Tail: mirror the head with progressively narrower stores. */
+        while (w >= 4)
+        {
+            *(uint32_t *)d = filler;
+
+            w -= 4;
+            d += 4;
+        }
+
+        if (w >= 2)
+        {
+            *(uint16_t *)d = filler;
+            w -= 2;
+            d += 2;
+        }
+
+        if (w >= 1)
+        {
+            *(uint8_t *)d = filler;
+            w -= 1;
+            d += 1;
+        }
+    }
+
+    return TRUE;
+}
+
+/*
+ * SRC copy from an x888 source to an 8888 destination: each pixel is
+ * copied with its top byte forced to 0xff.
+ */
+static void
+vmx_composite_src_x888_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_row, *out;
+    uint32_t *src_row, *in;
+    int32_t left;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (
+        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (
+        src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);
+
+    while (height--)
+    {
+        out = dst_row;
+        in = src_row;
+        dst_row += dst_stride;
+        src_row += src_stride;
+
+        /* Scalar head: run until the destination is 16-byte aligned. */
+        for (left = width; left && ((uintptr_t) out & 15); left--)
+            *out++ = *in++ | 0xff000000;
+
+        /* Vector body: 16 pixels per step.  Only the destination is known
+         * to be aligned, so the source uses unaligned loads.
+         */
+        for (; left >= 16; left -= 16, in += 16, out += 16)
+        {
+            vector unsigned int q0, q1, q2, q3;
+
+            q0 = load_128_unaligned (in);
+            q1 = load_128_unaligned (in + 4);
+            q2 = load_128_unaligned (in + 8);
+            q3 = load_128_unaligned (in + 12);
+
+            save_128_aligned (out, vec_or (q0, mask_ff000000));
+            save_128_aligned (out + 4, vec_or (q1, mask_ff000000));
+            save_128_aligned (out + 8, vec_or (q2, mask_ff000000));
+            save_128_aligned (out + 12, vec_or (q3, mask_ff000000));
+        }
+
+        /* Scalar tail: fewer than 16 pixels remain. */
+        for (; left; left--)
+            *out++ = *in++ | 0xff000000;
+    }
+}
+
+/*
+ * OVER-composite a solid source onto 32-bit pixels:
+ * dest = src + dest * (255 - alpha (src)) / 255, per channel.
+ *
+ * The solid colour and its inverse alpha are computed once; each scanline
+ * is then processed as scalar head, aligned vector body and scalar tail.
+ */
+static void
+vmx_composite_over_n_8888 (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_row, *out;
+    uint32_t solid, inv_alpha;
+    int k, left, dst_stride;
+    vector unsigned int vsolid, vinv_alpha, vscaled;
+
+    solid = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    /* A fully transparent source leaves the destination as it is. */
+    if (solid == 0)
+        return;
+
+    PIXMAN_IMAGE_GET_LINE (
+        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+
+    inv_alpha = ALPHA_8 (~solid);
+    vsolid = (vector unsigned int) {solid, solid, solid, solid};
+    vinv_alpha = negate (splat_alpha (vsolid));
+
+    while (height--)
+    {
+        out = dst_row;
+        dst_row += dst_stride;
+
+        /* Scalar head: run until the destination is 16-byte aligned. */
+        for (left = width; left && ((uintptr_t) out & 15); left--)
+        {
+            uint32_t px = *out;
+
+            UN8x4_MUL_UN8_ADD_UN8x4 (px, inv_alpha, solid);
+            *out++ = px;
+        }
+
+        /* Vector body: four pixels per step on the aligned destination. */
+        for (k = left / 4; k > 0; k--)
+        {
+            vscaled = pix_multiply (load_128_aligned (out), vinv_alpha);
+            save_128_aligned (out, pix_add (vsolid, vscaled));
+            out += 4;
+        }
+
+        /* Scalar tail: the last (left % 4) pixels, highest index first. */
+        k = left % 4;
+        while (--k >= 0)
+        {
+            uint32_t px = out[k];
+
+            UN8x4_MUL_UN8_ADD_UN8x4 (px, inv_alpha, solid);
+            out[k] = px;
+        }
+    }
+}
+
+/*
+ * OVER-composite a 32-bit source image onto a 32-bit destination by
+ * running the unmasked OVER combiner on one scanline at a time.
+ */
+static void
+vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    int dst_stride, src_stride;
+    uint32_t *dst_row;
+    uint32_t *src_row;
+
+    PIXMAN_IMAGE_GET_LINE (
+        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (
+        src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);
+
+    while (height--)
+    {
+        /* NULL mask: plain source-over-destination for this row. */
+        vmx_combine_over_u (imp, op, dst_row, src_row, NULL, width);
+
+        dst_row += dst_stride;
+        src_row += src_stride;
+    }
+}
+
+/*
+ * Component-alpha OVER of a solid source through a 32-bit mask onto
+ * 32-bit pixels.  Per channel:
+ *
+ *     dest = src * mask + dest * ~(mask * alpha (src))
+ *
+ * Each scanline is handled as scalar head, aligned vector body and scalar
+ * tail; pixels (or groups of four) whose mask is entirely zero are skipped.
+ */
+static void
+vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t solid, solid_alpha;
+    uint32_t *dst_row, d;
+    uint32_t *mask_row, m;
+    uint32_t mask_all_zero;
+    int dst_stride, mask_stride;
+
+    vector unsigned int vsolid, valpha, vmask, vdest;
+
+    solid = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    /* A fully transparent source leaves the destination as it is. */
+    if (solid == 0)
+        return;
+
+    PIXMAN_IMAGE_GET_LINE (
+        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (
+        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_row, 1);
+
+    solid_alpha = ALPHA_8 (solid);
+    vsolid = (vector unsigned int) {solid, solid, solid, solid};
+    valpha = splat_alpha (vsolid);
+
+    while (height--)
+    {
+        int left;
+        const uint32_t *pm = (uint32_t *) mask_row;
+        uint32_t *pd = (uint32_t *) dst_row;
+        uint32_t s;
+
+        dst_row += dst_stride;
+        mask_row += mask_stride;
+
+        /* Scalar head: run until the destination is 16-byte aligned. */
+        for (left = width; left && ((uintptr_t) pd & 15); left--, pd++)
+        {
+            m = *pm++;
+            if (m == 0)
+                continue;
+
+            s = solid;
+            d = *pd;
+            UN8x4_MUL_UN8x4 (s, m);
+            UN8x4_MUL_UN8 (m, solid_alpha);
+            m = ~m;
+            UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
+            *pd = d;
+        }
+
+        /* Vector body: four pixels per step. */
+        for (; left >= 4; left -= 4, pd += 4, pm += 4)
+        {
+            /* pm is NOT necessarily 16-byte aligned */
+            vmask = load_128_unaligned (pm);
+
+            /* non-zero exactly when all four mask pixels are zero */
+            mask_all_zero = vec_all_eq (vmask, (vector unsigned int) AVV (0));
+
+            if (mask_all_zero != 0)
+                continue;
+
+            /* pd is 16-byte aligned */
+            vdest = in_over (vsolid, valpha, vmask, load_128_aligned (pd));
+            save_128_aligned (pd, vdest);
+        }
+
+        /* Scalar tail: fewer than four pixels remain. */
+        for (; left; left--, pd++)
+        {
+            m = *pm++;
+            if (m == 0)
+                continue;
+
+            s = solid;
+            d = *pd;
+            UN8x4_MUL_UN8x4 (s, m);
+            UN8x4_MUL_UN8 (m, solid_alpha);
+            m = ~m;
+            UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
+            *pd = d;
+        }
+    }
+}
+
+/*
+ * Saturating ADD of an a8 source onto an a8 destination.
+ *
+ * Each scanline has a scalar head that runs until the destination is
+ * 4-byte aligned, a bulk part handed to the 32-bit ADD combiner with four
+ * a8 pixels packed into each 32-bit word, and a scalar tail of up to
+ * three pixels.
+ */
+static void
+vmx_composite_add_8_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_line, *dst;
+    uint8_t *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (
+        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+        dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+        dst = dst_line;
+        src = src_line;
+
+        dst_line += dst_stride;
+        src_line += src_stride;
+        w = width;
+
+        /* Small head */
+        while (w && (uintptr_t)dst & 3)
+        {
+            /* t is at most 510; on overflow (t >> 8) is 1, so the OR sets
+             * every bit and the byte store keeps 0xff.
+             */
+            t = (*dst) + (*src++);
+            *dst++ = t | (0 - (t >> 8));
+            w--;
+        }
+
+        /* Bulk: w >> 2 words, i.e. the largest multiple of four pixels. */
+        vmx_combine_add_u (imp, op,
+                           (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+        /* Small tail: skip exactly the pixels consumed above.  Masking
+         * with ~3 (not 0xfffc) keeps the advance right for any width.
+         */
+        dst += w & ~3;
+        src += w & ~3;
+
+        w &= 3;
+
+        while (w)
+        {
+            t = (*dst) + (*src++);
+            *dst++ = t | (0 - (t >> 8));
+            w--;
+        }
+    }
+}
+
+/*
+ * ADD a 32-bit source image onto a 32-bit destination by running the
+ * unmasked ADD combiner on one scanline at a time.
+ */
+static void
+vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_row;
+    uint32_t *src_row;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (
+        src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);
+    PIXMAN_IMAGE_GET_LINE (
+        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+
+    while (height--)
+    {
+        /* NULL mask: plain source-plus-destination for this row. */
+        vmx_combine_add_u (imp, op, dst_row, src_row, NULL, width);
+
+        dst_row += dst_stride;
+        src_row += src_stride;
+    }
+}
+
+/*
+ * Nearest-neighbour scaled OVER for one scanline of 32-bit pixels.
+ *
+ * pd               destination pixels
+ * ps               source row base, indexed by pixman_fixed_to_int (vx)
+ * w                number of destination pixels to produce
+ * vx               fixed-point source position of the first pixel
+ * unit_x           fixed-point step added to vx for every destination pixel
+ * src_width_fixed  subtracted from vx for as long as vx is non-negative
+ * fully_transparent_src  when set, the scanline is left untouched
+ *
+ * NOTE(review): vx is kept negative after every step, so ps presumably
+ * points just past the end of the source row -- confirm against the
+ * FAST_NEAREST_MAINLOOP caller.
+ */
+static force_inline void
+scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t*       pd,
+                                            const uint32_t* ps,
+                                            int32_t         w,
+                                            pixman_fixed_t  vx,
+                                            pixman_fixed_t  unit_x,
+                                            pixman_fixed_t  src_width_fixed,
+                                            pixman_bool_t   fully_transparent_src)
+{
+    /* This path never has a mask; the combine helpers take NULL for it. */
+    const uint32_t* const no_mask = NULL;
+    uint32_t s, d;
+    vector unsigned int vsrc, vdst;
+
+    if (fully_transparent_src)
+        return;
+
+    /* Align dst on a 16-byte boundary */
+    for (; w && ((uintptr_t) pd & 15); w--)
+    {
+        d = *pd;
+        s = combine1 (ps + pixman_fixed_to_int (vx), no_mask);
+
+        vx += unit_x;
+        while (vx >= 0)
+            vx -= src_width_fixed;
+
+        *pd++ = core_combine_over_u_pixel_vmx (s, d);
+    }
+
+    /* Vector body: gather four source pixels, then composite them at once. */
+    for (; w >= 4; w -= 4, pd += 4)
+    {
+        uint32_t texel[4];
+        vector unsigned int gathered;
+        int lane;
+
+        for (lane = 0; lane < 4; lane++)
+        {
+            texel[lane] = *(ps + pixman_fixed_to_int (vx));
+
+            vx += unit_x;
+            while (vx >= 0)
+                vx -= src_width_fixed;
+        }
+
+        gathered = (vector unsigned int) {texel[0], texel[1], texel[2], texel[3]};
+        vsrc = combine4 ((const uint32_t *) &gathered, no_mask);
+
+        if (is_opaque (vsrc))
+        {
+            /* Four opaque pixels replace the destination outright. */
+            save_128_aligned (pd, vsrc);
+        }
+        else if (!is_zero (vsrc))
+        {
+            vdst = over (vsrc, splat_alpha (vsrc), load_128_aligned (pd));
+            save_128_aligned (pd, vdst);
+        }
+    }
+
+    /* Scalar tail: fewer than four pixels remain. */
+    for (; w; w--)
+    {
+        d = *pd;
+        s = combine1 (ps + pixman_fixed_to_int (vx), no_mask);
+
+        vx += unit_x;
+        while (vx >= 0)
+            vx -= src_width_fixed;
+
+        *pd++ = core_combine_over_u_pixel_vmx (s, d);
+    }
+}
+
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER, /* COVER variant; macro is not defined in this chunk, presumably pixman-inlines.h -- confirm */
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER, /* NONE variant of the same scanline function */
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER, /* PAD variant of the same scanline function */
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER, /* NORMAL variant of the same scanline function */
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, NORMAL)
+
static const pixman_fast_path_t vmx_fast_paths[] = /* passed to _pixman_implementation_create () when the VMX implementation is created */
{
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, vmx_composite_over_n_8888), /* PIXMAN_OP_OVER entries start here */
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, vmx_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888), /* solid source through an a8 mask */
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca), /* solid source through a 32-bit component-alpha mask */
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca),
+
+ /* PIXMAN_OP_ADD: unmasked a8 and 32-bit additions */
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888),
+
+ /* PIXMAN_OP_SRC: copy an x888 source to an 8888 destination */
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),
+
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, vmx_8888_8888), /* nearest-neighbour scaled OVER; name matches the FAST_NEAREST_MAINLOOP instantiations */
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888),
+
{ PIXMAN_OP_NONE }, /* terminating entry */
};
+/*
+ * Iterator fetch for x8r8g8b8 images: copies iter->width pixels from the
+ * current row into iter->buffer with the top byte of every pixel forced to
+ * 0xff, then steps iter->bits to the next row.  The mask argument is not
+ * used.  Returns iter->buffer.
+ */
+static uint32_t *
+vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    vector unsigned int opaque = mask_ff000000;
+    uint32_t *in = (uint32_t *) iter->bits;
+    uint32_t *out = iter->buffer;
+    int left;
+
+    iter->bits += iter->stride;
+
+    /* Scalar head: run until the output buffer is 16-byte aligned. */
+    for (left = iter->width; left && (((uintptr_t) out) & 0x0f); left--)
+        *out++ = (*in++) | 0xff000000;
+
+    /* Vector body: four pixels per step; the source may be unaligned. */
+    for (; left >= 4; left -= 4, in += 4, out += 4)
+        save_128_aligned (out, vec_or (load_128_unaligned (in), opaque));
+
+    /* Scalar tail: fewer than four pixels remain. */
+    for (; left; left--)
+        *out++ = (*in++) | 0xff000000;
+
+    return iter->buffer;
+}
+
+/*
+ * Iterator fetch for a8 images: expands iter->width alpha bytes from the
+ * current row into 32-bit pixels in iter->buffer, with the alpha byte in
+ * bits 31..24 and the remaining bits zero, then steps iter->bits to the
+ * next row.  The mask argument is not used.  Returns iter->buffer.
+ */
+static uint32_t *
+vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint8_t *src = iter->bits;
+    vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6;
+
+    iter->bits += iter->stride;
+
+    /* Scalar head: run until the output buffer is 16-byte aligned. */
+    while (w && (((uintptr_t)dst) & 15))
+    {
+        /* Widen before shifting: a promoted byte >= 0x80 shifted left by
+         * 24 would overflow int.
+         */
+        *dst++ = (uint32_t) *(src++) << 24;
+        w--;
+    }
+
+    /* Vector body: 16 alpha bytes become 16 pixels per step. */
+    while (w >= 16)
+    {
+        vmx0 = load_128_unaligned((uint32_t *) src);
+
+        /* Interleave with zero twice, widening 8 -> 16 -> 32 bits. */
+        unpack_128_2x128((vector unsigned int) AVV(0), vmx0, &vmx1, &vmx2);
+        unpack_128_2x128_16((vector unsigned int) AVV(0), vmx1, &vmx3, &vmx4);
+        unpack_128_2x128_16((vector unsigned int) AVV(0), vmx2, &vmx5, &vmx6);
+
+        /* NOTE(review): the results are stored in reverse order
+         * (vmx6 first); this presumably matches how the unpack helpers
+         * split their input -- confirm against their definitions.
+         */
+        save_128_aligned(dst, vmx6);
+        save_128_aligned((dst + 4), vmx5);
+        save_128_aligned((dst + 8), vmx4);
+        save_128_aligned((dst + 12), vmx3);
+
+        dst += 16;
+        src += 16;
+        w -= 16;
+    }
+
+    /* Scalar tail: fewer than 16 pixels remain. */
+    while (w)
+    {
+        *dst++ = (uint32_t) *(src++) << 24;
+        w--;
+    }
+
+    return iter->buffer;
+}
+
+#define IMAGE_FLAGS /* image flags used by every vmx_iters entry below */ \
+ (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
+ FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+
+static const pixman_iter_info_t vmx_iters[] = /* installed as imp->iter_info by _pixman_implementation_create_vmx () */
+{
+ { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW, /* x8r8g8b8 rows are fetched with the top byte forced to 0xff */
+ _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
+ },
+ { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW, /* a8 rows are expanded to 32-bit pixels */
+ _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL
+ },
+ { PIXMAN_null }, /* terminating entry */
+};
+
pixman_implementation_t *
_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
{
pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);
+ /* VMX constants */
+ mask_ff000000 = create_mask_32_128 (0xff000000);
+ mask_red = create_mask_32_128 (0x00f80000);
+ mask_green = create_mask_32_128 (0x0000fc00);
+ mask_blue = create_mask_32_128 (0x000000f8);
+ mask_565_fix_rb = create_mask_32_128 (0x00e000e0);
+ mask_565_fix_g = create_mask_32_128 (0x0000c000);
+
/* Set up function pointers */
imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
@@ -1643,5 +3151,9 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
+ imp->fill = vmx_fill;
+
+ imp->iter_info = vmx_iters;
+
return imp;
}