diff options
Diffstat (limited to 'media/libtheora/lib')
55 files changed, 0 insertions, 16769 deletions
diff --git a/media/libtheora/lib/apiwrapper.c b/media/libtheora/lib/apiwrapper.c deleted file mode 100644 index dc959b8d1..000000000 --- a/media/libtheora/lib/apiwrapper.c +++ /dev/null @@ -1,166 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: apiwrapper.c 16503 2009-08-22 18:14:02Z giles $ - - ********************************************************************/ - -#include <stdlib.h> -#include <string.h> -#include <limits.h> -#include "apiwrapper.h" - - - -const char *theora_version_string(void){ - return th_version_string(); -} - -ogg_uint32_t theora_version_number(void){ - return th_version_number(); -} - -void theora_info_init(theora_info *_ci){ - memset(_ci,0,sizeof(*_ci)); -} - -void theora_info_clear(theora_info *_ci){ - th_api_wrapper *api; - api=(th_api_wrapper *)_ci->codec_setup; - memset(_ci,0,sizeof(*_ci)); - if(api!=NULL){ - if(api->clear!=NULL)(*api->clear)(api); - _ogg_free(api); - } -} - -void theora_clear(theora_state *_th){ - /*Provide compatibility with mixed encoder and decoder shared lib versions.*/ - if(_th->internal_decode!=NULL){ - (*((oc_state_dispatch_vtable *)_th->internal_decode)->clear)(_th); - } - if(_th->internal_encode!=NULL){ - (*((oc_state_dispatch_vtable *)_th->internal_encode)->clear)(_th); - } - if(_th->i!=NULL)theora_info_clear(_th->i); - memset(_th,0,sizeof(*_th)); -} - -int theora_control(theora_state *_th,int _req,void *_buf,size_t _buf_sz){ - /*Provide compatibility with mixed 
encoder and decoder shared lib versions.*/ - if(_th->internal_decode!=NULL){ - return (*((oc_state_dispatch_vtable *)_th->internal_decode)->control)(_th, - _req,_buf,_buf_sz); - } - else if(_th->internal_encode!=NULL){ - return (*((oc_state_dispatch_vtable *)_th->internal_encode)->control)(_th, - _req,_buf,_buf_sz); - } - else return TH_EINVAL; -} - -ogg_int64_t theora_granule_frame(theora_state *_th,ogg_int64_t _gp){ - /*Provide compatibility with mixed encoder and decoder shared lib versions.*/ - if(_th->internal_decode!=NULL){ - return (*((oc_state_dispatch_vtable *)_th->internal_decode)->granule_frame)( - _th,_gp); - } - else if(_th->internal_encode!=NULL){ - return (*((oc_state_dispatch_vtable *)_th->internal_encode)->granule_frame)( - _th,_gp); - } - else return -1; -} - -double theora_granule_time(theora_state *_th, ogg_int64_t _gp){ - /*Provide compatibility with mixed encoder and decoder shared lib versions.*/ - if(_th->internal_decode!=NULL){ - return (*((oc_state_dispatch_vtable *)_th->internal_decode)->granule_time)( - _th,_gp); - } - else if(_th->internal_encode!=NULL){ - return (*((oc_state_dispatch_vtable *)_th->internal_encode)->granule_time)( - _th,_gp); - } - else return -1; -} - -void oc_theora_info2th_info(th_info *_info,const theora_info *_ci){ - _info->version_major=_ci->version_major; - _info->version_minor=_ci->version_minor; - _info->version_subminor=_ci->version_subminor; - _info->frame_width=_ci->width; - _info->frame_height=_ci->height; - _info->pic_width=_ci->frame_width; - _info->pic_height=_ci->frame_height; - _info->pic_x=_ci->offset_x; - _info->pic_y=_ci->offset_y; - _info->fps_numerator=_ci->fps_numerator; - _info->fps_denominator=_ci->fps_denominator; - _info->aspect_numerator=_ci->aspect_numerator; - _info->aspect_denominator=_ci->aspect_denominator; - switch(_ci->colorspace){ - case OC_CS_ITU_REC_470M:_info->colorspace=TH_CS_ITU_REC_470M;break; - case OC_CS_ITU_REC_470BG:_info->colorspace=TH_CS_ITU_REC_470BG;break; - 
default:_info->colorspace=TH_CS_UNSPECIFIED;break; - } - switch(_ci->pixelformat){ - case OC_PF_420:_info->pixel_fmt=TH_PF_420;break; - case OC_PF_422:_info->pixel_fmt=TH_PF_422;break; - case OC_PF_444:_info->pixel_fmt=TH_PF_444;break; - default:_info->pixel_fmt=TH_PF_RSVD; - } - _info->target_bitrate=_ci->target_bitrate; - _info->quality=_ci->quality; - _info->keyframe_granule_shift=_ci->keyframe_frequency_force>0? - OC_MINI(31,oc_ilog(_ci->keyframe_frequency_force-1)):0; -} - -int theora_packet_isheader(ogg_packet *_op){ - return th_packet_isheader(_op); -} - -int theora_packet_iskeyframe(ogg_packet *_op){ - return th_packet_iskeyframe(_op); -} - -int theora_granule_shift(theora_info *_ci){ - /*This breaks when keyframe_frequency_force is not positive or is larger than - 2**31 (if your int is more than 32 bits), but that's what the original - function does.*/ - return oc_ilog(_ci->keyframe_frequency_force-1); -} - -void theora_comment_init(theora_comment *_tc){ - th_comment_init((th_comment *)_tc); -} - -char *theora_comment_query(theora_comment *_tc,char *_tag,int _count){ - return th_comment_query((th_comment *)_tc,_tag,_count); -} - -int theora_comment_query_count(theora_comment *_tc,char *_tag){ - return th_comment_query_count((th_comment *)_tc,_tag); -} - -void theora_comment_clear(theora_comment *_tc){ - th_comment_clear((th_comment *)_tc); -} - -void theora_comment_add(theora_comment *_tc,char *_comment){ - th_comment_add((th_comment *)_tc,_comment); -} - -void theora_comment_add_tag(theora_comment *_tc, char *_tag, char *_value){ - th_comment_add_tag((th_comment *)_tc,_tag,_value); -} diff --git a/media/libtheora/lib/apiwrapper.h b/media/libtheora/lib/apiwrapper.h deleted file mode 100644 index ff45e0a4d..000000000 --- a/media/libtheora/lib/apiwrapper.h +++ /dev/null @@ -1,54 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. 
* - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: apiwrapper.h 13596 2007-08-23 20:05:38Z tterribe $ - - ********************************************************************/ - -#if !defined(_apiwrapper_H) -# define _apiwrapper_H (1) -# include <ogg/ogg.h> -# include <theora/theora.h> -# include "theora/theoradec.h" -# include "theora/theoraenc.h" -# include "state.h" - -typedef struct th_api_wrapper th_api_wrapper; -typedef struct th_api_info th_api_info; - -/*Provide an entry point for the codec setup to clear itself in case we ever - want to break pieces off into a common base library shared by encoder and - decoder. - In addition, this makes several other pieces of the API wrapper cleaner.*/ -typedef void (*oc_setup_clear_func)(void *_ts); - -/*Generally only one of these pointers will be non-NULL in any given instance. 
- Technically we do not even really need this struct, since we should be able - to figure out which one from "context", but doing it this way makes sure we - don't flub it up.*/ -struct th_api_wrapper{ - oc_setup_clear_func clear; - th_setup_info *setup; - th_dec_ctx *decode; - th_enc_ctx *encode; -}; - -struct th_api_info{ - th_api_wrapper api; - theora_info info; -}; - - -void oc_theora_info2th_info(th_info *_info,const theora_info *_ci); - -#endif diff --git a/media/libtheora/lib/arm/arm2gnu.pl b/media/libtheora/lib/arm/arm2gnu.pl deleted file mode 100644 index 5831bd81e..000000000 --- a/media/libtheora/lib/arm/arm2gnu.pl +++ /dev/null @@ -1,281 +0,0 @@ -#!/usr/bin/perl - -my $bigend; # little/big endian -my $nxstack; - -$nxstack = 0; - -eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}' - if $running_under_some_shell; - -while ($ARGV[0] =~ /^-/) { - $_ = shift; - last if /^--/; - if (/^-n/) { - $nflag++; - next; - } - die "I don't recognize this switch: $_\\n"; -} -$printit++ unless $nflag; - -$\ = "\n"; # automatically add newline on print -$n=0; - -$thumb = 0; # ARM mode by default, not Thumb. - -LINE: -while (<>) { - - # For ADRLs we need to add a new line after the substituted one. - $addPadding = 0; - - # First, we do not dare to touch *anything* inside double quotes, do we? - # Second, if you want a dollar character in the string, - # insert two of them -- that's how ARM C and assembler treat strings. - s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1: .ascii \"/ && do { s/\$\$/\$/g; next }; - s/\bDCB\b[ \t]*\"/.ascii \"/ && do { s/\$\$/\$/g; next }; - s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/ && do { s/\$\$/\$/g; next }; - # If there's nothing on a line but a comment, don't try to apply any further - # substitutions (this is a cheap hack to avoid mucking up the license header) - s/^([ \t]*);/$1@/ && do { s/\$\$/\$/g; next }; - # If substituted -- leave immediately ! 
- - s/@/,:/; - s/;/@/; - while ( /@.*'/ ) { - s/(@.*)'/$1/g; - } - s/\{FALSE\}/0/g; - s/\{TRUE\}/1/g; - s/\{(\w\w\w\w+)\}/$1/g; - s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/; - s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/; - s/\bIMPORT\b/.extern/; - s/\bEXPORT\b/.global/; - s/^(\s+)\[/$1IF/; - s/^(\s+)\|/$1ELSE/; - s/^(\s+)\]/$1ENDIF/; - s/IF *:DEF:/ .ifdef/; - s/IF *:LNOT: *:DEF:/ .ifndef/; - s/ELSE/ .else/; - s/ENDIF/ .endif/; - - if( /\bIF\b/ ) { - s/\bIF\b/ .if/; - s/=/==/; - } - if ( $n == 2) { - s/\$/\\/g; - } - if ($n == 1) { - s/\$//g; - s/label//g; - $n = 2; - } - if ( /MACRO/ ) { - s/MACRO *\n/.macro/; - $n=1; - } - if ( /\bMEND\b/ ) { - s/\bMEND\b/.endm/; - $n=0; - } - - # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there. - # - if ( /\bAREA\b/ ) { - if ( /CODE/ ) { - $nxstack = 1; - } - s/^(.+)CODE(.+)READONLY(.*)/ .text/; - s/^(.+)DATA(.+)READONLY(.*)/ .section .rdata\n .align 2/; - s/^(.+)\|\|\.data\|\|(.+)/ .data\n .align 2/; - s/^(.+)\|\|\.bss\|\|(.+)/ .bss/; - } - - s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/; # ||.constdata$3|| - s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/; # ||.bss$2|| - s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/; # ||.data$2|| - s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/; - s/^(\s+)\%(\s)/ .space $1/; - - s/\|(.+)\.(\d+)\|/\.$1_$2/; # |L80.123| -> .L80_123 - s/\bCODE32\b/.code 32/ && do {$thumb = 0}; - s/\bCODE16\b/.code 16/ && do {$thumb = 1}; - if (/\bPROC\b/) - { - print " .thumb_func" if ($thumb); - s/\bPROC\b/@ $&/; - } - s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/; - s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/; - s/\bENDP\b/@ $&/; - s/\bSUBT\b/@ $&/; - s/\bDATA\b/@ $&/; # DATA directive is deprecated -- Asm guide, p.7-25 - s/\bKEEP\b/@ $&/; - s/\bEXPORTAS\b/@ $&/; - s/\|\|(.)+\bEQU\b/@ $&/; - s/\|\|([\w\$]+)\|\|/$1/; - s/\bENTRY\b/@ $&/; - s/\bASSERT\b/@ $&/; - s/\bGBLL\b/@ $&/; - s/\bGBLA\b/@ $&/; - s/^\W+OPT\b/@ $&/; - s/:OR:/|/g; - s/:SHL:/<</g; - s/:SHR:/>>/g; - s/:AND:/&/g; - 
s/:LAND:/&&/g; - s/CPSR/cpsr/; - s/SPSR/spsr/; - s/ALIGN$/.balign 4/; - s/ALIGN\s+([0-9x]+)$/.balign $1/; - s/psr_cxsf/psr_all/; - s/LTORG/.ltorg/; - s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/; - s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/; - s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/; - s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/; - - # {PC} + 0xdeadfeed --> . + 0xdeadfeed - s/\{PC\} \+/ \. +/; - - # Single hex constant on the line ! - # - # >>> NOTE <<< - # Double-precision floats in gcc are always mixed-endian, which means - # bytes in two words are little-endian, but words are big-endian. - # So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address - # and 0xfeed0000 at high address. - # - s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/; - # Only decimal constants on the line, no hex ! - s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/; - - # Single hex constant on the line ! -# s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/; - # Only decimal constants on the line, no hex ! 
-# s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/; - s/\bDCFS[ \t]+0x/.word 0x/; - s/\bDCFS\b/.float/; - - s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/; - s/\bDCD\b/.word/; - s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/; - s/\bDCW\b/.short/; - s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/; - s/\bDCB\b/.byte/; - s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/; - s/^[A-Za-z_\.]\w+/$&:/; - s/^(\d+)/$1:/; - s/\%(\d+)/$1b_or_f/; - s/\%[Bb](\d+)/$1b/; - s/\%[Ff](\d+)/$1f/; - s/\%[Ff][Tt](\d+)/$1f/; - s/&([\dA-Fa-f]+)/0x$1/; - if ( /\b2_[01]+\b/ ) { - s/\b2_([01]+)\b/conv$1&&&&/g; - while ( /[01][01][01][01]&&&&/ ) { - s/0000&&&&/&&&&0/g; - s/0001&&&&/&&&&1/g; - s/0010&&&&/&&&&2/g; - s/0011&&&&/&&&&3/g; - s/0100&&&&/&&&&4/g; - s/0101&&&&/&&&&5/g; - s/0110&&&&/&&&&6/g; - s/0111&&&&/&&&&7/g; - s/1000&&&&/&&&&8/g; - s/1001&&&&/&&&&9/g; - s/1010&&&&/&&&&A/g; - s/1011&&&&/&&&&B/g; - s/1100&&&&/&&&&C/g; - s/1101&&&&/&&&&D/g; - s/1110&&&&/&&&&E/g; - s/1111&&&&/&&&&F/g; - } - s/000&&&&/&&&&0/g; - s/001&&&&/&&&&1/g; - s/010&&&&/&&&&2/g; - s/011&&&&/&&&&3/g; - s/100&&&&/&&&&4/g; - s/101&&&&/&&&&5/g; - s/110&&&&/&&&&6/g; - s/111&&&&/&&&&7/g; - s/00&&&&/&&&&0/g; - s/01&&&&/&&&&1/g; - s/10&&&&/&&&&2/g; - s/11&&&&/&&&&3/g; - s/0&&&&/&&&&0/g; - s/1&&&&/&&&&1/g; - s/conv&&&&/0x/g; - } - - if ( /commandline/) - { - if( /-bigend/) - { - $bigend=1; - } - } - - if ( /\bDCDU\b/ ) - { - my $cmd=$_; - my $value; - my $w1; - my $w2; - my $w3; - my $w4; - - s/\s+DCDU\b/@ $&/; - - $cmd =~ /\bDCDU\b\s+0x(\d+)/; - $value = $1; - $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/; - $w1 = $1; - $w2 = $2; - $w3 = $3; - $w4 = $4; - - if( $bigend ne "") - { - # big endian - - print " .byte 0x".$w1; - print " .byte 0x".$w2; - print " .byte 0x".$w3; - print " .byte 0x".$w4; - } - else - { - # little endian - - print " .byte 0x".$w4; - print " .byte 0x".$w3; - print " .byte 0x".$w2; - print " .byte 0x".$w1; - } - - } - - - if ( /\badrl\b/i ) - { - s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i; - $addPadding = 1; - } - s/\bEND\b/@ END/; -} continue { - printf 
("%s", $_) if $printit; - if ($addPadding != 0) - { - printf (" mov r0,r0\n"); - $addPadding = 0; - } -} -#If we had a code section, mark that this object doesn't need an executable -# stack. -if ($nxstack) { - printf (" .section\t.note.GNU-stack,\"\",\%\%progbits\n"); -} diff --git a/media/libtheora/lib/arm/armbits.h b/media/libtheora/lib/arm/armbits.h deleted file mode 100644 index 1540d7eb5..000000000 --- a/media/libtheora/lib/arm/armbits.h +++ /dev/null @@ -1,32 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $ - - ********************************************************************/ -#if !defined(_arm_armbits_H) -# define _arm_armbits_H (1) -# include "../bitpack.h" -# include "armcpu.h" - -# if defined(OC_ARM_ASM) -# define oc_pack_read oc_pack_read_arm -# define oc_pack_read1 oc_pack_read1_arm -# define oc_huff_token_decode oc_huff_token_decode_arm -# endif - -long oc_pack_read_arm(oc_pack_buf *_b,int _bits); -int oc_pack_read1_arm(oc_pack_buf *_b); -int oc_huff_token_decode_arm(oc_pack_buf *_b,const ogg_int16_t *_tree); - -#endif diff --git a/media/libtheora/lib/arm/armbits.s b/media/libtheora/lib/arm/armbits.s deleted file mode 100644 index 0fdb6fdd3..000000000 --- a/media/libtheora/lib/arm/armbits.s +++ /dev/null @@ -1,236 +0,0 @@ -;******************************************************************** -;* * -;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. 
* -;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * -;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * -;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * -;* * -;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * -;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ * -;* * -;******************************************************************** -; -; function: -; last mod: $Id: armbits.s 17481 2010-10-03 22:49:42Z tterribe $ -; -;******************************************************************** - - AREA |.text|, CODE, READONLY - - ; Explicitly specifying alignment here because some versions of - ; gas don't align code correctly. See - ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html - ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992 - ALIGN - - EXPORT oc_pack_read_arm - EXPORT oc_pack_read1_arm - EXPORT oc_huff_token_decode_arm - -oc_pack_read1_arm PROC - ; r0 = oc_pack_buf *_b - ADD r12,r0,#8 - LDMIA r12,{r2,r3} ; r2 = window - ; Stall... ; r3 = available - ; Stall... - SUBS r3,r3,#1 ; r3 = available-1, available<1 => LT - BLT oc_pack_read1_refill - MOV r0,r2,LSR #31 ; r0 = window>>31 - MOV r2,r2,LSL #1 ; r2 = window<<=1 - STMIA r12,{r2,r3} ; window = r2 - ; available = r3 - MOV PC,r14 - ENDP - -oc_pack_read_arm PROC - ; r0 = oc_pack_buf *_b - ; r1 = int _bits - ADD r12,r0,#8 - LDMIA r12,{r2,r3} ; r2 = window - ; Stall... ; r3 = available - ; Stall... - SUBS r3,r3,r1 ; r3 = available-_bits, available<_bits => LT - BLT oc_pack_read_refill - RSB r0,r1,#32 ; r0 = 32-_bits - MOV r0,r2,LSR r0 ; r0 = window>>32-_bits - MOV r2,r2,LSL r1 ; r2 = window<<=_bits - STMIA r12,{r2,r3} ; window = r2 - ; available = r3 - MOV PC,r14 - -; We need to refill window. 
-oc_pack_read1_refill - MOV r1,#1 -oc_pack_read_refill - STMFD r13!,{r10,r11,r14} - LDMIA r0,{r10,r11} ; r10 = stop - ; r11 = ptr - RSB r0,r1,#32 ; r0 = 32-_bits - RSB r3,r3,r0 ; r3 = 32-available -; We can use unsigned compares for both the pointers and for available -; (allowing us to chain condition codes) because available will never be -; larger than 32 (or we wouldn't be here), and thus 32-available will never be -; negative. - CMP r10,r11 ; ptr<stop => HI - CMPHI r3,#7 ; available<=24 => HI - LDRHIB r14,[r11],#1 ; r14 = *ptr++ - SUBHI r3,#8 ; available += 8 - ; (HI) Stall... - ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available - CMPHI r10,r11 ; ptr<stop => HI - CMPHI r3,#7 ; available<=24 => HI - LDRHIB r14,[r11],#1 ; r14 = *ptr++ - SUBHI r3,#8 ; available += 8 - ; (HI) Stall... - ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available - CMPHI r10,r11 ; ptr<stop => HI - CMPHI r3,#7 ; available<=24 => HI - LDRHIB r14,[r11],#1 ; r14 = *ptr++ - SUBHI r3,#8 ; available += 8 - ; (HI) Stall... - ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available - CMPHI r10,r11 ; ptr<stop => HI - CMPHI r3,#7 ; available<=24 => HI - LDRHIB r14,[r11],#1 ; r14 = *ptr++ - SUBHI r3,#8 ; available += 8 - ; (HI) Stall... - ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available - SUBS r3,r0,r3 ; r3 = available-=_bits, available<bits => GT - BLT oc_pack_read_refill_last - MOV r0,r2,LSR r0 ; r0 = window>>32-_bits - MOV r2,r2,LSL r1 ; r2 = window<<=_bits - STR r11,[r12,#-4] ; ptr = r11 - STMIA r12,{r2,r3} ; window = r2 - ; available = r3 - LDMFD r13!,{r10,r11,PC} - -; Either we wanted to read more than 24 bits and didn't have enough room to -; stuff the last byte into the window, or we hit the end of the packet. -oc_pack_read_refill_last - CMP r11,r10 ; ptr<stop => LO -; If we didn't hit the end of the packet, then pull enough of the next byte to -; to fill up the window. - LDRLOB r14,[r11] ; (LO) r14 = *ptr -; Otherwise, set the EOF flag and pretend we have lots of available bits. 
- MOVHS r14,#1 ; (HS) r14 = 1 - ADDLO r10,r3,r1 ; (LO) r10 = available - STRHS r14,[r12,#8] ; (HS) eof = 1 - ANDLO r10,r10,#7 ; (LO) r10 = available&7 - MOVHS r3,#1<<30 ; (HS) available = OC_LOTS_OF_BITS - ORRLO r2,r14,LSL r10 ; (LO) r2 = window|=*ptr>>(available&7) - MOV r0,r2,LSR r0 ; r0 = window>>32-_bits - MOV r2,r2,LSL r1 ; r2 = window<<=_bits - STR r11,[r12,#-4] ; ptr = r11 - STMIA r12,{r2,r3} ; window = r2 - ; available = r3 - LDMFD r13!,{r10,r11,PC} - ENDP - - - -oc_huff_token_decode_arm PROC - ; r0 = oc_pack_buf *_b - ; r1 = const ogg_int16_t *_tree - STMFD r13!,{r4,r5,r10,r14} - LDRSH r10,[r1] ; r10 = n=_tree[0] - LDMIA r0,{r2-r5} ; r2 = stop - ; Stall... ; r3 = ptr - ; Stall... ; r4 = window - ; r5 = available - CMP r10,r5 ; n>available => GT - BGT oc_huff_token_decode_refill0 - RSB r14,r10,#32 ; r14 = 32-n - MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n - ADD r14,r1,r14,LSL #1 ; r14 = _tree+bits - LDRSH r12,[r14,#2] ; r12 = node=_tree[1+bits] - ; Stall... - ; Stall... - RSBS r14,r12,#0 ; r14 = -node, node>0 => MI - BMI oc_huff_token_decode_continue - MOV r10,r14,LSR #8 ; r10 = n=node>>8 - MOV r4,r4,LSL r10 ; r4 = window<<=n - SUB r5,r10 ; r5 = available-=n - STMIB r0,{r3-r5} ; ptr = r3 - ; window = r4 - ; available = r5 - AND r0,r14,#255 ; r0 = node&255 - LDMFD r13!,{r4,r5,r10,pc} - -; The first tree node wasn't enough to reach a leaf, read another -oc_huff_token_decode_continue - ADD r12,r1,r12,LSL #1 ; r12 = _tree+node - MOV r4,r4,LSL r10 ; r4 = window<<=n - SUB r5,r5,r10 ; r5 = available-=n - LDRSH r10,[r12],#2 ; r10 = n=_tree[node] - ; Stall... ; r12 = _tree+node+1 - ; Stall... - CMP r10,r5 ; n>available => GT - BGT oc_huff_token_decode_refill - RSB r14,r10,#32 ; r14 = 32-n - MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n - ADD r12,r12,r14 ; - LDRSH r12,[r12,r14] ; r12 = node=_tree[node+1+bits] - ; Stall... - ; Stall... 
- RSBS r14,r12,#0 ; r14 = -node, node>0 => MI - BMI oc_huff_token_decode_continue - MOV r10,r14,LSR #8 ; r10 = n=node>>8 - MOV r4,r4,LSL r10 ; r4 = window<<=n - SUB r5,r10 ; r5 = available-=n - STMIB r0,{r3-r5} ; ptr = r3 - ; window = r4 - ; available = r5 - AND r0,r14,#255 ; r0 = node&255 - LDMFD r13!,{r4,r5,r10,pc} - -oc_huff_token_decode_refill0 - ADD r12,r1,#2 ; r12 = _tree+1 -oc_huff_token_decode_refill -; We can't possibly need more than 15 bits, so available must be <= 15. -; Therefore we can load at least two bytes without checking it. - CMP r2,r3 ; ptr<stop => HI - LDRHIB r14,[r3],#1 ; r14 = *ptr++ - RSBHI r5,r5,#24 ; (HI) available = 32-(available+=8) - RSBLS r5,r5,#32 ; (LS) r5 = 32-available - ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available - CMPHI r2,r3 ; ptr<stop => HI - LDRHIB r14,[r3],#1 ; r14 = *ptr++ - SUBHI r5,#8 ; available += 8 - ; (HI) Stall... - ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available -; We can use unsigned compares for both the pointers and for available -; (allowing us to chain condition codes) because available will never be -; larger than 32 (or we wouldn't be here), and thus 32-available will never be -; negative. - CMPHI r2,r3 ; ptr<stop => HI - CMPHI r5,#7 ; available<=24 => HI - LDRHIB r14,[r3],#1 ; r14 = *ptr++ - SUBHI r5,#8 ; available += 8 - ; (HI) Stall... - ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available - CMP r2,r3 ; ptr<stop => HI - MOVLS r5,#-1<<30 ; (LS) available = OC_LOTS_OF_BITS+32 - CMPHI r5,#7 ; (HI) available<=24 => HI - LDRHIB r14,[r3],#1 ; (HI) r14 = *ptr++ - SUBHI r5,#8 ; (HI) available += 8 - ; (HI) Stall... - ORRHI r4,r14,LSL r5 ; (HI) r4 = window|=r14<<32-available - RSB r14,r10,#32 ; r14 = 32-n - MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n - ADD r12,r12,r14 ; - LDRSH r12,[r12,r14] ; r12 = node=_tree[node+1+bits] - RSB r5,r5,#32 ; r5 = available - ; Stall... 
- RSBS r14,r12,#0 ; r14 = -node, node>0 => MI - BMI oc_huff_token_decode_continue - MOV r10,r14,LSR #8 ; r10 = n=node>>8 - MOV r4,r4,LSL r10 ; r4 = window<<=n - SUB r5,r10 ; r5 = available-=n - STMIB r0,{r3-r5} ; ptr = r3 - ; window = r4 - ; available = r5 - AND r0,r14,#255 ; r0 = node&255 - LDMFD r13!,{r4,r5,r10,pc} - ENDP - - END diff --git a/media/libtheora/lib/arm/armcpu.c b/media/libtheora/lib/arm/armcpu.c deleted file mode 100644 index 8b0f9a857..000000000 --- a/media/libtheora/lib/arm/armcpu.c +++ /dev/null @@ -1,116 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - CPU capability detection for ARM processors. - - function: - last mod: $Id: cpu.c 17344 2010-07-21 01:42:18Z tterribe $ - - ********************************************************************/ - -#include "armcpu.h" - -#if !defined(OC_ARM_ASM)|| \ - !defined(OC_ARM_ASM_EDSP)&&!defined(OC_ARM_ASM_ARMV6)&& \ - !defined(OC_ARM_ASM_NEON) -ogg_uint32_t oc_cpu_flags_get(void){ - return 0; -} - -#elif defined(_MSC_VER) -/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ -# define WIN32_LEAN_AND_MEAN -# define WIN32_EXTRA_LEAN -# include <windows.h> - -ogg_uint32_t oc_cpu_flags_get(void){ - ogg_uint32_t flags; - flags=0; - /*MSVC has no inline __asm support for ARM, but it does let you __emit - instructions via their assembled hex code. 
- All of these instructions should be essentially nops.*/ -# if defined(OC_ARM_ASM_EDSP) - __try{ - /*PLD [r13]*/ - __emit(0xF5DDF000); - flags|=OC_CPU_ARM_EDSP; - } - __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ - /*Ignore exception.*/ - } -# if defined(OC_ARM_ASM_MEDIA) - __try{ - /*SHADD8 r3,r3,r3*/ - __emit(0xE6333F93); - flags|=OC_CPU_ARM_MEDIA; - } - __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ - /*Ignore exception.*/ - } -# if defined(OC_ARM_ASM_NEON) - __try{ - /*VORR q0,q0,q0*/ - __emit(0xF2200150); - flags|=OC_CPU_ARM_NEON; - } - __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ - /*Ignore exception.*/ - } -# endif -# endif -# endif - return flags; -} - -#elif defined(__linux__) -# include <stdio.h> -# include <stdlib.h> -# include <string.h> - -ogg_uint32_t oc_cpu_flags_get(void){ - ogg_uint32_t flags; - FILE *fin; - flags=0; - /*Reading /proc/self/auxv would be easier, but that doesn't work reliably on - Android. - This also means that detection will fail in Scratchbox.*/ - fin=fopen("/proc/cpuinfo","r"); - if(fin!=NULL){ - /*512 should be enough for anybody (it's even enough for all the flags that - x86 has accumulated... so far).*/ - char buf[512]; - while(fgets(buf,511,fin)!=NULL){ - if(memcmp(buf,"Features",8)==0){ - char *p; - p=strstr(buf," edsp"); - if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_EDSP; - p=strstr(buf," neon"); - if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_NEON; - } - if(memcmp(buf,"CPU architecture:",17)==0){ - int version; - version=atoi(buf+17); - if(version>=6)flags|=OC_CPU_ARM_MEDIA; - } - } - fclose(fin); - } - return flags; -} - -#else -/*The feature registers which can tell us what the processor supports are - accessible in priveleged modes only, so we can't have a general user-space - detection method like on x86.*/ -# error "Configured to use ARM asm but no CPU detection method available for " \ - "your platform. Reconfigure with --disable-asm (or send patches)." 
-#endif diff --git a/media/libtheora/lib/arm/armcpu.h b/media/libtheora/lib/arm/armcpu.h deleted file mode 100644 index 18dd95821..000000000 --- a/media/libtheora/lib/arm/armcpu.h +++ /dev/null @@ -1,29 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - function: - last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $ - - ********************************************************************/ - -#if !defined(_arm_armcpu_H) -# define _arm_armcpu_H (1) -#include "../internal.h" - -/*"Parallel instructions" from ARM v6 and above.*/ -#define OC_CPU_ARM_MEDIA (1<<24) -/*Flags chosen to match arch/arm/include/asm/hwcap.h in the Linux kernel.*/ -#define OC_CPU_ARM_EDSP (1<<7) -#define OC_CPU_ARM_NEON (1<<12) - -ogg_uint32_t oc_cpu_flags_get(void); - -#endif diff --git a/media/libtheora/lib/arm/armfrag.s b/media/libtheora/lib/arm/armfrag.s deleted file mode 100644 index e20579eee..000000000 --- a/media/libtheora/lib/arm/armfrag.s +++ /dev/null @@ -1,662 +0,0 @@ -;******************************************************************** -;* * -;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * -;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * -;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * -;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* -;* * -;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * -;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ * -;* * -;******************************************************************** -; Original implementation: -; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd -; last mod: $Id: armfrag.s 17481 2010-10-03 22:49:42Z tterribe $ -;******************************************************************** - - AREA |.text|, CODE, READONLY - - ; Explicitly specifying alignment here because some versions of - ; gas don't align code correctly. See - ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html - ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992 - ALIGN - - GET armopts.s - -; Vanilla ARM v4 versions - EXPORT oc_frag_copy_list_arm - EXPORT oc_frag_recon_intra_arm - EXPORT oc_frag_recon_inter_arm - EXPORT oc_frag_recon_inter2_arm - -oc_frag_copy_list_arm PROC - ; r0 = _dst_frame - ; r1 = _src_frame - ; r2 = _ystride - ; r3 = _fragis - ; <> = _nfragis - ; <> = _frag_buf_offs - LDR r12,[r13] ; r12 = _nfragis - STMFD r13!,{r4-r6,r11,r14} - SUBS r12, r12, #1 - LDR r4,[r3],#4 ; r4 = _fragis[fragii] - LDRGE r14,[r13,#4*6] ; r14 = _frag_buf_offs - BLT ofcl_arm_end - SUB r2, r2, #4 -ofcl_arm_lp - LDR r11,[r14,r4,LSL #2] ; r11 = _frag_buf_offs[_fragis[fragii]] - SUBS r12, r12, #1 - ; Stall (on XScale) - ADD r4, r1, r11 ; r4 = _src_frame+frag_buf_off - LDR r6, [r4], #4 - ADD r11,r0, r11 ; r11 = _dst_frame+frag_buf_off - LDR r5, [r4], r2 - STR r6, [r11],#4 - LDR r6, [r4], #4 - STR r5, [r11],r2 - LDR r5, [r4], r2 - STR r6, [r11],#4 - LDR r6, [r4], #4 - STR r5, [r11],r2 - LDR r5, [r4], r2 - STR r6, [r11],#4 - LDR r6, [r4], #4 - STR r5, [r11],r2 - LDR r5, [r4], r2 - STR r6, [r11],#4 - LDR r6, [r4], #4 - STR r5, [r11],r2 - LDR r5, [r4], r2 - STR r6, [r11],#4 - LDR r6, [r4], #4 - STR r5, [r11],r2 - LDR r5, [r4], r2 - STR r6, [r11],#4 - LDR r6, [r4], #4 - STR r5, [r11],r2 - LDR r5, [r4], r2 - STR r6, [r11],#4 - LDR r6, [r4], #4 
- STR r5, [r11],r2 - LDR r5, [r4] - LDRGE r4,[r3],#4 ; r4 = _fragis[fragii] - STR r6, [r11],#4 - STR r5, [r11] - BGE ofcl_arm_lp -ofcl_arm_end - LDMFD r13!,{r4-r6,r11,PC} -oc_frag_recon_intra_arm - ; r0 = unsigned char *_dst - ; r1 = int _ystride - ; r2 = const ogg_int16_t _residue[64] - STMFD r13!,{r4,r5,r14} - MOV r14,#8 - MOV r5, #255 - SUB r1, r1, #7 -ofrintra_lp_arm - LDRSH r3, [r2], #2 - LDRSH r4, [r2], #2 - LDRSH r12,[r2], #2 - ADDS r3, r3, #128 - CMPGT r5, r3 - EORLT r3, r5, r3, ASR #32 - STRB r3, [r0], #1 - ADDS r4, r4, #128 - CMPGT r5, r4 - EORLT r4, r5, r4, ASR #32 - LDRSH r3, [r2], #2 - STRB r4, [r0], #1 - ADDS r12,r12,#128 - CMPGT r5, r12 - EORLT r12,r5, r12,ASR #32 - LDRSH r4, [r2], #2 - STRB r12,[r0], #1 - ADDS r3, r3, #128 - CMPGT r5, r3 - EORLT r3, r5, r3, ASR #32 - LDRSH r12,[r2], #2 - STRB r3, [r0], #1 - ADDS r4, r4, #128 - CMPGT r5, r4 - EORLT r4, r5, r4, ASR #32 - LDRSH r3, [r2], #2 - STRB r4, [r0], #1 - ADDS r12,r12,#128 - CMPGT r5, r12 - EORLT r12,r5, r12,ASR #32 - LDRSH r4, [r2], #2 - STRB r12,[r0], #1 - ADDS r3, r3, #128 - CMPGT r5, r3 - EORLT r3, r5, r3, ASR #32 - STRB r3, [r0], #1 - ADDS r4, r4, #128 - CMPGT r5, r4 - EORLT r4, r5, r4, ASR #32 - STRB r4, [r0], r1 - SUBS r14,r14,#1 - BGT ofrintra_lp_arm - LDMFD r13!,{r4,r5,PC} - ENDP - -oc_frag_recon_inter_arm PROC - ; r0 = unsigned char *dst - ; r1 = const unsigned char *src - ; r2 = int ystride - ; r3 = const ogg_int16_t residue[64] - STMFD r13!,{r5,r9-r11,r14} - MOV r9, #8 - MOV r5, #255 - SUB r2, r2, #7 -ofrinter_lp_arm - LDRSH r12,[r3], #2 - LDRB r14,[r1], #1 - LDRSH r11,[r3], #2 - LDRB r10,[r1], #1 - ADDS r12,r12,r14 - CMPGT r5, r12 - EORLT r12,r5, r12,ASR #32 - STRB r12,[r0], #1 - ADDS r11,r11,r10 - CMPGT r5, r11 - LDRSH r12,[r3], #2 - LDRB r14,[r1], #1 - EORLT r11,r5, r11,ASR #32 - STRB r11,[r0], #1 - ADDS r12,r12,r14 - CMPGT r5, r12 - LDRSH r11,[r3], #2 - LDRB r10,[r1], #1 - EORLT r12,r5, r12,ASR #32 - STRB r12,[r0], #1 - ADDS r11,r11,r10 - CMPGT r5, r11 - LDRSH r12,[r3], #2 - LDRB 
r14,[r1], #1 - EORLT r11,r5, r11,ASR #32 - STRB r11,[r0], #1 - ADDS r12,r12,r14 - CMPGT r5, r12 - LDRSH r11,[r3], #2 - LDRB r10,[r1], #1 - EORLT r12,r5, r12,ASR #32 - STRB r12,[r0], #1 - ADDS r11,r11,r10 - CMPGT r5, r11 - LDRSH r12,[r3], #2 - LDRB r14,[r1], #1 - EORLT r11,r5, r11,ASR #32 - STRB r11,[r0], #1 - ADDS r12,r12,r14 - CMPGT r5, r12 - LDRSH r11,[r3], #2 - LDRB r10,[r1], r2 - EORLT r12,r5, r12,ASR #32 - STRB r12,[r0], #1 - ADDS r11,r11,r10 - CMPGT r5, r11 - EORLT r11,r5, r11,ASR #32 - STRB r11,[r0], r2 - SUBS r9, r9, #1 - BGT ofrinter_lp_arm - LDMFD r13!,{r5,r9-r11,PC} - ENDP - -oc_frag_recon_inter2_arm PROC - ; r0 = unsigned char *dst - ; r1 = const unsigned char *src1 - ; r2 = const unsigned char *src2 - ; r3 = int ystride - LDR r12,[r13] - ; r12= const ogg_int16_t residue[64] - STMFD r13!,{r4-r8,r14} - MOV r14,#8 - MOV r8, #255 - SUB r3, r3, #7 -ofrinter2_lp_arm - LDRB r5, [r1], #1 - LDRB r6, [r2], #1 - LDRSH r4, [r12],#2 - LDRB r7, [r1], #1 - ADD r5, r5, r6 - ADDS r5, r4, r5, LSR #1 - CMPGT r8, r5 - LDRB r6, [r2], #1 - LDRSH r4, [r12],#2 - EORLT r5, r8, r5, ASR #32 - STRB r5, [r0], #1 - ADD r7, r7, r6 - ADDS r7, r4, r7, LSR #1 - CMPGT r8, r7 - LDRB r5, [r1], #1 - LDRB r6, [r2], #1 - LDRSH r4, [r12],#2 - EORLT r7, r8, r7, ASR #32 - STRB r7, [r0], #1 - ADD r5, r5, r6 - ADDS r5, r4, r5, LSR #1 - CMPGT r8, r5 - LDRB r7, [r1], #1 - LDRB r6, [r2], #1 - LDRSH r4, [r12],#2 - EORLT r5, r8, r5, ASR #32 - STRB r5, [r0], #1 - ADD r7, r7, r6 - ADDS r7, r4, r7, LSR #1 - CMPGT r8, r7 - LDRB r5, [r1], #1 - LDRB r6, [r2], #1 - LDRSH r4, [r12],#2 - EORLT r7, r8, r7, ASR #32 - STRB r7, [r0], #1 - ADD r5, r5, r6 - ADDS r5, r4, r5, LSR #1 - CMPGT r8, r5 - LDRB r7, [r1], #1 - LDRB r6, [r2], #1 - LDRSH r4, [r12],#2 - EORLT r5, r8, r5, ASR #32 - STRB r5, [r0], #1 - ADD r7, r7, r6 - ADDS r7, r4, r7, LSR #1 - CMPGT r8, r7 - LDRB r5, [r1], #1 - LDRB r6, [r2], #1 - LDRSH r4, [r12],#2 - EORLT r7, r8, r7, ASR #32 - STRB r7, [r0], #1 - ADD r5, r5, r6 - ADDS r5, r4, r5, LSR #1 - CMPGT 
r8, r5 - LDRB r7, [r1], r3 - LDRB r6, [r2], r3 - LDRSH r4, [r12],#2 - EORLT r5, r8, r5, ASR #32 - STRB r5, [r0], #1 - ADD r7, r7, r6 - ADDS r7, r4, r7, LSR #1 - CMPGT r8, r7 - EORLT r7, r8, r7, ASR #32 - STRB r7, [r0], r3 - SUBS r14,r14,#1 - BGT ofrinter2_lp_arm - LDMFD r13!,{r4-r8,PC} - ENDP - - [ OC_ARM_ASM_EDSP - EXPORT oc_frag_copy_list_edsp - -oc_frag_copy_list_edsp PROC - ; r0 = _dst_frame - ; r1 = _src_frame - ; r2 = _ystride - ; r3 = _fragis - ; <> = _nfragis - ; <> = _frag_buf_offs - LDR r12,[r13] ; r12 = _nfragis - STMFD r13!,{r4-r11,r14} - SUBS r12, r12, #1 - LDRGE r5, [r3],#4 ; r5 = _fragis[fragii] - LDRGE r14,[r13,#4*10] ; r14 = _frag_buf_offs - BLT ofcl_edsp_end -ofcl_edsp_lp - MOV r4, r1 - LDR r5, [r14,r5, LSL #2] ; r5 = _frag_buf_offs[_fragis[fragii]] - SUBS r12, r12, #1 - ; Stall (on XScale) - LDRD r6, [r4, r5]! ; r4 = _src_frame+frag_buf_off - LDRD r8, [r4, r2]! - ; Stall - STRD r6, [r5, r0]! ; r5 = _dst_frame+frag_buf_off - STRD r8, [r5, r2]! - ; Stall - LDRD r6, [r4, r2]! ; On Xscale at least, doing 3 consecutive - LDRD r8, [r4, r2]! ; loads causes a stall, but that's no worse - LDRD r10,[r4, r2]! ; than us only doing 2, and having to do - ; another pair of LDRD/STRD later on. - ; Stall - STRD r6, [r5, r2]! - STRD r8, [r5, r2]! - STRD r10,[r5, r2]! - LDRD r6, [r4, r2]! - LDRD r8, [r4, r2]! - LDRD r10,[r4, r2]! - STRD r6, [r5, r2]! - STRD r8, [r5, r2]! - STRD r10,[r5, r2]! 
- LDRGE r5, [r3],#4 ; r5 = _fragis[fragii] - BGE ofcl_edsp_lp -ofcl_edsp_end - LDMFD r13!,{r4-r11,PC} - ENDP - ] - - [ OC_ARM_ASM_MEDIA - EXPORT oc_frag_recon_intra_v6 - EXPORT oc_frag_recon_inter_v6 - EXPORT oc_frag_recon_inter2_v6 - -oc_frag_recon_intra_v6 PROC - ; r0 = unsigned char *_dst - ; r1 = int _ystride - ; r2 = const ogg_int16_t _residue[64] - STMFD r13!,{r4-r6,r14} - MOV r14,#8 - MOV r12,r2 - LDR r6, =0x00800080 -ofrintra_v6_lp - LDRD r2, [r12],#8 ; r2 = 11110000 r3 = 33332222 - LDRD r4, [r12],#8 ; r4 = 55554444 r5 = 77776666 - SUBS r14,r14,#1 - QADD16 r2, r2, r6 - QADD16 r3, r3, r6 - QADD16 r4, r4, r6 - QADD16 r5, r5, r6 - USAT16 r2, #8, r2 ; r2 = __11__00 - USAT16 r3, #8, r3 ; r3 = __33__22 - USAT16 r4, #8, r4 ; r4 = __55__44 - USAT16 r5, #8, r5 ; r5 = __77__66 - ORR r2, r2, r2, LSR #8 ; r2 = __111100 - ORR r3, r3, r3, LSR #8 ; r3 = __333322 - ORR r4, r4, r4, LSR #8 ; r4 = __555544 - ORR r5, r5, r5, LSR #8 ; r5 = __777766 - PKHBT r2, r2, r3, LSL #16 ; r2 = 33221100 - PKHBT r3, r4, r5, LSL #16 ; r3 = 77665544 - STRD r2, [r0], r1 - BGT ofrintra_v6_lp - LDMFD r13!,{r4-r6,PC} - ENDP - -oc_frag_recon_inter_v6 PROC - ; r0 = unsigned char *_dst - ; r1 = const unsigned char *_src - ; r2 = int _ystride - ; r3 = const ogg_int16_t _residue[64] - STMFD r13!,{r4-r7,r14} - MOV r14,#8 -ofrinter_v6_lp - LDRD r6, [r3], #8 ; r6 = 11110000 r7 = 33332222 - SUBS r14,r14,#1 - [ OC_ARM_CAN_UNALIGN_LDRD - LDRD r4, [r1], r2 ; Unaligned ; r4 = 33221100 r5 = 77665544 - | - LDR r5, [r1, #4] - LDR r4, [r1], r2 - ] - PKHBT r12,r6, r7, LSL #16 ; r12= 22220000 - PKHTB r7, r7, r6, ASR #16 ; r7 = 33331111 - UXTB16 r6,r4 ; r6 = __22__00 - UXTB16 r4,r4, ROR #8 ; r4 = __33__11 - QADD16 r12,r12,r6 ; r12= xx22xx00 - QADD16 r4, r7, r4 ; r4 = xx33xx11 - LDRD r6, [r3], #8 ; r6 = 55554444 r7 = 77776666 - USAT16 r4, #8, r4 ; r4 = __33__11 - USAT16 r12,#8,r12 ; r12= __22__00 - ORR r4, r12,r4, LSL #8 ; r4 = 33221100 - PKHBT r12,r6, r7, LSL #16 ; r12= 66664444 - PKHTB r7, r7, r6, ASR #16 ; r7 = 
77775555 - UXTB16 r6,r5 ; r6 = __66__44 - UXTB16 r5,r5, ROR #8 ; r5 = __77__55 - QADD16 r12,r12,r6 ; r12= xx66xx44 - QADD16 r5, r7, r5 ; r5 = xx77xx55 - USAT16 r12,#8, r12 ; r12= __66__44 - USAT16 r5, #8, r5 ; r5 = __77__55 - ORR r5, r12,r5, LSL #8 ; r5 = 77665544 - STRD r4, [r0], r2 - BGT ofrinter_v6_lp - LDMFD r13!,{r4-r7,PC} - ENDP - -oc_frag_recon_inter2_v6 PROC - ; r0 = unsigned char *_dst - ; r1 = const unsigned char *_src1 - ; r2 = const unsigned char *_src2 - ; r3 = int _ystride - LDR r12,[r13] - ; r12= const ogg_int16_t _residue[64] - STMFD r13!,{r4-r9,r14} - MOV r14,#8 -ofrinter2_v6_lp - LDRD r6, [r12,#8] ; r6 = 55554444 r7 = 77776666 - SUBS r14,r14,#1 - LDR r4, [r1, #4] ; Unaligned ; r4 = src1[1] = 77665544 - LDR r5, [r2, #4] ; Unaligned ; r5 = src2[1] = 77665544 - PKHBT r8, r6, r7, LSL #16 ; r8 = 66664444 - PKHTB r9, r7, r6, ASR #16 ; r9 = 77775555 - UHADD8 r4, r4, r5 ; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1 - UXTB16 r5, r4 ; r5 = __66__44 - UXTB16 r4, r4, ROR #8 ; r4 = __77__55 - QADD16 r8, r8, r5 ; r8 = xx66xx44 - QADD16 r9, r9, r4 ; r9 = xx77xx55 - LDRD r6,[r12],#16 ; r6 = 11110000 r7 = 33332222 - USAT16 r8, #8, r8 ; r8 = __66__44 - LDR r4, [r1], r3 ; Unaligned ; r4 = src1[0] = 33221100 - USAT16 r9, #8, r9 ; r9 = __77__55 - LDR r5, [r2], r3 ; Unaligned ; r5 = src2[0] = 33221100 - ORR r9, r8, r9, LSL #8 ; r9 = 77665544 - PKHBT r8, r6, r7, LSL #16 ; r8 = 22220000 - UHADD8 r4, r4, r5 ; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1 - PKHTB r7, r7, r6, ASR #16 ; r7 = 33331111 - UXTB16 r5, r4 ; r5 = __22__00 - UXTB16 r4, r4, ROR #8 ; r4 = __33__11 - QADD16 r8, r8, r5 ; r8 = xx22xx00 - QADD16 r7, r7, r4 ; r7 = xx33xx11 - USAT16 r8, #8, r8 ; r8 = __22__00 - USAT16 r7, #8, r7 ; r7 = __33__11 - ORR r8, r8, r7, LSL #8 ; r8 = 33221100 - STRD r8, [r0], r3 - BGT ofrinter2_v6_lp - LDMFD r13!,{r4-r9,PC} - ENDP - ] - - [ OC_ARM_ASM_NEON - EXPORT oc_frag_copy_list_neon - EXPORT oc_frag_recon_intra_neon - EXPORT oc_frag_recon_inter_neon - EXPORT oc_frag_recon_inter2_neon - 
-oc_frag_copy_list_neon PROC - ; r0 = _dst_frame - ; r1 = _src_frame - ; r2 = _ystride - ; r3 = _fragis - ; <> = _nfragis - ; <> = _frag_buf_offs - LDR r12,[r13] ; r12 = _nfragis - STMFD r13!,{r4-r7,r14} - CMP r12, #1 - LDRGE r6, [r3] ; r6 = _fragis[fragii] - LDRGE r14,[r13,#4*6] ; r14 = _frag_buf_offs - BLT ofcl_neon_end - ; Stall (2 on Xscale) - LDR r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]] - ; Stall (on XScale) - MOV r7, r6 ; Guarantee PLD points somewhere valid. -ofcl_neon_lp - ADD r4, r1, r6 - VLD1.64 {D0}, [r4@64], r2 - ADD r5, r0, r6 - VLD1.64 {D1}, [r4@64], r2 - SUBS r12, r12, #1 - VLD1.64 {D2}, [r4@64], r2 - LDRGT r6, [r3,#4]! ; r6 = _fragis[fragii] - VLD1.64 {D3}, [r4@64], r2 - LDRGT r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]] - VLD1.64 {D4}, [r4@64], r2 - ADDGT r7, r1, r6 - VLD1.64 {D5}, [r4@64], r2 - PLD [r7] - VLD1.64 {D6}, [r4@64], r2 - PLD [r7, r2] - VLD1.64 {D7}, [r4@64] - PLD [r7, r2, LSL #1] - VST1.64 {D0}, [r5@64], r2 - ADDGT r7, r7, r2, LSL #2 - VST1.64 {D1}, [r5@64], r2 - PLD [r7, -r2] - VST1.64 {D2}, [r5@64], r2 - PLD [r7] - VST1.64 {D3}, [r5@64], r2 - PLD [r7, r2] - VST1.64 {D4}, [r5@64], r2 - PLD [r7, r2, LSL #1] - VST1.64 {D5}, [r5@64], r2 - ADDGT r7, r7, r2, LSL #2 - VST1.64 {D6}, [r5@64], r2 - PLD [r7, -r2] - VST1.64 {D7}, [r5@64] - BGT ofcl_neon_lp -ofcl_neon_end - LDMFD r13!,{r4-r7,PC} - ENDP - -oc_frag_recon_intra_neon PROC - ; r0 = unsigned char *_dst - ; r1 = int _ystride - ; r2 = const ogg_int16_t _residue[64] - MOV r3, #128 - VDUP.S16 Q0, r3 - VLDMIA r2, {D16-D31} ; D16= 3333222211110000 etc ; 9(8) cycles - VQADD.S16 Q8, Q8, Q0 - VQADD.S16 Q9, Q9, Q0 - VQADD.S16 Q10,Q10,Q0 - VQADD.S16 Q11,Q11,Q0 - VQADD.S16 Q12,Q12,Q0 - VQADD.S16 Q13,Q13,Q0 - VQADD.S16 Q14,Q14,Q0 - VQADD.S16 Q15,Q15,Q0 - VQMOVUN.S16 D16,Q8 ; D16= 7766554433221100 ; 1 cycle - VQMOVUN.S16 D17,Q9 ; D17= FFEEDDCCBBAA9988 ; 1 cycle - VQMOVUN.S16 D18,Q10 ; D18= NNMMLLKKJJIIHHGG ; 1 cycle - VST1.64 {D16},[r0@64], r1 - VQMOVUN.S16 D19,Q11 
; D19= VVUUTTSSRRQQPPOO ; 1 cycle - VST1.64 {D17},[r0@64], r1 - VQMOVUN.S16 D20,Q12 ; D20= ddccbbaaZZYYXXWW ; 1 cycle - VST1.64 {D18},[r0@64], r1 - VQMOVUN.S16 D21,Q13 ; D21= llkkjjiihhggffee ; 1 cycle - VST1.64 {D19},[r0@64], r1 - VQMOVUN.S16 D22,Q14 ; D22= ttssrrqqppoonnmm ; 1 cycle - VST1.64 {D20},[r0@64], r1 - VQMOVUN.S16 D23,Q15 ; D23= !!@@zzyyxxwwvvuu ; 1 cycle - VST1.64 {D21},[r0@64], r1 - VST1.64 {D22},[r0@64], r1 - VST1.64 {D23},[r0@64], r1 - MOV PC,R14 - ENDP - -oc_frag_recon_inter_neon PROC - ; r0 = unsigned char *_dst - ; r1 = const unsigned char *_src - ; r2 = int _ystride - ; r3 = const ogg_int16_t _residue[64] - VLDMIA r3, {D16-D31} ; D16= 3333222211110000 etc ; 9(8) cycles - VLD1.64 {D0}, [r1], r2 - VLD1.64 {D2}, [r1], r2 - VMOVL.U8 Q0, D0 ; Q0 = __77__66__55__44__33__22__11__00 - VLD1.64 {D4}, [r1], r2 - VMOVL.U8 Q1, D2 ; etc - VLD1.64 {D6}, [r1], r2 - VMOVL.U8 Q2, D4 - VMOVL.U8 Q3, D6 - VQADD.S16 Q8, Q8, Q0 - VLD1.64 {D0}, [r1], r2 - VQADD.S16 Q9, Q9, Q1 - VLD1.64 {D2}, [r1], r2 - VQADD.S16 Q10,Q10,Q2 - VLD1.64 {D4}, [r1], r2 - VQADD.S16 Q11,Q11,Q3 - VLD1.64 {D6}, [r1], r2 - VMOVL.U8 Q0, D0 - VMOVL.U8 Q1, D2 - VMOVL.U8 Q2, D4 - VMOVL.U8 Q3, D6 - VQADD.S16 Q12,Q12,Q0 - VQADD.S16 Q13,Q13,Q1 - VQADD.S16 Q14,Q14,Q2 - VQADD.S16 Q15,Q15,Q3 - VQMOVUN.S16 D16,Q8 - VQMOVUN.S16 D17,Q9 - VQMOVUN.S16 D18,Q10 - VST1.64 {D16},[r0@64], r2 - VQMOVUN.S16 D19,Q11 - VST1.64 {D17},[r0@64], r2 - VQMOVUN.S16 D20,Q12 - VST1.64 {D18},[r0@64], r2 - VQMOVUN.S16 D21,Q13 - VST1.64 {D19},[r0@64], r2 - VQMOVUN.S16 D22,Q14 - VST1.64 {D20},[r0@64], r2 - VQMOVUN.S16 D23,Q15 - VST1.64 {D21},[r0@64], r2 - VST1.64 {D22},[r0@64], r2 - VST1.64 {D23},[r0@64], r2 - MOV PC,R14 - ENDP - -oc_frag_recon_inter2_neon PROC - ; r0 = unsigned char *_dst - ; r1 = const unsigned char *_src1 - ; r2 = const unsigned char *_src2 - ; r3 = int _ystride - LDR r12,[r13] - ; r12= const ogg_int16_t _residue[64] - VLDMIA r12,{D16-D31} - VLD1.64 {D0}, [r1], r3 - VLD1.64 {D4}, [r2], r3 - VLD1.64 {D1}, [r1], 
r3 - VLD1.64 {D5}, [r2], r3 - VHADD.U8 Q2, Q0, Q2 ; Q2 = FFEEDDCCBBAA99887766554433221100 - VLD1.64 {D2}, [r1], r3 - VLD1.64 {D6}, [r2], r3 - VMOVL.U8 Q0, D4 ; Q0 = __77__66__55__44__33__22__11__00 - VLD1.64 {D3}, [r1], r3 - VMOVL.U8 Q2, D5 ; etc - VLD1.64 {D7}, [r2], r3 - VHADD.U8 Q3, Q1, Q3 - VQADD.S16 Q8, Q8, Q0 - VQADD.S16 Q9, Q9, Q2 - VLD1.64 {D0}, [r1], r3 - VMOVL.U8 Q1, D6 - VLD1.64 {D4}, [r2], r3 - VMOVL.U8 Q3, D7 - VLD1.64 {D1}, [r1], r3 - VQADD.S16 Q10,Q10,Q1 - VLD1.64 {D5}, [r2], r3 - VQADD.S16 Q11,Q11,Q3 - VLD1.64 {D2}, [r1], r3 - VHADD.U8 Q2, Q0, Q2 - VLD1.64 {D6}, [r2], r3 - VLD1.64 {D3}, [r1], r3 - VMOVL.U8 Q0, D4 - VLD1.64 {D7}, [r2], r3 - VMOVL.U8 Q2, D5 - VHADD.U8 Q3, Q1, Q3 - VQADD.S16 Q12,Q12,Q0 - VQADD.S16 Q13,Q13,Q2 - VMOVL.U8 Q1, D6 - VMOVL.U8 Q3, D7 - VQADD.S16 Q14,Q14,Q1 - VQADD.S16 Q15,Q15,Q3 - VQMOVUN.S16 D16,Q8 - VQMOVUN.S16 D17,Q9 - VQMOVUN.S16 D18,Q10 - VST1.64 {D16},[r0@64], r3 - VQMOVUN.S16 D19,Q11 - VST1.64 {D17},[r0@64], r3 - VQMOVUN.S16 D20,Q12 - VST1.64 {D18},[r0@64], r3 - VQMOVUN.S16 D21,Q13 - VST1.64 {D19},[r0@64], r3 - VQMOVUN.S16 D22,Q14 - VST1.64 {D20},[r0@64], r3 - VQMOVUN.S16 D23,Q15 - VST1.64 {D21},[r0@64], r3 - VST1.64 {D22},[r0@64], r3 - VST1.64 {D23},[r0@64], r3 - MOV PC,R14 - ENDP - ] - - END diff --git a/media/libtheora/lib/arm/armidct.s b/media/libtheora/lib/arm/armidct.s deleted file mode 100644 index babd846ec..000000000 --- a/media/libtheora/lib/arm/armidct.s +++ /dev/null @@ -1,1914 +0,0 @@ -;******************************************************************** -;* * -;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * -;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * -;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * -;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* -;* * -;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * -;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ * -;* * -;******************************************************************** -; Original implementation: -; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd -; last mod: $Id: armidct.s 17481 2010-10-03 22:49:42Z tterribe $ -;******************************************************************** - - AREA |.text|, CODE, READONLY - - ; Explicitly specifying alignment here because some versions of - ; gas don't align code correctly. See - ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html - ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992 - ALIGN - - GET armopts.s - - EXPORT oc_idct8x8_1_arm - EXPORT oc_idct8x8_arm - -oc_idct8x8_1_arm PROC - ; r0 = ogg_int16_t *_y - ; r1 = ogg_uint16_t _dc - ORR r1, r1, r1, LSL #16 - MOV r2, r1 - MOV r3, r1 - MOV r12,r1 - STMIA r0!,{r1,r2,r3,r12} - STMIA r0!,{r1,r2,r3,r12} - STMIA r0!,{r1,r2,r3,r12} - STMIA r0!,{r1,r2,r3,r12} - STMIA r0!,{r1,r2,r3,r12} - STMIA r0!,{r1,r2,r3,r12} - STMIA r0!,{r1,r2,r3,r12} - STMIA r0!,{r1,r2,r3,r12} - MOV PC, r14 - ENDP - -oc_idct8x8_arm PROC - ; r0 = ogg_int16_t *_y - ; r1 = ogg_int16_t *_x - ; r2 = int _last_zzi - CMP r2, #3 - BLE oc_idct8x8_3_arm - CMP r2, #6 - BLE oc_idct8x8_6_arm - CMP r2, #10 - BLE oc_idct8x8_10_arm -oc_idct8x8_slow_arm - STMFD r13!,{r4-r11,r14} - SUB r13,r13,#64*2 -; Row transforms - STR r0, [r13,#-4]! - ADD r0, r13, #4 ; Write to temp storage. - BL idct8core_arm - BL idct8core_arm - BL idct8core_arm - BL idct8core_arm - BL idct8core_arm - BL idct8core_arm - BL idct8core_arm - BL idct8core_arm - LDR r0, [r13], #4 ; Write to the final destination. - ; Clear input data for next block (decoder only). - SUB r2, r1, #8*16 - CMP r0, r2 - MOV r1, r13 ; And read from temp storage. 
- BEQ oc_idct8x8_slow_arm_cols - MOV r4, #0 - MOV r5, #0 - MOV r6, #0 - MOV r7, #0 - STMIA r2!,{r4,r5,r6,r7} - STMIA r2!,{r4,r5,r6,r7} - STMIA r2!,{r4,r5,r6,r7} - STMIA r2!,{r4,r5,r6,r7} - STMIA r2!,{r4,r5,r6,r7} - STMIA r2!,{r4,r5,r6,r7} - STMIA r2!,{r4,r5,r6,r7} - STMIA r2!,{r4,r5,r6,r7} -oc_idct8x8_slow_arm_cols -; Column transforms - BL idct8core_down_arm - BL idct8core_down_arm - BL idct8core_down_arm - BL idct8core_down_arm - BL idct8core_down_arm - BL idct8core_down_arm - BL idct8core_down_arm - BL idct8core_down_arm - ADD r13,r13,#64*2 - LDMFD r13!,{r4-r11,PC} - ENDP - -oc_idct8x8_10_arm PROC - STMFD r13!,{r4-r11,r14} - SUB r13,r13,#64*2 -; Row transforms - MOV r2, r0 - MOV r0, r13 ; Write to temp storage. - BL idct4core_arm - BL idct3core_arm - BL idct2core_arm - BL idct1core_arm - ; Clear input data for next block (decoder only). - SUB r0, r1, #4*16 - CMP r0, r2 - MOV r1, r13 ; Read from temp storage. - BEQ oc_idct8x8_10_arm_cols - MOV r4, #0 - STR r4, [r0] - STR r4, [r0,#4] - STR r4, [r0,#16] - STR r4, [r0,#20] - STR r4, [r0,#32] - STR r4, [r0,#48] - MOV r0, r2 ; Write to the final destination -oc_idct8x8_10_arm_cols -; Column transforms - BL idct4core_down_arm - BL idct4core_down_arm - BL idct4core_down_arm - BL idct4core_down_arm - BL idct4core_down_arm - BL idct4core_down_arm - BL idct4core_down_arm - BL idct4core_down_arm - ADD r13,r13,#64*2 - LDMFD r13!,{r4-r11,PC} - ENDP - -oc_idct8x8_6_arm PROC - STMFD r13!,{r4-r7,r9-r11,r14} - SUB r13,r13,#64*2 -; Row transforms - MOV r2, r0 - MOV r0, r13 ; Write to temp storage. - BL idct3core_arm - BL idct2core_arm - BL idct1core_arm - ; Clear input data for next block (decoder only). - SUB r0, r1, #3*16 - CMP r0, r2 - MOV r1, r13 ; Read from temp storage. 
- BEQ oc_idct8x8_6_arm_cols - MOV r4, #0 - STR r4, [r0] - STR r4, [r0,#4] - STR r4, [r0,#16] - STR r4, [r0,#32] - MOV r0, r2 ; Write to the final destination -oc_idct8x8_6_arm_cols -; Column transforms - BL idct3core_down_arm - BL idct3core_down_arm - BL idct3core_down_arm - BL idct3core_down_arm - BL idct3core_down_arm - BL idct3core_down_arm - BL idct3core_down_arm - BL idct3core_down_arm - ADD r13,r13,#64*2 - LDMFD r13!,{r4-r7,r9-r11,PC} - ENDP - -oc_idct8x8_3_arm PROC - STMFD r13!,{r4-r7,r9-r11,r14} - SUB r13,r13,#64*2 -; Row transforms - MOV r2, r0 - MOV r0, r13 ; Write to temp storage. - BL idct2core_arm - BL idct1core_arm - ; Clear input data for next block (decoder only). - SUB r0, r1, #2*16 - CMP r0, r2 - MOV r1, r13 ; Read from temp storage. - MOVNE r4, #0 - STRNE r4, [r0] - STRNE r4, [r0,#16] - MOVNE r0, r2 ; Write to the final destination -; Column transforms - BL idct2core_down_arm - BL idct2core_down_arm - BL idct2core_down_arm - BL idct2core_down_arm - BL idct2core_down_arm - BL idct2core_down_arm - BL idct2core_down_arm - BL idct2core_down_arm - ADD r13,r13,#64*2 - LDMFD r13!,{r4-r7,r9-r11,PC} - ENDP - -idct1core_arm PROC - ; r0 = ogg_int16_t *_y (destination) - ; r1 = const ogg_int16_t *_x (source) - LDRSH r3, [r1], #16 - MOV r12,#0x05 - ORR r12,r12,#0xB500 - MUL r3, r12, r3 - ; Stall ? 
- MOV r3, r3, ASR #16 - STRH r3, [r0], #2 - STRH r3, [r0, #14] - STRH r3, [r0, #30] - STRH r3, [r0, #46] - STRH r3, [r0, #62] - STRH r3, [r0, #78] - STRH r3, [r0, #94] - STRH r3, [r0, #110] - MOV PC,R14 - ENDP - -idct2core_arm PROC - ; r0 = ogg_int16_t *_y (destination) - ; r1 = const ogg_int16_t *_x (source) - LDRSH r9, [r1], #16 ; r9 = x[0] - LDR r12,OC_C4S4 - LDRSH r11,[r1, #-14] ; r11= x[1] - LDR r3, OC_C7S1 - MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] - LDR r10,OC_C1S7 - MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1] - MOV r9, r9, ASR #16 ; r9 = t[0] - MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1] - MOV r3, r3, ASR #16 ; r3 = t[4] - MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4] - MOV r11,r11,ASR #16 ; r11= t[7] - MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] - MOV r10,r10,ASR #16 ; r10= t[5] - ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6] - ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5] - SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5] - ADD r3, r3, r9 ; r3 = t[0]+t[4] - ADD r11,r11,r9 ; r11= t[0]+t[7] - STRH r11,[r0], #2 ; y[0] = t[0]+t[7] - STRH r12,[r0, #14] ; y[1] = t[0]+t[6] - STRH r10,[r0, #30] ; y[2] = t[0]+t[5] - STRH r3, [r0, #46] ; y[3] = t[0]+t[4] - RSB r3, r3, r9, LSL #1 ; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4] - RSB r10,r10,r9, LSL #1 ; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5] - RSB r12,r12,r9, LSL #1 ; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6] - RSB r11,r11,r9, LSL #1 ; r11= t[0]*2-(t[0]+t[7])=t[0]-t[7] - STRH r3, [r0, #62] ; y[4] = t[0]-t[4] - STRH r10,[r0, #78] ; y[5] = t[0]-t[5] - STRH r12,[r0, #94] ; y[6] = t[0]-t[6] - STRH r11,[r0, #110] ; y[7] = t[0]-t[7] - MOV PC,r14 - ENDP - -idct2core_down_arm PROC - ; r0 = ogg_int16_t *_y (destination) - ; r1 = const ogg_int16_t *_x (source) - LDRSH r9, [r1], #16 ; r9 = x[0] - LDR r12,OC_C4S4 - LDRSH r11,[r1, #-14] ; r11= x[1] - LDR r3, OC_C7S1 - MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] - LDR r10,OC_C1S7 - MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1] - MOV r9, r9, ASR #16 ; r9 = t[0] - MUL r11,r10,r11 ; 
r11= t[7]<<16 = OC_C1S7*x[1] - ADD r9, r9, #8 ; r9 = t[0]+8 - MOV r3, r3, ASR #16 ; r3 = t[4] - MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4] - MOV r11,r11,ASR #16 ; r11= t[7] - MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] - MOV r10,r10,ASR #16 ; r10= t[5] - ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6]+8 - ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8 - SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8 - ADD r3, r3, r9 ; r3 = t[0]+t[4]+8 - ADD r11,r11,r9 ; r11= t[0]+t[7]+8 - ; TODO: This is wrong. - ; The C code truncates to 16 bits by storing to RAM and doing the - ; shifts later; we've got an extra 4 bits here. - MOV r4, r11,ASR #4 - MOV r5, r12,ASR #4 - MOV r6, r10,ASR #4 - MOV r7, r3, ASR #4 - RSB r3, r3, r9, LSL #1 ;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8 - RSB r10,r10,r9, LSL #1 ;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8 - RSB r12,r12,r9, LSL #1 ;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8 - RSB r11,r11,r9, LSL #1 ;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8 - MOV r3, r3, ASR #4 - MOV r10,r10,ASR #4 - MOV r12,r12,ASR #4 - MOV r11,r11,ASR #4 - STRH r4, [r0], #2 ; y[0] = t[0]+t[7] - STRH r5, [r0, #14] ; y[1] = t[0]+t[6] - STRH r6, [r0, #30] ; y[2] = t[0]+t[5] - STRH r7, [r0, #46] ; y[3] = t[0]+t[4] - STRH r3, [r0, #62] ; y[4] = t[0]-t[4] - STRH r10,[r0, #78] ; y[5] = t[0]-t[5] - STRH r12,[r0, #94] ; y[6] = t[0]-t[6] - STRH r11,[r0, #110] ; y[7] = t[0]-t[7] - MOV PC,r14 - ENDP - -idct3core_arm PROC - LDRSH r9, [r1], #16 ; r9 = x[0] - LDR r12,OC_C4S4 ; r12= OC_C4S4 - LDRSH r3, [r1, #-12] ; r3 = x[2] - LDR r10,OC_C6S2 ; r10= OC_C6S2 - MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] - LDR r4, OC_C2S6 ; r4 = OC_C2S6 - MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2] - LDRSH r11,[r1, #-14] ; r11= x[1] - MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2] - LDR r4, OC_C7S1 ; r4 = OC_C7S1 - LDR r5, OC_C1S7 ; r5 = OC_C1S7 - MOV r9, r9, ASR #16 ; r9 = t[0] - MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1] - ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3] - MUL r11,r5, r11 ; r11= t[7]<<16 = 
OC_C1S7*x[1] - MOV r4, r4, ASR #16 ; r4 = t[4] - MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4] - MOV r11,r11,ASR #16 ; r11= t[7] - MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] - ADD r10,r9, r10,ASR #16 ; r10= t[1] = t[0]+t[2] - RSB r6, r10,r9, LSL #1 ; r6 = t[2] = t[0]-t[2] - ; r3 = t2[0] = t[0]+t[3] - RSB r9, r3, r9, LSL #1 ; r9 = t2[3] = t[0]-t[3] - MOV r12,r12,ASR #16 ; r12= t[6] - ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5] - RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5] - ADD r11,r3, r11 ; r11= t2[0]+t[7] - ADD r5, r10,r5 ; r5 = t[1]+t2[6] - ADD r12,r6, r12 ; r12= t[2]+t2[5] - ADD r4, r9, r4 ; r4 = t2[3]+t[4] - STRH r11,[r0], #2 ; y[0] = t[0]+t[7] - STRH r5, [r0, #14] ; y[1] = t[1]+t2[6] - STRH r12,[r0, #30] ; y[2] = t[2]+t2[5] - STRH r4, [r0, #46] ; y[3] = t2[3]+t[4] - RSB r11,r11,r3, LSL #1 ; r11= t2[0] - t[7] - RSB r5, r5, r10,LSL #1 ; r5 = t[1] - t2[6] - RSB r12,r12,r6, LSL #1 ; r12= t[2] - t2[5] - RSB r4, r4, r9, LSL #1 ; r4 = t2[3] - t[4] - STRH r4, [r0, #62] ; y[4] = t2[3]-t[4] - STRH r12,[r0, #78] ; y[5] = t[2]-t2[5] - STRH r5, [r0, #94] ; y[6] = t[1]-t2[6] - STRH r11,[r0, #110] ; y[7] = t2[0]-t[7] - MOV PC,R14 - ENDP - -idct3core_down_arm PROC - LDRSH r9, [r1], #16 ; r9 = x[0] - LDR r12,OC_C4S4 ; r12= OC_C4S4 - LDRSH r3, [r1, #-12] ; r3 = x[2] - LDR r10,OC_C6S2 ; r10= OC_C6S2 - MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] - LDR r4, OC_C2S6 ; r4 = OC_C2S6 - MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2] - LDRSH r11,[r1, #-14] ; r11= x[1] - MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2] - LDR r4, OC_C7S1 ; r4 = OC_C7S1 - LDR r5, OC_C1S7 ; r5 = OC_C1S7 - MOV r9, r9, ASR #16 ; r9 = t[0] - MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1] - ADD r9, r9, #8 ; r9 = t[0]+8 - MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1] - ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3]+8 - MOV r4, r4, ASR #16 ; r4 = t[4] - MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4] - MOV r11,r11,ASR #16 ; r11= t[7] - MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] - ADD r10,r9, r10,ASR #16 ; 
r10= t[1]+8 = t[0]+t[2]+8 - RSB r6, r10,r9, LSL #1 ; r6 = t[2]+8 = t[0]-t[2]+8 - ; r3 = t2[0]+8 = t[0]+t[3]+8 - RSB r9, r3, r9, LSL #1 ; r9 = t2[3]+8 = t[0]-t[3]+8 - MOV r12,r12,ASR #16 ; r12= t[6] - ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5] - RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5] - ADD r11,r3, r11 ; r11= t2[0]+t[7] +8 - ADD r5, r10,r5 ; r5 = t[1] +t2[6]+8 - ADD r12,r6, r12 ; r12= t[2] +t2[5]+8 - ADD r4, r9, r4 ; r4 = t2[3]+t[4] +8 - RSB r3, r11,r3, LSL #1 ; r3 = t2[0] - t[7] + 8 - RSB r10,r5, r10,LSL #1 ; r10= t[1] - t2[6] + 8 - RSB r6, r12,r6, LSL #1 ; r6 = t[2] - t2[5] + 8 - RSB r9, r4, r9, LSL #1 ; r9 = t2[3] - t[4] + 8 - ; TODO: This is wrong. - ; The C code truncates to 16 bits by storing to RAM and doing the - ; shifts later; we've got an extra 4 bits here. - MOV r11,r11,ASR #4 - MOV r5, r5, ASR #4 - MOV r12,r12,ASR #4 - MOV r4, r4, ASR #4 - MOV r9, r9, ASR #4 - MOV r6, r6, ASR #4 - MOV r10,r10,ASR #4 - MOV r3, r3, ASR #4 - STRH r11,[r0], #2 ; y[0] = t[0]+t[7] - STRH r5, [r0, #14] ; y[1] = t[1]+t2[6] - STRH r12,[r0, #30] ; y[2] = t[2]+t2[5] - STRH r4, [r0, #46] ; y[3] = t2[3]+t[4] - STRH r9, [r0, #62] ; y[4] = t2[3]-t[4] - STRH r6, [r0, #78] ; y[5] = t[2]-t2[5] - STRH r10,[r0, #94] ; y[6] = t[1]-t2[6] - STRH r3, [r0, #110] ; y[7] = t2[0]-t[7] - MOV PC,R14 - ENDP - -idct4core_arm PROC - ; r0 = ogg_int16_t *_y (destination) - ; r1 = const ogg_int16_t *_x (source) - LDRSH r9, [r1], #16 ; r9 = x[0] - LDR r10,OC_C4S4 ; r10= OC_C4S4 - LDRSH r12,[r1, #-12] ; r12= x[2] - LDR r4, OC_C6S2 ; r4 = OC_C6S2 - MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] - LDR r5, OC_C2S6 ; r5 = OC_C2S6 - MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2] - LDRSH r3, [r1, #-14] ; r3 = x[1] - MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2] - LDR r6, OC_C7S1 ; r6 = OC_C7S1 - LDR r12,OC_C1S7 ; r12= OC_C1S7 - LDRSH r11,[r1, #-10] ; r11= x[3] - MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1] - LDR r7, OC_C5S3 ; r7 = OC_C5S3 - MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1] - LDR r8, 
OC_C3S5 ; r8 = OC_C3S5 - MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3] - MOV r9, r9, ASR #16 ; r9 = t[0] - MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3] - MOV r6, r6, ASR #16 ; r6 = t[4] -; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit -; before multiplying, not after (this is not equivalent) - SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5]) - RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5] - MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5]) - MOV r3, r3, ASR #16 ; r3 = t[7] - ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6] - RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6] - MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6]) - ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2] - RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2] - ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3] - RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3] - MOV r3, r3, ASR #16 ; r3 = t2[6] - ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5] - RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5] - ADD r11,r5, r11 ; r11= t[0]+t2[7] - ADD r6, r4, r6 ; r6 = t[1]+t3[6] - ADD r3, r10,r3 ; r3 = t[2]+t3[5] - ADD r7, r9, r7 ; r7 = t[3]+t2[4] - STRH r11,[r0], #2 ; y[0] = t[0]+t[7] - STRH r6, [r0, #14] ; y[1] = t[1]+t2[6] - STRH r3, [r0, #30] ; y[2] = t[2]+t2[5] - STRH r7, [r0, #46] ; y[3] = t2[3]+t[4] - RSB r11,r11,r5, LSL #1 ; r11= t[0]-t2[7] - RSB r6, r6, r4, LSL #1 ; r6 = t[1]-t3[6] - RSB r3, r3, r10,LSL #1 ; r3 = t[2]-t3[5] - RSB r7, r7, r9, LSL #1 ; r7 = t[3]-t2[4] - STRH r7, [r0, #62] ; y[4] = t2[3]-t[4] - STRH r3, [r0, #78] ; y[5] = t[2]-t2[5] - STRH r6, [r0, #94] ; y[6] = t[1]-t2[6] - STRH r11, [r0, #110] ; y[7] = t2[0]-t[7] - MOV PC,r14 - ENDP - -idct4core_down_arm PROC - ; r0 = ogg_int16_t *_y (destination) - ; r1 = const ogg_int16_t *_x (source) - LDRSH r9, [r1], #16 ; r9 = x[0] - LDR r10,OC_C4S4 ; r10= OC_C4S4 - LDRSH r12,[r1, #-12] ; r12= x[2] - LDR r4, OC_C6S2 ; r4 = OC_C6S2 - MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] - LDR r5, OC_C2S6 ; r5 = OC_C2S6 - 
MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2] - LDRSH r3, [r1, #-14] ; r3 = x[1] - MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2] - LDR r6, OC_C7S1 ; r6 = OC_C7S1 - LDR r12,OC_C1S7 ; r12= OC_C1S7 - LDRSH r11,[r1, #-10] ; r11= x[3] - MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1] - LDR r7, OC_C5S3 ; r7 = OC_C5S3 - MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1] - LDR r8, OC_C3S5 ; r8 = OC_C3S5 - MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3] - MOV r9, r9, ASR #16 ; r9 = t[0] - MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3] - MOV r6, r6, ASR #16 ; r6 = t[4] -; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit -; before multiplying, not after (this is not equivalent) - SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5]) - RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5] - MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5]) - MOV r3, r3, ASR #16 ; r3 = t[7] - ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6] - RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6] - ADD r9, r9, #8 ; r9 = t[0]+8 - MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6]) - ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2] + 8 - RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2] + 8 - ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3] + 8 - RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3] + 8 - MOV r3, r3, ASR #16 ; r3 = t2[6] - ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5] - RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5] - ADD r5, r5, r11 ; r5 = t[0]+t2[7]+8 - ADD r4, r4, r6 ; r4 = t[1]+t3[6]+8 - ADD r10,r10,r3 ; r10= t[2]+t3[5]+8 - ADD r9, r9, r7 ; r9 = t[3]+t2[4]+8 - SUB r11,r5, r11,LSL #1 ; r11= t[0]-t2[7]+8 - SUB r6, r4, r6, LSL #1 ; r6 = t[1]-t3[6]+8 - SUB r3, r10,r3, LSL #1 ; r3 = t[2]-t3[5]+8 - SUB r7, r9, r7, LSL #1 ; r7 = t[3]-t2[4]+8 - ; TODO: This is wrong. - ; The C code truncates to 16 bits by storing to RAM and doing the - ; shifts later; we've got an extra 4 bits here. 
- MOV r11,r11,ASR #4 - MOV r6, r6, ASR #4 - MOV r3, r3, ASR #4 - MOV r7, r7, ASR #4 - MOV r9, r9, ASR #4 - MOV r10,r10,ASR #4 - MOV r4, r4, ASR #4 - MOV r5, r5, ASR #4 - STRH r5,[r0], #2 ; y[0] = t[0]+t[7] - STRH r4, [r0, #14] ; y[1] = t[1]+t2[6] - STRH r10,[r0, #30] ; y[2] = t[2]+t2[5] - STRH r9, [r0, #46] ; y[3] = t2[3]+t[4] - STRH r7, [r0, #62] ; y[4] = t2[3]-t[4] - STRH r3, [r0, #78] ; y[5] = t[2]-t2[5] - STRH r6, [r0, #94] ; y[6] = t[1]-t2[6] - STRH r11,[r0, #110] ; y[7] = t2[0]-t[7] - MOV PC,r14 - ENDP - -idct8core_arm PROC - ; r0 = ogg_int16_t *_y (destination) - ; r1 = const ogg_int16_t *_x (source) - LDRSH r2, [r1],#16 ; r2 = x[0] - STMFD r13!,{r1,r14} - LDRSH r6, [r1, #-8] ; r6 = x[4] - LDR r12,OC_C4S4 ; r12= C4S4 - LDRSH r4, [r1, #-12] ; r4 = x[2] - ADD r2, r2, r6 ; r2 = x[0] + x[4] - SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4] - ; For spec compliance, these sums must be truncated to 16-bit precision - ; _before_ the multiply (not after). - ; Sadly, ARMv4 provides no simple way to do that. 
- MOV r2, r2, LSL #16 - MOV r6, r6, LSL #16 - MOV r2, r2, ASR #16 - MOV r6, r6, ASR #16 - MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4]) - LDRSH r8, [r1, #-4] ; r8 = x[6] - LDR r7, OC_C6S2 ; r7 = OC_C6S2 - MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4]) - LDR r14,OC_C2S6 ; r14= OC_C2S6 - MUL r3, r4, r7 ; r3 = OC_C6S2*x[2] - LDR r5, OC_C7S1 ; r5 = OC_C7S1 - MUL r4, r14,r4 ; r4 = OC_C2S6*x[2] - MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16 - MUL r14,r8, r14 ; r14= OC_C2S6*x[6] - MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16 - MUL r8, r7, r8 ; r8 = OC_C6S2*x[6] - LDR r7, OC_C1S7 ; r7 = OC_C1S7 - SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16 - LDRSH r14,[r1, #-14] ; r14= x[1] - ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16 - LDRSH r8, [r1, #-2] ; r8 = x[7] - MUL r9, r5, r14 ; r9 = OC_C7S1*x[1] - LDRSH r10,[r1, #-6] ; r10= x[5] - MUL r14,r7, r14 ; r14= OC_C1S7*x[1] - MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16 - MUL r7, r8, r7 ; r7 = OC_C1S7*x[7] - MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16 - MUL r8, r5, r8 ; r8 = OC_C7S1*x[7] - LDRSH r1, [r1, #-10] ; r1 = x[3] - LDR r5, OC_C3S5 ; r5 = OC_C3S5 - LDR r11,OC_C5S3 ; r11= OC_C5S3 - ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16 - MUL r14,r5, r10 ; r14= OC_C3S5*x[5] - SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16 - MUL r10,r11,r10 ; r10= OC_C5S3*x[5] - MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16 - MUL r11,r1, r11 ; r11= OC_C5S3*x[3] - MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16 - MUL r1, r5, r1 ; r1 = OC_C3S5*x[3] - SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16 - ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16 - ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4] - ; r10=t[6] r12=C4S4 r14=t[5] -; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit -; before multiplying, not after (this is not equivalent) - ; Stage 2 - ; 4-5 butterfly - ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5] - SUB r14,r9, r14, LSL #1 
; r14= t[4]-t[5] - MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5]) - ; 7-6 butterfly - ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6] - SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6] - MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6]) - ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4] - ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16 - ; Stage 3 - ; 0-3 butterfly - ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3] - SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3] - ; 1-2 butterfly - ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2] - SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2] - ; 6-5 butterfly - MOV r14,r14,ASR #16 ; r14= t2[5] - ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5] - SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5] - ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4] - ; r10=t3[6] r14=t3[5] - ; Stage 4 - ADD r2, r2, r8 ; r2 = t[0] + t[7] - ADD r6, r6, r10 ; r6 = t[1] + t[6] - ADD r3, r3, r14 ; r3 = t[2] + t[5] - ADD r4, r4, r9 ; r4 = t[3] + t[4] - SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7] - SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6] - SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5] - SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4] - STRH r2, [r0], #2 ; y[0] = t[0]+t[7] - STRH r6, [r0, #14] ; y[1] = t[1]+t[6] - STRH r3, [r0, #30] ; y[2] = t[2]+t[5] - STRH r4, [r0, #46] ; y[3] = t[3]+t[4] - STRH r9, [r0, #62] ; y[4] = t[3]-t[4] - STRH r14,[r0, #78] ; y[5] = t[2]-t[5] - STRH r10,[r0, #94] ; y[6] = t[1]-t[6] - STRH r8, [r0, #110] ; y[7] = t[0]-t[7] - LDMFD r13!,{r1,PC} - ENDP - -idct8core_down_arm PROC - ; r0 = ogg_int16_t *_y (destination) - ; r1 = const ogg_int16_t *_x (source) - LDRSH r2, [r1],#16 ; r2 = x[0] - STMFD r13!,{r1,r14} - LDRSH r6, [r1, #-8] ; r6 = x[4] - LDR r12,OC_C4S4 ; r12= C4S4 - LDRSH r4, [r1, #-12] ; r4 = x[2] - ADD r2, r2, r6 ; r2 = x[0] + x[4] - SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4] - ; For spec compliance, these sums must be truncated to 16-bit precision - ; _before_ the multiply (not after). 
- ; Sadly, ARMv4 provides no simple way to do that. - MOV r2, r2, LSL #16 - MOV r6, r6, LSL #16 - MOV r2, r2, ASR #16 - MOV r6, r6, ASR #16 - MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4]) - LDRSH r8, [r1, #-4] ; r8 = x[6] - LDR r7, OC_C6S2 ; r7 = OC_C6S2 - MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4]) - LDR r14,OC_C2S6 ; r14= OC_C2S6 - MUL r3, r4, r7 ; r3 = OC_C6S2*x[2] - LDR r5, OC_C7S1 ; r5 = OC_C7S1 - MUL r4, r14,r4 ; r4 = OC_C2S6*x[2] - MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16 - MUL r14,r8, r14 ; r14= OC_C2S6*x[6] - MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16 - MUL r8, r7, r8 ; r8 = OC_C6S2*x[6] - LDR r7, OC_C1S7 ; r7 = OC_C1S7 - SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16 - LDRSH r14,[r1, #-14] ; r14= x[1] - ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16 - LDRSH r8, [r1, #-2] ; r8 = x[7] - MUL r9, r5, r14 ; r9 = OC_C7S1*x[1] - LDRSH r10,[r1, #-6] ; r10= x[5] - MUL r14,r7, r14 ; r14= OC_C1S7*x[1] - MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16 - MUL r7, r8, r7 ; r7 = OC_C1S7*x[7] - MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16 - MUL r8, r5, r8 ; r8 = OC_C7S1*x[7] - LDRSH r1, [r1, #-10] ; r1 = x[3] - LDR r5, OC_C3S5 ; r5 = OC_C3S5 - LDR r11,OC_C5S3 ; r11= OC_C5S3 - ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16 - MUL r14,r5, r10 ; r14= OC_C3S5*x[5] - SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16 - MUL r10,r11,r10 ; r10= OC_C5S3*x[5] - MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16 - MUL r11,r1, r11 ; r11= OC_C5S3*x[3] - MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16 - MUL r1, r5, r1 ; r1 = OC_C3S5*x[3] - SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16 - ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16 - ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4] - ; r10=t[6] r12=C4S4 r14=t[5] - ; Stage 2 -; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit -; before multiplying, not after (this is not equivalent) - ; 4-5 butterfly - ADD r9, r9, r14 
; r9 = t2[4] = t[4]+t[5] - SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5] - MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5]) - ; 7-6 butterfly - ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6] - SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6] - MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6]) - ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4] - ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16 - ; Stage 3 - ADD r2, r2, #8<<16 ; r2 = t[0]+8<<16 - ADD r6, r6, #8<<16 ; r6 = t[1]+8<<16 - ; 0-3 butterfly - ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3] + 8 - SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3] + 8 - ; 1-2 butterfly - ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2] + 8 - SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2] + 8 - ; 6-5 butterfly - MOV r14,r14,ASR #16 ; r14= t2[5] - ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5] - SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5] - ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4] - ; r10=t3[6] r14=t3[5] - ; Stage 4 - ADD r2, r2, r8 ; r2 = t[0] + t[7] + 8 - ADD r6, r6, r10 ; r6 = t[1] + t[6] + 8 - ADD r3, r3, r14 ; r3 = t[2] + t[5] + 8 - ADD r4, r4, r9 ; r4 = t[3] + t[4] + 8 - SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7] + 8 - SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6] + 8 - SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5] + 8 - SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4] + 8 - ; TODO: This is wrong. - ; The C code truncates to 16 bits by storing to RAM and doing the - ; shifts later; we've got an extra 4 bits here. 
- MOV r2, r2, ASR #4 - MOV r6, r6, ASR #4 - MOV r3, r3, ASR #4 - MOV r4, r4, ASR #4 - MOV r8, r8, ASR #4 - MOV r10,r10,ASR #4 - MOV r14,r14,ASR #4 - MOV r9, r9, ASR #4 - STRH r2, [r0], #2 ; y[0] = t[0]+t[7] - STRH r6, [r0, #14] ; y[1] = t[1]+t[6] - STRH r3, [r0, #30] ; y[2] = t[2]+t[5] - STRH r4, [r0, #46] ; y[3] = t[3]+t[4] - STRH r9, [r0, #62] ; y[4] = t[3]-t[4] - STRH r14,[r0, #78] ; y[5] = t[2]-t[5] - STRH r10,[r0, #94] ; y[6] = t[1]-t[6] - STRH r8, [r0, #110] ; y[7] = t[0]-t[7] - LDMFD r13!,{r1,PC} - ENDP - - [ OC_ARM_ASM_MEDIA - EXPORT oc_idct8x8_1_v6 - EXPORT oc_idct8x8_v6 - -oc_idct8x8_1_v6 PROC - ; r0 = ogg_int16_t *_y - ; r1 = ogg_uint16_t _dc - ORR r2, r1, r1, LSL #16 - ORR r3, r1, r1, LSL #16 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - STRD r2, [r0], #8 - MOV PC, r14 - ENDP - -oc_idct8x8_v6 PROC - ; r0 = ogg_int16_t *_y - ; r1 = ogg_int16_t *_x - ; r2 = int _last_zzi - CMP r2, #3 - BLE oc_idct8x8_3_v6 - ;CMP r2, #6 - ;BLE oc_idct8x8_6_v6 - CMP r2, #10 - BLE oc_idct8x8_10_v6 -oc_idct8x8_slow_v6 - STMFD r13!,{r4-r11,r14} - SUB r13,r13,#64*2 -; Row transforms - STR r0, [r13,#-4]! - ADD r0, r13, #4 ; Write to temp storage. - BL idct8_8core_v6 - BL idct8_8core_v6 - BL idct8_8core_v6 - BL idct8_8core_v6 - LDR r0, [r13], #4 ; Write to the final destination. - ; Clear input data for next block (decoder only). - SUB r2, r1, #8*16 - CMP r0, r2 - MOV r1, r13 ; And read from temp storage. 
- BEQ oc_idct8x8_slow_v6_cols - MOV r4, #0 - MOV r5, #0 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 -oc_idct8x8_slow_v6_cols -; Column transforms - BL idct8_8core_down_v6 - BL idct8_8core_down_v6 - BL idct8_8core_down_v6 - BL idct8_8core_down_v6 - ADD r13,r13,#64*2 - LDMFD r13!,{r4-r11,PC} - ENDP - -oc_idct8x8_10_v6 PROC - STMFD r13!,{r4-r11,r14} - SUB r13,r13,#64*2+4 -; Row transforms - MOV r2, r13 - STR r0, [r13,#-4]! - AND r0, r2, #4 ; Align the stack. - ADD r0, r0, r2 ; Write to temp storage. - BL idct4_3core_v6 - BL idct2_1core_v6 - LDR r0, [r13], #4 ; Write to the final destination. - ; Clear input data for next block (decoder only). - SUB r2, r1, #4*16 - CMP r0, r2 - AND r1, r13,#4 ; Align the stack. - BEQ oc_idct8x8_10_v6_cols - MOV r4, #0 - MOV r5, #0 - STRD r4, [r2] - STRD r4, [r2,#16] - STR r4, [r2,#32] - STR r4, [r2,#48] -oc_idct8x8_10_v6_cols -; Column transforms - ADD r1, r1, r13 ; And read from temp storage. - BL idct4_4core_down_v6 - BL idct4_4core_down_v6 - BL idct4_4core_down_v6 - BL idct4_4core_down_v6 - ADD r13,r13,#64*2+4 - LDMFD r13!,{r4-r11,PC} - ENDP - -oc_idct8x8_3_v6 PROC - STMFD r13!,{r4-r8,r14} - SUB r13,r13,#64*2 -; Row transforms - MOV r8, r0 - MOV r0, r13 ; Write to temp storage. - BL idct2_1core_v6 - ; Clear input data for next block (decoder only). - SUB r0, r1, #2*16 - CMP r0, r8 - MOV r1, r13 ; Read from temp storage. - MOVNE r4, #0 - STRNE r4, [r0] - STRNE r4, [r0,#16] - MOVNE r0, r8 ; Write to the final destination. 
-; Column transforms - BL idct2_2core_down_v6 - BL idct2_2core_down_v6 - BL idct2_2core_down_v6 - BL idct2_2core_down_v6 - ADD r13,r13,#64*2 - LDMFD r13!,{r4-r8,PC} - ENDP - -idct2_1core_v6 PROC - ; r0 = ogg_int16_t *_y (destination) - ; r1 = const ogg_int16_t *_x (source) -; Stage 1: - LDR r2, [r1], #16 ; r2 = <x[0,1]|x[0,0]> - LDR r3, OC_C4S4 - LDRSH r6, [r1], #16 ; r6 = x[1,0] - SMULWB r12,r3, r2 ; r12= t[0,0]=OC_C4S4*x[0,0]>>16 - LDRD r4, OC_C7S1 ; r4 = OC_C7S1; r5 = OC_C1S7 - SMULWB r6, r3, r6 ; r6 = t[1,0]=OC_C4S4*x[1,0]>>16 - SMULWT r4, r4, r2 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16 - SMULWT r7, r5, r2 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 -; Stage 2: - SMULWB r5, r3, r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16 - PKHBT r12,r12,r6, LSL #16 ; r12= <t[1,0]|t[0,0]> - SMULWB r6, r3, r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16 - PKHBT r7, r7, r3 ; r7 = <0|t[0,7]> -; Stage 3: - PKHBT r5, r6, r5, LSL #16 ; r5 = <t[0,5]|t[0,6]> - PKHBT r4, r4, r3 ; r4 = <0|t[0,4]> - SASX r5, r5, r5 ; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]> -; Stage 4: - PKHTB r6, r3, r5, ASR #16 ; r6 = <0|t[0,6]> - PKHBT r5, r5, r3 ; r5 = <0|t[0,5]> - SADD16 r3, r12,r7 ; r3 = t[0]+t[7] - STR r3, [r0], #4 ; y[0<<3] = t[0]+t[7] - SADD16 r3, r12,r6 ; r3 = t[0]+t[6] - STR r3, [r0, #12] ; y[1<<3] = t[0]+t[6] - SADD16 r3, r12,r5 ; r3 = t[0]+t[5] - STR r3, [r0, #28] ; y[2<<3] = t[0]+t[5] - SADD16 r3, r12,r4 ; r3 = t[0]+t[4] - STR r3, [r0, #44] ; y[3<<3] = t[0]+t[4] - SSUB16 r4, r12,r4 ; r4 = t[0]-t[4] - STR r4, [r0, #60] ; y[4<<3] = t[0]-t[4] - SSUB16 r5, r12,r5 ; r5 = t[0]-t[5] - STR r5, [r0, #76] ; y[5<<3] = t[0]-t[5] - SSUB16 r6, r12,r6 ; r6 = t[0]-t[6] - STR r6, [r0, #92] ; y[6<<3] = t[0]-t[6] - SSUB16 r7, r12,r7 ; r7 = t[0]-t[7] - STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7] - MOV PC,r14 - ENDP - ] - - ALIGN 8 -OC_C7S1 - DCD 12785 ; 31F1 -OC_C1S7 - DCD 64277 ; FB15 -OC_C6S2 - DCD 25080 ; 61F8 -OC_C2S6 - DCD 60547 ; EC83 -OC_C5S3 - DCD 36410 ; 8E3A -OC_C3S5 - DCD 54491 ; D4DB -OC_C4S4 - DCD 46341 ; B505 - - [ OC_ARM_ASM_MEDIA 
-idct2_2core_down_v6 PROC - ; r0 = ogg_int16_t *_y (destination) - ; r1 = const ogg_int16_t *_x (source) -; Stage 1: - LDR r2, [r1], #16 ; r2 = <x[0,1]|x[0,0]> - LDR r3, OC_C4S4 - MOV r7 ,#8 ; r7 = 8 - LDR r6, [r1], #16 ; r6 = <x[1,1]|x[1,0]> - SMLAWB r12,r3, r2, r7 ; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8 - LDRD r4, OC_C7S1 ; r4 = OC_C7S1; r5 = OC_C1S7 - SMLAWB r7, r3, r6, r7 ; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8 - SMULWT r5, r5, r2 ; r5 = t[0,7]=OC_C1S7*x[0,1]>>16 - PKHBT r12,r12,r7, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8> - SMULWT r4, r4, r2 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16 -; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition. - PKHBT r7, r5, r5, LSL #16 ; r7 = <t[0,7]|t[0,7]> -; Stage 2: - SMULWB r6, r3, r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16 - PKHBT r4, r4, r4, LSL #16 ; r4 = <t[0,4]|t[0,4]> - SMULWT r2, r3, r7 ; r2 = t[1,6]=OC_C4S4*t[1,7]>>16 - SMULWB r5, r3, r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16 - PKHBT r6, r6, r2, LSL #16 ; r6 = <t[1,6]|t[0,6]> - SMULWT r2, r3, r4 ; r2 = t[1,5]=OC_C4S4*t[1,4]>>16 - PKHBT r2, r5, r2, LSL #16 ; r2 = <t[1,5]|t[0,5]> -; Stage 3: - SSUB16 r5, r6, r2 ; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]> - SADD16 r6, r6, r2 ; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]> -; Stage 4: - SADD16 r2, r12,r7 ; r2 = t[0]+t[7]+8 - MOV r3, r2, ASR #4 - MOV r2, r2, LSL #16 - PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[7]+8>>4 - STR r3, [r0], #4 ; y[0<<3] = t[0]+t[7]+8>>4 - SADD16 r2, r12,r6 ; r2 = t[0]+t[6]+8 - MOV r3, r2, ASR #4 - MOV r2, r2, LSL #16 - PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[6]+8>>4 - STR r3, [r0, #12] ; y[1<<3] = t[0]+t[6]+8>>4 - SADD16 r2, r12,r5 ; r2 = t[0]+t[5]+8 - MOV r3, r2, ASR #4 - MOV r2, r2, LSL #16 - PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[5]+8>>4 - STR r3, [r0, #28] ; y[2<<3] = t[0]+t[5]+8>>4 - SADD16 r2, r12,r4 ; r2 = t[0]+t[4]+8 - MOV r3, r2, ASR #4 - MOV r2, r2, LSL #16 - PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[4]+8>>4 - STR r3, [r0, #44] ; y[3<<3] = t[0]+t[4]+8>>4 - SSUB16 r4, r12,r4 ; r4 = t[0]-t[4]+8 - MOV r3, r4, ASR #4 - MOV r4, 
r4, LSL #16 - PKHTB r3, r3, r4, ASR #20 ; r3 = t[0]-t[4]+8>>4 - STR r3, [r0, #60] ; y[4<<3] = t[0]-t[4]+8>>4 - SSUB16 r5, r12,r5 ; r5 = t[0]-t[5]+8 - MOV r3, r5, ASR #4 - MOV r5, r5, LSL #16 - PKHTB r3, r3, r5, ASR #20 ; r3 = t[0]-t[5]+8>>4 - STR r3, [r0, #76] ; y[5<<3] = t[0]-t[5]+8>>4 - SSUB16 r6, r12,r6 ; r6 = t[0]-t[6]+8 - MOV r3, r6, ASR #4 - MOV r6, r6, LSL #16 - PKHTB r3, r3, r6, ASR #20 ; r3 = t[0]-t[6]+8>>4 - STR r3, [r0, #92] ; y[6<<3] = t[0]-t[6]+8>>4 - SSUB16 r7, r12,r7 ; r7 = t[0]-t[7]+8 - MOV r3, r7, ASR #4 - MOV r7, r7, LSL #16 - PKHTB r3, r3, r7, ASR #20 ; r3 = t[0]-t[7]+8>>4 - STR r3, [r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4 - MOV PC,r14 - ENDP - -; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to -; pay for increased branch mis-prediction to get here, but in practice it -; doesn't seem to slow anything down to take it out, and it's less code this -; way. - [ 0 -oc_idct8x8_6_v6 PROC - STMFD r13!,{r4-r8,r10,r11,r14} - SUB r13,r13,#64*2+4 -; Row transforms - MOV r8, r0 - AND r0, r13,#4 ; Align the stack. - ADD r0, r0, r13 ; Write to temp storage. - BL idct3_2core_v6 - BL idct1core_v6 - ; Clear input data for next block (decoder only). - SUB r0, r1, #3*16 - CMP r0, r8 - AND r1, r13,#4 ; Align the stack. - BEQ oc_idct8x8_6_v6_cols - MOV r4, #0 - MOV r5, #0 - STRD r4, [r0] - STR r4, [r0,#16] - STR r4, [r0,#32] - MOV r0, r8 ; Write to the final destination. -oc_idct8x8_6_v6_cols -; Column transforms - ADD r1, r1, r13 ; And read from temp storage. - BL idct3_3core_down_v6 - BL idct3_3core_down_v6 - BL idct3_3core_down_v6 - BL idct3_3core_down_v6 - ADD r13,r13,#64*2+4 - LDMFD r13!,{r4-r8,r10,r11,PC} - ENDP - -idct1core_v6 PROC - ; r0 = ogg_int16_t *_y (destination) - ; r1 = const ogg_int16_t *_x (source) - LDRSH r3, [r1], #16 - MOV r12,#0x05 - ORR r12,r12,#0xB500 - MUL r3, r12, r3 - ; Stall ? - MOV r3, r3, ASR #16 - ; Don't need to actually store the odd lines; they won't be read. 
- STRH r3, [r0], #2 - STRH r3, [r0, #30] - STRH r3, [r0, #62] - STRH r3, [r0, #94] - MOV PC,R14 - ENDP - -idct3_2core_v6 PROC - ; r0 = ogg_int16_t *_y (destination) - ; r1 = const ogg_int16_t *_x (source) -; Stage 1: - LDRD r4, [r1], #16 ; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]> - LDRD r10,OC_C6S2_3_v6 ; r10= OC_C6S2; r11= OC_C2S6 - ; Stall - SMULWB r3, r11,r5 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16 - LDR r11,OC_C4S4 - SMULWB r2, r10,r5 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16 - LDR r5, [r1], #16 ; r5 = <x[1,1]|x[1,0]> - SMULWB r12,r11,r4 ; r12= (t[0,0]=OC_C4S4*x[0,0]>>16) - LDRD r6, OC_C7S1_3_v6 ; r6 = OC_C7S1; r7 = OC_C1S7 - SMULWB r10,r11,r5 ; r10= (t[1,0]=OC_C4S4*x[1,0]>>16) - PKHBT r12,r12,r10,LSL #16 ; r12= <t[1,0]|t[0,0]> - SMULWT r10,r7, r5 ; r10= t[1,7]=OC_C1S7*x[1,1]>>16 - PKHBT r2, r2, r11 ; r2 = <0|t[0,2]> - SMULWT r7, r7, r4 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 - PKHBT r3, r3, r11 ; r3 = <0|t[0,3]> - SMULWT r5, r6, r5 ; r5 = t[1,4]=OC_C7S1*x[1,1]>>16 - PKHBT r7, r7, r10,LSL #16 ; r7 = <t[1,7]|t[0,7]> - SMULWT r4, r6, r4 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16 -; Stage 2: - SMULWB r6, r11,r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16 - PKHBT r4, r4, r5, LSL #16 ; r4 = <t[1,4]|t[0,4]> - SMULWT r10,r11,r7 ; r10= t[1,6]=OC_C4S4*t[1,7]>>16 - SMULWB r5, r11,r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16 - PKHBT r6, r6, r10,LSL #16 ; r6 = <t[1,6]|t[0,6]> - SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16 -; Stage 3: - B idct4_3core_stage3_v6 - ENDP - -; Another copy so the LDRD offsets are less than +/- 255. 
- ALIGN 8 -OC_C7S1_3_v6 - DCD 12785 ; 31F1 -OC_C1S7_3_v6 - DCD 64277 ; FB15 -OC_C6S2_3_v6 - DCD 25080 ; 61F8 -OC_C2S6_3_v6 - DCD 60547 ; EC83 - -idct3_3core_down_v6 PROC - ; r0 = ogg_int16_t *_y (destination) - ; r1 = const ogg_int16_t *_x (source) -; Stage 1: - LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]> - LDRD r6, OC_C6S2_3_v6 ; r6 = OC_C6S2; r7 = OC_C2S6 - LDR r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]> - SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16 - MOV r7,#8 - SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16 - LDR r11,OC_C4S4 - SMLAWB r12,r11,r10,r7 ; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8 -; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition. - PKHBT r3, r3, r3, LSL #16 ; r3 = <t[0,3]|t[0,3]> - SMLAWB r5, r11,r4, r7 ; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8 - PKHBT r2, r2, r2, LSL #16 ; r2 = <t[0,2]|t[0,2]> - LDRD r6, OC_C7S1_3_v6 ; r6 = OC_C7S1; r7 = OC_C1S7 - PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8> - SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16 - SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 - SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16 - PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]> - SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16 -; Stage 2: - SMULWB r6, r11,r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16 - PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]> - SMULWT r10,r11,r7 ; r10= t[1,6]=OC_C4S4*t[1,7]>>16 - SMULWB r5, r11,r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16 - PKHBT r6, r6, r10,LSL #16 ; r6 = <t[1,6]|t[0,6]> - SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16 -; Stage 3: - B idct4_4core_down_stage3_v6 - ENDP - ] - -idct4_3core_v6 PROC - ; r0 = ogg_int16_t *_y (destination) - ; r1 = const ogg_int16_t *_x (source) -; Stage 1: - LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]> - LDRD r2, OC_C5S3_4_v6 ; r2 = OC_C5S3; r3 = OC_C3S5 - LDRD r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]> - SMULWT r9, r3, r11 ; r9 = t[0,6]=OC_C3S5*x[0,3]>>16 - SMULWT r8, r2, r11 ; r8 = 
-t[0,5]=OC_C5S3*x[0,3]>>16 - PKHBT r9, r9, r2 ; r9 = <0|t[0,6]> - LDRD r6, OC_C6S2_4_v6 ; r6 = OC_C6S2; r7 = OC_C2S6 - PKHBT r8, r8, r2 ; r8 = <0|-t[0,5]> - SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16 - SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16 - LDR r11,OC_C4S4 - SMULWB r12,r7, r5 ; r12= t[1,3]=OC_C2S6*x[1,2]>>16 - SMULWB r5, r6, r5 ; r5 = t[1,2]=OC_C6S2*x[1,2]>>16 - PKHBT r3, r3, r12,LSL #16 ; r3 = <t[1,3]|t[0,3]> - SMULWB r12,r11,r10 ; r12= t[0,0]=OC_C4S4*x[0,0]>>16 - PKHBT r2, r2, r5, LSL #16 ; r2 = <t[1,2]|t[0,2]> - SMULWB r5, r11,r4 ; r5 = t[1,0]=OC_C4S4*x[1,0]>>16 - LDRD r6, OC_C7S1_4_v6 ; r6 = OC_C7S1; r7 = OC_C1S7 - PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]|t[0,0]> - SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16 - SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 - SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16 - PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]> - SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16 -; Stage 2: - SSUB16 r6, r7, r9 ; r6 = t[7]-t[6] - PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]> - SADD16 r7, r7, r9 ; r7 = t[7]=t[7]+t[6] - SMULWT r9, r11,r6 ; r9 = t[1,6]=OC_C4S4*r6T>>16 - SADD16 r5, r4, r8 ; r5 = t[4]-t[5] - SMULWB r6, r11,r6 ; r6 = t[0,6]=OC_C4S4*r6B>>16 - SSUB16 r4, r4, r8 ; r4 = t[4]=t[4]+t[5] - SMULWT r10,r11,r5 ; r10= t[1,5]=OC_C4S4*r5T>>16 - PKHBT r6, r6, r9, LSL #16 ; r6 = <t[1,6]|t[0,6]> - SMULWB r5, r11,r5 ; r5 = t[0,5]=OC_C4S4*r5B>>16 -; Stage 3: -idct4_3core_stage3_v6 - SADD16 r11,r12,r2 ; r11= t[1]=t[0]+t[2] - PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]> - SSUB16 r2, r12,r2 ; r2 = t[2]=t[0]-t[2] -idct4_3core_stage3_5_v6 - SSUB16 r5, r6, r10 ; r5 = t[5]'=t[6]-t[5] - SADD16 r6, r6, r10 ; r6 = t[6]=t[6]+t[5] - SADD16 r10,r12,r3 ; r10= t[0]'=t[0]+t[3] - SSUB16 r3, r12,r3 ; r3 = t[3]=t[0]-t[3] -; Stage 4: - SADD16 r12,r10,r7 ; r12= t[0]+t[7] - STR r12,[r0], #4 ; y[0<<3] = t[0]+t[7] - SADD16 r12,r11,r6 ; r12= t[1]+t[6] - STR r12,[r0, #12] ; y[1<<3] = t[1]+t[6] - SADD16 r12,r2, r5 ; r12= 
t[2]+t[5] - STR r12,[r0, #28] ; y[2<<3] = t[2]+t[5] - SADD16 r12,r3, r4 ; r12= t[3]+t[4] - STR r12,[r0, #44] ; y[3<<3] = t[3]+t[4] - SSUB16 r4, r3, r4 ; r4 = t[3]-t[4] - STR r4, [r0, #60] ; y[4<<3] = t[3]-t[4] - SSUB16 r5, r2, r5 ; r5 = t[2]-t[5] - STR r5, [r0, #76] ; y[5<<3] = t[2]-t[5] - SSUB16 r6, r11,r6 ; r6 = t[1]-t[6] - STR r6, [r0, #92] ; y[6<<3] = t[1]-t[6] - SSUB16 r7, r10,r7 ; r7 = t[0]-t[7] - STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7] - MOV PC,r14 - ENDP - -; Another copy so the LDRD offsets are less than +/- 255. - ALIGN 8 -OC_C7S1_4_v6 - DCD 12785 ; 31F1 -OC_C1S7_4_v6 - DCD 64277 ; FB15 -OC_C6S2_4_v6 - DCD 25080 ; 61F8 -OC_C2S6_4_v6 - DCD 60547 ; EC83 -OC_C5S3_4_v6 - DCD 36410 ; 8E3A -OC_C3S5_4_v6 - DCD 54491 ; D4DB - -idct4_4core_down_v6 PROC - ; r0 = ogg_int16_t *_y (destination) - ; r1 = const ogg_int16_t *_x (source) -; Stage 1: - LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]> - LDRD r2, OC_C5S3_4_v6 ; r2 = OC_C5S3; r3 = OC_C3S5 - LDRD r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]> - SMULWT r9, r3, r11 ; r9 = t[0,6]=OC_C3S5*x[0,3]>>16 - LDRD r6, OC_C6S2_4_v6 ; r6 = OC_C6S2; r7 = OC_C2S6 - SMULWT r8, r2, r11 ; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16 -; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition. 
- PKHBT r9, r9, r9, LSL #16 ; r9 = <t[0,6]|t[0,6]> - SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16 - PKHBT r8, r8, r8, LSL #16 ; r8 = <-t[0,5]|-t[0,5]> - SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16 - LDR r11,OC_C4S4 - SMULWB r12,r7, r5 ; r12= t[1,3]=OC_C2S6*x[1,2]>>16 - MOV r7,#8 - SMULWB r5, r6, r5 ; r5 = t[1,2]=OC_C6S2*x[1,2]>>16 - PKHBT r3, r3, r12,LSL #16 ; r3 = <t[1,3]|t[0,3]> - SMLAWB r12,r11,r10,r7 ; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8 - PKHBT r2, r2, r5, LSL #16 ; r2 = <t[1,2]|t[0,2]> - SMLAWB r5, r11,r4 ,r7 ; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8 - LDRD r6, OC_C7S1_4_v6 ; r6 = OC_C7S1; r7 = OC_C1S7 - PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8> - SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16 - SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 - SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16 - PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]> - SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16 -; Stage 2: - SSUB16 r6, r7, r9 ; r6 = t[7]-t[6] - PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]> - SADD16 r7, r7, r9 ; r7 = t[7]=t[7]+t[6] - SMULWT r9, r11,r6 ; r9 = t[1,6]=OC_C4S4*r6T>>16 - SADD16 r5, r4, r8 ; r5 = t[4]-t[5] - SMULWB r6, r11,r6 ; r6 = t[0,6]=OC_C4S4*r6B>>16 - SSUB16 r4, r4, r8 ; r4 = t[4]=t[4]+t[5] - SMULWT r10,r11,r5 ; r10= t[1,5]=OC_C4S4*r5T>>16 - PKHBT r6, r6, r9, LSL #16 ; r6 = <t[1,6]|t[0,6]> - SMULWB r5, r11,r5 ; r5 = t[0,5]=OC_C4S4*r5B>>16 -; Stage 3: -idct4_4core_down_stage3_v6 - SADD16 r11,r12,r2 ; r11= t[1]+8=t[0]+t[2]+8 - PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]> - SSUB16 r2, r12,r2 ; r2 = t[2]+8=t[0]-t[2]+8 - B idct8_8core_down_stage3_5_v6 - ENDP - -idct8_8core_v6 PROC - STMFD r13!,{r0,r14} -; Stage 1: - ;5-6 rotation by 3pi/16 - LDRD r10,OC_C5S3_4_v6 ; r10= OC_C5S3, r11= OC_C3S5 - LDR r4, [r1,#8] ; r4 = <x[0,5]|x[0,4]> - LDR r7, [r1,#24] ; r7 = <x[1,5]|x[1,4]> - SMULWT r5, r11,r4 ; r5 = OC_C3S5*x[0,5]>>16 - LDR r0, [r1,#4] ; r0 = <x[0,3]|x[0,2]> - SMULWT r3, r11,r7 ; r3 = OC_C3S5*x[1,5]>>16 - 
LDR r12,[r1,#20] ; r12= <x[1,3]|x[1,2]> - SMULWT r6, r11,r0 ; r6 = OC_C3S5*x[0,3]>>16 - SMULWT r11,r11,r12 ; r11= OC_C3S5*x[1,3]>>16 - SMLAWT r6, r10,r4, r6 ; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16) - PKHBT r5, r5, r3, LSL #16 ; r5 = <r3|r5> - SMLAWT r11,r10,r7, r11 ; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16) - PKHBT r4, r4, r7, LSL #16 ; r4 = <x[1,4]|x[0,4]> - SMULWT r3, r10,r0 ; r3 = OC_C5S3*x[0,3]>>16 - PKHBT r6, r6, r11,LSL #16 ; r6 = <t[1,6]|t[0,6]> - SMULWT r8, r10,r12 ; r8 = OC_C5S3*x[1,3]>>16 - ;2-3 rotation by 6pi/16 - LDRD r10,OC_C6S2_4_v6 ; r10= OC_C6S2, r11= OC_C2S6 - PKHBT r3, r3, r8, LSL #16 ; r3 = <r8|r3> - LDR r8, [r1,#12] ; r8 = <x[0,7]|x[0,6]> - SMULWB r2, r10,r0 ; r2 = OC_C6S2*x[0,2]>>16 - SSUB16 r5, r5, r3 ; r5 = <t[1,5]|t[0,5]> - SMULWB r9, r10,r12 ; r9 = OC_C6S2*x[1,2]>>16 - LDR r7, [r1,#28] ; r7 = <x[1,7]|x[1,6]> - SMULWB r3, r10,r8 ; r3 = OC_C6S2*x[0,6]>>16 - SMULWB r10,r10,r7 ; r10= OC_C6S2*x[1,6]>>16 - PKHBT r2, r2, r9, LSL #16 ; r2 = <r9|r2> - SMLAWB r3, r11,r0, r3 ; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16) - SMLAWB r10,r11,r12,r10 ; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16) - SMULWB r9, r11,r8 ; r9 = OC_C2S6*x[0,6]>>16 - PKHBT r3, r3, r10,LSL #16 ; r3 = <t[1,3]|t[0,3]> - SMULWB r12,r11,r7 ; r12= OC_C2S6*x[1,6]>>16 - ;4-7 rotation by 7pi/16 - LDRD r10,OC_C7S1_8_v6 ; r10= OC_C7S1, r11= OC_C1S7 - PKHBT r9, r9, r12,LSL #16 ; r9 = <r12|r9> - LDR r0, [r1],#16 ; r0 = <x[0,1]|x[0,0]> - PKHTB r7, r7, r8, ASR #16 ; r7 = <x[1,7]|x[0,7]> - SSUB16 r2, r2, r9 ; r2 = <t[1,2]|t[0,2]> - SMULWB r9, r10,r7 ; r9 = OC_C7S1*x[0,7]>>16 - LDR r14,[r1],#16 ; r14= <x[1,1]|x[1,0]> - SMULWT r12,r10,r7 ; r12= OC_C7S1*x[1,7]>>16 - SMULWT r8, r10,r0 ; r8 = OC_C7S1*x[0,1]>>16 - SMULWT r10,r10,r14 ; r10= OC_C7S1*x[1,1]>>16 - SMLAWT r9, r11,r0, r9 ; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16) - PKHBT r8, r8, r10,LSL #16 ; r8 = <r10|r8> - SMLAWT r12,r11,r14,r12 ; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16) - PKHBT r0, r0, r14,LSL #16 ; r0 = <x[1,0]|x[0,0]> - SMULWB r10,r11,r7 ; r10= OC_C1S7*x[0,7]>>16 - 
PKHBT r9, r9, r12,LSL #16 ; r9 = <t[1,7]|t[0,7]> - SMULWT r12,r11,r7 ; r12= OC_C1S7*x[1,7]>>16 - ;0-1 butterfly - LDR r11,OC_C4S4 - PKHBT r10,r10,r12,LSL #16 ; r10= <r12|r10> - SADD16 r7, r0, r4 ; r7 = x[0]+x[4] - SSUB16 r10,r8, r10 ; r10= <t[1,4]|t[0,4]> - SSUB16 r4, r0, r4 ; r4 = x[0]-x[4] - SMULWB r8, r11,r7 ; r8 = t[0,0]=OC_C4S4*r7B>>16 - SMULWT r12,r11,r7 ; r12= t[1,0]=OC_C4S4*r7T>>16 - SMULWB r7, r11,r4 ; r7 = t[0,1]=OC_C4S4*r4B>>16 - PKHBT r12,r8, r12,LSL #16 ; r12= <t[1,0]|t[0,0]> - SMULWT r8, r11,r4 ; r8 = t[1,1]=OC_C4S4*r4T>>16 -; Stage 2: - SADD16 r4, r10,r5 ; r4 = t[4]'=t[4]+t[5] - PKHBT r8, r7, r8, LSL #16 ; r8 = <t[1,1]|t[0,1]> - SSUB16 r5, r10,r5 ; r5 = t[4]-t[5] - SMULWB r10,r11,r5 ; r10= t[0,5]=OC_C4S4*r5B>>16 - SADD16 r7, r9, r6 ; r7 = t[7]'=t[7]+t[6] - SMULWT r5, r11,r5 ; r5 = t[1,5]=OC_C4S4*r5T>>16 - SSUB16 r6, r9, r6 ; r6 = t[7]-t[6] - SMULWB r9, r11,r6 ; r9 = t[0,6]=OC_C4S4*r6B>>16 - PKHBT r10,r10,r5, LSL #16 ; r10= <t[1,5]|t[0,5]> - SMULWT r6, r11,r6 ; r6 = t[1,6]=OC_C4S4*r6T>>16 -; Stage 3: - SADD16 r11,r8, r2 ; r11= t[1]'=t[1]+t[2] - PKHBT r6, r9, r6, LSL #16 ; r6 = <t[1,6]|t[0,6]> - SSUB16 r2, r8, r2 ; r2 = t[2]=t[1]-t[2] - LDMFD r13!,{r0,r14} - B idct4_3core_stage3_5_v6 - ENDP - -; Another copy so the LDRD offsets are less than +/- 255. 
- ALIGN 8 -OC_C7S1_8_v6 - DCD 12785 ; 31F1 -OC_C1S7_8_v6 - DCD 64277 ; FB15 -OC_C6S2_8_v6 - DCD 25080 ; 61F8 -OC_C2S6_8_v6 - DCD 60547 ; EC83 -OC_C5S3_8_v6 - DCD 36410 ; 8E3A -OC_C3S5_8_v6 - DCD 54491 ; D4DB - -idct8_8core_down_v6 PROC - STMFD r13!,{r0,r14} -; Stage 1: - ;5-6 rotation by 3pi/16 - LDRD r10,OC_C5S3_8_v6 ; r10= OC_C5S3, r11= OC_C3S5 - LDR r4, [r1,#8] ; r4 = <x[0,5]|x[0,4]> - LDR r7, [r1,#24] ; r7 = <x[1,5]|x[1,4]> - SMULWT r5, r11,r4 ; r5 = OC_C3S5*x[0,5]>>16 - LDR r0, [r1,#4] ; r0 = <x[0,3]|x[0,2]> - SMULWT r3, r11,r7 ; r3 = OC_C3S5*x[1,5]>>16 - LDR r12,[r1,#20] ; r12= <x[1,3]|x[1,2]> - SMULWT r6, r11,r0 ; r6 = OC_C3S5*x[0,3]>>16 - SMULWT r11,r11,r12 ; r11= OC_C3S5*x[1,3]>>16 - SMLAWT r6, r10,r4, r6 ; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16) - PKHBT r5, r5, r3, LSL #16 ; r5 = <r3|r5> - SMLAWT r11,r10,r7, r11 ; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16) - PKHBT r4, r4, r7, LSL #16 ; r4 = <x[1,4]|x[0,4]> - SMULWT r3, r10,r0 ; r3 = OC_C5S3*x[0,3]>>16 - PKHBT r6, r6, r11,LSL #16 ; r6 = <t[1,6]|t[0,6]> - SMULWT r8, r10,r12 ; r8 = OC_C5S3*x[1,3]>>16 - ;2-3 rotation by 6pi/16 - LDRD r10,OC_C6S2_8_v6 ; r10= OC_C6S2, r11= OC_C2S6 - PKHBT r3, r3, r8, LSL #16 ; r3 = <r8|r3> - LDR r8, [r1,#12] ; r8 = <x[0,7]|x[0,6]> - SMULWB r2, r10,r0 ; r2 = OC_C6S2*x[0,2]>>16 - SSUB16 r5, r5, r3 ; r5 = <t[1,5]|t[0,5]> - SMULWB r9, r10,r12 ; r9 = OC_C6S2*x[1,2]>>16 - LDR r7, [r1,#28] ; r7 = <x[1,7]|x[1,6]> - SMULWB r3, r10,r8 ; r3 = OC_C6S2*x[0,6]>>16 - SMULWB r10,r10,r7 ; r10= OC_C6S2*x[1,6]>>16 - PKHBT r2, r2, r9, LSL #16 ; r2 = <r9|r2> - SMLAWB r3, r11,r0, r3 ; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16) - SMLAWB r10,r11,r12,r10 ; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16) - SMULWB r9, r11,r8 ; r9 = OC_C2S6*x[0,6]>>16 - PKHBT r3, r3, r10,LSL #16 ; r3 = <t[1,3]|t[0,3]> - SMULWB r12,r11,r7 ; r12= OC_C2S6*x[1,6]>>16 - ;4-7 rotation by 7pi/16 - LDRD r10,OC_C7S1_8_v6 ; r10= OC_C7S1, r11= OC_C1S7 - PKHBT r9, r9, r12,LSL #16 ; r9 = <r12|r9> - LDR r0, [r1],#16 ; r0 = <x[0,1]|x[0,0]> - PKHTB r7, r7, r8, ASR 
#16 ; r7 = <x[1,7]|x[0,7]> - SSUB16 r2, r2, r9 ; r2 = <t[1,2]|t[0,2]> - SMULWB r9, r10,r7 ; r9 = OC_C7S1*x[0,7]>>16 - LDR r14,[r1],#16 ; r14= <x[1,1]|x[1,0]> - SMULWT r12,r10,r7 ; r12= OC_C7S1*x[1,7]>>16 - SMULWT r8, r10,r0 ; r8 = OC_C7S1*x[0,1]>>16 - SMULWT r10,r10,r14 ; r10= OC_C7S1*x[1,1]>>16 - SMLAWT r9, r11,r0, r9 ; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16) - PKHBT r8, r8, r10,LSL #16 ; r8 = <r10|r8> - SMLAWT r12,r11,r14,r12 ; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16) - PKHBT r0, r0, r14,LSL #16 ; r0 = <x[1,0]|x[0,0]> - SMULWB r10,r11,r7 ; r10= OC_C1S7*x[0,7]>>16 - PKHBT r9, r9, r12,LSL #16 ; r9 = <t[1,7]|t[0,7]> - SMULWT r12,r11,r7 ; r12= OC_C1S7*x[1,7]>>16 - ;0-1 butterfly - LDR r11,OC_C4S4 - MOV r14,#8 - PKHBT r10,r10,r12,LSL #16 ; r10= <r12|r10> - SADD16 r7, r0, r4 ; r7 = x[0]+x[4] - SSUB16 r10,r8, r10 ; r10= <t[1,4]|t[0,4]> - SMLAWB r8, r11,r7, r14 ; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8 - SSUB16 r4, r0, r4 ; r4 = x[0]-x[4] - SMLAWT r12,r11,r7, r14 ; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8 - SMLAWB r7, r11,r4, r14 ; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8 - PKHBT r12,r8, r12,LSL #16 ; r12= <t[1,0]+8|t[0,0]+8> - SMLAWT r8, r11,r4, r14 ; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8 -; Stage 2: - SADD16 r4, r10,r5 ; r4 = t[4]'=t[4]+t[5] - PKHBT r8, r7, r8, LSL #16 ; r8 = <t[1,1]+8|t[0,1]+8> - SSUB16 r5, r10,r5 ; r5 = t[4]-t[5] - SMULWB r10,r11,r5 ; r10= t[0,5]=OC_C4S4*r5B>>16 - SADD16 r7, r9, r6 ; r7 = t[7]'=t[7]+t[6] - SMULWT r5, r11,r5 ; r5 = t[1,5]=OC_C4S4*r5T>>16 - SSUB16 r6, r9, r6 ; r6 = t[7]-t[6] - SMULWB r9, r11,r6 ; r9 = t[0,6]=OC_C4S4*r6B>>16 - PKHBT r10,r10,r5, LSL #16 ; r10= <t[1,5]|t[0,5]> - SMULWT r6, r11,r6 ; r6 = t[1,6]=OC_C4S4*r6T>>16 -; Stage 3: - SADD16 r11,r8, r2 ; r11= t[1]'+8=t[1]+t[2]+8 - PKHBT r6, r9, r6, LSL #16 ; r6 = <t[1,6]|t[0,6]> - SSUB16 r2, r8, r2 ; r2 = t[2]+8=t[1]-t[2]+8 - LDMFD r13!,{r0,r14} -idct8_8core_down_stage3_5_v6 - SSUB16 r5, r6, r10 ; r5 = t[5]'=t[6]-t[5] - SADD16 r6, r6, r10 ; r6 = t[6]=t[6]+t[5] - SADD16 r10,r12,r3 ; r10= t[0]'+8=t[0]+t[3]+8 - SSUB16
r3, r12,r3 ; r3 = t[3]+8=t[0]-t[3]+8 -; Stage 4: - SADD16 r12,r10,r7 ; r12= t[0]+t[7]+8 - SSUB16 r7, r10,r7 ; r7 = t[0]-t[7]+8 - MOV r10,r12,ASR #4 - MOV r12,r12,LSL #16 - PKHTB r10,r10,r12,ASR #20 ; r10= t[0]+t[7]+8>>4 - STR r10,[r0], #4 ; y[0<<3] = t[0]+t[7]+8>>4 - SADD16 r12,r11,r6 ; r12= t[1]+t[6]+8 - SSUB16 r6, r11,r6 ; r6 = t[1]-t[6]+8 - MOV r10,r12,ASR #4 - MOV r12,r12,LSL #16 - PKHTB r10,r10,r12,ASR #20 ; r10= t[1]+t[6]+8>>4 - STR r10,[r0, #12] ; y[1<<3] = t[1]+t[6]+8>>4 - SADD16 r12,r2, r5 ; r12= t[2]+t[5]+8 - SSUB16 r5, r2, r5 ; r5 = t[2]-t[5]+8 - MOV r10,r12,ASR #4 - MOV r12,r12,LSL #16 - PKHTB r10,r10,r12,ASR #20 ; r10= t[2]+t[5]+8>>4 - STR r10,[r0, #28] ; y[2<<3] = t[2]+t[5]+8>>4 - SADD16 r12,r3, r4 ; r12= t[3]+t[4]+8 - SSUB16 r4, r3, r4 ; r4 = t[3]-t[4]+8 - MOV r10,r12,ASR #4 - MOV r12,r12,LSL #16 - PKHTB r10,r10,r12,ASR #20 ; r10= t[3]+t[4]+8>>4 - STR r10,[r0, #44] ; y[3<<3] = t[3]+t[4]+8>>4 - MOV r10,r4, ASR #4 - MOV r4, r4, LSL #16 - PKHTB r10,r10,r4, ASR #20 ; r10= t[3]-t[4]+8>>4 - STR r10,[r0, #60] ; y[4<<3] = t[3]-t[4]+8>>4 - MOV r10,r5, ASR #4 - MOV r5, r5, LSL #16 - PKHTB r10,r10,r5, ASR #20 ; r10= t[2]-t[5]+8>>4 - STR r10,[r0, #76] ; y[5<<3] = t[2]-t[5]+8>>4 - MOV r10,r6, ASR #4 - MOV r6, r6, LSL #16 - PKHTB r10,r10,r6, ASR #20 ; r10= t[1]-t[6]+8>>4 - STR r10,[r0, #92] ; y[6<<3] = t[1]-t[6]+8>>4 - MOV r10,r7, ASR #4 - MOV r7, r7, LSL #16 - PKHTB r10,r10,r7, ASR #20 ; r10= t[0]-t[7]+8>>4 - STR r10,[r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4 - MOV PC,r14 - ENDP - ] - - [ OC_ARM_ASM_NEON - EXPORT oc_idct8x8_1_neon - EXPORT oc_idct8x8_neon - - ALIGN 16 -OC_IDCT_CONSTS_NEON - DCW 8 - DCW 64277 ; FB15 (C1S7) - DCW 60547 ; EC83 (C2S6) - DCW 54491 ; D4DB (C3S5) - DCW 46341 ; B505 (C4S4) - DCW 36410 ; 8E3A (C5S3) - DCW 25080 ; 61F8 (C6S2) - DCW 12785 ; 31F1 (C7S1) - -oc_idct8x8_1_neon PROC - ; r0 = ogg_int16_t *_y - ; r1 = ogg_uint16_t _dc - VDUP.S16 Q0, r1 - VMOV Q1, Q0 - VST1.64 {D0, D1, D2, D3}, [r0@128]! - VST1.64 {D0, D1, D2, D3}, [r0@128]!
- VST1.64 {D0, D1, D2, D3}, [r0@128]! - VST1.64 {D0, D1, D2, D3}, [r0@128] - MOV PC, r14 - ENDP - -oc_idct8x8_neon PROC - ; r0 = ogg_int16_t *_y - ; r1 = ogg_int16_t *_x - ; r2 = int _last_zzi - CMP r2, #10 - BLE oc_idct8x8_10_neon -oc_idct8x8_slow_neon - VPUSH {D8-D15} - MOV r2, r1 - ADR r3, OC_IDCT_CONSTS_NEON - ; Row transforms (input is pre-transposed) - VLD1.64 {D16,D17,D18,D19}, [r2@128]! - VLD1.64 {D20,D21,D22,D23}, [r2@128]! - VLD1.64 {D24,D25,D26,D27}, [r2@128]! - VSUB.S16 Q1, Q8, Q12 ; Q1 = x[0]-x[4] - VLD1.64 {D28,D29,D30,D31}, [r2@128] - VADD.S16 Q8, Q8, Q12 ; Q8 = x[0]+x[4] - VLD1.64 {D0,D1}, [r3@128] - MOV r12, r14 - BL oc_idct8x8_stage123_neon -; Stage 4 - VSUB.S16 Q15,Q8, Q7 ; Q15 = y[7]=t[0]'-t[7]' - VADD.S16 Q8, Q8, Q7 ; Q8 = y[0]=t[0]'+t[7]' - VSUB.S16 Q14,Q9, Q3 ; Q14 = y[6]=t[1]'-t[6]'' - VADD.S16 Q9, Q9, Q3 ; Q9 = y[1]=t[1]'+t[6]'' - VSUB.S16 Q13,Q10,Q5 ; Q13 = y[5]=t[2]'-t[5]'' - VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]'' - VTRN.16 Q14,Q15 - VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]' - VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]' - ; 8x8 Transpose - VTRN.16 Q8, Q9 - VTRN.16 Q10,Q11 - VTRN.16 Q12,Q13 - VTRN.32 Q8, Q10 - VTRN.32 Q9, Q11 - VTRN.32 Q12,Q14 - VTRN.32 Q13,Q15 - VSWP D17,D24 - VSUB.S16 Q1, Q8, Q12 ; Q1 = x[0]-x[4] - VSWP D19,D26 - VADD.S16 Q8, Q8, Q12 ; Q8 = x[0]+x[4] - VSWP D21,D28 - VSWP D23,D30 - ; Column transforms - BL oc_idct8x8_stage123_neon - CMP r0,r1 - ; We have to put the return address back in the LR, or the branch - ; predictor will not recognize the function return and mis-predict the - ; entire call stack.
- MOV r14, r12 -; Stage 4 - VSUB.S16 Q15,Q8, Q7 ; Q15 = y[7]=t[0]'-t[7]' - VADD.S16 Q8, Q8, Q7 ; Q8 = y[0]=t[0]'+t[7]' - VSUB.S16 Q14,Q9, Q3 ; Q14 = y[6]=t[1]'-t[6]'' - VADD.S16 Q9, Q9, Q3 ; Q9 = y[1]=t[1]'+t[6]'' - VSUB.S16 Q13,Q10,Q5 ; Q13 = y[5]=t[2]'-t[5]'' - VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]'' - VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]' - VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]' - BEQ oc_idct8x8_slow_neon_noclear - VMOV.I8 Q2,#0 - VPOP {D8-D15} - VMOV.I8 Q3,#0 - VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 - VST1.64 {D4, D5, D6, D7}, [r1@128]! - VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 - VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 - VST1.64 {D4, D5, D6, D7}, [r1@128]! - VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 - VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 - VST1.64 {D4, D5, D6, D7}, [r1@128]! - VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 - VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 - VST1.64 {D4, D5, D6, D7}, [r1@128] - VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 - VSTMIA r0, {D16-D31} - MOV PC, r14 - -oc_idct8x8_slow_neon_noclear - VPOP {D8-D15} - VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 - VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 - VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 - VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 - VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 - VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 - VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 - VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 - VSTMIA r0, {D16-D31} - MOV PC, r14 - ENDP - -oc_idct8x8_stage123_neon PROC -; Stages 1 & 2 - VMULL.S16 Q4, D18,D1[3] - VMULL.S16 Q5, D19,D1[3] - VMULL.S16 Q7, D30,D1[3] - VMULL.S16 Q6, D31,D1[3] - VMULL.S16 Q2, D30,D0[1] - VMULL.S16 Q3, D31,D0[1] - VSHRN.S32 D8, Q4, #16 - VSHRN.S32 D9, Q5, #16 ; Q4 = (OC_C7S1*x[1]>>16) - VSHRN.S32 D14,Q7, #16 - VSHRN.S32 D15,Q6, #16 ; Q7 = (OC_C7S1*x[7]>>16) - VSHRN.S32 D4, Q2, #16 - VSHRN.S32 D5, Q3, #16 ; Q2 = (OC_C1S7*x[7]>>16)-x[7] - VSUB.S16 Q4, Q4, Q15 - VADD.S16 Q7, Q7, Q9 - VSUB.S16 Q4, Q4, Q2 ; Q4 = t[4] - VMULL.S16 Q2, D18,D0[1] - VMULL.S16 Q9, D19,D0[1] - VMULL.S16 Q5, 
D26,D0[3] - VMULL.S16 Q3, D27,D0[3] - VMULL.S16 Q6, D22,D0[3] - VMULL.S16 Q12,D23,D0[3] - VSHRN.S32 D4, Q2, #16 - VSHRN.S32 D5, Q9, #16 ; Q2 = (OC_C1S7*x[1]>>16)-x[1] - VSHRN.S32 D10,Q5, #16 - VSHRN.S32 D11,Q3, #16 ; Q5 = (OC_C3S5*x[5]>>16)-x[5] - VSHRN.S32 D12,Q6, #16 - VSHRN.S32 D13,Q12,#16 ; Q6 = (OC_C3S5*x[3]>>16)-x[3] - VADD.S16 Q7, Q7, Q2 ; Q7 = t[7] - VSUB.S16 Q5, Q5, Q11 - VADD.S16 Q6, Q6, Q11 - VADD.S16 Q5, Q5, Q13 - VADD.S16 Q6, Q6, Q13 - VMULL.S16 Q9, D22,D1[1] - VMULL.S16 Q11,D23,D1[1] - VMULL.S16 Q15,D26,D1[1] - VMULL.S16 Q13,D27,D1[1] - VMULL.S16 Q2, D20,D1[2] - VMULL.S16 Q12,D21,D1[2] - VSHRN.S32 D18,Q9, #16 - VSHRN.S32 D19,Q11,#16 ; Q9 = (OC_C5S3*x[3]>>16)-x[3] - VSHRN.S32 D30,Q15,#16 - VSHRN.S32 D31,Q13,#16 ; Q15= (OC_C5S3*x[5]>>16)-x[5] - VSHRN.S32 D4, Q2, #16 - VSHRN.S32 D5, Q12,#16 ; Q2 = (OC_C6S2*x[2]>>16) - VSUB.S16 Q5, Q5, Q9 ; Q5 = t[5] - VADD.S16 Q6, Q6, Q15 ; Q6 = t[6] - VSUB.S16 Q2, Q2, Q14 - VMULL.S16 Q3, D28,D1[2] - VMULL.S16 Q11,D29,D1[2] - VMULL.S16 Q12,D28,D0[2] - VMULL.S16 Q9, D29,D0[2] - VMULL.S16 Q13,D20,D0[2] - VMULL.S16 Q15,D21,D0[2] - VSHRN.S32 D6, Q3, #16 - VSHRN.S32 D7, Q11,#16 ; Q3 = (OC_C6S2*x[6]>>16) - VSHRN.S32 D24,Q12,#16 - VSHRN.S32 D25,Q9, #16 ; Q12= (OC_C2S6*x[6]>>16)-x[6] - VSHRN.S32 D26,Q13,#16 - VSHRN.S32 D27,Q15,#16 ; Q13= (OC_C2S6*x[2]>>16)-x[2] - VSUB.S16 Q9, Q4, Q5 ; Q9 = t[4]-t[5] - VSUB.S16 Q11,Q7, Q6 ; Q11= t[7]-t[6] - VADD.S16 Q3, Q3, Q10 - VADD.S16 Q4, Q4, Q5 ; Q4 = t[4]'=t[4]+t[5] - VADD.S16 Q7, Q7, Q6 ; Q7 = t[7]'=t[7]+t[6] - VSUB.S16 Q2, Q2, Q12 ; Q2 = t[2] - VADD.S16 Q3, Q3, Q13 ; Q3 = t[3] - VMULL.S16 Q12,D16,D1[0] - VMULL.S16 Q13,D17,D1[0] - VMULL.S16 Q14,D2, D1[0] - VMULL.S16 Q15,D3, D1[0] - VMULL.S16 Q5, D18,D1[0] - VMULL.S16 Q6, D22,D1[0] - VSHRN.S32 D24,Q12,#16 - VSHRN.S32 D25,Q13,#16 - VSHRN.S32 D28,Q14,#16 - VSHRN.S32 D29,Q15,#16 - VMULL.S16 Q13,D19,D1[0] - VMULL.S16 Q15,D23,D1[0] - VADD.S16 Q8, Q8, Q12 ; Q8 = t[0] - VADD.S16 Q1, Q1, Q14 ; Q1 = t[1] - VSHRN.S32 D10,Q5, #16 - VSHRN.S32 D12,Q6, 
#16 - VSHRN.S32 D11,Q13,#16 - VSHRN.S32 D13,Q15,#16 - VADD.S16 Q5, Q5, Q9 ; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16 - VADD.S16 Q6, Q6, Q11 ; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16 -; Stage 3 - VSUB.S16 Q11,Q8, Q3 ; Q11 = t[3]''=t[0]-t[3] - VADD.S16 Q8, Q8, Q3 ; Q8 = t[0]''=t[0]+t[3] - VADD.S16 Q9, Q1, Q2 ; Q9 = t[1]''=t[1]+t[2] - VADD.S16 Q3, Q6, Q5 ; Q3 = t[6]''=t[6]'+t[5]' - VSUB.S16 Q10,Q1, Q2 ; Q10 = t[2]''=t[1]-t[2] - VSUB.S16 Q5, Q6, Q5 ; Q5 = t[5]''=t[6]'-t[5]' - MOV PC, r14 - ENDP - -oc_idct8x8_10_neon PROC - ADR r3, OC_IDCT_CONSTS_NEON - VLD1.64 {D0,D1}, [r3@128] - MOV r2, r1 - ; Row transforms (input is pre-transposed) -; Stage 1 - VLD1.64 {D16,D17,D18,D19},[r2@128]! - MOV r12, #16 - VMULL.S16 Q15,D16,D1[0] ; Q15= OC_C4S4*x[0]-(x[0]<<16) - VLD1.64 {D17}, [r2@64], r12 - VMULL.S16 Q2, D18,D0[1] ; Q2 = OC_C1S7*x[1]-(x[1]<<16) - VLD1.64 {D19}, [r2@64] - VMULL.S16 Q14,D17,D0[2] ; Q14= OC_C2S6*x[2]-(x[2]<<16) - VMULL.S16 Q3, D19,D0[3] ; Q3 = OC_C3S5*x[3]-(x[3]<<16) - VMULL.S16 Q13,D19,D1[1] ; Q13= OC_C5S3*x[3]-(x[3]<<16) - VMULL.S16 Q12,D18,D1[3] ; Q12= OC_C7S1*x[1] - VMULL.S16 Q1, D17,D1[2] ; Q1 = OC_C6S2*x[2] - VSHRN.S32 D30,Q15,#16 ; D30= t[0]-x[0] - VSHRN.S32 D4, Q2, #16 ; D4 = t[7]-x[1] - VSHRN.S32 D31,Q14,#16 ; D31= t[3]-x[2] - VSHRN.S32 D6, Q3, #16 ; D6 = t[6]-x[3] - VSHRN.S32 D7, Q13,#16 ; D7 = -t[5]-x[3] - VSHRN.S32 D5, Q12,#16 ; D5 = t[4] - VSHRN.S32 D2, Q1, #16 ; D2 = t[2] - VADD.S16 D4, D4, D18 ; D4 = t[7] - VADD.S16 D6, D6, D19 ; D6 = t[6] - VADD.S16 D7, D7, D19 ; D7 = -t[5] - VADD.S16 Q15,Q15,Q8 ; D30= t[0] - ; D31= t[3] -; Stages 2 & 3 - VSUB.S16 Q12,Q2, Q3 ; D24= t[7]-t[6] - ; D25= t[4]'=t[4]+t[5] - VADD.S16 Q13,Q2, Q3 ; D26= t[7]'=t[7]+t[6] - ; D27= t[4]-t[5] - VMULL.S16 Q11,D24,D1[0] ; Q11= OC_C4S4*(t[7]-t[6]) - ; -(t[7]-t[6]<<16) - VMULL.S16 Q14,D27,D1[0] ; Q14= OC_C4S4*(t[4]-t[5]) - ; -(t[4]-t[5]<<16) - VADD.S16 D16,D30,D31 ; D16= t[0]'=t[0]+t[3] - VSUB.S16 D17,D30,D2 ; D17= t[2]'=t[0]-t[2] - VADD.S16 D18,D30,D2 ; D18= t[1]'=t[0]+t[2] - VSHRN.S32 
D22,Q11,#16 ; D22= (OC_C4S4*(t[7]-t[6])>>16) - ; -(t[7]-t[6]) - VSHRN.S32 D23,Q14,#16 ; D23= (OC_C4S4*(t[4]-t[5])>>16) - ; -(t[4]-t[5]) - VSUB.S16 D19,D30,D31 ; D19= t[3]'=t[0]-t[3] - VADD.S16 D22,D22,D24 ; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16 - VADD.S16 D23,D23,D27 ; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16 - VSUB.S16 D27,D22,D23 ; D27= t[5]''=t[6]'-t[5]' - VADD.S16 D24,D22,D23 ; D24= t[6]''=t[6]'+t[5]' -; Stage 4 - VSUB.S16 Q11,Q8, Q13 ; D22= y[7]=t[0]'-t[7]' - ; D23= y[5]=t[2]'-t[5]'' - VSUB.S16 Q10,Q9, Q12 ; D20= y[6]=t[1]'-t[6]' - ; D21= y[4]=t[3]'-t[4]'' - VADD.S16 Q8, Q8, Q13 ; D16= y[0]=t[0]'+t[7]' - ; D17= y[2]=t[2]'+t[5]'' - VADD.S16 Q9, Q9, Q12 ; D18= y[1]=t[1]'+t[6]' - ; D19= y[3]=t[3]'+t[4]'' - ; 8x4 transpose - VTRN.16 Q10,Q11 ; Q10= c5c4a5a4 c7c6a7a6 - ; Q11= d5d4b5b4 d7d6b7b6 - VTRN.16 Q8, Q9 ; Q8 = c3c2a3a2 c1c0a1a0 - ; Q9 = d3d2b3b2 d1d0b1b0 - VSWP D20,D21 ; Q10= c7c6a7a6 c5c4a5a4 - VSWP D22,D23 ; Q11= d7d6b7b6 d5d4b5b4 - VUZP.32 Q9, Q11 ; Q9 = b7b6b5b4 b3b2b1b0 - ; Q11= d7d6d5d4 d3d2d1d0 - VMULL.S16 Q15,D18,D0[1] - VMULL.S16 Q13,D22,D1[1] - VUZP.32 Q8, Q10 ; Q8 = a7a6a5a4 a3a2a1a0 - ; Q10= c7c6c5c4 c3c2c1c0 - ; Column transforms -; Stages 1, 2, & 3 - VMULL.S16 Q14,D19,D0[1] ; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16) - VMULL.S16 Q12,D23,D1[1] ; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16) - VMULL.S16 Q3, D22,D0[3] - VMULL.S16 Q2, D23,D0[3] ; Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16) - VSHRN.S32 D30,Q15,#16 - VSHRN.S32 D31,Q14,#16 ; Q15= (OC_C1S7*x[1]>>16)-x[1] - VSHRN.S32 D26,Q13,#16 - VSHRN.S32 D27,Q12,#16 ; Q13= (OC_C5S3*x[3]>>16)-x[3] - VSHRN.S32 D28,Q3, #16 - VSHRN.S32 D29,Q2, #16 ; Q14= (OC_C3S5*x[3]>>16)-x[3] - VADD.S16 Q15,Q15,Q9 ; Q15= t[7] - VADD.S16 Q13,Q13,Q11 ; Q13= -t[5] - VADD.S16 Q14,Q14,Q11 ; Q14= t[6] - VMULL.S16 Q12,D18,D1[3] - VMULL.S16 Q2, D19,D1[3] ; Q2:Q12= OC_C7S1*x[1] - VMULL.S16 Q1, D16,D1[0] - VMULL.S16 Q11,D17,D1[0] ; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16) - VMULL.S16 Q3, D20,D0[2] - VMULL.S16 Q9, D21,D0[2] ; Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16) - VSHRN.S32 D24,Q12,#16
- VSHRN.S32 D25,Q2, #16 ; Q12= t[4] - VMULL.S16 Q2, D20,D1[2] - VSHRN.S32 D2, Q1, #16 - VSHRN.S32 D3, Q11,#16 ; Q1 = (OC_C4S4*x[0]>>16)-x[0] - VMULL.S16 Q11,D21,D1[2] ; Q2:Q11= OC_C6S2*x[2] - VSHRN.S32 D6, Q3, #16 - VSHRN.S32 D7, Q9, #16 ; Q3 = (OC_C2S6*x[2]>>16)-x[2] - VSUB.S16 Q9, Q15,Q14 ; Q9 = t[7]-t[6] - VADD.S16 Q15,Q15,Q14 ; Q15= t[7]'=t[7]+t[6] - VSHRN.S32 D4, Q2, #16 - VSHRN.S32 D5, Q11,#16 ; Q2 = t[2] - VADD.S16 Q1, Q1, Q8 ; Q1 = t[0] - VADD.S16 Q8, Q12,Q13 ; Q8 = t[4]-t[5] - VADD.S16 Q3, Q3, Q10 ; Q3 = t[3] - VMULL.S16 Q10,D16,D1[0] - VMULL.S16 Q11,D17,D1[0] ; Q11:Q10= OC_C4S4*(t[4]-t[5]) - ; -(t[4]-t[5]<<16) - VSUB.S16 Q12,Q12,Q13 ; Q12= t[4]'=t[4]+t[5] - VMULL.S16 Q14,D18,D1[0] - VMULL.S16 Q13,D19,D1[0] ; Q13:Q14= OC_C4S4*(t[6]-t[7]) - ; -(t[6]-t[7]<<16) - VSHRN.S32 D20,Q10,#16 - VSHRN.S32 D21,Q11,#16 ; Q10= (OC_C4S4*(t[4]-t[5])>>16) - ; -(t[4]-t[5]) - VADD.S16 Q11,Q1, Q3 ; Q11= t[0]'=t[0]+t[3] - VSUB.S16 Q3, Q1, Q3 ; Q3 = t[3]'=t[0]-t[3] - VSHRN.S32 D28,Q14,#16 - VSHRN.S32 D29,Q13,#16 ; Q14= (OC_C4S4*(t[7]-t[6])>>16) - ; -(t[7]-t[6]) - VADD.S16 Q10,Q10,Q8 ; Q10=t[5]' - VADD.S16 Q14,Q14,Q9 ; Q14=t[6]' - VSUB.S16 Q13,Q14,Q10 ; Q13=t[5]''=t[6]'-t[5]' - VADD.S16 Q14,Q14,Q10 ; Q14=t[6]''=t[6]'+t[5]' - VADD.S16 Q10,Q1, Q2 ; Q10= t[1]'=t[0]+t[2] - VSUB.S16 Q2, Q1, Q2 ; Q2 = t[2]'=t[0]-t[2] -; Stage 4 - CMP r0, r1 - VADD.S16 Q8, Q11,Q15 ; Q8 = y[0]=t[0]'+t[7]' - VADD.S16 Q9, Q10,Q14 ; Q9 = y[1]=t[1]'+t[6]'' - VSUB.S16 Q15,Q11,Q15 ; Q15 = y[7]=t[0]'-t[7]' - VSUB.S16 Q14,Q10,Q14 ; Q14 = y[6]=t[1]'-t[6]'' - VADD.S16 Q10,Q2, Q13 ; Q10 = y[2]=t[2]'+t[5]'' - VADD.S16 Q11,Q3, Q12 ; Q11 = y[3]=t[3]'+t[4]' - VSUB.S16 Q12,Q3, Q12 ; Q12 = y[4]=t[3]'-t[4]' - VSUB.S16 Q13,Q2, Q13 ; Q13 = y[5]=t[2]'-t[5]'' - BEQ oc_idct8x8_10_neon_noclear - VMOV.I8 D2, #0 - VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 - VST1.64 {D2}, [r1@64], r12 - VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 - VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 - VST1.64 {D2}, [r1@64], r12 - VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 - 
VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 - VST1.64 {D2}, [r1@64], r12 - VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 - VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 - VST1.64 {D2}, [r1@64] - VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 - VSTMIA r0, {D16-D31} - MOV PC, r14 - -oc_idct8x8_10_neon_noclear - VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 - VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 - VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 - VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 - VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 - VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 - VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 - VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 - VSTMIA r0, {D16-D31} - MOV PC, r14 - ENDP - ] - - END diff --git a/media/libtheora/lib/arm/armint.h b/media/libtheora/lib/arm/armint.h deleted file mode 100644 index cc62d2438..000000000 --- a/media/libtheora/lib/arm/armint.h +++ /dev/null @@ -1,126 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $ - - ********************************************************************/ -#if !defined(_arm_armint_H) -# define _arm_armint_H (1) -# include "../internal.h" - -# if defined(OC_ARM_ASM) - -# if defined(__ARMEB__) -# error "Big-endian configurations are not supported by the ARM asm. " \ - "Reconfigure with --disable-asm or undefine OC_ARM_ASM." 
-# endif - -# define oc_state_accel_init oc_state_accel_init_arm -/*This function is implemented entirely in asm, so it's helpful to pull out all - of the things that depend on structure offsets. - We reuse the function pointer with the wrong prototype, though.*/ -# define oc_state_loop_filter_frag_rows(_state,_bv,_refi,_pli, \ - _fragy0,_fragy_end) \ - ((oc_loop_filter_frag_rows_arm_func) \ - (_state)->opt_vtable.state_loop_filter_frag_rows)( \ - (_state)->ref_frame_data[(_refi)],(_state)->ref_ystride[(_pli)], \ - (_bv), \ - (_state)->frags, \ - (_state)->fplanes[(_pli)].froffset \ - +(_fragy0)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \ - (_state)->fplanes[(_pli)].froffset \ - +(_fragy_end)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \ - (_state)->fplanes[(_pli)].froffset, \ - (_state)->fplanes[(_pli)].froffset+(_state)->fplanes[(_pli)].nfrags, \ - (_state)->frag_buf_offs, \ - (_state)->fplanes[(_pli)].nhfrags) -/*For everything else the default vtable macros are fine.*/ -# define OC_STATE_USE_VTABLE (1) -# endif - -# include "../state.h" -# include "armcpu.h" - -# if defined(OC_ARM_ASM) -typedef void (*oc_loop_filter_frag_rows_arm_func)( - unsigned char *_ref_frame_data,int _ystride,signed char _bv[256], - const oc_fragment *_frags,ptrdiff_t _fragi0,ptrdiff_t _fragi0_end, - ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot, - const ptrdiff_t *_frag_buf_offs,int _nhfrags); - -void oc_state_accel_init_arm(oc_theora_state *_state); -void oc_frag_copy_list_arm(unsigned char *_dst_frame, - const unsigned char *_src_frame,int _ystride, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs); -void oc_frag_recon_intra_arm(unsigned char *_dst,int _ystride, - const ogg_int16_t *_residue); -void oc_frag_recon_inter_arm(unsigned char *_dst,const unsigned char *_src, - int _ystride,const ogg_int16_t *_residue); -void oc_frag_recon_inter2_arm(unsigned char *_dst,const unsigned char *_src1, - const unsigned char *_src2,int _ystride,const ogg_int16_t 
*_residue); -void oc_idct8x8_1_arm(ogg_int16_t _y[64],ogg_uint16_t _dc); -void oc_idct8x8_arm(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); -void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); -void oc_loop_filter_frag_rows_arm(unsigned char *_ref_frame_data, - int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0, - ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot, - const ptrdiff_t *_frag_buf_offs,int _nhfrags); - -# if defined(OC_ARM_ASM_EDSP) -void oc_frag_copy_list_edsp(unsigned char *_dst_frame, - const unsigned char *_src_frame,int _ystride, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs); - -# if defined(OC_ARM_ASM_MEDIA) -void oc_frag_recon_intra_v6(unsigned char *_dst,int _ystride, - const ogg_int16_t *_residue); -void oc_frag_recon_inter_v6(unsigned char *_dst,const unsigned char *_src, - int _ystride,const ogg_int16_t *_residue); -void oc_frag_recon_inter2_v6(unsigned char *_dst,const unsigned char *_src1, - const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue); -void oc_idct8x8_1_v6(ogg_int16_t _y[64],ogg_uint16_t _dc); -void oc_idct8x8_v6(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); -void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); -void oc_loop_filter_init_v6(signed char *_bv,int _flimit); -void oc_loop_filter_frag_rows_v6(unsigned char *_ref_frame_data, - int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0, - ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot, - const ptrdiff_t *_frag_buf_offs,int _nhfrags); - -# if defined(OC_ARM_ASM_NEON) -void oc_frag_copy_list_neon(unsigned char *_dst_frame, - const unsigned char *_src_frame,int _ystride, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t 
*_frag_buf_offs); -void oc_frag_recon_intra_neon(unsigned char *_dst,int _ystride, - const ogg_int16_t *_residue); -void oc_frag_recon_inter_neon(unsigned char *_dst,const unsigned char *_src, - int _ystride,const ogg_int16_t *_residue); -void oc_frag_recon_inter2_neon(unsigned char *_dst,const unsigned char *_src1, - const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue); -void oc_idct8x8_1_neon(ogg_int16_t _y[64],ogg_uint16_t _dc); -void oc_idct8x8_neon(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); -void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); -void oc_loop_filter_init_neon(signed char *_bv,int _flimit); -void oc_loop_filter_frag_rows_neon(unsigned char *_ref_frame_data, - int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0, - ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot, - const ptrdiff_t *_frag_buf_offs,int _nhfrags); -# endif -# endif -# endif -# endif - -#endif diff --git a/media/libtheora/lib/arm/armloop.s b/media/libtheora/lib/arm/armloop.s deleted file mode 100644 index 0a1d4705e..000000000 --- a/media/libtheora/lib/arm/armloop.s +++ /dev/null @@ -1,682 +0,0 @@ -;******************************************************************** -;* * -;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * -;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * -;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * -;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* -;* * -;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * -;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ * -;* * -;******************************************************************** -; Original implementation: -; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd -; last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $ -;******************************************************************** - - AREA |.text|, CODE, READONLY - - ; Explicitly specifying alignment here because some versions of - ; gas don't align code correctly. See - ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html - ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992 - ALIGN - - GET armopts.s - - EXPORT oc_loop_filter_frag_rows_arm - -; Which bit this is depends on the order of packing within a bitfield. -; Hopefully that doesn't change among any of the relevant compilers. -OC_FRAG_CODED_FLAG * 1 - - ; Vanilla ARM v4 version -loop_filter_h_arm PROC - ; r0 = unsigned char *_pix - ; r1 = int _ystride - ; r2 = int *_bv - ; preserves r0-r3 - STMFD r13!,{r3-r6,r14} - MOV r14,#8 - MOV r6, #255 -lfh_arm_lp - LDRB r3, [r0, #-2] ; r3 = _pix[0] - LDRB r12,[r0, #1] ; r12= _pix[3] - LDRB r4, [r0, #-1] ; r4 = _pix[1] - LDRB r5, [r0] ; r5 = _pix[2] - SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4 - ADD r3, r3, #4 - SUB r12,r5, r4 ; r12= _pix[2]-_pix[1] - ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1]) - ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4 - MOV r12,r12,ASR #3 - LDRSB r12,[r2, r12] - ; Stall (2 on Xscale) - ADDS r4, r4, r12 - CMPGT r6, r4 - EORLT r4, r6, r4, ASR #32 - SUBS r5, r5, r12 - CMPGT r6, r5 - EORLT r5, r6, r5, ASR #32 - STRB r4, [r0, #-1] - STRB r5, [r0], r1 - SUBS r14,r14,#1 - BGT lfh_arm_lp - SUB r0, r0, r1, LSL #3 - LDMFD r13!,{r3-r6,PC} - ENDP - -loop_filter_v_arm PROC - ; r0 = unsigned char *_pix - ; r1 = int _ystride - ; r2 = int *_bv - ; preserves r0-r3 - STMFD r13!,{r3-r6,r14} - MOV r14,#8 - MOV r6, #255 
-lfv_arm_lp - LDRB r3, [r0, -r1, LSL #1] ; r3 = _pix[0] - LDRB r12,[r0, r1] ; r12= _pix[3] - LDRB r4, [r0, -r1] ; r4 = _pix[1] - LDRB r5, [r0] ; r5 = _pix[2] - SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4 - ADD r3, r3, #4 - SUB r12,r5, r4 ; r12= _pix[2]-_pix[1] - ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1]) - ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4 - MOV r12,r12,ASR #3 - LDRSB r12,[r2, r12] - ; Stall (2 on Xscale) - ADDS r4, r4, r12 - CMPGT r6, r4 - EORLT r4, r6, r4, ASR #32 - SUBS r5, r5, r12 - CMPGT r6, r5 - EORLT r5, r6, r5, ASR #32 - STRB r4, [r0, -r1] - STRB r5, [r0], #1 - SUBS r14,r14,#1 - BGT lfv_arm_lp - SUB r0, r0, #8 - LDMFD r13!,{r3-r6,PC} - ENDP - -oc_loop_filter_frag_rows_arm PROC - ; r0 = _ref_frame_data - ; r1 = _ystride - ; r2 = _bv - ; r3 = _frags - ; r4 = _fragi0 - ; r5 = _fragi0_end - ; r6 = _fragi_top - ; r7 = _fragi_bot - ; r8 = _frag_buf_offs - ; r9 = _nhfrags - MOV r12,r13 - STMFD r13!,{r0,r4-r11,r14} - LDMFD r12,{r4-r9} - ADD r2, r2, #127 ; _bv += 127 - CMP r4, r5 ; if(_fragi0>=_fragi0_end) - BGE oslffri_arm_end ; bail - SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0) - BLE oslffri_arm_end ; bail - ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi] - ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi] - SUB r7, r7, r9 ; _fragi_bot -= _nhfrags; -oslffri_arm_lp1 - MOV r10,r4 ; r10= fragi = _fragi0 - ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1 -oslffri_arm_lp2 - LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++ - LDR r0, [r13] ; r0 = _ref_frame_data - LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++ - TST r14,#OC_FRAG_CODED_FLAG - BEQ oslffri_arm_uncoded - CMP r10,r4 ; if (fragi>_fragi0) - ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi] - BLGT loop_filter_h_arm - CMP r4, r6 ; if (_fragi0>_fragi_top) - BLGT loop_filter_v_arm - CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1) - LDRLT r12,[r3] ; r12 = _frags[fragi+1] - ADD r0, r0, #8 - ADD r10,r10,#1 ; r10 = fragi+1; - ANDLT 
r12,r12,#OC_FRAG_CODED_FLAG - CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0 - BLLT loop_filter_h_arm - CMP r10,r7 ; if (fragi<_fragi_bot) - LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1] - SUB r0, r0, #8 - ADD r0, r0, r1, LSL #3 - ANDLT r12,r12,#OC_FRAG_CODED_FLAG - CMPLT r12,#OC_FRAG_CODED_FLAG - BLLT loop_filter_v_arm - CMP r10,r11 ; while(fragi<=fragi_end-1) - BLE oslffri_arm_lp2 - MOV r4, r10 ; r4 = fragi0 += _nhfrags - CMP r4, r5 - BLT oslffri_arm_lp1 -oslffri_arm_end - LDMFD r13!,{r0,r4-r11,PC} -oslffri_arm_uncoded - ADD r10,r10,#1 - CMP r10,r11 - BLE oslffri_arm_lp2 - MOV r4, r10 ; r4 = _fragi0 += _nhfrags - CMP r4, r5 - BLT oslffri_arm_lp1 - LDMFD r13!,{r0,r4-r11,PC} - ENDP - - [ OC_ARM_ASM_MEDIA - EXPORT oc_loop_filter_init_v6 - EXPORT oc_loop_filter_frag_rows_v6 - -oc_loop_filter_init_v6 PROC - ; r0 = _bv - ; r1 = _flimit (=L from the spec) - MVN r1, r1, LSL #1 ; r1 = <0xFFFFFF|255-2*L> - AND r1, r1, #255 ; r1 = ll=r1&0xFF - ORR r1, r1, r1, LSL #8 ; r1 = <ll|ll> - PKHBT r1, r1, r1, LSL #16 ; r1 = <ll|ll|ll|ll> - STR r1, [r0] - MOV PC,r14 - ENDP - -; We could use the same strategy as the v filter below, but that would require -; 40 instructions to load the data and transpose it into columns and another -; 32 to write out the results at the end, plus the 52 instructions to do the -; filtering itself. -; This is slightly less, and less code, even assuming we could have shared the -; 52 instructions in the middle with the other function. -; It executes slightly fewer instructions than the ARMv6 approach David Conrad -; proposed for FFmpeg, but not by much: -; http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html -; His is a lot less code, though, because it only does two rows at once instead -; of four. 
-loop_filter_h_v6 PROC - ; r0 = unsigned char *_pix - ; r1 = int _ystride - ; r2 = int _ll - ; preserves r0-r3 - STMFD r13!,{r4-r11,r14} - LDR r12,=0x10003 - BL loop_filter_h_core_v6 - ADD r0, r0, r1, LSL #2 - BL loop_filter_h_core_v6 - SUB r0, r0, r1, LSL #2 - LDMFD r13!,{r4-r11,PC} - ENDP - -loop_filter_h_core_v6 PROC - ; r0 = unsigned char *_pix - ; r1 = int _ystride - ; r2 = int _ll - ; r12= 0x10003 - ; Preserves r0-r3, r12; Clobbers r4-r11. - LDR r4,[r0, #-2]! ; r4 = <p3|p2|p1|p0> - ; Single issue - LDR r5,[r0, r1]! ; r5 = <q3|q2|q1|q0> - UXTB16 r6, r4, ROR #16 ; r6 = <p0|p2> - UXTB16 r4, r4, ROR #8 ; r4 = <p3|p1> - UXTB16 r7, r5, ROR #16 ; r7 = <q0|q2> - UXTB16 r5, r5, ROR #8 ; r5 = <q3|q1> - PKHBT r8, r4, r5, LSL #16 ; r8 = <__|q1|__|p1> - PKHBT r9, r6, r7, LSL #16 ; r9 = <__|q2|__|p2> - SSUB16 r6, r4, r6 ; r6 = <p3-p0|p1-p2> - SMLAD r6, r6, r12,r12 ; r6 = <????|(p3-p0)+3*(p1-p2)+3> - SSUB16 r7, r5, r7 ; r7 = <q3-q0|q1-q2> - SMLAD r7, r7, r12,r12 ; r7 = <????|(q0-q3)+3*(q2-q1)+4> - LDR r4,[r0, r1]! ; r4 = <r3|r2|r1|r0> - MOV r6, r6, ASR #3 ; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3> - LDR r5,[r0, r1]! 
; r5 = <s3|s2|s1|s0> - PKHBT r11,r6, r7, LSL #13 ; r11= <??|-R_q|??|-R_p> - UXTB16 r6, r4, ROR #16 ; r6 = <r0|r2> - UXTB16 r11,r11 ; r11= <__|-R_q|__|-R_p> - UXTB16 r4, r4, ROR #8 ; r4 = <r3|r1> - UXTB16 r7, r5, ROR #16 ; r7 = <s0|s2> - PKHBT r10,r6, r7, LSL #16 ; r10= <__|s2|__|r2> - SSUB16 r6, r4, r6 ; r6 = <r3-r0|r1-r2> - UXTB16 r5, r5, ROR #8 ; r5 = <s3|s1> - SMLAD r6, r6, r12,r12 ; r6 = <????|(r3-r0)+3*(r2-r1)+3> - SSUB16 r7, r5, r7 ; r7 = <r3-r0|r1-r2> - SMLAD r7, r7, r12,r12 ; r7 = <????|(s0-s3)+3*(s2-s1)+4> - ORR r9, r9, r10, LSL #8 ; r9 = <s2|q2|r2|p2> - MOV r6, r6, ASR #3 ; r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3> - PKHBT r10,r4, r5, LSL #16 ; r10= <__|s1|__|r1> - PKHBT r6, r6, r7, LSL #13 ; r6 = <??|-R_s|??|-R_r> - ORR r8, r8, r10, LSL #8 ; r8 = <s1|q1|r1|p1> - UXTB16 r6, r6 ; r6 = <__|-R_s|__|-R_r> - MOV r10,#0 - ORR r6, r11,r6, LSL #8 ; r6 = <-R_s|-R_q|-R_r|-R_p> - ; Single issue - ; There's no min, max or abs instruction. - ; SSUB8 and SEL will work for abs, and we can do all the rest with - ; unsigned saturated adds, which means the GE flags are still all - ; set when we're done computing lflim(abs(R_i),L). - ; This allows us to both add and subtract, and split the results by - ; the original sign of R_i. - SSUB8 r7, r10,r6 - ; Single issue - SEL r7, r7, r6 ; r7 = abs(R_i) - ; Single issue - UQADD8 r4, r7, r2 ; r4 = 255-max(2*L-abs(R_i),0) - ; Single issue - UQADD8 r7, r7, r4 - ; Single issue - UQSUB8 r7, r7, r4 ; r7 = min(abs(R_i),max(2*L-abs(R_i),0)) - ; Single issue - UQSUB8 r4, r8, r7 - UQADD8 r5, r9, r7 - UQADD8 r8, r8, r7 - UQSUB8 r9, r9, r7 - SEL r8, r8, r4 ; r8 = p1+lflim(R_i,L) - SEL r9, r9, r5 ; r9 = p2-lflim(R_i,L) - MOV r5, r9, LSR #24 ; r5 = s2 - STRB r5, [r0,#2]! - MOV r4, r8, LSR #24 ; r4 = s1 - STRB r4, [r0,#-1] - MOV r5, r9, LSR #8 ; r5 = r2 - STRB r5, [r0,-r1]! - MOV r4, r8, LSR #8 ; r4 = r1 - STRB r4, [r0,#-1] - MOV r5, r9, LSR #16 ; r5 = q2 - STRB r5, [r0,-r1]! 
- MOV r4, r8, LSR #16 ; r4 = q1 - STRB r4, [r0,#-1] - ; Single issue - STRB r9, [r0,-r1]! - ; Single issue - STRB r8, [r0,#-1] - MOV PC,r14 - ENDP - -; This uses the same strategy as the MMXEXT version for x86, except that UHADD8 -; computes (a+b>>1) instead of (a+b+1>>1) like PAVGB. -; This works just as well, with the following procedure for computing the -; filter value, f: -; u = ~UHADD8(p1,~p2); -; v = UHADD8(~p1,p2); -; m = v-u; -; a = m^UHADD8(m^p0,m^~p3); -; f = UHADD8(UHADD8(a,u1),v1); -; where f = 127+R, with R in [-127,128] defined as in the spec. -; This is exactly the same amount of arithmetic as the version that uses PAVGB -; as the basic operator. -; It executes about 2/3 the number of instructions of David Conrad's approach, -; but requires more code, because it does all eight columns at once, instead -; of four at a time. -loop_filter_v_v6 PROC - ; r0 = unsigned char *_pix - ; r1 = int _ystride - ; r2 = int _ll - ; preserves r0-r11 - STMFD r13!,{r4-r11,r14} - LDRD r6, [r0, -r1]! ; r7, r6 = <p5|p1> - LDRD r4, [r0, -r1] ; r5, r4 = <p4|p0> - LDRD r8, [r0, r1]! ; r9, r8 = <p6|p2> - MVN r14,r6 ; r14= ~p1 - LDRD r10,[r0, r1] ; r11,r10= <p7|p3> - ; Filter the first four columns. - MVN r12,r8 ; r12= ~p2 - UHADD8 r14,r14,r8 ; r14= v1=~p1+p2>>1 - UHADD8 r12,r12,r6 ; r12= p1+~p2>>1 - MVN r10, r10 ; r10=~p3 - MVN r12,r12 ; r12= u1=~p1+p2+1>>1 - SSUB8 r14,r14,r12 ; r14= m1=v1-u1 - ; Single issue - EOR r4, r4, r14 ; r4 = m1^p0 - EOR r10,r10,r14 ; r10= m1^~p3 - UHADD8 r4, r4, r10 ; r4 = (m1^p0)+(m1^~p3)>>1 - ; Single issue - EOR r4, r4, r14 ; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1) - SADD8 r14,r14,r12 ; r14= v1=m1+u1 - UHADD8 r4, r4, r12 ; r4 = a1+u1>>1 - MVN r12,r9 ; r12= ~p6 - UHADD8 r4, r4, r14 ; r4 = f1=(a1+u1>>1)+v1>>1 - ; Filter the second four columns. 
- MVN r14,r7 ; r14= ~p5 - UHADD8 r12,r12,r7 ; r12= p5+~p6>>1 - UHADD8 r14,r14,r9 ; r14= v2=~p5+p6>>1 - MVN r12,r12 ; r12= u2=~p5+p6+1>>1 - MVN r11,r11 ; r11=~p7 - SSUB8 r10,r14,r12 ; r10= m2=v2-u2 - ; Single issue - EOR r5, r5, r10 ; r5 = m2^p4 - EOR r11,r11,r10 ; r11= m2^~p7 - UHADD8 r5, r5, r11 ; r5 = (m2^p4)+(m2^~p7)>>1 - ; Single issue - EOR r5, r5, r10 ; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1) - ; Single issue - UHADD8 r5, r5, r12 ; r5 = a2+u2>>1 - LDR r12,=0x7F7F7F7F ; r12 = {127}x4 - UHADD8 r5, r5, r14 ; r5 = f2=(a2+u2>>1)+v2>>1 - ; Now split f[i] by sign. - ; There's no min or max instruction. - ; We could use SSUB8 and SEL, but this is just as many instructions and - ; dual issues more (for v7 without NEON). - UQSUB8 r10,r4, r12 ; r10= R_i>0?R_i:0 - UQSUB8 r4, r12,r4 ; r4 = R_i<0?-R_i:0 - UQADD8 r11,r10,r2 ; r11= 255-max(2*L-abs(R_i<0),0) - UQADD8 r14,r4, r2 ; r14= 255-max(2*L-abs(R_i>0),0) - UQADD8 r10,r10,r11 - UQADD8 r4, r4, r14 - UQSUB8 r10,r10,r11 ; r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0)) - UQSUB8 r4, r4, r14 ; r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0)) - UQSUB8 r11,r5, r12 ; r11= R_i>0?R_i:0 - UQADD8 r6, r6, r10 - UQSUB8 r8, r8, r10 - UQSUB8 r5, r12,r5 ; r5 = R_i<0?-R_i:0 - UQSUB8 r6, r6, r4 ; r6 = p1+lflim(R_i,L) - UQADD8 r8, r8, r4 ; r8 = p2-lflim(R_i,L) - UQADD8 r10,r11,r2 ; r10= 255-max(2*L-abs(R_i<0),0) - UQADD8 r14,r5, r2 ; r14= 255-max(2*L-abs(R_i>0),0) - UQADD8 r11,r11,r10 - UQADD8 r5, r5, r14 - UQSUB8 r11,r11,r10 ; r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0)) - UQSUB8 r5, r5, r14 ; r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0)) - UQADD8 r7, r7, r11 - UQSUB8 r9, r9, r11 - UQSUB8 r7, r7, r5 ; r7 = p5+lflim(R_i,L) - STRD r6, [r0, -r1] ; [p5:p1] = [r7: r6] - UQADD8 r9, r9, r5 ; r9 = p6-lflim(R_i,L) - STRD r8, [r0] ; [p6:p2] = [r9: r8] - LDMFD r13!,{r4-r11,PC} - ENDP - -oc_loop_filter_frag_rows_v6 PROC - ; r0 = _ref_frame_data - ; r1 = _ystride - ; r2 = _bv - ; r3 = _frags - ; r4 = _fragi0 - ; r5 = _fragi0_end - ; r6 = _fragi_top - ; r7 = _fragi_bot - ; r8 
= _frag_buf_offs - ; r9 = _nhfrags - MOV r12,r13 - STMFD r13!,{r0,r4-r11,r14} - LDMFD r12,{r4-r9} - LDR r2, [r2] ; ll = *(int *)_bv - CMP r4, r5 ; if(_fragi0>=_fragi0_end) - BGE oslffri_v6_end ; bail - SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0) - BLE oslffri_v6_end ; bail - ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi] - ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi] - SUB r7, r7, r9 ; _fragi_bot -= _nhfrags; -oslffri_v6_lp1 - MOV r10,r4 ; r10= fragi = _fragi0 - ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1 -oslffri_v6_lp2 - LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++ - LDR r0, [r13] ; r0 = _ref_frame_data - LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++ - TST r14,#OC_FRAG_CODED_FLAG - BEQ oslffri_v6_uncoded - CMP r10,r4 ; if (fragi>_fragi0) - ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi] - BLGT loop_filter_h_v6 - CMP r4, r6 ; if (fragi0>_fragi_top) - BLGT loop_filter_v_v6 - CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1) - LDRLT r12,[r3] ; r12 = _frags[fragi+1] - ADD r0, r0, #8 - ADD r10,r10,#1 ; r10 = fragi+1; - ANDLT r12,r12,#OC_FRAG_CODED_FLAG - CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0 - BLLT loop_filter_h_v6 - CMP r10,r7 ; if (fragi<_fragi_bot) - LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1] - SUB r0, r0, #8 - ADD r0, r0, r1, LSL #3 - ANDLT r12,r12,#OC_FRAG_CODED_FLAG - CMPLT r12,#OC_FRAG_CODED_FLAG - BLLT loop_filter_v_v6 - CMP r10,r11 ; while(fragi<=fragi_end-1) - BLE oslffri_v6_lp2 - MOV r4, r10 ; r4 = fragi0 += nhfrags - CMP r4, r5 - BLT oslffri_v6_lp1 -oslffri_v6_end - LDMFD r13!,{r0,r4-r11,PC} -oslffri_v6_uncoded - ADD r10,r10,#1 - CMP r10,r11 - BLE oslffri_v6_lp2 - MOV r4, r10 ; r4 = fragi0 += nhfrags - CMP r4, r5 - BLT oslffri_v6_lp1 - LDMFD r13!,{r0,r4-r11,PC} - ENDP - ] - - [ OC_ARM_ASM_NEON - EXPORT oc_loop_filter_init_neon - EXPORT oc_loop_filter_frag_rows_neon - -oc_loop_filter_init_neon PROC - ; r0 = _bv - ; r1 = _flimit (=L from the spec) - MOV r1, r1, 
LSL #1 ; r1 = 2*L - VDUP.S16 Q15, r1 ; Q15= 2L in U16s - VST1.64 {D30,D31}, [r0@128] - MOV PC,r14 - ENDP - -loop_filter_h_neon PROC - ; r0 = unsigned char *_pix - ; r1 = int _ystride - ; r2 = int *_bv - ; preserves r0-r3 - ; We assume Q15= 2*L in U16s - ; My best guesses at cycle counts (and latency)--vvv - SUB r12,r0, #2 - ; Doing a 2-element structure load saves doing two VTRN's below, at the - ; cost of using two more slower single-lane loads vs. the faster - ; all-lane loads. - ; It's less code this way, though, and benches a hair faster, but it - ; leaves D2 and D4 swapped. - VLD2.16 {D0[],D2[]}, [r12], r1 ; D0 = ____________1100 2,1 - ; D2 = ____________3322 - VLD2.16 {D4[],D6[]}, [r12], r1 ; D4 = ____________5544 2,1 - ; D6 = ____________7766 - VLD2.16 {D0[1],D2[1]},[r12], r1 ; D0 = ________99881100 3,1 - ; D2 = ________BBAA3322 - VLD2.16 {D4[1],D6[1]},[r12], r1 ; D4 = ________DDCC5544 3,1 - ; D6 = ________FFEE7766 - VLD2.16 {D0[2],D2[2]},[r12], r1 ; D0 = ____GGHH99881100 3,1 - ; D2 = ____JJIIBBAA3322 - VLD2.16 {D4[2],D6[2]},[r12], r1 ; D4 = ____KKLLDDCC5544 3,1 - ; D6 = ____NNMMFFEE7766 - VLD2.16 {D0[3],D2[3]},[r12], r1 ; D0 = PPOOGGHH99881100 3,1 - ; D2 = RRQQJJIIBBAA3322 - VLD2.16 {D4[3],D6[3]},[r12], r1 ; D4 = TTSSKKLLDDCC5544 3,1 - ; D6 = VVUUNNMMFFEE7766 - VTRN.8 D0, D4 ; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511 1,1 - VTRN.8 D2, D6 ; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733 1,1 - VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3 - VSUBL.U8 Q8, D2, D4 ; Q8 = 22 - 11 in S16s 1,3 - ADD r12,r0, #8 - VADD.S16 Q0, Q0, Q8 ; 1,3 - PLD [r12] - VADD.S16 Q0, Q0, Q8 ; 1,3 - PLD [r12,r1] - VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3 - PLD [r12,r1, LSL #1] - VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4 - ADD r12,r12,r1, LSL #2 - ; We want to do - ; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0)) - ; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0))) - ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0))) - ; = ((f >= 0) ? 
MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0))) - ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0))) - ; So we've reduced the left and right hand terms to be the same, except - ; for a negation. - ; Stall x3 - VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4 - PLD [r12,-r1] - VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3 - PLD [r12] - VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4 - PLD [r12,r1] - VMOVL.U8 Q1, D2 ; Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3 - PLD [r12,r1,LSL #1] - VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4 - ADD r12,r12,r1, LSL #2 - ; Now we need to correct for the sign of f. - ; For negative elements of Q0, we want to subtract the appropriate - ; element of Q9. For positive elements we want to add them. No NEON - ; instruction exists to do this, so we need to negate the negative - ; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b - VADD.S16 Q9, Q9, Q0 ; 1,3 - PLD [r12,-r1] - VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3 - ; Bah. No VRSBW.U8 - ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.) 
- VADDW.U8 Q2, Q9, D4 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3 - VSUB.S16 Q1, Q1, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3 - VQMOVUN.S16 D4, Q2 ; D4 = TTPPLLHHDD995511 1,1 - VQMOVUN.S16 D2, Q1 ; D2 = UUQQMMIIEEAA6622 1,1 - SUB r12,r0, #1 - VTRN.8 D4, D2 ; D4 = QQPPIIHHAA992211 D2 = MMLLEEDD6655 1,1 - VST1.16 {D4[0]}, [r12], r1 - VST1.16 {D2[0]}, [r12], r1 - VST1.16 {D4[1]}, [r12], r1 - VST1.16 {D2[1]}, [r12], r1 - VST1.16 {D4[2]}, [r12], r1 - VST1.16 {D2[2]}, [r12], r1 - VST1.16 {D4[3]}, [r12], r1 - VST1.16 {D2[3]}, [r12], r1 - MOV PC,r14 - ENDP - -loop_filter_v_neon PROC - ; r0 = unsigned char *_pix - ; r1 = int _ystride - ; r2 = int *_bv - ; preserves r0-r3 - ; We assume Q15= 2*L in U16s - ; My best guesses at cycle counts (and latency)--vvv - SUB r12,r0, r1, LSL #1 - VLD1.64 {D0}, [r12@64], r1 ; D0 = SSOOKKGGCC884400 2,1 - VLD1.64 {D2}, [r12@64], r1 ; D2 = TTPPLLHHDD995511 2,1 - VLD1.64 {D4}, [r12@64], r1 ; D4 = UUQQMMIIEEAA6622 2,1 - VLD1.64 {D6}, [r12@64] ; D6 = VVRRNNJJFFBB7733 2,1 - VSUBL.U8 Q8, D4, D2 ; Q8 = 22 - 11 in S16s 1,3 - VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3 - ADD r12, #8 - VADD.S16 Q0, Q0, Q8 ; 1,3 - PLD [r12] - VADD.S16 Q0, Q0, Q8 ; 1,3 - PLD [r12,r1] - VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3 - SUB r12, r0, r1 - VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4 - ; We want to do - ; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0)) - ; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0))) - ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0))) - ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0))) - ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0))) - ; So we've reduced the left and right hand terms to be the same, except - ; for a negation. 
- ; Stall x3 - VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4 - VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3 - ; Stall x2 - VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4 - VMOVL.U8 Q2, D4 ; Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3 - ; Stall x2 - VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4 - ; Now we need to correct for the sign of f. - ; For negative elements of Q0, we want to subtract the appropriate - ; element of Q9. For positive elements we want to add them. No NEON - ; instruction exists to do this, so we need to negate the negative - ; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b - ; Stall x3 - VADD.S16 Q9, Q9, Q0 ; 1,3 - ; Stall x2 - VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3 - ; Bah. No VRSBW.U8 - ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.) - VADDW.U8 Q1, Q9, D2 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3 - VSUB.S16 Q2, Q2, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3 - VQMOVUN.S16 D2, Q1 ; D2 = TTPPLLHHDD995511 1,1 - VQMOVUN.S16 D4, Q2 ; D4 = UUQQMMIIEEAA6622 1,1 - VST1.64 {D2}, [r12@64], r1 - VST1.64 {D4}, [r12@64], r1 - MOV PC,r14 - ENDP - -oc_loop_filter_frag_rows_neon PROC - ; r0 = _ref_frame_data - ; r1 = _ystride - ; r2 = _bv - ; r3 = _frags - ; r4 = _fragi0 - ; r5 = _fragi0_end - ; r6 = _fragi_top - ; r7 = _fragi_bot - ; r8 = _frag_buf_offs - ; r9 = _nhfrags - MOV r12,r13 - STMFD r13!,{r0,r4-r11,r14} - LDMFD r12,{r4-r9} - CMP r4, r5 ; if(_fragi0>=_fragi0_end) - BGE oslffri_neon_end; bail - SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0) - BLE oslffri_neon_end ; bail - VLD1.64 {D30,D31}, [r2@128] ; Q15= 2L in U16s - ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi] - ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi] - SUB r7, r7, r9 ; _fragi_bot -= _nhfrags; -oslffri_neon_lp1 - MOV r10,r4 ; r10= fragi = _fragi0 - ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1 -oslffri_neon_lp2 - LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++ - LDR r0, [r13] ; r0 = _ref_frame_data - LDR r12,[r8], 
#4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++ - TST r14,#OC_FRAG_CODED_FLAG - BEQ oslffri_neon_uncoded - CMP r10,r4 ; if (fragi>_fragi0) - ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi] - BLGT loop_filter_h_neon - CMP r4, r6 ; if (_fragi0>_fragi_top) - BLGT loop_filter_v_neon - CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1) - LDRLT r12,[r3] ; r12 = _frags[fragi+1] - ADD r0, r0, #8 - ADD r10,r10,#1 ; r10 = fragi+1; - ANDLT r12,r12,#OC_FRAG_CODED_FLAG - CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0 - BLLT loop_filter_h_neon - CMP r10,r7 ; if (fragi<_fragi_bot) - LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1] - SUB r0, r0, #8 - ADD r0, r0, r1, LSL #3 - ANDLT r12,r12,#OC_FRAG_CODED_FLAG - CMPLT r12,#OC_FRAG_CODED_FLAG - BLLT loop_filter_v_neon - CMP r10,r11 ; while(fragi<=fragi_end-1) - BLE oslffri_neon_lp2 - MOV r4, r10 ; r4 = _fragi0 += _nhfrags - CMP r4, r5 - BLT oslffri_neon_lp1 -oslffri_neon_end - LDMFD r13!,{r0,r4-r11,PC} -oslffri_neon_uncoded - ADD r10,r10,#1 - CMP r10,r11 - BLE oslffri_neon_lp2 - MOV r4, r10 ; r4 = _fragi0 += _nhfrags - CMP r4, r5 - BLT oslffri_neon_lp1 - LDMFD r13!,{r0,r4-r11,PC} - ENDP - ] - - END diff --git a/media/libtheora/lib/arm/armopts.s b/media/libtheora/lib/arm/armopts.s deleted file mode 100644 index e4da429e4..000000000 --- a/media/libtheora/lib/arm/armopts.s +++ /dev/null @@ -1,39 +0,0 @@ -;******************************************************************** -;* * -;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * -;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * -;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * -;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* -;* * -;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * -;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ * -;* * -;******************************************************************** -; Original implementation: -; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd -; last mod: $Id: armopts.s.in 17430 2010-09-22 21:54:09Z tterribe $ -;******************************************************************** - -; Set the following to 1 if we have EDSP instructions -; (LDRD/STRD, etc., ARMv5E and later). -OC_ARM_ASM_EDSP * 1 - -; Set the following to 1 if we have ARMv6 media instructions. -OC_ARM_ASM_MEDIA * 1 - -; Set the following to 1 if we have NEON (some ARMv7) -OC_ARM_ASM_NEON * 1 - -; Set the following to 1 if LDR/STR can work on unaligned addresses -; This is assumed to be true for ARMv6 and later code -OC_ARM_CAN_UNALIGN * 0 - -; Large unaligned loads and stores are often configured to cause an exception. -; They cause an 8 cycle stall when they cross a 128-bit (load) or 64-bit (store) -; boundary, so it's usually a bad idea to use them anyway if they can be -; avoided. - -; Set the following to 1 if LDRD/STRD can work on unaligned addresses -OC_ARM_CAN_UNALIGN_LDRD * 0 - - END diff --git a/media/libtheora/lib/arm/armstate.c b/media/libtheora/lib/arm/armstate.c deleted file mode 100644 index a56060838..000000000 --- a/media/libtheora/lib/arm/armstate.c +++ /dev/null @@ -1,219 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $ - - ********************************************************************/ -#include "armint.h" - -#if defined(OC_ARM_ASM) - -# if defined(OC_ARM_ASM_NEON) -/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into - the destination.*/ -static const unsigned char OC_FZIG_ZAG_NEON[128]={ - 0, 8, 1, 2, 9,16,24,17, - 10, 3, 4,11,18,25,32,40, - 33,26,19,12, 5, 6,13,20, - 27,34,41,48,56,49,42,35, - 28,21,14, 7,15,22,29,36, - 43,50,57,58,51,44,37,30, - 23,31,38,45,52,59,60,53, - 46,39,47,54,61,62,55,63, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64 -}; -# endif - -void oc_state_accel_init_arm(oc_theora_state *_state){ - oc_state_accel_init_c(_state); - _state->cpu_flags=oc_cpu_flags_get(); -# if defined(OC_STATE_USE_VTABLE) - _state->opt_vtable.frag_copy_list=oc_frag_copy_list_arm; - _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_arm; - _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_arm; - _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_arm; - _state->opt_vtable.idct8x8=oc_idct8x8_arm; - _state->opt_vtable.state_frag_recon=oc_state_frag_recon_arm; - /*Note: We _must_ set this function pointer, because the macro in armint.h - calls it with different arguments, so the C version will segfault.*/ - _state->opt_vtable.state_loop_filter_frag_rows= - (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_arm; -# endif -# if defined(OC_ARM_ASM_EDSP) - if(_state->cpu_flags&OC_CPU_ARM_EDSP){ -# if defined(OC_STATE_USE_VTABLE) - 
_state->opt_vtable.frag_copy_list=oc_frag_copy_list_edsp; -# endif - } -# if defined(OC_ARM_ASM_MEDIA) - if(_state->cpu_flags&OC_CPU_ARM_MEDIA){ -# if defined(OC_STATE_USE_VTABLE) - _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_v6; - _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_v6; - _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_v6; - _state->opt_vtable.idct8x8=oc_idct8x8_v6; - _state->opt_vtable.state_frag_recon=oc_state_frag_recon_v6; - _state->opt_vtable.loop_filter_init=oc_loop_filter_init_v6; - _state->opt_vtable.state_loop_filter_frag_rows= - (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_v6; -# endif - } -# if defined(OC_ARM_ASM_NEON) - if(_state->cpu_flags&OC_CPU_ARM_NEON){ -# if defined(OC_STATE_USE_VTABLE) - _state->opt_vtable.frag_copy_list=oc_frag_copy_list_neon; - _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_neon; - _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_neon; - _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_neon; - _state->opt_vtable.state_frag_recon=oc_state_frag_recon_neon; - _state->opt_vtable.loop_filter_init=oc_loop_filter_init_neon; - _state->opt_vtable.state_loop_filter_frag_rows= - (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_neon; - _state->opt_vtable.idct8x8=oc_idct8x8_neon; -# endif - _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_NEON; - } -# endif -# endif -# endif -} - -void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){ - unsigned char *dst; - ptrdiff_t frag_buf_off; - int ystride; - int refi; - /*Apply the inverse transform.*/ - /*Special case only having a DC component.*/ - if(_last_zzi<2){ - ogg_uint16_t p; - /*We round this dequant product (and not any of the others) because there's - no iDCT rounding.*/ - p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); - oc_idct8x8_1_arm(_dct_coeffs+64,p); - } - else{ - /*First, 
dequantize the DC coefficient.*/ - _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); - oc_idct8x8_arm(_dct_coeffs+64,_dct_coeffs,_last_zzi); - } - /*Fill in the target buffer.*/ - frag_buf_off=_state->frag_buf_offs[_fragi]; - refi=_state->frags[_fragi].refi; - ystride=_state->ref_ystride[_pli]; - dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off; - if(refi==OC_FRAME_SELF)oc_frag_recon_intra_arm(dst,ystride,_dct_coeffs+64); - else{ - const unsigned char *ref; - int mvoffsets[2]; - ref=_state->ref_frame_data[refi]+frag_buf_off; - if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, - _state->frag_mvs[_fragi])>1){ - oc_frag_recon_inter2_arm(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride, - _dct_coeffs+64); - } - else oc_frag_recon_inter_arm(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64); - } -} - -# if defined(OC_ARM_ASM_MEDIA) -void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){ - unsigned char *dst; - ptrdiff_t frag_buf_off; - int ystride; - int refi; - /*Apply the inverse transform.*/ - /*Special case only having a DC component.*/ - if(_last_zzi<2){ - ogg_uint16_t p; - /*We round this dequant product (and not any of the others) because there's - no iDCT rounding.*/ - p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); - oc_idct8x8_1_v6(_dct_coeffs+64,p); - } - else{ - /*First, dequantize the DC coefficient.*/ - _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); - oc_idct8x8_v6(_dct_coeffs+64,_dct_coeffs,_last_zzi); - } - /*Fill in the target buffer.*/ - frag_buf_off=_state->frag_buf_offs[_fragi]; - refi=_state->frags[_fragi].refi; - ystride=_state->ref_ystride[_pli]; - dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off; - if(refi==OC_FRAME_SELF)oc_frag_recon_intra_v6(dst,ystride,_dct_coeffs+64); - else{ - const unsigned char *ref; - int mvoffsets[2]; - ref=_state->ref_frame_data[refi]+frag_buf_off; - 
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, - _state->frag_mvs[_fragi])>1){ - oc_frag_recon_inter2_v6(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride, - _dct_coeffs+64); - } - else oc_frag_recon_inter_v6(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64); - } -} - -# if defined(OC_ARM_ASM_NEON) -void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){ - unsigned char *dst; - ptrdiff_t frag_buf_off; - int ystride; - int refi; - /*Apply the inverse transform.*/ - /*Special case only having a DC component.*/ - if(_last_zzi<2){ - ogg_uint16_t p; - /*We round this dequant product (and not any of the others) because there's - no iDCT rounding.*/ - p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); - oc_idct8x8_1_neon(_dct_coeffs+64,p); - } - else{ - /*First, dequantize the DC coefficient.*/ - _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); - oc_idct8x8_neon(_dct_coeffs+64,_dct_coeffs,_last_zzi); - } - /*Fill in the target buffer.*/ - frag_buf_off=_state->frag_buf_offs[_fragi]; - refi=_state->frags[_fragi].refi; - ystride=_state->ref_ystride[_pli]; - dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off; - if(refi==OC_FRAME_SELF)oc_frag_recon_intra_neon(dst,ystride,_dct_coeffs+64); - else{ - const unsigned char *ref; - int mvoffsets[2]; - ref=_state->ref_frame_data[refi]+frag_buf_off; - if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, - _state->frag_mvs[_fragi])>1){ - oc_frag_recon_inter2_neon(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride, - _dct_coeffs+64); - } - else oc_frag_recon_inter_neon(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64); - } -} -# endif -# endif - -#endif diff --git a/media/libtheora/lib/bitpack.c b/media/libtheora/lib/bitpack.c deleted file mode 100644 index 8bfce4c3d..000000000 --- a/media/libtheora/lib/bitpack.c +++ /dev/null @@ -1,114 +0,0 @@ -/******************************************************************** - * * - * THIS 
FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE OggTheora SOURCE CODE IS (C) COPYRIGHT 1994-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: packing variable sized words into an octet stream - last mod: $Id: bitpack.c 17410 2010-09-21 21:53:48Z tterribe $ - - ********************************************************************/ -#include <string.h> -#include <stdlib.h> -#include "bitpack.h" - -/*We're 'MSb' endian; if we write a word but read individual bits, - then we'll read the MSb first.*/ - -void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes){ - memset(_b,0,sizeof(*_b)); - _b->ptr=_buf; - _b->stop=_buf+_bytes; -} - -static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){ - const unsigned char *ptr; - const unsigned char *stop; - oc_pb_window window; - int available; - unsigned shift; - stop=_b->stop; - ptr=_b->ptr; - window=_b->window; - available=_b->bits; - shift=OC_PB_WINDOW_SIZE-available; - while(7<shift&&ptr<stop){ - shift-=8; - window|=(oc_pb_window)*ptr++<<shift; - } - _b->ptr=ptr; - available=OC_PB_WINDOW_SIZE-shift; - if(_bits>available){ - if(ptr>=stop){ - _b->eof=1; - available=OC_LOTS_OF_BITS; - } - else window|=*ptr>>(available&7); - } - _b->bits=available; - return window; -} - -int oc_pack_look1(oc_pack_buf *_b){ - oc_pb_window window; - int available; - window=_b->window; - available=_b->bits; - if(available<1)_b->window=window=oc_pack_refill(_b,1); - return window>>OC_PB_WINDOW_SIZE-1; -} - -void oc_pack_adv1(oc_pack_buf *_b){ - _b->window<<=1; - _b->bits--; -} - -/*Here we assume that 0<=_bits&&_bits<=32.*/ -long oc_pack_read_c(oc_pack_buf *_b,int _bits){ - oc_pb_window 
window; - int available; - long result; - window=_b->window; - available=_b->bits; - if(_bits==0)return 0; - if(available<_bits){ - window=oc_pack_refill(_b,_bits); - available=_b->bits; - } - result=window>>OC_PB_WINDOW_SIZE-_bits; - available-=_bits; - window<<=1; - window<<=_bits-1; - _b->window=window; - _b->bits=available; - return result; -} - -int oc_pack_read1_c(oc_pack_buf *_b){ - oc_pb_window window; - int available; - int result; - window=_b->window; - available=_b->bits; - if(available<1){ - window=oc_pack_refill(_b,1); - available=_b->bits; - } - result=window>>OC_PB_WINDOW_SIZE-1; - available--; - window<<=1; - _b->window=window; - _b->bits=available; - return result; -} - -long oc_pack_bytes_left(oc_pack_buf *_b){ - if(_b->eof)return -1; - return _b->stop-_b->ptr+(_b->bits>>3); -} diff --git a/media/libtheora/lib/bitpack.h b/media/libtheora/lib/bitpack.h deleted file mode 100644 index 237b58405..000000000 --- a/media/libtheora/lib/bitpack.h +++ /dev/null @@ -1,76 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE OggTheora SOURCE CODE IS (C) COPYRIGHT 1994-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: packing variable sized words into an octet stream - last mod: $Id: bitwise.c 7675 2004-09-01 00:34:39Z xiphmont $ - - ********************************************************************/ -#if !defined(_bitpack_H) -# define _bitpack_H (1) -# include <stddef.h> -# include <limits.h> -# include "internal.h" - - - -typedef size_t oc_pb_window; -typedef struct oc_pack_buf oc_pack_buf; - - - -/*Custom bitpacker implementations.*/ -# if defined(OC_ARM_ASM) -# include "arm/armbits.h" -# endif - -# if !defined(oc_pack_read) -# define oc_pack_read oc_pack_read_c -# endif -# if !defined(oc_pack_read1) -# define oc_pack_read1 oc_pack_read1_c -# endif -# if !defined(oc_huff_token_decode) -# define oc_huff_token_decode oc_huff_token_decode_c -# endif - -# define OC_PB_WINDOW_SIZE ((int)sizeof(oc_pb_window)*CHAR_BIT) -/*This is meant to be a large, positive constant that can still be efficiently - loaded as an immediate (on platforms like ARM, for example). - Even relatively modest values like 100 would work fine.*/ -# define OC_LOTS_OF_BITS (0x40000000) - - - -struct oc_pack_buf{ - const unsigned char *stop; - const unsigned char *ptr; - oc_pb_window window; - int bits; - int eof; -}; - -void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes); -int oc_pack_look1(oc_pack_buf *_b); -void oc_pack_adv1(oc_pack_buf *_b); -/*Here we assume 0<=_bits&&_bits<=32.*/ -long oc_pack_read_c(oc_pack_buf *_b,int _bits); -int oc_pack_read1_c(oc_pack_buf *_b); -/* returns -1 for read beyond EOF, or the number of whole bytes available */ -long oc_pack_bytes_left(oc_pack_buf *_b); - -/*These two functions are implemented locally in huffdec.c*/ -/*Read in bits without advancing the bitptr. 
- Here we assume 0<=_bits&&_bits<=32.*/ -/*static int oc_pack_look(oc_pack_buf *_b,int _bits);*/ -/*static void oc_pack_adv(oc_pack_buf *_b,int _bits);*/ - -#endif diff --git a/media/libtheora/lib/config.h b/media/libtheora/lib/config.h deleted file mode 100644 index 49772ac7f..000000000 --- a/media/libtheora/lib/config.h +++ /dev/null @@ -1,98 +0,0 @@ -/* config.h. Generated from config.h.in by configure. */ -/* config.h.in. Generated from configure.ac by autoheader. */ - -/* libcairo is available for visual debugging output */ -/* #undef HAVE_CAIRO */ - -/* Define to 1 if you have the <dlfcn.h> header file. */ -#define HAVE_DLFCN_H 1 - -/* Define to 1 if you have the <inttypes.h> header file. */ -#define HAVE_INTTYPES_H 1 - -/* Define to 1 if you have the <machine/soundcard.h> header file. */ -/* #undef HAVE_MACHINE_SOUNDCARD_H */ - -/* Define to 1 if you have the <memory.h> header file. */ -#define HAVE_MEMORY_H 1 - -/* Define to 1 if you have the <soundcard.h> header file. */ -/* #undef HAVE_SOUNDCARD_H */ - -/* Define to 1 if you have the <stdint.h> header file. */ -#define HAVE_STDINT_H 1 - -/* Define to 1 if you have the <stdlib.h> header file. */ -#define HAVE_STDLIB_H 1 - -/* Define to 1 if you have the <strings.h> header file. */ -#define HAVE_STRINGS_H 1 - -/* Define to 1 if you have the <string.h> header file. */ -#define HAVE_STRING_H 1 - -/* Define to 1 if you have the <sys/soundcard.h> header file. */ -#define HAVE_SYS_SOUNDCARD_H 1 - -/* Define to 1 if you have the <sys/stat.h> header file. */ -#define HAVE_SYS_STAT_H 1 - -/* Define to 1 if you have the <sys/types.h> header file. */ -#define HAVE_SYS_TYPES_H 1 - -/* Define to 1 if you have the <unistd.h> header file. */ -#define HAVE_UNISTD_H 1 - -/* Define to 1 if your C compiler doesn't accept -c and -o together. 
*/ -/* #undef NO_MINUS_C_MINUS_O */ - -/* make use of arm asm optimization */ - - -/* Define if assembler supports EDSP instructions */ - - -/* Define if assembler supports ARMv6 media instructions */ - - -/* Define if compiler supports NEON instructions */ - - -/* make use of c64x+ asm optimization */ -/* #undef OC_C64X_ASM */ - -/* make use of x86_64 asm optimization */ -/* #undef OC_X86_64_ASM */ - -/* make use of x86 asm optimization */ -/* #undef OC_X86_ASM */ - -/* Name of package */ -#define PACKAGE "libtheora" - -/* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "" - -/* Define to the full name of this package. */ -#define PACKAGE_NAME "libtheora" - -/* Define to the full name and version of this package. */ -#define PACKAGE_STRING "libtheora 1.2.0alpha1+svn" - -/* Define to the one symbol short name of this package. */ -#define PACKAGE_TARNAME "libtheora" - -/* Define to the version of this package. */ -#define PACKAGE_VERSION "1.2.0alpha1+svn" - -/* Define to 1 if you have the ANSI C header files. */ -#define STDC_HEADERS 1 - -/* Define to exclude encode support from the build */ -/* #undef THEORA_DISABLE_ENCODE */ - -/* Define to exclude floating point code from the build */ -/* #undef THEORA_DISABLE_FLOAT */ - -/* Version number of package */ -#define VERSION "1.2.0alpha1+svn" diff --git a/media/libtheora/lib/dct.h b/media/libtheora/lib/dct.h deleted file mode 100644 index 24ba6f111..000000000 --- a/media/libtheora/lib/dct.h +++ /dev/null @@ -1,31 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: dct.h 16503 2009-08-22 18:14:02Z giles $ - - ********************************************************************/ - -/*Definitions shared by the forward and inverse DCT transforms.*/ -#if !defined(_dct_H) -# define _dct_H (1) - -/*cos(n*pi/16) (resp. sin(m*pi/16)) scaled by 65536.*/ -#define OC_C1S7 ((ogg_int32_t)64277) -#define OC_C2S6 ((ogg_int32_t)60547) -#define OC_C3S5 ((ogg_int32_t)54491) -#define OC_C4S4 ((ogg_int32_t)46341) -#define OC_C5S3 ((ogg_int32_t)36410) -#define OC_C6S2 ((ogg_int32_t)25080) -#define OC_C7S1 ((ogg_int32_t)12785) - -#endif diff --git a/media/libtheora/lib/decapiwrapper.c b/media/libtheora/lib/decapiwrapper.c deleted file mode 100644 index 12ea475d1..000000000 --- a/media/libtheora/lib/decapiwrapper.c +++ /dev/null @@ -1,193 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: decapiwrapper.c 13596 2007-08-23 20:05:38Z tterribe $ - - ********************************************************************/ - -#include <stdlib.h> -#include <string.h> -#include <limits.h> -#include "apiwrapper.h" -#include "decint.h" -#include "theora/theoradec.h" - -static void th_dec_api_clear(th_api_wrapper *_api){ - if(_api->setup)th_setup_free(_api->setup); - if(_api->decode)th_decode_free(_api->decode); - memset(_api,0,sizeof(*_api)); -} - -static void theora_decode_clear(theora_state *_td){ - if(_td->i!=NULL)theora_info_clear(_td->i); - memset(_td,0,sizeof(*_td)); -} - -static int theora_decode_control(theora_state *_td,int _req, - void *_buf,size_t _buf_sz){ - return th_decode_ctl(((th_api_wrapper *)_td->i->codec_setup)->decode, - _req,_buf,_buf_sz); -} - -static ogg_int64_t theora_decode_granule_frame(theora_state *_td, - ogg_int64_t _gp){ - return th_granule_frame(((th_api_wrapper *)_td->i->codec_setup)->decode,_gp); -} - -static double theora_decode_granule_time(theora_state *_td,ogg_int64_t _gp){ - return th_granule_time(((th_api_wrapper *)_td->i->codec_setup)->decode,_gp); -} - -static const oc_state_dispatch_vtable OC_DEC_DISPATCH_VTBL={ - (oc_state_clear_func)theora_decode_clear, - (oc_state_control_func)theora_decode_control, - (oc_state_granule_frame_func)theora_decode_granule_frame, - (oc_state_granule_time_func)theora_decode_granule_time, -}; - -static void th_info2theora_info(theora_info *_ci,const th_info *_info){ - _ci->version_major=_info->version_major; - _ci->version_minor=_info->version_minor; - _ci->version_subminor=_info->version_subminor; - _ci->width=_info->frame_width; - _ci->height=_info->frame_height; - _ci->frame_width=_info->pic_width; - _ci->frame_height=_info->pic_height; - 
_ci->offset_x=_info->pic_x; - _ci->offset_y=_info->pic_y; - _ci->fps_numerator=_info->fps_numerator; - _ci->fps_denominator=_info->fps_denominator; - _ci->aspect_numerator=_info->aspect_numerator; - _ci->aspect_denominator=_info->aspect_denominator; - switch(_info->colorspace){ - case TH_CS_ITU_REC_470M:_ci->colorspace=OC_CS_ITU_REC_470M;break; - case TH_CS_ITU_REC_470BG:_ci->colorspace=OC_CS_ITU_REC_470BG;break; - default:_ci->colorspace=OC_CS_UNSPECIFIED;break; - } - switch(_info->pixel_fmt){ - case TH_PF_420:_ci->pixelformat=OC_PF_420;break; - case TH_PF_422:_ci->pixelformat=OC_PF_422;break; - case TH_PF_444:_ci->pixelformat=OC_PF_444;break; - default:_ci->pixelformat=OC_PF_RSVD; - } - _ci->target_bitrate=_info->target_bitrate; - _ci->quality=_info->quality; - _ci->keyframe_frequency_force=1<<_info->keyframe_granule_shift; -} - -int theora_decode_init(theora_state *_td,theora_info *_ci){ - th_api_info *apiinfo; - th_api_wrapper *api; - th_info info; - api=(th_api_wrapper *)_ci->codec_setup; - /*Allocate our own combined API wrapper/theora_info struct. - We put them both in one malloc'd block so that when the API wrapper is - freed, the info struct goes with it. - This avoids having to figure out whether or not we need to free the info - struct in either theora_info_clear() or theora_clear().*/ - apiinfo=(th_api_info *)_ogg_calloc(1,sizeof(*apiinfo)); - if(apiinfo==NULL)return OC_FAULT; - /*Make our own copy of the info struct, since its lifetime should be - independent of the one we were passed in.*/ - *&apiinfo->info=*_ci; - /*Convert the info struct now instead of saving the the one we decoded with - theora_decode_header(), since the user might have modified values (i.e., - color space, aspect ratio, etc. can be specified from a higher level). 
- The user also might be doing something "clever" with the header packets if - they are not using an Ogg encapsulation.*/ - oc_theora_info2th_info(&info,_ci); - /*Don't bother to copy the setup info; th_decode_alloc() makes its own copy - of the stuff it needs.*/ - apiinfo->api.decode=th_decode_alloc(&info,api->setup); - if(apiinfo->api.decode==NULL){ - _ogg_free(apiinfo); - return OC_EINVAL; - } - apiinfo->api.clear=(oc_setup_clear_func)th_dec_api_clear; - _td->internal_encode=NULL; - /*Provide entry points for ABI compatibility with old decoder shared libs.*/ - _td->internal_decode=(void *)&OC_DEC_DISPATCH_VTBL; - _td->granulepos=0; - _td->i=&apiinfo->info; - _td->i->codec_setup=&apiinfo->api; - return 0; -} - -int theora_decode_header(theora_info *_ci,theora_comment *_cc,ogg_packet *_op){ - th_api_wrapper *api; - th_info info; - int ret; - api=(th_api_wrapper *)_ci->codec_setup; - /*Allocate an API wrapper struct on demand, since it will not also include a - theora_info struct like the ones that are used in a theora_state struct.*/ - if(api==NULL){ - _ci->codec_setup=_ogg_calloc(1,sizeof(*api)); - if(_ci->codec_setup==NULL)return OC_FAULT; - api=(th_api_wrapper *)_ci->codec_setup; - api->clear=(oc_setup_clear_func)th_dec_api_clear; - } - /*Convert from the theora_info struct instead of saving our own th_info - struct between calls. - The user might be doing something "clever" with the header packets if they - are not using an Ogg encapsulation, and we don't want to break this.*/ - oc_theora_info2th_info(&info,_ci); - /*We rely on the fact that theora_comment and th_comment structures are - actually identical. - Take care not to change this fact unless you change the code here as - well!*/ - ret=th_decode_headerin(&info,(th_comment *)_cc,&api->setup,_op); - /*We also rely on the fact that the error return code values are the same, - and that the implementations of these two functions return the same set of - them. 
- Note that theora_decode_header() really can return OC_NOTFORMAT, even - though it is not currently documented to do so.*/ - if(ret<0)return ret; - th_info2theora_info(_ci,&info); - return 0; -} - -int theora_decode_packetin(theora_state *_td,ogg_packet *_op){ - th_api_wrapper *api; - ogg_int64_t gp; - int ret; - if(!_td||!_td->i||!_td->i->codec_setup)return OC_FAULT; - api=(th_api_wrapper *)_td->i->codec_setup; - ret=th_decode_packetin(api->decode,_op,&gp); - if(ret<0)return OC_BADPACKET; - _td->granulepos=gp; - return 0; -} - -int theora_decode_YUVout(theora_state *_td,yuv_buffer *_yuv){ - th_api_wrapper *api; - th_dec_ctx *decode; - th_ycbcr_buffer buf; - int ret; - if(!_td||!_td->i||!_td->i->codec_setup)return OC_FAULT; - api=(th_api_wrapper *)_td->i->codec_setup; - decode=(th_dec_ctx *)api->decode; - if(!decode)return OC_FAULT; - ret=th_decode_ycbcr_out(decode,buf); - if(ret>=0){ - _yuv->y_width=buf[0].width; - _yuv->y_height=buf[0].height; - _yuv->y_stride=buf[0].stride; - _yuv->uv_width=buf[1].width; - _yuv->uv_height=buf[1].height; - _yuv->uv_stride=buf[1].stride; - _yuv->y=buf[0].data; - _yuv->u=buf[1].data; - _yuv->v=buf[2].data; - } - return ret; -} diff --git a/media/libtheora/lib/decinfo.c b/media/libtheora/lib/decinfo.c deleted file mode 100644 index 603b1f93e..000000000 --- a/media/libtheora/lib/decinfo.c +++ /dev/null @@ -1,250 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: decinfo.c 17276 2010-06-05 05:57:05Z tterribe $ - - ********************************************************************/ - -#include <stdlib.h> -#include <string.h> -#include <limits.h> -#include "decint.h" - - - -/*Unpacks a series of octets from a given byte array into the pack buffer. - No checking is done to ensure the buffer contains enough data. - _opb: The pack buffer to read the octets from. - _buf: The byte array to store the unpacked bytes in. - _len: The number of octets to unpack.*/ -static void oc_unpack_octets(oc_pack_buf *_opb,char *_buf,size_t _len){ - while(_len-->0){ - long val; - val=oc_pack_read(_opb,8); - *_buf++=(char)val; - } -} - -/*Unpacks a 32-bit integer encoded by octets in little-endian form.*/ -static long oc_unpack_length(oc_pack_buf *_opb){ - long ret[4]; - int i; - for(i=0;i<4;i++)ret[i]=oc_pack_read(_opb,8); - return ret[0]|ret[1]<<8|ret[2]<<16|ret[3]<<24; -} - -static int oc_info_unpack(oc_pack_buf *_opb,th_info *_info){ - long val; - /*Check the codec bitstream version.*/ - val=oc_pack_read(_opb,8); - _info->version_major=(unsigned char)val; - val=oc_pack_read(_opb,8); - _info->version_minor=(unsigned char)val; - val=oc_pack_read(_opb,8); - _info->version_subminor=(unsigned char)val; - /*verify we can parse this bitstream version. 
- We accept earlier minors and all subminors, by spec*/ - if(_info->version_major>TH_VERSION_MAJOR|| - _info->version_major==TH_VERSION_MAJOR&& - _info->version_minor>TH_VERSION_MINOR){ - return TH_EVERSION; - } - /*Read the encoded frame description.*/ - val=oc_pack_read(_opb,16); - _info->frame_width=(ogg_uint32_t)val<<4; - val=oc_pack_read(_opb,16); - _info->frame_height=(ogg_uint32_t)val<<4; - val=oc_pack_read(_opb,24); - _info->pic_width=(ogg_uint32_t)val; - val=oc_pack_read(_opb,24); - _info->pic_height=(ogg_uint32_t)val; - val=oc_pack_read(_opb,8); - _info->pic_x=(ogg_uint32_t)val; - val=oc_pack_read(_opb,8); - _info->pic_y=(ogg_uint32_t)val; - val=oc_pack_read(_opb,32); - _info->fps_numerator=(ogg_uint32_t)val; - val=oc_pack_read(_opb,32); - _info->fps_denominator=(ogg_uint32_t)val; - if(_info->frame_width==0||_info->frame_height==0|| - _info->pic_width+_info->pic_x>_info->frame_width|| - _info->pic_height+_info->pic_y>_info->frame_height|| - _info->fps_numerator==0||_info->fps_denominator==0){ - return TH_EBADHEADER; - } - /*Note: The sense of pic_y is inverted in what we pass back to the - application compared to how it is stored in the bitstream. 
- This is because the bitstream uses a right-handed coordinate system, while - applications expect a left-handed one.*/ - _info->pic_y=_info->frame_height-_info->pic_height-_info->pic_y; - val=oc_pack_read(_opb,24); - _info->aspect_numerator=(ogg_uint32_t)val; - val=oc_pack_read(_opb,24); - _info->aspect_denominator=(ogg_uint32_t)val; - val=oc_pack_read(_opb,8); - _info->colorspace=(th_colorspace)val; - val=oc_pack_read(_opb,24); - _info->target_bitrate=(int)val; - val=oc_pack_read(_opb,6); - _info->quality=(int)val; - val=oc_pack_read(_opb,5); - _info->keyframe_granule_shift=(int)val; - val=oc_pack_read(_opb,2); - _info->pixel_fmt=(th_pixel_fmt)val; - if(_info->pixel_fmt==TH_PF_RSVD)return TH_EBADHEADER; - val=oc_pack_read(_opb,3); - if(val!=0||oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER; - return 0; -} - -static int oc_comment_unpack(oc_pack_buf *_opb,th_comment *_tc){ - long len; - int i; - /*Read the vendor string.*/ - len=oc_unpack_length(_opb); - if(len<0||len>oc_pack_bytes_left(_opb))return TH_EBADHEADER; - _tc->vendor=_ogg_malloc((size_t)len+1); - if(_tc->vendor==NULL)return TH_EFAULT; - oc_unpack_octets(_opb,_tc->vendor,len); - _tc->vendor[len]='\0'; - /*Read the user comments.*/ - _tc->comments=(int)oc_unpack_length(_opb); - len=_tc->comments; - if(len<0||len>(LONG_MAX>>2)||len<<2>oc_pack_bytes_left(_opb)){ - _tc->comments=0; - return TH_EBADHEADER; - } - _tc->comment_lengths=(int *)_ogg_malloc( - _tc->comments*sizeof(_tc->comment_lengths[0])); - _tc->user_comments=(char **)_ogg_malloc( - _tc->comments*sizeof(_tc->user_comments[0])); - if(_tc->comment_lengths==NULL||_tc->user_comments==NULL){ - _tc->comments=0; - return TH_EFAULT; - } - for(i=0;i<_tc->comments;i++){ - len=oc_unpack_length(_opb); - if(len<0||len>oc_pack_bytes_left(_opb)){ - _tc->comments=i; - return TH_EBADHEADER; - } - _tc->comment_lengths[i]=len; - _tc->user_comments[i]=_ogg_malloc((size_t)len+1); - if(_tc->user_comments[i]==NULL){ - _tc->comments=i; - return TH_EFAULT; - } - 
oc_unpack_octets(_opb,_tc->user_comments[i],len); - _tc->user_comments[i][len]='\0'; - } - return oc_pack_bytes_left(_opb)<0?TH_EBADHEADER:0; -} - -static int oc_setup_unpack(oc_pack_buf *_opb,th_setup_info *_setup){ - int ret; - /*Read the quantizer tables.*/ - ret=oc_quant_params_unpack(_opb,&_setup->qinfo); - if(ret<0)return ret; - /*Read the Huffman trees.*/ - return oc_huff_trees_unpack(_opb,_setup->huff_tables); -} - -static void oc_setup_clear(th_setup_info *_setup){ - oc_quant_params_clear(&_setup->qinfo); - oc_huff_trees_clear(_setup->huff_tables); -} - -static int oc_dec_headerin(oc_pack_buf *_opb,th_info *_info, - th_comment *_tc,th_setup_info **_setup,ogg_packet *_op){ - char buffer[6]; - long val; - int packtype; - int ret; - val=oc_pack_read(_opb,8); - packtype=(int)val; - /*If we're at a data packet and we have received all three headers, we're - done.*/ - if(!(packtype&0x80)&&_info->frame_width>0&&_tc->vendor!=NULL&&*_setup!=NULL){ - return 0; - } - /*Check the codec string.*/ - oc_unpack_octets(_opb,buffer,6); - if(memcmp(buffer,"theora",6)!=0)return TH_ENOTFORMAT; - switch(packtype){ - /*Codec info header.*/ - case 0x80:{ - /*This should be the first packet, and we should not already be - initialized.*/ - if(!_op->b_o_s||_info->frame_width>0)return TH_EBADHEADER; - ret=oc_info_unpack(_opb,_info); - if(ret<0)th_info_clear(_info); - else ret=3; - }break; - /*Comment header.*/ - case 0x81:{ - if(_tc==NULL)return TH_EFAULT; - /*We shoud have already decoded the info header, and should not yet have - decoded the comment header.*/ - if(_info->frame_width==0||_tc->vendor!=NULL)return TH_EBADHEADER; - ret=oc_comment_unpack(_opb,_tc); - if(ret<0)th_comment_clear(_tc); - else ret=2; - }break; - /*Codec setup header.*/ - case 0x82:{ - oc_setup_info *setup; - if(_tc==NULL||_setup==NULL)return TH_EFAULT; - /*We should have already decoded the info header and the comment header, - and should not yet have decoded the setup header.*/ - 
if(_info->frame_width==0||_tc->vendor==NULL||*_setup!=NULL){ - return TH_EBADHEADER; - } - setup=(oc_setup_info *)_ogg_calloc(1,sizeof(*setup)); - if(setup==NULL)return TH_EFAULT; - ret=oc_setup_unpack(_opb,setup); - if(ret<0){ - oc_setup_clear(setup); - _ogg_free(setup); - } - else{ - *_setup=setup; - ret=1; - } - }break; - default:{ - /*We don't know what this header is.*/ - return TH_EBADHEADER; - }break; - } - return ret; -} - - -/*Decodes one header packet. - This should be called repeatedly with the packets at the beginning of the - stream until it returns 0.*/ -int th_decode_headerin(th_info *_info,th_comment *_tc, - th_setup_info **_setup,ogg_packet *_op){ - oc_pack_buf opb; - if(_op==NULL)return TH_EBADHEADER; - if(_info==NULL)return TH_EFAULT; - oc_pack_readinit(&opb,_op->packet,_op->bytes); - return oc_dec_headerin(&opb,_info,_tc,_setup,_op); -} - -void th_setup_free(th_setup_info *_setup){ - if(_setup!=NULL){ - oc_setup_clear(_setup); - _ogg_free(_setup); - } -} diff --git a/media/libtheora/lib/decint.h b/media/libtheora/lib/decint.h deleted file mode 100644 index bd6522273..000000000 --- a/media/libtheora/lib/decint.h +++ /dev/null @@ -1,186 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: decint.h 17457 2010-09-24 02:05:49Z tterribe $ - - ********************************************************************/ - -#include <limits.h> -#if !defined(_decint_H) -# define _decint_H (1) -# include "theora/theoradec.h" -# include "state.h" -# include "bitpack.h" -# include "huffdec.h" -# include "dequant.h" - -typedef struct th_setup_info oc_setup_info; -typedef struct oc_dec_opt_vtable oc_dec_opt_vtable; -typedef struct oc_dec_pipeline_state oc_dec_pipeline_state; -typedef struct th_dec_ctx oc_dec_ctx; - - - -/*Decoder-specific accelerated functions.*/ -# if defined(OC_C64X_ASM) -# include "c64x/c64xdec.h" -# endif - -# if !defined(oc_dec_accel_init) -# define oc_dec_accel_init oc_dec_accel_init_c -# endif -# if defined(OC_DEC_USE_VTABLE) -# if !defined(oc_dec_dc_unpredict_mcu_plane) -# define oc_dec_dc_unpredict_mcu_plane(_dec,_pipe,_pli) \ - ((*(_dec)->opt_vtable.dc_unpredict_mcu_plane)(_dec,_pipe,_pli)) -# endif -# else -# if !defined(oc_dec_dc_unpredict_mcu_plane) -# define oc_dec_dc_unpredict_mcu_plane oc_dec_dc_unpredict_mcu_plane_c -# endif -# endif - - - -/*Constants for the packet-in state machine specific to the decoder.*/ - -/*Next packet to read: Data packet.*/ -#define OC_PACKET_DATA (0) - - - -struct th_setup_info{ - /*The Huffman codes.*/ - ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES]; - /*The quantization parameters.*/ - th_quant_info qinfo; -}; - - - -/*Decoder specific functions with accelerated variants.*/ -struct oc_dec_opt_vtable{ - void (*dc_unpredict_mcu_plane)(oc_dec_ctx *_dec, - oc_dec_pipeline_state *_pipe,int _pli); -}; - - - -struct oc_dec_pipeline_state{ - /*Decoded DCT coefficients. 
- These are placed here instead of on the stack so that they can persist - between blocks, which makes clearing them back to zero much faster when - only a few non-zero coefficients were decoded. - It requires at least 65 elements because the zig-zag index array uses the - 65th element as a dumping ground for out-of-range indices to protect us - from buffer overflow. - We make it fully twice as large so that the second half can serve as the - reconstruction buffer, which saves passing another parameter to all the - acceleration functios. - It also solves problems with 16-byte alignment for NEON on ARM. - gcc (as of 4.2.1) only seems to be able to give stack variables 8-byte - alignment, and silently produces incorrect results if you ask for 16. - Finally, keeping it off the stack means there's less likely to be a data - hazard beween the NEON co-processor and the regular ARM core, which avoids - unnecessary stalls.*/ - OC_ALIGN16(ogg_int16_t dct_coeffs[128]); - OC_ALIGN16(signed char bounding_values[256]); - ptrdiff_t ti[3][64]; - ptrdiff_t ebi[3][64]; - ptrdiff_t eob_runs[3][64]; - const ptrdiff_t *coded_fragis[3]; - const ptrdiff_t *uncoded_fragis[3]; - ptrdiff_t ncoded_fragis[3]; - ptrdiff_t nuncoded_fragis[3]; - const ogg_uint16_t *dequant[3][3][2]; - int fragy0[3]; - int fragy_end[3]; - int pred_last[3][4]; - int mcu_nvfrags; - int loop_filter; - int pp_level; -}; - - -struct th_dec_ctx{ - /*Shared encoder/decoder state.*/ - oc_theora_state state; - /*Whether or not packets are ready to be emitted. 
- This takes on negative values while there are remaining header packets to - be emitted, reaches 0 when the codec is ready for input, and goes to 1 - when a frame has been processed and a data packet is ready.*/ - int packet_state; - /*Buffer in which to assemble packets.*/ - oc_pack_buf opb; - /*Huffman decode trees.*/ - ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES]; - /*The index of the first token in each plane for each coefficient.*/ - ptrdiff_t ti0[3][64]; - /*The number of outstanding EOB runs at the start of each coefficient in each - plane.*/ - ptrdiff_t eob_runs[3][64]; - /*The DCT token lists.*/ - unsigned char *dct_tokens; - /*The extra bits associated with DCT tokens.*/ - unsigned char *extra_bits; - /*The number of dct tokens unpacked so far.*/ - int dct_tokens_count; - /*The out-of-loop post-processing level.*/ - int pp_level; - /*The DC scale used for out-of-loop deblocking.*/ - int pp_dc_scale[64]; - /*The sharpen modifier used for out-of-loop deringing.*/ - int pp_sharp_mod[64]; - /*The DC quantization index of each block.*/ - unsigned char *dc_qis; - /*The variance of each block.*/ - int *variances; - /*The storage for the post-processed frame buffer.*/ - unsigned char *pp_frame_data; - /*Whether or not the post-processsed frame buffer has space for chroma.*/ - int pp_frame_state; - /*The buffer used for the post-processed frame. 
- Note that this is _not_ guaranteed to have the same strides and offsets as - the reference frame buffers.*/ - th_ycbcr_buffer pp_frame_buf; - /*The striped decode callback function.*/ - th_stripe_callback stripe_cb; - oc_dec_pipeline_state pipe; -# if defined(OC_DEC_USE_VTABLE) - /*Table for decoder acceleration functions.*/ - oc_dec_opt_vtable opt_vtable; -# endif -# if defined(HAVE_CAIRO) - /*Output metrics for debugging.*/ - int telemetry; - int telemetry_mbmode; - int telemetry_mv; - int telemetry_qi; - int telemetry_bits; - int telemetry_frame_bytes; - int telemetry_coding_bytes; - int telemetry_mode_bytes; - int telemetry_mv_bytes; - int telemetry_qi_bytes; - int telemetry_dc_bytes; - unsigned char *telemetry_frame_data; -# endif -}; - -/*Default pure-C implementations of decoder-specific accelerated functions.*/ -void oc_dec_accel_init_c(oc_dec_ctx *_dec); - -void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec, - oc_dec_pipeline_state *_pipe,int _pli); - -#endif diff --git a/media/libtheora/lib/decode.c b/media/libtheora/lib/decode.c deleted file mode 100644 index 563782b7a..000000000 --- a/media/libtheora/lib/decode.c +++ /dev/null @@ -1,2963 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: decode.c 17576 2010-10-29 01:07:51Z tterribe $ - - ********************************************************************/ - -#include <stdlib.h> -#include <string.h> -#include <ogg/ogg.h> -#include "decint.h" -#if defined(OC_DUMP_IMAGES) -# include <stdio.h> -# include "png.h" -#endif -#if defined(HAVE_CAIRO) -# include <cairo.h> -#endif - - -/*No post-processing.*/ -#define OC_PP_LEVEL_DISABLED (0) -/*Keep track of DC qi for each block only.*/ -#define OC_PP_LEVEL_TRACKDCQI (1) -/*Deblock the luma plane.*/ -#define OC_PP_LEVEL_DEBLOCKY (2) -/*Dering the luma plane.*/ -#define OC_PP_LEVEL_DERINGY (3) -/*Stronger luma plane deringing.*/ -#define OC_PP_LEVEL_SDERINGY (4) -/*Deblock the chroma planes.*/ -#define OC_PP_LEVEL_DEBLOCKC (5) -/*Dering the chroma planes.*/ -#define OC_PP_LEVEL_DERINGC (6) -/*Stronger chroma plane deringing.*/ -#define OC_PP_LEVEL_SDERINGC (7) -/*Maximum valid post-processing level.*/ -#define OC_PP_LEVEL_MAX (7) - - - -/*The mode alphabets for the various mode coding schemes. 
- Scheme 0 uses a custom alphabet, which is not stored in this table.*/ -static const unsigned char OC_MODE_ALPHABETS[7][OC_NMODES]={ - /*Last MV dominates */ - { - OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV_LAST2,OC_MODE_INTER_MV, - OC_MODE_INTER_NOMV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV, - OC_MODE_INTER_MV_FOUR - }, - { - OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV_LAST2,OC_MODE_INTER_NOMV, - OC_MODE_INTER_MV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV, - OC_MODE_INTER_MV_FOUR - }, - { - OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV,OC_MODE_INTER_MV_LAST2, - OC_MODE_INTER_NOMV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV, - OC_MODE_INTER_MV_FOUR - }, - { - OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV,OC_MODE_INTER_NOMV, - OC_MODE_INTER_MV_LAST2,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV, - OC_MODE_GOLDEN_MV,OC_MODE_INTER_MV_FOUR - }, - /*No MV dominates.*/ - { - OC_MODE_INTER_NOMV,OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV_LAST2, - OC_MODE_INTER_MV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV, - OC_MODE_INTER_MV_FOUR - }, - { - OC_MODE_INTER_NOMV,OC_MODE_GOLDEN_NOMV,OC_MODE_INTER_MV_LAST, - OC_MODE_INTER_MV_LAST2,OC_MODE_INTER_MV,OC_MODE_INTRA,OC_MODE_GOLDEN_MV, - OC_MODE_INTER_MV_FOUR - }, - /*Default ordering.*/ - { - OC_MODE_INTER_NOMV,OC_MODE_INTRA,OC_MODE_INTER_MV,OC_MODE_INTER_MV_LAST, - OC_MODE_INTER_MV_LAST2,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV, - OC_MODE_INTER_MV_FOUR - } -}; - - -/*The original DCT tokens are extended and reordered during the construction of - the Huffman tables. - The extension means more bits can be read with fewer calls to the bitpacker - during the Huffman decoding process (at the cost of larger Huffman tables), - and fewer tokens require additional extra bits (reducing the average storage - per decoded token). - The revised ordering reveals essential information in the token value - itself; specifically, whether or not there are additional extra bits to read - and the parameter to which those extra bits are applied. 
- The token is used to fetch a code word from the OC_DCT_CODE_WORD table below. - The extra bits are added into code word at the bit position inferred from the - token value, giving the final code word from which all required parameters - are derived. - The number of EOBs and the leading zero run length can be extracted directly. - The coefficient magnitude is optionally negated before extraction, according - to a 'flip' bit.*/ - -/*The number of additional extra bits that are decoded with each of the - internal DCT tokens.*/ -static const unsigned char OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[15]={ - 12,4,3,3,4,4,5,5,8,8,8,8,3,3,6 -}; - -/*Whether or not an internal token needs any additional extra bits.*/ -#define OC_DCT_TOKEN_NEEDS_MORE(token) \ - (token<(int)(sizeof(OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)/ \ - sizeof(*OC_INTERNAL_DCT_TOKEN_EXTRA_BITS))) - -/*This token (OC_DCT_REPEAT_RUN3_TOKEN) requires more than 8 extra bits.*/ -#define OC_DCT_TOKEN_FAT_EOB (0) - -/*The number of EOBs to use for an end-of-frame token. - Note: We want to set eobs to PTRDIFF_MAX here, but that requires C99, which - is not yet available everywhere; this should be equivalent.*/ -#define OC_DCT_EOB_FINISH (~(size_t)0>>1) - -/*The location of the (6) run length bits in the code word. - These are placed at index 0 and given 8 bits (even though 6 would suffice) - because it may be faster to extract the lower byte on some platforms.*/ -#define OC_DCT_CW_RLEN_SHIFT (0) -/*The location of the (12) EOB bits in the code word.*/ -#define OC_DCT_CW_EOB_SHIFT (8) -/*The location of the (1) flip bit in the code word. - This must be right under the magnitude bits.*/ -#define OC_DCT_CW_FLIP_BIT (20) -/*The location of the (11) token magnitude bits in the code word. 
- These must be last, and rely on a sign-extending right shift.*/ -#define OC_DCT_CW_MAG_SHIFT (21) - -/*Pack the given fields into a code word.*/ -#define OC_DCT_CW_PACK(_eobs,_rlen,_mag,_flip) \ - ((_eobs)<<OC_DCT_CW_EOB_SHIFT| \ - (_rlen)<<OC_DCT_CW_RLEN_SHIFT| \ - (_flip)<<OC_DCT_CW_FLIP_BIT| \ - (_mag)-(_flip)<<OC_DCT_CW_MAG_SHIFT) - -/*A special code word value that signals the end of the frame (a long EOB run - of zero).*/ -#define OC_DCT_CW_FINISH (0) - -/*The position at which to insert the extra bits in the code word. - We use this formulation because Intel has no useful cmov. - A real architecture would probably do better with two of those. - This translates to 11 instructions(!), and is _still_ faster than either a - table lookup (just barely) or the naive double-ternary implementation (which - gcc translates to a jump and a cmov). - This assumes OC_DCT_CW_RLEN_SHIFT is zero, but could easily be reworked if - you want to make one of the other shifts zero.*/ -#define OC_DCT_TOKEN_EB_POS(_token) \ - ((OC_DCT_CW_EOB_SHIFT-OC_DCT_CW_MAG_SHIFT&-((_token)<2)) \ - +(OC_DCT_CW_MAG_SHIFT&-((_token)<12))) - -/*The code words for each internal token. 
- See the notes at OC_DCT_TOKEN_MAP for the reasons why things are out of - order.*/ -static const ogg_int32_t OC_DCT_CODE_WORD[92]={ - /*These tokens require additional extra bits for the EOB count.*/ - /*OC_DCT_REPEAT_RUN3_TOKEN (12 extra bits)*/ - OC_DCT_CW_FINISH, - /*OC_DCT_REPEAT_RUN2_TOKEN (4 extra bits)*/ - OC_DCT_CW_PACK(16, 0, 0,0), - /*These tokens require additional extra bits for the magnitude.*/ - /*OC_DCT_VAL_CAT5 (4 extra bits-1 already read)*/ - OC_DCT_CW_PACK( 0, 0, 13,0), - OC_DCT_CW_PACK( 0, 0, 13,1), - /*OC_DCT_VAL_CAT6 (5 extra bits-1 already read)*/ - OC_DCT_CW_PACK( 0, 0, 21,0), - OC_DCT_CW_PACK( 0, 0, 21,1), - /*OC_DCT_VAL_CAT7 (6 extra bits-1 already read)*/ - OC_DCT_CW_PACK( 0, 0, 37,0), - OC_DCT_CW_PACK( 0, 0, 37,1), - /*OC_DCT_VAL_CAT8 (10 extra bits-2 already read)*/ - OC_DCT_CW_PACK( 0, 0, 69,0), - OC_DCT_CW_PACK( 0, 0,325,0), - OC_DCT_CW_PACK( 0, 0, 69,1), - OC_DCT_CW_PACK( 0, 0,325,1), - /*These tokens require additional extra bits for the run length.*/ - /*OC_DCT_RUN_CAT1C (4 extra bits-1 already read)*/ - OC_DCT_CW_PACK( 0,10, +1,0), - OC_DCT_CW_PACK( 0,10, -1,0), - /*OC_DCT_ZRL_TOKEN (6 extra bits) - Flip is set to distinguish this from OC_DCT_CW_FINISH.*/ - OC_DCT_CW_PACK( 0, 0, 0,1), - /*The remaining tokens require no additional extra bits.*/ - /*OC_DCT_EOB1_TOKEN (0 extra bits)*/ - OC_DCT_CW_PACK( 1, 0, 0,0), - /*OC_DCT_EOB2_TOKEN (0 extra bits)*/ - OC_DCT_CW_PACK( 2, 0, 0,0), - /*OC_DCT_EOB3_TOKEN (0 extra bits)*/ - OC_DCT_CW_PACK( 3, 0, 0,0), - /*OC_DCT_RUN_CAT1A (1 extra bit-1 already read)x5*/ - OC_DCT_CW_PACK( 0, 1, +1,0), - OC_DCT_CW_PACK( 0, 1, -1,0), - OC_DCT_CW_PACK( 0, 2, +1,0), - OC_DCT_CW_PACK( 0, 2, -1,0), - OC_DCT_CW_PACK( 0, 3, +1,0), - OC_DCT_CW_PACK( 0, 3, -1,0), - OC_DCT_CW_PACK( 0, 4, +1,0), - OC_DCT_CW_PACK( 0, 4, -1,0), - OC_DCT_CW_PACK( 0, 5, +1,0), - OC_DCT_CW_PACK( 0, 5, -1,0), - /*OC_DCT_RUN_CAT2A (2 extra bits-2 already read)*/ - OC_DCT_CW_PACK( 0, 1, +2,0), - OC_DCT_CW_PACK( 0, 1, +3,0), - 
OC_DCT_CW_PACK( 0, 1, -2,0), - OC_DCT_CW_PACK( 0, 1, -3,0), - /*OC_DCT_RUN_CAT1B (3 extra bits-3 already read)*/ - OC_DCT_CW_PACK( 0, 6, +1,0), - OC_DCT_CW_PACK( 0, 7, +1,0), - OC_DCT_CW_PACK( 0, 8, +1,0), - OC_DCT_CW_PACK( 0, 9, +1,0), - OC_DCT_CW_PACK( 0, 6, -1,0), - OC_DCT_CW_PACK( 0, 7, -1,0), - OC_DCT_CW_PACK( 0, 8, -1,0), - OC_DCT_CW_PACK( 0, 9, -1,0), - /*OC_DCT_RUN_CAT2B (3 extra bits-3 already read)*/ - OC_DCT_CW_PACK( 0, 2, +2,0), - OC_DCT_CW_PACK( 0, 3, +2,0), - OC_DCT_CW_PACK( 0, 2, +3,0), - OC_DCT_CW_PACK( 0, 3, +3,0), - OC_DCT_CW_PACK( 0, 2, -2,0), - OC_DCT_CW_PACK( 0, 3, -2,0), - OC_DCT_CW_PACK( 0, 2, -3,0), - OC_DCT_CW_PACK( 0, 3, -3,0), - /*OC_DCT_SHORT_ZRL_TOKEN (3 extra bits-3 already read) - Flip is set on the first one to distinguish it from OC_DCT_CW_FINISH.*/ - OC_DCT_CW_PACK( 0, 0, 0,1), - OC_DCT_CW_PACK( 0, 1, 0,0), - OC_DCT_CW_PACK( 0, 2, 0,0), - OC_DCT_CW_PACK( 0, 3, 0,0), - OC_DCT_CW_PACK( 0, 4, 0,0), - OC_DCT_CW_PACK( 0, 5, 0,0), - OC_DCT_CW_PACK( 0, 6, 0,0), - OC_DCT_CW_PACK( 0, 7, 0,0), - /*OC_ONE_TOKEN (0 extra bits)*/ - OC_DCT_CW_PACK( 0, 0, +1,0), - /*OC_MINUS_ONE_TOKEN (0 extra bits)*/ - OC_DCT_CW_PACK( 0, 0, -1,0), - /*OC_TWO_TOKEN (0 extra bits)*/ - OC_DCT_CW_PACK( 0, 0, +2,0), - /*OC_MINUS_TWO_TOKEN (0 extra bits)*/ - OC_DCT_CW_PACK( 0, 0, -2,0), - /*OC_DCT_VAL_CAT2 (1 extra bit-1 already read)x4*/ - OC_DCT_CW_PACK( 0, 0, +3,0), - OC_DCT_CW_PACK( 0, 0, -3,0), - OC_DCT_CW_PACK( 0, 0, +4,0), - OC_DCT_CW_PACK( 0, 0, -4,0), - OC_DCT_CW_PACK( 0, 0, +5,0), - OC_DCT_CW_PACK( 0, 0, -5,0), - OC_DCT_CW_PACK( 0, 0, +6,0), - OC_DCT_CW_PACK( 0, 0, -6,0), - /*OC_DCT_VAL_CAT3 (2 extra bits-2 already read)*/ - OC_DCT_CW_PACK( 0, 0, +7,0), - OC_DCT_CW_PACK( 0, 0, +8,0), - OC_DCT_CW_PACK( 0, 0, -7,0), - OC_DCT_CW_PACK( 0, 0, -8,0), - /*OC_DCT_VAL_CAT4 (3 extra bits-3 already read)*/ - OC_DCT_CW_PACK( 0, 0, +9,0), - OC_DCT_CW_PACK( 0, 0,+10,0), - OC_DCT_CW_PACK( 0, 0,+11,0), - OC_DCT_CW_PACK( 0, 0,+12,0), - OC_DCT_CW_PACK( 0, 0, -9,0), - 
OC_DCT_CW_PACK( 0, 0,-10,0), - OC_DCT_CW_PACK( 0, 0,-11,0), - OC_DCT_CW_PACK( 0, 0,-12,0), - /*OC_DCT_REPEAT_RUN1_TOKEN (3 extra bits-3 already read)*/ - OC_DCT_CW_PACK( 8, 0, 0,0), - OC_DCT_CW_PACK( 9, 0, 0,0), - OC_DCT_CW_PACK(10, 0, 0,0), - OC_DCT_CW_PACK(11, 0, 0,0), - OC_DCT_CW_PACK(12, 0, 0,0), - OC_DCT_CW_PACK(13, 0, 0,0), - OC_DCT_CW_PACK(14, 0, 0,0), - OC_DCT_CW_PACK(15, 0, 0,0), - /*OC_DCT_REPEAT_RUN0_TOKEN (2 extra bits-2 already read)*/ - OC_DCT_CW_PACK( 4, 0, 0,0), - OC_DCT_CW_PACK( 5, 0, 0,0), - OC_DCT_CW_PACK( 6, 0, 0,0), - OC_DCT_CW_PACK( 7, 0, 0,0), -}; - - - -static int oc_sb_run_unpack(oc_pack_buf *_opb){ - /*Coding scheme: - Codeword Run Length - 0 1 - 10x 2-3 - 110x 4-5 - 1110xx 6-9 - 11110xxx 10-17 - 111110xxxx 18-33 - 111111xxxxxxxxxxxx 34-4129*/ - static const ogg_int16_t OC_SB_RUN_TREE[22]={ - 4, - -(1<<8|1),-(1<<8|1),-(1<<8|1),-(1<<8|1), - -(1<<8|1),-(1<<8|1),-(1<<8|1),-(1<<8|1), - -(3<<8|2),-(3<<8|2),-(3<<8|3),-(3<<8|3), - -(4<<8|4),-(4<<8|5),-(4<<8|2<<4|6-6),17, - 2, - -(2<<8|2<<4|10-6),-(2<<8|2<<4|14-6),-(2<<8|4<<4|18-6),-(2<<8|12<<4|34-6) - }; - int ret; - ret=oc_huff_token_decode(_opb,OC_SB_RUN_TREE); - if(ret>=0x10){ - int offs; - offs=ret&0x1F; - ret=6+offs+(int)oc_pack_read(_opb,ret-offs>>4); - } - return ret; -} - -static int oc_block_run_unpack(oc_pack_buf *_opb){ - /*Coding scheme: - Codeword Run Length - 0x 1-2 - 10x 3-4 - 110x 5-6 - 1110xx 7-10 - 11110xx 11-14 - 11111xxxx 15-30*/ - static const ogg_int16_t OC_BLOCK_RUN_TREE[61]={ - 5, - -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1), - -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1), - -(2<<8|2),-(2<<8|2),-(2<<8|2),-(2<<8|2), - -(2<<8|2),-(2<<8|2),-(2<<8|2),-(2<<8|2), - -(3<<8|3),-(3<<8|3),-(3<<8|3),-(3<<8|3), - -(3<<8|4),-(3<<8|4),-(3<<8|4),-(3<<8|4), - -(4<<8|5),-(4<<8|5),-(4<<8|6),-(4<<8|6), - 33, 36, 39, 44, - 1,-(1<<8|7),-(1<<8|8), - 1,-(1<<8|9),-(1<<8|10), - 2,-(2<<8|11),-(2<<8|12),-(2<<8|13),-(2<<8|14), - 4, - -(4<<8|15),-(4<<8|16),-(4<<8|17),-(4<<8|18), - 
-(4<<8|19),-(4<<8|20),-(4<<8|21),-(4<<8|22), - -(4<<8|23),-(4<<8|24),-(4<<8|25),-(4<<8|26), - -(4<<8|27),-(4<<8|28),-(4<<8|29),-(4<<8|30) - }; - return oc_huff_token_decode(_opb,OC_BLOCK_RUN_TREE); -} - - - -void oc_dec_accel_init_c(oc_dec_ctx *_dec){ -# if defined(OC_DEC_USE_VTABLE) - _dec->opt_vtable.dc_unpredict_mcu_plane= - oc_dec_dc_unpredict_mcu_plane_c; -# endif -} - -static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info, - const th_setup_info *_setup){ - int qti; - int pli; - int qi; - int ret; - ret=oc_state_init(&_dec->state,_info,3); - if(ret<0)return ret; - ret=oc_huff_trees_copy(_dec->huff_tables, - (const ogg_int16_t *const *)_setup->huff_tables); - if(ret<0){ - oc_state_clear(&_dec->state); - return ret; - } - /*For each fragment, allocate one byte for every DCT coefficient token, plus - one byte for extra-bits for each token, plus one more byte for the long - EOB run, just in case it's the very last token and has a run length of - one.*/ - _dec->dct_tokens=(unsigned char *)_ogg_malloc((64+64+1)* - _dec->state.nfrags*sizeof(_dec->dct_tokens[0])); - if(_dec->dct_tokens==NULL){ - oc_huff_trees_clear(_dec->huff_tables); - oc_state_clear(&_dec->state); - return TH_EFAULT; - } - for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){ - _dec->state.dequant_tables[qi][pli][qti]= - _dec->state.dequant_table_data[qi][pli][qti]; - } - oc_dequant_tables_init(_dec->state.dequant_tables,_dec->pp_dc_scale, - &_setup->qinfo); - for(qi=0;qi<64;qi++){ - int qsum; - qsum=0; - for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){ - qsum+=_dec->state.dequant_tables[qi][pli][qti][12]+ - _dec->state.dequant_tables[qi][pli][qti][17]+ - _dec->state.dequant_tables[qi][pli][qti][18]+ - _dec->state.dequant_tables[qi][pli][qti][24]<<(pli==0); - } - _dec->pp_sharp_mod[qi]=-(qsum>>11); - } - memcpy(_dec->state.loop_filter_limits,_setup->qinfo.loop_filter_limits, - sizeof(_dec->state.loop_filter_limits)); - oc_dec_accel_init(_dec); - _dec->pp_level=OC_PP_LEVEL_DISABLED; - 
_dec->dc_qis=NULL; - _dec->variances=NULL; - _dec->pp_frame_data=NULL; - _dec->stripe_cb.ctx=NULL; - _dec->stripe_cb.stripe_decoded=NULL; -#if defined(HAVE_CAIRO) - _dec->telemetry=0; - _dec->telemetry_bits=0; - _dec->telemetry_qi=0; - _dec->telemetry_mbmode=0; - _dec->telemetry_mv=0; - _dec->telemetry_frame_data=NULL; -#endif - return 0; -} - -static void oc_dec_clear(oc_dec_ctx *_dec){ -#if defined(HAVE_CAIRO) - _ogg_free(_dec->telemetry_frame_data); -#endif - _ogg_free(_dec->pp_frame_data); - _ogg_free(_dec->variances); - _ogg_free(_dec->dc_qis); - _ogg_free(_dec->dct_tokens); - oc_huff_trees_clear(_dec->huff_tables); - oc_state_clear(&_dec->state); -} - - -static int oc_dec_frame_header_unpack(oc_dec_ctx *_dec){ - long val; - /*Check to make sure this is a data packet.*/ - val=oc_pack_read1(&_dec->opb); - if(val!=0)return TH_EBADPACKET; - /*Read in the frame type (I or P).*/ - val=oc_pack_read1(&_dec->opb); - _dec->state.frame_type=(int)val; - /*Read in the qi list.*/ - val=oc_pack_read(&_dec->opb,6); - _dec->state.qis[0]=(unsigned char)val; - val=oc_pack_read1(&_dec->opb); - if(!val)_dec->state.nqis=1; - else{ - val=oc_pack_read(&_dec->opb,6); - _dec->state.qis[1]=(unsigned char)val; - val=oc_pack_read1(&_dec->opb); - if(!val)_dec->state.nqis=2; - else{ - val=oc_pack_read(&_dec->opb,6); - _dec->state.qis[2]=(unsigned char)val; - _dec->state.nqis=3; - } - } - if(_dec->state.frame_type==OC_INTRA_FRAME){ - /*Keyframes have 3 unused configuration bits, holdovers from VP3 days. - Most of the other unused bits in the VP3 headers were eliminated. - I don't know why these remain.*/ - /*I wanted to eliminate wasted bits, but not all config wiggle room - --Monty.*/ - val=oc_pack_read(&_dec->opb,3); - if(val!=0)return TH_EIMPL; - } - return 0; -} - -/*Mark all fragments as coded and in OC_MODE_INTRA. - This also builds up the coded fragment list (in coded order), and clears the - uncoded fragment list. 
- It does not update the coded macro block list nor the super block flags, as - those are not used when decoding INTRA frames.*/ -static void oc_dec_mark_all_intra(oc_dec_ctx *_dec){ - const oc_sb_map *sb_maps; - const oc_sb_flags *sb_flags; - oc_fragment *frags; - ptrdiff_t *coded_fragis; - ptrdiff_t ncoded_fragis; - ptrdiff_t prev_ncoded_fragis; - unsigned nsbs; - unsigned sbi; - int pli; - coded_fragis=_dec->state.coded_fragis; - prev_ncoded_fragis=ncoded_fragis=0; - sb_maps=(const oc_sb_map *)_dec->state.sb_maps; - sb_flags=_dec->state.sb_flags; - frags=_dec->state.frags; - sbi=nsbs=0; - for(pli=0;pli<3;pli++){ - nsbs+=_dec->state.fplanes[pli].nsbs; - for(;sbi<nsbs;sbi++){ - int quadi; - for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){ - int bi; - for(bi=0;bi<4;bi++){ - ptrdiff_t fragi; - fragi=sb_maps[sbi][quadi][bi]; - if(fragi>=0){ - frags[fragi].coded=1; - frags[fragi].refi=OC_FRAME_SELF; - frags[fragi].mb_mode=OC_MODE_INTRA; - coded_fragis[ncoded_fragis++]=fragi; - } - } - } - } - _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis; - prev_ncoded_fragis=ncoded_fragis; - } - _dec->state.ntotal_coded_fragis=ncoded_fragis; -} - -/*Decodes the bit flags indicating whether each super block is partially coded - or not. 
- Return: The number of partially coded super blocks.*/ -static unsigned oc_dec_partial_sb_flags_unpack(oc_dec_ctx *_dec){ - oc_sb_flags *sb_flags; - unsigned nsbs; - unsigned sbi; - unsigned npartial; - unsigned run_count; - long val; - int flag; - val=oc_pack_read1(&_dec->opb); - flag=(int)val; - sb_flags=_dec->state.sb_flags; - nsbs=_dec->state.nsbs; - sbi=npartial=0; - while(sbi<nsbs){ - int full_run; - run_count=oc_sb_run_unpack(&_dec->opb); - full_run=run_count>=4129; - do{ - sb_flags[sbi].coded_partially=flag; - sb_flags[sbi].coded_fully=0; - npartial+=flag; - sbi++; - } - while(--run_count>0&&sbi<nsbs); - if(full_run&&sbi<nsbs){ - val=oc_pack_read1(&_dec->opb); - flag=(int)val; - } - else flag=!flag; - } - /*TODO: run_count should be 0 here. - If it's not, we should issue a warning of some kind.*/ - return npartial; -} - -/*Decodes the bit flags for whether or not each non-partially-coded super - block is fully coded or not. - This function should only be called if there is at least one - non-partially-coded super block. - Return: The number of partially coded super blocks.*/ -static void oc_dec_coded_sb_flags_unpack(oc_dec_ctx *_dec){ - oc_sb_flags *sb_flags; - unsigned nsbs; - unsigned sbi; - unsigned run_count; - long val; - int flag; - sb_flags=_dec->state.sb_flags; - nsbs=_dec->state.nsbs; - /*Skip partially coded super blocks.*/ - for(sbi=0;sb_flags[sbi].coded_partially;sbi++); - val=oc_pack_read1(&_dec->opb); - flag=(int)val; - do{ - int full_run; - run_count=oc_sb_run_unpack(&_dec->opb); - full_run=run_count>=4129; - for(;sbi<nsbs;sbi++){ - if(sb_flags[sbi].coded_partially)continue; - if(run_count--<=0)break; - sb_flags[sbi].coded_fully=flag; - } - if(full_run&&sbi<nsbs){ - val=oc_pack_read1(&_dec->opb); - flag=(int)val; - } - else flag=!flag; - } - while(sbi<nsbs); - /*TODO: run_count should be 0 here. 
- If it's not, we should issue a warning of some kind.*/ -} - -static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){ - const oc_sb_map *sb_maps; - const oc_sb_flags *sb_flags; - signed char *mb_modes; - oc_fragment *frags; - unsigned nsbs; - unsigned sbi; - unsigned npartial; - long val; - int pli; - int flag; - int run_count; - ptrdiff_t *coded_fragis; - ptrdiff_t *uncoded_fragis; - ptrdiff_t ncoded_fragis; - ptrdiff_t nuncoded_fragis; - ptrdiff_t prev_ncoded_fragis; - npartial=oc_dec_partial_sb_flags_unpack(_dec); - if(npartial<_dec->state.nsbs)oc_dec_coded_sb_flags_unpack(_dec); - if(npartial>0){ - val=oc_pack_read1(&_dec->opb); - flag=!(int)val; - } - else flag=0; - sb_maps=(const oc_sb_map *)_dec->state.sb_maps; - sb_flags=_dec->state.sb_flags; - mb_modes=_dec->state.mb_modes; - frags=_dec->state.frags; - sbi=nsbs=run_count=0; - coded_fragis=_dec->state.coded_fragis; - uncoded_fragis=coded_fragis+_dec->state.nfrags; - prev_ncoded_fragis=ncoded_fragis=nuncoded_fragis=0; - for(pli=0;pli<3;pli++){ - nsbs+=_dec->state.fplanes[pli].nsbs; - for(;sbi<nsbs;sbi++){ - int quadi; - for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){ - int quad_coded; - int bi; - quad_coded=0; - for(bi=0;bi<4;bi++){ - ptrdiff_t fragi; - fragi=sb_maps[sbi][quadi][bi]; - if(fragi>=0){ - int coded; - if(sb_flags[sbi].coded_fully)coded=1; - else if(!sb_flags[sbi].coded_partially)coded=0; - else{ - if(run_count<=0){ - run_count=oc_block_run_unpack(&_dec->opb); - flag=!flag; - } - run_count--; - coded=flag; - } - if(coded)coded_fragis[ncoded_fragis++]=fragi; - else *(uncoded_fragis-++nuncoded_fragis)=fragi; - quad_coded|=coded; - frags[fragi].coded=coded; - frags[fragi].refi=OC_FRAME_NONE; - } - } - /*Remember if there's a coded luma block in this macro block.*/ - if(!pli)mb_modes[sbi<<2|quadi]=quad_coded; - } - } - _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis; - prev_ncoded_fragis=ncoded_fragis; - } - _dec->state.ntotal_coded_fragis=ncoded_fragis; - /*TODO: 
run_count should be 0 here. - If it's not, we should issue a warning of some kind.*/ -} - - -/*Coding scheme: - Codeword Mode Index - 0 0 - 10 1 - 110 2 - 1110 3 - 11110 4 - 111110 5 - 1111110 6 - 1111111 7*/ -static const ogg_int16_t OC_VLC_MODE_TREE[26]={ - 4, - -(1<<8|0),-(1<<8|0),-(1<<8|0),-(1<<8|0), - -(1<<8|0),-(1<<8|0),-(1<<8|0),-(1<<8|0), - -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1), - -(3<<8|2),-(3<<8|2),-(4<<8|3),17, - 3, - -(1<<8|4),-(1<<8|4),-(1<<8|4),-(1<<8|4), - -(2<<8|5),-(2<<8|5),-(3<<8|6),-(3<<8|7) -}; - -static const ogg_int16_t OC_CLC_MODE_TREE[9]={ - 3, - -(3<<8|0),-(3<<8|1),-(3<<8|2),-(3<<8|3), - -(3<<8|4),-(3<<8|5),-(3<<8|6),-(3<<8|7) -}; - -/*Unpacks the list of macro block modes for INTER frames.*/ -static void oc_dec_mb_modes_unpack(oc_dec_ctx *_dec){ - signed char *mb_modes; - const unsigned char *alphabet; - unsigned char scheme0_alphabet[8]; - const ogg_int16_t *mode_tree; - size_t nmbs; - size_t mbi; - long val; - int mode_scheme; - val=oc_pack_read(&_dec->opb,3); - mode_scheme=(int)val; - if(mode_scheme==0){ - int mi; - /*Just in case, initialize the modes to something. 
- If the bitstream doesn't contain each index exactly once, it's likely - corrupt and the rest of the packet is garbage anyway, but this way we - won't crash, and we'll decode SOMETHING.*/ - /*LOOP VECTORIZES*/ - for(mi=0;mi<OC_NMODES;mi++)scheme0_alphabet[mi]=OC_MODE_INTER_NOMV; - for(mi=0;mi<OC_NMODES;mi++){ - val=oc_pack_read(&_dec->opb,3); - scheme0_alphabet[val]=OC_MODE_ALPHABETS[6][mi]; - } - alphabet=scheme0_alphabet; - } - else alphabet=OC_MODE_ALPHABETS[mode_scheme-1]; - mode_tree=mode_scheme==7?OC_CLC_MODE_TREE:OC_VLC_MODE_TREE; - mb_modes=_dec->state.mb_modes; - nmbs=_dec->state.nmbs; - for(mbi=0;mbi<nmbs;mbi++){ - if(mb_modes[mbi]>0){ - /*We have a coded luma block; decode a mode.*/ - mb_modes[mbi]=alphabet[oc_huff_token_decode(&_dec->opb,mode_tree)]; - } - /*For other valid macro blocks, INTER_NOMV is forced, but we rely on the - fact that OC_MODE_INTER_NOMV is already 0.*/ - } -} - - - -static const ogg_int16_t OC_VLC_MV_COMP_TREE[101]={ - 5, - -(3<<8|32+0),-(3<<8|32+0),-(3<<8|32+0),-(3<<8|32+0), - -(3<<8|32+1),-(3<<8|32+1),-(3<<8|32+1),-(3<<8|32+1), - -(3<<8|32-1),-(3<<8|32-1),-(3<<8|32-1),-(3<<8|32-1), - -(4<<8|32+2),-(4<<8|32+2),-(4<<8|32-2),-(4<<8|32-2), - -(4<<8|32+3),-(4<<8|32+3),-(4<<8|32-3),-(4<<8|32-3), - 33, 36, 39, 42, - 45, 50, 55, 60, - 65, 74, 83, 92, - 1,-(1<<8|32+4),-(1<<8|32-4), - 1,-(1<<8|32+5),-(1<<8|32-5), - 1,-(1<<8|32+6),-(1<<8|32-6), - 1,-(1<<8|32+7),-(1<<8|32-7), - 2,-(2<<8|32+8),-(2<<8|32-8),-(2<<8|32+9),-(2<<8|32-9), - 2,-(2<<8|32+10),-(2<<8|32-10),-(2<<8|32+11),-(2<<8|32-11), - 2,-(2<<8|32+12),-(2<<8|32-12),-(2<<8|32+13),-(2<<8|32-13), - 2,-(2<<8|32+14),-(2<<8|32-14),-(2<<8|32+15),-(2<<8|32-15), - 3, - -(3<<8|32+16),-(3<<8|32-16),-(3<<8|32+17),-(3<<8|32-17), - -(3<<8|32+18),-(3<<8|32-18),-(3<<8|32+19),-(3<<8|32-19), - 3, - -(3<<8|32+20),-(3<<8|32-20),-(3<<8|32+21),-(3<<8|32-21), - -(3<<8|32+22),-(3<<8|32-22),-(3<<8|32+23),-(3<<8|32-23), - 3, - -(3<<8|32+24),-(3<<8|32-24),-(3<<8|32+25),-(3<<8|32-25), - 
-(3<<8|32+26),-(3<<8|32-26),-(3<<8|32+27),-(3<<8|32-27), - 3, - -(3<<8|32+28),-(3<<8|32-28),-(3<<8|32+29),-(3<<8|32-29), - -(3<<8|32+30),-(3<<8|32-30),-(3<<8|32+31),-(3<<8|32-31) -}; - -static const ogg_int16_t OC_CLC_MV_COMP_TREE[65]={ - 6, - -(6<<8|32 +0),-(6<<8|32 -0),-(6<<8|32 +1),-(6<<8|32 -1), - -(6<<8|32 +2),-(6<<8|32 -2),-(6<<8|32 +3),-(6<<8|32 -3), - -(6<<8|32 +4),-(6<<8|32 -4),-(6<<8|32 +5),-(6<<8|32 -5), - -(6<<8|32 +6),-(6<<8|32 -6),-(6<<8|32 +7),-(6<<8|32 -7), - -(6<<8|32 +8),-(6<<8|32 -8),-(6<<8|32 +9),-(6<<8|32 -9), - -(6<<8|32+10),-(6<<8|32-10),-(6<<8|32+11),-(6<<8|32-11), - -(6<<8|32+12),-(6<<8|32-12),-(6<<8|32+13),-(6<<8|32-13), - -(6<<8|32+14),-(6<<8|32-14),-(6<<8|32+15),-(6<<8|32-15), - -(6<<8|32+16),-(6<<8|32-16),-(6<<8|32+17),-(6<<8|32-17), - -(6<<8|32+18),-(6<<8|32-18),-(6<<8|32+19),-(6<<8|32-19), - -(6<<8|32+20),-(6<<8|32-20),-(6<<8|32+21),-(6<<8|32-21), - -(6<<8|32+22),-(6<<8|32-22),-(6<<8|32+23),-(6<<8|32-23), - -(6<<8|32+24),-(6<<8|32-24),-(6<<8|32+25),-(6<<8|32-25), - -(6<<8|32+26),-(6<<8|32-26),-(6<<8|32+27),-(6<<8|32-27), - -(6<<8|32+28),-(6<<8|32-28),-(6<<8|32+29),-(6<<8|32-29), - -(6<<8|32+30),-(6<<8|32-30),-(6<<8|32+31),-(6<<8|32-31) -}; - - -static oc_mv oc_mv_unpack(oc_pack_buf *_opb,const ogg_int16_t *_tree){ - int dx; - int dy; - dx=oc_huff_token_decode(_opb,_tree)-32; - dy=oc_huff_token_decode(_opb,_tree)-32; - return OC_MV(dx,dy); -} - -/*Unpacks the list of motion vectors for INTER frames, and propagtes the macro - block modes and motion vectors to the individual fragments.*/ -static void oc_dec_mv_unpack_and_frag_modes_fill(oc_dec_ctx *_dec){ - const oc_mb_map *mb_maps; - const signed char *mb_modes; - oc_set_chroma_mvs_func set_chroma_mvs; - const ogg_int16_t *mv_comp_tree; - oc_fragment *frags; - oc_mv *frag_mvs; - const unsigned char *map_idxs; - int map_nidxs; - oc_mv last_mv; - oc_mv prior_mv; - oc_mv cbmvs[4]; - size_t nmbs; - size_t mbi; - long val; - set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_dec->state.info.pixel_fmt]; 
- val=oc_pack_read1(&_dec->opb); - mv_comp_tree=val?OC_CLC_MV_COMP_TREE:OC_VLC_MV_COMP_TREE; - map_idxs=OC_MB_MAP_IDXS[_dec->state.info.pixel_fmt]; - map_nidxs=OC_MB_MAP_NIDXS[_dec->state.info.pixel_fmt]; - prior_mv=last_mv=0; - frags=_dec->state.frags; - frag_mvs=_dec->state.frag_mvs; - mb_maps=(const oc_mb_map *)_dec->state.mb_maps; - mb_modes=_dec->state.mb_modes; - nmbs=_dec->state.nmbs; - for(mbi=0;mbi<nmbs;mbi++){ - int mb_mode; - mb_mode=mb_modes[mbi]; - if(mb_mode!=OC_MODE_INVALID){ - oc_mv mbmv; - ptrdiff_t fragi; - int mapi; - int mapii; - int refi; - if(mb_mode==OC_MODE_INTER_MV_FOUR){ - oc_mv lbmvs[4]; - int bi; - prior_mv=last_mv; - for(bi=0;bi<4;bi++){ - fragi=mb_maps[mbi][0][bi]; - if(frags[fragi].coded){ - frags[fragi].refi=OC_FRAME_PREV; - frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR; - lbmvs[bi]=last_mv=oc_mv_unpack(&_dec->opb,mv_comp_tree); - frag_mvs[fragi]=lbmvs[bi]; - } - else lbmvs[bi]=0; - } - (*set_chroma_mvs)(cbmvs,lbmvs); - for(mapii=4;mapii<map_nidxs;mapii++){ - mapi=map_idxs[mapii]; - bi=mapi&3; - fragi=mb_maps[mbi][mapi>>2][bi]; - if(frags[fragi].coded){ - frags[fragi].refi=OC_FRAME_PREV; - frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR; - frag_mvs[fragi]=cbmvs[bi]; - } - } - } - else{ - switch(mb_mode){ - case OC_MODE_INTER_MV:{ - prior_mv=last_mv; - last_mv=mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree); - }break; - case OC_MODE_INTER_MV_LAST:mbmv=last_mv;break; - case OC_MODE_INTER_MV_LAST2:{ - mbmv=prior_mv; - prior_mv=last_mv; - last_mv=mbmv; - }break; - case OC_MODE_GOLDEN_MV:{ - mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree); - }break; - default:mbmv=0;break; - } - /*Fill in the MVs for the fragments.*/ - refi=OC_FRAME_FOR_MODE(mb_mode); - mapii=0; - do{ - mapi=map_idxs[mapii]; - fragi=mb_maps[mbi][mapi>>2][mapi&3]; - if(frags[fragi].coded){ - frags[fragi].refi=refi; - frags[fragi].mb_mode=mb_mode; - frag_mvs[fragi]=mbmv; - } - } - while(++mapii<map_nidxs); - } - } - } -} - -static void oc_dec_block_qis_unpack(oc_dec_ctx *_dec){ - oc_fragment 
*frags; - const ptrdiff_t *coded_fragis; - ptrdiff_t ncoded_fragis; - ptrdiff_t fragii; - ptrdiff_t fragi; - ncoded_fragis=_dec->state.ntotal_coded_fragis; - if(ncoded_fragis<=0)return; - frags=_dec->state.frags; - coded_fragis=_dec->state.coded_fragis; - if(_dec->state.nqis==1){ - /*If this frame has only a single qi value, then just use it for all coded - fragments.*/ - for(fragii=0;fragii<ncoded_fragis;fragii++){ - frags[coded_fragis[fragii]].qii=0; - } - } - else{ - long val; - int flag; - int nqi1; - int run_count; - /*Otherwise, we decode a qi index for each fragment, using two passes of - the same binary RLE scheme used for super-block coded bits. - The first pass marks each fragment as having a qii of 0 or greater than - 0, and the second pass (if necessary), distinguishes between a qii of - 1 and 2. - At first we just store the qii in the fragment. - After all the qii's are decoded, we make a final pass to replace them - with the corresponding qi's for this frame.*/ - val=oc_pack_read1(&_dec->opb); - flag=(int)val; - nqi1=0; - fragii=0; - while(fragii<ncoded_fragis){ - int full_run; - run_count=oc_sb_run_unpack(&_dec->opb); - full_run=run_count>=4129; - do{ - frags[coded_fragis[fragii++]].qii=flag; - nqi1+=flag; - } - while(--run_count>0&&fragii<ncoded_fragis); - if(full_run&&fragii<ncoded_fragis){ - val=oc_pack_read1(&_dec->opb); - flag=(int)val; - } - else flag=!flag; - } - /*TODO: run_count should be 0 here. 
- If it's not, we should issue a warning of some kind.*/ - /*If we have 3 different qi's for this frame, and there was at least one - fragment with a non-zero qi, make the second pass.*/ - if(_dec->state.nqis==3&&nqi1>0){ - /*Skip qii==0 fragments.*/ - for(fragii=0;frags[coded_fragis[fragii]].qii==0;fragii++); - val=oc_pack_read1(&_dec->opb); - flag=(int)val; - do{ - int full_run; - run_count=oc_sb_run_unpack(&_dec->opb); - full_run=run_count>=4129; - for(;fragii<ncoded_fragis;fragii++){ - fragi=coded_fragis[fragii]; - if(frags[fragi].qii==0)continue; - if(run_count--<=0)break; - frags[fragi].qii+=flag; - } - if(full_run&&fragii<ncoded_fragis){ - val=oc_pack_read1(&_dec->opb); - flag=(int)val; - } - else flag=!flag; - } - while(fragii<ncoded_fragis); - /*TODO: run_count should be 0 here. - If it's not, we should issue a warning of some kind.*/ - } - } -} - - - -/*Unpacks the DC coefficient tokens. - Unlike when unpacking the AC coefficient tokens, we actually need to decode - the DC coefficient values now so that we can do DC prediction. - _huff_idx: The index of the Huffman table to use for each color plane. - _ntoks_left: The number of tokens left to be decoded in each color plane for - each coefficient. - This is updated as EOB tokens and zero run tokens are decoded. 
- Return: The length of any outstanding EOB run.*/ -static ptrdiff_t oc_dec_dc_coeff_unpack(oc_dec_ctx *_dec,int _huff_idxs[2], - ptrdiff_t _ntoks_left[3][64]){ - unsigned char *dct_tokens; - oc_fragment *frags; - const ptrdiff_t *coded_fragis; - ptrdiff_t ncoded_fragis; - ptrdiff_t fragii; - ptrdiff_t eobs; - ptrdiff_t ti; - int pli; - dct_tokens=_dec->dct_tokens; - frags=_dec->state.frags; - coded_fragis=_dec->state.coded_fragis; - ncoded_fragis=fragii=eobs=ti=0; - for(pli=0;pli<3;pli++){ - ptrdiff_t run_counts[64]; - ptrdiff_t eob_count; - ptrdiff_t eobi; - int rli; - ncoded_fragis+=_dec->state.ncoded_fragis[pli]; - memset(run_counts,0,sizeof(run_counts)); - _dec->eob_runs[pli][0]=eobs; - _dec->ti0[pli][0]=ti; - /*Continue any previous EOB run, if there was one.*/ - eobi=eobs; - if(ncoded_fragis-fragii<eobi)eobi=ncoded_fragis-fragii; - eob_count=eobi; - eobs-=eobi; - while(eobi-->0)frags[coded_fragis[fragii++]].dc=0; - while(fragii<ncoded_fragis){ - int token; - int cw; - int eb; - int skip; - token=oc_huff_token_decode(&_dec->opb, - _dec->huff_tables[_huff_idxs[pli+1>>1]]); - dct_tokens[ti++]=(unsigned char)token; - if(OC_DCT_TOKEN_NEEDS_MORE(token)){ - eb=(int)oc_pack_read(&_dec->opb, - OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[token]); - dct_tokens[ti++]=(unsigned char)eb; - if(token==OC_DCT_TOKEN_FAT_EOB)dct_tokens[ti++]=(unsigned char)(eb>>8); - eb<<=OC_DCT_TOKEN_EB_POS(token); - } - else eb=0; - cw=OC_DCT_CODE_WORD[token]+eb; - eobs=cw>>OC_DCT_CW_EOB_SHIFT&0xFFF; - if(cw==OC_DCT_CW_FINISH)eobs=OC_DCT_EOB_FINISH; - if(eobs){ - eobi=OC_MINI(eobs,ncoded_fragis-fragii); - eob_count+=eobi; - eobs-=eobi; - while(eobi-->0)frags[coded_fragis[fragii++]].dc=0; - } - else{ - int coeff; - skip=(unsigned char)(cw>>OC_DCT_CW_RLEN_SHIFT); - cw^=-(cw&1<<OC_DCT_CW_FLIP_BIT); - coeff=cw>>OC_DCT_CW_MAG_SHIFT; - if(skip)coeff=0; - run_counts[skip]++; - frags[coded_fragis[fragii++]].dc=coeff; - } - } - /*Add the total EOB count to the longest run length.*/ - run_counts[63]+=eob_count; 
- /*And convert the run_counts array to a moment table.*/ - for(rli=63;rli-->0;)run_counts[rli]+=run_counts[rli+1]; - /*Finally, subtract off the number of coefficients that have been - accounted for by runs started in this coefficient.*/ - for(rli=64;rli-->0;)_ntoks_left[pli][rli]-=run_counts[rli]; - } - _dec->dct_tokens_count=ti; - return eobs; -} - -/*Unpacks the AC coefficient tokens. - This can completely discard coefficient values while unpacking, and so is - somewhat simpler than unpacking the DC coefficient tokens. - _huff_idx: The index of the Huffman table to use for each color plane. - _ntoks_left: The number of tokens left to be decoded in each color plane for - each coefficient. - This is updated as EOB tokens and zero run tokens are decoded. - _eobs: The length of any outstanding EOB run from previous - coefficients. - Return: The length of any outstanding EOB run.*/ -static int oc_dec_ac_coeff_unpack(oc_dec_ctx *_dec,int _zzi,int _huff_idxs[2], - ptrdiff_t _ntoks_left[3][64],ptrdiff_t _eobs){ - unsigned char *dct_tokens; - ptrdiff_t ti; - int pli; - dct_tokens=_dec->dct_tokens; - ti=_dec->dct_tokens_count; - for(pli=0;pli<3;pli++){ - ptrdiff_t run_counts[64]; - ptrdiff_t eob_count; - size_t ntoks_left; - size_t ntoks; - int rli; - _dec->eob_runs[pli][_zzi]=_eobs; - _dec->ti0[pli][_zzi]=ti; - ntoks_left=_ntoks_left[pli][_zzi]; - memset(run_counts,0,sizeof(run_counts)); - eob_count=0; - ntoks=0; - while(ntoks+_eobs<ntoks_left){ - int token; - int cw; - int eb; - int skip; - ntoks+=_eobs; - eob_count+=_eobs; - token=oc_huff_token_decode(&_dec->opb, - _dec->huff_tables[_huff_idxs[pli+1>>1]]); - dct_tokens[ti++]=(unsigned char)token; - if(OC_DCT_TOKEN_NEEDS_MORE(token)){ - eb=(int)oc_pack_read(&_dec->opb, - OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[token]); - dct_tokens[ti++]=(unsigned char)eb; - if(token==OC_DCT_TOKEN_FAT_EOB)dct_tokens[ti++]=(unsigned char)(eb>>8); - eb<<=OC_DCT_TOKEN_EB_POS(token); - } - else eb=0; - cw=OC_DCT_CODE_WORD[token]+eb; - 
skip=(unsigned char)(cw>>OC_DCT_CW_RLEN_SHIFT); - _eobs=cw>>OC_DCT_CW_EOB_SHIFT&0xFFF; - if(cw==OC_DCT_CW_FINISH)_eobs=OC_DCT_EOB_FINISH; - if(_eobs==0){ - run_counts[skip]++; - ntoks++; - } - } - /*Add the portion of the last EOB run actually used by this coefficient.*/ - eob_count+=ntoks_left-ntoks; - /*And remove it from the remaining EOB count.*/ - _eobs-=ntoks_left-ntoks; - /*Add the total EOB count to the longest run length.*/ - run_counts[63]+=eob_count; - /*And convert the run_counts array to a moment table.*/ - for(rli=63;rli-->0;)run_counts[rli]+=run_counts[rli+1]; - /*Finally, subtract off the number of coefficients that have been - accounted for by runs started in this coefficient.*/ - for(rli=64-_zzi;rli-->0;)_ntoks_left[pli][_zzi+rli]-=run_counts[rli]; - } - _dec->dct_tokens_count=ti; - return _eobs; -} - -/*Tokens describing the DCT coefficients that belong to each fragment are - stored in the bitstream grouped by coefficient, not by fragment. - - This means that we either decode all the tokens in order, building up a - separate coefficient list for each fragment as we go, and then go back and - do the iDCT on each fragment, or we have to create separate lists of tokens - for each coefficient, so that we can pull the next token required off the - head of the appropriate list when decoding a specific fragment. - - The former was VP3's choice, and it meant 2*w*h extra storage for all the - decoded coefficient values. - - We take the second option, which lets us store just one to three bytes per - token (generally far fewer than the number of coefficients, due to EOB - tokens and zero runs), and which requires us to only maintain a counter for - each of the 64 coefficients, instead of a counter for every fragment to - determine where the next token goes. - - We actually use 3 counters per coefficient, one for each color plane, so we - can decode all color planes simultaneously. 
- This lets color conversion, etc., be done as soon as a full MCU (one or - two super block rows) is decoded, while the image data is still in cache.*/ - -static void oc_dec_residual_tokens_unpack(oc_dec_ctx *_dec){ - static const unsigned char OC_HUFF_LIST_MAX[5]={1,6,15,28,64}; - ptrdiff_t ntoks_left[3][64]; - int huff_idxs[2]; - ptrdiff_t eobs; - long val; - int pli; - int zzi; - int hgi; - for(pli=0;pli<3;pli++)for(zzi=0;zzi<64;zzi++){ - ntoks_left[pli][zzi]=_dec->state.ncoded_fragis[pli]; - } - val=oc_pack_read(&_dec->opb,4); - huff_idxs[0]=(int)val; - val=oc_pack_read(&_dec->opb,4); - huff_idxs[1]=(int)val; - _dec->eob_runs[0][0]=0; - eobs=oc_dec_dc_coeff_unpack(_dec,huff_idxs,ntoks_left); -#if defined(HAVE_CAIRO) - _dec->telemetry_dc_bytes=oc_pack_bytes_left(&_dec->opb); -#endif - val=oc_pack_read(&_dec->opb,4); - huff_idxs[0]=(int)val; - val=oc_pack_read(&_dec->opb,4); - huff_idxs[1]=(int)val; - zzi=1; - for(hgi=1;hgi<5;hgi++){ - huff_idxs[0]+=16; - huff_idxs[1]+=16; - for(;zzi<OC_HUFF_LIST_MAX[hgi];zzi++){ - eobs=oc_dec_ac_coeff_unpack(_dec,zzi,huff_idxs,ntoks_left,eobs); - } - } - /*TODO: eobs should be exactly zero, or 4096 or greater. - The second case occurs when an EOB run of size zero is encountered, which - gets treated as an infinite EOB run (where infinity is PTRDIFF_MAX). 
- If neither of these conditions holds, then a warning should be issued.*/ -} - - -static int oc_dec_postprocess_init(oc_dec_ctx *_dec){ - /*pp_level 0: disabled; free any memory used and return*/ - if(_dec->pp_level<=OC_PP_LEVEL_DISABLED){ - if(_dec->dc_qis!=NULL){ - _ogg_free(_dec->dc_qis); - _dec->dc_qis=NULL; - _ogg_free(_dec->variances); - _dec->variances=NULL; - _ogg_free(_dec->pp_frame_data); - _dec->pp_frame_data=NULL; - } - return 1; - } - if(_dec->dc_qis==NULL){ - /*If we haven't been tracking DC quantization indices, there's no point in - starting now.*/ - if(_dec->state.frame_type!=OC_INTRA_FRAME)return 1; - _dec->dc_qis=(unsigned char *)_ogg_malloc( - _dec->state.nfrags*sizeof(_dec->dc_qis[0])); - if(_dec->dc_qis==NULL)return 1; - memset(_dec->dc_qis,_dec->state.qis[0],_dec->state.nfrags); - } - else{ - unsigned char *dc_qis; - const ptrdiff_t *coded_fragis; - ptrdiff_t ncoded_fragis; - ptrdiff_t fragii; - unsigned char qi0; - /*Update the DC quantization index of each coded block.*/ - dc_qis=_dec->dc_qis; - coded_fragis=_dec->state.coded_fragis; - ncoded_fragis=_dec->state.ncoded_fragis[0]+ - _dec->state.ncoded_fragis[1]+_dec->state.ncoded_fragis[2]; - qi0=(unsigned char)_dec->state.qis[0]; - for(fragii=0;fragii<ncoded_fragis;fragii++){ - dc_qis[coded_fragis[fragii]]=qi0; - } - } - /*pp_level 1: Stop after updating DC quantization indices.*/ - if(_dec->pp_level<=OC_PP_LEVEL_TRACKDCQI){ - if(_dec->variances!=NULL){ - _ogg_free(_dec->variances); - _dec->variances=NULL; - _ogg_free(_dec->pp_frame_data); - _dec->pp_frame_data=NULL; - } - return 1; - } - if(_dec->variances==NULL){ - size_t frame_sz; - size_t c_sz; - int c_w; - int c_h; - frame_sz=_dec->state.info.frame_width*(size_t)_dec->state.info.frame_height; - c_w=_dec->state.info.frame_width>>!(_dec->state.info.pixel_fmt&1); - c_h=_dec->state.info.frame_height>>!(_dec->state.info.pixel_fmt&2); - c_sz=c_w*(size_t)c_h; - /*Allocate space for the chroma planes, even if we're not going to use - them; 
this simplifies allocation state management, though it may waste - memory on the few systems that don't overcommit pages.*/ - frame_sz+=c_sz<<1; - _dec->pp_frame_data=(unsigned char *)_ogg_malloc( - frame_sz*sizeof(_dec->pp_frame_data[0])); - _dec->variances=(int *)_ogg_malloc( - _dec->state.nfrags*sizeof(_dec->variances[0])); - if(_dec->variances==NULL||_dec->pp_frame_data==NULL){ - _ogg_free(_dec->pp_frame_data); - _dec->pp_frame_data=NULL; - _ogg_free(_dec->variances); - _dec->variances=NULL; - return 1; - } - /*Force an update of the PP buffer pointers.*/ - _dec->pp_frame_state=0; - } - /*Update the PP buffer pointers if necessary.*/ - if(_dec->pp_frame_state!=1+(_dec->pp_level>=OC_PP_LEVEL_DEBLOCKC)){ - if(_dec->pp_level<OC_PP_LEVEL_DEBLOCKC){ - /*If chroma processing is disabled, just use the PP luma plane.*/ - _dec->pp_frame_buf[0].width=_dec->state.info.frame_width; - _dec->pp_frame_buf[0].height=_dec->state.info.frame_height; - _dec->pp_frame_buf[0].stride=-_dec->pp_frame_buf[0].width; - _dec->pp_frame_buf[0].data=_dec->pp_frame_data+ - (1-_dec->pp_frame_buf[0].height)*(ptrdiff_t)_dec->pp_frame_buf[0].stride; - } - else{ - size_t y_sz; - size_t c_sz; - int c_w; - int c_h; - /*Otherwise, set up pointers to all three PP planes.*/ - y_sz=_dec->state.info.frame_width*(size_t)_dec->state.info.frame_height; - c_w=_dec->state.info.frame_width>>!(_dec->state.info.pixel_fmt&1); - c_h=_dec->state.info.frame_height>>!(_dec->state.info.pixel_fmt&2); - c_sz=c_w*(size_t)c_h; - _dec->pp_frame_buf[0].width=_dec->state.info.frame_width; - _dec->pp_frame_buf[0].height=_dec->state.info.frame_height; - _dec->pp_frame_buf[0].stride=_dec->pp_frame_buf[0].width; - _dec->pp_frame_buf[0].data=_dec->pp_frame_data; - _dec->pp_frame_buf[1].width=c_w; - _dec->pp_frame_buf[1].height=c_h; - _dec->pp_frame_buf[1].stride=_dec->pp_frame_buf[1].width; - _dec->pp_frame_buf[1].data=_dec->pp_frame_buf[0].data+y_sz; - _dec->pp_frame_buf[2].width=c_w; - _dec->pp_frame_buf[2].height=c_h; - 
_dec->pp_frame_buf[2].stride=_dec->pp_frame_buf[2].width; - _dec->pp_frame_buf[2].data=_dec->pp_frame_buf[1].data+c_sz; - oc_ycbcr_buffer_flip(_dec->pp_frame_buf,_dec->pp_frame_buf); - } - _dec->pp_frame_state=1+(_dec->pp_level>=OC_PP_LEVEL_DEBLOCKC); - } - /*If we're not processing chroma, copy the reference frame's chroma planes.*/ - if(_dec->pp_level<OC_PP_LEVEL_DEBLOCKC){ - memcpy(_dec->pp_frame_buf+1, - _dec->state.ref_frame_bufs[_dec->state.ref_frame_idx[OC_FRAME_SELF]]+1, - sizeof(_dec->pp_frame_buf[1])*2); - } - return 0; -} - - -/*Initialize the main decoding pipeline.*/ -static void oc_dec_pipeline_init(oc_dec_ctx *_dec, - oc_dec_pipeline_state *_pipe){ - const ptrdiff_t *coded_fragis; - const ptrdiff_t *uncoded_fragis; - int flimit; - int pli; - int qii; - int qti; - int zzi; - /*If chroma is sub-sampled in the vertical direction, we have to decode two - super block rows of Y' for each super block row of Cb and Cr.*/ - _pipe->mcu_nvfrags=4<<!(_dec->state.info.pixel_fmt&2); - /*Initialize the token and extra bits indices for each plane and - coefficient.*/ - memcpy(_pipe->ti,_dec->ti0,sizeof(_pipe->ti)); - /*Also copy over the initial the EOB run counts.*/ - memcpy(_pipe->eob_runs,_dec->eob_runs,sizeof(_pipe->eob_runs)); - /*Set up per-plane pointers to the coded and uncoded fragments lists.*/ - coded_fragis=_dec->state.coded_fragis; - uncoded_fragis=coded_fragis+_dec->state.nfrags; - for(pli=0;pli<3;pli++){ - ptrdiff_t ncoded_fragis; - _pipe->coded_fragis[pli]=coded_fragis; - _pipe->uncoded_fragis[pli]=uncoded_fragis; - ncoded_fragis=_dec->state.ncoded_fragis[pli]; - coded_fragis+=ncoded_fragis; - uncoded_fragis+=ncoded_fragis-_dec->state.fplanes[pli].nfrags; - } - /*Set up condensed quantizer tables.*/ - for(pli=0;pli<3;pli++){ - for(qii=0;qii<_dec->state.nqis;qii++){ - for(qti=0;qti<2;qti++){ - _pipe->dequant[pli][qii][qti]= - _dec->state.dequant_tables[_dec->state.qis[qii]][pli][qti]; - } - } - } - /*Set the previous DC predictor to 0 for all color 
planes and frame types.*/ - memset(_pipe->pred_last,0,sizeof(_pipe->pred_last)); - /*Initialize the bounding value array for the loop filter.*/ - flimit=_dec->state.loop_filter_limits[_dec->state.qis[0]]; - _pipe->loop_filter=flimit!=0; - if(flimit!=0)oc_loop_filter_init(&_dec->state,_pipe->bounding_values,flimit); - /*Initialize any buffers needed for post-processing. - We also save the current post-processing level, to guard against the user - changing it from a callback.*/ - if(!oc_dec_postprocess_init(_dec))_pipe->pp_level=_dec->pp_level; - /*If we don't have enough information to post-process, disable it, regardless - of the user-requested level.*/ - else{ - _pipe->pp_level=OC_PP_LEVEL_DISABLED; - memcpy(_dec->pp_frame_buf, - _dec->state.ref_frame_bufs[_dec->state.ref_frame_idx[OC_FRAME_SELF]], - sizeof(_dec->pp_frame_buf[0])*3); - } - /*Clear down the DCT coefficient buffer for the first block.*/ - for(zzi=0;zzi<64;zzi++)_pipe->dct_coeffs[zzi]=0; -} - -/*Undo the DC prediction in a single plane of an MCU (one or two super block - rows). 
- As a side effect, the number of coded and uncoded fragments in this plane of
   the MCU is also computed.*/
/*_dec:  The decoder context.
  _pipe: The pipeline state; fragy0[_pli] and fragy_end[_pli] select the
          fragment rows to process, and ncoded_fragis[_pli] and
          nuncoded_fragis[_pli] receive the resulting counts.
  _pli:  The color plane to process.*/
void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
 oc_dec_pipeline_state *_pipe,int _pli){
  const oc_fragment_plane *fplane;
  oc_fragment             *cur;
  int                     *pred_last;
  ptrdiff_t                ncoded;
  int                      ncols;
  int                      row0;
  int                      row_end;
  int                      row;
  int                      col;
  fplane=_dec->state.fplanes+_pli;
  ncols=fplane->nhfrags;
  row0=_pipe->fragy0[_pli];
  row_end=_pipe->fragy_end[_pli];
  pred_last=_pipe->pred_last[_pli];
  /*cur walks the fragments of the selected rows in raster order.*/
  cur=_dec->state.frags+(fplane->froffset+row0*(ptrdiff_t)ncols);
  ncoded=0;
  for(row=row0;row<row_end;row++){
    if(row!=0){
      const oc_fragment *up;
      int                l_ref;
      int                ul_ref;
      int                u_ref;
      /*up always points at the fragment directly above cur.*/
      up=cur-ncols;
      l_ref=-1;
      ul_ref=-1;
      u_ref=up->refi;
      for(col=0;col<ncols;col++,cur++,up++){
        int ur_ref;
        /*There is no upper-right neighbor in the last column.*/
        ur_ref=col+1<ncols?up[1].refi:-1;
        if(!cur->coded)l_ref=-1;
        else{
          int refi;
          int mask;
          int pred;
          refi=cur->refi;
          /*One bit per neighbor that uses our reference frame:
             1 left, 2 upper-left, 4 up, 8 upper-right.
            A single switch on the combination is cheaper than a generic
             weighted sum, and lets the compiler simplify the constant
             multiplications.*/
          mask=(l_ref==refi)|(ul_ref==refi)<<1|
           (u_ref==refi)<<2|(ur_ref==refi)<<3;
          switch(mask){
            case  1:
            case  3:pred=cur[-1].dc;break;
            case  2:pred=up[-1].dc;break;
            case  4:
            case  6:
            case 12:pred=up[0].dc;break;
            case  5:pred=(cur[-1].dc+up[0].dc)/2;break;
            case  8:pred=up[1].dc;break;
            case  9:
            case 11:
            case 13:{
              /*The TI compiler mis-compiles this line.*/
              pred=(75*cur[-1].dc+53*up[1].dc)/128;
            }break;
            case 10:pred=(up[-1].dc+up[1].dc)/2;break;
            case 14:{
              pred=(3*(up[-1].dc+up[1].dc)+10*up[0].dc)/16;
            }break;
            case  7:
            case 15:{
              int left;
              int upleft;
              int above;
              left=cur[-1].dc;
              upleft=up[-1].dc;
              above=up[0].dc;
              pred=(29*(left+above)-26*upleft)/32;
              /*Fall back to a single neighbor when the weighted value strays
                 too far from it.*/
              if(abs(pred-above)>128)pred=above;
              else if(abs(pred-left)>128)pred=left;
              else if(abs(pred-upleft)>128)pred=upleft;
            }break;
            /*No usable neighbor: reuse the last DC seen for this reference
               frame.*/
            default:pred=pred_last[refi];break;
          }
          cur->dc+=pred;
          pred_last[refi]=cur->dc;
          ncoded++;
          l_ref=refi;
        }
        ul_ref=u_ref;
        u_ref=ur_ref;
      }
    }
    else{
      /*In the first row every case collapses to the last DC value seen for
         the same reference frame.*/
      for(col=0;col<ncols;col++,cur++){
        int refi;
        if(!cur->coded)continue;
        refi=cur->refi;
        cur->dc+=pred_last[refi];
        pred_last[refi]=cur->dc;
        ncoded++;
      }
    }
  }
  _pipe->ncoded_fragis[_pli]=ncoded;
  /*Whatever was not coded will have to be copied from the previous frame.*/
  _pipe->nuncoded_fragis[_pli]=(row_end-row0)*(ptrdiff_t)ncols-ncoded;
}

/*Reconstructs all coded fragments in a single MCU (one or two super block
   rows).
  This requires that each coded fragment have a proper macro block mode and
   motion vector (if not in INTRA mode), and have its DC value decoded, with
   the DC prediction process reversed, and the number of coded and uncoded
   fragments in this plane of the MCU be counted.
- The token lists for each color plane and coefficient should also be filled - in, along with initial token offsets, extra bits offsets, and EOB run - counts.*/ -static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec, - oc_dec_pipeline_state *_pipe,int _pli){ - unsigned char *dct_tokens; - const unsigned char *dct_fzig_zag; - ogg_uint16_t dc_quant[2]; - const oc_fragment *frags; - const ptrdiff_t *coded_fragis; - ptrdiff_t ncoded_fragis; - ptrdiff_t fragii; - ptrdiff_t *ti; - ptrdiff_t *eob_runs; - int qti; - dct_tokens=_dec->dct_tokens; - dct_fzig_zag=_dec->state.opt_data.dct_fzig_zag; - frags=_dec->state.frags; - coded_fragis=_pipe->coded_fragis[_pli]; - ncoded_fragis=_pipe->ncoded_fragis[_pli]; - ti=_pipe->ti[_pli]; - eob_runs=_pipe->eob_runs[_pli]; - for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->dequant[_pli][0][qti][0]; - for(fragii=0;fragii<ncoded_fragis;fragii++){ - const ogg_uint16_t *ac_quant; - ptrdiff_t fragi; - int last_zzi; - int zzi; - fragi=coded_fragis[fragii]; - qti=frags[fragi].mb_mode!=OC_MODE_INTRA; - ac_quant=_pipe->dequant[_pli][frags[fragi].qii][qti]; - /*Decode the AC coefficients.*/ - for(zzi=0;zzi<64;){ - int token; - last_zzi=zzi; - if(eob_runs[zzi]){ - eob_runs[zzi]--; - break; - } - else{ - ptrdiff_t eob; - int cw; - int rlen; - int coeff; - int lti; - lti=ti[zzi]; - token=dct_tokens[lti++]; - cw=OC_DCT_CODE_WORD[token]; - /*These parts could be done branchless, but the branches are fairly - predictable and the C code translates into more than a few - instructions, so it's worth it to avoid them.*/ - if(OC_DCT_TOKEN_NEEDS_MORE(token)){ - cw+=dct_tokens[lti++]<<OC_DCT_TOKEN_EB_POS(token); - } - eob=cw>>OC_DCT_CW_EOB_SHIFT&0xFFF; - if(token==OC_DCT_TOKEN_FAT_EOB){ - eob+=dct_tokens[lti++]<<8; - if(eob==0)eob=OC_DCT_EOB_FINISH; - } - rlen=(unsigned char)(cw>>OC_DCT_CW_RLEN_SHIFT); - cw^=-(cw&1<<OC_DCT_CW_FLIP_BIT); - coeff=cw>>OC_DCT_CW_MAG_SHIFT; - eob_runs[zzi]=eob; - ti[zzi]=lti; - zzi+=rlen; - _pipe->dct_coeffs[dct_fzig_zag[zzi]]= - 
(ogg_int16_t)(coeff*(int)ac_quant[zzi]); - zzi+=!eob; - } - } - /*TODO: zzi should be exactly 64 here. - If it's not, we should report some kind of warning.*/ - zzi=OC_MINI(zzi,64); - _pipe->dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc; - /*last_zzi is always initialized. - If your compiler thinks otherwise, it is dumb.*/ - oc_state_frag_recon(&_dec->state,fragi,_pli, - _pipe->dct_coeffs,last_zzi,dc_quant[qti]); - } - _pipe->coded_fragis[_pli]+=ncoded_fragis; - /*Right now the reconstructed MCU has only the coded blocks in it.*/ - /*TODO: We make the decision here to always copy the uncoded blocks into it - from the reference frame. - We could also copy the coded blocks back over the reference frame, if we - wait for an additional MCU to be decoded, which might be faster if only a - small number of blocks are coded. - However, this introduces more latency, creating a larger cache footprint. - It's unknown which decision is better, but this one results in simpler - code, and the hard case (high bitrate, high resolution) is handled - correctly.*/ - /*Copy the uncoded blocks from the previous reference frame.*/ - if(_pipe->nuncoded_fragis[_pli]>0){ - _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli]; - oc_frag_copy_list(&_dec->state, - _dec->state.ref_frame_data[OC_FRAME_SELF], - _dec->state.ref_frame_data[OC_FRAME_PREV], - _dec->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli], - _pipe->nuncoded_fragis[_pli],_dec->state.frag_buf_offs); - } -} - -/*Filter a horizontal block edge.*/ -static void oc_filter_hedge(unsigned char *_dst,int _dst_ystride, - const unsigned char *_src,int _src_ystride,int _qstep,int _flimit, - int *_variance0,int *_variance1){ - unsigned char *rdst; - const unsigned char *rsrc; - unsigned char *cdst; - const unsigned char *csrc; - int r[10]; - int sum0; - int sum1; - int bx; - int by; - rdst=_dst; - rsrc=_src; - for(bx=0;bx<8;bx++){ - cdst=rdst; - csrc=rsrc; - for(by=0;by<10;by++){ - r[by]=*csrc; - csrc+=_src_ystride; - } - 
sum0=sum1=0; - for(by=0;by<4;by++){ - sum0+=abs(r[by+1]-r[by]); - sum1+=abs(r[by+5]-r[by+6]); - } - *_variance0+=OC_MINI(255,sum0); - *_variance1+=OC_MINI(255,sum1); - if(sum0<_flimit&&sum1<_flimit&&r[5]-r[4]<_qstep&&r[4]-r[5]<_qstep){ - *cdst=(unsigned char)(r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4>>3); - cdst+=_dst_ystride; - *cdst=(unsigned char)(r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4>>3); - cdst+=_dst_ystride; - for(by=0;by<4;by++){ - *cdst=(unsigned char)(r[by]+r[by+1]+r[by+2]+r[by+3]*2+ - r[by+4]+r[by+5]+r[by+6]+4>>3); - cdst+=_dst_ystride; - } - *cdst=(unsigned char)(r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4>>3); - cdst+=_dst_ystride; - *cdst=(unsigned char)(r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4>>3); - } - else{ - for(by=1;by<=8;by++){ - *cdst=(unsigned char)r[by]; - cdst+=_dst_ystride; - } - } - rdst++; - rsrc++; - } -} - -/*Filter a vertical block edge.*/ -static void oc_filter_vedge(unsigned char *_dst,int _dst_ystride, - int _qstep,int _flimit,int *_variances){ - unsigned char *rdst; - const unsigned char *rsrc; - unsigned char *cdst; - int r[10]; - int sum0; - int sum1; - int bx; - int by; - cdst=_dst; - for(by=0;by<8;by++){ - rsrc=cdst-1; - rdst=cdst; - for(bx=0;bx<10;bx++)r[bx]=*rsrc++; - sum0=sum1=0; - for(bx=0;bx<4;bx++){ - sum0+=abs(r[bx+1]-r[bx]); - sum1+=abs(r[bx+5]-r[bx+6]); - } - _variances[0]+=OC_MINI(255,sum0); - _variances[1]+=OC_MINI(255,sum1); - if(sum0<_flimit&&sum1<_flimit&&r[5]-r[4]<_qstep&&r[4]-r[5]<_qstep){ - *rdst++=(unsigned char)(r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4>>3); - *rdst++=(unsigned char)(r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4>>3); - for(bx=0;bx<4;bx++){ - *rdst++=(unsigned char)(r[bx]+r[bx+1]+r[bx+2]+r[bx+3]*2+ - r[bx+4]+r[bx+5]+r[bx+6]+4>>3); - } - *rdst++=(unsigned char)(r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4>>3); - *rdst=(unsigned char)(r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4>>3); - } - cdst+=_dst_ystride; - } -} - -static void oc_dec_deblock_frag_rows(oc_dec_ctx *_dec, - th_img_plane *_dst,th_img_plane *_src,int _pli,int _fragy0, - int _fragy_end){ - 
oc_fragment_plane *fplane; - int *variance; - unsigned char *dc_qi; - unsigned char *dst; - const unsigned char *src; - ptrdiff_t froffset; - int dst_ystride; - int src_ystride; - int nhfrags; - int width; - int notstart; - int notdone; - int flimit; - int qstep; - int y_end; - int y; - int x; - _dst+=_pli; - _src+=_pli; - fplane=_dec->state.fplanes+_pli; - nhfrags=fplane->nhfrags; - froffset=fplane->froffset+_fragy0*(ptrdiff_t)nhfrags; - variance=_dec->variances+froffset; - dc_qi=_dec->dc_qis+froffset; - notstart=_fragy0>0; - notdone=_fragy_end<fplane->nvfrags; - /*We want to clear an extra row of variances, except at the end.*/ - memset(variance+(nhfrags&-notstart),0, - (_fragy_end+notdone-_fragy0-notstart)*(nhfrags*sizeof(variance[0]))); - /*Except for the first time, we want to point to the middle of the row.*/ - y=(_fragy0<<3)+(notstart<<2); - dst_ystride=_dst->stride; - src_ystride=_src->stride; - dst=_dst->data+y*(ptrdiff_t)dst_ystride; - src=_src->data+y*(ptrdiff_t)src_ystride; - width=_dst->width; - for(;y<4;y++){ - memcpy(dst,src,width*sizeof(dst[0])); - dst+=dst_ystride; - src+=src_ystride; - } - /*We also want to skip the last row in the frame for this loop.*/ - y_end=_fragy_end-!notdone<<3; - for(;y<y_end;y+=8){ - qstep=_dec->pp_dc_scale[*dc_qi]; - flimit=(qstep*3)>>2; - oc_filter_hedge(dst,dst_ystride,src-src_ystride,src_ystride, - qstep,flimit,variance,variance+nhfrags); - variance++; - dc_qi++; - for(x=8;x<width;x+=8){ - qstep=_dec->pp_dc_scale[*dc_qi]; - flimit=(qstep*3)>>2; - oc_filter_hedge(dst+x,dst_ystride,src+x-src_ystride,src_ystride, - qstep,flimit,variance,variance+nhfrags); - oc_filter_vedge(dst+x-(dst_ystride<<2)-4,dst_ystride, - qstep,flimit,variance-1); - variance++; - dc_qi++; - } - dst+=dst_ystride<<3; - src+=src_ystride<<3; - } - /*And finally, handle the last row in the frame, if it's in the range.*/ - if(!notdone){ - int height; - height=_dst->height; - for(;y<height;y++){ - memcpy(dst,src,width*sizeof(dst[0])); - dst+=dst_ystride; 
- src+=src_ystride; - } - /*Filter the last row of vertical block edges.*/ - dc_qi++; - for(x=8;x<width;x+=8){ - qstep=_dec->pp_dc_scale[*dc_qi++]; - flimit=(qstep*3)>>2; - oc_filter_vedge(dst+x-(dst_ystride<<3)-4,dst_ystride, - qstep,flimit,variance++); - } - } -} - -static void oc_dering_block(unsigned char *_idata,int _ystride,int _b, - int _dc_scale,int _sharp_mod,int _strong){ - static const unsigned char OC_MOD_MAX[2]={24,32}; - static const unsigned char OC_MOD_SHIFT[2]={1,0}; - const unsigned char *psrc; - const unsigned char *src; - const unsigned char *nsrc; - unsigned char *dst; - int vmod[72]; - int hmod[72]; - int mod_hi; - int by; - int bx; - mod_hi=OC_MINI(3*_dc_scale,OC_MOD_MAX[_strong]); - dst=_idata; - src=dst; - psrc=src-(_ystride&-!(_b&4)); - for(by=0;by<9;by++){ - for(bx=0;bx<8;bx++){ - int mod; - mod=32+_dc_scale-(abs(src[bx]-psrc[bx])<<OC_MOD_SHIFT[_strong]); - vmod[(by<<3)+bx]=mod<-64?_sharp_mod:OC_CLAMPI(0,mod,mod_hi); - } - psrc=src; - src+=_ystride&-(!(_b&8)|by<7); - } - nsrc=dst; - psrc=dst-!(_b&1); - for(bx=0;bx<9;bx++){ - src=nsrc; - for(by=0;by<8;by++){ - int mod; - mod=32+_dc_scale-(abs(*src-*psrc)<<OC_MOD_SHIFT[_strong]); - hmod[(bx<<3)+by]=mod<-64?_sharp_mod:OC_CLAMPI(0,mod,mod_hi); - psrc+=_ystride; - src+=_ystride; - } - psrc=nsrc; - nsrc+=!(_b&2)|bx<7; - } - src=dst; - psrc=src-(_ystride&-!(_b&4)); - nsrc=src+_ystride; - for(by=0;by<8;by++){ - int a; - int b; - int w; - a=128; - b=64; - w=hmod[by]; - a-=w; - b+=w**(src-!(_b&1)); - w=vmod[by<<3]; - a-=w; - b+=w*psrc[0]; - w=vmod[by+1<<3]; - a-=w; - b+=w*nsrc[0]; - w=hmod[(1<<3)+by]; - a-=w; - b+=w*src[1]; - dst[0]=OC_CLAMP255(a*src[0]+b>>7); - for(bx=1;bx<7;bx++){ - a=128; - b=64; - w=hmod[(bx<<3)+by]; - a-=w; - b+=w*src[bx-1]; - w=vmod[(by<<3)+bx]; - a-=w; - b+=w*psrc[bx]; - w=vmod[(by+1<<3)+bx]; - a-=w; - b+=w*nsrc[bx]; - w=hmod[(bx+1<<3)+by]; - a-=w; - b+=w*src[bx+1]; - dst[bx]=OC_CLAMP255(a*src[bx]+b>>7); - } - a=128; - b=64; - w=hmod[(7<<3)+by]; - a-=w; - b+=w*src[6]; - 
w=vmod[(by<<3)+7]; - a-=w; - b+=w*psrc[7]; - w=vmod[(by+1<<3)+7]; - a-=w; - b+=w*nsrc[7]; - w=hmod[(8<<3)+by]; - a-=w; - b+=w*src[7+!(_b&2)]; - dst[7]=OC_CLAMP255(a*src[7]+b>>7); - dst+=_ystride; - psrc=src; - src=nsrc; - nsrc+=_ystride&-(!(_b&8)|by<6); - } -} - -#define OC_DERING_THRESH1 (384) -#define OC_DERING_THRESH2 (4*OC_DERING_THRESH1) -#define OC_DERING_THRESH3 (5*OC_DERING_THRESH1) -#define OC_DERING_THRESH4 (10*OC_DERING_THRESH1) - -static void oc_dec_dering_frag_rows(oc_dec_ctx *_dec,th_img_plane *_img, - int _pli,int _fragy0,int _fragy_end){ - th_img_plane *iplane; - oc_fragment_plane *fplane; - oc_fragment *frag; - int *variance; - unsigned char *idata; - ptrdiff_t froffset; - int ystride; - int nhfrags; - int sthresh; - int strong; - int y_end; - int width; - int height; - int y; - int x; - iplane=_img+_pli; - fplane=_dec->state.fplanes+_pli; - nhfrags=fplane->nhfrags; - froffset=fplane->froffset+_fragy0*(ptrdiff_t)nhfrags; - variance=_dec->variances+froffset; - frag=_dec->state.frags+froffset; - strong=_dec->pp_level>=(_pli?OC_PP_LEVEL_SDERINGC:OC_PP_LEVEL_SDERINGY); - sthresh=_pli?OC_DERING_THRESH4:OC_DERING_THRESH3; - y=_fragy0<<3; - ystride=iplane->stride; - idata=iplane->data+y*(ptrdiff_t)ystride; - y_end=_fragy_end<<3; - width=iplane->width; - height=iplane->height; - for(;y<y_end;y+=8){ - for(x=0;x<width;x+=8){ - int b; - int qi; - int var; - qi=_dec->state.qis[frag->qii]; - var=*variance; - b=(x<=0)|(x+8>=width)<<1|(y<=0)<<2|(y+8>=height)<<3; - if(strong&&var>sthresh){ - oc_dering_block(idata+x,ystride,b, - _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1); - if(_pli||!(b&1)&&*(variance-1)>OC_DERING_THRESH4|| - !(b&2)&&variance[1]>OC_DERING_THRESH4|| - !(b&4)&&*(variance-nhfrags)>OC_DERING_THRESH4|| - !(b&8)&&variance[nhfrags]>OC_DERING_THRESH4){ - oc_dering_block(idata+x,ystride,b, - _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1); - oc_dering_block(idata+x,ystride,b, - _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1); - } - } - else 
if(var>OC_DERING_THRESH2){ - oc_dering_block(idata+x,ystride,b, - _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1); - } - else if(var>OC_DERING_THRESH1){ - oc_dering_block(idata+x,ystride,b, - _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],0); - } - frag++; - variance++; - } - idata+=ystride<<3; - } -} - - - -th_dec_ctx *th_decode_alloc(const th_info *_info,const th_setup_info *_setup){ - oc_dec_ctx *dec; - if(_info==NULL||_setup==NULL)return NULL; - dec=oc_aligned_malloc(sizeof(*dec),16); - if(dec==NULL||oc_dec_init(dec,_info,_setup)<0){ - oc_aligned_free(dec); - return NULL; - } - dec->state.curframe_num=0; - return dec; -} - -void th_decode_free(th_dec_ctx *_dec){ - if(_dec!=NULL){ - oc_dec_clear(_dec); - oc_aligned_free(_dec); - } -} - -int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf, - size_t _buf_sz){ - switch(_req){ - case TH_DECCTL_GET_PPLEVEL_MAX:{ - if(_dec==NULL||_buf==NULL)return TH_EFAULT; - if(_buf_sz!=sizeof(int))return TH_EINVAL; - (*(int *)_buf)=OC_PP_LEVEL_MAX; - return 0; - }break; - case TH_DECCTL_SET_PPLEVEL:{ - int pp_level; - if(_dec==NULL||_buf==NULL)return TH_EFAULT; - if(_buf_sz!=sizeof(int))return TH_EINVAL; - pp_level=*(int *)_buf; - if(pp_level<0||pp_level>OC_PP_LEVEL_MAX)return TH_EINVAL; - _dec->pp_level=pp_level; - return 0; - }break; - case TH_DECCTL_SET_GRANPOS:{ - ogg_int64_t granpos; - if(_dec==NULL||_buf==NULL)return TH_EFAULT; - if(_buf_sz!=sizeof(ogg_int64_t))return TH_EINVAL; - granpos=*(ogg_int64_t *)_buf; - if(granpos<0)return TH_EINVAL; - _dec->state.granpos=granpos; - _dec->state.keyframe_num=(granpos>>_dec->state.info.keyframe_granule_shift) - -_dec->state.granpos_bias; - _dec->state.curframe_num=_dec->state.keyframe_num - +(granpos&(1<<_dec->state.info.keyframe_granule_shift)-1); - return 0; - }break; - case TH_DECCTL_SET_STRIPE_CB:{ - th_stripe_callback *cb; - if(_dec==NULL||_buf==NULL)return TH_EFAULT; - if(_buf_sz!=sizeof(th_stripe_callback))return TH_EINVAL; - cb=(th_stripe_callback *)_buf; - 
_dec->stripe_cb.ctx=cb->ctx; - _dec->stripe_cb.stripe_decoded=cb->stripe_decoded; - return 0; - }break; -#ifdef HAVE_CAIRO - case TH_DECCTL_SET_TELEMETRY_MBMODE:{ - if(_dec==NULL||_buf==NULL)return TH_EFAULT; - if(_buf_sz!=sizeof(int))return TH_EINVAL; - _dec->telemetry=1; - _dec->telemetry_mbmode=*(int *)_buf; - return 0; - }break; - case TH_DECCTL_SET_TELEMETRY_MV:{ - if(_dec==NULL||_buf==NULL)return TH_EFAULT; - if(_buf_sz!=sizeof(int))return TH_EINVAL; - _dec->telemetry=1; - _dec->telemetry_mv=*(int *)_buf; - return 0; - }break; - case TH_DECCTL_SET_TELEMETRY_QI:{ - if(_dec==NULL||_buf==NULL)return TH_EFAULT; - if(_buf_sz!=sizeof(int))return TH_EINVAL; - _dec->telemetry=1; - _dec->telemetry_qi=*(int *)_buf; - return 0; - }break; - case TH_DECCTL_SET_TELEMETRY_BITS:{ - if(_dec==NULL||_buf==NULL)return TH_EFAULT; - if(_buf_sz!=sizeof(int))return TH_EINVAL; - _dec->telemetry=1; - _dec->telemetry_bits=*(int *)_buf; - return 0; - }break; -#endif - default:return TH_EIMPL; - } -} - -/*We're decoding an INTER frame, but have no initialized reference - buffers (i.e., decoding did not start on a key frame). 
- We initialize them to a solid gray here.*/ -static void oc_dec_init_dummy_frame(th_dec_ctx *_dec){ - th_info *info; - size_t yplane_sz; - size_t cplane_sz; - ptrdiff_t yoffset; - int yhstride; - int yheight; - int chstride; - int cheight; - _dec->state.ref_frame_idx[OC_FRAME_GOLD]=0; - _dec->state.ref_frame_idx[OC_FRAME_PREV]=0; - _dec->state.ref_frame_idx[OC_FRAME_SELF]=0; - _dec->state.ref_frame_data[OC_FRAME_GOLD]= - _dec->state.ref_frame_data[OC_FRAME_PREV]= - _dec->state.ref_frame_data[OC_FRAME_SELF]= - _dec->state.ref_frame_bufs[0][0].data; - memcpy(_dec->pp_frame_buf,_dec->state.ref_frame_bufs[0], - sizeof(_dec->pp_frame_buf[0])*3); - info=&_dec->state.info; - yhstride=abs(_dec->state.ref_ystride[0]); - yheight=info->frame_height+2*OC_UMV_PADDING; - chstride=abs(_dec->state.ref_ystride[1]); - cheight=yheight>>!(info->pixel_fmt&2); - yplane_sz=yhstride*(size_t)yheight+16; - cplane_sz=chstride*(size_t)cheight; - yoffset=yhstride*(ptrdiff_t)(yheight-OC_UMV_PADDING-1)+OC_UMV_PADDING; - memset(_dec->state.ref_frame_data[0]-yoffset,0x80,yplane_sz+2*cplane_sz); -} - -int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op, - ogg_int64_t *_granpos){ - int ret; - if(_dec==NULL||_op==NULL)return TH_EFAULT; - /*A completely empty packet indicates a dropped frame and is treated exactly - like an inter frame with no coded blocks.*/ - if(_op->bytes==0){ - _dec->state.frame_type=OC_INTER_FRAME; - _dec->state.ntotal_coded_fragis=0; - } - else{ - oc_pack_readinit(&_dec->opb,_op->packet,_op->bytes); - ret=oc_dec_frame_header_unpack(_dec); - if(ret<0)return ret; - if(_dec->state.frame_type==OC_INTRA_FRAME)oc_dec_mark_all_intra(_dec); - else oc_dec_coded_flags_unpack(_dec); - } - /*If there have been no reference frames, and we need one, initialize one.*/ - if(_dec->state.frame_type!=OC_INTRA_FRAME&& - (_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0|| - _dec->state.ref_frame_idx[OC_FRAME_PREV]<0)){ - oc_dec_init_dummy_frame(_dec); - } - /*If this was an inter frame with no 
coded blocks...*/ - if(_dec->state.ntotal_coded_fragis<=0){ - /*Just update the granule position and return.*/ - _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<< - _dec->state.info.keyframe_granule_shift) - +(_dec->state.curframe_num-_dec->state.keyframe_num); - _dec->state.curframe_num++; - if(_granpos!=NULL)*_granpos=_dec->state.granpos; - return TH_DUPFRAME; - } - else{ - th_ycbcr_buffer stripe_buf; - int stripe_fragy; - int refi; - int pli; - int notstart; - int notdone; - /*Select a free buffer to use for the reconstructed version of this frame.*/ - for(refi=0;refi==_dec->state.ref_frame_idx[OC_FRAME_GOLD]|| - refi==_dec->state.ref_frame_idx[OC_FRAME_PREV];refi++); - _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi; - _dec->state.ref_frame_data[OC_FRAME_SELF]= - _dec->state.ref_frame_bufs[refi][0].data; -#if defined(HAVE_CAIRO) - _dec->telemetry_frame_bytes=_op->bytes; -#endif - if(_dec->state.frame_type==OC_INTRA_FRAME){ - _dec->state.keyframe_num=_dec->state.curframe_num; -#if defined(HAVE_CAIRO) - _dec->telemetry_coding_bytes= - _dec->telemetry_mode_bytes= - _dec->telemetry_mv_bytes=oc_pack_bytes_left(&_dec->opb); -#endif - } - else{ -#if defined(HAVE_CAIRO) - _dec->telemetry_coding_bytes=oc_pack_bytes_left(&_dec->opb); -#endif - oc_dec_mb_modes_unpack(_dec); -#if defined(HAVE_CAIRO) - _dec->telemetry_mode_bytes=oc_pack_bytes_left(&_dec->opb); -#endif - oc_dec_mv_unpack_and_frag_modes_fill(_dec); -#if defined(HAVE_CAIRO) - _dec->telemetry_mv_bytes=oc_pack_bytes_left(&_dec->opb); -#endif - } - oc_dec_block_qis_unpack(_dec); -#if defined(HAVE_CAIRO) - _dec->telemetry_qi_bytes=oc_pack_bytes_left(&_dec->opb); -#endif - oc_dec_residual_tokens_unpack(_dec); - /*Update granule position. 
- This must be done before the striped decode callbacks so that the - application knows what to do with the frame data.*/ - _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<< - _dec->state.info.keyframe_granule_shift) - +(_dec->state.curframe_num-_dec->state.keyframe_num); - _dec->state.curframe_num++; - if(_granpos!=NULL)*_granpos=_dec->state.granpos; - /*All of the rest of the operations -- DC prediction reversal, - reconstructing coded fragments, copying uncoded fragments, loop - filtering, extending borders, and out-of-loop post-processing -- should - be pipelined. - I.e., DC prediction reversal, reconstruction, and uncoded fragment - copying are done for one or two super block rows, then loop filtering is - run as far as it can, then bordering copying, then post-processing. - For 4:2:0 video a Minimum Codable Unit or MCU contains two luma super - block rows, and one chroma. - Otherwise, an MCU consists of one super block row from each plane. - Inside each MCU, we perform all of the steps on one color plane before - moving on to the next. - After reconstruction, the additional filtering stages introduce a delay - since they need some pixels from the next fragment row. - Thus the actual number of decoded rows available is slightly smaller for - the first MCU, and slightly larger for the last. - - This entire process allows us to operate on the data while it is still in - cache, resulting in big performance improvements. - An application callback allows further application processing (blitting - to video memory, color conversion, etc.) 
to also use the data while it's - in cache.*/ - oc_dec_pipeline_init(_dec,&_dec->pipe); - oc_ycbcr_buffer_flip(stripe_buf,_dec->pp_frame_buf); - notstart=0; - notdone=1; - for(stripe_fragy=0;notdone;stripe_fragy+=_dec->pipe.mcu_nvfrags){ - int avail_fragy0; - int avail_fragy_end; - avail_fragy0=avail_fragy_end=_dec->state.fplanes[0].nvfrags; - notdone=stripe_fragy+_dec->pipe.mcu_nvfrags<avail_fragy_end; - for(pli=0;pli<3;pli++){ - oc_fragment_plane *fplane; - int frag_shift; - int pp_offset; - int sdelay; - int edelay; - fplane=_dec->state.fplanes+pli; - /*Compute the first and last fragment row of the current MCU for this - plane.*/ - frag_shift=pli!=0&&!(_dec->state.info.pixel_fmt&2); - _dec->pipe.fragy0[pli]=stripe_fragy>>frag_shift; - _dec->pipe.fragy_end[pli]=OC_MINI(fplane->nvfrags, - _dec->pipe.fragy0[pli]+(_dec->pipe.mcu_nvfrags>>frag_shift)); - oc_dec_dc_unpredict_mcu_plane(_dec,&_dec->pipe,pli); - oc_dec_frags_recon_mcu_plane(_dec,&_dec->pipe,pli); - sdelay=edelay=0; - if(_dec->pipe.loop_filter){ - sdelay+=notstart; - edelay+=notdone; - oc_state_loop_filter_frag_rows(&_dec->state, - _dec->pipe.bounding_values,OC_FRAME_SELF,pli, - _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay); - } - /*To fill the borders, we have an additional two pixel delay, since a - fragment in the next row could filter its top edge, using two pixels - from a fragment in this row. 
- But there's no reason to delay a full fragment between the two.*/ - oc_state_borders_fill_rows(&_dec->state,refi,pli, - (_dec->pipe.fragy0[pli]-sdelay<<3)-(sdelay<<1), - (_dec->pipe.fragy_end[pli]-edelay<<3)-(edelay<<1)); - /*Out-of-loop post-processing.*/ - pp_offset=3*(pli!=0); - if(_dec->pipe.pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){ - /*Perform de-blocking in one plane.*/ - sdelay+=notstart; - edelay+=notdone; - oc_dec_deblock_frag_rows(_dec,_dec->pp_frame_buf, - _dec->state.ref_frame_bufs[refi],pli, - _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay); - if(_dec->pipe.pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){ - /*Perform de-ringing in one plane.*/ - sdelay+=notstart; - edelay+=notdone; - oc_dec_dering_frag_rows(_dec,_dec->pp_frame_buf,pli, - _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay); - } - } - /*If no post-processing is done, we still need to delay a row for the - loop filter, thanks to the strange filtering order VP3 chose.*/ - else if(_dec->pipe.loop_filter){ - sdelay+=notstart; - edelay+=notdone; - } - /*Compute the intersection of the available rows in all planes. - If chroma is sub-sampled, the effect of each of its delays is - doubled, but luma might have more post-processing filters enabled - than chroma, so we don't know up front which one is the limiting - factor.*/ - avail_fragy0=OC_MINI(avail_fragy0, - _dec->pipe.fragy0[pli]-sdelay<<frag_shift); - avail_fragy_end=OC_MINI(avail_fragy_end, - _dec->pipe.fragy_end[pli]-edelay<<frag_shift); - } - if(_dec->stripe_cb.stripe_decoded!=NULL){ - /*The callback might want to use the FPU, so let's make sure they can. 
- We violate all kinds of ABI restrictions by not doing this until - now, but none of them actually matter since we don't use floating - point ourselves.*/ - oc_restore_fpu(&_dec->state); - /*Make the callback, ensuring we flip the sense of the "start" and - "end" of the available region upside down.*/ - (*_dec->stripe_cb.stripe_decoded)(_dec->stripe_cb.ctx,stripe_buf, - _dec->state.fplanes[0].nvfrags-avail_fragy_end, - _dec->state.fplanes[0].nvfrags-avail_fragy0); - } - notstart=1; - } - /*Finish filling in the reference frame borders.*/ - for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_dec->state,refi,pli); - /*Update the reference frame indices.*/ - if(_dec->state.frame_type==OC_INTRA_FRAME){ - /*The new frame becomes both the previous and gold reference frames.*/ - _dec->state.ref_frame_idx[OC_FRAME_GOLD]= - _dec->state.ref_frame_idx[OC_FRAME_PREV]= - _dec->state.ref_frame_idx[OC_FRAME_SELF]; - _dec->state.ref_frame_data[OC_FRAME_GOLD]= - _dec->state.ref_frame_data[OC_FRAME_PREV]= - _dec->state.ref_frame_data[OC_FRAME_SELF]; - } - else{ - /*Otherwise, just replace the previous reference frame.*/ - _dec->state.ref_frame_idx[OC_FRAME_PREV]= - _dec->state.ref_frame_idx[OC_FRAME_SELF]; - _dec->state.ref_frame_data[OC_FRAME_PREV]= - _dec->state.ref_frame_data[OC_FRAME_SELF]; - } - /*Restore the FPU before dump_frame, since that _does_ use the FPU (for PNG - gamma values, if nothing else).*/ - oc_restore_fpu(&_dec->state); -#if defined(OC_DUMP_IMAGES) - /*We only dump images if there were some coded blocks.*/ - oc_state_dump_frame(&_dec->state,OC_FRAME_SELF,"dec"); -#endif - return 0; - } -} - -int th_decode_ycbcr_out(th_dec_ctx *_dec,th_ycbcr_buffer _ycbcr){ - if(_dec==NULL||_ycbcr==NULL)return TH_EFAULT; - oc_ycbcr_buffer_flip(_ycbcr,_dec->pp_frame_buf); -#if defined(HAVE_CAIRO) - /*If telemetry ioctls are active, we need to draw to the output buffer. 
- Stuff the plane into cairo.*/ - if(_dec->telemetry){ - cairo_surface_t *cs; - unsigned char *data; - unsigned char *y_row; - unsigned char *u_row; - unsigned char *v_row; - unsigned char *rgb_row; - int cstride; - int w; - int h; - int x; - int y; - int hdec; - int vdec; - w=_ycbcr[0].width; - h=_ycbcr[0].height; - hdec=!(_dec->state.info.pixel_fmt&1); - vdec=!(_dec->state.info.pixel_fmt&2); - /*Lazy data buffer init. - We could try to re-use the post-processing buffer, which would save - memory, but complicate the allocation logic there. - I don't think anyone cares about memory usage when using telemetry; it is - not meant for embedded devices.*/ - if(_dec->telemetry_frame_data==NULL){ - _dec->telemetry_frame_data=_ogg_malloc( - (w*h+2*(w>>hdec)*(h>>vdec))*sizeof(*_dec->telemetry_frame_data)); - if(_dec->telemetry_frame_data==NULL)return 0; - } - cs=cairo_image_surface_create(CAIRO_FORMAT_RGB24,w,h); - /*Sadly, no YUV support in Cairo (yet); convert into the RGB buffer.*/ - data=cairo_image_surface_get_data(cs); - if(data==NULL){ - cairo_surface_destroy(cs); - return 0; - } - cstride=cairo_image_surface_get_stride(cs); - y_row=_ycbcr[0].data; - u_row=_ycbcr[1].data; - v_row=_ycbcr[2].data; - rgb_row=data; - for(y=0;y<h;y++){ - for(x=0;x<w;x++){ - int r; - int g; - int b; - r=(1904000*y_row[x]+2609823*v_row[x>>hdec]-363703744)/1635200; - g=(3827562*y_row[x]-1287801*u_row[x>>hdec] - -2672387*v_row[x>>hdec]+447306710)/3287200; - b=(952000*y_row[x]+1649289*u_row[x>>hdec]-225932192)/817600; - rgb_row[4*x+0]=OC_CLAMP255(b); - rgb_row[4*x+1]=OC_CLAMP255(g); - rgb_row[4*x+2]=OC_CLAMP255(r); - } - y_row+=_ycbcr[0].stride; - u_row+=_ycbcr[1].stride&-((y&1)|!vdec); - v_row+=_ycbcr[2].stride&-((y&1)|!vdec); - rgb_row+=cstride; - } - /*Draw coded identifier for each macroblock (stored in Hilbert order).*/ - { - cairo_t *c; - const oc_fragment *frags; - oc_mv *frag_mvs; - const signed char *mb_modes; - oc_mb_map *mb_maps; - size_t nmbs; - size_t mbi; - int row2; - int col2; 
- int qim[3]={0,0,0}; - if(_dec->state.nqis==2){ - int bqi; - bqi=_dec->state.qis[0]; - if(_dec->state.qis[1]>bqi)qim[1]=1; - if(_dec->state.qis[1]<bqi)qim[1]=-1; - } - if(_dec->state.nqis==3){ - int bqi; - int cqi; - int dqi; - bqi=_dec->state.qis[0]; - cqi=_dec->state.qis[1]; - dqi=_dec->state.qis[2]; - if(cqi>bqi&&dqi>bqi){ - if(dqi>cqi){ - qim[1]=1; - qim[2]=2; - } - else{ - qim[1]=2; - qim[2]=1; - } - } - else if(cqi<bqi&&dqi<bqi){ - if(dqi<cqi){ - qim[1]=-1; - qim[2]=-2; - } - else{ - qim[1]=-2; - qim[2]=-1; - } - } - else{ - if(cqi<bqi)qim[1]=-1; - else qim[1]=1; - if(dqi<bqi)qim[2]=-1; - else qim[2]=1; - } - } - c=cairo_create(cs); - frags=_dec->state.frags; - frag_mvs=_dec->state.frag_mvs; - mb_modes=_dec->state.mb_modes; - mb_maps=_dec->state.mb_maps; - nmbs=_dec->state.nmbs; - row2=0; - col2=0; - for(mbi=0;mbi<nmbs;mbi++){ - float x; - float y; - int bi; - y=h-(row2+((col2+1>>1)&1))*16-16; - x=(col2>>1)*16; - cairo_set_line_width(c,1.); - /*Keyframe (all intra) red box.*/ - if(_dec->state.frame_type==OC_INTRA_FRAME){ - if(_dec->telemetry_mbmode&0x02){ - cairo_set_source_rgba(c,1.,0,0,.5); - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,1.,0,0,.25); - cairo_fill(c); - } - } - else{ - ptrdiff_t fragi; - int frag_mvx; - int frag_mvy; - for(bi=0;bi<4;bi++){ - fragi=mb_maps[mbi][0][bi]; - if(fragi>=0&&frags[fragi].coded){ - frag_mvx=OC_MV_X(frag_mvs[fragi]); - frag_mvy=OC_MV_Y(frag_mvs[fragi]); - break; - } - } - if(bi<4){ - switch(mb_modes[mbi]){ - case OC_MODE_INTRA:{ - if(_dec->telemetry_mbmode&0x02){ - cairo_set_source_rgba(c,1.,0,0,.5); - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,1.,0,0,.25); - cairo_fill(c); - } - }break; - case OC_MODE_INTER_NOMV:{ - if(_dec->telemetry_mbmode&0x01){ - cairo_set_source_rgba(c,0,0,1.,.5); - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,0,0,1.,.25); - cairo_fill(c); - } - }break; - 
case OC_MODE_INTER_MV:{ - if(_dec->telemetry_mbmode&0x04){ - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_set_source_rgba(c,0,1.,0,.5); - cairo_stroke(c); - } - if(_dec->telemetry_mv&0x04){ - cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33); - cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+8,y+8); - cairo_stroke(c); - } - }break; - case OC_MODE_INTER_MV_LAST:{ - if(_dec->telemetry_mbmode&0x08){ - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_set_source_rgba(c,0,1.,0,.5); - cairo_move_to(c,x+13.5,y+2.5); - cairo_line_to(c,x+2.5,y+8); - cairo_line_to(c,x+13.5,y+13.5); - cairo_stroke(c); - } - if(_dec->telemetry_mv&0x08){ - cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33); - cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+8,y+8); - cairo_stroke(c); - } - }break; - case OC_MODE_INTER_MV_LAST2:{ - if(_dec->telemetry_mbmode&0x10){ - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_set_source_rgba(c,0,1.,0,.5); - cairo_move_to(c,x+8,y+2.5); - cairo_line_to(c,x+2.5,y+8); - cairo_line_to(c,x+8,y+13.5); - cairo_move_to(c,x+13.5,y+2.5); - cairo_line_to(c,x+8,y+8); - cairo_line_to(c,x+13.5,y+13.5); - cairo_stroke(c); - } - if(_dec->telemetry_mv&0x10){ - cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33); - cairo_stroke_preserve(c); - 
cairo_set_line_width(c,1.); - cairo_line_to(c,x+8,y+8); - cairo_stroke(c); - } - }break; - case OC_MODE_GOLDEN_NOMV:{ - if(_dec->telemetry_mbmode&0x20){ - cairo_set_source_rgba(c,1.,1.,0,.5); - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,1.,1.,0,.25); - cairo_fill(c); - } - }break; - case OC_MODE_GOLDEN_MV:{ - if(_dec->telemetry_mbmode&0x40){ - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_set_source_rgba(c,1.,1.,0,.5); - cairo_stroke(c); - } - if(_dec->telemetry_mv&0x40){ - cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33); - cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+8,y+8); - cairo_stroke(c); - } - }break; - case OC_MODE_INTER_MV_FOUR:{ - if(_dec->telemetry_mbmode&0x80){ - cairo_rectangle(c,x+2.5,y+2.5,4,4); - cairo_rectangle(c,x+9.5,y+2.5,4,4); - cairo_rectangle(c,x+2.5,y+9.5,4,4); - cairo_rectangle(c,x+9.5,y+9.5,4,4); - cairo_set_source_rgba(c,0,1.,0,.5); - cairo_stroke(c); - } - /*4mv is odd, coded in raster order.*/ - fragi=mb_maps[mbi][0][0]; - if(frags[fragi].coded&&_dec->telemetry_mv&0x80){ - frag_mvx=OC_MV_X(frag_mvs[fragi]); - frag_mvx=OC_MV_Y(frag_mvs[fragi]); - cairo_move_to(c,x+4+frag_mvx,y+12-frag_mvy); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+4+frag_mvx*.66,y+12-frag_mvy*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+4+frag_mvx*.33,y+12-frag_mvy*.33); - cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+4,y+12); - cairo_stroke(c); - } - fragi=mb_maps[mbi][0][1]; - if(frags[fragi].coded&&_dec->telemetry_mv&0x80){ - frag_mvx=OC_MV_X(frag_mvs[fragi]); - frag_mvx=OC_MV_Y(frag_mvs[fragi]); - cairo_move_to(c,x+12+frag_mvx,y+12-frag_mvy); - 
cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+12+frag_mvx*.66,y+12-frag_mvy*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+12+frag_mvx*.33,y+12-frag_mvy*.33); - cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+12,y+12); - cairo_stroke(c); - } - fragi=mb_maps[mbi][0][2]; - if(frags[fragi].coded&&_dec->telemetry_mv&0x80){ - frag_mvx=OC_MV_X(frag_mvs[fragi]); - frag_mvx=OC_MV_Y(frag_mvs[fragi]); - cairo_move_to(c,x+4+frag_mvx,y+4-frag_mvy); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+4+frag_mvx*.66,y+4-frag_mvy*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+4+frag_mvx*.33,y+4-frag_mvy*.33); - cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+4,y+4); - cairo_stroke(c); - } - fragi=mb_maps[mbi][0][3]; - if(frags[fragi].coded&&_dec->telemetry_mv&0x80){ - frag_mvx=OC_MV_X(frag_mvs[fragi]); - frag_mvx=OC_MV_Y(frag_mvs[fragi]); - cairo_move_to(c,x+12+frag_mvx,y+4-frag_mvy); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+12+frag_mvx*.66,y+4-frag_mvy*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+12+frag_mvx*.33,y+4-frag_mvy*.33); - cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+12,y+4); - cairo_stroke(c); - } - }break; - } - } - } - /*qii illustration.*/ - if(_dec->telemetry_qi&0x2){ - cairo_set_line_cap(c,CAIRO_LINE_CAP_SQUARE); - for(bi=0;bi<4;bi++){ - ptrdiff_t fragi; - int qiv; - int xp; - int yp; - xp=x+(bi&1)*8; - yp=y+8-(bi&2)*4; - fragi=mb_maps[mbi][0][bi]; - if(fragi>=0&&frags[fragi].coded){ - qiv=qim[frags[fragi].qii]; - cairo_set_line_width(c,3.); - cairo_set_source_rgba(c,0.,0.,0.,.5); - switch(qiv){ - /*Double plus:*/ - case 2:{ - if((bi&1)^((bi&2)>>1)){ - cairo_move_to(c,xp+2.5,yp+1.5); - cairo_line_to(c,xp+2.5,yp+3.5); - 
cairo_move_to(c,xp+1.5,yp+2.5); - cairo_line_to(c,xp+3.5,yp+2.5); - cairo_move_to(c,xp+5.5,yp+4.5); - cairo_line_to(c,xp+5.5,yp+6.5); - cairo_move_to(c,xp+4.5,yp+5.5); - cairo_line_to(c,xp+6.5,yp+5.5); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,0.,1.,1.,1.); - } - else{ - cairo_move_to(c,xp+5.5,yp+1.5); - cairo_line_to(c,xp+5.5,yp+3.5); - cairo_move_to(c,xp+4.5,yp+2.5); - cairo_line_to(c,xp+6.5,yp+2.5); - cairo_move_to(c,xp+2.5,yp+4.5); - cairo_line_to(c,xp+2.5,yp+6.5); - cairo_move_to(c,xp+1.5,yp+5.5); - cairo_line_to(c,xp+3.5,yp+5.5); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,0.,1.,1.,1.); - } - }break; - /*Double minus:*/ - case -2:{ - cairo_move_to(c,xp+2.5,yp+2.5); - cairo_line_to(c,xp+5.5,yp+2.5); - cairo_move_to(c,xp+2.5,yp+5.5); - cairo_line_to(c,xp+5.5,yp+5.5); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,1.,1.,1.,1.); - }break; - /*Plus:*/ - case 1:{ - if(bi&2==0)yp-=2; - if(bi&1==0)xp-=2; - cairo_move_to(c,xp+4.5,yp+2.5); - cairo_line_to(c,xp+4.5,yp+6.5); - cairo_move_to(c,xp+2.5,yp+4.5); - cairo_line_to(c,xp+6.5,yp+4.5); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,.1,1.,.3,1.); - break; - } - /*Fall through.*/ - /*Minus:*/ - case -1:{ - cairo_move_to(c,xp+2.5,yp+4.5); - cairo_line_to(c,xp+6.5,yp+4.5); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,1.,.3,.1,1.); - }break; - default:continue; - } - cairo_set_line_width(c,1.); - cairo_stroke(c); - } - } - } - col2++; - if((col2>>1)>=_dec->state.nhmbs){ - col2=0; - row2+=2; - } - } - /*Bit usage indicator[s]:*/ - if(_dec->telemetry_bits){ - int widths[6]; - int fpsn; - int fpsd; - int mult; - int fullw; - int padw; - int i; - fpsn=_dec->state.info.fps_numerator; - fpsd=_dec->state.info.fps_denominator; - mult=(_dec->telemetry_bits>=0xFF?1:_dec->telemetry_bits); - fullw=250.f*h*fpsd*mult/fpsn; - padw=w-24; - /*Header and coded block bits.*/ - if(_dec->telemetry_frame_bytes<0|| - _dec->telemetry_frame_bytes==OC_LOTS_OF_BITS){ - _dec->telemetry_frame_bytes=0; - 
} - if(_dec->telemetry_coding_bytes<0|| - _dec->telemetry_coding_bytes>_dec->telemetry_frame_bytes){ - _dec->telemetry_coding_bytes=0; - } - if(_dec->telemetry_mode_bytes<0|| - _dec->telemetry_mode_bytes>_dec->telemetry_frame_bytes){ - _dec->telemetry_mode_bytes=0; - } - if(_dec->telemetry_mv_bytes<0|| - _dec->telemetry_mv_bytes>_dec->telemetry_frame_bytes){ - _dec->telemetry_mv_bytes=0; - } - if(_dec->telemetry_qi_bytes<0|| - _dec->telemetry_qi_bytes>_dec->telemetry_frame_bytes){ - _dec->telemetry_qi_bytes=0; - } - if(_dec->telemetry_dc_bytes<0|| - _dec->telemetry_dc_bytes>_dec->telemetry_frame_bytes){ - _dec->telemetry_dc_bytes=0; - } - widths[0]=padw*(_dec->telemetry_frame_bytes-_dec->telemetry_coding_bytes)/fullw; - widths[1]=padw*(_dec->telemetry_coding_bytes-_dec->telemetry_mode_bytes)/fullw; - widths[2]=padw*(_dec->telemetry_mode_bytes-_dec->telemetry_mv_bytes)/fullw; - widths[3]=padw*(_dec->telemetry_mv_bytes-_dec->telemetry_qi_bytes)/fullw; - widths[4]=padw*(_dec->telemetry_qi_bytes-_dec->telemetry_dc_bytes)/fullw; - widths[5]=padw*(_dec->telemetry_dc_bytes)/fullw; - for(i=0;i<6;i++)if(widths[i]>w)widths[i]=w; - cairo_set_source_rgba(c,.0,.0,.0,.6); - cairo_rectangle(c,10,h-33,widths[0]+1,5); - cairo_rectangle(c,10,h-29,widths[1]+1,5); - cairo_rectangle(c,10,h-25,widths[2]+1,5); - cairo_rectangle(c,10,h-21,widths[3]+1,5); - cairo_rectangle(c,10,h-17,widths[4]+1,5); - cairo_rectangle(c,10,h-13,widths[5]+1,5); - cairo_fill(c); - cairo_set_source_rgb(c,1,0,0); - cairo_rectangle(c,10.5,h-32.5,widths[0],4); - cairo_fill(c); - cairo_set_source_rgb(c,0,1,0); - cairo_rectangle(c,10.5,h-28.5,widths[1],4); - cairo_fill(c); - cairo_set_source_rgb(c,0,0,1); - cairo_rectangle(c,10.5,h-24.5,widths[2],4); - cairo_fill(c); - cairo_set_source_rgb(c,.6,.4,.0); - cairo_rectangle(c,10.5,h-20.5,widths[3],4); - cairo_fill(c); - cairo_set_source_rgb(c,.3,.3,.3); - cairo_rectangle(c,10.5,h-16.5,widths[4],4); - cairo_fill(c); - cairo_set_source_rgb(c,.5,.5,.8); - 
cairo_rectangle(c,10.5,h-12.5,widths[5],4); - cairo_fill(c); - } - /*Master qi indicator[s]:*/ - if(_dec->telemetry_qi&0x1){ - cairo_text_extents_t extents; - char buffer[10]; - int p; - int y; - p=0; - y=h-7.5; - if(_dec->state.qis[0]>=10)buffer[p++]=48+_dec->state.qis[0]/10; - buffer[p++]=48+_dec->state.qis[0]%10; - if(_dec->state.nqis>=2){ - buffer[p++]=' '; - if(_dec->state.qis[1]>=10)buffer[p++]=48+_dec->state.qis[1]/10; - buffer[p++]=48+_dec->state.qis[1]%10; - } - if(_dec->state.nqis==3){ - buffer[p++]=' '; - if(_dec->state.qis[2]>=10)buffer[p++]=48+_dec->state.qis[2]/10; - buffer[p++]=48+_dec->state.qis[2]%10; - } - buffer[p++]='\0'; - cairo_select_font_face(c,"sans", - CAIRO_FONT_SLANT_NORMAL,CAIRO_FONT_WEIGHT_BOLD); - cairo_set_font_size(c,18); - cairo_text_extents(c,buffer,&extents); - cairo_set_source_rgb(c,1,1,1); - cairo_move_to(c,w-extents.x_advance-10,y); - cairo_show_text(c,buffer); - cairo_set_source_rgb(c,0,0,0); - cairo_move_to(c,w-extents.x_advance-10,y); - cairo_text_path(c,buffer); - cairo_set_line_width(c,.8); - cairo_set_line_join(c,CAIRO_LINE_JOIN_ROUND); - cairo_stroke(c); - } - cairo_destroy(c); - } - /*Out of the Cairo plane into the telemetry YUV buffer.*/ - _ycbcr[0].data=_dec->telemetry_frame_data; - _ycbcr[0].stride=_ycbcr[0].width; - _ycbcr[1].data=_ycbcr[0].data+h*_ycbcr[0].stride; - _ycbcr[1].stride=_ycbcr[1].width; - _ycbcr[2].data=_ycbcr[1].data+(h>>vdec)*_ycbcr[1].stride; - _ycbcr[2].stride=_ycbcr[2].width; - y_row=_ycbcr[0].data; - u_row=_ycbcr[1].data; - v_row=_ycbcr[2].data; - rgb_row=data; - /*This is one of the few places it's worth handling chroma on a - case-by-case basis.*/ - switch(_dec->state.info.pixel_fmt){ - case TH_PF_420:{ - for(y=0;y<h;y+=2){ - unsigned char *y_row2; - unsigned char *rgb_row2; - y_row2=y_row+_ycbcr[0].stride; - rgb_row2=rgb_row+cstride; - for(x=0;x<w;x+=2){ - int y; - int u; - int v; - y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1] - +24966*rgb_row[4*x+0]+4207500)/255000; - 
y_row[x]=OC_CLAMP255(y); - y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5] - +24966*rgb_row[4*x+4]+4207500)/255000; - y_row[x+1]=OC_CLAMP255(y); - y=(65481*rgb_row2[4*x+2]+128553*rgb_row2[4*x+1] - +24966*rgb_row2[4*x+0]+4207500)/255000; - y_row2[x]=OC_CLAMP255(y); - y=(65481*rgb_row2[4*x+6]+128553*rgb_row2[4*x+5] - +24966*rgb_row2[4*x+4]+4207500)/255000; - y_row2[x+1]=OC_CLAMP255(y); - u=(-8372*(rgb_row[4*x+2]+rgb_row[4*x+6] - +rgb_row2[4*x+2]+rgb_row2[4*x+6]) - -16436*(rgb_row[4*x+1]+rgb_row[4*x+5] - +rgb_row2[4*x+1]+rgb_row2[4*x+5]) - +24808*(rgb_row[4*x+0]+rgb_row[4*x+4] - +rgb_row2[4*x+0]+rgb_row2[4*x+4])+29032005)/225930; - v=(39256*(rgb_row[4*x+2]+rgb_row[4*x+6] - +rgb_row2[4*x+2]+rgb_row2[4*x+6]) - -32872*(rgb_row[4*x+1]+rgb_row[4*x+5] - +rgb_row2[4*x+1]+rgb_row2[4*x+5]) - -6384*(rgb_row[4*x+0]+rgb_row[4*x+4] - +rgb_row2[4*x+0]+rgb_row2[4*x+4])+45940035)/357510; - u_row[x>>1]=OC_CLAMP255(u); - v_row[x>>1]=OC_CLAMP255(v); - } - y_row+=_ycbcr[0].stride<<1; - u_row+=_ycbcr[1].stride; - v_row+=_ycbcr[2].stride; - rgb_row+=cstride<<1; - } - }break; - case TH_PF_422:{ - for(y=0;y<h;y++){ - for(x=0;x<w;x+=2){ - int y; - int u; - int v; - y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1] - +24966*rgb_row[4*x+0]+4207500)/255000; - y_row[x]=OC_CLAMP255(y); - y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5] - +24966*rgb_row[4*x+4]+4207500)/255000; - y_row[x+1]=OC_CLAMP255(y); - u=(-16744*(rgb_row[4*x+2]+rgb_row[4*x+6]) - -32872*(rgb_row[4*x+1]+rgb_row[4*x+5]) - +49616*(rgb_row[4*x+0]+rgb_row[4*x+4])+29032005)/225930; - v=(78512*(rgb_row[4*x+2]+rgb_row[4*x+6]) - -65744*(rgb_row[4*x+1]+rgb_row[4*x+5]) - -12768*(rgb_row[4*x+0]+rgb_row[4*x+4])+45940035)/357510; - u_row[x>>1]=OC_CLAMP255(u); - v_row[x>>1]=OC_CLAMP255(v); - } - y_row+=_ycbcr[0].stride; - u_row+=_ycbcr[1].stride; - v_row+=_ycbcr[2].stride; - rgb_row+=cstride; - } - }break; - /*case TH_PF_444:*/ - default:{ - for(y=0;y<h;y++){ - for(x=0;x<w;x++){ - int y; - int u; - int v; - 
y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1] - +24966*rgb_row[4*x+0]+4207500)/255000; - u=(-33488*rgb_row[4*x+2]-65744*rgb_row[4*x+1] - +99232*rgb_row[4*x+0]+29032005)/225930; - v=(157024*rgb_row[4*x+2]-131488*rgb_row[4*x+1] - -25536*rgb_row[4*x+0]+45940035)/357510; - y_row[x]=OC_CLAMP255(y); - u_row[x]=OC_CLAMP255(u); - v_row[x]=OC_CLAMP255(v); - } - y_row+=_ycbcr[0].stride; - u_row+=_ycbcr[1].stride; - v_row+=_ycbcr[2].stride; - rgb_row+=cstride; - } - }break; - } - /*Finished. - Destroy the surface.*/ - cairo_surface_destroy(cs); - } -#endif - return 0; -} diff --git a/media/libtheora/lib/dequant.c b/media/libtheora/lib/dequant.c deleted file mode 100644 index e554872d4..000000000 --- a/media/libtheora/lib/dequant.c +++ /dev/null @@ -1,182 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: dequant.c 16503 2009-08-22 18:14:02Z giles $ - - ********************************************************************/ - -#include <stdlib.h> -#include <string.h> -#include <ogg/ogg.h> -#include "dequant.h" -#include "decint.h" - -int oc_quant_params_unpack(oc_pack_buf *_opb,th_quant_info *_qinfo){ - th_quant_base *base_mats; - long val; - int nbase_mats; - int sizes[64]; - int indices[64]; - int nbits; - int bmi; - int ci; - int qti; - int pli; - int qri; - int qi; - int i; - val=oc_pack_read(_opb,3); - nbits=(int)val; - for(qi=0;qi<64;qi++){ - val=oc_pack_read(_opb,nbits); - _qinfo->loop_filter_limits[qi]=(unsigned char)val; - } - val=oc_pack_read(_opb,4); - nbits=(int)val+1; - for(qi=0;qi<64;qi++){ - val=oc_pack_read(_opb,nbits); - _qinfo->ac_scale[qi]=(ogg_uint16_t)val; - } - val=oc_pack_read(_opb,4); - nbits=(int)val+1; - for(qi=0;qi<64;qi++){ - val=oc_pack_read(_opb,nbits); - _qinfo->dc_scale[qi]=(ogg_uint16_t)val; - } - val=oc_pack_read(_opb,9); - nbase_mats=(int)val+1; - base_mats=_ogg_malloc(nbase_mats*sizeof(base_mats[0])); - if(base_mats==NULL)return TH_EFAULT; - for(bmi=0;bmi<nbase_mats;bmi++){ - for(ci=0;ci<64;ci++){ - val=oc_pack_read(_opb,8); - base_mats[bmi][ci]=(unsigned char)val; - } - } - nbits=oc_ilog(nbase_mats-1); - for(i=0;i<6;i++){ - th_quant_ranges *qranges; - th_quant_base *qrbms; - int *qrsizes; - qti=i/3; - pli=i%3; - qranges=_qinfo->qi_ranges[qti]+pli; - if(i>0){ - val=oc_pack_read1(_opb); - if(!val){ - int qtj; - int plj; - if(qti>0){ - val=oc_pack_read1(_opb); - if(val){ - qtj=qti-1; - plj=pli; - } - else{ - qtj=(i-1)/3; - plj=(i-1)%3; - } - } - else{ - qtj=(i-1)/3; - plj=(i-1)%3; - } - *qranges=*(_qinfo->qi_ranges[qtj]+plj); - continue; - } - } - val=oc_pack_read(_opb,nbits); - 
indices[0]=(int)val; - for(qi=qri=0;qi<63;){ - val=oc_pack_read(_opb,oc_ilog(62-qi)); - sizes[qri]=(int)val+1; - qi+=(int)val+1; - val=oc_pack_read(_opb,nbits); - indices[++qri]=(int)val; - } - /*Note: The caller is responsible for cleaning up any partially - constructed qinfo.*/ - if(qi>63){ - _ogg_free(base_mats); - return TH_EBADHEADER; - } - qranges->nranges=qri; - qranges->sizes=qrsizes=(int *)_ogg_malloc(qri*sizeof(qrsizes[0])); - if(qranges->sizes==NULL){ - /*Note: The caller is responsible for cleaning up any partially - constructed qinfo.*/ - _ogg_free(base_mats); - return TH_EFAULT; - } - memcpy(qrsizes,sizes,qri*sizeof(qrsizes[0])); - qrbms=(th_quant_base *)_ogg_malloc((qri+1)*sizeof(qrbms[0])); - if(qrbms==NULL){ - /*Note: The caller is responsible for cleaning up any partially - constructed qinfo.*/ - _ogg_free(base_mats); - return TH_EFAULT; - } - qranges->base_matrices=(const th_quant_base *)qrbms; - do{ - bmi=indices[qri]; - /*Note: The caller is responsible for cleaning up any partially - constructed qinfo.*/ - if(bmi>=nbase_mats){ - _ogg_free(base_mats); - return TH_EBADHEADER; - } - memcpy(qrbms[qri],base_mats[bmi],sizeof(qrbms[qri])); - } - while(qri-->0); - } - _ogg_free(base_mats); - return 0; -} - -void oc_quant_params_clear(th_quant_info *_qinfo){ - int i; - for(i=6;i-->0;){ - int qti; - int pli; - qti=i/3; - pli=i%3; - /*Clear any duplicate pointer references.*/ - if(i>0){ - int qtj; - int plj; - qtj=(i-1)/3; - plj=(i-1)%3; - if(_qinfo->qi_ranges[qti][pli].sizes== - _qinfo->qi_ranges[qtj][plj].sizes){ - _qinfo->qi_ranges[qti][pli].sizes=NULL; - } - if(_qinfo->qi_ranges[qti][pli].base_matrices== - _qinfo->qi_ranges[qtj][plj].base_matrices){ - _qinfo->qi_ranges[qti][pli].base_matrices=NULL; - } - } - if(qti>0){ - if(_qinfo->qi_ranges[1][pli].sizes== - _qinfo->qi_ranges[0][pli].sizes){ - _qinfo->qi_ranges[1][pli].sizes=NULL; - } - if(_qinfo->qi_ranges[1][pli].base_matrices== - _qinfo->qi_ranges[0][pli].base_matrices){ - 
_qinfo->qi_ranges[1][pli].base_matrices=NULL; - } - } - /*Now free all the non-duplicate storage.*/ - _ogg_free((void *)_qinfo->qi_ranges[qti][pli].sizes); - _ogg_free((void *)_qinfo->qi_ranges[qti][pli].base_matrices); - } -} diff --git a/media/libtheora/lib/dequant.h b/media/libtheora/lib/dequant.h deleted file mode 100644 index ef25838e3..000000000 --- a/media/libtheora/lib/dequant.h +++ /dev/null @@ -1,27 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: dequant.h 16503 2009-08-22 18:14:02Z giles $ - - ********************************************************************/ - -#if !defined(_dequant_H) -# define _dequant_H (1) -# include "quant.h" -# include "bitpack.h" - -int oc_quant_params_unpack(oc_pack_buf *_opb, - th_quant_info *_qinfo); -void oc_quant_params_clear(th_quant_info *_qinfo); - -#endif diff --git a/media/libtheora/lib/fragment.c b/media/libtheora/lib/fragment.c deleted file mode 100644 index 4ba6af1b7..000000000 --- a/media/libtheora/lib/fragment.c +++ /dev/null @@ -1,82 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: fragment.c 17410 2010-09-21 21:53:48Z tterribe $ - - ********************************************************************/ -#include <string.h> -#include "internal.h" - -void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){ - int i; - for(i=8;i-->0;){ - memcpy(_dst,_src,8*sizeof(*_dst)); - _dst+=_ystride; - _src+=_ystride; - } -} - -/*Copies the fragments specified by the lists of fragment indices from one - frame to another. - _dst_frame: The reference frame to copy to. - _src_frame: The reference frame to copy from. - _ystride: The row stride of the reference frames. - _fragis: A pointer to a list of fragment indices. - _nfragis: The number of fragment indices to copy. - _frag_buf_offs: The offsets of fragments in the reference frames.*/ -void oc_frag_copy_list_c(unsigned char *_dst_frame, - const unsigned char *_src_frame,int _ystride, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){ - ptrdiff_t fragii; - for(fragii=0;fragii<_nfragis;fragii++){ - ptrdiff_t frag_buf_off; - frag_buf_off=_frag_buf_offs[_fragis[fragii]]; - oc_frag_copy_c(_dst_frame+frag_buf_off, - _src_frame+frag_buf_off,_ystride); - } -} - -void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride, - const ogg_int16_t _residue[64]){ - int i; - for(i=0;i<8;i++){ - int j; - for(j=0;j<8;j++)_dst[j]=OC_CLAMP255(_residue[i*8+j]+128); - _dst+=_ystride; - } -} - -void oc_frag_recon_inter_c(unsigned char *_dst, - const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){ - int i; - for(i=0;i<8;i++){ - int j; - for(j=0;j<8;j++)_dst[j]=OC_CLAMP255(_residue[i*8+j]+_src[j]); - _dst+=_ystride; - _src+=_ystride; - } -} - -void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1, - 
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]){ - int i; - for(i=0;i<8;i++){ - int j; - for(j=0;j<8;j++)_dst[j]=OC_CLAMP255(_residue[i*8+j]+(_src1[j]+_src2[j]>>1)); - _dst+=_ystride; - _src1+=_ystride; - _src2+=_ystride; - } -} - -void oc_restore_fpu_c(void){} diff --git a/media/libtheora/lib/huffdec.c b/media/libtheora/lib/huffdec.c deleted file mode 100644 index fe013c611..000000000 --- a/media/libtheora/lib/huffdec.c +++ /dev/null @@ -1,521 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: huffdec.c 17577 2010-10-29 04:00:07Z tterribe $ - - ********************************************************************/ - -#include <stdlib.h> -#include <string.h> -#include <ogg/ogg.h> -#include "huffdec.h" -#include "decint.h" - - - -/*Instead of storing every branching in the tree, subtrees can be collapsed - into one node, with a table of size 1<<nbits pointing directly to its - descedents nbits levels down. - This allows more than one bit to be read at a time, and avoids following all - the intermediate branches with next to no increased code complexity once - the collapsed tree has been built. - We do _not_ require that a subtree be complete to be collapsed, but instead - store duplicate pointers in the table, and record the actual depth of the - node below its parent. - This tells us the number of bits to advance the stream after reaching it. 

  This turns out to be equivalent to the method described in \cite{Hash95},
   without the requirement that codewords be sorted by length.
  If the codewords were sorted by length (so-called ``canonical-codes''), they
   could be decoded much faster via either Lindell and Moffat's approach or
   Hashemian's Condensed Huffman Code approach, the latter of which has an
   extremely small memory footprint.
  We can't use Choueka et al.'s finite state machine approach, which is
   extremely fast, because we can't allow multiple symbols to be output at a
   time; the codebook can and does change between symbols.
  It also has very large memory requirements, which impairs cache coherency.

  We store the tree packed in an array of 16-bit integers (words).
  Each node consists of a single word, followed consecutively by two or more
   indices of its children.
  Let n be the value of this first word.
  This is the number of bits that need to be read to traverse the node, and
   must be positive.
  1<<n entries follow in the array, each an index to a child node.
  If the child is positive, then it is the index of another internal node in
   the table.
  If the child is negative or zero, then it is a leaf node.
  These are stored directly in the child pointer to save space, since they only
   require a single word.
  If a leaf node would have been encountered before reading n bits, then it is
   duplicated the necessary number of times in this table.
  Leaf nodes pack both a token value and their actual depth in the tree.
  The token in the leaf node is (-leaf&255).
  The number of bits that need to be consumed to reach the leaf, starting from
   the current node, is (-leaf>>8).

  @ARTICLE{Hash95,
    author="Reza Hashemian",
    title="Memory Efficient and High-Speed Search {Huffman} Coding",
    journal="{IEEE} Transactions on Communications",
    volume=43,
    number=10,
    pages="2576--2581",
    month=Oct,
    year=1995
  }*/



/*The map from external spec-defined tokens to internal tokens.
  This is constructed so that any extra bits read with the original token value
   can be masked off the least significant bits of its internal token index.
  In addition, all of the tokens which require additional extra bits are placed
   at the start of the list, and grouped by type.
  OC_DCT_REPEAT_RUN3_TOKEN is placed first, as it is an extra-special case, so
   giving it index 0 may simplify comparisons on some architectures.
  These requirements require some substantial reordering.
  The table is indexed by the spec token value; each entry is the first
   internal token assigned to that spec token.*/
static const unsigned char OC_DCT_TOKEN_MAP[TH_NDCT_TOKENS]={
  /*OC_DCT_EOB1_TOKEN (0 extra bits)*/
  15,
  /*OC_DCT_EOB2_TOKEN (0 extra bits)*/
  16,
  /*OC_DCT_EOB3_TOKEN (0 extra bits)*/
  17,
  /*OC_DCT_REPEAT_RUN0_TOKEN (2 extra bits)*/
  88,
  /*OC_DCT_REPEAT_RUN1_TOKEN (3 extra bits)*/
  80,
  /*OC_DCT_REPEAT_RUN2_TOKEN (4 extra bits)*/
  1,
  /*OC_DCT_REPEAT_RUN3_TOKEN (12 extra bits)*/
  0,
  /*OC_DCT_SHORT_ZRL_TOKEN (3 extra bits)*/
  48,
  /*OC_DCT_ZRL_TOKEN (6 extra bits)*/
  14,
  /*OC_ONE_TOKEN (0 extra bits)*/
  56,
  /*OC_MINUS_ONE_TOKEN (0 extra bits)*/
  57,
  /*OC_TWO_TOKEN (0 extra bits)*/
  58,
  /*OC_MINUS_TWO_TOKEN (0 extra bits)*/
  59,
  /*OC_DCT_VAL_CAT2 (1 extra bit)*/
  60,
  62,
  64,
  66,
  /*OC_DCT_VAL_CAT3 (2 extra bits)*/
  68,
  /*OC_DCT_VAL_CAT4 (3 extra bits)*/
  72,
  /*OC_DCT_VAL_CAT5 (4 extra bits)*/
  2,
  /*OC_DCT_VAL_CAT6 (5 extra bits)*/
  4,
  /*OC_DCT_VAL_CAT7 (6 extra bits)*/
  6,
  /*OC_DCT_VAL_CAT8 (10 extra bits)*/
  8,
  /*OC_DCT_RUN_CAT1A (1 extra bit)*/
  18,
  20,
  22,
  24,
  26,
  /*OC_DCT_RUN_CAT1B (3 extra bits)*/
  32,
  /*OC_DCT_RUN_CAT1C (4 extra bits)*/
  12,
  /*OC_DCT_RUN_CAT2A (2 extra bits)*/
  28,
  /*OC_DCT_RUN_CAT2B (3 extra bits)*/
  40
};

/*The log base 2 of number of internal tokens associated with each of the spec
   tokens (i.e., how many of the extra bits are folded into the token value).
  Increasing the maximum value beyond 3 will enlarge the amount of stack
   required for tree construction.
  Like OC_DCT_TOKEN_MAP, this table is indexed by the spec token value.*/
static const unsigned char OC_DCT_TOKEN_MAP_LOG_NENTRIES[TH_NDCT_TOKENS]={
  0,0,0,2,3,0,0,3,0,0,0,0,0,1,1,1,1,2,3,1,1,1,2,1,1,1,1,1,3,1,2,3
};


/*The size a lookup table is allowed to grow to relative to the number of
   unique nodes it contains.
  E.g., if OC_HUFF_SLUSH is 4, then at most 75% of the space in the tree is
   wasted (1/4 of the space must be used).
  Larger numbers can decode tokens with fewer read operations, while smaller
   numbers may save more space.
  With a sample file:
  32233473 read calls are required when no tree collapsing is done (100.0%).
  19269269 read calls are required when OC_HUFF_SLUSH is 1 (59.8%).
  11144969 read calls are required when OC_HUFF_SLUSH is 2 (34.6%).
  10538563 read calls are required when OC_HUFF_SLUSH is 4 (32.7%).
  10192578 read calls are required when OC_HUFF_SLUSH is 8 (31.6%).
  Since a value of 2 gets us the vast majority of the speed-up with only a
   small amount of wasted memory, this is what we use.
  This value must be less than 128, or you could create a tree with more than
   32767 entries, which would overflow the 16-bit words used to index it.*/
#define OC_HUFF_SLUSH (2)
/*The root of the tree is on the fast path, and a larger value here is more
   beneficial than elsewhere in the tree.
  7 appears to give the best performance, trading off between increased use of
   the single-read fast path and cache footprint for the tables, though
   obviously this will depend on your cache size.
  Using 7 here, the VP3 tables are about twice as large compared to using 2.*/
#define OC_ROOT_HUFF_SLUSH (7)



/*Unpacks a Huffman codebook.
  _opb:    The buffer to unpack from.
- _tokens: Stores a list of internal tokens, in the order they were found in - the codebook, and the lengths of their corresponding codewords. - This is enough to completely define the codebook, while minimizing - stack usage and avoiding temporary allocations (for platforms - where free() is a no-op). - Return: The number of internal tokens in the codebook, or a negative value - on error.*/ -int oc_huff_tree_unpack(oc_pack_buf *_opb,unsigned char _tokens[256][2]){ - ogg_uint32_t code; - int len; - int ntokens; - int nleaves; - code=0; - len=ntokens=nleaves=0; - for(;;){ - long bits; - bits=oc_pack_read1(_opb); - /*Only process nodes so long as there's more bits in the buffer.*/ - if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER; - /*Read an internal node:*/ - if(!bits){ - len++; - /*Don't allow codewords longer than 32 bits.*/ - if(len>32)return TH_EBADHEADER; - } - /*Read a leaf node:*/ - else{ - ogg_uint32_t code_bit; - int neb; - int nentries; - int token; - /*Don't allow more than 32 spec-tokens per codebook.*/ - if(++nleaves>32)return TH_EBADHEADER; - bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS); - neb=OC_DCT_TOKEN_MAP_LOG_NENTRIES[bits]; - token=OC_DCT_TOKEN_MAP[bits]; - nentries=1<<neb; - while(nentries-->0){ - _tokens[ntokens][0]=(unsigned char)token++; - _tokens[ntokens][1]=(unsigned char)(len+neb); - ntokens++; - } - code_bit=0x80000000U>>len-1; - while(len>0&&(code&code_bit)){ - code^=code_bit; - code_bit<<=1; - len--; - } - if(len<=0)break; - code|=code_bit; - } - } - return ntokens; -} - -/*Count how many tokens would be required to fill a subtree at depth _depth. - _tokens: A list of internal tokens, in the order they are found in the - codebook, and the lengths of their corresponding codewords. - _depth: The depth of the desired node in the corresponding tree structure. 
- Return: The number of tokens that belong to that subtree.*/ -static int oc_huff_subtree_tokens(unsigned char _tokens[][2],int _depth){ - ogg_uint32_t code; - int ti; - code=0; - ti=0; - do{ - if(_tokens[ti][1]-_depth<32)code+=0x80000000U>>_tokens[ti++][1]-_depth; - else{ - /*Because of the expanded internal tokens, we can have codewords as long - as 35 bits. - A single recursion here is enough to advance past them.*/ - code++; - ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+31); - } - } - while(code<0x80000000U); - return ti; -} - -/*Compute the number of bits to use for a collapsed tree node at the given - depth. - _tokens: A list of internal tokens, in the order they are found in the - codebook, and the lengths of their corresponding codewords. - _ntokens: The number of tokens corresponding to this tree node. - _depth: The depth of this tree node. - Return: The number of bits to use for a collapsed tree node rooted here. - This is always at least one, even if this was a leaf node.*/ -static int oc_huff_tree_collapse_depth(unsigned char _tokens[][2], - int _ntokens,int _depth){ - int got_leaves; - int loccupancy; - int occupancy; - int slush; - int nbits; - int best_nbits; - slush=_depth>0?OC_HUFF_SLUSH:OC_ROOT_HUFF_SLUSH; - /*It's legal to have a tree with just a single node, which requires no bits - to decode and always returns the same token. - However, no encoder actually does this (yet). - To avoid a special case in oc_huff_token_decode(), we force the number of - lookahead bits to be at least one. 
- This will produce a tree that looks ahead one bit and then advances the - stream zero bits.*/ - nbits=1; - occupancy=2; - got_leaves=1; - do{ - int ti; - if(got_leaves)best_nbits=nbits; - nbits++; - got_leaves=0; - loccupancy=occupancy; - for(occupancy=ti=0;ti<_ntokens;occupancy++){ - if(_tokens[ti][1]<_depth+nbits)ti++; - else if(_tokens[ti][1]==_depth+nbits){ - got_leaves=1; - ti++; - } - else ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+nbits); - } - } - while(occupancy>loccupancy&&occupancy*slush>=1<<nbits); - return best_nbits; -} - -/*Determines the size in words of a Huffman tree node that represents a - subtree of depth _nbits. - _nbits: The depth of the subtree. - This must be greater than zero. - Return: The number of words required to store the node.*/ -static size_t oc_huff_node_size(int _nbits){ - return 1+(1<<_nbits); -} - -/*Produces a collapsed-tree representation of the given token list. - _tree: The storage for the collapsed Huffman tree. - This may be NULL to compute the required storage size instead of - constructing the tree. - _tokens: A list of internal tokens, in the order they are found in the - codebook, and the lengths of their corresponding codewords. - _ntokens: The number of tokens corresponding to this tree node. 
- Return: The number of words required to store the tree.*/ -#if defined(_MSC_VER) && _MSC_VER >= 1700 -#pragma optimize( "", off ) -#endif -static size_t oc_huff_tree_collapse(ogg_int16_t *_tree, - unsigned char _tokens[][2],int _ntokens){ - ogg_int16_t node[34]; - unsigned char depth[34]; - unsigned char last[34]; - size_t ntree; - int ti; - int l; - depth[0]=0; - last[0]=(unsigned char)(_ntokens-1); - ntree=0; - ti=0; - l=0; - do{ - int nbits; - nbits=oc_huff_tree_collapse_depth(_tokens+ti,last[l]+1-ti,depth[l]); - node[l]=(ogg_int16_t)ntree; - ntree+=oc_huff_node_size(nbits); - if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)nbits; - do{ - while(ti<=last[l]&&_tokens[ti][1]<=depth[l]+nbits){ - if(_tree!=NULL){ - ogg_int16_t leaf; - int nentries; - nentries=1<<depth[l]+nbits-_tokens[ti][1]; - leaf=(ogg_int16_t)-(_tokens[ti][1]-depth[l]<<8|_tokens[ti][0]); - while(nentries-->0)_tree[node[l]++]=leaf; - } - ti++; - } - if(ti<=last[l]){ - /*We need to recurse*/ - depth[l+1]=(unsigned char)(depth[l]+nbits); - if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)ntree; - l++; - last[l]= - (unsigned char)(ti+oc_huff_subtree_tokens(_tokens+ti,depth[l])-1); - break; - } - /*Pop back up a level of recursion.*/ - else if(l-->0)nbits=depth[l+1]-depth[l]; - } - while(l>=0); - } - while(l>=0); - return ntree; -} -#if defined(_MSC_VER) && _MSC_VER >= 1700 -#pragma optimize( "", on ) -#endif - -/*Unpacks a set of Huffman trees, and reduces them to a collapsed - representation. - _opb: The buffer to unpack the trees from. - _nodes: The table to fill with the Huffman trees. - Return: 0 on success, or a negative value on error. 
- The caller is responsible for cleaning up any partially initialized - _nodes on failure.*/ -int oc_huff_trees_unpack(oc_pack_buf *_opb, - ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){ - int i; - for(i=0;i<TH_NHUFFMAN_TABLES;i++){ - unsigned char tokens[256][2]; - int ntokens; - ogg_int16_t *tree; - size_t size; - /*Unpack the full tree into a temporary buffer.*/ - ntokens=oc_huff_tree_unpack(_opb,tokens); - if(ntokens<0)return ntokens; - /*Figure out how big the collapsed tree will be and allocate space for it.*/ - size=oc_huff_tree_collapse(NULL,tokens,ntokens); - /*This should never happen; if it does it means you set OC_HUFF_SLUSH or - OC_ROOT_HUFF_SLUSH too large.*/ - if(size>32767)return TH_EIMPL; - tree=(ogg_int16_t *)_ogg_malloc(size*sizeof(*tree)); - if(tree==NULL)return TH_EFAULT; - /*Construct the collapsed the tree.*/ - oc_huff_tree_collapse(tree,tokens,ntokens); - _nodes[i]=tree; - } - return 0; -} - -/*Determines the size in words of a Huffman subtree. - _tree: The complete Huffman tree. - _node: The index of the root of the desired subtree. - Return: The number of words required to store the tree.*/ -static size_t oc_huff_tree_size(const ogg_int16_t *_tree,int _node){ - size_t size; - int nchildren; - int n; - int i; - n=_tree[_node]; - size=oc_huff_node_size(n); - nchildren=1<<n; - i=0; - do{ - int child; - child=_tree[_node+i+1]; - if(child<=0)i+=1<<n-(-child>>8); - else{ - size+=oc_huff_tree_size(_tree,child); - i++; - } - } - while(i<nchildren); - return size; -} - -/*Makes a copy of the given set of Huffman trees. - _dst: The array to store the copy in. 
- _src: The array of trees to copy.*/ -int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES], - const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]){ - int total; - int i; - total=0; - for(i=0;i<TH_NHUFFMAN_TABLES;i++){ - size_t size; - size=oc_huff_tree_size(_src[i],0); - total+=size; - _dst[i]=(ogg_int16_t *)_ogg_malloc(size*sizeof(*_dst[i])); - if(_dst[i]==NULL){ - while(i-->0)_ogg_free(_dst[i]); - return TH_EFAULT; - } - memcpy(_dst[i],_src[i],size*sizeof(*_dst[i])); - } - return 0; -} - -/*Frees the memory used by a set of Huffman trees. - _nodes: The array of trees to free.*/ -void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){ - int i; - for(i=0;i<TH_NHUFFMAN_TABLES;i++)_ogg_free(_nodes[i]); -} - - -/*Unpacks a single token using the given Huffman tree. - _opb: The buffer to unpack the token from. - _node: The tree to unpack the token with. - Return: The token value.*/ -int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_tree){ - const unsigned char *ptr; - const unsigned char *stop; - oc_pb_window window; - int available; - long bits; - int node; - int n; - ptr=_opb->ptr; - window=_opb->window; - stop=_opb->stop; - available=_opb->bits; - node=0; - for(;;){ - n=_tree[node]; - if(n>available){ - unsigned shift; - shift=OC_PB_WINDOW_SIZE-available; - do{ - /*We don't bother setting eof because we won't check for it after we've - started decoding DCT tokens.*/ - if(ptr>=stop){ - shift=(unsigned)-OC_LOTS_OF_BITS; - break; - } - shift-=8; - window|=(oc_pb_window)*ptr++<<shift; - } - while(shift>=8); - /*Note: We never request more than 24 bits, so there's no need to fill in - the last partial byte here.*/ - available=OC_PB_WINDOW_SIZE-shift; - } - bits=window>>OC_PB_WINDOW_SIZE-n; - node=_tree[node+1+bits]; - if(node<=0)break; - window<<=n; - available-=n; - } - node=-node; - n=node>>8; - window<<=n; - available-=n; - _opb->ptr=ptr; - _opb->window=window; - _opb->bits=available; - return node&255; -} diff --git 
a/media/libtheora/lib/huffdec.h b/media/libtheora/lib/huffdec.h deleted file mode 100644 index 2fd112a90..000000000 --- a/media/libtheora/lib/huffdec.h +++ /dev/null @@ -1,32 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: huffdec.h 17410 2010-09-21 21:53:48Z tterribe $ - - ********************************************************************/ - -#if !defined(_huffdec_H) -# define _huffdec_H (1) -# include "huffman.h" -# include "bitpack.h" - - - -int oc_huff_trees_unpack(oc_pack_buf *_opb, - ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]); -int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES], - const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]); -void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]); -int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_node); - -#endif diff --git a/media/libtheora/lib/huffman.h b/media/libtheora/lib/huffman.h deleted file mode 100644 index 36cf7572e..000000000 --- a/media/libtheora/lib/huffman.h +++ /dev/null @@ -1,70 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: huffman.h 16503 2009-08-22 18:14:02Z giles $ - - ********************************************************************/ - -#if !defined(_huffman_H) -# define _hufffman_H (1) -# include "theora/codec.h" -# include "ocintrin.h" - -/*The range of valid quantized DCT coefficient values. - VP3 used 511 in the encoder, but the bitstream is capable of 580.*/ -#define OC_DCT_VAL_RANGE (580) - -#define OC_NDCT_TOKEN_BITS (5) - -#define OC_DCT_EOB1_TOKEN (0) -#define OC_DCT_EOB2_TOKEN (1) -#define OC_DCT_EOB3_TOKEN (2) -#define OC_DCT_REPEAT_RUN0_TOKEN (3) -#define OC_DCT_REPEAT_RUN1_TOKEN (4) -#define OC_DCT_REPEAT_RUN2_TOKEN (5) -#define OC_DCT_REPEAT_RUN3_TOKEN (6) - -#define OC_DCT_SHORT_ZRL_TOKEN (7) -#define OC_DCT_ZRL_TOKEN (8) - -#define OC_ONE_TOKEN (9) -#define OC_MINUS_ONE_TOKEN (10) -#define OC_TWO_TOKEN (11) -#define OC_MINUS_TWO_TOKEN (12) - -#define OC_DCT_VAL_CAT2 (13) -#define OC_DCT_VAL_CAT3 (17) -#define OC_DCT_VAL_CAT4 (18) -#define OC_DCT_VAL_CAT5 (19) -#define OC_DCT_VAL_CAT6 (20) -#define OC_DCT_VAL_CAT7 (21) -#define OC_DCT_VAL_CAT8 (22) - -#define OC_DCT_RUN_CAT1A (23) -#define OC_DCT_RUN_CAT1B (28) -#define OC_DCT_RUN_CAT1C (29) -#define OC_DCT_RUN_CAT2A (30) -#define OC_DCT_RUN_CAT2B (31) - -#define OC_NDCT_EOB_TOKEN_MAX (7) -#define OC_NDCT_ZRL_TOKEN_MAX (9) -#define OC_NDCT_VAL_MAX (23) -#define OC_NDCT_VAL_CAT1_MAX (13) -#define OC_NDCT_VAL_CAT2_MAX (17) -#define OC_NDCT_VAL_CAT2_SIZE (OC_NDCT_VAL_CAT2_MAX-OC_DCT_VAL_CAT2) -#define OC_NDCT_RUN_MAX (32) -#define OC_NDCT_RUN_CAT1A_MAX (28) - -extern const unsigned char OC_DCT_TOKEN_EXTRA_BITS[TH_NDCT_TOKENS]; - -#endif diff --git a/media/libtheora/lib/idct.c b/media/libtheora/lib/idct.c deleted file mode 100644 index c56eb94c5..000000000 --- 
a/media/libtheora/lib/idct.c +++ /dev/null @@ -1,329 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: idct.c 17410 2010-09-21 21:53:48Z tterribe $ - - ********************************************************************/ - -#include <string.h> -#include "internal.h" -#include "dct.h" - -/*Performs an inverse 8 point Type-II DCT transform. - The output is scaled by a factor of 2 relative to the orthonormal version of - the transform. - _y: The buffer to store the result in. - Data will be placed in every 8th entry (e.g., in a column of an 8x8 - block). - _x: The input coefficients. 
- The first 8 entries are used (e.g., from a row of an 8x8 block).*/ -static void idct8(ogg_int16_t *_y,const ogg_int16_t _x[8]){ - ogg_int32_t t[8]; - ogg_int32_t r; - /*Stage 1:*/ - /*0-1 butterfly.*/ - t[0]=OC_C4S4*(ogg_int16_t)(_x[0]+_x[4])>>16; - t[1]=OC_C4S4*(ogg_int16_t)(_x[0]-_x[4])>>16; - /*2-3 rotation by 6pi/16.*/ - t[2]=(OC_C6S2*_x[2]>>16)-(OC_C2S6*_x[6]>>16); - t[3]=(OC_C2S6*_x[2]>>16)+(OC_C6S2*_x[6]>>16); - /*4-7 rotation by 7pi/16.*/ - t[4]=(OC_C7S1*_x[1]>>16)-(OC_C1S7*_x[7]>>16); - /*5-6 rotation by 3pi/16.*/ - t[5]=(OC_C3S5*_x[5]>>16)-(OC_C5S3*_x[3]>>16); - t[6]=(OC_C5S3*_x[5]>>16)+(OC_C3S5*_x[3]>>16); - t[7]=(OC_C1S7*_x[1]>>16)+(OC_C7S1*_x[7]>>16); - /*Stage 2:*/ - /*4-5 butterfly.*/ - r=t[4]+t[5]; - t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16; - t[4]=r; - /*7-6 butterfly.*/ - r=t[7]+t[6]; - t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16; - t[7]=r; - /*Stage 3:*/ - /*0-3 butterfly.*/ - r=t[0]+t[3]; - t[3]=t[0]-t[3]; - t[0]=r; - /*1-2 butterfly.*/ - r=t[1]+t[2]; - t[2]=t[1]-t[2]; - t[1]=r; - /*6-5 butterfly.*/ - r=t[6]+t[5]; - t[5]=t[6]-t[5]; - t[6]=r; - /*Stage 4:*/ - /*0-7 butterfly.*/ - _y[0<<3]=(ogg_int16_t)(t[0]+t[7]); - /*1-6 butterfly.*/ - _y[1<<3]=(ogg_int16_t)(t[1]+t[6]); - /*2-5 butterfly.*/ - _y[2<<3]=(ogg_int16_t)(t[2]+t[5]); - /*3-4 butterfly.*/ - _y[3<<3]=(ogg_int16_t)(t[3]+t[4]); - _y[4<<3]=(ogg_int16_t)(t[3]-t[4]); - _y[5<<3]=(ogg_int16_t)(t[2]-t[5]); - _y[6<<3]=(ogg_int16_t)(t[1]-t[6]); - _y[7<<3]=(ogg_int16_t)(t[0]-t[7]); -} - -/*Performs an inverse 8 point Type-II DCT transform. - The output is scaled by a factor of 2 relative to the orthonormal version of - the transform. - _y: The buffer to store the result in. - Data will be placed in every 8th entry (e.g., in a column of an 8x8 - block). - _x: The input coefficients. - Only the first 4 entries are used. 
- The other 4 are assumed to be 0.*/ -static void idct8_4(ogg_int16_t *_y,const ogg_int16_t _x[8]){ - ogg_int32_t t[8]; - ogg_int32_t r; - /*Stage 1:*/ - t[0]=OC_C4S4*_x[0]>>16; - t[2]=OC_C6S2*_x[2]>>16; - t[3]=OC_C2S6*_x[2]>>16; - t[4]=OC_C7S1*_x[1]>>16; - t[5]=-(OC_C5S3*_x[3]>>16); - t[6]=OC_C3S5*_x[3]>>16; - t[7]=OC_C1S7*_x[1]>>16; - /*Stage 2:*/ - r=t[4]+t[5]; - t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16; - t[4]=r; - r=t[7]+t[6]; - t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16; - t[7]=r; - /*Stage 3:*/ - t[1]=t[0]+t[2]; - t[2]=t[0]-t[2]; - r=t[0]+t[3]; - t[3]=t[0]-t[3]; - t[0]=r; - r=t[6]+t[5]; - t[5]=t[6]-t[5]; - t[6]=r; - /*Stage 4:*/ - _y[0<<3]=(ogg_int16_t)(t[0]+t[7]); - _y[1<<3]=(ogg_int16_t)(t[1]+t[6]); - _y[2<<3]=(ogg_int16_t)(t[2]+t[5]); - _y[3<<3]=(ogg_int16_t)(t[3]+t[4]); - _y[4<<3]=(ogg_int16_t)(t[3]-t[4]); - _y[5<<3]=(ogg_int16_t)(t[2]-t[5]); - _y[6<<3]=(ogg_int16_t)(t[1]-t[6]); - _y[7<<3]=(ogg_int16_t)(t[0]-t[7]); -} - -/*Performs an inverse 8 point Type-II DCT transform. - The output is scaled by a factor of 2 relative to the orthonormal version of - the transform. - _y: The buffer to store the result in. - Data will be placed in every 8th entry (e.g., in a column of an 8x8 - block). - _x: The input coefficients. - Only the first 3 entries are used. 
- The other 5 are assumed to be 0.*/ -static void idct8_3(ogg_int16_t *_y,const ogg_int16_t _x[8]){ - ogg_int32_t t[8]; - ogg_int32_t r; - /*Stage 1:*/ - t[0]=OC_C4S4*_x[0]>>16; - t[2]=OC_C6S2*_x[2]>>16; - t[3]=OC_C2S6*_x[2]>>16; - t[4]=OC_C7S1*_x[1]>>16; - t[7]=OC_C1S7*_x[1]>>16; - /*Stage 2:*/ - t[5]=OC_C4S4*t[4]>>16; - t[6]=OC_C4S4*t[7]>>16; - /*Stage 3:*/ - t[1]=t[0]+t[2]; - t[2]=t[0]-t[2]; - r=t[0]+t[3]; - t[3]=t[0]-t[3]; - t[0]=r; - r=t[6]+t[5]; - t[5]=t[6]-t[5]; - t[6]=r; - /*Stage 4:*/ - _y[0<<3]=(ogg_int16_t)(t[0]+t[7]); - _y[1<<3]=(ogg_int16_t)(t[1]+t[6]); - _y[2<<3]=(ogg_int16_t)(t[2]+t[5]); - _y[3<<3]=(ogg_int16_t)(t[3]+t[4]); - _y[4<<3]=(ogg_int16_t)(t[3]-t[4]); - _y[5<<3]=(ogg_int16_t)(t[2]-t[5]); - _y[6<<3]=(ogg_int16_t)(t[1]-t[6]); - _y[7<<3]=(ogg_int16_t)(t[0]-t[7]); -} - -/*Performs an inverse 8 point Type-II DCT transform. - The output is scaled by a factor of 2 relative to the orthonormal version of - the transform. - _y: The buffer to store the result in. - Data will be placed in every 8th entry (e.g., in a column of an 8x8 - block). - _x: The input coefficients. - Only the first 2 entries are used. - The other 6 are assumed to be 0.*/ -static void idct8_2(ogg_int16_t *_y,const ogg_int16_t _x[8]){ - ogg_int32_t t[8]; - ogg_int32_t r; - /*Stage 1:*/ - t[0]=OC_C4S4*_x[0]>>16; - t[4]=OC_C7S1*_x[1]>>16; - t[7]=OC_C1S7*_x[1]>>16; - /*Stage 2:*/ - t[5]=OC_C4S4*t[4]>>16; - t[6]=OC_C4S4*t[7]>>16; - /*Stage 3:*/ - r=t[6]+t[5]; - t[5]=t[6]-t[5]; - t[6]=r; - /*Stage 4:*/ - _y[0<<3]=(ogg_int16_t)(t[0]+t[7]); - _y[1<<3]=(ogg_int16_t)(t[0]+t[6]); - _y[2<<3]=(ogg_int16_t)(t[0]+t[5]); - _y[3<<3]=(ogg_int16_t)(t[0]+t[4]); - _y[4<<3]=(ogg_int16_t)(t[0]-t[4]); - _y[5<<3]=(ogg_int16_t)(t[0]-t[5]); - _y[6<<3]=(ogg_int16_t)(t[0]-t[6]); - _y[7<<3]=(ogg_int16_t)(t[0]-t[7]); -} - -/*Performs an inverse 8 point Type-II DCT transform. - The output is scaled by a factor of 2 relative to the orthonormal version of - the transform. - _y: The buffer to store the result in. 
- Data will be placed in every 8th entry (e.g., in a column of an 8x8 - block). - _x: The input coefficients. - Only the first entry is used. - The other 7 are assumed to be 0.*/ -static void idct8_1(ogg_int16_t *_y,const ogg_int16_t _x[1]){ - _y[0<<3]=_y[1<<3]=_y[2<<3]=_y[3<<3]= - _y[4<<3]=_y[5<<3]=_y[6<<3]=_y[7<<3]=(ogg_int16_t)(OC_C4S4*_x[0]>>16); -} - -/*Performs an inverse 8x8 Type-II DCT transform. - The input is assumed to be scaled by a factor of 4 relative to orthonormal - version of the transform. - All coefficients but the first 3 in zig-zag scan order are assumed to be 0: - x x 0 0 0 0 0 0 - x 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 - _y: The buffer to store the result in. - This may be the same as _x. - _x: The input coefficients.*/ -static void oc_idct8x8_3(ogg_int16_t _y[64],ogg_int16_t _x[64]){ - ogg_int16_t w[64]; - int i; - /*Transform rows of x into columns of w.*/ - idct8_2(w,_x); - idct8_1(w+1,_x+8); - /*Transform rows of w into columns of y.*/ - for(i=0;i<8;i++)idct8_2(_y+i,w+i*8); - /*Adjust for the scale factor.*/ - for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4); - /*Clear input data for next block (decoder only).*/ - if(_x!=_y)_x[0]=_x[1]=_x[8]=0; -} - -/*Performs an inverse 8x8 Type-II DCT transform. - The input is assumed to be scaled by a factor of 4 relative to orthonormal - version of the transform. - All coefficients but the first 10 in zig-zag scan order are assumed to be 0: - x x x x 0 0 0 0 - x x x 0 0 0 0 0 - x x 0 0 0 0 0 0 - x 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 - _y: The buffer to store the result in. - This may be the same as _x. 
- _x: The input coefficients.*/ -static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){ - ogg_int16_t w[64]; - int i; - /*Transform rows of x into columns of w.*/ - idct8_4(w,_x); - idct8_3(w+1,_x+8); - idct8_2(w+2,_x+16); - idct8_1(w+3,_x+24); - /*Transform rows of w into columns of y.*/ - for(i=0;i<8;i++)idct8_4(_y+i,w+i*8); - /*Adjust for the scale factor.*/ - for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4); - /*Clear input data for next block (decoder only).*/ - if(_x!=_y)_x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0; -} - -/*Performs an inverse 8x8 Type-II DCT transform. - The input is assumed to be scaled by a factor of 4 relative to orthonormal - version of the transform. - _y: The buffer to store the result in. - This may be the same as _x. - _x: The input coefficients.*/ -static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){ - ogg_int16_t w[64]; - int i; - /*Transform rows of x into columns of w.*/ - for(i=0;i<8;i++)idct8(w+i,_x+i*8); - /*Transform rows of w into columns of y.*/ - for(i=0;i<8;i++)idct8(_y+i,w+i*8); - /*Adjust for the scale factor.*/ - for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4); - if(_x!=_y)for(i=0;i<64;i++)_x[i]=0; -} - -/*Performs an inverse 8x8 Type-II DCT transform. - The input is assumed to be scaled by a factor of 4 relative to orthonormal - version of the transform.*/ -void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ - /*_last_zzi is subtly different from an actual count of the number of - coefficients we decoded for this block. - It contains the value of zzi BEFORE the final token in the block was - decoded. - In most cases this is an EOB token (the continuation of an EOB run from a - previous block counts), and so this is the same as the coefficient count. - However, in the case that the last token was NOT an EOB token, but filled - the block up with exactly 64 coefficients, _last_zzi will be less than 64. 
- Provided the last token was not a pure zero run, the minimum value it can - be is 46, and so that doesn't affect any of the cases in this routine. - However, if the last token WAS a pure zero run of length 63, then _last_zzi - will be 1 while the number of coefficients decoded is 64. - Thus, we will trigger the following special case, where the real - coefficient count would not. - Note also that a zero run of length 64 will give _last_zzi a value of 0, - but we still process the DC coefficient, which might have a non-zero value - due to DC prediction. - Although convoluted, this is arguably the correct behavior: it allows us to - use a smaller transform when the block ends with a long zero run instead - of a normal EOB token. - It could be smarter... multiple separate zero runs at the end of a block - will fool it, but an encoder that generates these really deserves what it - gets. - Needless to say we inherited this approach from VP3.*/ - /*Then perform the iDCT.*/ - if(_last_zzi<=3)oc_idct8x8_3(_y,_x); - else if(_last_zzi<=10)oc_idct8x8_10(_y,_x); - else oc_idct8x8_slow(_y,_x); -} diff --git a/media/libtheora/lib/info.c b/media/libtheora/lib/info.c deleted file mode 100644 index 6b9762978..000000000 --- a/media/libtheora/lib/info.c +++ /dev/null @@ -1,131 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: info.c 16503 2009-08-22 18:14:02Z giles $ - - ********************************************************************/ - -#include <stdlib.h> -#include <ctype.h> -#include <string.h> -#include "internal.h" - - - -/*This is more or less the same as strncasecmp, but that doesn't exist - everywhere, and this is a fairly trivial function, so we include it. - Note: We take advantage of the fact that we know _n is less than or equal to - the length of at least one of the strings.*/ -static int oc_tagcompare(const char *_s1,const char *_s2,int _n){ - int c; - for(c=0;c<_n;c++){ - if(toupper(_s1[c])!=toupper(_s2[c]))return !0; - } - return _s1[c]!='='; -} - - - -void th_info_init(th_info *_info){ - memset(_info,0,sizeof(*_info)); - _info->version_major=TH_VERSION_MAJOR; - _info->version_minor=TH_VERSION_MINOR; - _info->version_subminor=TH_VERSION_SUB; - _info->keyframe_granule_shift=6; -} - -void th_info_clear(th_info *_info){ - memset(_info,0,sizeof(*_info)); -} - - - -void th_comment_init(th_comment *_tc){ - memset(_tc,0,sizeof(*_tc)); -} - -void th_comment_add(th_comment *_tc,char *_comment){ - char **user_comments; - int *comment_lengths; - int comment_len; - user_comments=_ogg_realloc(_tc->user_comments, - (_tc->comments+2)*sizeof(*_tc->user_comments)); - if(user_comments==NULL)return; - _tc->user_comments=user_comments; - comment_lengths=_ogg_realloc(_tc->comment_lengths, - (_tc->comments+2)*sizeof(*_tc->comment_lengths)); - if(comment_lengths==NULL)return; - _tc->comment_lengths=comment_lengths; - comment_len=strlen(_comment); - comment_lengths[_tc->comments]=comment_len; - user_comments[_tc->comments]=_ogg_malloc(comment_len+1); - if(user_comments[_tc->comments]==NULL)return; - 
memcpy(_tc->user_comments[_tc->comments],_comment,comment_len+1); - _tc->comments++; - _tc->user_comments[_tc->comments]=NULL; -} - -void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val){ - char *comment; - int tag_len; - int val_len; - tag_len=strlen(_tag); - val_len=strlen(_val); - /*+2 for '=' and '\0'.*/ - comment=_ogg_malloc(tag_len+val_len+2); - if(comment==NULL)return; - memcpy(comment,_tag,tag_len); - comment[tag_len]='='; - memcpy(comment+tag_len+1,_val,val_len+1); - th_comment_add(_tc,comment); - _ogg_free(comment); -} - -char *th_comment_query(th_comment *_tc,char *_tag,int _count){ - long i; - int found; - int tag_len; - tag_len=strlen(_tag); - found=0; - for(i=0;i<_tc->comments;i++){ - if(!oc_tagcompare(_tc->user_comments[i],_tag,tag_len)){ - /*We return a pointer to the data, not a copy.*/ - if(_count==found++)return _tc->user_comments[i]+tag_len+1; - } - } - /*Didn't find anything.*/ - return NULL; -} - -int th_comment_query_count(th_comment *_tc,char *_tag){ - long i; - int tag_len; - int count; - tag_len=strlen(_tag); - count=0; - for(i=0;i<_tc->comments;i++){ - if(!oc_tagcompare(_tc->user_comments[i],_tag,tag_len))count++; - } - return count; -} - -void th_comment_clear(th_comment *_tc){ - if(_tc!=NULL){ - long i; - for(i=0;i<_tc->comments;i++)_ogg_free(_tc->user_comments[i]); - _ogg_free(_tc->user_comments); - _ogg_free(_tc->comment_lengths); - _ogg_free(_tc->vendor); - memset(_tc,0,sizeof(*_tc)); - } -} diff --git a/media/libtheora/lib/internal.c b/media/libtheora/lib/internal.c deleted file mode 100644 index 1b2611da1..000000000 --- a/media/libtheora/lib/internal.c +++ /dev/null @@ -1,212 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. 
PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: internal.c 17506 2010-10-13 02:52:41Z tterribe $ - - ********************************************************************/ - -#include <stdlib.h> -#include <limits.h> -#include <string.h> -#include "internal.h" - - - -/*A map from the index in the zig zag scan to the coefficient number in a - block. - All zig zag indices beyond 63 are sent to coefficient 64, so that zero runs - past the end of a block in bogus streams get mapped to a known location.*/ -const unsigned char OC_FZIG_ZAG[128]={ - 0, 1, 8,16, 9, 2, 3,10, - 17,24,32,25,18,11, 4, 5, - 12,19,26,33,40,48,41,34, - 27,20,13, 6, 7,14,21,28, - 35,42,49,56,57,50,43,36, - 29,22,15,23,30,37,44,51, - 58,59,52,45,38,31,39,46, - 53,60,61,54,47,55,62,63, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64 -}; - -/*A map from the coefficient number in a block to its index in the zig zag - scan.*/ -const unsigned char OC_IZIG_ZAG[64]={ - 0, 1, 5, 6,14,15,27,28, - 2, 4, 7,13,16,26,29,42, - 3, 8,12,17,25,30,41,43, - 9,11,18,24,31,40,44,53, - 10,19,23,32,39,45,52,54, - 20,22,33,38,46,51,55,60, - 21,34,37,47,50,56,59,61, - 35,36,48,49,57,58,62,63 -}; - -/*A map from physical macro block ordering to bitstream macro block - ordering within a super block.*/ -const unsigned char OC_MB_MAP[2][2]={{0,3},{1,2}}; - -/*A list of the indices in the oc_mb.map array that can be valid for each of - the various chroma decimation types.*/ -const unsigned char OC_MB_MAP_IDXS[TH_PF_NFORMATS][12]={ - {0,1,2,3,4,8}, - {0,1,2,3,4,5,8,9}, - {0,1,2,3,4,6,8,10}, - {0,1,2,3,4,5,6,7,8,9,10,11} -}; - -/*The 
number of indices in the oc_mb.map array that can be valid for each of - the various chroma decimation types.*/ -const unsigned char OC_MB_MAP_NIDXS[TH_PF_NFORMATS]={6,8,8,12}; - -/*The number of extra bits that are coded with each of the DCT tokens. - Each DCT token has some fixed number of additional bits (possibly 0) stored - after the token itself, containing, for example, coefficient magnitude, - sign bits, etc.*/ -const unsigned char OC_DCT_TOKEN_EXTRA_BITS[TH_NDCT_TOKENS]={ - 0,0,0,2,3,4,12,3,6, - 0,0,0,0, - 1,1,1,1,2,3,4,5,6,10, - 1,1,1,1,1,3,4, - 2,3 -}; - - - -int oc_ilog(unsigned _v){ - int ret; - for(ret=0;_v;ret++)_v>>=1; - return ret; -} - - - -void *oc_aligned_malloc(size_t _sz,size_t _align){ - unsigned char *p; - if(_align-1>UCHAR_MAX||(_align&_align-1)||_sz>~(size_t)0-_align)return NULL; - p=(unsigned char *)_ogg_malloc(_sz+_align); - if(p!=NULL){ - int offs; - offs=((p-(unsigned char *)0)-1&_align-1); - p[offs]=offs; - p+=offs+1; - } - return p; -} - -void oc_aligned_free(void *_ptr){ - unsigned char *p; - p=(unsigned char *)_ptr; - if(p!=NULL){ - int offs; - offs=*--p; - _ogg_free(p-offs); - } -} - - -void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz){ - size_t rowsz; - size_t colsz; - size_t datsz; - char *ret; - colsz=_height*sizeof(void *); - rowsz=_sz*_width; - datsz=rowsz*_height; - /*Alloc array and row pointers.*/ - ret=(char *)_ogg_malloc(datsz+colsz); - if(ret==NULL)return NULL; - /*Initialize the array.*/ - if(ret!=NULL){ - size_t i; - void **p; - char *datptr; - p=(void **)ret; - i=_height; - for(datptr=ret+colsz;i-->0;p++,datptr+=rowsz)*p=(void *)datptr; - } - return (void **)ret; -} - -void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz){ - size_t colsz; - size_t rowsz; - size_t datsz; - char *ret; - colsz=_height*sizeof(void *); - rowsz=_sz*_width; - datsz=rowsz*_height; - /*Alloc array and row pointers.*/ - ret=(char *)_ogg_calloc(datsz+colsz,1); - if(ret==NULL)return NULL; - /*Initialize the array.*/ - 
if(ret!=NULL){ - size_t i; - void **p; - char *datptr; - p=(void **)ret; - i=_height; - for(datptr=ret+colsz;i-->0;p++,datptr+=rowsz)*p=(void *)datptr; - } - return (void **)ret; -} - -void oc_free_2d(void *_ptr){ - _ogg_free(_ptr); -} - -/*Fills in a Y'CbCr buffer with a pointer to the image data in the first - buffer, but with the opposite vertical orientation. - _dst: The destination buffer. - This can be the same as _src. - _src: The source buffer.*/ -void oc_ycbcr_buffer_flip(th_ycbcr_buffer _dst, - const th_ycbcr_buffer _src){ - int pli; - for(pli=0;pli<3;pli++){ - _dst[pli].width=_src[pli].width; - _dst[pli].height=_src[pli].height; - _dst[pli].stride=-_src[pli].stride; - _dst[pli].data=_src[pli].data - +(1-_dst[pli].height)*(ptrdiff_t)_dst[pli].stride; - } -} - -const char *th_version_string(void){ - return OC_VENDOR_STRING; -} - -ogg_uint32_t th_version_number(void){ - return (TH_VERSION_MAJOR<<16)+(TH_VERSION_MINOR<<8)+TH_VERSION_SUB; -} - -/*Determines the packet type. - Note that this correctly interprets a 0-byte packet as a video data packet. - Return: 1 for a header packet, 0 for a data packet.*/ -int th_packet_isheader(ogg_packet *_op){ - return _op->bytes>0?_op->packet[0]>>7:0; -} - -/*Determines the frame type of a video data packet. - Note that this correctly interprets a 0-byte packet as a delta frame. - Return: 1 for a key frame, 0 for a delta frame, and -1 for a header - packet.*/ -int th_packet_iskeyframe(ogg_packet *_op){ - return _op->bytes<=0?0:_op->packet[0]&0x80?-1:!(_op->packet[0]&0x40); -} diff --git a/media/libtheora/lib/internal.h b/media/libtheora/lib/internal.h deleted file mode 100644 index 24e1b5125..000000000 --- a/media/libtheora/lib/internal.h +++ /dev/null @@ -1,116 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. 
* - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: internal.h 17578 2010-10-29 04:21:26Z tterribe $ - - ********************************************************************/ -#if !defined(_internal_H) -# define _internal_H (1) -# include <stdlib.h> -# include <limits.h> -# if defined(HAVE_CONFIG_H) -# include "config.h" -# endif -# include "theora/codec.h" -# include "theora/theora.h" -# include "ocintrin.h" - -# if !defined(__GNUC_PREREQ) -# if defined(__GNUC__)&&defined(__GNUC_MINOR__) -# define __GNUC_PREREQ(_maj,_min) \ - ((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min)) -# else -# define __GNUC_PREREQ(_maj,_min) 0 -# endif -# endif - -# if defined(_MSC_VER) -/*Disable missing EMMS warnings.*/ -# pragma warning(disable:4799) -/*Thank you Microsoft, I know the order of operations.*/ -# pragma warning(disable:4554) -# endif -/*You, too, gcc.*/ -# if __GNUC_PREREQ(4,2) -# pragma GCC diagnostic ignored "-Wparentheses" -# endif - -/*Some assembly constructs require aligned operands. - The following macros are _only_ intended for structure member declarations. - Although they will sometimes work on stack variables, gcc will often silently - ignore them. 
- A separate set of macros could be made for manual stack alignment, but we - don't actually require it anywhere.*/ -# if defined(OC_X86_ASM)||defined(OC_ARM_ASM) -# if defined(__GNUC__) -# define OC_ALIGN8(expr) expr __attribute__((aligned(8))) -# define OC_ALIGN16(expr) expr __attribute__((aligned(16))) -# elif defined(_MSC_VER) -# define OC_ALIGN8(expr) __declspec (align(8)) expr -# define OC_ALIGN16(expr) __declspec (align(16)) expr -# else -# error "Alignment macros required for this platform." -# endif -# endif -# if !defined(OC_ALIGN8) -# define OC_ALIGN8(expr) expr -# endif -# if !defined(OC_ALIGN16) -# define OC_ALIGN16(expr) expr -# endif - - - -/*This library's version.*/ -# define OC_VENDOR_STRING "Xiph.Org libtheora 1.2.0alpha 20100924 (Ptalarbvorm)" - -/*Theora bitstream version.*/ -# define TH_VERSION_MAJOR (3) -# define TH_VERSION_MINOR (2) -# define TH_VERSION_SUB (1) -# define TH_VERSION_CHECK(_info,_maj,_min,_sub) \ - ((_info)->version_major>(_maj)||(_info)->version_major==(_maj)&& \ - ((_info)->version_minor>(_min)||(_info)->version_minor==(_min)&& \ - (_info)->version_subminor>=(_sub))) - - - -/*A map from the index in the zig zag scan to the coefficient number in a - block.*/ -extern const unsigned char OC_FZIG_ZAG[128]; -/*A map from the coefficient number in a block to its index in the zig zag - scan.*/ -extern const unsigned char OC_IZIG_ZAG[64]; -/*A map from physical macro block ordering to bitstream macro block - ordering within a super block.*/ -extern const unsigned char OC_MB_MAP[2][2]; -/*A list of the indices in the oc_mb_map array that can be valid for each of - the various chroma decimation types.*/ -extern const unsigned char OC_MB_MAP_IDXS[TH_PF_NFORMATS][12]; -/*The number of indices in the oc_mb_map array that can be valid for each of - the various chroma decimation types.*/ -extern const unsigned char OC_MB_MAP_NIDXS[TH_PF_NFORMATS]; - - - -int oc_ilog(unsigned _v); -void *oc_aligned_malloc(size_t _sz,size_t _align); -void 
oc_aligned_free(void *_ptr); -void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz); -void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz); -void oc_free_2d(void *_ptr); - -void oc_ycbcr_buffer_flip(th_ycbcr_buffer _dst, - const th_ycbcr_buffer _src); - -#endif diff --git a/media/libtheora/lib/mathops.h b/media/libtheora/lib/mathops.h deleted file mode 100644 index a1a4f9df0..000000000 --- a/media/libtheora/lib/mathops.h +++ /dev/null @@ -1,143 +0,0 @@ -#if !defined(_mathops_H) -# define _mathops_H (1) -# include <ogg/ogg.h> - -# if __GNUC_PREREQ(3,4) -# include <limits.h> -/*Note the casts to (int) below: this prevents OC_CLZ{32|64}_OFFS from - "upgrading" the type of an entire expression to an (unsigned) size_t.*/ -# if INT_MAX>=2147483647 -# define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT) -# define OC_CLZ32(_x) (__builtin_clz(_x)) -# elif LONG_MAX>=2147483647L -# define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT) -# define OC_CLZ32(_x) (__builtin_clzl(_x)) -# endif -# if INT_MAX>=9223372036854775807LL -# define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT) -# define OC_CLZ64(_x) (__builtin_clz(_x)) -# elif LONG_MAX>=9223372036854775807LL -# define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT) -# define OC_CLZ64(_x) (__builtin_clzl(_x)) -# elif LLONG_MAX>=9223372036854775807LL|| \ - __LONG_LONG_MAX__>=9223372036854775807LL -# define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT) -# define OC_CLZ64(_x) (__builtin_clzll(_x)) -# endif -# endif - - - -/** - * oc_ilog32 - Integer binary logarithm of a 32-bit value. - * @_v: A 32-bit value. - * Returns floor(log2(_v))+1, or 0 if _v==0. - * This is the number of bits that would be required to represent _v in two's - * complement notation with all of the leading zeros stripped. - * The OC_ILOG_32() or OC_ILOGNZ_32() macros may be able to use a builtin - * function instead, which should be faster. 
- */ -int oc_ilog32(ogg_uint32_t _v); -/** - * oc_ilog64 - Integer binary logarithm of a 64-bit value. - * @_v: A 64-bit value. - * Returns floor(log2(_v))+1, or 0 if _v==0. - * This is the number of bits that would be required to represent _v in two's - * complement notation with all of the leading zeros stripped. - * The OC_ILOG_64() or OC_ILOGNZ_64() macros may be able to use a builtin - * function instead, which should be faster. - */ -int oc_ilog64(ogg_int64_t _v); - - -# if defined(OC_CLZ32) -/** - * OC_ILOGNZ_32 - Integer binary logarithm of a non-zero 32-bit value. - * @_v: A non-zero 32-bit value. - * Returns floor(log2(_v))+1. - * This is the number of bits that would be required to represent _v in two's - * complement notation with all of the leading zeros stripped. - * If _v is zero, the return value is undefined; use OC_ILOG_32() instead. - */ -# define OC_ILOGNZ_32(_v) (OC_CLZ32_OFFS-OC_CLZ32(_v)) -/** - * OC_ILOG_32 - Integer binary logarithm of a 32-bit value. - * @_v: A 32-bit value. - * Returns floor(log2(_v))+1, or 0 if _v==0. - * This is the number of bits that would be required to represent _v in two's - * complement notation with all of the leading zeros stripped. - */ -# define OC_ILOG_32(_v) (OC_ILOGNZ_32(_v)&-!!(_v)) -# else -# define OC_ILOGNZ_32(_v) (oc_ilog32(_v)) -# define OC_ILOG_32(_v) (oc_ilog32(_v)) -# endif - -# if defined(CLZ64) -/** - * OC_ILOGNZ_64 - Integer binary logarithm of a non-zero 64-bit value. - * @_v: A non-zero 64-bit value. - * Returns floor(log2(_v))+1. - * This is the number of bits that would be required to represent _v in two's - * complement notation with all of the leading zeros stripped. - * If _v is zero, the return value is undefined; use OC_ILOG_64() instead. - */ -# define OC_ILOGNZ_64(_v) (CLZ64_OFFS-CLZ64(_v)) -/** - * OC_ILOG_64 - Integer binary logarithm of a 64-bit value. - * @_v: A 64-bit value. - * Returns floor(log2(_v))+1, or 0 if _v==0. 
- * This is the number of bits that would be required to represent _v in two's - * complement notation with all of the leading zeros stripped. - */ -# define OC_ILOG_64(_v) (OC_ILOGNZ_64(_v)&-!!(_v)) -# else -# define OC_ILOGNZ_64(_v) (oc_ilog64(_v)) -# define OC_ILOG_64(_v) (oc_ilog64(_v)) -# endif - -# define OC_STATIC_ILOG0(_v) (!!(_v)) -# define OC_STATIC_ILOG1(_v) (((_v)&0x2)?2:OC_STATIC_ILOG0(_v)) -# define OC_STATIC_ILOG2(_v) \ - (((_v)&0xC)?2+OC_STATIC_ILOG1((_v)>>2):OC_STATIC_ILOG1(_v)) -# define OC_STATIC_ILOG3(_v) \ - (((_v)&0xF0)?4+OC_STATIC_ILOG2((_v)>>4):OC_STATIC_ILOG2(_v)) -# define OC_STATIC_ILOG4(_v) \ - (((_v)&0xFF00)?8+OC_STATIC_ILOG3((_v)>>8):OC_STATIC_ILOG3(_v)) -# define OC_STATIC_ILOG5(_v) \ - (((_v)&0xFFFF0000)?16+OC_STATIC_ILOG4((_v)>>16):OC_STATIC_ILOG4(_v)) -# define OC_STATIC_ILOG6(_v) \ - (((_v)&0xFFFFFFFF00000000ULL)?32+OC_STATIC_ILOG5((_v)>>32):OC_STATIC_ILOG5(_v)) -/** - * OC_STATIC_ILOG_32 - The integer logarithm of an (unsigned, 32-bit) constant. - * @_v: A non-negative 32-bit constant. - * Returns floor(log2(_v))+1, or 0 if _v==0. - * This is the number of bits that would be required to represent _v in two's - * complement notation with all of the leading zeros stripped. - * This macro is suitable for evaluation at compile time, but it should not be - * used on values that can change at runtime, as it operates via exhaustive - * search. - */ -# define OC_STATIC_ILOG_32(_v) (OC_STATIC_ILOG5((ogg_uint32_t)(_v))) -/** - * OC_STATIC_ILOG_64 - The integer logarithm of an (unsigned, 64-bit) constant. - * @_v: A non-negative 64-bit constant. - * Returns floor(log2(_v))+1, or 0 if _v==0. - * This is the number of bits that would be required to represent _v in two's - * complement notation with all of the leading zeros stripped. - * This macro is suitable for evaluation at compile time, but it should not be - * used on values that can change at runtime, as it operates via exhaustive - * search. 
- */ -# define OC_STATIC_ILOG_64(_v) (OC_STATIC_ILOG6((ogg_int64_t)(_v))) - -#define OC_Q57(_v) ((ogg_int64_t)(_v)<<57) -#define OC_Q10(_v) ((_v)<<10) - -ogg_int64_t oc_bexp64(ogg_int64_t _z); -ogg_int64_t oc_blog64(ogg_int64_t _w); - -ogg_uint32_t oc_bexp32_q10(int _z); -int oc_blog32_q10(ogg_uint32_t _w); - -#endif diff --git a/media/libtheora/lib/ocintrin.h b/media/libtheora/lib/ocintrin.h deleted file mode 100644 index d49ebb215..000000000 --- a/media/libtheora/lib/ocintrin.h +++ /dev/null @@ -1,128 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: ocintrin.h 16503 2009-08-22 18:14:02Z giles $ - - ********************************************************************/ - -/*Some common macros for potential platform-specific optimization.*/ -#include <math.h> -#if !defined(_ocintrin_H) -# define _ocintrin_H (1) - -/*Some specific platforms may have optimized intrinsic or inline assembly - versions of these functions which can substantially improve performance. - We define macros for them to allow easy incorporation of these non-ANSI - features.*/ - -/*Note that we do not provide a macro for abs(), because it is provided as a - library function, which we assume is translated into an intrinsic to avoid - the function call overhead and then implemented in the smartest way for the - target platform. 
- With modern gcc (4.x), this is true: it uses cmov instructions if the - architecture supports it and branchless bit-twiddling if it does not (the - speed difference between the two approaches is not measurable). - Interestingly, the bit-twiddling method was patented in 2000 (US 6,073,150) - by Sun Microsystems, despite prior art dating back to at least 1996: - http://web.archive.org/web/19961201174141/www.x86.org/ftp/articles/pentopt/PENTOPT.TXT - On gcc 3.x, however, our assumption is not true, as abs() is translated to a - conditional jump, which is horrible on deeply piplined architectures (e.g., - all consumer architectures for the past decade or more). - Also be warned that -C*abs(x) where C is a constant is mis-optimized as - abs(C*x) on every gcc release before 4.2.3. - See bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34130 */ - -/*Modern gcc (4.x) can compile the naive versions of min and max with cmov if - given an appropriate architecture, but the branchless bit-twiddling versions - are just as fast, and do not require any special target architecture. - Earlier gcc versions (3.x) compiled both code to the same assembly - instructions, because of the way they represented ((_b)>(_a)) internally.*/ -#define OC_MAXI(_a,_b) ((_a)-((_a)-(_b)&-((_b)>(_a)))) -#define OC_MINI(_a,_b) ((_a)+((_b)-(_a)&-((_b)<(_a)))) -/*Clamps an integer into the given range. - If _a>_c, then the lower bound _a is respected over the upper bound _c (this - behavior is required to meet our documented API behavior). - _a: The lower bound. - _b: The value to clamp. 
- _c: The upper boud.*/ -#define OC_CLAMPI(_a,_b,_c) (OC_MAXI(_a,OC_MINI(_b,_c))) -#define OC_CLAMP255(_x) ((unsigned char)((((_x)<0)-1)&((_x)|-((_x)>255)))) -/*This has a chance of compiling branchless, and is just as fast as the - bit-twiddling method, which is slightly less portable, since it relies on a - sign-extended rightshift, which is not guaranteed by ANSI (but present on - every relevant platform).*/ -#define OC_SIGNI(_a) (((_a)>0)-((_a)<0)) -/*Slightly more portable than relying on a sign-extended right-shift (which is - not guaranteed by ANSI), and just as fast, since gcc (3.x and 4.x both) - compile it into the right-shift anyway.*/ -#define OC_SIGNMASK(_a) (-((_a)<0)) -/*Divides an integer by a power of two, truncating towards 0. - _dividend: The integer to divide. - _shift: The non-negative power of two to divide by. - _rmask: (1<<_shift)-1*/ -#define OC_DIV_POW2(_dividend,_shift,_rmask)\ - ((_dividend)+(OC_SIGNMASK(_dividend)&(_rmask))>>(_shift)) -/*Divides _x by 65536, truncating towards 0.*/ -#define OC_DIV2_16(_x) OC_DIV_POW2(_x,16,0xFFFF) -/*Divides _x by 2, truncating towards 0.*/ -#define OC_DIV2(_x) OC_DIV_POW2(_x,1,0x1) -/*Divides _x by 8, truncating towards 0.*/ -#define OC_DIV8(_x) OC_DIV_POW2(_x,3,0x7) -/*Divides _x by 16, truncating towards 0.*/ -#define OC_DIV16(_x) OC_DIV_POW2(_x,4,0xF) -/*Right shifts _dividend by _shift, adding _rval, and subtracting one for - negative dividends first. 
- When _rval is (1<<_shift-1), this is equivalent to division with rounding - ties away from zero.*/ -#define OC_DIV_ROUND_POW2(_dividend,_shift,_rval)\ - ((_dividend)+OC_SIGNMASK(_dividend)+(_rval)>>(_shift)) -/*Divides a _x by 2, rounding towards even numbers.*/ -#define OC_DIV2_RE(_x) ((_x)+((_x)>>1&1)>>1) -/*Divides a _x by (1<<(_shift)), rounding towards even numbers.*/ -#define OC_DIV_POW2_RE(_x,_shift) \ - ((_x)+((_x)>>(_shift)&1)+((1<<(_shift))-1>>1)>>(_shift)) -/*Swaps two integers _a and _b if _a>_b.*/ -#define OC_SORT2I(_a,_b) \ - do{ \ - int t__; \ - t__=((_a)^(_b))&-((_b)<(_a)); \ - (_a)^=t__; \ - (_b)^=t__; \ - } \ - while(0) - -/*Accesses one of four (signed) bytes given an index. - This can be used to avoid small lookup tables.*/ -#define OC_BYTE_TABLE32(_a,_b,_c,_d,_i) \ - ((signed char) \ - (((_a)&0xFF|((_b)&0xFF)<<8|((_c)&0xFF)<<16|((_d)&0xFF)<<24)>>(_i)*8)) -/*Accesses one of eight (unsigned) nibbles given an index. - This can be used to avoid small lookup tables.*/ -#define OC_UNIBBLE_TABLE32(_a,_b,_c,_d,_e,_f,_g,_h,_i) \ - ((((_a)&0xF|((_b)&0xF)<<4|((_c)&0xF)<<8|((_d)&0xF)<<12| \ - ((_e)&0xF)<<16|((_f)&0xF)<<20|((_g)&0xF)<<24|((_h)&0xF)<<28)>>(_i)*4)&0xF) - - - -/*All of these macros should expect floats as arguments.*/ -#define OC_MAXF(_a,_b) ((_a)<(_b)?(_b):(_a)) -#define OC_MINF(_a,_b) ((_a)>(_b)?(_b):(_a)) -#define OC_CLAMPF(_a,_b,_c) (OC_MINF(_a,OC_MAXF(_b,_c))) -#define OC_FABSF(_f) ((float)fabs(_f)) -#define OC_SQRTF(_f) ((float)sqrt(_f)) -#define OC_POWF(_b,_e) ((float)pow(_b,_e)) -#define OC_LOGF(_f) ((float)log(_f)) -#define OC_IFLOORF(_f) ((int)floor(_f)) -#define OC_ICEILF(_f) ((int)ceil(_f)) - -#endif diff --git a/media/libtheora/lib/quant.c b/media/libtheora/lib/quant.c deleted file mode 100644 index c3f3f4771..000000000 --- a/media/libtheora/lib/quant.c +++ /dev/null @@ -1,127 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. 
* - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: quant.c 17307 2010-06-27 06:02:15Z tterribe $ - - ********************************************************************/ - -#include <stdlib.h> -#include <string.h> -#include <ogg/ogg.h> -#include "quant.h" -#include "decint.h" - -/*The maximum output of the DCT with +/- 255 inputs is +/- 8157. - These minimum quantizers ensure the result after quantization (and after - prediction for DC) will be no more than +/- 510. - The tokenization system can handle values up to +/- 580, so there is no need - to do any coefficient clamping. - I would rather have allowed smaller quantizers and had to clamp, but these - minimums were required when constructing the original VP3 matrices and have - been formalized in the spec.*/ -static const unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2}; -static const unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2}; - -/*Initializes the dequantization tables from a set of quantizer info. - Currently the dequantizer (and elsewhere enquantizer) tables are expected to - be initialized as pointing to the storage reserved for them in the - oc_theora_state (resp. oc_enc_ctx) structure. - If some tables are duplicates of others, the pointers will be adjusted to - point to a single copy of the tables, but the storage for them will not be - freed. - If you're concerned about the memory footprint, the obvious thing to do is - to move the storage out of its fixed place in the structures and allocate - it on demand. 
- However, a much, much better option is to only store the quantization - matrices being used for the current frame, and to recalculate these as the - qi values change between frames (this is what VP3 did).*/ -void oc_dequant_tables_init(ogg_uint16_t *_dequant[64][3][2], - int _pp_dc_scale[64],const th_quant_info *_qinfo){ - /*Coding mode: intra or inter.*/ - int qti; - /*Y', C_b, C_r*/ - int pli; - for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){ - /*Quality index.*/ - int qi; - /*Range iterator.*/ - int qri; - for(qi=0,qri=0;qri<=_qinfo->qi_ranges[qti][pli].nranges;qri++){ - th_quant_base base; - ogg_uint32_t q; - int qi_start; - int qi_end; - memcpy(base,_qinfo->qi_ranges[qti][pli].base_matrices[qri], - sizeof(base)); - qi_start=qi; - if(qri==_qinfo->qi_ranges[qti][pli].nranges)qi_end=qi+1; - else qi_end=qi+_qinfo->qi_ranges[qti][pli].sizes[qri]; - /*Iterate over quality indicies in this range.*/ - for(;;){ - ogg_uint32_t qfac; - int zzi; - int ci; - /*In the original VP3.2 code, the rounding offset and the size of the - dead zone around 0 were controlled by a "sharpness" parameter. - The size of our dead zone is now controlled by the per-coefficient - quality thresholds returned by our HVS module. - We round down from a more accurate value when the quality of the - reconstruction does not fall below our threshold and it saves bits. 
- Hence, all of that VP3.2 code is gone from here, and the remaining - floating point code has been implemented as equivalent integer code - with exact precision.*/ - qfac=(ogg_uint32_t)_qinfo->dc_scale[qi]*base[0]; - /*For postprocessing, not dequantization.*/ - if(_pp_dc_scale!=NULL)_pp_dc_scale[qi]=(int)(qfac/160); - /*Scale DC the coefficient from the proper table.*/ - q=(qfac/100)<<2; - q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX); - _dequant[qi][pli][qti][0]=(ogg_uint16_t)q; - /*Now scale AC coefficients from the proper table.*/ - for(zzi=1;zzi<64;zzi++){ - q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[OC_FZIG_ZAG[zzi]]/100)<<2; - q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX); - _dequant[qi][pli][qti][zzi]=(ogg_uint16_t)q; - } - /*If this is a duplicate of a previous matrix, use that instead. - This simple check helps us improve cache coherency later.*/ - { - int dupe; - int qtj; - int plj; - dupe=0; - for(qtj=0;qtj<=qti;qtj++){ - for(plj=0;plj<(qtj<qti?3:pli);plj++){ - if(!memcmp(_dequant[qi][pli][qti],_dequant[qi][plj][qtj], - sizeof(oc_quant_table))){ - dupe=1; - break; - } - } - if(dupe)break; - } - if(dupe)_dequant[qi][pli][qti]=_dequant[qi][plj][qtj]; - } - if(++qi>=qi_end)break; - /*Interpolate the next base matrix.*/ - for(ci=0;ci<64;ci++){ - base[ci]=(unsigned char)( - (2*((qi_end-qi)*_qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+ - (qi-qi_start)*_qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci]) - +_qinfo->qi_ranges[qti][pli].sizes[qri])/ - (2*_qinfo->qi_ranges[qti][pli].sizes[qri])); - } - } - } - } -} diff --git a/media/libtheora/lib/quant.h b/media/libtheora/lib/quant.h deleted file mode 100644 index 49ce13a65..000000000 --- a/media/libtheora/lib/quant.h +++ /dev/null @@ -1,33 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. 
* - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: quant.h 16503 2009-08-22 18:14:02Z giles $ - - ********************************************************************/ - -#if !defined(_quant_H) -# define _quant_H (1) -# include "theora/codec.h" -# include "ocintrin.h" - -typedef ogg_uint16_t oc_quant_table[64]; - - -/*Maximum scaled quantizer value.*/ -#define OC_QUANT_MAX (1024<<2) - - -void oc_dequant_tables_init(ogg_uint16_t *_dequant[64][3][2], - int _pp_dc_scale[64],const th_quant_info *_qinfo); - -#endif diff --git a/media/libtheora/lib/state.c b/media/libtheora/lib/state.c deleted file mode 100644 index 5e7b0ae65..000000000 --- a/media/libtheora/lib/state.c +++ /dev/null @@ -1,1260 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: state.c 17576 2010-10-29 01:07:51Z tterribe $ - - ********************************************************************/ - -#include <stdlib.h> -#include <string.h> -#include "state.h" -#if defined(OC_DUMP_IMAGES) -# include <stdio.h> -# include "png.h" -#endif - -/*The function used to fill in the chroma plane motion vectors for a macro - block when 4 different motion vectors are specified in the luma plane. - This version is for use with chroma decimated in the X and Y directions - (4:2:0). - _cbmvs: The chroma block-level motion vectors to fill in. - _lbmvs: The luma block-level motion vectors.*/ -static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){ - int dx; - int dy; - dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[1]) - +OC_MV_X(_lbmvs[2])+OC_MV_X(_lbmvs[3]); - dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[1]) - +OC_MV_Y(_lbmvs[2])+OC_MV_Y(_lbmvs[3]); - _cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,2,2),OC_DIV_ROUND_POW2(dy,2,2)); -} - -/*The function used to fill in the chroma plane motion vectors for a macro - block when 4 different motion vectors are specified in the luma plane. - This version is for use with chroma decimated in the Y direction. - _cbmvs: The chroma block-level motion vectors to fill in. 
- _lbmvs: The luma block-level motion vectors.*/ -static void oc_set_chroma_mvs01(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){ - int dx; - int dy; - dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[2]); - dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[2]); - _cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1)); - dx=OC_MV_X(_lbmvs[1])+OC_MV_X(_lbmvs[3]); - dy=OC_MV_Y(_lbmvs[1])+OC_MV_Y(_lbmvs[3]); - _cbmvs[1]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1)); -} - -/*The function used to fill in the chroma plane motion vectors for a macro - block when 4 different motion vectors are specified in the luma plane. - This version is for use with chroma decimated in the X direction (4:2:2). - _cbmvs: The chroma block-level motion vectors to fill in. - _lbmvs: The luma block-level motion vectors.*/ -static void oc_set_chroma_mvs10(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){ - int dx; - int dy; - dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[1]); - dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[1]); - _cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1)); - dx=OC_MV_X(_lbmvs[2])+OC_MV_X(_lbmvs[3]); - dy=OC_MV_Y(_lbmvs[2])+OC_MV_Y(_lbmvs[3]); - _cbmvs[2]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1)); -} - -/*The function used to fill in the chroma plane motion vectors for a macro - block when 4 different motion vectors are specified in the luma plane. - This version is for use with no chroma decimation (4:4:4). - _cbmvs: The chroma block-level motion vectors to fill in. - _lmbmv: The luma macro-block level motion vector to fill in for use in - prediction. 
- _lbmvs: The luma block-level motion vectors.*/ -static void oc_set_chroma_mvs11(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){ - _cbmvs[0]=_lbmvs[0]; - _cbmvs[1]=_lbmvs[1]; - _cbmvs[2]=_lbmvs[2]; - _cbmvs[3]=_lbmvs[3]; -} - -/*A table of functions used to fill in the chroma plane motion vectors for a - macro block when 4 different motion vectors are specified in the luma - plane.*/ -const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]={ - (oc_set_chroma_mvs_func)oc_set_chroma_mvs00, - (oc_set_chroma_mvs_func)oc_set_chroma_mvs01, - (oc_set_chroma_mvs_func)oc_set_chroma_mvs10, - (oc_set_chroma_mvs_func)oc_set_chroma_mvs11 -}; - - - -/*Returns the fragment index of the top-left block in a macro block. - This can be used to test whether or not the whole macro block is valid. - _sb_map: The super block map. - _quadi: The quadrant number. - Return: The index of the fragment of the upper left block in the macro - block, or -1 if the block lies outside the coded frame.*/ -static ptrdiff_t oc_sb_quad_top_left_frag(oc_sb_map_quad _sb_map[4],int _quadi){ - /*It so happens that under the Hilbert curve ordering described below, the - upper-left block in each macro block is at index 0, except in macro block - 3, where it is at index 2.*/ - return _sb_map[_quadi][_quadi&_quadi<<1]; -} - -/*Fills in the mapping from block positions to fragment numbers for a single - color plane. - This function also fills in the "valid" flag of each quadrant in the super - block flags. - _sb_maps: The array of super block maps for the color plane. - _sb_flags: The array of super block flags for the color plane. - _frag0: The index of the first fragment in the plane. - _hfrags: The number of horizontal fragments in a coded frame. 
- _vfrags: The number of vertical fragments in a coded frame.*/ -static void oc_sb_create_plane_mapping(oc_sb_map _sb_maps[], - oc_sb_flags _sb_flags[],ptrdiff_t _frag0,int _hfrags,int _vfrags){ - /*Contains the (macro_block,block) indices for a 4x4 grid of - fragments. - The pattern is a 4x4 Hilbert space-filling curve. - A Hilbert curve has the nice property that as the curve grows larger, its - fractal dimension approaches 2. - The intuition is that nearby blocks in the curve are also close spatially, - with the previous element always an immediate neighbor, so that runs of - blocks should be well correlated.*/ - static const int SB_MAP[4][4][2]={ - {{0,0},{0,1},{3,2},{3,3}}, - {{0,3},{0,2},{3,1},{3,0}}, - {{1,0},{1,3},{2,0},{2,3}}, - {{1,1},{1,2},{2,1},{2,2}} - }; - ptrdiff_t yfrag; - unsigned sbi; - int y; - sbi=0; - yfrag=_frag0; - for(y=0;;y+=4){ - int imax; - int x; - /*Figure out how many columns of blocks in this super block lie within the - image.*/ - imax=_vfrags-y; - if(imax>4)imax=4; - else if(imax<=0)break; - for(x=0;;x+=4,sbi++){ - ptrdiff_t xfrag; - int jmax; - int quadi; - int i; - /*Figure out how many rows of blocks in this super block lie within the - image.*/ - jmax=_hfrags-x; - if(jmax>4)jmax=4; - else if(jmax<=0)break; - /*By default, set all fragment indices to -1.*/ - memset(_sb_maps[sbi],0xFF,sizeof(_sb_maps[sbi])); - /*Fill in the fragment map for this super block.*/ - xfrag=yfrag+x; - for(i=0;i<imax;i++){ - int j; - for(j=0;j<jmax;j++){ - _sb_maps[sbi][SB_MAP[i][j][0]][SB_MAP[i][j][1]]=xfrag+j; - } - xfrag+=_hfrags; - } - /*Mark which quadrants of this super block lie within the image.*/ - for(quadi=0;quadi<4;quadi++){ - _sb_flags[sbi].quad_valid|= - (oc_sb_quad_top_left_frag(_sb_maps[sbi],quadi)>=0)<<quadi; - } - } - yfrag+=_hfrags<<2; - } -} - -/*Fills in the Y plane fragment map for a macro block given the fragment - coordinates of its upper-left hand corner. - _mb_map: The macro block map to fill. 
- _fplane: The description of the Y plane. - _xfrag0: The X location of the upper-left hand fragment in the luma plane. - _yfrag0: The Y location of the upper-left hand fragment in the luma plane.*/ -static void oc_mb_fill_ymapping(oc_mb_map_plane _mb_map[3], - const oc_fragment_plane *_fplane,int _xfrag0,int _yfrag0){ - int i; - int j; - for(i=0;i<2;i++)for(j=0;j<2;j++){ - _mb_map[0][i<<1|j]=(_yfrag0+i)*(ptrdiff_t)_fplane->nhfrags+_xfrag0+j; - } -} - -/*Fills in the chroma plane fragment maps for a macro block. - This version is for use with chroma decimated in the X and Y directions - (4:2:0). - _mb_map: The macro block map to fill. - _fplanes: The descriptions of the fragment planes. - _xfrag0: The X location of the upper-left hand fragment in the luma plane. - _yfrag0: The Y location of the upper-left hand fragment in the luma plane.*/ -static void oc_mb_fill_cmapping00(oc_mb_map_plane _mb_map[3], - const oc_fragment_plane _fplanes[3],int _xfrag0,int _yfrag0){ - ptrdiff_t fragi; - _xfrag0>>=1; - _yfrag0>>=1; - fragi=_yfrag0*(ptrdiff_t)_fplanes[1].nhfrags+_xfrag0; - _mb_map[1][0]=fragi+_fplanes[1].froffset; - _mb_map[2][0]=fragi+_fplanes[2].froffset; -} - -/*Fills in the chroma plane fragment maps for a macro block. - This version is for use with chroma decimated in the Y direction. - _mb_map: The macro block map to fill. - _fplanes: The descriptions of the fragment planes. - _xfrag0: The X location of the upper-left hand fragment in the luma plane. - _yfrag0: The Y location of the upper-left hand fragment in the luma plane.*/ -static void oc_mb_fill_cmapping01(oc_mb_map_plane _mb_map[3], - const oc_fragment_plane _fplanes[3],int _xfrag0,int _yfrag0){ - ptrdiff_t fragi; - int j; - _yfrag0>>=1; - fragi=_yfrag0*(ptrdiff_t)_fplanes[1].nhfrags+_xfrag0; - for(j=0;j<2;j++){ - _mb_map[1][j]=fragi+_fplanes[1].froffset; - _mb_map[2][j]=fragi+_fplanes[2].froffset; - fragi++; - } -} - -/*Fills in the chroma plane fragment maps for a macro block. 
- This version is for use with chroma decimated in the X direction (4:2:2). - _mb_map: The macro block map to fill. - _fplanes: The descriptions of the fragment planes. - _xfrag0: The X location of the upper-left hand fragment in the luma plane. - _yfrag0: The Y location of the upper-left hand fragment in the luma plane.*/ -static void oc_mb_fill_cmapping10(oc_mb_map_plane _mb_map[3], - const oc_fragment_plane _fplanes[3],int _xfrag0,int _yfrag0){ - ptrdiff_t fragi; - int i; - _xfrag0>>=1; - fragi=_yfrag0*(ptrdiff_t)_fplanes[1].nhfrags+_xfrag0; - for(i=0;i<2;i++){ - _mb_map[1][i<<1]=fragi+_fplanes[1].froffset; - _mb_map[2][i<<1]=fragi+_fplanes[2].froffset; - fragi+=_fplanes[1].nhfrags; - } -} - -/*Fills in the chroma plane fragment maps for a macro block. - This version is for use with no chroma decimation (4:4:4). - This uses the already filled-in luma plane values. - _mb_map: The macro block map to fill. - _fplanes: The descriptions of the fragment planes.*/ -static void oc_mb_fill_cmapping11(oc_mb_map_plane _mb_map[3], - const oc_fragment_plane _fplanes[3]){ - int k; - for(k=0;k<4;k++){ - _mb_map[1][k]=_mb_map[0][k]+_fplanes[1].froffset; - _mb_map[2][k]=_mb_map[0][k]+_fplanes[2].froffset; - } -} - -/*The function type used to fill in the chroma plane fragment maps for a - macro block. - _mb_map: The macro block map to fill. - _fplanes: The descriptions of the fragment planes. - _xfrag0: The X location of the upper-left hand fragment in the luma plane. 
- _yfrag0: The Y location of the upper-left hand fragment in the luma plane.*/ -typedef void (*oc_mb_fill_cmapping_func)(oc_mb_map_plane _mb_map[3], - const oc_fragment_plane _fplanes[3],int _xfrag0,int _yfrag0); - -/*A table of functions used to fill in the chroma plane fragment maps for a - macro block for each type of chrominance decimation.*/ -static const oc_mb_fill_cmapping_func OC_MB_FILL_CMAPPING_TABLE[4]={ - oc_mb_fill_cmapping00, - oc_mb_fill_cmapping01, - oc_mb_fill_cmapping10, - (oc_mb_fill_cmapping_func)oc_mb_fill_cmapping11 -}; - -/*Fills in the mapping from macro blocks to their corresponding fragment - numbers in each plane. - _mb_maps: The list of macro block maps. - _mb_modes: The list of macro block modes; macro blocks completely outside - the coded region are marked invalid. - _fplanes: The descriptions of the fragment planes. - _pixel_fmt: The chroma decimation type.*/ -static void oc_mb_create_mapping(oc_mb_map _mb_maps[], - signed char _mb_modes[],const oc_fragment_plane _fplanes[3],int _pixel_fmt){ - oc_mb_fill_cmapping_func mb_fill_cmapping; - unsigned sbi; - int y; - mb_fill_cmapping=OC_MB_FILL_CMAPPING_TABLE[_pixel_fmt]; - /*Loop through the luma plane super blocks.*/ - for(sbi=y=0;y<_fplanes[0].nvfrags;y+=4){ - int x; - for(x=0;x<_fplanes[0].nhfrags;x+=4,sbi++){ - int ymb; - /*Loop through the macro blocks in each super block in display order.*/ - for(ymb=0;ymb<2;ymb++){ - int xmb; - for(xmb=0;xmb<2;xmb++){ - unsigned mbi; - int mbx; - int mby; - mbi=sbi<<2|OC_MB_MAP[ymb][xmb]; - mbx=x|xmb<<1; - mby=y|ymb<<1; - /*Initialize fragment indices to -1.*/ - memset(_mb_maps[mbi],0xFF,sizeof(_mb_maps[mbi])); - /*Make sure this macro block is within the encoded region.*/ - if(mbx>=_fplanes[0].nhfrags||mby>=_fplanes[0].nvfrags){ - _mb_modes[mbi]=OC_MODE_INVALID; - continue; - } - /*Fill in the fragment indices for the luma plane.*/ - oc_mb_fill_ymapping(_mb_maps[mbi],_fplanes,mbx,mby); - /*Fill in the fragment indices for the chroma planes.*/ - 
(*mb_fill_cmapping)(_mb_maps[mbi],_fplanes,mbx,mby); - } - } - } - } -} - -/*Marks the fragments which fall all or partially outside the displayable - region of the frame. - _state: The Theora state containing the fragments to be marked.*/ -static void oc_state_border_init(oc_theora_state *_state){ - oc_fragment *frag; - oc_fragment *yfrag_end; - oc_fragment *xfrag_end; - oc_fragment_plane *fplane; - int crop_x0; - int crop_y0; - int crop_xf; - int crop_yf; - int pli; - int y; - int x; - /*The method we use here is slow, but the code is dead simple and handles - all the special cases easily. - We only ever need to do it once.*/ - /*Loop through the fragments, marking those completely outside the - displayable region and constructing a border mask for those that straddle - the border.*/ - _state->nborders=0; - yfrag_end=frag=_state->frags; - for(pli=0;pli<3;pli++){ - fplane=_state->fplanes+pli; - /*Set up the cropping rectangle for this plane.*/ - crop_x0=_state->info.pic_x; - crop_xf=_state->info.pic_x+_state->info.pic_width; - crop_y0=_state->info.pic_y; - crop_yf=_state->info.pic_y+_state->info.pic_height; - if(pli>0){ - if(!(_state->info.pixel_fmt&1)){ - crop_x0=crop_x0>>1; - crop_xf=crop_xf+1>>1; - } - if(!(_state->info.pixel_fmt&2)){ - crop_y0=crop_y0>>1; - crop_yf=crop_yf+1>>1; - } - } - y=0; - for(yfrag_end+=fplane->nfrags;frag<yfrag_end;y+=8){ - x=0; - for(xfrag_end=frag+fplane->nhfrags;frag<xfrag_end;frag++,x+=8){ - /*First check to see if this fragment is completely outside the - displayable region.*/ - /*Note the special checks for an empty cropping rectangle. 
- This guarantees that if we count a fragment as straddling the - border below, at least one pixel in the fragment will be inside - the displayable region.*/ - if(x+8<=crop_x0||crop_xf<=x||y+8<=crop_y0||crop_yf<=y|| - crop_x0>=crop_xf||crop_y0>=crop_yf){ - frag->invalid=1; - } - /*Otherwise, check to see if it straddles the border.*/ - else if(x<crop_x0&&crop_x0<x+8||x<crop_xf&&crop_xf<x+8|| - y<crop_y0&&crop_y0<y+8||y<crop_yf&&crop_yf<y+8){ - ogg_int64_t mask; - int npixels; - int i; - mask=npixels=0; - for(i=0;i<8;i++){ - int j; - for(j=0;j<8;j++){ - if(x+j>=crop_x0&&x+j<crop_xf&&y+i>=crop_y0&&y+i<crop_yf){ - mask|=(ogg_int64_t)1<<(i<<3|j); - npixels++; - } - } - } - /*Search the fragment array for border info with the same pattern. - In general, there will be at most 8 different patterns (per - plane).*/ - for(i=0;;i++){ - if(i>=_state->nborders){ - _state->nborders++; - _state->borders[i].mask=mask; - _state->borders[i].npixels=npixels; - } - else if(_state->borders[i].mask!=mask)continue; - frag->borderi=i; - break; - } - } - else frag->borderi=-1; - } - } - } -} - -static int oc_state_frarray_init(oc_theora_state *_state){ - int yhfrags; - int yvfrags; - int chfrags; - int cvfrags; - ptrdiff_t yfrags; - ptrdiff_t cfrags; - ptrdiff_t nfrags; - unsigned yhsbs; - unsigned yvsbs; - unsigned chsbs; - unsigned cvsbs; - unsigned ysbs; - unsigned csbs; - unsigned nsbs; - size_t nmbs; - int hdec; - int vdec; - int pli; - /*Figure out the number of fragments in each plane.*/ - /*These parameters have already been validated to be multiples of 16.*/ - yhfrags=_state->info.frame_width>>3; - yvfrags=_state->info.frame_height>>3; - hdec=!(_state->info.pixel_fmt&1); - vdec=!(_state->info.pixel_fmt&2); - chfrags=yhfrags+hdec>>hdec; - cvfrags=yvfrags+vdec>>vdec; - yfrags=yhfrags*(ptrdiff_t)yvfrags; - cfrags=chfrags*(ptrdiff_t)cvfrags; - nfrags=yfrags+2*cfrags; - /*Figure out the number of super blocks in each plane.*/ - yhsbs=yhfrags+3>>2; - yvsbs=yvfrags+3>>2; - 
chsbs=chfrags+3>>2; - cvsbs=cvfrags+3>>2; - ysbs=yhsbs*yvsbs; - csbs=chsbs*cvsbs; - nsbs=ysbs+2*csbs; - nmbs=(size_t)ysbs<<2; - /*Check for overflow. - We support the ridiculous upper limits of the specification (1048560 by - 1048560, or 3 TB frames) if the target architecture has 64-bit pointers, - but for those with 32-bit pointers (or smaller!) we have to check. - If the caller wants to prevent denial-of-service by imposing a more - reasonable upper limit on the size of attempted allocations, they must do - so themselves; we have no platform independent way to determine how much - system memory there is nor an application-independent way to decide what a - "reasonable" allocation is.*/ - if(yfrags/yhfrags!=yvfrags||2*cfrags<cfrags||nfrags<yfrags|| - ysbs/yhsbs!=yvsbs||2*csbs<csbs||nsbs<ysbs||nmbs>>2!=ysbs){ - return TH_EIMPL; - } - /*Initialize the fragment array.*/ - _state->fplanes[0].nhfrags=yhfrags; - _state->fplanes[0].nvfrags=yvfrags; - _state->fplanes[0].froffset=0; - _state->fplanes[0].nfrags=yfrags; - _state->fplanes[0].nhsbs=yhsbs; - _state->fplanes[0].nvsbs=yvsbs; - _state->fplanes[0].sboffset=0; - _state->fplanes[0].nsbs=ysbs; - _state->fplanes[1].nhfrags=_state->fplanes[2].nhfrags=chfrags; - _state->fplanes[1].nvfrags=_state->fplanes[2].nvfrags=cvfrags; - _state->fplanes[1].froffset=yfrags; - _state->fplanes[2].froffset=yfrags+cfrags; - _state->fplanes[1].nfrags=_state->fplanes[2].nfrags=cfrags; - _state->fplanes[1].nhsbs=_state->fplanes[2].nhsbs=chsbs; - _state->fplanes[1].nvsbs=_state->fplanes[2].nvsbs=cvsbs; - _state->fplanes[1].sboffset=ysbs; - _state->fplanes[2].sboffset=ysbs+csbs; - _state->fplanes[1].nsbs=_state->fplanes[2].nsbs=csbs; - _state->nfrags=nfrags; - _state->frags=_ogg_calloc(nfrags,sizeof(*_state->frags)); - _state->frag_mvs=_ogg_malloc(nfrags*sizeof(*_state->frag_mvs)); - _state->nsbs=nsbs; - _state->sb_maps=_ogg_malloc(nsbs*sizeof(*_state->sb_maps)); - _state->sb_flags=_ogg_calloc(nsbs,sizeof(*_state->sb_flags)); - 
_state->nhmbs=yhsbs<<1; - _state->nvmbs=yvsbs<<1; - _state->nmbs=nmbs; - _state->mb_maps=_ogg_calloc(nmbs,sizeof(*_state->mb_maps)); - _state->mb_modes=_ogg_calloc(nmbs,sizeof(*_state->mb_modes)); - _state->coded_fragis=_ogg_malloc(nfrags*sizeof(*_state->coded_fragis)); - if(_state->frags==NULL||_state->frag_mvs==NULL||_state->sb_maps==NULL|| - _state->sb_flags==NULL||_state->mb_maps==NULL||_state->mb_modes==NULL|| - _state->coded_fragis==NULL){ - return TH_EFAULT; - } - /*Create the mapping from super blocks to fragments.*/ - for(pli=0;pli<3;pli++){ - oc_fragment_plane *fplane; - fplane=_state->fplanes+pli; - oc_sb_create_plane_mapping(_state->sb_maps+fplane->sboffset, - _state->sb_flags+fplane->sboffset,fplane->froffset, - fplane->nhfrags,fplane->nvfrags); - } - /*Create the mapping from macro blocks to fragments.*/ - oc_mb_create_mapping(_state->mb_maps,_state->mb_modes, - _state->fplanes,_state->info.pixel_fmt); - /*Initialize the invalid and borderi fields of each fragment.*/ - oc_state_border_init(_state); - return 0; -} - -static void oc_state_frarray_clear(oc_theora_state *_state){ - _ogg_free(_state->coded_fragis); - _ogg_free(_state->mb_modes); - _ogg_free(_state->mb_maps); - _ogg_free(_state->sb_flags); - _ogg_free(_state->sb_maps); - _ogg_free(_state->frag_mvs); - _ogg_free(_state->frags); -} - - -/*Initializes the buffers used for reconstructed frames. - These buffers are padded with 16 extra pixels on each side, to allow - unrestricted motion vectors without special casing the boundary. - If chroma is decimated in either direction, the padding is reduced by a - factor of 2 on the appropriate sides. 
- _nrefs: The number of reference buffers to init; must be in the range 3...6.*/ -static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){ - th_info *info; - unsigned char *ref_frame_data; - size_t ref_frame_data_sz; - size_t ref_frame_sz; - size_t yplane_sz; - size_t cplane_sz; - int yhstride; - int yheight; - int chstride; - int cheight; - ptrdiff_t align; - ptrdiff_t yoffset; - ptrdiff_t coffset; - ptrdiff_t *frag_buf_offs; - ptrdiff_t fragi; - int hdec; - int vdec; - int rfi; - int pli; - if(_nrefs<3||_nrefs>6)return TH_EINVAL; - info=&_state->info; - /*Compute the image buffer parameters for each plane.*/ - hdec=!(info->pixel_fmt&1); - vdec=!(info->pixel_fmt&2); - yhstride=info->frame_width+2*OC_UMV_PADDING; - yheight=info->frame_height+2*OC_UMV_PADDING; - /*Require 16-byte aligned rows in the chroma planes.*/ - chstride=(yhstride>>hdec)+15&~15; - cheight=yheight>>vdec; - yplane_sz=yhstride*(size_t)yheight; - cplane_sz=chstride*(size_t)cheight; - yoffset=OC_UMV_PADDING+OC_UMV_PADDING*(ptrdiff_t)yhstride; - coffset=(OC_UMV_PADDING>>hdec)+(OC_UMV_PADDING>>vdec)*(ptrdiff_t)chstride; - /*Although we guarantee the rows of the chroma planes are a multiple of 16 - bytes, the initial padding on the first row may only be 8 bytes. - Compute the offset needed to the actual image data to a multiple of 16.*/ - align=-coffset&15; - ref_frame_sz=yplane_sz+2*cplane_sz+16; - ref_frame_data_sz=_nrefs*ref_frame_sz; - /*Check for overflow. 
- The same caveats apply as for oc_state_frarray_init().*/ - if(yplane_sz/yhstride!=(size_t)yheight||2*cplane_sz+16<cplane_sz|| - ref_frame_sz<yplane_sz||ref_frame_data_sz/_nrefs!=ref_frame_sz){ - return TH_EIMPL; - } - ref_frame_data=oc_aligned_malloc(ref_frame_data_sz,16); - frag_buf_offs=_state->frag_buf_offs= - _ogg_malloc(_state->nfrags*sizeof(*frag_buf_offs)); - if(ref_frame_data==NULL||frag_buf_offs==NULL){ - _ogg_free(frag_buf_offs); - oc_aligned_free(ref_frame_data); - return TH_EFAULT; - } - /*Set up the width, height and stride for the image buffers.*/ - _state->ref_frame_bufs[0][0].width=info->frame_width; - _state->ref_frame_bufs[0][0].height=info->frame_height; - _state->ref_frame_bufs[0][0].stride=yhstride; - _state->ref_frame_bufs[0][1].width=_state->ref_frame_bufs[0][2].width= - info->frame_width>>hdec; - _state->ref_frame_bufs[0][1].height=_state->ref_frame_bufs[0][2].height= - info->frame_height>>vdec; - _state->ref_frame_bufs[0][1].stride=_state->ref_frame_bufs[0][2].stride= - chstride; - for(rfi=1;rfi<_nrefs;rfi++){ - memcpy(_state->ref_frame_bufs[rfi],_state->ref_frame_bufs[0], - sizeof(_state->ref_frame_bufs[0])); - } - _state->ref_frame_handle=ref_frame_data; - /*Set up the data pointers for the image buffers.*/ - for(rfi=0;rfi<_nrefs;rfi++){ - _state->ref_frame_bufs[rfi][0].data=ref_frame_data+yoffset; - ref_frame_data+=yplane_sz+align; - _state->ref_frame_bufs[rfi][1].data=ref_frame_data+coffset; - ref_frame_data+=cplane_sz; - _state->ref_frame_bufs[rfi][2].data=ref_frame_data+coffset; - ref_frame_data+=cplane_sz+(16-align); - /*Flip the buffer upside down. 
- This allows us to decode Theora's bottom-up frames in their natural - order, yet return a top-down buffer with a positive stride to the user.*/ - oc_ycbcr_buffer_flip(_state->ref_frame_bufs[rfi], - _state->ref_frame_bufs[rfi]); - } - _state->ref_ystride[0]=-yhstride; - _state->ref_ystride[1]=_state->ref_ystride[2]=-chstride; - /*Initialize the fragment buffer offsets.*/ - ref_frame_data=_state->ref_frame_bufs[0][0].data; - fragi=0; - for(pli=0;pli<3;pli++){ - th_img_plane *iplane; - oc_fragment_plane *fplane; - unsigned char *vpix; - ptrdiff_t stride; - ptrdiff_t vfragi_end; - int nhfrags; - iplane=_state->ref_frame_bufs[0]+pli; - fplane=_state->fplanes+pli; - vpix=iplane->data; - vfragi_end=fplane->froffset+fplane->nfrags; - nhfrags=fplane->nhfrags; - stride=iplane->stride; - while(fragi<vfragi_end){ - ptrdiff_t hfragi_end; - unsigned char *hpix; - hpix=vpix; - for(hfragi_end=fragi+nhfrags;fragi<hfragi_end;fragi++){ - frag_buf_offs[fragi]=hpix-ref_frame_data; - hpix+=8; - } - vpix+=stride<<3; - } - } - /*Initialize the reference frame pointers and indices.*/ - _state->ref_frame_idx[OC_FRAME_GOLD]= - _state->ref_frame_idx[OC_FRAME_PREV]= - _state->ref_frame_idx[OC_FRAME_GOLD_ORIG]= - _state->ref_frame_idx[OC_FRAME_PREV_ORIG]= - _state->ref_frame_idx[OC_FRAME_SELF]= - _state->ref_frame_idx[OC_FRAME_IO]=-1; - _state->ref_frame_data[OC_FRAME_GOLD]= - _state->ref_frame_data[OC_FRAME_PREV]= - _state->ref_frame_data[OC_FRAME_GOLD_ORIG]= - _state->ref_frame_data[OC_FRAME_PREV_ORIG]= - _state->ref_frame_data[OC_FRAME_SELF]= - _state->ref_frame_data[OC_FRAME_IO]=NULL; - return 0; -} - -static void oc_state_ref_bufs_clear(oc_theora_state *_state){ - _ogg_free(_state->frag_buf_offs); - oc_aligned_free(_state->ref_frame_handle); -} - - -void oc_state_accel_init_c(oc_theora_state *_state){ - _state->cpu_flags=0; -#if defined(OC_STATE_USE_VTABLE) - _state->opt_vtable.frag_copy=oc_frag_copy_c; - _state->opt_vtable.frag_copy_list=oc_frag_copy_list_c; - 
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c; - _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c; - _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c; - _state->opt_vtable.idct8x8=oc_idct8x8_c; - _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c; - _state->opt_vtable.loop_filter_init=oc_loop_filter_init_c; - _state->opt_vtable.state_loop_filter_frag_rows= - oc_state_loop_filter_frag_rows_c; - _state->opt_vtable.restore_fpu=oc_restore_fpu_c; -#endif - _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG; -} - - -int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){ - int ret; - /*First validate the parameters.*/ - if(_info==NULL)return TH_EFAULT; - /*The width and height of the encoded frame must be multiples of 16. - They must also, when divided by 16, fit into a 16-bit unsigned integer. - The displayable frame offset coordinates must fit into an 8-bit unsigned - integer. - Note that the offset Y in the API is specified on the opposite side from - how it is specified in the bitstream, because the Y axis is flipped in - the bitstream. - The displayable frame must fit inside the encoded frame. - The color space must be one known by the encoder.*/ - if((_info->frame_width&0xF)||(_info->frame_height&0xF)|| - _info->frame_width<=0||_info->frame_width>=0x100000|| - _info->frame_height<=0||_info->frame_height>=0x100000|| - _info->pic_x+_info->pic_width>_info->frame_width|| - _info->pic_y+_info->pic_height>_info->frame_height|| - _info->pic_x>255||_info->frame_height-_info->pic_height-_info->pic_y>255|| - /*Note: the following <0 comparisons may generate spurious warnings on - platforms where enums are unsigned. - We could cast them to unsigned and just use the following >= comparison, - but there are a number of compilers which will mis-optimize this. 
- It's better to live with the spurious warnings.*/ - _info->colorspace<0||_info->colorspace>=TH_CS_NSPACES|| - _info->pixel_fmt<0||_info->pixel_fmt>=TH_PF_NFORMATS){ - return TH_EINVAL; - } - memset(_state,0,sizeof(*_state)); - memcpy(&_state->info,_info,sizeof(*_info)); - /*Invert the sense of pic_y to match Theora's right-handed coordinate - system.*/ - _state->info.pic_y=_info->frame_height-_info->pic_height-_info->pic_y; - _state->frame_type=OC_UNKWN_FRAME; - oc_state_accel_init(_state); - ret=oc_state_frarray_init(_state); - if(ret>=0)ret=oc_state_ref_bufs_init(_state,_nrefs); - if(ret<0){ - oc_state_frarray_clear(_state); - return ret; - } - /*If the keyframe_granule_shift is out of range, use the maximum allowable - value.*/ - if(_info->keyframe_granule_shift<0||_info->keyframe_granule_shift>31){ - _state->info.keyframe_granule_shift=31; - } - _state->keyframe_num=0; - _state->curframe_num=-1; - /*3.2.0 streams mark the frame index instead of the frame count. - This was changed with stream version 3.2.1 to conform to other Ogg - codecs. - We add an extra bias when computing granule positions for new streams.*/ - _state->granpos_bias=TH_VERSION_CHECK(_info,3,2,1); - return 0; -} - -void oc_state_clear(oc_theora_state *_state){ - oc_state_ref_bufs_clear(_state); - oc_state_frarray_clear(_state); -} - - -/*Duplicates the pixels on the border of the image plane out into the - surrounding padding for use by unrestricted motion vectors. - This function only adds the left and right borders, and only for the fragment - rows specified. - _refi: The index of the reference buffer to pad. - _pli: The color plane. - _y0: The Y coordinate of the first row to pad. 
- _yend: The Y coordinate of the row to stop padding at.*/ -void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli, - int _y0,int _yend){ - th_img_plane *iplane; - unsigned char *apix; - unsigned char *bpix; - unsigned char *epix; - int stride; - int hpadding; - hpadding=OC_UMV_PADDING>>(_pli!=0&&!(_state->info.pixel_fmt&1)); - iplane=_state->ref_frame_bufs[_refi]+_pli; - stride=iplane->stride; - apix=iplane->data+_y0*(ptrdiff_t)stride; - bpix=apix+iplane->width-1; - epix=iplane->data+_yend*(ptrdiff_t)stride; - /*Note the use of != instead of <, which allows the stride to be negative.*/ - while(apix!=epix){ - memset(apix-hpadding,apix[0],hpadding); - memset(bpix+1,bpix[0],hpadding); - apix+=stride; - bpix+=stride; - } -} - -/*Duplicates the pixels on the border of the image plane out into the - surrounding padding for use by unrestricted motion vectors. - This function only adds the top and bottom borders, and must be called after - the left and right borders are added. - _refi: The index of the reference buffer to pad. - _pli: The color plane.*/ -void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli){ - th_img_plane *iplane; - unsigned char *apix; - unsigned char *bpix; - unsigned char *epix; - int stride; - int hpadding; - int vpadding; - int fullw; - hpadding=OC_UMV_PADDING>>(_pli!=0&&!(_state->info.pixel_fmt&1)); - vpadding=OC_UMV_PADDING>>(_pli!=0&&!(_state->info.pixel_fmt&2)); - iplane=_state->ref_frame_bufs[_refi]+_pli; - stride=iplane->stride; - fullw=iplane->width+(hpadding<<1); - apix=iplane->data-hpadding; - bpix=iplane->data+(iplane->height-1)*(ptrdiff_t)stride-hpadding; - epix=apix-stride*(ptrdiff_t)vpadding; - while(apix!=epix){ - memcpy(apix-stride,apix,fullw); - memcpy(bpix+stride,bpix,fullw); - apix-=stride; - bpix+=stride; - } -} - -/*Duplicates the pixels on the border of the given reference image out into - the surrounding padding for use by unrestricted motion vectors. 
- _state: The context containing the reference buffers. - _refi: The index of the reference buffer to pad.*/ -void oc_state_borders_fill(oc_theora_state *_state,int _refi){ - int pli; - for(pli=0;pli<3;pli++){ - oc_state_borders_fill_rows(_state,_refi,pli,0, - _state->ref_frame_bufs[_refi][pli].height); - oc_state_borders_fill_caps(_state,_refi,pli); - } -} - -/*Determines the offsets in an image buffer to use for motion compensation. - _state: The Theora state the offsets are to be computed with. - _offsets: Returns the offset for the buffer(s). - _offsets[0] is always set. - _offsets[1] is set if the motion vector has non-zero fractional - components. - _pli: The color plane index. - _mv: The motion vector. - Return: The number of offsets returned: 1 or 2.*/ -int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2], - int _pli,oc_mv _mv){ - /*Here is a brief description of how Theora handles motion vectors: - Motion vector components are specified to half-pixel accuracy in - undecimated directions of each plane, and quarter-pixel accuracy in - decimated directions. - Integer parts are extracted by dividing (not shifting) by the - appropriate amount, with truncation towards zero. - These integer values are used to calculate the first offset. - - If either of the fractional parts are non-zero, then a second offset is - computed. - No third or fourth offsets are computed, even if both components have - non-zero fractional parts. 
- The second offset is computed by dividing (not shifting) by the - appropriate amount, always truncating _away_ from zero.*/ -#if 0 - /*This version of the code doesn't use any tables, but is slower.*/ - int ystride; - int xprec; - int yprec; - int xfrac; - int yfrac; - int offs; - int dx; - int dy; - ystride=_state->ref_ystride[_pli]; - /*These two variables decide whether we are in half- or quarter-pixel - precision in each component.*/ - xprec=1+(_pli!=0&&!(_state->info.pixel_fmt&1)); - yprec=1+(_pli!=0&&!(_state->info.pixel_fmt&2)); - dx=OC_MV_X(_mv); - dy=OC_MV_Y(_mv); - /*These two variables are either 0 if all the fractional bits are zero or -1 - if any of them are non-zero.*/ - xfrac=OC_SIGNMASK(-(dx&(xprec|1))); - yfrac=OC_SIGNMASK(-(dy&(yprec|1))); - offs=(dx>>xprec)+(dy>>yprec)*ystride; - if(xfrac||yfrac){ - int xmask; - int ymask; - xmask=OC_SIGNMASK(dx); - ymask=OC_SIGNMASK(dy); - yfrac&=ystride; - _offsets[0]=offs-(xfrac&xmask)+(yfrac&ymask); - _offsets[1]=offs-(xfrac&~xmask)+(yfrac&~ymask); - return 2; - } - else{ - _offsets[0]=offs; - return 1; - } -#else - /*Using tables simplifies the code, and there's enough arithmetic to hide the - latencies of the memory references.*/ - static const signed char OC_MVMAP[2][64]={ - { - -15,-15,-14,-14,-13,-13,-12,-12,-11,-11,-10,-10, -9, -9, -8, - -8, -7, -7, -6, -6, -5, -5, -4, -4, -3, -3, -2, -2, -1, -1, 0, - 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, - 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15 - }, - { - -7, -7, -7, -7, -6, -6, -6, -6, -5, -5, -5, -5, -4, -4, -4, - -4, -3, -3, -3, -3, -2, -2, -2, -2, -1, -1, -1, -1, 0, 0, 0, - 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, - 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7 - } - }; - static const signed char OC_MVMAP2[2][64]={ - { - -1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1, - 0,-1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 - }, - { - 
-1,-1,-1, 0,-1,-1,-1, 0,-1,-1,-1, 0,-1,-1,-1, - 0,-1,-1,-1, 0,-1,-1,-1, 0,-1,-1,-1, 0,-1,-1,-1, - 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, - 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1 - } - }; - int ystride; - int qpx; - int qpy; - int mx; - int my; - int mx2; - int my2; - int offs; - int dx; - int dy; - ystride=_state->ref_ystride[_pli]; - qpy=_pli!=0&&!(_state->info.pixel_fmt&2); - dx=OC_MV_X(_mv); - dy=OC_MV_Y(_mv); - my=OC_MVMAP[qpy][dy+31]; - my2=OC_MVMAP2[qpy][dy+31]; - qpx=_pli!=0&&!(_state->info.pixel_fmt&1); - mx=OC_MVMAP[qpx][dx+31]; - mx2=OC_MVMAP2[qpx][dx+31]; - offs=my*ystride+mx; - if(mx2||my2){ - _offsets[1]=offs+my2*ystride+mx2; - _offsets[0]=offs; - return 2; - } - _offsets[0]=offs; - return 1; -#endif -} - -void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){ - unsigned char *dst; - ptrdiff_t frag_buf_off; - int ystride; - int refi; - /*Apply the inverse transform.*/ - /*Special case only having a DC component.*/ - if(_last_zzi<2){ - ogg_int16_t p; - int ci; - /*We round this dequant product (and not any of the others) because there's - no iDCT rounding.*/ - p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); - /*LOOP VECTORIZES.*/ - for(ci=0;ci<64;ci++)_dct_coeffs[64+ci]=p; - } - else{ - /*First, dequantize the DC coefficient.*/ - _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); - oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi); - } - /*Fill in the target buffer.*/ - frag_buf_off=_state->frag_buf_offs[_fragi]; - refi=_state->frags[_fragi].refi; - ystride=_state->ref_ystride[_pli]; - dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off; - if(refi==OC_FRAME_SELF)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs+64); - else{ - const unsigned char *ref; - int mvoffsets[2]; - ref=_state->ref_frame_data[refi]+frag_buf_off; - if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, - _state->frag_mvs[_fragi])>1){ - 
oc_frag_recon_inter2(_state, - dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs+64); - } - else{ - oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs+64); - } - } -} - -static void loop_filter_h(unsigned char *_pix,int _ystride,signed char *_bv){ - int y; - _pix-=2; - for(y=0;y<8;y++){ - int f; - f=_pix[0]-_pix[3]+3*(_pix[2]-_pix[1]); - /*The _bv array is used to compute the function - f=OC_CLAMPI(OC_MINI(-_2flimit-f,0),f,OC_MAXI(_2flimit-f,0)); - where _2flimit=_state->loop_filter_limits[_state->qis[0]]<<1;*/ - f=*(_bv+(f+4>>3)); - _pix[1]=OC_CLAMP255(_pix[1]+f); - _pix[2]=OC_CLAMP255(_pix[2]-f); - _pix+=_ystride; - } -} - -static void loop_filter_v(unsigned char *_pix,int _ystride,signed char *_bv){ - int x; - _pix-=_ystride*2; - for(x=0;x<8;x++){ - int f; - f=_pix[x]-_pix[_ystride*3+x]+3*(_pix[_ystride*2+x]-_pix[_ystride+x]); - /*The _bv array is used to compute the function - f=OC_CLAMPI(OC_MINI(-_2flimit-f,0),f,OC_MAXI(_2flimit-f,0)); - where _2flimit=_state->loop_filter_limits[_state->qis[0]]<<1;*/ - f=*(_bv+(f+4>>3)); - _pix[_ystride+x]=OC_CLAMP255(_pix[_ystride+x]+f); - _pix[_ystride*2+x]=OC_CLAMP255(_pix[_ystride*2+x]-f); - } -} - -/*Initialize the bounding values array used by the loop filter. - _bv: Storage for the array. - _flimit: The filter limit as defined in Section 7.10 of the spec.*/ -void oc_loop_filter_init_c(signed char _bv[256],int _flimit){ - int i; - memset(_bv,0,sizeof(_bv[0])*256); - for(i=0;i<_flimit;i++){ - if(127-i-_flimit>=0)_bv[127-i-_flimit]=(signed char)(i-_flimit); - _bv[127-i]=(signed char)(-i); - _bv[127+i]=(signed char)(i); - if(127+i+_flimit<256)_bv[127+i+_flimit]=(signed char)(_flimit-i); - } -} - -/*Apply the loop filter to a given set of fragment rows in the given plane. - The filter may be run on the bottom edge, affecting pixels in the next row of - fragments, so this row also needs to be available. - _bv: The bounding values array. - _refi: The index of the frame buffer to filter. 
- _pli: The color plane to filter. - _fragy0: The Y coordinate of the first fragment row to filter. - _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ -void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state, - signed char *_bv,int _refi,int _pli,int _fragy0,int _fragy_end){ - const oc_fragment_plane *fplane; - const oc_fragment *frags; - const ptrdiff_t *frag_buf_offs; - unsigned char *ref_frame_data; - ptrdiff_t fragi_top; - ptrdiff_t fragi_bot; - ptrdiff_t fragi0; - ptrdiff_t fragi0_end; - int ystride; - int nhfrags; - _bv+=127; - fplane=_state->fplanes+_pli; - nhfrags=fplane->nhfrags; - fragi_top=fplane->froffset; - fragi_bot=fragi_top+fplane->nfrags; - fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; - fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags; - ystride=_state->ref_ystride[_pli]; - frags=_state->frags; - frag_buf_offs=_state->frag_buf_offs; - ref_frame_data=_state->ref_frame_data[_refi]; - /*The following loops are constructed somewhat non-intuitively on purpose. - The main idea is: if a block boundary has at least one coded fragment on - it, the filter is applied to it. 
- However, the order that the filters are applied in matters, and VP3 chose - the somewhat strange ordering used below.*/ - while(fragi0<fragi0_end){ - ptrdiff_t fragi; - ptrdiff_t fragi_end; - fragi=fragi0; - fragi_end=fragi+nhfrags; - while(fragi<fragi_end){ - if(frags[fragi].coded){ - unsigned char *ref; - ref=ref_frame_data+frag_buf_offs[fragi]; - if(fragi>fragi0)loop_filter_h(ref,ystride,_bv); - if(fragi0>fragi_top)loop_filter_v(ref,ystride,_bv); - if(fragi+1<fragi_end&&!frags[fragi+1].coded){ - loop_filter_h(ref+8,ystride,_bv); - } - if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ - loop_filter_v(ref+(ystride<<3),ystride,_bv); - } - } - fragi++; - } - fragi0+=nhfrags; - } -} - -#if defined(OC_DUMP_IMAGES) -int oc_state_dump_frame(const oc_theora_state *_state,int _frame, - const char *_suf){ - /*Dump a PNG of the reconstructed image.*/ - png_structp png; - png_infop info; - png_bytep *image; - FILE *fp; - char fname[16]; - unsigned char *y_row; - unsigned char *u_row; - unsigned char *v_row; - unsigned char *y; - unsigned char *u; - unsigned char *v; - ogg_int64_t iframe; - ogg_int64_t pframe; - int y_stride; - int u_stride; - int v_stride; - int framei; - int width; - int height; - int imgi; - int imgj; - width=_state->info.frame_width; - height=_state->info.frame_height; - iframe=_state->granpos>>_state->info.keyframe_granule_shift; - pframe=_state->granpos-(iframe<<_state->info.keyframe_granule_shift); - sprintf(fname,"%08i%s.png",(int)(iframe+pframe),_suf); - fp=fopen(fname,"wb"); - if(fp==NULL)return TH_EFAULT; - image=(png_bytep *)oc_malloc_2d(height,6*width,sizeof(**image)); - if(image==NULL){ - fclose(fp); - return TH_EFAULT; - } - png=png_create_write_struct(PNG_LIBPNG_VER_STRING,NULL,NULL,NULL); - if(png==NULL){ - oc_free_2d(image); - fclose(fp); - return TH_EFAULT; - } - info=png_create_info_struct(png); - if(info==NULL){ - png_destroy_write_struct(&png,NULL); - oc_free_2d(image); - fclose(fp); - return TH_EFAULT; - } - 
if(setjmp(png_jmpbuf(png))){ - png_destroy_write_struct(&png,&info); - oc_free_2d(image); - fclose(fp); - return TH_EFAULT; - } - framei=_state->ref_frame_idx[_frame]; - y_row=_state->ref_frame_bufs[framei][0].data; - u_row=_state->ref_frame_bufs[framei][1].data; - v_row=_state->ref_frame_bufs[framei][2].data; - y_stride=_state->ref_frame_bufs[framei][0].stride; - u_stride=_state->ref_frame_bufs[framei][1].stride; - v_stride=_state->ref_frame_bufs[framei][2].stride; - /*Chroma up-sampling is just done with a box filter. - This is very likely what will actually be used in practice on a real - display, and also removes one more layer to search in for the source of - artifacts. - As an added bonus, it's dead simple.*/ - for(imgi=height;imgi-->0;){ - int dc; - y=y_row; - u=u_row; - v=v_row; - for(imgj=0;imgj<6*width;){ - float yval; - float uval; - float vval; - unsigned rval; - unsigned gval; - unsigned bval; - /*This is intentionally slow and very accurate.*/ - yval=(*y-16)*(1.0F/219); - uval=(*u-128)*(2*(1-0.114F)/224); - vval=(*v-128)*(2*(1-0.299F)/224); - rval=OC_CLAMPI(0,(int)(65535*(yval+vval)+0.5F),65535); - gval=OC_CLAMPI(0,(int)(65535*( - yval-uval*(0.114F/0.587F)-vval*(0.299F/0.587F))+0.5F),65535); - bval=OC_CLAMPI(0,(int)(65535*(yval+uval)+0.5F),65535); - image[imgi][imgj++]=(unsigned char)(rval>>8); - image[imgi][imgj++]=(unsigned char)(rval&0xFF); - image[imgi][imgj++]=(unsigned char)(gval>>8); - image[imgi][imgj++]=(unsigned char)(gval&0xFF); - image[imgi][imgj++]=(unsigned char)(bval>>8); - image[imgi][imgj++]=(unsigned char)(bval&0xFF); - dc=(y-y_row&1)|(_state->info.pixel_fmt&1); - y++; - u+=dc; - v+=dc; - } - dc=-((height-1-imgi&1)|_state->info.pixel_fmt>>1); - y_row+=y_stride; - u_row+=dc&u_stride; - v_row+=dc&v_stride; - } - png_init_io(png,fp); - png_set_compression_level(png,Z_BEST_COMPRESSION); - png_set_IHDR(png,info,width,height,16,PNG_COLOR_TYPE_RGB, - PNG_INTERLACE_NONE,PNG_COMPRESSION_TYPE_DEFAULT,PNG_FILTER_TYPE_DEFAULT); - 
switch(_state->info.colorspace){ - case TH_CS_ITU_REC_470M:{ - png_set_gAMA(png,info,2.2); - png_set_cHRM_fixed(png,info,31006,31616, - 67000,32000,21000,71000,14000,8000); - }break; - case TH_CS_ITU_REC_470BG:{ - png_set_gAMA(png,info,2.67); - png_set_cHRM_fixed(png,info,31271,32902, - 64000,33000,29000,60000,15000,6000); - }break; - default:break; - } - png_set_pHYs(png,info,_state->info.aspect_numerator, - _state->info.aspect_denominator,0); - png_set_rows(png,info,image); - png_write_png(png,info,PNG_TRANSFORM_IDENTITY,NULL); - png_write_end(png,info); - png_destroy_write_struct(&png,&info); - oc_free_2d(image); - fclose(fp); - return 0; -} -#endif - - - -ogg_int64_t th_granule_frame(void *_encdec,ogg_int64_t _granpos){ - oc_theora_state *state; - state=(oc_theora_state *)_encdec; - if(_granpos>=0){ - ogg_int64_t iframe; - ogg_int64_t pframe; - iframe=_granpos>>state->info.keyframe_granule_shift; - pframe=_granpos-(iframe<<state->info.keyframe_granule_shift); - /*3.2.0 streams store the frame index in the granule position. - 3.2.1 and later store the frame count. - We return the index, so adjust the value if we have a 3.2.1 or later - stream.*/ - return iframe+pframe-TH_VERSION_CHECK(&state->info,3,2,1); - } - return -1; -} - -double th_granule_time(void *_encdec,ogg_int64_t _granpos){ - oc_theora_state *state; - state=(oc_theora_state *)_encdec; - if(_granpos>=0){ - return (th_granule_frame(_encdec, _granpos)+1)*( - (double)state->info.fps_denominator/state->info.fps_numerator); - } - return -1; -} diff --git a/media/libtheora/lib/state.h b/media/libtheora/lib/state.h deleted file mode 100644 index f176a53ce..000000000 --- a/media/libtheora/lib/state.h +++ /dev/null @@ -1,552 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. 
* - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: internal.h 17337 2010-07-19 16:08:54Z tterribe $ - - ********************************************************************/ -#if !defined(_state_H) -# define _state_H (1) -# include "internal.h" -# include "huffman.h" -# include "quant.h" - - - -/*A single quadrant of the map from a super block to fragment numbers.*/ -typedef ptrdiff_t oc_sb_map_quad[4]; -/*A map from a super block to fragment numbers.*/ -typedef oc_sb_map_quad oc_sb_map[4]; -/*A single plane of the map from a macro block to fragment numbers.*/ -typedef ptrdiff_t oc_mb_map_plane[4]; -/*A map from a macro block to fragment numbers.*/ -typedef oc_mb_map_plane oc_mb_map[3]; -/*A motion vector.*/ -typedef ogg_int16_t oc_mv; - -typedef struct oc_sb_flags oc_sb_flags; -typedef struct oc_border_info oc_border_info; -typedef struct oc_fragment oc_fragment; -typedef struct oc_fragment_plane oc_fragment_plane; -typedef struct oc_base_opt_vtable oc_base_opt_vtable; -typedef struct oc_base_opt_data oc_base_opt_data; -typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable; -typedef struct oc_theora_state oc_theora_state; - - - -/*Shared accelerated functions.*/ -# if defined(OC_X86_ASM) -# if defined(_MSC_VER) -# include "x86_vc/x86int.h" -# else -# include "x86/x86int.h" -# endif -# endif -# if defined(OC_ARM_ASM) -# include "arm/armint.h" -# endif -# if defined(OC_C64X_ASM) -# include "c64x/c64xint.h" -# endif - -# if !defined(oc_state_accel_init) -# define oc_state_accel_init oc_state_accel_init_c -# endif -# if defined(OC_STATE_USE_VTABLE) -# if 
!defined(oc_frag_copy) -# define oc_frag_copy(_state,_dst,_src,_ystride) \ - ((*(_state)->opt_vtable.frag_copy)(_dst,_src,_ystride)) -# endif -# if !defined(oc_frag_copy_list) -# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \ - _fragis,_nfragis,_frag_buf_offs) \ - ((*(_state)->opt_vtable.frag_copy_list)(_dst_frame,_src_frame,_ystride, \ - _fragis,_nfragis,_frag_buf_offs)) -# endif -# if !defined(oc_frag_recon_intra) -# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \ - ((*(_state)->opt_vtable.frag_recon_intra)(_dst,_dst_ystride,_residue)) -# endif -# if !defined(oc_frag_recon_inter) -# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \ - ((*(_state)->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue)) -# endif -# if !defined(oc_frag_recon_inter2) -# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \ - ((*(_state)->opt_vtable.frag_recon_inter2)(_dst, \ - _src1,_src2,_ystride,_residue)) -# endif -# if !defined(oc_idct8x8) -# define oc_idct8x8(_state,_y,_x,_last_zzi) \ - ((*(_state)->opt_vtable.idct8x8)(_y,_x,_last_zzi)) -# endif -# if !defined(oc_state_frag_recon) -# define oc_state_frag_recon(_state,_fragi, \ - _pli,_dct_coeffs,_last_zzi,_dc_quant) \ - ((*(_state)->opt_vtable.state_frag_recon)(_state,_fragi, \ - _pli,_dct_coeffs,_last_zzi,_dc_quant)) -# endif -# if !defined(oc_loop_filter_init) -# define oc_loop_filter_init(_state,_bv,_flimit) \ - ((*(_state)->opt_vtable.loop_filter_init)(_bv,_flimit)) -# endif -# if !defined(oc_state_loop_filter_frag_rows) -# define oc_state_loop_filter_frag_rows(_state, \ - _bv,_refi,_pli,_fragy0,_fragy_end) \ - ((*(_state)->opt_vtable.state_loop_filter_frag_rows)(_state, \ - _bv,_refi,_pli,_fragy0,_fragy_end)) -# endif -# if !defined(oc_restore_fpu) -# define oc_restore_fpu(_state) \ - ((*(_state)->opt_vtable.restore_fpu)()) -# endif -# else -# if !defined(oc_frag_copy) -# define oc_frag_copy(_state,_dst,_src,_ystride) \ - 
oc_frag_copy_c(_dst,_src,_ystride) -# endif -# if !defined(oc_frag_copy_list) -# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \ - _fragis,_nfragis,_frag_buf_offs) \ - oc_frag_copy_list_c(_dst_frame,_src_frame,_ystride, \ - _fragis,_nfragis,_frag_buf_offs) -# endif -# if !defined(oc_frag_recon_intra) -# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \ - oc_frag_recon_intra_c(_dst,_dst_ystride,_residue) -# endif -# if !defined(oc_frag_recon_inter) -# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \ - oc_frag_recon_inter_c(_dst,_src,_ystride,_residue) -# endif -# if !defined(oc_frag_recon_inter2) -# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \ - oc_frag_recon_inter2_c(_dst,_src1,_src2,_ystride,_residue) -# endif -# if !defined(oc_idct8x8) -# define oc_idct8x8(_state,_y,_x,_last_zzi) oc_idct8x8_c(_y,_x,_last_zzi) -# endif -# if !defined(oc_state_frag_recon) -# define oc_state_frag_recon oc_state_frag_recon_c -# endif -# if !defined(oc_loop_filter_init) -# define oc_loop_filter_init(_state,_bv,_flimit) \ - oc_loop_filter_init_c(_bv,_flimit) -# endif -# if !defined(oc_state_loop_filter_frag_rows) -# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c -# endif -# if !defined(oc_restore_fpu) -# define oc_restore_fpu(_state) do{}while(0) -# endif -# endif - - - -/*A keyframe.*/ -# define OC_INTRA_FRAME (0) -/*A predicted frame.*/ -# define OC_INTER_FRAME (1) -/*A frame of unknown type (frame type decision has not yet been made).*/ -# define OC_UNKWN_FRAME (-1) - -/*The amount of padding to add to the reconstructed frame buffers on all - sides. - This is used to allow unrestricted motion vectors without special casing. 
- This must be a multiple of 2.*/ -# define OC_UMV_PADDING (16) - -/*Frame classification indices.*/ -/*The previous golden frame.*/ -# define OC_FRAME_GOLD (0) -/*The previous frame.*/ -# define OC_FRAME_PREV (1) -/*The current frame.*/ -# define OC_FRAME_SELF (2) -/*Used to mark uncoded fragments (for DC prediction).*/ -# define OC_FRAME_NONE (3) - -/*The input or output buffer.*/ -# define OC_FRAME_IO (3) -/*Uncompressed prev golden frame.*/ -# define OC_FRAME_GOLD_ORIG (4) -/*Uncompressed previous frame. */ -# define OC_FRAME_PREV_ORIG (5) - -/*Macroblock modes.*/ -/*Macro block is invalid: It is never coded.*/ -# define OC_MODE_INVALID (-1) -/*Encoded difference from the same macro block in the previous frame.*/ -# define OC_MODE_INTER_NOMV (0) -/*Encoded with no motion compensated prediction.*/ -# define OC_MODE_INTRA (1) -/*Encoded difference from the previous frame offset by the given motion - vector.*/ -# define OC_MODE_INTER_MV (2) -/*Encoded difference from the previous frame offset by the last coded motion - vector.*/ -# define OC_MODE_INTER_MV_LAST (3) -/*Encoded difference from the previous frame offset by the second to last - coded motion vector.*/ -# define OC_MODE_INTER_MV_LAST2 (4) -/*Encoded difference from the same macro block in the previous golden - frame.*/ -# define OC_MODE_GOLDEN_NOMV (5) -/*Encoded difference from the previous golden frame offset by the given motion - vector.*/ -# define OC_MODE_GOLDEN_MV (6) -/*Encoded difference from the previous frame offset by the individual motion - vectors given for each block.*/ -# define OC_MODE_INTER_MV_FOUR (7) -/*The number of (coded) modes.*/ -# define OC_NMODES (8) - -/*Determines the reference frame used for a given MB mode.*/ -# define OC_FRAME_FOR_MODE(_x) \ - OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \ - OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x)) - -/*Constants for the packet state machine common between encoder and decoder.*/ - -/*Next 
packet to emit/read: Codec info header.*/ -# define OC_PACKET_INFO_HDR (-3) -/*Next packet to emit/read: Comment header.*/ -# define OC_PACKET_COMMENT_HDR (-2) -/*Next packet to emit/read: Codec setup header.*/ -# define OC_PACKET_SETUP_HDR (-1) -/*No more packets to emit/read.*/ -# define OC_PACKET_DONE (INT_MAX) - - - -#define OC_MV(_x,_y) ((oc_mv)((_x)&0xFF|(_y)<<8)) -#define OC_MV_X(_mv) ((signed char)(_mv)) -#define OC_MV_Y(_mv) ((_mv)>>8) -#define OC_MV_ADD(_mv1,_mv2) \ - OC_MV(OC_MV_X(_mv1)+OC_MV_X(_mv2), \ - OC_MV_Y(_mv1)+OC_MV_Y(_mv2)) -#define OC_MV_SUB(_mv1,_mv2) \ - OC_MV(OC_MV_X(_mv1)-OC_MV_X(_mv2), \ - OC_MV_Y(_mv1)-OC_MV_Y(_mv2)) - - - -/*Super blocks are 32x32 segments of pixels in a single color plane indexed - in image order. - Internally, super blocks are broken up into four quadrants, each of which - contains a 2x2 pattern of blocks, each of which is an 8x8 block of pixels. - Quadrants, and the blocks within them, are indexed in a special order called - a "Hilbert curve" within the super block. - - In order to differentiate between the Hilbert-curve indexing strategy and - the regular image order indexing strategy, blocks indexed in image order - are called "fragments". - Fragments are indexed in image order, left to right, then bottom to top, - from Y' plane to Cb plane to Cr plane. - - The co-located fragments in all image planes corresponding to the location - of a single quadrant of a luma plane super block form a macro block. - Thus there is only a single set of macro blocks for all planes, each of which - contains between 6 and 12 fragments, depending on the pixel format. - Therefore macro block information is kept in a separate set of arrays from - super blocks to avoid unused space in the other planes. - The lists are indexed in super block order. - That is, the macro block corresponding to the macro block mbi in (luma plane) - super block sbi is at index (sbi<<2|mbi). 
- Thus the number of macro blocks in each dimension is always twice the number - of super blocks, even when only an odd number fall inside the coded frame. - These "extra" macro blocks are just an artifact of our internal data layout, - and not part of the coded stream; they are flagged with a negative MB mode.*/ - - - -/*Super block information.*/ -struct oc_sb_flags{ - unsigned char coded_fully:1; - unsigned char coded_partially:1; - unsigned char quad_valid:4; -}; - - - -/*Information about a fragment which intersects the border of the displayable - region. - This marks which pixels belong to the displayable region.*/ -struct oc_border_info{ - /*A bit mask marking which pixels are in the displayable region. - Pixel (x,y) corresponds to bit (y<<3|x).*/ - ogg_int64_t mask; - /*The number of pixels in the displayable region. - This is always positive, and always less than 64.*/ - int npixels; -}; - - - -/*Fragment information.*/ -struct oc_fragment{ - /*A flag indicating whether or not this fragment is coded.*/ - unsigned coded:1; - /*A flag indicating that this entire fragment lies outside the displayable - region of the frame. - Note the contrast with an invalid macro block, which is outside the coded - frame, not just the displayable one. - There are no fragments outside the coded frame by construction.*/ - unsigned invalid:1; - /*The index of the quality index used for this fragment's AC coefficients.*/ - unsigned qii:4; - /*The index of the reference frame this fragment is predicted from.*/ - unsigned refi:2; - /*The mode of the macroblock this fragment belongs to.*/ - unsigned mb_mode:3; - /*The index of the associated border information for fragments which lie - partially outside the displayable region. - For fragments completely inside or outside this region, this is -1. 
- Note that the C standard requires an explicit signed keyword for bitfield - types, since some compilers may treat them as unsigned without it.*/ - signed int borderi:5; - /*The prediction-corrected DC component. - Note that the C standard requires an explicit signed keyword for bitfield - types, since some compilers may treat them as unsigned without it.*/ - signed int dc:16; -}; - - - -/*A description of each fragment plane.*/ -struct oc_fragment_plane{ - /*The number of fragments in the horizontal direction.*/ - int nhfrags; - /*The number of fragments in the vertical direction.*/ - int nvfrags; - /*The offset of the first fragment in the plane.*/ - ptrdiff_t froffset; - /*The total number of fragments in the plane.*/ - ptrdiff_t nfrags; - /*The number of super blocks in the horizontal direction.*/ - unsigned nhsbs; - /*The number of super blocks in the vertical direction.*/ - unsigned nvsbs; - /*The offset of the first super block in the plane.*/ - unsigned sboffset; - /*The total number of super blocks in the plane.*/ - unsigned nsbs; -}; - - -typedef void (*oc_state_loop_filter_frag_rows_func)( - const oc_theora_state *_state,signed char _bv[256],int _refi,int _pli, - int _fragy0,int _fragy_end); - -/*The shared (encoder and decoder) functions that have accelerated variants.*/ -struct oc_base_opt_vtable{ - void (*frag_copy)(unsigned char *_dst, - const unsigned char *_src,int _ystride); - void (*frag_copy_list)(unsigned char *_dst_frame, - const unsigned char *_src_frame,int _ystride, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs); - void (*frag_recon_intra)(unsigned char *_dst,int _ystride, - const ogg_int16_t _residue[64]); - void (*frag_recon_inter)(unsigned char *_dst, - const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]); - void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1, - const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]); - void (*idct8x8)(ogg_int16_t 
_y[64],ogg_int16_t _x[64],int _last_zzi); - void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); - void (*loop_filter_init)(signed char _bv[256],int _flimit); - oc_state_loop_filter_frag_rows_func state_loop_filter_frag_rows; - void (*restore_fpu)(void); -}; - -/*The shared (encoder and decoder) tables that vary according to which variants - of the above functions are used.*/ -struct oc_base_opt_data{ - const unsigned char *dct_fzig_zag; -}; - - -/*State information common to both the encoder and decoder.*/ -struct oc_theora_state{ - /*The stream information.*/ - th_info info; -# if defined(OC_STATE_USE_VTABLE) - /*Table for shared accelerated functions.*/ - oc_base_opt_vtable opt_vtable; -# endif - /*Table for shared data used by accelerated functions.*/ - oc_base_opt_data opt_data; - /*CPU flags to detect the presence of extended instruction sets.*/ - ogg_uint32_t cpu_flags; - /*The fragment plane descriptions.*/ - oc_fragment_plane fplanes[3]; - /*The list of fragments, indexed in image order.*/ - oc_fragment *frags; - /*The the offset into the reference frame buffer to the upper-left pixel of - each fragment.*/ - ptrdiff_t *frag_buf_offs; - /*The motion vector for each fragment.*/ - oc_mv *frag_mvs; - /*The total number of fragments in a single frame.*/ - ptrdiff_t nfrags; - /*The list of super block maps, indexed in image order.*/ - oc_sb_map *sb_maps; - /*The list of super block flags, indexed in image order.*/ - oc_sb_flags *sb_flags; - /*The total number of super blocks in a single frame.*/ - unsigned nsbs; - /*The fragments from each color plane that belong to each macro block. - Fragments are stored in image order (left to right then top to bottom). - When chroma components are decimated, the extra fragments have an index of - -1.*/ - oc_mb_map *mb_maps; - /*The list of macro block modes. 
- A negative number indicates the macro block lies entirely outside the - coded frame.*/ - signed char *mb_modes; - /*The number of macro blocks in the X direction.*/ - unsigned nhmbs; - /*The number of macro blocks in the Y direction.*/ - unsigned nvmbs; - /*The total number of macro blocks.*/ - size_t nmbs; - /*The list of coded fragments, in coded order. - Uncoded fragments are stored in reverse order from the end of the list.*/ - ptrdiff_t *coded_fragis; - /*The number of coded fragments in each plane.*/ - ptrdiff_t ncoded_fragis[3]; - /*The total number of coded fragments.*/ - ptrdiff_t ntotal_coded_fragis; - /*The actual buffers used for the reference frames.*/ - th_ycbcr_buffer ref_frame_bufs[6]; - /*The index of the buffers being used for each OC_FRAME_* reference frame.*/ - int ref_frame_idx[6]; - /*The storage for the reference frame buffers. - This is just ref_frame_bufs[ref_frame_idx[i]][0].data, but is cached here - for faster look-up.*/ - unsigned char *ref_frame_data[6]; - /*The handle used to allocate the reference frame buffers.*/ - unsigned char *ref_frame_handle; - /*The strides for each plane in the reference frames.*/ - int ref_ystride[3]; - /*The number of unique border patterns.*/ - int nborders; - /*The unique border patterns for all border fragments. 
- The borderi field of fragments which straddle the border indexes this - list.*/ - oc_border_info borders[16]; - /*The frame number of the last keyframe.*/ - ogg_int64_t keyframe_num; - /*The frame number of the current frame.*/ - ogg_int64_t curframe_num; - /*The granpos of the current frame.*/ - ogg_int64_t granpos; - /*The type of the current frame.*/ - signed char frame_type; - /*The bias to add to the frame count when computing granule positions.*/ - unsigned char granpos_bias; - /*The number of quality indices used in the current frame.*/ - unsigned char nqis; - /*The quality indices of the current frame.*/ - unsigned char qis[3]; - /*The dequantization tables, stored in zig-zag order, and indexed by - qi, pli, qti, and zzi.*/ - ogg_uint16_t *dequant_tables[64][3][2]; - OC_ALIGN16(oc_quant_table dequant_table_data[64][3][2]); - /*Loop filter strength parameters.*/ - unsigned char loop_filter_limits[64]; -}; - - - -/*The function type used to fill in the chroma plane motion vectors for a - macro block when 4 different motion vectors are specified in the luma - plane. - _cbmvs: The chroma block-level motion vectors to fill in. - _lmbmv: The luma macro-block level motion vector to fill in for use in - prediction. 
- _lbmvs: The luma block-level motion vectors.*/ -typedef void (*oc_set_chroma_mvs_func)(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]); - - - -/*A table of functions used to fill in the Cb,Cr plane motion vectors for a - macro block when 4 different motion vectors are specified in the luma - plane.*/ -extern const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]; - - - -int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs); -void oc_state_clear(oc_theora_state *_state); -void oc_state_accel_init_c(oc_theora_state *_state); -void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli, - int _y0,int _yend); -void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli); -void oc_state_borders_fill(oc_theora_state *_state,int _refi); -void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx, - th_ycbcr_buffer _img); -int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby); -int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2], - int _pli,oc_mv _mv); - -void oc_loop_filter_init_c(signed char _bv[256],int _flimit); -void oc_state_loop_filter(oc_theora_state *_state,int _frame); -# if defined(OC_DUMP_IMAGES) -int oc_state_dump_frame(const oc_theora_state *_state,int _frame, - const char *_suf); -# endif - -/*Default pure-C implementations of shared accelerated functions.*/ -void oc_frag_copy_c(unsigned char *_dst, - const unsigned char *_src,int _src_ystride); -void oc_frag_copy_list_c(unsigned char *_dst_frame, - const unsigned char *_src_frame,int _ystride, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs); -void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride, - const ogg_int16_t _residue[64]); -void oc_frag_recon_inter_c(unsigned char *_dst, - const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]); -void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1, - const unsigned char *_src2,int 
_ystride,const ogg_int16_t _residue[64]); -void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); -void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); -void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state, - signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); -void oc_restore_fpu_c(void); - -/*We need a way to call a few encoder functions without introducing a link-time - dependency into the decoder, while still allowing the old alpha API which - does not distinguish between encoder and decoder objects to be used. - We do this by placing a function table at the start of the encoder object - which can dispatch into the encoder library. - We do a similar thing for the decoder in case we ever decide to split off a - common base library.*/ -typedef void (*oc_state_clear_func)(theora_state *_th); -typedef int (*oc_state_control_func)(theora_state *th,int _req, - void *_buf,size_t _buf_sz); -typedef ogg_int64_t (*oc_state_granule_frame_func)(theora_state *_th, - ogg_int64_t _granulepos); -typedef double (*oc_state_granule_time_func)(theora_state *_th, - ogg_int64_t _granulepos); - - -struct oc_state_dispatch_vtable{ - oc_state_clear_func clear; - oc_state_control_func control; - oc_state_granule_frame_func granule_frame; - oc_state_granule_time_func granule_time; -}; - -#endif diff --git a/media/libtheora/lib/x86/mmxfrag.c b/media/libtheora/lib/x86/mmxfrag.c deleted file mode 100644 index b7df1c1ec..000000000 --- a/media/libtheora/lib/x86/mmxfrag.c +++ /dev/null @@ -1,368 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: mmxfrag.c 17410 2010-09-21 21:53:48Z tterribe $ - - ********************************************************************/ - -/*MMX acceleration of fragment reconstruction for motion compensation. - Originally written by Rudolf Marek. - Additional optimization by Nils Pipenbrinck. - Note: Loops are unrolled for best performance. - The iteration each instruction belongs to is marked in the comments as #i.*/ -#include <stddef.h> -#include "x86int.h" - -#if defined(OC_X86_ASM) - -/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes - between rows.*/ -# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \ - do{ \ - const unsigned char *src; \ - unsigned char *dst; \ - ptrdiff_t ystride3; \ - src=(_src); \ - dst=(_dst); \ - __asm__ __volatile__( \ - /*src+0*ystride*/ \ - "movq (%[src]),%%mm0\n\t" \ - /*src+1*ystride*/ \ - "movq (%[src],%[ystride]),%%mm1\n\t" \ - /*ystride3=ystride*3*/ \ - "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \ - /*src+2*ystride*/ \ - "movq (%[src],%[ystride],2),%%mm2\n\t" \ - /*src+3*ystride*/ \ - "movq (%[src],%[ystride3]),%%mm3\n\t" \ - /*dst+0*ystride*/ \ - "movq %%mm0,(%[dst])\n\t" \ - /*dst+1*ystride*/ \ - "movq %%mm1,(%[dst],%[ystride])\n\t" \ - /*Pointer to next 4.*/ \ - "lea (%[src],%[ystride],4),%[src]\n\t" \ - /*dst+2*ystride*/ \ - "movq %%mm2,(%[dst],%[ystride],2)\n\t" \ - /*dst+3*ystride*/ \ - "movq %%mm3,(%[dst],%[ystride3])\n\t" \ - /*Pointer to next 4.*/ \ - "lea (%[dst],%[ystride],4),%[dst]\n\t" \ - /*src+0*ystride*/ \ - "movq (%[src]),%%mm0\n\t" \ - /*src+1*ystride*/ \ - "movq (%[src],%[ystride]),%%mm1\n\t" \ - /*src+2*ystride*/ \ - "movq (%[src],%[ystride],2),%%mm2\n\t" \ - /*src+3*ystride*/ \ - "movq (%[src],%[ystride3]),%%mm3\n\t" \ - /*dst+0*ystride*/ \ - "movq 
%%mm0,(%[dst])\n\t" \ - /*dst+1*ystride*/ \ - "movq %%mm1,(%[dst],%[ystride])\n\t" \ - /*dst+2*ystride*/ \ - "movq %%mm2,(%[dst],%[ystride],2)\n\t" \ - /*dst+3*ystride*/ \ - "movq %%mm3,(%[dst],%[ystride3])\n\t" \ - :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \ - :[ystride]"r"((ptrdiff_t)(_ystride)) \ - :"memory" \ - ); \ - } \ - while(0) - -/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes - between rows.*/ -void oc_frag_copy_mmx(unsigned char *_dst, - const unsigned char *_src,int _ystride){ - OC_FRAG_COPY_MMX(_dst,_src,_ystride); -} - -/*Copies the fragments specified by the lists of fragment indices from one - frame to another. - _dst_frame: The reference frame to copy to. - _src_frame: The reference frame to copy from. - _ystride: The row stride of the reference frames. - _fragis: A pointer to a list of fragment indices. - _nfragis: The number of fragment indices to copy. - _frag_buf_offs: The offsets of fragments in the reference frames.*/ -void oc_frag_copy_list_mmx(unsigned char *_dst_frame, - const unsigned char *_src_frame,int _ystride, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){ - ptrdiff_t fragii; - for(fragii=0;fragii<_nfragis;fragii++){ - ptrdiff_t frag_buf_off; - frag_buf_off=_frag_buf_offs[_fragis[fragii]]; - OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off, - _src_frame+frag_buf_off,_ystride); - } -} - - -void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, - const ogg_int16_t *_residue){ - __asm__ __volatile__( - /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/ - "pcmpeqw %%mm0,%%mm0\n\t" - /*#0 Load low residue.*/ - "movq 0*8(%[residue]),%%mm1\n\t" - /*#0 Load high residue.*/ - "movq 1*8(%[residue]),%%mm2\n\t" - /*Set mm0 to 0x8000800080008000.*/ - "psllw $15,%%mm0\n\t" - /*#1 Load low residue.*/ - "movq 2*8(%[residue]),%%mm3\n\t" - /*#1 Load high residue.*/ - "movq 3*8(%[residue]),%%mm4\n\t" - /*Set mm0 to 0x0080008000800080.*/ - "psrlw $8,%%mm0\n\t" - /*#2 Load low residue.*/ - "movq 
4*8(%[residue]),%%mm5\n\t" - /*#2 Load high residue.*/ - "movq 5*8(%[residue]),%%mm6\n\t" - /*#0 Bias low residue.*/ - "paddsw %%mm0,%%mm1\n\t" - /*#0 Bias high residue.*/ - "paddsw %%mm0,%%mm2\n\t" - /*#0 Pack to byte.*/ - "packuswb %%mm2,%%mm1\n\t" - /*#1 Bias low residue.*/ - "paddsw %%mm0,%%mm3\n\t" - /*#1 Bias high residue.*/ - "paddsw %%mm0,%%mm4\n\t" - /*#1 Pack to byte.*/ - "packuswb %%mm4,%%mm3\n\t" - /*#2 Bias low residue.*/ - "paddsw %%mm0,%%mm5\n\t" - /*#2 Bias high residue.*/ - "paddsw %%mm0,%%mm6\n\t" - /*#2 Pack to byte.*/ - "packuswb %%mm6,%%mm5\n\t" - /*#0 Write row.*/ - "movq %%mm1,(%[dst])\n\t" - /*#1 Write row.*/ - "movq %%mm3,(%[dst],%[ystride])\n\t" - /*#2 Write row.*/ - "movq %%mm5,(%[dst],%[ystride],2)\n\t" - /*#3 Load low residue.*/ - "movq 6*8(%[residue]),%%mm1\n\t" - /*#3 Load high residue.*/ - "movq 7*8(%[residue]),%%mm2\n\t" - /*#4 Load high residue.*/ - "movq 8*8(%[residue]),%%mm3\n\t" - /*#4 Load high residue.*/ - "movq 9*8(%[residue]),%%mm4\n\t" - /*#5 Load high residue.*/ - "movq 10*8(%[residue]),%%mm5\n\t" - /*#5 Load high residue.*/ - "movq 11*8(%[residue]),%%mm6\n\t" - /*#3 Bias low residue.*/ - "paddsw %%mm0,%%mm1\n\t" - /*#3 Bias high residue.*/ - "paddsw %%mm0,%%mm2\n\t" - /*#3 Pack to byte.*/ - "packuswb %%mm2,%%mm1\n\t" - /*#4 Bias low residue.*/ - "paddsw %%mm0,%%mm3\n\t" - /*#4 Bias high residue.*/ - "paddsw %%mm0,%%mm4\n\t" - /*#4 Pack to byte.*/ - "packuswb %%mm4,%%mm3\n\t" - /*#5 Bias low residue.*/ - "paddsw %%mm0,%%mm5\n\t" - /*#5 Bias high residue.*/ - "paddsw %%mm0,%%mm6\n\t" - /*#5 Pack to byte.*/ - "packuswb %%mm6,%%mm5\n\t" - /*#3 Write row.*/ - "movq %%mm1,(%[dst],%[ystride3])\n\t" - /*#4 Write row.*/ - "movq %%mm3,(%[dst4])\n\t" - /*#5 Write row.*/ - "movq %%mm5,(%[dst4],%[ystride])\n\t" - /*#6 Load low residue.*/ - "movq 12*8(%[residue]),%%mm1\n\t" - /*#6 Load high residue.*/ - "movq 13*8(%[residue]),%%mm2\n\t" - /*#7 Load low residue.*/ - "movq 14*8(%[residue]),%%mm3\n\t" - /*#7 Load high residue.*/ - "movq 
15*8(%[residue]),%%mm4\n\t" - /*#6 Bias low residue.*/ - "paddsw %%mm0,%%mm1\n\t" - /*#6 Bias high residue.*/ - "paddsw %%mm0,%%mm2\n\t" - /*#6 Pack to byte.*/ - "packuswb %%mm2,%%mm1\n\t" - /*#7 Bias low residue.*/ - "paddsw %%mm0,%%mm3\n\t" - /*#7 Bias high residue.*/ - "paddsw %%mm0,%%mm4\n\t" - /*#7 Pack to byte.*/ - "packuswb %%mm4,%%mm3\n\t" - /*#6 Write row.*/ - "movq %%mm1,(%[dst4],%[ystride],2)\n\t" - /*#7 Write row.*/ - "movq %%mm3,(%[dst4],%[ystride3])\n\t" - : - :[residue]"r"(_residue), - [dst]"r"(_dst), - [dst4]"r"(_dst+(_ystride<<2)), - [ystride]"r"((ptrdiff_t)_ystride), - [ystride3]"r"((ptrdiff_t)_ystride*3) - :"memory" - ); -} - -void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src, - int _ystride,const ogg_int16_t *_residue){ - int i; - /*Zero mm0.*/ - __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::); - for(i=4;i-->0;){ - __asm__ __volatile__( - /*#0 Load source.*/ - "movq (%[src]),%%mm3\n\t" - /*#1 Load source.*/ - "movq (%[src],%[ystride]),%%mm7\n\t" - /*#0 Get copy of src.*/ - "movq %%mm3,%%mm4\n\t" - /*#0 Expand high source.*/ - "punpckhbw %%mm0,%%mm4\n\t" - /*#0 Expand low source.*/ - "punpcklbw %%mm0,%%mm3\n\t" - /*#0 Add residue high.*/ - "paddsw 8(%[residue]),%%mm4\n\t" - /*#1 Get copy of src.*/ - "movq %%mm7,%%mm2\n\t" - /*#0 Add residue low.*/ - "paddsw (%[residue]), %%mm3\n\t" - /*#1 Expand high source.*/ - "punpckhbw %%mm0,%%mm2\n\t" - /*#0 Pack final row pixels.*/ - "packuswb %%mm4,%%mm3\n\t" - /*#1 Expand low source.*/ - "punpcklbw %%mm0,%%mm7\n\t" - /*#1 Add residue low.*/ - "paddsw 16(%[residue]),%%mm7\n\t" - /*#1 Add residue high.*/ - "paddsw 24(%[residue]),%%mm2\n\t" - /*Advance residue.*/ - "lea 32(%[residue]),%[residue]\n\t" - /*#1 Pack final row pixels.*/ - "packuswb %%mm2,%%mm7\n\t" - /*Advance src.*/ - "lea (%[src],%[ystride],2),%[src]\n\t" - /*#0 Write row.*/ - "movq %%mm3,(%[dst])\n\t" - /*#1 Write row.*/ - "movq %%mm7,(%[dst],%[ystride])\n\t" - /*Advance dst.*/ - "lea (%[dst],%[ystride],2),%[dst]\n\t" - 
:[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src) - :[ystride]"r"((ptrdiff_t)_ystride) - :"memory" - ); - } -} - -void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1, - const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){ - int i; - /*Zero mm7.*/ - __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::); - for(i=4;i-->0;){ - __asm__ __volatile__( - /*#0 Load src1.*/ - "movq (%[src1]),%%mm0\n\t" - /*#0 Load src2.*/ - "movq (%[src2]),%%mm2\n\t" - /*#0 Copy src1.*/ - "movq %%mm0,%%mm1\n\t" - /*#0 Copy src2.*/ - "movq %%mm2,%%mm3\n\t" - /*#1 Load src1.*/ - "movq (%[src1],%[ystride]),%%mm4\n\t" - /*#0 Unpack lower src1.*/ - "punpcklbw %%mm7,%%mm0\n\t" - /*#1 Load src2.*/ - "movq (%[src2],%[ystride]),%%mm5\n\t" - /*#0 Unpack higher src1.*/ - "punpckhbw %%mm7,%%mm1\n\t" - /*#0 Unpack lower src2.*/ - "punpcklbw %%mm7,%%mm2\n\t" - /*#0 Unpack higher src2.*/ - "punpckhbw %%mm7,%%mm3\n\t" - /*Advance src1 ptr.*/ - "lea (%[src1],%[ystride],2),%[src1]\n\t" - /*Advance src2 ptr.*/ - "lea (%[src2],%[ystride],2),%[src2]\n\t" - /*#0 Lower src1+src2.*/ - "paddsw %%mm2,%%mm0\n\t" - /*#0 Higher src1+src2.*/ - "paddsw %%mm3,%%mm1\n\t" - /*#1 Copy src1.*/ - "movq %%mm4,%%mm2\n\t" - /*#0 Build lo average.*/ - "psraw $1,%%mm0\n\t" - /*#1 Copy src2.*/ - "movq %%mm5,%%mm3\n\t" - /*#1 Unpack lower src1.*/ - "punpcklbw %%mm7,%%mm4\n\t" - /*#0 Build hi average.*/ - "psraw $1,%%mm1\n\t" - /*#1 Unpack higher src1.*/ - "punpckhbw %%mm7,%%mm2\n\t" - /*#0 low+=residue.*/ - "paddsw (%[residue]),%%mm0\n\t" - /*#1 Unpack lower src2.*/ - "punpcklbw %%mm7,%%mm5\n\t" - /*#0 high+=residue.*/ - "paddsw 8(%[residue]),%%mm1\n\t" - /*#1 Unpack higher src2.*/ - "punpckhbw %%mm7,%%mm3\n\t" - /*#1 Lower src1+src2.*/ - "paddsw %%mm4,%%mm5\n\t" - /*#0 Pack and saturate.*/ - "packuswb %%mm1,%%mm0\n\t" - /*#1 Higher src1+src2.*/ - "paddsw %%mm2,%%mm3\n\t" - /*#0 Write row.*/ - "movq %%mm0,(%[dst])\n\t" - /*#1 Build lo average.*/ - "psraw $1,%%mm5\n\t" - /*#1 Build hi 
average.*/ - "psraw $1,%%mm3\n\t" - /*#1 low+=residue.*/ - "paddsw 16(%[residue]),%%mm5\n\t" - /*#1 high+=residue.*/ - "paddsw 24(%[residue]),%%mm3\n\t" - /*#1 Pack and saturate.*/ - "packuswb %%mm3,%%mm5\n\t" - /*#1 Write row ptr.*/ - "movq %%mm5,(%[dst],%[ystride])\n\t" - /*Advance residue ptr.*/ - "add $32,%[residue]\n\t" - /*Advance dest ptr.*/ - "lea (%[dst],%[ystride],2),%[dst]\n\t" - :[dst]"+r"(_dst),[residue]"+r"(_residue), - [src1]"+%r"(_src1),[src2]"+r"(_src2) - :[ystride]"r"((ptrdiff_t)_ystride) - :"memory" - ); - } -} - -void oc_restore_fpu_mmx(void){ - __asm__ __volatile__("emms\n\t"); -} -#endif diff --git a/media/libtheora/lib/x86/mmxidct.c b/media/libtheora/lib/x86/mmxidct.c deleted file mode 100644 index 8d61bdfb1..000000000 --- a/media/libtheora/lib/x86/mmxidct.c +++ /dev/null @@ -1,562 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $ - - ********************************************************************/ - -/*MMX acceleration of Theora's iDCT. - Originally written by Rudolf Marek, based on code from On2's VP3.*/ -#include "x86int.h" -#include "../dct.h" - -#if defined(OC_X86_ASM) - -/*These are offsets into the table of constants below.*/ -/*7 rows of cosines, in order: pi/16 * (1 ... 
7).*/ -#define OC_COSINE_OFFSET (0) -/*A row of 8's.*/ -#define OC_EIGHT_OFFSET (56) - - - -/*38 cycles*/ -#define OC_IDCT_BEGIN(_y,_x) \ - "#OC_IDCT_BEGIN\n\t" \ - "movq "OC_I(3,_x)",%%mm2\n\t" \ - "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \ - "movq %%mm2,%%mm4\n\t" \ - "movq "OC_J(5,_x)",%%mm7\n\t" \ - "pmulhw %%mm6,%%mm4\n\t" \ - "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \ - "pmulhw %%mm7,%%mm6\n\t" \ - "movq %%mm1,%%mm5\n\t" \ - "pmulhw %%mm2,%%mm1\n\t" \ - "movq "OC_I(1,_x)",%%mm3\n\t" \ - "pmulhw %%mm7,%%mm5\n\t" \ - "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \ - "paddw %%mm2,%%mm4\n\t" \ - "paddw %%mm7,%%mm6\n\t" \ - "paddw %%mm1,%%mm2\n\t" \ - "movq "OC_J(7,_x)",%%mm1\n\t" \ - "paddw %%mm5,%%mm7\n\t" \ - "movq %%mm0,%%mm5\n\t" \ - "pmulhw %%mm3,%%mm0\n\t" \ - "paddw %%mm7,%%mm4\n\t" \ - "pmulhw %%mm1,%%mm5\n\t" \ - "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \ - "psubw %%mm2,%%mm6\n\t" \ - "paddw %%mm3,%%mm0\n\t" \ - "pmulhw %%mm7,%%mm3\n\t" \ - "movq "OC_I(2,_x)",%%mm2\n\t" \ - "pmulhw %%mm1,%%mm7\n\t" \ - "paddw %%mm1,%%mm5\n\t" \ - "movq %%mm2,%%mm1\n\t" \ - "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \ - "psubw %%mm5,%%mm3\n\t" \ - "movq "OC_J(6,_x)",%%mm5\n\t" \ - "paddw %%mm7,%%mm0\n\t" \ - "movq %%mm5,%%mm7\n\t" \ - "psubw %%mm4,%%mm0\n\t" \ - "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \ - "paddw %%mm1,%%mm2\n\t" \ - "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \ - "paddw %%mm4,%%mm4\n\t" \ - "paddw %%mm0,%%mm4\n\t" \ - "psubw %%mm6,%%mm3\n\t" \ - "paddw %%mm7,%%mm5\n\t" \ - "paddw %%mm6,%%mm6\n\t" \ - "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \ - "paddw %%mm3,%%mm6\n\t" \ - "movq %%mm4,"OC_I(1,_y)"\n\t" \ - "psubw %%mm5,%%mm1\n\t" \ - "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ - "movq %%mm3,%%mm5\n\t" \ - "pmulhw %%mm4,%%mm3\n\t" \ - "paddw %%mm2,%%mm7\n\t" \ - "movq %%mm6,"OC_I(2,_y)"\n\t" \ - "movq %%mm0,%%mm2\n\t" \ - "movq "OC_I(0,_x)",%%mm6\n\t" \ - "pmulhw %%mm4,%%mm0\n\t" \ - "paddw %%mm3,%%mm5\n\t" \ - "movq "OC_J(4,_x)",%%mm3\n\t" \ - "psubw %%mm1,%%mm5\n\t" \ - 
"paddw %%mm0,%%mm2\n\t" \ - "psubw %%mm3,%%mm6\n\t" \ - "movq %%mm6,%%mm0\n\t" \ - "pmulhw %%mm4,%%mm6\n\t" \ - "paddw %%mm3,%%mm3\n\t" \ - "paddw %%mm1,%%mm1\n\t" \ - "paddw %%mm0,%%mm3\n\t" \ - "paddw %%mm5,%%mm1\n\t" \ - "pmulhw %%mm3,%%mm4\n\t" \ - "paddw %%mm0,%%mm6\n\t" \ - "psubw %%mm2,%%mm6\n\t" \ - "paddw %%mm2,%%mm2\n\t" \ - "movq "OC_I(1,_y)",%%mm0\n\t" \ - "paddw %%mm6,%%mm2\n\t" \ - "paddw %%mm3,%%mm4\n\t" \ - "psubw %%mm1,%%mm2\n\t" \ - "#end OC_IDCT_BEGIN\n\t" \ - -/*38+8=46 cycles.*/ -#define OC_ROW_IDCT(_y,_x) \ - "#OC_ROW_IDCT\n" \ - OC_IDCT_BEGIN(_y,_x) \ - /*r3=D'*/ \ - "movq "OC_I(2,_y)",%%mm3\n\t" \ - /*r4=E'=E-G*/ \ - "psubw %%mm7,%%mm4\n\t" \ - /*r1=H'+H'*/ \ - "paddw %%mm1,%%mm1\n\t" \ - /*r7=G+G*/ \ - "paddw %%mm7,%%mm7\n\t" \ - /*r1=R1=A''+H'*/ \ - "paddw %%mm2,%%mm1\n\t" \ - /*r7=G'=E+G*/ \ - "paddw %%mm4,%%mm7\n\t" \ - /*r4=R4=E'-D'*/ \ - "psubw %%mm3,%%mm4\n\t" \ - "paddw %%mm3,%%mm3\n\t" \ - /*r6=R6=F'-B''*/ \ - "psubw %%mm5,%%mm6\n\t" \ - "paddw %%mm5,%%mm5\n\t" \ - /*r3=R3=E'+D'*/ \ - "paddw %%mm4,%%mm3\n\t" \ - /*r5=R5=F'+B''*/ \ - "paddw %%mm6,%%mm5\n\t" \ - /*r7=R7=G'-C'*/ \ - "psubw %%mm0,%%mm7\n\t" \ - "paddw %%mm0,%%mm0\n\t" \ - /*Save R1.*/ \ - "movq %%mm1,"OC_I(1,_y)"\n\t" \ - /*r0=R0=G.+C.*/ \ - "paddw %%mm7,%%mm0\n\t" \ - "#end OC_ROW_IDCT\n\t" \ - -/*The following macro does two 4x4 transposes in place. - At entry, we assume: - r0 = a3 a2 a1 a0 - I(1) = b3 b2 b1 b0 - r2 = c3 c2 c1 c0 - r3 = d3 d2 d1 d0 - - r4 = e3 e2 e1 e0 - r5 = f3 f2 f1 f0 - r6 = g3 g2 g1 g0 - r7 = h3 h2 h1 h0 - - At exit, we have: - I(0) = d0 c0 b0 a0 - I(1) = d1 c1 b1 a1 - I(2) = d2 c2 b2 a2 - I(3) = d3 c3 b3 a3 - - J(4) = h0 g0 f0 e0 - J(5) = h1 g1 f1 e1 - J(6) = h2 g2 f2 e2 - J(7) = h3 g3 f3 e3 - - I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. - J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. 
- - Since r1 is free at entry, we calculate the Js first.*/ -/*19 cycles.*/ -#define OC_TRANSPOSE(_y) \ - "#OC_TRANSPOSE\n\t" \ - "movq %%mm4,%%mm1\n\t" \ - "punpcklwd %%mm5,%%mm4\n\t" \ - "movq %%mm0,"OC_I(0,_y)"\n\t" \ - "punpckhwd %%mm5,%%mm1\n\t" \ - "movq %%mm6,%%mm0\n\t" \ - "punpcklwd %%mm7,%%mm6\n\t" \ - "movq %%mm4,%%mm5\n\t" \ - "punpckldq %%mm6,%%mm4\n\t" \ - "punpckhdq %%mm6,%%mm5\n\t" \ - "movq %%mm1,%%mm6\n\t" \ - "movq %%mm4,"OC_J(4,_y)"\n\t" \ - "punpckhwd %%mm7,%%mm0\n\t" \ - "movq %%mm5,"OC_J(5,_y)"\n\t" \ - "punpckhdq %%mm0,%%mm6\n\t" \ - "movq "OC_I(0,_y)",%%mm4\n\t" \ - "punpckldq %%mm0,%%mm1\n\t" \ - "movq "OC_I(1,_y)",%%mm5\n\t" \ - "movq %%mm4,%%mm0\n\t" \ - "movq %%mm6,"OC_J(7,_y)"\n\t" \ - "punpcklwd %%mm5,%%mm0\n\t" \ - "movq %%mm1,"OC_J(6,_y)"\n\t" \ - "punpckhwd %%mm5,%%mm4\n\t" \ - "movq %%mm2,%%mm5\n\t" \ - "punpcklwd %%mm3,%%mm2\n\t" \ - "movq %%mm0,%%mm1\n\t" \ - "punpckldq %%mm2,%%mm0\n\t" \ - "punpckhdq %%mm2,%%mm1\n\t" \ - "movq %%mm4,%%mm2\n\t" \ - "movq %%mm0,"OC_I(0,_y)"\n\t" \ - "punpckhwd %%mm3,%%mm5\n\t" \ - "movq %%mm1,"OC_I(1,_y)"\n\t" \ - "punpckhdq %%mm5,%%mm4\n\t" \ - "punpckldq %%mm5,%%mm2\n\t" \ - "movq %%mm4,"OC_I(3,_y)"\n\t" \ - "movq %%mm2,"OC_I(2,_y)"\n\t" \ - "#end OC_TRANSPOSE\n\t" \ - -/*38+19=57 cycles.*/ -#define OC_COLUMN_IDCT(_y) \ - "#OC_COLUMN_IDCT\n" \ - OC_IDCT_BEGIN(_y,_y) \ - "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \ - /*r1=H'+H'*/ \ - "paddw %%mm1,%%mm1\n\t" \ - /*r1=R1=A''+H'*/ \ - "paddw %%mm2,%%mm1\n\t" \ - /*r2=NR2*/ \ - "psraw $4,%%mm2\n\t" \ - /*r4=E'=E-G*/ \ - "psubw %%mm7,%%mm4\n\t" \ - /*r1=NR1*/ \ - "psraw $4,%%mm1\n\t" \ - /*r3=D'*/ \ - "movq "OC_I(2,_y)",%%mm3\n\t" \ - /*r7=G+G*/ \ - "paddw %%mm7,%%mm7\n\t" \ - /*Store NR2 at I(2).*/ \ - "movq %%mm2,"OC_I(2,_y)"\n\t" \ - /*r7=G'=E+G*/ \ - "paddw %%mm4,%%mm7\n\t" \ - /*Store NR1 at I(1).*/ \ - "movq %%mm1,"OC_I(1,_y)"\n\t" \ - /*r4=R4=E'-D'*/ \ - "psubw %%mm3,%%mm4\n\t" \ - "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \ - /*r3=D'+D'*/ \ - 
"paddw %%mm3,%%mm3\n\t" \ - /*r3=R3=E'+D'*/ \ - "paddw %%mm4,%%mm3\n\t" \ - /*r4=NR4*/ \ - "psraw $4,%%mm4\n\t" \ - /*r6=R6=F'-B''*/ \ - "psubw %%mm5,%%mm6\n\t" \ - /*r3=NR3*/ \ - "psraw $4,%%mm3\n\t" \ - "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \ - /*r5=B''+B''*/ \ - "paddw %%mm5,%%mm5\n\t" \ - /*r5=R5=F'+B''*/ \ - "paddw %%mm6,%%mm5\n\t" \ - /*r6=NR6*/ \ - "psraw $4,%%mm6\n\t" \ - /*Store NR4 at J(4).*/ \ - "movq %%mm4,"OC_J(4,_y)"\n\t" \ - /*r5=NR5*/ \ - "psraw $4,%%mm5\n\t" \ - /*Store NR3 at I(3).*/ \ - "movq %%mm3,"OC_I(3,_y)"\n\t" \ - /*r7=R7=G'-C'*/ \ - "psubw %%mm0,%%mm7\n\t" \ - "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \ - /*r0=C'+C'*/ \ - "paddw %%mm0,%%mm0\n\t" \ - /*r0=R0=G'+C'*/ \ - "paddw %%mm7,%%mm0\n\t" \ - /*r7=NR7*/ \ - "psraw $4,%%mm7\n\t" \ - /*Store NR6 at J(6).*/ \ - "movq %%mm6,"OC_J(6,_y)"\n\t" \ - /*r0=NR0*/ \ - "psraw $4,%%mm0\n\t" \ - /*Store NR5 at J(5).*/ \ - "movq %%mm5,"OC_J(5,_y)"\n\t" \ - /*Store NR7 at J(7).*/ \ - "movq %%mm7,"OC_J(7,_y)"\n\t" \ - /*Store NR0 at I(0).*/ \ - "movq %%mm0,"OC_I(0,_y)"\n\t" \ - "#end OC_COLUMN_IDCT\n\t" \ - -static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ - /*This routine accepts an 8x8 matrix, but in partially transposed form. 
- Every 4x4 block is transposed.*/ - __asm__ __volatile__( -#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) -#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) - OC_ROW_IDCT(y,x) - OC_TRANSPOSE(y) -#undef OC_I -#undef OC_J -#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y) -#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y) - OC_ROW_IDCT(y,x) - OC_TRANSPOSE(y) -#undef OC_I -#undef OC_J -#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) -#define OC_J(_k,_y) OC_I(_k,_y) - OC_COLUMN_IDCT(y) -#undef OC_I -#undef OC_J -#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) -#define OC_J(_k,_y) OC_I(_k,_y) - OC_COLUMN_IDCT(y) -#undef OC_I -#undef OC_J - :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64) - :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), - [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) - ); - if(_x!=_y){ - int i; - __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::); - for(i=0;i<4;i++){ - __asm__ __volatile__( - "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" - "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t" - "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" - "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t" - :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16) - ); - } - } -} - -/*25 cycles.*/ -#define OC_IDCT_BEGIN_10(_y,_x) \ - "#OC_IDCT_BEGIN_10\n\t" \ - "movq "OC_I(3,_x)",%%mm2\n\t" \ - "nop\n\t" \ - "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \ - "movq %%mm2,%%mm4\n\t" \ - "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \ - "pmulhw %%mm6,%%mm4\n\t" \ - "movq "OC_I(1,_x)",%%mm3\n\t" \ - "pmulhw %%mm2,%%mm1\n\t" \ - "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \ - "paddw %%mm2,%%mm4\n\t" \ - "pxor %%mm6,%%mm6\n\t" \ - "paddw %%mm1,%%mm2\n\t" \ - "movq "OC_I(2,_x)",%%mm5\n\t" \ - "pmulhw %%mm3,%%mm0\n\t" \ - "movq %%mm5,%%mm1\n\t" \ - "paddw %%mm3,%%mm0\n\t" \ - "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \ - "psubw %%mm2,%%mm6\n\t" \ - "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \ - "psubw %%mm4,%%mm0\n\t" \ - "movq "OC_I(2,_x)",%%mm7\n\t" \ - "paddw %%mm4,%%mm4\n\t" \ - "paddw %%mm5,%%mm7\n\t" \ - "paddw %%mm0,%%mm4\n\t" \ - "pmulhw 
"OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \ - "psubw %%mm6,%%mm3\n\t" \ - "movq %%mm4,"OC_I(1,_y)"\n\t" \ - "paddw %%mm6,%%mm6\n\t" \ - "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ - "paddw %%mm3,%%mm6\n\t" \ - "movq %%mm3,%%mm5\n\t" \ - "pmulhw %%mm4,%%mm3\n\t" \ - "movq %%mm6,"OC_I(2,_y)"\n\t" \ - "movq %%mm0,%%mm2\n\t" \ - "movq "OC_I(0,_x)",%%mm6\n\t" \ - "pmulhw %%mm4,%%mm0\n\t" \ - "paddw %%mm3,%%mm5\n\t" \ - "paddw %%mm0,%%mm2\n\t" \ - "psubw %%mm1,%%mm5\n\t" \ - "pmulhw %%mm4,%%mm6\n\t" \ - "paddw "OC_I(0,_x)",%%mm6\n\t" \ - "paddw %%mm1,%%mm1\n\t" \ - "movq %%mm6,%%mm4\n\t" \ - "paddw %%mm5,%%mm1\n\t" \ - "psubw %%mm2,%%mm6\n\t" \ - "paddw %%mm2,%%mm2\n\t" \ - "movq "OC_I(1,_y)",%%mm0\n\t" \ - "paddw %%mm6,%%mm2\n\t" \ - "psubw %%mm1,%%mm2\n\t" \ - "nop\n\t" \ - "#end OC_IDCT_BEGIN_10\n\t" \ - -/*25+8=33 cycles.*/ -#define OC_ROW_IDCT_10(_y,_x) \ - "#OC_ROW_IDCT_10\n\t" \ - OC_IDCT_BEGIN_10(_y,_x) \ - /*r3=D'*/ \ - "movq "OC_I(2,_y)",%%mm3\n\t" \ - /*r4=E'=E-G*/ \ - "psubw %%mm7,%%mm4\n\t" \ - /*r1=H'+H'*/ \ - "paddw %%mm1,%%mm1\n\t" \ - /*r7=G+G*/ \ - "paddw %%mm7,%%mm7\n\t" \ - /*r1=R1=A''+H'*/ \ - "paddw %%mm2,%%mm1\n\t" \ - /*r7=G'=E+G*/ \ - "paddw %%mm4,%%mm7\n\t" \ - /*r4=R4=E'-D'*/ \ - "psubw %%mm3,%%mm4\n\t" \ - "paddw %%mm3,%%mm3\n\t" \ - /*r6=R6=F'-B''*/ \ - "psubw %%mm5,%%mm6\n\t" \ - "paddw %%mm5,%%mm5\n\t" \ - /*r3=R3=E'+D'*/ \ - "paddw %%mm4,%%mm3\n\t" \ - /*r5=R5=F'+B''*/ \ - "paddw %%mm6,%%mm5\n\t" \ - /*r7=R7=G'-C'*/ \ - "psubw %%mm0,%%mm7\n\t" \ - "paddw %%mm0,%%mm0\n\t" \ - /*Save R1.*/ \ - "movq %%mm1,"OC_I(1,_y)"\n\t" \ - /*r0=R0=G'+C'*/ \ - "paddw %%mm7,%%mm0\n\t" \ - "#end OC_ROW_IDCT_10\n\t" \ - -/*25+19=44 cycles'*/ -#define OC_COLUMN_IDCT_10(_y) \ - "#OC_COLUMN_IDCT_10\n\t" \ - OC_IDCT_BEGIN_10(_y,_y) \ - "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \ - /*r1=H'+H'*/ \ - "paddw %%mm1,%%mm1\n\t" \ - /*r1=R1=A''+H'*/ \ - "paddw %%mm2,%%mm1\n\t" \ - /*r2=NR2*/ \ - "psraw $4,%%mm2\n\t" \ - /*r4=E'=E-G*/ \ - "psubw %%mm7,%%mm4\n\t" \ - /*r1=NR1*/ \ - 
"psraw $4,%%mm1\n\t" \ - /*r3=D'*/ \ - "movq "OC_I(2,_y)",%%mm3\n\t" \ - /*r7=G+G*/ \ - "paddw %%mm7,%%mm7\n\t" \ - /*Store NR2 at I(2).*/ \ - "movq %%mm2,"OC_I(2,_y)"\n\t" \ - /*r7=G'=E+G*/ \ - "paddw %%mm4,%%mm7\n\t" \ - /*Store NR1 at I(1).*/ \ - "movq %%mm1,"OC_I(1,_y)"\n\t" \ - /*r4=R4=E'-D'*/ \ - "psubw %%mm3,%%mm4\n\t" \ - "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \ - /*r3=D'+D'*/ \ - "paddw %%mm3,%%mm3\n\t" \ - /*r3=R3=E'+D'*/ \ - "paddw %%mm4,%%mm3\n\t" \ - /*r4=NR4*/ \ - "psraw $4,%%mm4\n\t" \ - /*r6=R6=F'-B''*/ \ - "psubw %%mm5,%%mm6\n\t" \ - /*r3=NR3*/ \ - "psraw $4,%%mm3\n\t" \ - "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \ - /*r5=B''+B''*/ \ - "paddw %%mm5,%%mm5\n\t" \ - /*r5=R5=F'+B''*/ \ - "paddw %%mm6,%%mm5\n\t" \ - /*r6=NR6*/ \ - "psraw $4,%%mm6\n\t" \ - /*Store NR4 at J(4).*/ \ - "movq %%mm4,"OC_J(4,_y)"\n\t" \ - /*r5=NR5*/ \ - "psraw $4,%%mm5\n\t" \ - /*Store NR3 at I(3).*/ \ - "movq %%mm3,"OC_I(3,_y)"\n\t" \ - /*r7=R7=G'-C'*/ \ - "psubw %%mm0,%%mm7\n\t" \ - "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \ - /*r0=C'+C'*/ \ - "paddw %%mm0,%%mm0\n\t" \ - /*r0=R0=G'+C'*/ \ - "paddw %%mm7,%%mm0\n\t" \ - /*r7=NR7*/ \ - "psraw $4,%%mm7\n\t" \ - /*Store NR6 at J(6).*/ \ - "movq %%mm6,"OC_J(6,_y)"\n\t" \ - /*r0=NR0*/ \ - "psraw $4,%%mm0\n\t" \ - /*Store NR5 at J(5).*/ \ - "movq %%mm5,"OC_J(5,_y)"\n\t" \ - /*Store NR7 at J(7).*/ \ - "movq %%mm7,"OC_J(7,_y)"\n\t" \ - /*Store NR0 at I(0).*/ \ - "movq %%mm0,"OC_I(0,_y)"\n\t" \ - "#end OC_COLUMN_IDCT_10\n\t" \ - -static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ - __asm__ __volatile__( -#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) -#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) - /*Done with dequant, descramble, and partial transpose. 
- Now do the iDCT itself.*/ - OC_ROW_IDCT_10(y,x) - OC_TRANSPOSE(y) -#undef OC_I -#undef OC_J -#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) -#define OC_J(_k,_y) OC_I(_k,_y) - OC_COLUMN_IDCT_10(y) -#undef OC_I -#undef OC_J -#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) -#define OC_J(_k,_y) OC_I(_k,_y) - OC_COLUMN_IDCT_10(y) -#undef OC_I -#undef OC_J - :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64) - :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), - [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) - ); - if(_x!=_y){ - __asm__ __volatile__( - "pxor %%mm0,%%mm0\n\t" - "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" - "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" - "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" - "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" - :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28) - ); - } -} - -/*Performs an inverse 8x8 Type-II DCT transform. - The input is assumed to be scaled by a factor of 4 relative to orthonormal - version of the transform.*/ -void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ - /*_last_zzi is subtly different from an actual count of the number of - coefficients we decoded for this block. - It contains the value of zzi BEFORE the final token in the block was - decoded. - In most cases this is an EOB token (the continuation of an EOB run from a - previous block counts), and so this is the same as the coefficient count. - However, in the case that the last token was NOT an EOB token, but filled - the block up with exactly 64 coefficients, _last_zzi will be less than 64. - Provided the last token was not a pure zero run, the minimum value it can - be is 46, and so that doesn't affect any of the cases in this routine. - However, if the last token WAS a pure zero run of length 63, then _last_zzi - will be 1 while the number of coefficients decoded is 64. - Thus, we will trigger the following special case, where the real - coefficient count would not. 
- Note also that a zero run of length 64 will give _last_zzi a value of 0, - but we still process the DC coefficient, which might have a non-zero value - due to DC prediction. - Although convoluted, this is arguably the correct behavior: it allows us to - use a smaller transform when the block ends with a long zero run instead - of a normal EOB token. - It could be smarter... multiple separate zero runs at the end of a block - will fool it, but an encoder that generates these really deserves what it - gets. - Needless to say we inherited this approach from VP3.*/ - /*Then perform the iDCT.*/ - if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x); - else oc_idct8x8_slow_mmx(_y,_x); -} - -#endif diff --git a/media/libtheora/lib/x86/mmxloop.h b/media/libtheora/lib/x86/mmxloop.h deleted file mode 100644 index 1f6090b56..000000000 --- a/media/libtheora/lib/x86/mmxloop.h +++ /dev/null @@ -1,318 +0,0 @@ -#if !defined(_x86_mmxloop_H) -# define _x86_mmxloop_H (1) -# include <stddef.h> -# include "x86int.h" - -#if defined(OC_X86_ASM) - -/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}. 
- On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and - mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/ -#define OC_LOOP_FILTER8_MMX \ - "#OC_LOOP_FILTER8_MMX\n\t" \ - /*mm7=0*/ \ - "pxor %%mm7,%%mm7\n\t" \ - /*mm6:mm0={a0,...,a7}*/ \ - "movq %%mm0,%%mm6\n\t" \ - "punpcklbw %%mm7,%%mm0\n\t" \ - "punpckhbw %%mm7,%%mm6\n\t" \ - /*mm3:mm5={d0,...,d7}*/ \ - "movq %%mm3,%%mm5\n\t" \ - "punpcklbw %%mm7,%%mm3\n\t" \ - "punpckhbw %%mm7,%%mm5\n\t" \ - /*mm6:mm0={a0-d0,...,a7-d7}*/ \ - "psubw %%mm3,%%mm0\n\t" \ - "psubw %%mm5,%%mm6\n\t" \ - /*mm3:mm1={b0,...,b7}*/ \ - "movq %%mm1,%%mm3\n\t" \ - "punpcklbw %%mm7,%%mm1\n\t" \ - "movq %%mm2,%%mm4\n\t" \ - "punpckhbw %%mm7,%%mm3\n\t" \ - /*mm5:mm4={c0,...,c7}*/ \ - "movq %%mm2,%%mm5\n\t" \ - "punpcklbw %%mm7,%%mm4\n\t" \ - "punpckhbw %%mm7,%%mm5\n\t" \ - /*mm7={3}x4 \ - mm5:mm4={c0-b0,...,c7-b7}*/ \ - "pcmpeqw %%mm7,%%mm7\n\t" \ - "psubw %%mm1,%%mm4\n\t" \ - "psrlw $14,%%mm7\n\t" \ - "psubw %%mm3,%%mm5\n\t" \ - /*Scale by 3.*/ \ - "pmullw %%mm7,%%mm4\n\t" \ - "pmullw %%mm7,%%mm5\n\t" \ - /*mm7={4}x4 \ - mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \ - "psrlw $1,%%mm7\n\t" \ - "paddw %%mm0,%%mm4\n\t" \ - "psllw $2,%%mm7\n\t" \ - "movq (%[ll]),%%mm0\n\t" \ - "paddw %%mm6,%%mm5\n\t" \ - /*R_i has the range [-127,128], so we compute -R_i instead. \ - mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \ - "psubw %%mm7,%%mm4\n\t" \ - "psubw %%mm7,%%mm5\n\t" \ - "psraw $3,%%mm4\n\t" \ - "psraw $3,%%mm5\n\t" \ - "pcmpeqb %%mm7,%%mm7\n\t" \ - "packsswb %%mm5,%%mm4\n\t" \ - "pxor %%mm6,%%mm6\n\t" \ - "pxor %%mm7,%%mm4\n\t" \ - "packuswb %%mm3,%%mm1\n\t" \ - /*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \ - /*There's no unsigned byte+signed byte with unsigned saturation op code, so \ - we have to split things by sign (the other option is to work in 16 bits, \ - but working in 8 bits gives much better parallelism). \ - We compute abs(R_i), but save a mask of which terms were negative in mm6. 
\ - Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \ - Finally, we split mm4 into positive and negative pieces using the mask in \ - mm6, and add and subtract them as appropriate.*/ \ - /*mm4=abs(-R_i)*/ \ - /*mm7=255-2*L*/ \ - "pcmpgtb %%mm4,%%mm6\n\t" \ - "psubb %%mm0,%%mm7\n\t" \ - "pxor %%mm6,%%mm4\n\t" \ - "psubb %%mm0,%%mm7\n\t" \ - "psubb %%mm6,%%mm4\n\t" \ - /*mm7=255-max(2*L-abs(R_i),0)*/ \ - "paddusb %%mm4,%%mm7\n\t" \ - /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \ - "paddusb %%mm7,%%mm4\n\t" \ - "psubusb %%mm7,%%mm4\n\t" \ - /*Now split mm4 by the original sign of -R_i.*/ \ - "movq %%mm4,%%mm5\n\t" \ - "pand %%mm6,%%mm4\n\t" \ - "pandn %%mm5,%%mm6\n\t" \ - /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \ - /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \ - "paddusb %%mm4,%%mm1\n\t" \ - "psubusb %%mm4,%%mm2\n\t" \ - "psubusb %%mm6,%%mm1\n\t" \ - "paddusb %%mm6,%%mm2\n\t" \ - -/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}. - On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and - mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}. - All other MMX registers are clobbered.*/ -#define OC_LOOP_FILTER8_MMXEXT \ - "#OC_LOOP_FILTER8_MMXEXT\n\t" \ - /*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \ - -R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \ - /*This first part is based on the transformation \ - f = -(3*(c-b)+a-d+4>>3) \ - = -(3*(c+255-b)+(a+255-d)+4-1020>>3) \ - = -(3*(c+~b)+(a+~d)-1016>>3) \ - = 127-(3*(c+~b)+(a+~d)>>3) \ - = 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \ - Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \ - fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). 
\ - Using this, the last expression above can be computed in 8 bits of working \ - precision via: \ - u = ~pavgb(~b,c); \ - v = pavgb(b,~c); \ - This mask is 0 or 0xFF, and controls whether t is biased up or down: \ - m = u-v; \ - t = m^pavgb(m^~a,m^d); \ - f = 128+pavgb(pavgb(t,u),v); \ - This required some careful analysis to ensure that carries are propagated \ - correctly in all cases, but has been checked exhaustively.*/ \ - /*input (a, b, c, d, ., ., ., .)*/ \ - /*ff=0xFF; \ - u=b; \ - v=c; \ - ll=255-2*L;*/ \ - "pcmpeqb %%mm7,%%mm7\n\t" \ - "movq %%mm1,%%mm4\n\t" \ - "movq %%mm2,%%mm5\n\t" \ - "movq (%[ll]),%%mm6\n\t" \ - /*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \ - /*u^=ff; \ - v^=ff;*/ \ - "pxor %%mm7,%%mm4\n\t" \ - "pxor %%mm7,%%mm5\n\t" \ - /*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \ - /*u=pavgb(u,c); \ - v=pavgb(v,b);*/ \ - "pavgb %%mm2,%%mm4\n\t" \ - "pavgb %%mm1,%%mm5\n\t" \ - /*u^=ff; \ - a^=ff;*/ \ - "pxor %%mm7,%%mm4\n\t" \ - "pxor %%mm7,%%mm0\n\t" \ - /*m=u-v;*/ \ - "psubb %%mm5,%%mm4\n\t" \ - /*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \ - /*a^=m; \ - d^=m;*/ \ - "pxor %%mm4,%%mm0\n\t" \ - "pxor %%mm4,%%mm3\n\t" \ - /*t=pavgb(a,d);*/ \ - "pavgb %%mm3,%%mm0\n\t" \ - "psllw $7,%%mm7\n\t" \ - /*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \ - /*t^=m; \ - u=m+v;*/ \ - "pxor %%mm4,%%mm0\n\t" \ - "paddb %%mm5,%%mm4\n\t" \ - /*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \ - /*f=pavgb(f,u); \ - of=128;*/ \ - "pavgb %%mm4,%%mm0\n\t" \ - "packsswb %%mm7,%%mm7\n\t" \ - /*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \ - /*f=pavgb(f,v);*/ \ - "pavgb %%mm5,%%mm0\n\t" \ - "movq %%mm7,%%mm3\n\t" \ - "movq %%mm6,%%mm4\n\t" \ - /*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \ - /*Now compute lflim of R_i=-(128+mm0) cf. 
Section 7.10 of the sepc.*/ \ - /*There's no unsigned byte+signed byte with unsigned saturation op code, so \ - we have to split things by sign (the other option is to work in 16 bits, \ - but staying in 8 bits gives much better parallelism).*/ \ - /*Instead of adding the offset of 128 in mm3, we use it to split mm0. \ - This is the same number of instructions as computing a mask and splitting \ - after the lflim computation, but has shorter dependency chains.*/ \ - /*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0))\ - mm3=R_i>0?R_i:0* (denoted abs(R_i>0))*/ \ - "psubusb %%mm0,%%mm3\n\t" \ - "psubusb %%mm7,%%mm0\n\t" \ - /*mm6=255-max(2*L-abs(R_i<0),0) \ - mm4=255-max(2*L-abs(R_i>0),0)*/ \ - "paddusb %%mm3,%%mm4\n\t" \ - "paddusb %%mm0,%%mm6\n\t" \ - /*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \ - mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \ - "paddusb %%mm4,%%mm3\n\t" \ - "paddusb %%mm6,%%mm0\n\t" \ - "psubusb %%mm4,%%mm3\n\t" \ - "psubusb %%mm6,%%mm0\n\t" \ - /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \ - /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \ - "paddusb %%mm3,%%mm1\n\t" \ - "psubusb %%mm3,%%mm2\n\t" \ - "psubusb %%mm0,%%mm1\n\t" \ - "paddusb %%mm0,%%mm2\n\t" \ - -#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \ - do{ \ - ptrdiff_t ystride3__; \ - __asm__ __volatile__( \ - /*mm0={a0,...,a7}*/ \ - "movq (%[pix]),%%mm0\n\t" \ - /*ystride3=_ystride*3*/ \ - "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \ - /*mm3={d0,...,d7}*/ \ - "movq (%[pix],%[ystride3]),%%mm3\n\t" \ - /*mm1={b0,...,b7}*/ \ - "movq (%[pix],%[ystride]),%%mm1\n\t" \ - /*mm2={c0,...,c7}*/ \ - "movq (%[pix],%[ystride],2),%%mm2\n\t" \ - _filter \ - /*Write it back out.*/ \ - "movq %%mm1,(%[pix],%[ystride])\n\t" \ - "movq %%mm2,(%[pix],%[ystride],2)\n\t" \ - :[ystride3]"=&r"(ystride3__) \ - :[pix]"r"(_pix-_ystride*2),[ystride]"r"((ptrdiff_t)(_ystride)), \ - [ll]"r"(_ll) \ - :"memory" \ - ); \ - } \ - while(0) - -#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \ - do{ \ - unsigned char 
*pix__; \ - ptrdiff_t ystride3__; \ - ptrdiff_t d__; \ - pix__=(_pix)-2; \ - __asm__ __volatile__( \ - /*x x x x d0 c0 b0 a0*/ \ - "movd (%[pix]),%%mm0\n\t" \ - /*x x x x d1 c1 b1 a1*/ \ - "movd (%[pix],%[ystride]),%%mm1\n\t" \ - /*ystride3=_ystride*3*/ \ - "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \ - /*x x x x d2 c2 b2 a2*/ \ - "movd (%[pix],%[ystride],2),%%mm2\n\t" \ - /*x x x x d3 c3 b3 a3*/ \ - "lea (%[pix],%[ystride],4),%[d]\n\t" \ - "movd (%[pix],%[ystride3]),%%mm3\n\t" \ - /*x x x x d4 c4 b4 a4*/ \ - "movd (%[d]),%%mm4\n\t" \ - /*x x x x d5 c5 b5 a5*/ \ - "movd (%[d],%[ystride]),%%mm5\n\t" \ - /*x x x x d6 c6 b6 a6*/ \ - "movd (%[d],%[ystride],2),%%mm6\n\t" \ - /*x x x x d7 c7 b7 a7*/ \ - "movd (%[d],%[ystride3]),%%mm7\n\t" \ - /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \ - "punpcklbw %%mm1,%%mm0\n\t" \ - /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \ - "punpcklbw %%mm3,%%mm2\n\t" \ - /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \ - "movq %%mm0,%%mm3\n\t" \ - /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \ - "punpcklwd %%mm2,%%mm0\n\t" \ - /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \ - "punpckhwd %%mm2,%%mm3\n\t" \ - /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \ - "movq %%mm0,%%mm1\n\t" \ - /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \ - "punpcklbw %%mm5,%%mm4\n\t" \ - /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \ - "punpcklbw %%mm7,%%mm6\n\t" \ - /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \ - "movq %%mm4,%%mm5\n\t" \ - /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \ - "punpcklwd %%mm6,%%mm4\n\t" \ - /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \ - "punpckhwd %%mm6,%%mm5\n\t" \ - /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \ - "movq %%mm3,%%mm2\n\t" \ - /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \ - "punpckldq %%mm4,%%mm0\n\t" \ - /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \ - "punpckhdq %%mm4,%%mm1\n\t" \ - /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \ - "punpckldq %%mm5,%%mm2\n\t" \ - /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \ - "punpckhdq %%mm5,%%mm3\n\t" \ - _filter \ - /*mm2={b0+R_0'',...,b7+R_7''}*/ \ - "movq %%mm1,%%mm0\n\t" \ - /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \ - "punpcklbw %%mm2,%%mm1\n\t" \ - 
/*mm2={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \ - "punpckhbw %%mm2,%%mm0\n\t" \ - /*[d]=c1 b1 c0 b0*/ \ - "movd %%mm1,%[d]\n\t" \ - "movw %w[d],1(%[pix])\n\t" \ - "psrlq $32,%%mm1\n\t" \ - "shr $16,%[d]\n\t" \ - "movw %w[d],1(%[pix],%[ystride])\n\t" \ - /*[d]=c3 b3 c2 b2*/ \ - "movd %%mm1,%[d]\n\t" \ - "movw %w[d],1(%[pix],%[ystride],2)\n\t" \ - "shr $16,%[d]\n\t" \ - "movw %w[d],1(%[pix],%[ystride3])\n\t" \ - "lea (%[pix],%[ystride],4),%[pix]\n\t" \ - /*[d]=c5 b5 c4 b4*/ \ - "movd %%mm0,%[d]\n\t" \ - "movw %w[d],1(%[pix])\n\t" \ - "psrlq $32,%%mm0\n\t" \ - "shr $16,%[d]\n\t" \ - "movw %w[d],1(%[pix],%[ystride])\n\t" \ - /*[d]=c7 b7 c6 b6*/ \ - "movd %%mm0,%[d]\n\t" \ - "movw %w[d],1(%[pix],%[ystride],2)\n\t" \ - "shr $16,%[d]\n\t" \ - "movw %w[d],1(%[pix],%[ystride3])\n\t" \ - :[pix]"+r"(pix__),[ystride3]"=&r"(ystride3__),[d]"=&r"(d__) \ - :[ystride]"r"((ptrdiff_t)(_ystride)),[ll]"r"(_ll) \ - :"memory" \ - ); \ - } \ - while(0) - -# endif -#endif diff --git a/media/libtheora/lib/x86/mmxstate.c b/media/libtheora/lib/x86/mmxstate.c deleted file mode 100644 index 0b9586f94..000000000 --- a/media/libtheora/lib/x86/mmxstate.c +++ /dev/null @@ -1,226 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: mmxstate.c 17563 2010-10-25 17:40:54Z tterribe $ - - ********************************************************************/ - -/*MMX acceleration of complete fragment reconstruction algorithm. 
- Originally written by Rudolf Marek.*/ -#include <string.h> -#include "x86int.h" -#include "mmxloop.h" - -#if defined(OC_X86_ASM) - -void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){ - unsigned char *dst; - ptrdiff_t frag_buf_off; - int ystride; - int refi; - /*Apply the inverse transform.*/ - /*Special case only having a DC component.*/ - if(_last_zzi<2){ - /*Note that this value must be unsigned, to keep the __asm__ block from - sign-extending it when it puts it in a register.*/ - ogg_uint16_t p; - int i; - /*We round this dequant product (and not any of the others) because there's - no iDCT rounding.*/ - p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); - /*Fill _dct_coeffs with p.*/ - __asm__ __volatile__( - /*mm0=0000 0000 0000 AAAA*/ - "movd %[p],%%mm0\n\t" - /*mm0=0000 0000 AAAA AAAA*/ - "punpcklwd %%mm0,%%mm0\n\t" - /*mm0=AAAA AAAA AAAA AAAA*/ - "punpckldq %%mm0,%%mm0\n\t" - : - :[p]"r"((unsigned)p) - ); - for(i=0;i<4;i++){ - __asm__ __volatile__( - "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t" - "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t" - "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t" - "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t" - :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16) - ); - } - } - else{ - /*Dequantize the DC coefficient.*/ - _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); - oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi); - } - /*Fill in the target buffer.*/ - frag_buf_off=_state->frag_buf_offs[_fragi]; - refi=_state->frags[_fragi].refi; - ystride=_state->ref_ystride[_pli]; - dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off; - if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64); - else{ - const unsigned char *ref; - int mvoffsets[2]; - ref=_state->ref_frame_data[refi]+frag_buf_off; - if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, - _state->frag_mvs[_fragi])>1){ - 
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride, - _dct_coeffs+64); - } - else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64); - } -} - -/*We copy these entire function to inline the actual MMX routines so that we - use only a single indirect call.*/ - -void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){ - memset(_bv,_flimit,8); -} - -/*Apply the loop filter to a given set of fragment rows in the given plane. - The filter may be run on the bottom edge, affecting pixels in the next row of - fragments, so this row also needs to be available. - _bv: The bounding values array. - _refi: The index of the frame buffer to filter. - _pli: The color plane to filter. - _fragy0: The Y coordinate of the first fragment row to filter. - _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ -void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, - signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ - OC_ALIGN8(unsigned char ll[8]); - const oc_fragment_plane *fplane; - const oc_fragment *frags; - const ptrdiff_t *frag_buf_offs; - unsigned char *ref_frame_data; - ptrdiff_t fragi_top; - ptrdiff_t fragi_bot; - ptrdiff_t fragi0; - ptrdiff_t fragi0_end; - int ystride; - int nhfrags; - memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll)); - fplane=_state->fplanes+_pli; - nhfrags=fplane->nhfrags; - fragi_top=fplane->froffset; - fragi_bot=fragi_top+fplane->nfrags; - fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; - fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags; - ystride=_state->ref_ystride[_pli]; - frags=_state->frags; - frag_buf_offs=_state->frag_buf_offs; - ref_frame_data=_state->ref_frame_data[_refi]; - /*The following loops are constructed somewhat non-intuitively on purpose. - The main idea is: if a block boundary has at least one coded fragment on - it, the filter is applied to it. 
- However, the order that the filters are applied in matters, and VP3 chose - the somewhat strange ordering used below.*/ - while(fragi0<fragi0_end){ - ptrdiff_t fragi; - ptrdiff_t fragi_end; - fragi=fragi0; - fragi_end=fragi+nhfrags; - while(fragi<fragi_end){ - if(frags[fragi].coded){ - unsigned char *ref; - ref=ref_frame_data+frag_buf_offs[fragi]; - if(fragi>fragi0){ - OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll); - } - if(fragi0>fragi_top){ - OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll); - } - if(fragi+1<fragi_end&&!frags[fragi+1].coded){ - OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll); - } - if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ - OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll); - } - } - fragi++; - } - fragi0+=nhfrags; - } -} - -void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){ - memset(_bv,~(_flimit<<1),8); -} - -/*Apply the loop filter to a given set of fragment rows in the given plane. - The filter may be run on the bottom edge, affecting pixels in the next row of - fragments, so this row also needs to be available. - _bv: The bounding values array. - _refi: The index of the frame buffer to filter. - _pli: The color plane to filter. - _fragy0: The Y coordinate of the first fragment row to filter. 
- _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ -void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state, - signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ - const oc_fragment_plane *fplane; - const oc_fragment *frags; - const ptrdiff_t *frag_buf_offs; - unsigned char *ref_frame_data; - ptrdiff_t fragi_top; - ptrdiff_t fragi_bot; - ptrdiff_t fragi0; - ptrdiff_t fragi0_end; - int ystride; - int nhfrags; - fplane=_state->fplanes+_pli; - nhfrags=fplane->nhfrags; - fragi_top=fplane->froffset; - fragi_bot=fragi_top+fplane->nfrags; - fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; - fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags; - ystride=_state->ref_ystride[_pli]; - frags=_state->frags; - frag_buf_offs=_state->frag_buf_offs; - ref_frame_data=_state->ref_frame_data[_refi]; - /*The following loops are constructed somewhat non-intuitively on purpose. - The main idea is: if a block boundary has at least one coded fragment on - it, the filter is applied to it. 
- However, the order that the filters are applied in matters, and VP3 chose - the somewhat strange ordering used below.*/ - while(fragi0<fragi0_end){ - ptrdiff_t fragi; - ptrdiff_t fragi_end; - fragi=fragi0; - fragi_end=fragi+nhfrags; - while(fragi<fragi_end){ - if(frags[fragi].coded){ - unsigned char *ref; - ref=ref_frame_data+frag_buf_offs[fragi]; - if(fragi>fragi0){ - OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv); - } - if(fragi0>fragi_top){ - OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv); - } - if(fragi+1<fragi_end&&!frags[fragi+1].coded){ - OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv); - } - if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ - OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv); - } - } - fragi++; - } - fragi0+=nhfrags; - } -} - -#endif diff --git a/media/libtheora/lib/x86/sse2idct.c b/media/libtheora/lib/x86/sse2idct.c deleted file mode 100644 index 5f8523fa5..000000000 --- a/media/libtheora/lib/x86/sse2idct.c +++ /dev/null @@ -1,460 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $ - - ********************************************************************/ - -/*SSE2 acceleration of Theora's iDCT.*/ -#include "x86int.h" -#include "sse2trans.h" -#include "../dct.h" - -#if defined(OC_X86_ASM) - -/*A table of constants used by the MMX routines.*/ -const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={ - 8, 8, 8, 8, 8, 8, 8, 8, - OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7, - OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6, - OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5, - OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4, - OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3, - OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2, - OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1 -}; - - -/*Performs the first three stages of the iDCT. - xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input - (accessed in that order). - The remaining rows must be in _x at their corresponding locations. - On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 - contain rows 4 through 7.*/ -#define OC_IDCT_8x8_ABC(_x) \ - "#OC_IDCT_8x8_ABC\n\t" \ - /*Stage 1:*/ \ - /*2-3 rotation by 6pi/16. 
\ - xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \ - "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \ - "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \ - "movdqa %%xmm1,%%xmm0\n\t" \ - "pmulhw %%xmm2,%%xmm1\n\t" \ - "movdqa %%xmm4,%%xmm7\n\t" \ - "pmulhw %%xmm6,%%xmm0\n\t" \ - "pmulhw %%xmm2,%%xmm7\n\t" \ - "pmulhw %%xmm6,%%xmm4\n\t" \ - "paddw %%xmm6,%%xmm0\n\t" \ - "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \ - "paddw %%xmm1,%%xmm2\n\t" \ - "psubw %%xmm0,%%xmm7\n\t" \ - "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ - "paddw %%xmm4,%%xmm2\n\t" \ - "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \ - "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ - /*5-6 rotation by 3pi/16. \ - xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \ - "movdqa %%xmm4,%%xmm2\n\t" \ - "movdqa %%xmm6,%%xmm1\n\t" \ - "pmulhw %%xmm3,%%xmm4\n\t" \ - "pmulhw %%xmm5,%%xmm1\n\t" \ - "pmulhw %%xmm3,%%xmm6\n\t" \ - "pmulhw %%xmm5,%%xmm2\n\t" \ - "paddw %%xmm3,%%xmm4\n\t" \ - "paddw %%xmm5,%%xmm3\n\t" \ - "paddw %%xmm6,%%xmm3\n\t" \ - "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \ - "paddw %%xmm5,%%xmm1\n\t" \ - "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \ - "paddw %%xmm3,%%xmm2\n\t" \ - "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \ - "psubw %%xmm4,%%xmm1\n\t" \ - "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \ - /*4-7 rotation by 7pi/16. \ - xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \ - "movdqa %%xmm3,%%xmm0\n\t" \ - "movdqa %%xmm4,%%xmm7\n\t" \ - "pmulhw %%xmm5,%%xmm3\n\t" \ - "pmulhw %%xmm5,%%xmm7\n\t" \ - "pmulhw %%xmm6,%%xmm4\n\t" \ - "pmulhw %%xmm6,%%xmm0\n\t" \ - "paddw %%xmm6,%%xmm4\n\t" \ - "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \ - "paddw %%xmm5,%%xmm7\n\t" \ - "psubw %%xmm4,%%xmm3\n\t" \ - "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \ - "paddw %%xmm7,%%xmm0\n\t" \ - "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \ - /*0-1 butterfly. 
\ - xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \ - "paddw %%xmm7,%%xmm6\n\t" \ - "movdqa %%xmm4,%%xmm5\n\t" \ - "pmulhw %%xmm6,%%xmm4\n\t" \ - "paddw %%xmm7,%%xmm7\n\t" \ - "psubw %%xmm6,%%xmm7\n\t" \ - "paddw %%xmm6,%%xmm4\n\t" \ - /*Stage 2:*/ \ - /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \ - 7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \ - "movdqa %%xmm3,%%xmm6\n\t" \ - "paddw %%xmm1,%%xmm3\n\t" \ - "psubw %%xmm1,%%xmm6\n\t" \ - "movdqa %%xmm5,%%xmm1\n\t" \ - "pmulhw %%xmm7,%%xmm5\n\t" \ - "paddw %%xmm7,%%xmm5\n\t" \ - "movdqa %%xmm0,%%xmm7\n\t" \ - "paddw %%xmm2,%%xmm0\n\t" \ - "psubw %%xmm2,%%xmm7\n\t" \ - "movdqa %%xmm1,%%xmm2\n\t" \ - "pmulhw %%xmm6,%%xmm1\n\t" \ - "pmulhw %%xmm7,%%xmm2\n\t" \ - "paddw %%xmm6,%%xmm1\n\t" \ - "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \ - "paddw %%xmm7,%%xmm2\n\t" \ - "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \ - /*Stage 3: \ - 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \ - 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \ - 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \ - "paddw %%xmm2,%%xmm1\n\t" \ - "paddw %%xmm5,%%xmm6\n\t" \ - "paddw %%xmm4,%%xmm7\n\t" \ - "paddw %%xmm2,%%xmm2\n\t" \ - "paddw %%xmm4,%%xmm4\n\t" \ - "paddw %%xmm5,%%xmm5\n\t" \ - "psubw %%xmm1,%%xmm2\n\t" \ - "psubw %%xmm7,%%xmm4\n\t" \ - "psubw %%xmm6,%%xmm5\n\t" \ - -/*Performs the last stage of the iDCT. - On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 - contain rows 4 through 7. 
- On output, xmm0 through xmm7 contain the corresponding rows.*/ -#define OC_IDCT_8x8_D \ - "#OC_IDCT_8x8_D\n\t" \ - /*Stage 4: \ - 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \ - 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \ - 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \ - 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \ - "psubw %%xmm0,%%xmm7\n\t" \ - "psubw %%xmm1,%%xmm6\n\t" \ - "psubw %%xmm2,%%xmm5\n\t" \ - "psubw %%xmm3,%%xmm4\n\t" \ - "paddw %%xmm0,%%xmm0\n\t" \ - "paddw %%xmm1,%%xmm1\n\t" \ - "paddw %%xmm2,%%xmm2\n\t" \ - "paddw %%xmm3,%%xmm3\n\t" \ - "paddw %%xmm7,%%xmm0\n\t" \ - "paddw %%xmm6,%%xmm1\n\t" \ - "paddw %%xmm5,%%xmm2\n\t" \ - "paddw %%xmm4,%%xmm3\n\t" \ - -/*Performs the last stage of the iDCT. - On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 - contain rows 4 through 7. - On output, xmm0 through xmm7 contain the corresponding rows.*/ -#define OC_IDCT_8x8_D_STORE \ - "#OC_IDCT_8x8_D_STORE\n\t" \ - /*Stage 4: \ - 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \ - 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \ - 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \ - 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \ - "psubw %%xmm3,%%xmm4\n\t" \ - "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \ - "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \ - "psubw %%xmm0,%%xmm7\n\t" \ - "psubw %%xmm1,%%xmm6\n\t" \ - "psubw %%xmm2,%%xmm5\n\t" \ - "paddw %%xmm4,%%xmm7\n\t" \ - "paddw %%xmm4,%%xmm6\n\t" \ - "paddw %%xmm4,%%xmm5\n\t" \ - "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \ - "paddw %%xmm0,%%xmm0\n\t" \ - "paddw %%xmm1,%%xmm1\n\t" \ - "paddw %%xmm2,%%xmm2\n\t" \ - "paddw %%xmm3,%%xmm3\n\t" \ - "paddw %%xmm7,%%xmm0\n\t" \ - "paddw %%xmm6,%%xmm1\n\t" \ - "psraw $4,%%xmm0\n\t" \ - "paddw %%xmm5,%%xmm2\n\t" \ - "movdqa 
%%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \ - "psraw $4,%%xmm1\n\t" \ - "paddw %%xmm4,%%xmm3\n\t" \ - "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \ - "psraw $4,%%xmm2\n\t" \ - "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \ - "psraw $4,%%xmm3\n\t" \ - "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \ - "psraw $4,%%xmm4\n\t" \ - "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \ - "psraw $4,%%xmm5\n\t" \ - "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \ - "psraw $4,%%xmm6\n\t" \ - "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \ - "psraw $4,%%xmm7\n\t" \ - "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \ - -static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ - OC_ALIGN16(ogg_int16_t buf[16]); - /*This routine accepts an 8x8 matrix pre-transposed.*/ - __asm__ __volatile__( - /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/ - "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t" - "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t" - "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t" - "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t" - OC_IDCT_8x8_ABC(x) - OC_IDCT_8x8_D - OC_TRANSPOSE_8x8 - /*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/ - "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" - "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" - "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" - "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" - OC_IDCT_8x8_ABC(y) - OC_IDCT_8x8_D_STORE - :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)), - [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) - :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)), - [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) - ); - if(_x!=_y){ - int i; - __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::); - /*Clear input data for next block (decoder only).*/ - for(i=0;i<2;i++){ - __asm__ __volatile__( - "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t" - "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t" - "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t" - "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t" - :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32)) - ); - } - } -} - -/*For the 
first step of the 10-coefficient version of the 8x8 iDCT, we only - need to work with four columns at a time. - Doing this in MMX is faster on processors with a 64-bit data path.*/ -#define OC_IDCT_8x8_10_MMX \ - "#OC_IDCT_8x8_10_MMX\n\t" \ - /*Stage 1:*/ \ - /*2-3 rotation by 6pi/16. \ - mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \ - "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \ - "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \ - "pmulhw %%mm2,%%mm6\n\t" \ - "pmulhw %%mm2,%%mm7\n\t" \ - "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \ - "paddw %%mm6,%%mm2\n\t" \ - "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ - "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \ - "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ - /*5-6 rotation by 3pi/16. \ - mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \ - "pmulhw %%mm3,%%mm5\n\t" \ - "pmulhw %%mm3,%%mm2\n\t" \ - "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \ - "paddw %%mm3,%%mm5\n\t" \ - "paddw %%mm3,%%mm2\n\t" \ - "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \ - /*4-7 rotation by 7pi/16. \ - mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \ - "pmulhw %%mm1,%%mm3\n\t" \ - "pmulhw %%mm1,%%mm7\n\t" \ - "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ - "movq %%mm3,%%mm6\n\t" \ - "paddw %%mm1,%%mm7\n\t" \ - /*0-1 butterfly. 
\ - mm4=C4, mm0=X0, X4=0.*/ \ - /*Stage 2:*/ \ - /*4-5 butterfly: mm3=t[4], mm5=t[5] \ - 7-6 butterfly: mm2=t[6], mm7=t[7]*/ \ - "psubw %%mm5,%%mm3\n\t" \ - "paddw %%mm5,%%mm6\n\t" \ - "movq %%mm4,%%mm1\n\t" \ - "pmulhw %%mm0,%%mm4\n\t" \ - "paddw %%mm0,%%mm4\n\t" \ - "movq %%mm7,%%mm0\n\t" \ - "movq %%mm4,%%mm5\n\t" \ - "paddw %%mm2,%%mm0\n\t" \ - "psubw %%mm2,%%mm7\n\t" \ - "movq %%mm1,%%mm2\n\t" \ - "pmulhw %%mm6,%%mm1\n\t" \ - "pmulhw %%mm7,%%mm2\n\t" \ - "paddw %%mm6,%%mm1\n\t" \ - "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \ - "paddw %%mm7,%%mm2\n\t" \ - "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \ - /*Stage 3: \ - 6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \ - 0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \ - 1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \ - "paddw %%mm2,%%mm1\n\t" \ - "paddw %%mm5,%%mm6\n\t" \ - "paddw %%mm4,%%mm7\n\t" \ - "paddw %%mm2,%%mm2\n\t" \ - "paddw %%mm4,%%mm4\n\t" \ - "paddw %%mm5,%%mm5\n\t" \ - "psubw %%mm1,%%mm2\n\t" \ - "psubw %%mm7,%%mm4\n\t" \ - "psubw %%mm6,%%mm5\n\t" \ - /*Stage 4: \ - 0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \ - 1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \ - 2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \ - 3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \ - "psubw %%mm0,%%mm7\n\t" \ - "psubw %%mm1,%%mm6\n\t" \ - "psubw %%mm2,%%mm5\n\t" \ - "psubw %%mm3,%%mm4\n\t" \ - "paddw %%mm0,%%mm0\n\t" \ - "paddw %%mm1,%%mm1\n\t" \ - "paddw %%mm2,%%mm2\n\t" \ - "paddw %%mm3,%%mm3\n\t" \ - "paddw %%mm7,%%mm0\n\t" \ - "paddw %%mm6,%%mm1\n\t" \ - "paddw %%mm5,%%mm2\n\t" \ - "paddw %%mm4,%%mm3\n\t" \ - -#define OC_IDCT_8x8_10_ABC \ - "#OC_IDCT_8x8_10_ABC\n\t" \ - /*Stage 1:*/ \ - /*2-3 rotation by 6pi/16. 
\ - xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \ - "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \ - "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \ - "pmulhw %%xmm2,%%xmm6\n\t" \ - "pmulhw %%xmm2,%%xmm7\n\t" \ - "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \ - "paddw %%xmm6,%%xmm2\n\t" \ - "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ - "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \ - "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ - /*5-6 rotation by 3pi/16. \ - xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \ - "pmulhw %%xmm3,%%xmm5\n\t" \ - "pmulhw %%xmm3,%%xmm2\n\t" \ - "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \ - "paddw %%xmm3,%%xmm5\n\t" \ - "paddw %%xmm3,%%xmm2\n\t" \ - "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \ - /*4-7 rotation by 7pi/16. \ - xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \ - "pmulhw %%xmm1,%%xmm3\n\t" \ - "pmulhw %%xmm1,%%xmm7\n\t" \ - "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \ - "movdqa %%xmm3,%%xmm6\n\t" \ - "paddw %%xmm1,%%xmm7\n\t" \ - /*0-1 butterfly. \ - xmm4=C4, xmm0=X0, X4=0.*/ \ - /*Stage 2:*/ \ - /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \ - 7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \ - "psubw %%xmm5,%%xmm3\n\t" \ - "paddw %%xmm5,%%xmm6\n\t" \ - "movdqa %%xmm4,%%xmm1\n\t" \ - "pmulhw %%xmm0,%%xmm4\n\t" \ - "paddw %%xmm0,%%xmm4\n\t" \ - "movdqa %%xmm7,%%xmm0\n\t" \ - "movdqa %%xmm4,%%xmm5\n\t" \ - "paddw %%xmm2,%%xmm0\n\t" \ - "psubw %%xmm2,%%xmm7\n\t" \ - "movdqa %%xmm1,%%xmm2\n\t" \ - "pmulhw %%xmm6,%%xmm1\n\t" \ - "pmulhw %%xmm7,%%xmm2\n\t" \ - "paddw %%xmm6,%%xmm1\n\t" \ - "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \ - "paddw %%xmm7,%%xmm2\n\t" \ - "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \ - /*Stage 3: \ - 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \ - 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \ - 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \ - "paddw %%xmm2,%%xmm1\n\t" \ - "paddw %%xmm5,%%xmm6\n\t" \ - "paddw %%xmm4,%%xmm7\n\t" \ - "paddw %%xmm2,%%xmm2\n\t" \ - "paddw %%xmm4,%%xmm4\n\t" \ - 
"paddw %%xmm5,%%xmm5\n\t" \ - "psubw %%xmm1,%%xmm2\n\t" \ - "psubw %%xmm7,%%xmm4\n\t" \ - "psubw %%xmm6,%%xmm5\n\t" \ - -static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ - OC_ALIGN16(ogg_int16_t buf[16]); - /*This routine accepts an 8x8 matrix pre-transposed.*/ - __asm__ __volatile__( - "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t" - "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t" - "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t" - "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t" - OC_IDCT_8x8_10_MMX - OC_TRANSPOSE_8x4_MMX2SSE - OC_IDCT_8x8_10_ABC - OC_IDCT_8x8_D_STORE - :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16)), - [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) - :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), - [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) - ); - if(_x!=_y){ - /*Clear input data for next block (decoder only).*/ - __asm__ __volatile__( - "pxor %%mm0,%%mm0\n\t" - "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" - "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" - "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" - "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" - :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28)) - ); - } -} - -/*Performs an inverse 8x8 Type-II DCT transform. - The input is assumed to be scaled by a factor of 4 relative to orthonormal - version of the transform.*/ -void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ - /*_last_zzi is subtly different from an actual count of the number of - coefficients we decoded for this block. - It contains the value of zzi BEFORE the final token in the block was - decoded. - In most cases this is an EOB token (the continuation of an EOB run from a - previous block counts), and so this is the same as the coefficient count. - However, in the case that the last token was NOT an EOB token, but filled - the block up with exactly 64 coefficients, _last_zzi will be less than 64. 
- Provided the last token was not a pure zero run, the minimum value it can - be is 46, and so that doesn't affect any of the cases in this routine. - However, if the last token WAS a pure zero run of length 63, then _last_zzi - will be 1 while the number of coefficients decoded is 64. - Thus, we will trigger the following special case, where the real - coefficient count would not. - Note also that a zero run of length 64 will give _last_zzi a value of 0, - but we still process the DC coefficient, which might have a non-zero value - due to DC prediction. - Although convoluted, this is arguably the correct behavior: it allows us to - use a smaller transform when the block ends with a long zero run instead - of a normal EOB token. - It could be smarter... multiple separate zero runs at the end of a block - will fool it, but an encoder that generates these really deserves what it - gets. - Needless to say we inherited this approach from VP3.*/ - /*Then perform the iDCT.*/ - if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x); - else oc_idct8x8_slow_sse2(_y,_x); -} - -#endif diff --git a/media/libtheora/lib/x86/sse2trans.h b/media/libtheora/lib/x86/sse2trans.h deleted file mode 100644 index e76da5140..000000000 --- a/media/libtheora/lib/x86/sse2trans.h +++ /dev/null @@ -1,242 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $ - - ********************************************************************/ - -#if !defined(_x86_sse2trans_H) -# define _x86_sse2trans_H (1) -# include "x86int.h" - -# if defined(OC_X86_64_ASM) -/*On x86-64 we can transpose in-place without spilling registers. - By clever choices of the order to apply the butterflies and the order of - their outputs, we can take the rows in order and output the columns in order - without any extra operations and using just one temporary register.*/ -# define OC_TRANSPOSE_8x8 \ - "#OC_TRANSPOSE_8x8\n\t" \ - "movdqa %%xmm4,%%xmm8\n\t" \ - /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ - "punpcklwd %%xmm5,%%xmm4\n\t" \ - /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \ - "punpckhwd %%xmm5,%%xmm8\n\t" \ - /*xmm5 is free.*/ \ - "movdqa %%xmm0,%%xmm5\n\t" \ - /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ - "punpcklwd %%xmm1,%%xmm0\n\t" \ - /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \ - "punpckhwd %%xmm1,%%xmm5\n\t" \ - /*xmm1 is free.*/ \ - "movdqa %%xmm6,%%xmm1\n\t" \ - /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ - "punpcklwd %%xmm7,%%xmm6\n\t" \ - /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \ - "punpckhwd %%xmm7,%%xmm1\n\t" \ - /*xmm7 is free.*/ \ - "movdqa %%xmm2,%%xmm7\n\t" \ - /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ - "punpckhwd %%xmm3,%%xmm2\n\t" \ - /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ - "punpcklwd %%xmm3,%%xmm7\n\t" \ - /*xmm3 is free.*/ \ - "movdqa %%xmm0,%%xmm3\n\t" \ - /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ - "punpckldq %%xmm7,%%xmm0\n\t" \ - /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ - "punpckhdq %%xmm7,%%xmm3\n\t" \ - /*xmm7 is free.*/ \ - "movdqa %%xmm5,%%xmm7\n\t" \ - /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \ - "punpckldq %%xmm2,%%xmm5\n\t" \ - /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \ - "punpckhdq 
%%xmm2,%%xmm7\n\t" \ - /*xmm2 is free.*/ \ - "movdqa %%xmm4,%%xmm2\n\t" \ - /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ - "punpckhdq %%xmm6,%%xmm4\n\t" \ - /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ - "punpckldq %%xmm6,%%xmm2\n\t" \ - /*xmm6 is free.*/ \ - "movdqa %%xmm8,%%xmm6\n\t" \ - /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \ - "punpckldq %%xmm1,%%xmm6\n\t" \ - /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ - "punpckhdq %%xmm1,%%xmm8\n\t" \ - /*xmm1 is free.*/ \ - "movdqa %%xmm0,%%xmm1\n\t" \ - /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ - "punpcklqdq %%xmm2,%%xmm0\n\t" \ - /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ - "punpckhqdq %%xmm2,%%xmm1\n\t" \ - /*xmm2 is free.*/ \ - "movdqa %%xmm3,%%xmm2\n\t" \ - /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ - "punpckhqdq %%xmm4,%%xmm3\n\t" \ - /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ - "punpcklqdq %%xmm4,%%xmm2\n\t" \ - /*xmm4 is free.*/ \ - "movdqa %%xmm5,%%xmm4\n\t" \ - /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \ - "punpckhqdq %%xmm6,%%xmm5\n\t" \ - /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \ - "punpcklqdq %%xmm6,%%xmm4\n\t" \ - /*xmm6 is free.*/ \ - "movdqa %%xmm7,%%xmm6\n\t" \ - /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \ - "punpckhqdq %%xmm8,%%xmm7\n\t" \ - /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \ - "punpcklqdq %%xmm8,%%xmm6\n\t" \ - /*xmm8 is free.*/ \ - -# else -/*Otherwise, we need to spill some values to %[buf] temporarily. 
- Again, the butterflies are carefully arranged to get the columns to come out - in order, minimizing register spills and maximizing the delay between a load - and when the value loaded is actually used.*/ -# define OC_TRANSPOSE_8x8 \ - "#OC_TRANSPOSE_8x8\n\t" \ - /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \ - "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \ - /*xmm0 is free.*/ \ - "movdqa %%xmm2,%%xmm0\n\t" \ - /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ - "punpckhwd %%xmm3,%%xmm2\n\t" \ - /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ - "punpcklwd %%xmm3,%%xmm0\n\t" \ - /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \ - "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \ - /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \ - "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ - /*xmm2 is free.*/ \ - "movdqa %%xmm6,%%xmm2\n\t" \ - /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ - "punpcklwd %%xmm7,%%xmm6\n\t" \ - /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \ - "punpckhwd %%xmm7,%%xmm2\n\t" \ - /*xmm7 is free.*/ \ - "movdqa %%xmm4,%%xmm7\n\t" \ - /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ - "punpcklwd %%xmm5,%%xmm4\n\t" \ - /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \ - "punpckhwd %%xmm5,%%xmm7\n\t" \ - /*xmm5 is free.*/ \ - "movdqa %%xmm3,%%xmm5\n\t" \ - /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ - "punpcklwd %%xmm1,%%xmm3\n\t" \ - /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \ - "punpckhwd %%xmm1,%%xmm5\n\t" \ - /*xmm1 is free.*/ \ - "movdqa %%xmm7,%%xmm1\n\t" \ - /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \ - "punpckldq %%xmm2,%%xmm7\n\t" \ - /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ - "punpckhdq %%xmm2,%%xmm1\n\t" \ - /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ - "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \ - /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \ - "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \ - /*xmm1 is free.*/ \ - "movdqa %%xmm3,%%xmm1\n\t" \ - /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ - "punpckhdq %%xmm0,%%xmm3\n\t" \ - /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ - "punpckldq %%xmm0,%%xmm1\n\t" \ - /*xmm0 is free.*/ \ - "movdqa %%xmm4,%%xmm0\n\t" \ - /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ - 
"punpckhdq %%xmm6,%%xmm4\n\t" \ - /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ - "punpckldq %%xmm6,%%xmm0\n\t" \ - /*xmm6 is free.*/ \ - "movdqa %%xmm5,%%xmm6\n\t" \ - /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \ - "punpckldq %%xmm2,%%xmm5\n\t" \ - /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \ - "punpckhdq %%xmm2,%%xmm6\n\t" \ - /*xmm2 is free.*/ \ - "movdqa %%xmm1,%%xmm2\n\t" \ - /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ - "punpckhqdq %%xmm0,%%xmm1\n\t" \ - /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ - "punpcklqdq %%xmm0,%%xmm2\n\t" \ - /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ - "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \ - /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \ - "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ - /*xmm2 is free.*/ \ - "movdqa %%xmm3,%%xmm2\n\t" \ - /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ - "punpckhqdq %%xmm4,%%xmm3\n\t" \ - /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ - "punpcklqdq %%xmm4,%%xmm2\n\t" \ - /*xmm4 is free.*/ \ - "movdqa %%xmm5,%%xmm4\n\t" \ - /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \ - "punpckhqdq %%xmm7,%%xmm5\n\t" \ - /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \ - "punpcklqdq %%xmm7,%%xmm4\n\t" \ - /*xmm7 is free.*/ \ - "movdqa %%xmm6,%%xmm7\n\t" \ - /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \ - "punpcklqdq %%xmm0,%%xmm6\n\t" \ - /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \ - "punpckhqdq %%xmm0,%%xmm7\n\t" \ - /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ - "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \ - -# endif - -/*Transpose 4 values in each of 8 MMX registers into 8 values in the first - four SSE registers. 
- No need to be clever here; we have plenty of room.*/ -# define OC_TRANSPOSE_8x4_MMX2SSE \ - "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \ - "movq2dq %%mm0,%%xmm0\n\t" \ - "movq2dq %%mm1,%%xmm1\n\t" \ - /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ - "punpcklwd %%xmm1,%%xmm0\n\t" \ - "movq2dq %%mm2,%%xmm3\n\t" \ - "movq2dq %%mm3,%%xmm2\n\t" \ - /*xmm3 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ - "punpcklwd %%xmm2,%%xmm3\n\t" \ - "movq2dq %%mm4,%%xmm4\n\t" \ - "movq2dq %%mm5,%%xmm5\n\t" \ - /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ - "punpcklwd %%xmm5,%%xmm4\n\t" \ - "movq2dq %%mm6,%%xmm7\n\t" \ - "movq2dq %%mm7,%%xmm6\n\t" \ - /*xmm7 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ - "punpcklwd %%xmm6,%%xmm7\n\t" \ - "movdqa %%xmm0,%%xmm2\n\t" \ - /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ - "punpckldq %%xmm3,%%xmm0\n\t" \ - /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ - "punpckhdq %%xmm3,%%xmm2\n\t" \ - "movdqa %%xmm4,%%xmm5\n\t" \ - /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ - "punpckldq %%xmm7,%%xmm4\n\t" \ - /*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ - "punpckhdq %%xmm7,%%xmm5\n\t" \ - "movdqa %%xmm0,%%xmm1\n\t" \ - /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ - "punpcklqdq %%xmm4,%%xmm0\n\t" \ - /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ - "punpckhqdq %%xmm4,%%xmm1\n\t" \ - "movdqa %%xmm2,%%xmm3\n\t" \ - /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ - "punpcklqdq %%xmm5,%%xmm2\n\t" \ - /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ - "punpckhqdq %%xmm5,%%xmm3\n\t" \ - -#endif diff --git a/media/libtheora/lib/x86/x86cpu.c b/media/libtheora/lib/x86/x86cpu.c deleted file mode 100644 index c3a20b319..000000000 --- a/media/libtheora/lib/x86/x86cpu.c +++ /dev/null @@ -1,182 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - CPU capability detection for x86 processors. - Originally written by Rudolf Marek. - - function: - last mod: $Id: x86cpu.c 17410 2010-09-21 21:53:48Z tterribe $ - - ********************************************************************/ - -#include "x86cpu.h" - -#if !defined(OC_X86_ASM) -ogg_uint32_t oc_cpu_flags_get(void){ - return 0; -} -#else -# if defined(__amd64__)||defined(__x86_64__) -/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when - compiling with -fPIC.*/ -# define cpuid(_op,_eax,_ebx,_ecx,_edx) \ - __asm__ __volatile__( \ - "cpuid\n\t" \ - :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \ - :"a"(_op) \ - :"cc" \ - ) -# else -/*On x86-32, not so much.*/ -# define cpuid(_op,_eax,_ebx,_ecx,_edx) \ - __asm__ __volatile__( \ - "xchgl %%ebx,%[ebx]\n\t" \ - "cpuid\n\t" \ - "xchgl %%ebx,%[ebx]\n\t" \ - :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \ - :"a"(_op) \ - :"cc" \ - ) -# endif - -static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){ - ogg_uint32_t flags; - /*If there isn't even MMX, give up.*/ - if(!(_edx&0x00800000))return 0; - flags=OC_CPU_X86_MMX; - if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE; - if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2; - if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI; - if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3; - if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1; - if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2; - return flags; -} - -static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){ - ogg_uint32_t flags; - /*If there isn't even MMX, give up.*/ - if(!(_edx&0x00800000))return 0; - flags=OC_CPU_X86_MMX; - if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT; - if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW; - 
if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT; - if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A; - if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5; - return flags; -} - -ogg_uint32_t oc_cpu_flags_get(void){ - ogg_uint32_t flags; - ogg_uint32_t eax; - ogg_uint32_t ebx; - ogg_uint32_t ecx; - ogg_uint32_t edx; -# if !defined(__amd64__)&&!defined(__x86_64__) - /*Not all x86-32 chips support cpuid, so we have to check.*/ - __asm__ __volatile__( - "pushfl\n\t" - "pushfl\n\t" - "popl %[a]\n\t" - "movl %[a],%[b]\n\t" - "xorl $0x200000,%[a]\n\t" - "pushl %[a]\n\t" - "popfl\n\t" - "pushfl\n\t" - "popl %[a]\n\t" - "popfl\n\t" - :[a]"=r"(eax),[b]"=r"(ebx) - : - :"cc" - ); - /*No cpuid.*/ - if(eax==ebx)return 0; -# endif - cpuid(0,eax,ebx,ecx,edx); - /* l e t n I e n i u n e G*/ - if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547|| - /* 6 8 x M T e n i u n e G*/ - ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){ - int family; - int model; - /*Intel, Transmeta (tested with Crusoe TM5800):*/ - cpuid(1,eax,ebx,ecx,edx); - flags=oc_parse_intel_flags(edx,ecx); - family=(eax>>8)&0xF; - model=(eax>>4)&0xF; - /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX - unit, so don't use it.*/ - if(family==6&&(model==9||model==13||model==14)){ - flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI); - } - } - /* D M A c i t n e h t u A*/ - else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541|| - /* C S N y b e d o e G*/ - ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){ - /*AMD, Geode:*/ - cpuid(0x80000000,eax,ebx,ecx,edx); - if(eax<0x80000001)flags=0; - else{ - cpuid(0x80000001,eax,ebx,ecx,edx); - flags=oc_parse_amd_flags(edx,ecx); - } - /*Also check for SSE.*/ - cpuid(1,eax,ebx,ecx,edx); - flags|=oc_parse_intel_flags(edx,ecx); - } - /*Technically some VIA chips can be configured in the BIOS to return any - string here the user wants. 
- There is a special detection method that can be used to identify such - processors, but in my opinion, if the user really wants to change it, they - deserve what they get.*/ - /* s l u a H r u a t n e C*/ - else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){ - /*VIA:*/ - /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming) - chips (thanks to the engineers from Centaur Technology who provided it). - These chips support Intel-like cpuid info. - The C3-2 (Nehemiah) cores appear to, as well.*/ - cpuid(1,eax,ebx,ecx,edx); - flags=oc_parse_intel_flags(edx,ecx); - if(eax>=0x80000001){ - /*The (non-Nehemiah) C3 processors support AMD-like cpuid info. - We need to check this even if the Intel test succeeds to pick up 3DNow! - support on these processors. - Unlike actual AMD processors, we cannot _rely_ on this info, since - some cores (e.g., the 693 stepping of the Nehemiah) claim to support - this function, yet return edx=0, despite the Intel test indicating - MMX support. - Therefore the features detected here are strictly added to those - detected by the Intel test.*/ - /*TODO: How about earlier chips?*/ - cpuid(0x80000001,eax,ebx,ecx,edx); - /*Note: As of the C7, this function returns Intel-style extended feature - flags, not AMD-style. - Currently, this only defines bits 11, 20, and 29 (0x20100800), which - do not conflict with any of the AMD flags we inspect. - For the remaining bits, Intel tells us, "Do not count on their value", - but VIA assures us that they will all be zero (at least on the C7 and - Isaiah chips). 
- In the (unlikely) event a future processor uses bits 18, 19, 30, or 31 - (0xC0C00000) for something else, we will have to add code to detect - the model to decide when it is appropriate to inspect them.*/ - flags|=oc_parse_amd_flags(edx,ecx); - } - } - else{ - /*Implement me.*/ - flags=0; - } - return flags; -} -#endif diff --git a/media/libtheora/lib/x86/x86cpu.h b/media/libtheora/lib/x86/x86cpu.h deleted file mode 100644 index 153a48d89..000000000 --- a/media/libtheora/lib/x86/x86cpu.h +++ /dev/null @@ -1,36 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - function: - last mod: $Id: x86cpu.h 17410 2010-09-21 21:53:48Z tterribe $ - - ********************************************************************/ - -#if !defined(_x86_x86cpu_H) -# define _x86_x86cpu_H (1) -#include "../internal.h" - -#define OC_CPU_X86_MMX (1<<0) -#define OC_CPU_X86_3DNOW (1<<1) -#define OC_CPU_X86_3DNOWEXT (1<<2) -#define OC_CPU_X86_MMXEXT (1<<3) -#define OC_CPU_X86_SSE (1<<4) -#define OC_CPU_X86_SSE2 (1<<5) -#define OC_CPU_X86_PNI (1<<6) -#define OC_CPU_X86_SSSE3 (1<<7) -#define OC_CPU_X86_SSE4_1 (1<<8) -#define OC_CPU_X86_SSE4_2 (1<<9) -#define OC_CPU_X86_SSE4A (1<<10) -#define OC_CPU_X86_SSE5 (1<<11) - -ogg_uint32_t oc_cpu_flags_get(void); - -#endif diff --git a/media/libtheora/lib/x86/x86int.h b/media/libtheora/lib/x86/x86int.h deleted file mode 100644 index 35bfb0a02..000000000 --- a/media/libtheora/lib/x86/x86int.h +++ /dev/null @@ -1,122 +0,0 @@ 
-/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: x86int.h 17578 2010-10-29 04:21:26Z tterribe $ - - ********************************************************************/ - -#if !defined(_x86_x86int_H) -# define _x86_x86int_H (1) -# include "../internal.h" - -# if defined(OC_X86_ASM) -# define oc_state_accel_init oc_state_accel_init_x86 -# if defined(OC_X86_64_ASM) -/*x86-64 guarantees SIMD support up through at least SSE2. - If the best routine we have available only needs SSE2 (which at the moment - covers all of them), then we can avoid runtime detection and the indirect - call.*/ -# define oc_frag_copy(_state,_dst,_src,_ystride) \ - oc_frag_copy_mmx(_dst,_src,_ystride) -# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \ - _fragis,_nfragis,_frag_buf_offs) \ - oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \ - _fragis,_nfragis,_frag_buf_offs) -# define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \ - oc_frag_recon_intra_mmx(_dst,_ystride,_residue) -# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \ - oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue) -# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \ - oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue) -# define oc_idct8x8(_state,_y,_x,_last_zzi) \ - oc_idct8x8_sse2(_y,_x,_last_zzi) -# define oc_state_frag_recon oc_state_frag_recon_mmx -# define oc_loop_filter_init(_state,_bv,_flimit) \ - 
oc_loop_filter_init_mmxext(_bv,_flimit) -# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext -# define oc_restore_fpu(_state) \ - oc_restore_fpu_mmx() -# else -# define OC_STATE_USE_VTABLE (1) -# endif -# endif - -# include "../state.h" -# include "x86cpu.h" - -/*Converts the expression in the argument to a string.*/ -#define OC_M2STR(_s) #_s - -/*Memory operands do not always include an offset. - To avoid warnings, we force an offset with %H (which adds 8).*/ -# if __GNUC_PREREQ(4,0) -# define OC_MEM_OFFS(_offs,_name) \ - OC_M2STR(_offs-8+%H[_name]) -# endif -/*If your gcc version does't support %H, then you get to suffer the warnings. - Note that Apple's gas breaks on things like _offs+(%esp): it throws away the - whole offset, instead of substituting in 0 for the missing operand to +.*/ -# if !defined(OC_MEM_OFFS) -# define OC_MEM_OFFS(_offs,_name) \ - OC_M2STR(_offs+%[_name]) -# endif - -/*Declare an array operand with an exact size. - This tells gcc we're going to clobber this memory region, without having to - clobber all of "memory" and lets us access local buffers directly using the - stack pointer, without allocating a separate register to point to them.*/ -#define OC_ARRAY_OPERAND(_type,_ptr,_size) \ - (*({ \ - struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \ - array_addr__; \ - })) - -/*Declare an array operand with an exact size. 
- This tells gcc we're going to clobber this memory region, without having to - clobber all of "memory" and lets us access local buffers directly using the - stack pointer, without allocating a separate register to point to them.*/ -#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \ - (*({ \ - const struct{_type array_value__[(_size)];} *array_addr__= \ - (const void *)(_ptr); \ - array_addr__; \ - })) - -extern const unsigned short __attribute__((aligned(16))) OC_IDCT_CONSTS[64]; - -void oc_state_accel_init_x86(oc_theora_state *_state); - -void oc_frag_copy_mmx(unsigned char *_dst, - const unsigned char *_src,int _ystride); -void oc_frag_copy_list_mmx(unsigned char *_dst_frame, - const unsigned char *_src_frame,int _ystride, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs); -void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, - const ogg_int16_t *_residue); -void oc_frag_recon_inter_mmx(unsigned char *_dst, - const unsigned char *_src,int _ystride,const ogg_int16_t *_residue); -void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1, - const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue); -void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); -void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); -void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); -void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit); -void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit); -void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, - signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); -void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state, - signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); -void oc_restore_fpu_mmx(void); - -#endif diff --git 
a/media/libtheora/lib/x86/x86state.c b/media/libtheora/lib/x86/x86state.c deleted file mode 100644 index a3d37267f..000000000 --- a/media/libtheora/lib/x86/x86state.c +++ /dev/null @@ -1,95 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: x86state.c 17421 2010-09-22 16:46:18Z giles $ - - ********************************************************************/ - -#include "x86int.h" - -#if defined(OC_X86_ASM) - -/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into - each quadrant of the destination.*/ -static const unsigned char OC_FZIG_ZAG_MMX[128]={ - 0, 8, 1, 2, 9,16,24,17, - 10, 3,32,11,18,25, 4,12, - 5,26,19,40,33,34,41,48, - 27, 6,13,20,28,21,14, 7, - 56,49,42,35,43,50,57,36, - 15,22,29,30,23,44,37,58, - 51,59,38,45,52,31,60,53, - 46,39,47,54,61,62,55,63, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64 -}; - -/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into - the destination.*/ -static const unsigned char OC_FZIG_ZAG_SSE2[128]={ - 0, 8, 1, 2, 9,16,24,17, - 10, 3, 4,11,18,25,32,40, - 33,26,19,12, 5, 6,13,20, - 27,34,41,48,56,49,42,35, - 28,21,14, 7,15,22,29,36, - 43,50,57,58,51,44,37,30, - 23,31,38,45,52,59,60,53, - 46,39,47,54,61,62,55,63, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 
64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64 -}; - -void oc_state_accel_init_x86(oc_theora_state *_state){ - oc_state_accel_init_c(_state); - _state->cpu_flags=oc_cpu_flags_get(); -# if defined(OC_STATE_USE_VTABLE) - if(_state->cpu_flags&OC_CPU_X86_MMX){ - _state->opt_vtable.frag_copy=oc_frag_copy_mmx; - _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx; - _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx; - _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx; - _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx; - _state->opt_vtable.idct8x8=oc_idct8x8_mmx; - _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx; - _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx; - _state->opt_vtable.state_loop_filter_frag_rows= - oc_state_loop_filter_frag_rows_mmx; - _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx; - _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX; - } - if(_state->cpu_flags&OC_CPU_X86_MMXEXT){ - _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext; - _state->opt_vtable.state_loop_filter_frag_rows= - oc_state_loop_filter_frag_rows_mmxext; - } - if(_state->cpu_flags&OC_CPU_X86_SSE2){ - _state->opt_vtable.idct8x8=oc_idct8x8_sse2; -# endif - _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2; -# if defined(OC_STATE_USE_VTABLE) - } -# endif -} -#endif diff --git a/media/libtheora/lib/x86_vc/mmxfrag.c b/media/libtheora/lib/x86_vc/mmxfrag.c deleted file mode 100644 index c16b026ff..000000000 --- a/media/libtheora/lib/x86_vc/mmxfrag.c +++ /dev/null @@ -1,416 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. 
PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: mmxfrag.c 17446 2010-09-23 20:06:20Z tterribe $ - - ********************************************************************/ - -/*MMX acceleration of fragment reconstruction for motion compensation. - Originally written by Rudolf Marek. - Additional optimization by Nils Pipenbrinck. - Note: Loops are unrolled for best performance. - The iteration each instruction belongs to is marked in the comments as #i.*/ -#include <stddef.h> -#include "x86int.h" - -#if defined(OC_X86_ASM) - -/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes - between rows.*/ -# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \ - do{ \ - const unsigned char *src; \ - unsigned char *dst; \ - src=(_src); \ - dst=(_dst); \ - __asm mov SRC,src \ - __asm mov DST,dst \ - __asm mov YSTRIDE,_ystride \ - /*src+0*ystride*/ \ - __asm movq mm0,[SRC] \ - /*src+1*ystride*/ \ - __asm movq mm1,[SRC+YSTRIDE] \ - /*ystride3=ystride*3*/ \ - __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \ - /*src+2*ystride*/ \ - __asm movq mm2,[SRC+YSTRIDE*2] \ - /*src+3*ystride*/ \ - __asm movq mm3,[SRC+YSTRIDE3] \ - /*dst+0*ystride*/ \ - __asm movq [DST],mm0 \ - /*dst+1*ystride*/ \ - __asm movq [DST+YSTRIDE],mm1 \ - /*Pointer to next 4.*/ \ - __asm lea SRC,[SRC+YSTRIDE*4] \ - /*dst+2*ystride*/ \ - __asm movq [DST+YSTRIDE*2],mm2 \ - /*dst+3*ystride*/ \ - __asm movq [DST+YSTRIDE3],mm3 \ - /*Pointer to next 4.*/ \ - __asm lea DST,[DST+YSTRIDE*4] \ - /*src+0*ystride*/ \ - __asm movq mm0,[SRC] \ - /*src+1*ystride*/ \ - __asm movq mm1,[SRC+YSTRIDE] \ - /*src+2*ystride*/ \ - __asm movq mm2,[SRC+YSTRIDE*2] \ - /*src+3*ystride*/ \ - __asm movq mm3,[SRC+YSTRIDE3] \ - /*dst+0*ystride*/ \ - __asm movq [DST],mm0 \ - /*dst+1*ystride*/ \ - __asm movq 
[DST+YSTRIDE],mm1 \ - /*dst+2*ystride*/ \ - __asm movq [DST+YSTRIDE*2],mm2 \ - /*dst+3*ystride*/ \ - __asm movq [DST+YSTRIDE3],mm3 \ - } \ - while(0) - -/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes - between rows.*/ -void oc_frag_copy_mmx(unsigned char *_dst, - const unsigned char *_src,int _ystride){ -#define SRC edx -#define DST eax -#define YSTRIDE ecx -#define YSTRIDE3 esi - OC_FRAG_COPY_MMX(_dst,_src,_ystride); -#undef SRC -#undef DST -#undef YSTRIDE -#undef YSTRIDE3 -} - -/*Copies the fragments specified by the lists of fragment indices from one - frame to another. - _dst_frame: The reference frame to copy to. - _src_frame: The reference frame to copy from. - _ystride: The row stride of the reference frames. - _fragis: A pointer to a list of fragment indices. - _nfragis: The number of fragment indices to copy. - _frag_buf_offs: The offsets of fragments in the reference frames.*/ -void oc_frag_copy_list_mmx(unsigned char *_dst_frame, - const unsigned char *_src_frame,int _ystride, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){ - ptrdiff_t fragii; - for(fragii=0;fragii<_nfragis;fragii++){ - ptrdiff_t frag_buf_off; - frag_buf_off=_frag_buf_offs[_fragis[fragii]]; -#define SRC edx -#define DST eax -#define YSTRIDE ecx -#define YSTRIDE3 edi - OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off, - _src_frame+frag_buf_off,_ystride); -#undef SRC -#undef DST -#undef YSTRIDE -#undef YSTRIDE3 - } -} - -void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, - const ogg_int16_t *_residue){ - __asm{ -#define DST edx -#define DST4 esi -#define YSTRIDE eax -#define YSTRIDE3 edi -#define RESIDUE ecx - mov DST,_dst - mov YSTRIDE,_ystride - mov RESIDUE,_residue - lea DST4,[DST+YSTRIDE*4] - lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] - /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/ - pcmpeqw mm0,mm0 - /*#0 Load low residue.*/ - movq mm1,[0*8+RESIDUE] - /*#0 Load high residue.*/ - movq mm2,[1*8+RESIDUE] - /*Set mm0 to 0x8000800080008000.*/ - 
psllw mm0,15 - /*#1 Load low residue.*/ - movq mm3,[2*8+RESIDUE] - /*#1 Load high residue.*/ - movq mm4,[3*8+RESIDUE] - /*Set mm0 to 0x0080008000800080.*/ - psrlw mm0,8 - /*#2 Load low residue.*/ - movq mm5,[4*8+RESIDUE] - /*#2 Load high residue.*/ - movq mm6,[5*8+RESIDUE] - /*#0 Bias low residue.*/ - paddsw mm1,mm0 - /*#0 Bias high residue.*/ - paddsw mm2,mm0 - /*#0 Pack to byte.*/ - packuswb mm1,mm2 - /*#1 Bias low residue.*/ - paddsw mm3,mm0 - /*#1 Bias high residue.*/ - paddsw mm4,mm0 - /*#1 Pack to byte.*/ - packuswb mm3,mm4 - /*#2 Bias low residue.*/ - paddsw mm5,mm0 - /*#2 Bias high residue.*/ - paddsw mm6,mm0 - /*#2 Pack to byte.*/ - packuswb mm5,mm6 - /*#0 Write row.*/ - movq [DST],mm1 - /*#1 Write row.*/ - movq [DST+YSTRIDE],mm3 - /*#2 Write row.*/ - movq [DST+YSTRIDE*2],mm5 - /*#3 Load low residue.*/ - movq mm1,[6*8+RESIDUE] - /*#3 Load high residue.*/ - movq mm2,[7*8+RESIDUE] - /*#4 Load high residue.*/ - movq mm3,[8*8+RESIDUE] - /*#4 Load high residue.*/ - movq mm4,[9*8+RESIDUE] - /*#5 Load high residue.*/ - movq mm5,[10*8+RESIDUE] - /*#5 Load high residue.*/ - movq mm6,[11*8+RESIDUE] - /*#3 Bias low residue.*/ - paddsw mm1,mm0 - /*#3 Bias high residue.*/ - paddsw mm2,mm0 - /*#3 Pack to byte.*/ - packuswb mm1,mm2 - /*#4 Bias low residue.*/ - paddsw mm3,mm0 - /*#4 Bias high residue.*/ - paddsw mm4,mm0 - /*#4 Pack to byte.*/ - packuswb mm3,mm4 - /*#5 Bias low residue.*/ - paddsw mm5,mm0 - /*#5 Bias high residue.*/ - paddsw mm6,mm0 - /*#5 Pack to byte.*/ - packuswb mm5,mm6 - /*#3 Write row.*/ - movq [DST+YSTRIDE3],mm1 - /*#4 Write row.*/ - movq [DST4],mm3 - /*#5 Write row.*/ - movq [DST4+YSTRIDE],mm5 - /*#6 Load low residue.*/ - movq mm1,[12*8+RESIDUE] - /*#6 Load high residue.*/ - movq mm2,[13*8+RESIDUE] - /*#7 Load low residue.*/ - movq mm3,[14*8+RESIDUE] - /*#7 Load high residue.*/ - movq mm4,[15*8+RESIDUE] - /*#6 Bias low residue.*/ - paddsw mm1,mm0 - /*#6 Bias high residue.*/ - paddsw mm2,mm0 - /*#6 Pack to byte.*/ - packuswb mm1,mm2 - /*#7 Bias low 
residue.*/ - paddsw mm3,mm0 - /*#7 Bias high residue.*/ - paddsw mm4,mm0 - /*#7 Pack to byte.*/ - packuswb mm3,mm4 - /*#6 Write row.*/ - movq [DST4+YSTRIDE*2],mm1 - /*#7 Write row.*/ - movq [DST4+YSTRIDE3],mm3 -#undef DST -#undef DST4 -#undef YSTRIDE -#undef YSTRIDE3 -#undef RESIDUE - } -} - -void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src, - int _ystride,const ogg_int16_t *_residue){ - int i; - /*Zero mm0.*/ - __asm pxor mm0,mm0; - for(i=4;i-->0;){ - __asm{ -#define DST edx -#define SRC ecx -#define YSTRIDE edi -#define RESIDUE eax - mov DST,_dst - mov SRC,_src - mov YSTRIDE,_ystride - mov RESIDUE,_residue - /*#0 Load source.*/ - movq mm3,[SRC] - /*#1 Load source.*/ - movq mm7,[SRC+YSTRIDE] - /*#0 Get copy of src.*/ - movq mm4,mm3 - /*#0 Expand high source.*/ - punpckhbw mm4,mm0 - /*#0 Expand low source.*/ - punpcklbw mm3,mm0 - /*#0 Add residue high.*/ - paddsw mm4,[8+RESIDUE] - /*#1 Get copy of src.*/ - movq mm2,mm7 - /*#0 Add residue low.*/ - paddsw mm3,[RESIDUE] - /*#1 Expand high source.*/ - punpckhbw mm2,mm0 - /*#0 Pack final row pixels.*/ - packuswb mm3,mm4 - /*#1 Expand low source.*/ - punpcklbw mm7,mm0 - /*#1 Add residue low.*/ - paddsw mm7,[16+RESIDUE] - /*#1 Add residue high.*/ - paddsw mm2,[24+RESIDUE] - /*Advance residue.*/ - lea RESIDUE,[32+RESIDUE] - /*#1 Pack final row pixels.*/ - packuswb mm7,mm2 - /*Advance src.*/ - lea SRC,[SRC+YSTRIDE*2] - /*#0 Write row.*/ - movq [DST],mm3 - /*#1 Write row.*/ - movq [DST+YSTRIDE],mm7 - /*Advance dst.*/ - lea DST,[DST+YSTRIDE*2] - mov _residue,RESIDUE - mov _dst,DST - mov _src,SRC -#undef DST -#undef SRC -#undef YSTRIDE -#undef RESIDUE - } - } -} - -void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1, - const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){ - int i; - /*Zero mm7.*/ - __asm pxor mm7,mm7; - for(i=4;i-->0;){ - __asm{ -#define SRC1 ecx -#define SRC2 edi -#define YSTRIDE esi -#define RESIDUE edx -#define DST eax - mov YSTRIDE,_ystride 
- mov DST,_dst - mov RESIDUE,_residue - mov SRC1,_src1 - mov SRC2,_src2 - /*#0 Load src1.*/ - movq mm0,[SRC1] - /*#0 Load src2.*/ - movq mm2,[SRC2] - /*#0 Copy src1.*/ - movq mm1,mm0 - /*#0 Copy src2.*/ - movq mm3,mm2 - /*#1 Load src1.*/ - movq mm4,[SRC1+YSTRIDE] - /*#0 Unpack lower src1.*/ - punpcklbw mm0,mm7 - /*#1 Load src2.*/ - movq mm5,[SRC2+YSTRIDE] - /*#0 Unpack higher src1.*/ - punpckhbw mm1,mm7 - /*#0 Unpack lower src2.*/ - punpcklbw mm2,mm7 - /*#0 Unpack higher src2.*/ - punpckhbw mm3,mm7 - /*Advance src1 ptr.*/ - lea SRC1,[SRC1+YSTRIDE*2] - /*Advance src2 ptr.*/ - lea SRC2,[SRC2+YSTRIDE*2] - /*#0 Lower src1+src2.*/ - paddsw mm0,mm2 - /*#0 Higher src1+src2.*/ - paddsw mm1,mm3 - /*#1 Copy src1.*/ - movq mm2,mm4 - /*#0 Build lo average.*/ - psraw mm0,1 - /*#1 Copy src2.*/ - movq mm3,mm5 - /*#1 Unpack lower src1.*/ - punpcklbw mm4,mm7 - /*#0 Build hi average.*/ - psraw mm1,1 - /*#1 Unpack higher src1.*/ - punpckhbw mm2,mm7 - /*#0 low+=residue.*/ - paddsw mm0,[RESIDUE] - /*#1 Unpack lower src2.*/ - punpcklbw mm5,mm7 - /*#0 high+=residue.*/ - paddsw mm1,[8+RESIDUE] - /*#1 Unpack higher src2.*/ - punpckhbw mm3,mm7 - /*#1 Lower src1+src2.*/ - paddsw mm5,mm4 - /*#0 Pack and saturate.*/ - packuswb mm0,mm1 - /*#1 Higher src1+src2.*/ - paddsw mm3,mm2 - /*#0 Write row.*/ - movq [DST],mm0 - /*#1 Build lo average.*/ - psraw mm5,1 - /*#1 Build hi average.*/ - psraw mm3,1 - /*#1 low+=residue.*/ - paddsw mm5,[16+RESIDUE] - /*#1 high+=residue.*/ - paddsw mm3,[24+RESIDUE] - /*#1 Pack and saturate.*/ - packuswb mm5,mm3 - /*#1 Write row ptr.*/ - movq [DST+YSTRIDE],mm5 - /*Advance residue ptr.*/ - add RESIDUE,32 - /*Advance dest ptr.*/ - lea DST,[DST+YSTRIDE*2] - mov _dst,DST - mov _residue,RESIDUE - mov _src1,SRC1 - mov _src2,SRC2 -#undef SRC1 -#undef SRC2 -#undef YSTRIDE -#undef RESIDUE -#undef DST - } - } -} - -void oc_restore_fpu_mmx(void){ - __asm emms; -} - -#endif diff --git a/media/libtheora/lib/x86_vc/mmxidct.c b/media/libtheora/lib/x86_vc/mmxidct.c deleted file mode 
100644 index 53a9ac7f3..000000000 --- a/media/libtheora/lib/x86_vc/mmxidct.c +++ /dev/null @@ -1,597 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $ - - ********************************************************************/ - -/*MMX acceleration of Theora's iDCT. - Originally written by Rudolf Marek, based on code from On2's VP3.*/ -#include "x86int.h" -#include "../dct.h" - -#if defined(OC_X86_ASM) - -/*These are offsets into the table of constants below.*/ -/*7 rows of cosines, in order: pi/16 * (1 ... 
7).*/ -#define OC_COSINE_OFFSET (8) -/*A row of 8's.*/ -#define OC_EIGHT_OFFSET (0) - - - -/*A table of constants used by the MMX routines.*/ -static const OC_ALIGN16(ogg_uint16_t) OC_IDCT_CONSTS[(1+7)*4]={ - 8, 8, 8, 8, - (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, - (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, - (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, - (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, - (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, - (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, - (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, - (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, - (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, - (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, - (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, - (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, - (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, - (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1 -}; - -/*38 cycles*/ -#define OC_IDCT_BEGIN(_y,_x) __asm{ \ - __asm movq mm2,OC_I(3,_x) \ - __asm movq mm6,OC_C(3) \ - __asm movq mm4,mm2 \ - __asm movq mm7,OC_J(5,_x) \ - __asm pmulhw mm4,mm6 \ - __asm movq mm1,OC_C(5) \ - __asm pmulhw mm6,mm7 \ - __asm movq mm5,mm1 \ - __asm pmulhw mm1,mm2 \ - __asm movq mm3,OC_I(1,_x) \ - __asm pmulhw mm5,mm7 \ - __asm movq mm0,OC_C(1) \ - __asm paddw mm4,mm2 \ - __asm paddw mm6,mm7 \ - __asm paddw mm2,mm1 \ - __asm movq mm1,OC_J(7,_x) \ - __asm paddw mm7,mm5 \ - __asm movq mm5,mm0 \ - __asm pmulhw mm0,mm3 \ - __asm paddw mm4,mm7 \ - __asm pmulhw mm5,mm1 \ - __asm movq mm7,OC_C(7) \ - __asm psubw mm6,mm2 \ - __asm paddw mm0,mm3 \ - __asm pmulhw mm3,mm7 \ - __asm movq mm2,OC_I(2,_x) \ - __asm pmulhw mm7,mm1 \ - __asm paddw mm5,mm1 \ - __asm movq mm1,mm2 \ - __asm pmulhw mm2,OC_C(2) \ - __asm psubw mm3,mm5 \ - __asm movq mm5,OC_J(6,_x) \ - __asm paddw mm0,mm7 \ - __asm movq mm7,mm5 \ - __asm psubw mm0,mm4 \ - __asm pmulhw mm5,OC_C(2) \ - __asm paddw mm2,mm1 \ - __asm pmulhw mm1,OC_C(6) \ - __asm paddw mm4,mm4 \ - __asm paddw mm4,mm0 \ - __asm psubw mm3,mm6 \ - __asm 
paddw mm5,mm7 \ - __asm paddw mm6,mm6 \ - __asm pmulhw mm7,OC_C(6) \ - __asm paddw mm6,mm3 \ - __asm movq OC_I(1,_y),mm4 \ - __asm psubw mm1,mm5 \ - __asm movq mm4,OC_C(4) \ - __asm movq mm5,mm3 \ - __asm pmulhw mm3,mm4 \ - __asm paddw mm7,mm2 \ - __asm movq OC_I(2,_y),mm6 \ - __asm movq mm2,mm0 \ - __asm movq mm6,OC_I(0,_x) \ - __asm pmulhw mm0,mm4 \ - __asm paddw mm5,mm3 \ - __asm movq mm3,OC_J(4,_x) \ - __asm psubw mm5,mm1 \ - __asm paddw mm2,mm0 \ - __asm psubw mm6,mm3 \ - __asm movq mm0,mm6 \ - __asm pmulhw mm6,mm4 \ - __asm paddw mm3,mm3 \ - __asm paddw mm1,mm1 \ - __asm paddw mm3,mm0 \ - __asm paddw mm1,mm5 \ - __asm pmulhw mm4,mm3 \ - __asm paddw mm6,mm0 \ - __asm psubw mm6,mm2 \ - __asm paddw mm2,mm2 \ - __asm movq mm0,OC_I(1,_y) \ - __asm paddw mm2,mm6 \ - __asm paddw mm4,mm3 \ - __asm psubw mm2,mm1 \ -} - -/*38+8=46 cycles.*/ -#define OC_ROW_IDCT(_y,_x) __asm{ \ - OC_IDCT_BEGIN(_y,_x) \ - /*r3=D'*/ \ - __asm movq mm3,OC_I(2,_y) \ - /*r4=E'=E-G*/ \ - __asm psubw mm4,mm7 \ - /*r1=H'+H'*/ \ - __asm paddw mm1,mm1 \ - /*r7=G+G*/ \ - __asm paddw mm7,mm7 \ - /*r1=R1=A''+H'*/ \ - __asm paddw mm1,mm2 \ - /*r7=G'=E+G*/ \ - __asm paddw mm7,mm4 \ - /*r4=R4=E'-D'*/ \ - __asm psubw mm4,mm3 \ - __asm paddw mm3,mm3 \ - /*r6=R6=F'-B''*/ \ - __asm psubw mm6,mm5 \ - __asm paddw mm5,mm5 \ - /*r3=R3=E'+D'*/ \ - __asm paddw mm3,mm4 \ - /*r5=R5=F'+B''*/ \ - __asm paddw mm5,mm6 \ - /*r7=R7=G'-C'*/ \ - __asm psubw mm7,mm0 \ - __asm paddw mm0,mm0 \ - /*Save R1.*/ \ - __asm movq OC_I(1,_y),mm1 \ - /*r0=R0=G.+C.*/ \ - __asm paddw mm0,mm7 \ -} - -/*The following macro does two 4x4 transposes in place. 
- At entry, we assume: - r0 = a3 a2 a1 a0 - I(1) = b3 b2 b1 b0 - r2 = c3 c2 c1 c0 - r3 = d3 d2 d1 d0 - - r4 = e3 e2 e1 e0 - r5 = f3 f2 f1 f0 - r6 = g3 g2 g1 g0 - r7 = h3 h2 h1 h0 - - At exit, we have: - I(0) = d0 c0 b0 a0 - I(1) = d1 c1 b1 a1 - I(2) = d2 c2 b2 a2 - I(3) = d3 c3 b3 a3 - - J(4) = h0 g0 f0 e0 - J(5) = h1 g1 f1 e1 - J(6) = h2 g2 f2 e2 - J(7) = h3 g3 f3 e3 - - I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. - J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. - - Since r1 is free at entry, we calculate the Js first.*/ -/*19 cycles.*/ -#define OC_TRANSPOSE(_y) __asm{ \ - __asm movq mm1,mm4 \ - __asm punpcklwd mm4,mm5 \ - __asm movq OC_I(0,_y),mm0 \ - __asm punpckhwd mm1,mm5 \ - __asm movq mm0,mm6 \ - __asm punpcklwd mm6,mm7 \ - __asm movq mm5,mm4 \ - __asm punpckldq mm4,mm6 \ - __asm punpckhdq mm5,mm6 \ - __asm movq mm6,mm1 \ - __asm movq OC_J(4,_y),mm4 \ - __asm punpckhwd mm0,mm7 \ - __asm movq OC_J(5,_y),mm5 \ - __asm punpckhdq mm6,mm0 \ - __asm movq mm4,OC_I(0,_y) \ - __asm punpckldq mm1,mm0 \ - __asm movq mm5,OC_I(1,_y) \ - __asm movq mm0,mm4 \ - __asm movq OC_J(7,_y),mm6 \ - __asm punpcklwd mm0,mm5 \ - __asm movq OC_J(6,_y),mm1 \ - __asm punpckhwd mm4,mm5 \ - __asm movq mm5,mm2 \ - __asm punpcklwd mm2,mm3 \ - __asm movq mm1,mm0 \ - __asm punpckldq mm0,mm2 \ - __asm punpckhdq mm1,mm2 \ - __asm movq mm2,mm4 \ - __asm movq OC_I(0,_y),mm0 \ - __asm punpckhwd mm5,mm3 \ - __asm movq OC_I(1,_y),mm1 \ - __asm punpckhdq mm4,mm5 \ - __asm punpckldq mm2,mm5 \ - __asm movq OC_I(3,_y),mm4 \ - __asm movq OC_I(2,_y),mm2 \ -} - -/*38+19=57 cycles.*/ -#define OC_COLUMN_IDCT(_y) __asm{ \ - OC_IDCT_BEGIN(_y,_y) \ - __asm paddw mm2,OC_8 \ - /*r1=H'+H'*/ \ - __asm paddw mm1,mm1 \ - /*r1=R1=A''+H'*/ \ - __asm paddw mm1,mm2 \ - /*r2=NR2*/ \ - __asm psraw mm2,4 \ - /*r4=E'=E-G*/ \ - __asm psubw mm4,mm7 \ - /*r1=NR1*/ \ - __asm psraw mm1,4 \ - /*r3=D'*/ \ - __asm movq mm3,OC_I(2,_y) \ - /*r7=G+G*/ \ - __asm paddw mm7,mm7 \ - /*Store NR2 at I(2).*/ \ - __asm movq 
OC_I(2,_y),mm2 \ - /*r7=G'=E+G*/ \ - __asm paddw mm7,mm4 \ - /*Store NR1 at I(1).*/ \ - __asm movq OC_I(1,_y),mm1 \ - /*r4=R4=E'-D'*/ \ - __asm psubw mm4,mm3 \ - __asm paddw mm4,OC_8 \ - /*r3=D'+D'*/ \ - __asm paddw mm3,mm3 \ - /*r3=R3=E'+D'*/ \ - __asm paddw mm3,mm4 \ - /*r4=NR4*/ \ - __asm psraw mm4,4 \ - /*r6=R6=F'-B''*/ \ - __asm psubw mm6,mm5 \ - /*r3=NR3*/ \ - __asm psraw mm3,4 \ - __asm paddw mm6,OC_8 \ - /*r5=B''+B''*/ \ - __asm paddw mm5,mm5 \ - /*r5=R5=F'+B''*/ \ - __asm paddw mm5,mm6 \ - /*r6=NR6*/ \ - __asm psraw mm6,4 \ - /*Store NR4 at J(4).*/ \ - __asm movq OC_J(4,_y),mm4 \ - /*r5=NR5*/ \ - __asm psraw mm5,4 \ - /*Store NR3 at I(3).*/ \ - __asm movq OC_I(3,_y),mm3 \ - /*r7=R7=G'-C'*/ \ - __asm psubw mm7,mm0 \ - __asm paddw mm7,OC_8 \ - /*r0=C'+C'*/ \ - __asm paddw mm0,mm0 \ - /*r0=R0=G'+C'*/ \ - __asm paddw mm0,mm7 \ - /*r7=NR7*/ \ - __asm psraw mm7,4 \ - /*Store NR6 at J(6).*/ \ - __asm movq OC_J(6,_y),mm6 \ - /*r0=NR0*/ \ - __asm psraw mm0,4 \ - /*Store NR5 at J(5).*/ \ - __asm movq OC_J(5,_y),mm5 \ - /*Store NR7 at J(7).*/ \ - __asm movq OC_J(7,_y),mm7 \ - /*Store NR0 at I(0).*/ \ - __asm movq OC_I(0,_y),mm0 \ -} - -#define OC_MID(_m,_i) [CONSTS+_m+(_i)*8] -#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1) -#define OC_8 OC_MID(OC_EIGHT_OFFSET,0) - -static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){ - int i; - /*This routine accepts an 8x8 matrix, but in partially transposed form. 
- Every 4x4 block is transposed.*/ - __asm{ -#define CONSTS eax -#define Y edx -#define X ecx - mov CONSTS,offset OC_IDCT_CONSTS - mov Y,_y - mov X,_x -#define OC_I(_k,_y) [(_y)+(_k)*16] -#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8] - OC_ROW_IDCT(Y,X) - OC_TRANSPOSE(Y) -#undef OC_I -#undef OC_J -#define OC_I(_k,_y) [(_y)+(_k)*16+64] -#define OC_J(_k,_y) [(_y)+((_k)-4)*16+72] - OC_ROW_IDCT(Y,X) - OC_TRANSPOSE(Y) -#undef OC_I -#undef OC_J -#define OC_I(_k,_y) [(_y)+(_k)*16] -#define OC_J(_k,_y) OC_I(_k,_y) - OC_COLUMN_IDCT(Y) -#undef OC_I -#undef OC_J -#define OC_I(_k,_y) [(_y)+(_k)*16+8] -#define OC_J(_k,_y) OC_I(_k,_y) - OC_COLUMN_IDCT(Y) -#undef OC_I -#undef OC_J -#undef CONSTS -#undef Y -#undef X - } - if(_x!=_y){ - int i; - __asm pxor mm0,mm0; - for(i=0;i<4;i++){ - ogg_int16_t *x; - x=_x+16*i; -#define X ecx - __asm{ - mov X,x - movq [X+0x00],mm0 - movq [X+0x08],mm0 - movq [X+0x10],mm0 - movq [X+0x18],mm0 - } -#undef X - } - } -} - -/*25 cycles.*/ -#define OC_IDCT_BEGIN_10(_y,_x) __asm{ \ - __asm movq mm2,OC_I(3,_x) \ - __asm nop \ - __asm movq mm6,OC_C(3) \ - __asm movq mm4,mm2 \ - __asm movq mm1,OC_C(5) \ - __asm pmulhw mm4,mm6 \ - __asm movq mm3,OC_I(1,_x) \ - __asm pmulhw mm1,mm2 \ - __asm movq mm0,OC_C(1) \ - __asm paddw mm4,mm2 \ - __asm pxor mm6,mm6 \ - __asm paddw mm2,mm1 \ - __asm movq mm5,OC_I(2,_x) \ - __asm pmulhw mm0,mm3 \ - __asm movq mm1,mm5 \ - __asm paddw mm0,mm3 \ - __asm pmulhw mm3,OC_C(7) \ - __asm psubw mm6,mm2 \ - __asm pmulhw mm5,OC_C(2) \ - __asm psubw mm0,mm4 \ - __asm movq mm7,OC_I(2,_x) \ - __asm paddw mm4,mm4 \ - __asm paddw mm7,mm5 \ - __asm paddw mm4,mm0 \ - __asm pmulhw mm1,OC_C(6) \ - __asm psubw mm3,mm6 \ - __asm movq OC_I(1,_y),mm4 \ - __asm paddw mm6,mm6 \ - __asm movq mm4,OC_C(4) \ - __asm paddw mm6,mm3 \ - __asm movq mm5,mm3 \ - __asm pmulhw mm3,mm4 \ - __asm movq OC_I(2,_y),mm6 \ - __asm movq mm2,mm0 \ - __asm movq mm6,OC_I(0,_x) \ - __asm pmulhw mm0,mm4 \ - __asm paddw mm5,mm3 \ - __asm paddw mm2,mm0 \ - __asm psubw mm5,mm1 \ - 
__asm pmulhw mm6,mm4 \ - __asm paddw mm6,OC_I(0,_x) \ - __asm paddw mm1,mm1 \ - __asm movq mm4,mm6 \ - __asm paddw mm1,mm5 \ - __asm psubw mm6,mm2 \ - __asm paddw mm2,mm2 \ - __asm movq mm0,OC_I(1,_y) \ - __asm paddw mm2,mm6 \ - __asm psubw mm2,mm1 \ - __asm nop \ -} - -/*25+8=33 cycles.*/ -#define OC_ROW_IDCT_10(_y,_x) __asm{ \ - OC_IDCT_BEGIN_10(_y,_x) \ - /*r3=D'*/ \ - __asm movq mm3,OC_I(2,_y) \ - /*r4=E'=E-G*/ \ - __asm psubw mm4,mm7 \ - /*r1=H'+H'*/ \ - __asm paddw mm1,mm1 \ - /*r7=G+G*/ \ - __asm paddw mm7,mm7 \ - /*r1=R1=A''+H'*/ \ - __asm paddw mm1,mm2 \ - /*r7=G'=E+G*/ \ - __asm paddw mm7,mm4 \ - /*r4=R4=E'-D'*/ \ - __asm psubw mm4,mm3 \ - __asm paddw mm3,mm3 \ - /*r6=R6=F'-B''*/ \ - __asm psubw mm6,mm5 \ - __asm paddw mm5,mm5 \ - /*r3=R3=E'+D'*/ \ - __asm paddw mm3,mm4 \ - /*r5=R5=F'+B''*/ \ - __asm paddw mm5,mm6 \ - /*r7=R7=G'-C'*/ \ - __asm psubw mm7,mm0 \ - __asm paddw mm0,mm0 \ - /*Save R1.*/ \ - __asm movq OC_I(1,_y),mm1 \ - /*r0=R0=G'+C'*/ \ - __asm paddw mm0,mm7 \ -} - -/*25+19=44 cycles'*/ -#define OC_COLUMN_IDCT_10(_y) __asm{ \ - OC_IDCT_BEGIN_10(_y,_y) \ - __asm paddw mm2,OC_8 \ - /*r1=H'+H'*/ \ - __asm paddw mm1,mm1 \ - /*r1=R1=A''+H'*/ \ - __asm paddw mm1,mm2 \ - /*r2=NR2*/ \ - __asm psraw mm2,4 \ - /*r4=E'=E-G*/ \ - __asm psubw mm4,mm7 \ - /*r1=NR1*/ \ - __asm psraw mm1,4 \ - /*r3=D'*/ \ - __asm movq mm3,OC_I(2,_y) \ - /*r7=G+G*/ \ - __asm paddw mm7,mm7 \ - /*Store NR2 at I(2).*/ \ - __asm movq OC_I(2,_y),mm2 \ - /*r7=G'=E+G*/ \ - __asm paddw mm7,mm4 \ - /*Store NR1 at I(1).*/ \ - __asm movq OC_I(1,_y),mm1 \ - /*r4=R4=E'-D'*/ \ - __asm psubw mm4,mm3 \ - __asm paddw mm4,OC_8 \ - /*r3=D'+D'*/ \ - __asm paddw mm3,mm3 \ - /*r3=R3=E'+D'*/ \ - __asm paddw mm3,mm4 \ - /*r4=NR4*/ \ - __asm psraw mm4,4 \ - /*r6=R6=F'-B''*/ \ - __asm psubw mm6,mm5 \ - /*r3=NR3*/ \ - __asm psraw mm3,4 \ - __asm paddw mm6,OC_8 \ - /*r5=B''+B''*/ \ - __asm paddw mm5,mm5 \ - /*r5=R5=F'+B''*/ \ - __asm paddw mm5,mm6 \ - /*r6=NR6*/ \ - __asm psraw mm6,4 \ - /*Store NR4 at 
J(4).*/ \ - __asm movq OC_J(4,_y),mm4 \ - /*r5=NR5*/ \ - __asm psraw mm5,4 \ - /*Store NR3 at I(3).*/ \ - __asm movq OC_I(3,_y),mm3 \ - /*r7=R7=G'-C'*/ \ - __asm psubw mm7,mm0 \ - __asm paddw mm7,OC_8 \ - /*r0=C'+C'*/ \ - __asm paddw mm0,mm0 \ - /*r0=R0=G'+C'*/ \ - __asm paddw mm0,mm7 \ - /*r7=NR7*/ \ - __asm psraw mm7,4 \ - /*Store NR6 at J(6).*/ \ - __asm movq OC_J(6,_y),mm6 \ - /*r0=NR0*/ \ - __asm psraw mm0,4 \ - /*Store NR5 at J(5).*/ \ - __asm movq OC_J(5,_y),mm5 \ - /*Store NR7 at J(7).*/ \ - __asm movq OC_J(7,_y),mm7 \ - /*Store NR0 at I(0).*/ \ - __asm movq OC_I(0,_y),mm0 \ -} - -static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){ - __asm{ -#define CONSTS eax -#define Y edx -#define X ecx - mov CONSTS,offset OC_IDCT_CONSTS - mov Y,_y - mov X,_x -#define OC_I(_k,_y) [(_y)+(_k)*16] -#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8] - /*Done with dequant, descramble, and partial transpose. - Now do the iDCT itself.*/ - OC_ROW_IDCT_10(Y,X) - OC_TRANSPOSE(Y) -#undef OC_I -#undef OC_J -#define OC_I(_k,_y) [(_y)+(_k)*16] -#define OC_J(_k,_y) OC_I(_k,_y) - OC_COLUMN_IDCT_10(Y) -#undef OC_I -#undef OC_J -#define OC_I(_k,_y) [(_y)+(_k)*16+8] -#define OC_J(_k,_y) OC_I(_k,_y) - OC_COLUMN_IDCT_10(Y) -#undef OC_I -#undef OC_J -#undef CONSTS -#undef Y -#undef X - } - if(_x!=_y){ -#define X ecx - __asm{ - pxor mm0,mm0; - mov X,_x - movq [X+0x00],mm0 - movq [X+0x10],mm0 - movq [X+0x20],mm0 - movq [X+0x30],mm0 - } -#undef X - } -} - -/*Performs an inverse 8x8 Type-II DCT transform. - The input is assumed to be scaled by a factor of 4 relative to orthonormal - version of the transform.*/ -void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ - /*_last_zzi is subtly different from an actual count of the number of - coefficients we decoded for this block. - It contains the value of zzi BEFORE the final token in the block was - decoded. 
- In most cases this is an EOB token (the continuation of an EOB run from a - previous block counts), and so this is the same as the coefficient count. - However, in the case that the last token was NOT an EOB token, but filled - the block up with exactly 64 coefficients, _last_zzi will be less than 64. - Provided the last token was not a pure zero run, the minimum value it can - be is 46, and so that doesn't affect any of the cases in this routine. - However, if the last token WAS a pure zero run of length 63, then _last_zzi - will be 1 while the number of coefficients decoded is 64. - Thus, we will trigger the following special case, where the real - coefficient count would not. - Note also that a zero run of length 64 will give _last_zzi a value of 0, - but we still process the DC coefficient, which might have a non-zero value - due to DC prediction. - Although convoluted, this is arguably the correct behavior: it allows us to - use a smaller transform when the block ends with a long zero run instead - of a normal EOB token. - It could be smarter... multiple separate zero runs at the end of a block - will fool it, but an encoder that generates these really deserves what it - gets. - Needless to say we inherited this approach from VP3.*/ - /*Perform the iDCT.*/ - if(_last_zzi<=10)oc_idct8x8_10(_y,_x); - else oc_idct8x8_slow(_y,_x); -} - -#endif diff --git a/media/libtheora/lib/x86_vc/mmxloop.h b/media/libtheora/lib/x86_vc/mmxloop.h deleted file mode 100644 index 2561fca2a..000000000 --- a/media/libtheora/lib/x86_vc/mmxloop.h +++ /dev/null @@ -1,219 +0,0 @@ -#if !defined(_x86_vc_mmxloop_H) -# define _x86_vc_mmxloop_H (1) -# include <stddef.h> -# include "x86int.h" - -#if defined(OC_X86_ASM) - -/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}. 
- On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and - mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/ -#define OC_LOOP_FILTER8_MMX __asm{ \ - /*mm7=0*/ \ - __asm pxor mm7,mm7 \ - /*mm6:mm0={a0,...,a7}*/ \ - __asm movq mm6,mm0 \ - __asm punpcklbw mm0,mm7 \ - __asm punpckhbw mm6,mm7 \ - /*mm3:mm5={d0,...,d7}*/ \ - __asm movq mm5,mm3 \ - __asm punpcklbw mm3,mm7 \ - __asm punpckhbw mm5,mm7 \ - /*mm6:mm0={a0-d0,...,a7-d7}*/ \ - __asm psubw mm0,mm3 \ - __asm psubw mm6,mm5 \ - /*mm3:mm1={b0,...,b7}*/ \ - __asm movq mm3,mm1 \ - __asm punpcklbw mm1,mm7 \ - __asm movq mm4,mm2 \ - __asm punpckhbw mm3,mm7 \ - /*mm5:mm4={c0,...,c7}*/ \ - __asm movq mm5,mm2 \ - __asm punpcklbw mm4,mm7 \ - __asm punpckhbw mm5,mm7 \ - /*mm7={3}x4 \ - mm5:mm4={c0-b0,...,c7-b7}*/ \ - __asm pcmpeqw mm7,mm7 \ - __asm psubw mm4,mm1 \ - __asm psrlw mm7,14 \ - __asm psubw mm5,mm3 \ - /*Scale by 3.*/ \ - __asm pmullw mm4,mm7 \ - __asm pmullw mm5,mm7 \ - /*mm7={4}x4 \ - mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \ - __asm psrlw mm7,1 \ - __asm paddw mm4,mm0 \ - __asm psllw mm7,2 \ - __asm movq mm0,[LL] \ - __asm paddw mm5,mm6 \ - /*R_i has the range [-127,128], so we compute -R_i instead. \ - mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \ - __asm psubw mm4,mm7 \ - __asm psubw mm5,mm7 \ - __asm psraw mm4,3 \ - __asm psraw mm5,3 \ - __asm pcmpeqb mm7,mm7 \ - __asm packsswb mm4,mm5 \ - __asm pxor mm6,mm6 \ - __asm pxor mm4,mm7 \ - __asm packuswb mm1,mm3 \ - /*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \ - /*There's no unsigned byte+signed byte with unsigned saturation op code, so \ - we have to split things by sign (the other option is to work in 16 bits, \ - but working in 8 bits gives much better parallelism). \ - We compute abs(R_i), but save a mask of which terms were negative in mm6. \ - Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). 
\ - Finally, we split mm4 into positive and negative pieces using the mask in \ - mm6, and add and subtract them as appropriate.*/ \ - /*mm4=abs(-R_i)*/ \ - /*mm7=255-2*L*/ \ - __asm pcmpgtb mm6,mm4 \ - __asm psubb mm7,mm0 \ - __asm pxor mm4,mm6 \ - __asm psubb mm7,mm0 \ - __asm psubb mm4,mm6 \ - /*mm7=255-max(2*L-abs(R_i),0)*/ \ - __asm paddusb mm7,mm4 \ - /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \ - __asm paddusb mm4,mm7 \ - __asm psubusb mm4,mm7 \ - /*Now split mm4 by the original sign of -R_i.*/ \ - __asm movq mm5,mm4 \ - __asm pand mm4,mm6 \ - __asm pandn mm6,mm5 \ - /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \ - /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \ - __asm paddusb mm1,mm4 \ - __asm psubusb mm2,mm4 \ - __asm psubusb mm1,mm6 \ - __asm paddusb mm2,mm6 \ -} - -#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \ - do{ \ - /*Used local variable pix__ in order to fix compilation errors like: \ - "error C2425: 'SHL' : non-constant expression in 'second operand'".*/ \ - unsigned char *pix__; \ - unsigned char *ll__; \ - ll__=(_ll); \ - pix__=(_pix); \ - __asm mov YSTRIDE,_ystride \ - __asm mov LL,ll__ \ - __asm mov PIX,pix__ \ - __asm sub PIX,YSTRIDE \ - __asm sub PIX,YSTRIDE \ - /*mm0={a0,...,a7}*/ \ - __asm movq mm0,[PIX] \ - /*ystride3=_ystride*3*/ \ - __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \ - /*mm3={d0,...,d7}*/ \ - __asm movq mm3,[PIX+YSTRIDE3] \ - /*mm1={b0,...,b7}*/ \ - __asm movq mm1,[PIX+YSTRIDE] \ - /*mm2={c0,...,c7}*/ \ - __asm movq mm2,[PIX+YSTRIDE*2] \ - OC_LOOP_FILTER8_MMX \ - /*Write it back out.*/ \ - __asm movq [PIX+YSTRIDE],mm1 \ - __asm movq [PIX+YSTRIDE*2],mm2 \ - } \ - while(0) - -#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \ - do{ \ - /*Used local variable ll__ in order to fix compilation errors like: \ - "error C2443: operand size conflict".*/ \ - unsigned char *ll__; \ - unsigned char *pix__; \ - ll__=(_ll); \ - pix__=(_pix)-2; \ - __asm mov PIX,pix__ \ - __asm mov YSTRIDE,_ystride \ - __asm mov LL,ll__ \ - /*x x x x d0 c0 b0 
a0*/ \ - __asm movd mm0,[PIX] \ - /*x x x x d1 c1 b1 a1*/ \ - __asm movd mm1,[PIX+YSTRIDE] \ - /*ystride3=_ystride*3*/ \ - __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \ - /*x x x x d2 c2 b2 a2*/ \ - __asm movd mm2,[PIX+YSTRIDE*2] \ - /*x x x x d3 c3 b3 a3*/ \ - __asm lea D,[PIX+YSTRIDE*4] \ - __asm movd mm3,[PIX+YSTRIDE3] \ - /*x x x x d4 c4 b4 a4*/ \ - __asm movd mm4,[D] \ - /*x x x x d5 c5 b5 a5*/ \ - __asm movd mm5,[D+YSTRIDE] \ - /*x x x x d6 c6 b6 a6*/ \ - __asm movd mm6,[D+YSTRIDE*2] \ - /*x x x x d7 c7 b7 a7*/ \ - __asm movd mm7,[D+YSTRIDE3] \ - /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \ - __asm punpcklbw mm0,mm1 \ - /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \ - __asm punpcklbw mm2,mm3 \ - /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \ - __asm movq mm3,mm0 \ - /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \ - __asm punpcklwd mm0,mm2 \ - /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \ - __asm punpckhwd mm3,mm2 \ - /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \ - __asm movq mm1,mm0 \ - /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \ - __asm punpcklbw mm4,mm5 \ - /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \ - __asm punpcklbw mm6,mm7 \ - /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \ - __asm movq mm5,mm4 \ - /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \ - __asm punpcklwd mm4,mm6 \ - /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \ - __asm punpckhwd mm5,mm6 \ - /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \ - __asm movq mm2,mm3 \ - /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \ - __asm punpckldq mm0,mm4 \ - /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \ - __asm punpckhdq mm1,mm4 \ - /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \ - __asm punpckldq mm2,mm5 \ - /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \ - __asm punpckhdq mm3,mm5 \ - OC_LOOP_FILTER8_MMX \ - /*mm2={b0+R_0'',...,b7+R_7''}*/ \ - __asm movq mm0,mm1 \ - /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \ - __asm punpcklbw mm1,mm2 \ - /*mm2={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \ - __asm punpckhbw mm0,mm2 \ - /*[d]=c1 b1 c0 b0*/ \ - __asm movd D,mm1 \ - __asm mov [PIX+1],D_WORD \ - __asm psrlq mm1,32 \ - __asm shr D,16 \ - __asm mov [PIX+YSTRIDE+1],D_WORD \ - /*[d]=c3 b3 c2 b2*/ \ - __asm movd D,mm1 
\ - __asm mov [PIX+YSTRIDE*2+1],D_WORD \ - __asm shr D,16 \ - __asm mov [PIX+YSTRIDE3+1],D_WORD \ - __asm lea PIX,[PIX+YSTRIDE*4] \ - /*[d]=c5 b5 c4 b4*/ \ - __asm movd D,mm0 \ - __asm mov [PIX+1],D_WORD \ - __asm psrlq mm0,32 \ - __asm shr D,16 \ - __asm mov [PIX+YSTRIDE+1],D_WORD \ - /*[d]=c7 b7 c6 b6*/ \ - __asm movd D,mm0 \ - __asm mov [PIX+YSTRIDE*2+1],D_WORD \ - __asm shr D,16 \ - __asm mov [PIX+YSTRIDE3+1],D_WORD \ - } \ - while(0) - -# endif -#endif diff --git a/media/libtheora/lib/x86_vc/mmxstate.c b/media/libtheora/lib/x86_vc/mmxstate.c deleted file mode 100644 index d3d468d5f..000000000 --- a/media/libtheora/lib/x86_vc/mmxstate.c +++ /dev/null @@ -1,176 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: mmxstate.c 17563 2010-10-25 17:40:54Z tterribe $ - - ********************************************************************/ - -/*MMX acceleration of complete fragment reconstruction algorithm. 
- Originally written by Rudolf Marek.*/ -#include <string.h> -#include "x86int.h" -#include "mmxloop.h" - -#if defined(OC_X86_ASM) - -void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){ - unsigned char *dst; - ptrdiff_t frag_buf_off; - int ystride; - int refi; - /*Apply the inverse transform.*/ - /*Special case only having a DC component.*/ - if(_last_zzi<2){ - /*Note that this value must be unsigned, to keep the __asm__ block from - sign-extending it when it puts it in a register.*/ - ogg_uint16_t p; - /*We round this dequant product (and not any of the others) because there's - no iDCT rounding.*/ - p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); - /*Fill _dct_coeffs with p.*/ - __asm{ -#define Y eax -#define P ecx - mov Y,_dct_coeffs - movzx P,p - lea Y,[Y+128] - /*mm0=0000 0000 0000 AAAA*/ - movd mm0,P - /*mm0=0000 0000 AAAA AAAA*/ - punpcklwd mm0,mm0 - /*mm0=AAAA AAAA AAAA AAAA*/ - punpckldq mm0,mm0 - movq [Y],mm0 - movq [8+Y],mm0 - movq [16+Y],mm0 - movq [24+Y],mm0 - movq [32+Y],mm0 - movq [40+Y],mm0 - movq [48+Y],mm0 - movq [56+Y],mm0 - movq [64+Y],mm0 - movq [72+Y],mm0 - movq [80+Y],mm0 - movq [88+Y],mm0 - movq [96+Y],mm0 - movq [104+Y],mm0 - movq [112+Y],mm0 - movq [120+Y],mm0 -#undef Y -#undef P - } - } - else{ - /*Dequantize the DC coefficient.*/ - _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); - oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi); - } - /*Fill in the target buffer.*/ - frag_buf_off=_state->frag_buf_offs[_fragi]; - refi=_state->frags[_fragi].refi; - ystride=_state->ref_ystride[_pli]; - dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off; - if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64); - else{ - const unsigned char *ref; - int mvoffsets[2]; - ref=_state->ref_frame_data[refi]+frag_buf_off; - if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, - _state->frag_mvs[_fragi])>1){ - 
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride, - _dct_coeffs+64); - } - else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64); - } -} - -/*We copy these entire function to inline the actual MMX routines so that we - use only a single indirect call.*/ - -void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){ - memset(_bv,~(_flimit<<1),8); -} - -/*Apply the loop filter to a given set of fragment rows in the given plane. - The filter may be run on the bottom edge, affecting pixels in the next row of - fragments, so this row also needs to be available. - _bv: The bounding values array. - _refi: The index of the frame buffer to filter. - _pli: The color plane to filter. - _fragy0: The Y coordinate of the first fragment row to filter. - _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ -void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, - signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ - const oc_fragment_plane *fplane; - const oc_fragment *frags; - const ptrdiff_t *frag_buf_offs; - unsigned char *ref_frame_data; - ptrdiff_t fragi_top; - ptrdiff_t fragi_bot; - ptrdiff_t fragi0; - ptrdiff_t fragi0_end; - int ystride; - int nhfrags; - fplane=_state->fplanes+_pli; - nhfrags=fplane->nhfrags; - fragi_top=fplane->froffset; - fragi_bot=fragi_top+fplane->nfrags; - fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; - fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags; - ystride=_state->ref_ystride[_pli]; - frags=_state->frags; - frag_buf_offs=_state->frag_buf_offs; - ref_frame_data=_state->ref_frame_data[_refi]; - /*The following loops are constructed somewhat non-intuitively on purpose. - The main idea is: if a block boundary has at least one coded fragment on - it, the filter is applied to it. 
- However, the order that the filters are applied in matters, and VP3 chose - the somewhat strange ordering used below.*/ - while(fragi0<fragi0_end){ - ptrdiff_t fragi; - ptrdiff_t fragi_end; - fragi=fragi0; - fragi_end=fragi+nhfrags; - while(fragi<fragi_end){ - if(frags[fragi].coded){ - unsigned char *ref; - ref=ref_frame_data+frag_buf_offs[fragi]; -#define PIX eax -#define YSTRIDE3 edi -#define YSTRIDE ecx -#define LL edx -#define D esi -#define D_WORD si - if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,_bv); - if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,_bv); - if(fragi+1<fragi_end&&!frags[fragi+1].coded){ - OC_LOOP_FILTER_H_MMX(ref+8,ystride,_bv); - } - if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ - OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,_bv); - } -#undef PIX -#undef YSTRIDE3 -#undef YSTRIDE -#undef LL -#undef D -#undef D_WORD - } - fragi++; - } - fragi0+=nhfrags; - } -} - -#endif diff --git a/media/libtheora/lib/x86_vc/x86cpu.c b/media/libtheora/lib/x86_vc/x86cpu.c deleted file mode 100644 index 41f4bcba9..000000000 --- a/media/libtheora/lib/x86_vc/x86cpu.c +++ /dev/null @@ -1,192 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - CPU capability detection for x86 processors. - Originally written by Rudolf Marek. 
- - function: - last mod: $Id: x86cpu.c 17410 2010-09-21 21:53:48Z tterribe $ - - ********************************************************************/ - -#include "x86cpu.h" - -#if !defined(OC_X86_ASM) -ogg_uint32_t oc_cpu_flags_get(void){ - return 0; -} -#else -/*Why does MSVC need this complicated rigamarole? - At this point I honestly do not care.*/ - -/*Visual C cpuid helper function. - For VS2005 we could as well use the _cpuid builtin, but that wouldn't work - for VS2003 users, so we do it in inline assembler.*/ -static void oc_cpuid_helper(ogg_uint32_t _cpu_info[4],ogg_uint32_t _op){ - _asm{ - mov eax,[_op] - mov esi,_cpu_info - cpuid - mov [esi+0],eax - mov [esi+4],ebx - mov [esi+8],ecx - mov [esi+12],edx - } -} - -# define cpuid(_op,_eax,_ebx,_ecx,_edx) \ - do{ \ - ogg_uint32_t cpu_info[4]; \ - oc_cpuid_helper(cpu_info,_op); \ - (_eax)=cpu_info[0]; \ - (_ebx)=cpu_info[1]; \ - (_ecx)=cpu_info[2]; \ - (_edx)=cpu_info[3]; \ - }while(0) - -static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){ - _asm{ - pushfd - pushfd - pop eax - mov ebx,eax - xor eax,200000h - push eax - popfd - pushfd - pop eax - popfd - mov ecx,_eax - mov [ecx],eax - mov ecx,_ebx - mov [ecx],ebx - } -} - -static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){ - ogg_uint32_t flags; - /*If there isn't even MMX, give up.*/ - if(!(_edx&0x00800000))return 0; - flags=OC_CPU_X86_MMX; - if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE; - if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2; - if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI; - if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3; - if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1; - if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2; - return flags; -} - -static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){ - ogg_uint32_t flags; - /*If there isn't even MMX, give up.*/ - if(!(_edx&0x00800000))return 0; - flags=OC_CPU_X86_MMX; - if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT; - 
if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW; - if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT; - if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A; - if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5; - return flags; -} - -ogg_uint32_t oc_cpu_flags_get(void){ - ogg_uint32_t flags; - ogg_uint32_t eax; - ogg_uint32_t ebx; - ogg_uint32_t ecx; - ogg_uint32_t edx; -# if !defined(__amd64__)&&!defined(__x86_64__) - /*Not all x86-32 chips support cpuid, so we have to check.*/ - oc_detect_cpuid_helper(&eax,&ebx); - /*No cpuid.*/ - if(eax==ebx)return 0; -# endif - cpuid(0,eax,ebx,ecx,edx); - /* l e t n I e n i u n e G*/ - if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547|| - /* 6 8 x M T e n i u n e G*/ - ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){ - int family; - int model; - /*Intel, Transmeta (tested with Crusoe TM5800):*/ - cpuid(1,eax,ebx,ecx,edx); - flags=oc_parse_intel_flags(edx,ecx); - family=(eax>>8)&0xF; - model=(eax>>4)&0xF; - /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX - unit, so don't use it.*/ - if(family==6&&(model==9||model==13||model==14)){ - flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI); - } - } - /* D M A c i t n e h t u A*/ - else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541|| - /* C S N y b e d o e G*/ - ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){ - /*AMD, Geode:*/ - cpuid(0x80000000,eax,ebx,ecx,edx); - if(eax<0x80000001)flags=0; - else{ - cpuid(0x80000001,eax,ebx,ecx,edx); - flags=oc_parse_amd_flags(edx,ecx); - } - /*Also check for SSE.*/ - cpuid(1,eax,ebx,ecx,edx); - flags|=oc_parse_intel_flags(edx,ecx); - } - /*Technically some VIA chips can be configured in the BIOS to return any - string here the user wants. 
- There is a special detection method that can be used to identify such - processors, but in my opinion, if the user really wants to change it, they - deserve what they get.*/ - /* s l u a H r u a t n e C*/ - else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){ - /*VIA:*/ - /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming) - chips (thanks to the engineers from Centaur Technology who provided it). - These chips support Intel-like cpuid info. - The C3-2 (Nehemiah) cores appear to, as well.*/ - cpuid(1,eax,ebx,ecx,edx); - flags=oc_parse_intel_flags(edx,ecx); - if(eax>=0x80000001){ - /*The (non-Nehemiah) C3 processors support AMD-like cpuid info. - We need to check this even if the Intel test succeeds to pick up 3DNow! - support on these processors. - Unlike actual AMD processors, we cannot _rely_ on this info, since - some cores (e.g., the 693 stepping of the Nehemiah) claim to support - this function, yet return edx=0, despite the Intel test indicating - MMX support. - Therefore the features detected here are strictly added to those - detected by the Intel test.*/ - /*TODO: How about earlier chips?*/ - cpuid(0x80000001,eax,ebx,ecx,edx); - /*Note: As of the C7, this function returns Intel-style extended feature - flags, not AMD-style. - Currently, this only defines bits 11, 20, and 29 (0x20100800), which - do not conflict with any of the AMD flags we inspect. - For the remaining bits, Intel tells us, "Do not count on their value", - but VIA assures us that they will all be zero (at least on the C7 and - Isaiah chips). 
- In the (unlikely) event a future processor uses bits 18, 19, 30, or 31 - (0xC0C00000) for something else, we will have to add code to detect - the model to decide when it is appropriate to inspect them.*/ - flags|=oc_parse_amd_flags(edx,ecx); - } - } - else{ - /*Implement me.*/ - flags=0; - } - return flags; -} -#endif diff --git a/media/libtheora/lib/x86_vc/x86cpu.h b/media/libtheora/lib/x86_vc/x86cpu.h deleted file mode 100644 index 327d93246..000000000 --- a/media/libtheora/lib/x86_vc/x86cpu.h +++ /dev/null @@ -1,36 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - function: - last mod: $Id: x86cpu.h 17410 2010-09-21 21:53:48Z tterribe $ - - ********************************************************************/ - -#if !defined(_x86_vc_x86cpu_H) -# define _x86_vc_x86cpu_H (1) -#include "../internal.h" - -#define OC_CPU_X86_MMX (1<<0) -#define OC_CPU_X86_3DNOW (1<<1) -#define OC_CPU_X86_3DNOWEXT (1<<2) -#define OC_CPU_X86_MMXEXT (1<<3) -#define OC_CPU_X86_SSE (1<<4) -#define OC_CPU_X86_SSE2 (1<<5) -#define OC_CPU_X86_PNI (1<<6) -#define OC_CPU_X86_SSSE3 (1<<7) -#define OC_CPU_X86_SSE4_1 (1<<8) -#define OC_CPU_X86_SSE4_2 (1<<9) -#define OC_CPU_X86_SSE4A (1<<10) -#define OC_CPU_X86_SSE5 (1<<11) - -ogg_uint32_t oc_cpu_flags_get(void); - -#endif diff --git a/media/libtheora/lib/x86_vc/x86int.h b/media/libtheora/lib/x86_vc/x86int.h deleted file mode 100644 index bc4c54a2f..000000000 --- a/media/libtheora/lib/x86_vc/x86int.h +++ /dev/null @@ -1,49 +0,0 @@ 
-/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: x86int.h 17410 2010-09-21 21:53:48Z tterribe $ - - ********************************************************************/ - -#if !defined(_x86_vc_x86int_H) -# define _x86_vc_x86int_H (1) -# include "../internal.h" -# if defined(OC_X86_ASM) -# define oc_state_accel_init oc_state_accel_init_x86 -# define OC_STATE_USE_VTABLE (1) -# endif -# include "../state.h" -# include "x86cpu.h" - -void oc_state_accel_init_x86(oc_theora_state *_state); - -void oc_frag_copy_mmx(unsigned char *_dst, - const unsigned char *_src,int _ystride); -void oc_frag_copy_list_mmx(unsigned char *_dst_frame, - const unsigned char *_src_frame,int _ystride, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs); -void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, - const ogg_int16_t *_residue); -void oc_frag_recon_inter_mmx(unsigned char *_dst, - const unsigned char *_src,int _ystride,const ogg_int16_t *_residue); -void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1, - const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue); -void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); -void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); -void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit); -void 
oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, - signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); -void oc_restore_fpu_mmx(void); - -#endif diff --git a/media/libtheora/lib/x86_vc/x86state.c b/media/libtheora/lib/x86_vc/x86state.c deleted file mode 100644 index 7aa73deae..000000000 --- a/media/libtheora/lib/x86_vc/x86state.c +++ /dev/null @@ -1,61 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: x86state.c 17410 2010-09-21 21:53:48Z tterribe $ - - ********************************************************************/ - -#include "x86int.h" - -#if defined(OC_X86_ASM) - -/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into - each quadrant of the destination.*/ -static const unsigned char OC_FZIG_ZAG_MMX[128]={ - 0, 8, 1, 2, 9,16,24,17, - 10, 3,32,11,18,25, 4,12, - 5,26,19,40,33,34,41,48, - 27, 6,13,20,28,21,14, 7, - 56,49,42,35,43,50,57,36, - 15,22,29,30,23,44,37,58, - 51,59,38,45,52,31,60,53, - 46,39,47,54,61,62,55,63, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, -}; - -void oc_state_accel_init_x86(oc_theora_state *_state){ - _state->cpu_flags=oc_cpu_flags_get(); - if(_state->cpu_flags&OC_CPU_X86_MMX){ - _state->opt_vtable.frag_copy=oc_frag_copy_mmx; - 
_state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx; - _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx; - _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx; - _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx; - _state->opt_vtable.idct8x8=oc_idct8x8_mmx; - _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx; - _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx; - _state->opt_vtable.state_loop_filter_frag_rows= - oc_state_loop_filter_frag_rows_mmx; - _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx; - _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX; - } - else oc_state_accel_init_c(_state); -} -#endif |