417 lines
11 KiB
C
417 lines
11 KiB
C
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id$

 ********************************************************************/
|
|
|
|
/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.*/
|
|
#include <stddef.h>
|
|
#include "x86int.h"
|
|
|
|
#if defined(OC_X86_ASM)
|
|
|
|
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  The eight rows are moved in two groups of four: each group is loaded into
   mm0...mm3 and then stored, and both pointers are advanced by 4*_ystride
   between the groups.
  Before expanding this macro the surrounding code must #define SRC, DST,
   YSTRIDE and YSTRIDE3 to the names of four distinct 32-bit general-purpose
   registers; all four registers are overwritten.
  _dst and _src are each evaluated once into a local, so they may be arbitrary
   pointer expressions.
  _ystride is substituted directly as the source operand of a mov, so it must
   be something the inline assembler accepts there (the users in this file
   pass a plain int parameter).
  No emms is executed by this macro.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char       *dst; \
    src=(_src); \
    dst=(_dst); \
    __asm mov SRC,src \
    __asm mov DST,dst \
    __asm mov YSTRIDE,_ystride \
    /*src+0*ystride*/ \
    __asm movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm movq mm1,[SRC+YSTRIDE] \
    /*ystride3=ystride*3*/ \
    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*src+2*ystride*/ \
    __asm movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm movq [DST+YSTRIDE],mm1 \
    /*Pointer to next 4.*/ \
    __asm lea SRC,[SRC+YSTRIDE*4] \
    /*dst+2*ystride*/ \
    __asm movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm movq [DST+YSTRIDE3],mm3 \
    /*Pointer to next 4.*/ \
    __asm lea DST,[DST+YSTRIDE*4] \
    /*src+0*ystride*/ \
    __asm movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm movq mm1,[SRC+YSTRIDE] \
    /*src+2*ystride*/ \
    __asm movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm movq [DST+YSTRIDE],mm1 \
    /*dst+2*ystride*/ \
    __asm movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm movq [DST+YSTRIDE3],mm3 \
  } \
  while(0)
|
|
|
|
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  _dst:     The address of the first pixel of the destination block.
  _src:     The address of the first pixel of the source block.
  _ystride: The distance in bytes between the start of consecutive rows; the
             same value is applied to both _src and _dst.
  The copy is done by OC_FRAG_COPY_MMX with eax, ecx, edx and esi as its
   working registers.
  MMX registers are left in use on return; no emms is executed here.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 esi
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}
|
|
|
|
/*Copies the fragments specified by the lists of fragment indices from one
   frame to another.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.
  Each listed index is looked up in _frag_buf_offs and the resulting byte
   offset is applied to both frames, so a fragment is copied between the same
   position of the two buffers.
  Nothing is copied when _nfragis is zero or negative.
  No emms is executed here.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
/*Working registers for OC_FRAG_COPY_MMX.
  These are preprocessor definitions only, so they are set up once for the
   whole function instead of textually inside the loop body.*/
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 edi
  for(fragii=0;fragii<_nfragis;fragii++){
    ptrdiff_t frag_buf_off;
    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
     _src_frame+frag_buf_off,_ystride);
  }
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}
|
|
|
|
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
|
const ogg_int16_t *_residue){
|
|
__asm{
|
|
#define DST edx
|
|
#define DST4 esi
|
|
#define YSTRIDE eax
|
|
#define YSTRIDE3 edi
|
|
#define RESIDUE ecx
|
|
mov DST,_dst
|
|
mov YSTRIDE,_ystride
|
|
mov RESIDUE,_residue
|
|
lea DST4,[DST+YSTRIDE*4]
|
|
lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
|
|
/*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
|
|
pcmpeqw mm0,mm0
|
|
/*#0 Load low residue.*/
|
|
movq mm1,[0*8+RESIDUE]
|
|
/*#0 Load high residue.*/
|
|
movq mm2,[1*8+RESIDUE]
|
|
/*Set mm0 to 0x8000800080008000.*/
|
|
psllw mm0,15
|
|
/*#1 Load low residue.*/
|
|
movq mm3,[2*8+RESIDUE]
|
|
/*#1 Load high residue.*/
|
|
movq mm4,[3*8+RESIDUE]
|
|
/*Set mm0 to 0x0080008000800080.*/
|
|
psrlw mm0,8
|
|
/*#2 Load low residue.*/
|
|
movq mm5,[4*8+RESIDUE]
|
|
/*#2 Load high residue.*/
|
|
movq mm6,[5*8+RESIDUE]
|
|
/*#0 Bias low residue.*/
|
|
paddsw mm1,mm0
|
|
/*#0 Bias high residue.*/
|
|
paddsw mm2,mm0
|
|
/*#0 Pack to byte.*/
|
|
packuswb mm1,mm2
|
|
/*#1 Bias low residue.*/
|
|
paddsw mm3,mm0
|
|
/*#1 Bias high residue.*/
|
|
paddsw mm4,mm0
|
|
/*#1 Pack to byte.*/
|
|
packuswb mm3,mm4
|
|
/*#2 Bias low residue.*/
|
|
paddsw mm5,mm0
|
|
/*#2 Bias high residue.*/
|
|
paddsw mm6,mm0
|
|
/*#2 Pack to byte.*/
|
|
packuswb mm5,mm6
|
|
/*#0 Write row.*/
|
|
movq [DST],mm1
|
|
/*#1 Write row.*/
|
|
movq [DST+YSTRIDE],mm3
|
|
/*#2 Write row.*/
|
|
movq [DST+YSTRIDE*2],mm5
|
|
/*#3 Load low residue.*/
|
|
movq mm1,[6*8+RESIDUE]
|
|
/*#3 Load high residue.*/
|
|
movq mm2,[7*8+RESIDUE]
|
|
/*#4 Load high residue.*/
|
|
movq mm3,[8*8+RESIDUE]
|
|
/*#4 Load high residue.*/
|
|
movq mm4,[9*8+RESIDUE]
|
|
/*#5 Load high residue.*/
|
|
movq mm5,[10*8+RESIDUE]
|
|
/*#5 Load high residue.*/
|
|
movq mm6,[11*8+RESIDUE]
|
|
/*#3 Bias low residue.*/
|
|
paddsw mm1,mm0
|
|
/*#3 Bias high residue.*/
|
|
paddsw mm2,mm0
|
|
/*#3 Pack to byte.*/
|
|
packuswb mm1,mm2
|
|
/*#4 Bias low residue.*/
|
|
paddsw mm3,mm0
|
|
/*#4 Bias high residue.*/
|
|
paddsw mm4,mm0
|
|
/*#4 Pack to byte.*/
|
|
packuswb mm3,mm4
|
|
/*#5 Bias low residue.*/
|
|
paddsw mm5,mm0
|
|
/*#5 Bias high residue.*/
|
|
paddsw mm6,mm0
|
|
/*#5 Pack to byte.*/
|
|
packuswb mm5,mm6
|
|
/*#3 Write row.*/
|
|
movq [DST+YSTRIDE3],mm1
|
|
/*#4 Write row.*/
|
|
movq [DST4],mm3
|
|
/*#5 Write row.*/
|
|
movq [DST4+YSTRIDE],mm5
|
|
/*#6 Load low residue.*/
|
|
movq mm1,[12*8+RESIDUE]
|
|
/*#6 Load high residue.*/
|
|
movq mm2,[13*8+RESIDUE]
|
|
/*#7 Load low residue.*/
|
|
movq mm3,[14*8+RESIDUE]
|
|
/*#7 Load high residue.*/
|
|
movq mm4,[15*8+RESIDUE]
|
|
/*#6 Bias low residue.*/
|
|
paddsw mm1,mm0
|
|
/*#6 Bias high residue.*/
|
|
paddsw mm2,mm0
|
|
/*#6 Pack to byte.*/
|
|
packuswb mm1,mm2
|
|
/*#7 Bias low residue.*/
|
|
paddsw mm3,mm0
|
|
/*#7 Bias high residue.*/
|
|
paddsw mm4,mm0
|
|
/*#7 Pack to byte.*/
|
|
packuswb mm3,mm4
|
|
/*#6 Write row.*/
|
|
movq [DST4+YSTRIDE*2],mm1
|
|
/*#7 Write row.*/
|
|
movq [DST4+YSTRIDE3],mm3
|
|
#undef DST
|
|
#undef DST4
|
|
#undef YSTRIDE
|
|
#undef YSTRIDE3
|
|
#undef RESIDUE
|
|
}
|
|
}
|
|
|
|
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
|
|
int _ystride,const ogg_int16_t *_residue){
|
|
int i;
|
|
/*Zero mm0.*/
|
|
__asm pxor mm0,mm0;
|
|
for(i=4;i-->0;){
|
|
__asm{
|
|
#define DST edx
|
|
#define SRC ecx
|
|
#define YSTRIDE edi
|
|
#define RESIDUE eax
|
|
mov DST,_dst
|
|
mov SRC,_src
|
|
mov YSTRIDE,_ystride
|
|
mov RESIDUE,_residue
|
|
/*#0 Load source.*/
|
|
movq mm3,[SRC]
|
|
/*#1 Load source.*/
|
|
movq mm7,[SRC+YSTRIDE]
|
|
/*#0 Get copy of src.*/
|
|
movq mm4,mm3
|
|
/*#0 Expand high source.*/
|
|
punpckhbw mm4,mm0
|
|
/*#0 Expand low source.*/
|
|
punpcklbw mm3,mm0
|
|
/*#0 Add residue high.*/
|
|
paddsw mm4,[8+RESIDUE]
|
|
/*#1 Get copy of src.*/
|
|
movq mm2,mm7
|
|
/*#0 Add residue low.*/
|
|
paddsw mm3,[RESIDUE]
|
|
/*#1 Expand high source.*/
|
|
punpckhbw mm2,mm0
|
|
/*#0 Pack final row pixels.*/
|
|
packuswb mm3,mm4
|
|
/*#1 Expand low source.*/
|
|
punpcklbw mm7,mm0
|
|
/*#1 Add residue low.*/
|
|
paddsw mm7,[16+RESIDUE]
|
|
/*#1 Add residue high.*/
|
|
paddsw mm2,[24+RESIDUE]
|
|
/*Advance residue.*/
|
|
lea RESIDUE,[32+RESIDUE]
|
|
/*#1 Pack final row pixels.*/
|
|
packuswb mm7,mm2
|
|
/*Advance src.*/
|
|
lea SRC,[SRC+YSTRIDE*2]
|
|
/*#0 Write row.*/
|
|
movq [DST],mm3
|
|
/*#1 Write row.*/
|
|
movq [DST+YSTRIDE],mm7
|
|
/*Advance dst.*/
|
|
lea DST,[DST+YSTRIDE*2]
|
|
mov _residue,RESIDUE
|
|
mov _dst,DST
|
|
mov _src,SRC
|
|
#undef DST
|
|
#undef SRC
|
|
#undef YSTRIDE
|
|
#undef RESIDUE
|
|
}
|
|
}
|
|
}
|
|
|
|
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
|
|
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
|
|
int i;
|
|
/*Zero mm7.*/
|
|
__asm pxor mm7,mm7;
|
|
for(i=4;i-->0;){
|
|
__asm{
|
|
#define SRC1 ecx
|
|
#define SRC2 edi
|
|
#define YSTRIDE esi
|
|
#define RESIDUE edx
|
|
#define DST eax
|
|
mov YSTRIDE,_ystride
|
|
mov DST,_dst
|
|
mov RESIDUE,_residue
|
|
mov SRC1,_src1
|
|
mov SRC2,_src2
|
|
/*#0 Load src1.*/
|
|
movq mm0,[SRC1]
|
|
/*#0 Load src2.*/
|
|
movq mm2,[SRC2]
|
|
/*#0 Copy src1.*/
|
|
movq mm1,mm0
|
|
/*#0 Copy src2.*/
|
|
movq mm3,mm2
|
|
/*#1 Load src1.*/
|
|
movq mm4,[SRC1+YSTRIDE]
|
|
/*#0 Unpack lower src1.*/
|
|
punpcklbw mm0,mm7
|
|
/*#1 Load src2.*/
|
|
movq mm5,[SRC2+YSTRIDE]
|
|
/*#0 Unpack higher src1.*/
|
|
punpckhbw mm1,mm7
|
|
/*#0 Unpack lower src2.*/
|
|
punpcklbw mm2,mm7
|
|
/*#0 Unpack higher src2.*/
|
|
punpckhbw mm3,mm7
|
|
/*Advance src1 ptr.*/
|
|
lea SRC1,[SRC1+YSTRIDE*2]
|
|
/*Advance src2 ptr.*/
|
|
lea SRC2,[SRC2+YSTRIDE*2]
|
|
/*#0 Lower src1+src2.*/
|
|
paddsw mm0,mm2
|
|
/*#0 Higher src1+src2.*/
|
|
paddsw mm1,mm3
|
|
/*#1 Copy src1.*/
|
|
movq mm2,mm4
|
|
/*#0 Build lo average.*/
|
|
psraw mm0,1
|
|
/*#1 Copy src2.*/
|
|
movq mm3,mm5
|
|
/*#1 Unpack lower src1.*/
|
|
punpcklbw mm4,mm7
|
|
/*#0 Build hi average.*/
|
|
psraw mm1,1
|
|
/*#1 Unpack higher src1.*/
|
|
punpckhbw mm2,mm7
|
|
/*#0 low+=residue.*/
|
|
paddsw mm0,[RESIDUE]
|
|
/*#1 Unpack lower src2.*/
|
|
punpcklbw mm5,mm7
|
|
/*#0 high+=residue.*/
|
|
paddsw mm1,[8+RESIDUE]
|
|
/*#1 Unpack higher src2.*/
|
|
punpckhbw mm3,mm7
|
|
/*#1 Lower src1+src2.*/
|
|
paddsw mm5,mm4
|
|
/*#0 Pack and saturate.*/
|
|
packuswb mm0,mm1
|
|
/*#1 Higher src1+src2.*/
|
|
paddsw mm3,mm2
|
|
/*#0 Write row.*/
|
|
movq [DST],mm0
|
|
/*#1 Build lo average.*/
|
|
psraw mm5,1
|
|
/*#1 Build hi average.*/
|
|
psraw mm3,1
|
|
/*#1 low+=residue.*/
|
|
paddsw mm5,[16+RESIDUE]
|
|
/*#1 high+=residue.*/
|
|
paddsw mm3,[24+RESIDUE]
|
|
/*#1 Pack and saturate.*/
|
|
packuswb mm5,mm3
|
|
/*#1 Write row ptr.*/
|
|
movq [DST+YSTRIDE],mm5
|
|
/*Advance residue ptr.*/
|
|
add RESIDUE,32
|
|
/*Advance dest ptr.*/
|
|
lea DST,[DST+YSTRIDE*2]
|
|
mov _dst,DST
|
|
mov _residue,RESIDUE
|
|
mov _src1,SRC1
|
|
mov _src2,SRC2
|
|
#undef SRC1
|
|
#undef SRC2
|
|
#undef YSTRIDE
|
|
#undef RESIDUE
|
|
#undef DST
|
|
}
|
|
}
|
|
}
|
|
|
|
void oc_restore_fpu_mmx(void){
|
|
__asm emms;
|
|
}
|
|
|
|
#endif
|