; NekoX/TMessagesProj/jni/third_party/openh264/src/codec/common/x86/satd_sad.asm
;
; 2735 lines
; 68 KiB
; NASM

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* satd_sad.asm
;*
;* Abstract
;* WelsSampleSatd4x4_sse2
;* WelsSampleSatd8x8_sse2
;* WelsSampleSatd16x8_sse2
;* WelsSampleSatd8x16_sse2
;* WelsSampleSatd16x16_sse2
;*
;* WelsSampleSad16x8_sse2
;* WelsSampleSad16x16_sse2
;*
;* History
;* 8/5/2009 Created
;* 24/9/2009 modified
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;***********************************************************************
; Data
;***********************************************************************
; Constant tables used by the SSSE3/SSE4.1 kernels below.
; With X86_32_PICASM defined they are emitted into .text instead of .rodata
; (presumably so that the pic() addressing used by the code can reach them
; relative to the code section -- confirm against asm_inc.asm).
%ifdef X86_32_PICASM
SECTION .text align=16
%else
SECTION .rodata align=16
%endif
align 16
; pmaddubsw weights: the low 8 bytes add each adjacent byte pair (+1,+1),
; the high 8 bytes subtract it (+1,-1).
HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
align 16
; pmaddwd weights: difference of each adjacent word pair (+1,-1).
HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1
align 16
; pmaddwd weights: sum of each adjacent word pair (all ones).
PDW1: dw 1,1,1,1,1,1,1,1
align 16
; Value 2 in the low dword of each qword; used as the rounding term of
; the "(sum+2)>>2" chroma DC computation (and doubled for "(sum+4)>>3").
PDQ2: dw 2,0,0,0,2,0,0,0
align 16
; pmaddubsw weights, repeated for both qwords: bytes 0-3 add adjacent
; pairs, bytes 4-7 subtract them.
HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
;***********************************************************************
; Code
;***********************************************************************
SECTION .text
;***********************************************************************
;
;Pixel_satd_wxh_sse2 BEGIN
;
;***********************************************************************
; MMX_DW_1_2REG dst, tmp
; Sets every 16-bit lane of dst to 1 (computed as 0 - (-1)).
; tmp is left holding all ones.
%macro MMX_DW_1_2REG 2
pxor %1, %1
pcmpeqw %2, %2
psubw %1, %2
%endmacro
; SSE2_SumWHorizon1 acc, tmp
; Horizontal sum of the eight words of acc using unsigned saturating adds.
; The total ends up in word 0 of acc; the remaining words hold partial sums.
; tmp is clobbered.
%macro SSE2_SumWHorizon1 2
%assign %%byte_shift 8
%rep 3
; fold the upper part of acc onto the lower part: by 8, then 4, then 2 bytes
movdqa %2, %1
psrldq %2, %%byte_shift
paddusw %1, %2
%assign %%byte_shift %%byte_shift/2
%endrep
%endmacro
; SSE2_HDMTwo4x4 a, b, c, d, tmp
; Two butterfly stages over four word vectors, built from four invocations
; of SSE2_SumSub (defined outside this file): pairs (a,b) and (c,d) first,
; then (b,d) and (a,c). tmp is the scratch register handed to SSE2_SumSub.
; The register note on the macro line is the original author's; it depends
; on how SSE2_SumSub places its sum and difference -- verify there.
%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
SSE2_SumSub %1, %2, %5
SSE2_SumSub %3, %4, %5
SSE2_SumSub %2, %4, %5
SSE2_SumSub %1, %3, %5
%endmacro
; SSE2_SumAbs4 a, b, tmp1, c, d, tmp2, acc
; Applies WELS_AbsW (external macro; by its name a per-word absolute value)
; to a, b, c and d, then adds all four into acc with unsigned saturating
; word adds. a and c are overwritten with the pair sums; tmp1/tmp2 are the
; scratch registers passed to WELS_AbsW.
%macro SSE2_SumAbs4 7
WELS_AbsW %1, %3
WELS_AbsW %2, %3
WELS_AbsW %4, %6
WELS_AbsW %5, %6
paddusw %1, %2
paddusw %4, %5
paddusw %7, %1
paddusw %7, %4
%endmacro
; SSE2_SumWHorizon acc, tmp, zero
; Reduces the eight words of acc to a single 32-bit total in the low dword
; of acc. The third operand is interleaved with the word sums to widen them
; to dwords, so it has to be zero (the callers in this file pass a register
; they cleared with pxor). tmp is clobbered.
%macro SSE2_SumWHorizon 3
movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
%endmacro
; SSE2_GetSatd8x8
; Accumulates into xmm6 the absolute transformed differences of eight rows
; read from [r0] (stride r1) and [r2] (stride r3), handled as two groups of
; four rows.
; Registers: xmm0-xmm5 are scratch; xmm7 is passed as the third operand of
; SSE2_LoadDiff8P (external macro; the callers in this file zero xmm7 first).
; On exit r0 and r2 have advanced by six rows, so callers add two more
; strides to reach the next 8-row group.
%macro SSE2_GetSatd8x8 0
; rows 0-3: load differences
SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
; butterfly, transpose, butterfly again, then add absolute values to xmm6
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
; rows 4-7: same sequence
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
%endmacro
;***********************************************************************
;
;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; 4x4 SATD of two pixel blocks (SSE2).
; After LOAD_4_PARA the code uses r0/r1 as pointer/stride of the first
; block and r2/r3 as pointer/stride of the second block.
; Returns (low 16 bits of the summed absolute coefficients) >> 1 in retrd.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse2
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
; first block: rows 0,2 packed into xmm0 and rows 1,3 into xmm1
movd xmm0, [r0]
movd xmm1, [r0+r1]
lea r0 , [r0+2*r1]
movd xmm2, [r0]
movd xmm3, [r0+r1]
punpckldq xmm0, xmm2
punpckldq xmm1, xmm3
; second block: rows 0,2 packed into xmm4 and rows 1,3 into xmm5
movd xmm4, [r2]
movd xmm5, [r2+r3]
lea r2 , [r2+2*r3]
movd xmm6, [r2]
movd xmm7, [r2+r3]
punpckldq xmm4, xmm6
punpckldq xmm5, xmm7
; widen bytes to words; xmm6 stays zero and later doubles as the accumulator
pxor xmm6, xmm6
punpcklbw xmm0, xmm6
punpcklbw xmm1, xmm6
punpcklbw xmm4, xmm6
punpcklbw xmm5, xmm6
; pixel differences
psubw xmm0, xmm4
psubw xmm1, xmm5
; butterfly stages in one direction
movdqa xmm2, xmm0
paddw xmm0, xmm1
psubw xmm2, xmm1
SSE2_XSawp qdq, xmm0, xmm2, xmm3
movdqa xmm4, xmm0
paddw xmm0, xmm3
psubw xmm4, xmm3
; rearrange words so the next butterflies run along the other direction
movdqa xmm2, xmm0
punpcklwd xmm0, xmm4
punpckhwd xmm4, xmm2
SSE2_XSawp dq, xmm0, xmm4, xmm3
SSE2_XSawp qdq, xmm0, xmm3, xmm5
movdqa xmm7, xmm0
paddw xmm0, xmm5
psubw xmm7, xmm5
SSE2_XSawp qdq, xmm0, xmm7, xmm1
movdqa xmm2, xmm0
paddw xmm0, xmm1
psubw xmm2, xmm1
; sum of absolute coefficients (saturating word adds into xmm6)
WELS_AbsW xmm0, xmm3
paddusw xmm6, xmm0
WELS_AbsW xmm2, xmm4
paddusw xmm6, xmm2
SSE2_SumWHorizon1 xmm6, xmm4
; the total is in word 0 of xmm6: keep 16 bits and halve
movd retrd, xmm6
and retrd, 0xffff
shr retrd, 1
POP_XMM
LOAD_4_PARA_POP
ret
;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; 8x8 SATD (SSE2): one SSE2_GetSatd8x8 pass over r0/r1 and r2/r3, then the
; per-word sums in xmm6 are halved and reduced to a 32-bit result.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse2
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
; xmm6 = accumulator, xmm7 = zero
pxor xmm6, xmm6
pxor xmm7, xmm7
SSE2_GetSatd8x8
; halve each word sum, then add the eight words horizontally
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
POP_XMM
LOAD_4_PARA_POP
ret
;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; 8x16 SATD (SSE2): two vertically stacked SSE2_GetSatd8x8 passes
; accumulated in xmm6.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse2
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
; xmm6 = accumulator, xmm7 = zero
pxor xmm6, xmm6
pxor xmm7, xmm7
SSE2_GetSatd8x8
; the macro leaves r0/r2 six rows down; step two more rows to row 8
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
SSE2_GetSatd8x8
; halve each word sum, then add the eight words horizontally
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
POP_XMM
LOAD_4_PARA_POP
ret
;***********************************************************************
;
;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; 16x8 SATD (SSE2): SSE2_GetSatd8x8 on the left 8 columns, then on the
; right 8 columns (pointers + 8), accumulated in xmm6.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse2
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
; keep the block start pointers; SSE2_GetSatd8x8 advances r0 and r2
push r0
push r2
; xmm6 = accumulator, xmm7 = zero
pxor xmm6, xmm6
pxor xmm7, xmm7
SSE2_GetSatd8x8
; back to the first row, 8 pixels to the right
pop r2
pop r0
add r0, 8
add r2, 8
SSE2_GetSatd8x8
; halve each word sum, then add the eight words horizontally
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
POP_XMM
LOAD_4_PARA_POP
ret
;***********************************************************************
;
;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; 16x16 SATD (SSE2): four SSE2_GetSatd8x8 passes (left column of two 8x8
; blocks, then right column) accumulated in xmm6.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x16_sse2
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
; keep the block start pointers; SSE2_GetSatd8x8 advances r0 and r2
push r0
push r2
; xmm6 = accumulator, xmm7 = zero
pxor xmm6, xmm6
pxor xmm7, xmm7
; left half, rows 0-7 then rows 8-15
SSE2_GetSatd8x8
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
SSE2_GetSatd8x8
; right half: back to the first row, 8 pixels to the right
pop r2
pop r0
add r0, 8
add r2, 8
SSE2_GetSatd8x8
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
SSE2_GetSatd8x8
; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
POP_XMM
LOAD_4_PARA_POP
ret
;***********************************************************************
;
;Pixel_satd_wxh_sse2 END
;
;***********************************************************************
;***********************************************************************
;
;Pixel_satd_intra_sse2 BEGIN
;
;***********************************************************************
; SSE_DB_1_2REG dst, tmp
; Sets every byte of dst to 1 (computed as 0 - (-1)).
; tmp is left holding all ones.
%macro SSE_DB_1_2REG 2
pxor %1, %1
pcmpeqw %2, %2
psubb %1, %2
%endmacro
;***********************************************************************
;
;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
;
; Evaluates the V, H and DC 4x4 intra predictions of the pEnc block from the
; boundary samples around pDec (row above and column to the left).
; arg7, arg8 and arg9 are added to the DC, H and V costs respectively before
; comparing. The winning 4x4 prediction (16 bytes) is stored to pRed with an
; aligned store, *pBestMode receives 2 (DC), 1 (H) or 0 (V), and the winning
; cost is returned in retrd.
; Selection (signed 16-bit compares): DC if DC <= H and DC <= V;
; otherwise H if DC > H and H <= V; otherwise V.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatdThree4x4_sse2
%ifdef X86_32
push r3
push r4
push r5
push r6
%assign push_num 4
%else
%assign push_num 0
%endif
PUSH_XMM 8
mov r2, arg3
mov r3, arg4
SIGN_EXTENSION r3, r3d
; load source 4x4 samples and Hadamard transform
movd xmm0, [r2]
movd xmm1, [r2+r3]
lea r2 , [r2+2*r3]
movd xmm2, [r2]
movd xmm3, [r2+r3]
punpckldq xmm0, xmm2
punpckldq xmm1, xmm3
pxor xmm6, xmm6
punpcklbw xmm0, xmm6
punpcklbw xmm1, xmm6
movdqa xmm2, xmm0
paddw xmm0, xmm1
psubw xmm2, xmm1
SSE2_XSawp qdq, xmm0, xmm2, xmm3
movdqa xmm4, xmm0
paddw xmm0, xmm3
psubw xmm4, xmm3
movdqa xmm2, xmm0
punpcklwd xmm0, xmm4
punpckhwd xmm4, xmm2
SSE2_XSawp dq, xmm0, xmm4, xmm3
SSE2_XSawp qdq, xmm0, xmm3, xmm5
movdqa xmm7, xmm0
paddw xmm0, xmm5
psubw xmm7, xmm5
SSE2_XSawp qdq, xmm0, xmm7, xmm1
; Hadamard transform results are saved in xmm0 and xmm2
movdqa xmm2, xmm0
paddw xmm0, xmm1
psubw xmm2, xmm1
;load top boundary samples: [a b c d]
mov r0, arg1
mov r1, arg2
SIGN_EXTENSION r1, r1d
sub r0, r1
; r4/r5 are used as scratch below; on UNIX64 they are saved and restored
; around that use (presumably because they still carry incoming arguments).
%ifdef UNIX64
push r4
push r5
%endif
movzx r2d, byte [r0]
movzx r3d, byte [r0+1]
movzx r4d, byte [r0+2]
movzx r5d, byte [r0+3]
; get the transform results of top boundary samples: [a b c d]
add r3d, r2d ; r3d = a + b
add r5d, r4d ; r5d = c + d
add r2d, r2d ; r2d = a + a
add r4d, r4d ; r4d = c + c
sub r2d, r3d ; r2d = a + a - a - b = a - b
sub r4d, r5d ; r4d = c + c - c - d = c - d
add r5d, r3d ; r5d = (a + b) + (c + d)
add r3d, r3d
sub r3d, r5d ; r3d = (a + b) - (c + d)
add r4d, r2d ; r4d = (a - b) + (c - d)
add r2d, r2d
sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
; keep untouched copies of the source transform for the H and DC passes
movdqa xmm6, xmm0
movdqa xmm7, xmm2
movd xmm5, r5d ; keep the top sum (a+b+c+d) for DC mode
pxor xmm3, xmm3
pxor xmm4, xmm4
pinsrw xmm3, r5d, 0
pinsrw xmm3, r4d, 4
psllw xmm3, 2
pinsrw xmm4, r3d, 0
pinsrw xmm4, r2d, 4
psllw xmm4, 2
; get the satd of V (prediction from the top boundary)
psubw xmm0, xmm3
psubw xmm2, xmm4
WELS_AbsW xmm0, xmm1
WELS_AbsW xmm2, xmm1
paddusw xmm0, xmm2
SSE2_SumWHorizon1 xmm0, xmm1 ; satd of V is stored in xmm0
;load left boundary samples: [a b c d]'
add r0, r1
movzx r2d, byte [r0-1]
movzx r3d, byte [r0+r1-1]
lea r0 , [r0+2*r1]
movzx r4d, byte [r0-1]
movzx r5d, byte [r0+r1-1]
; get the transform results of left boundary samples: [a b c d]'
add r3d, r2d ; r3d = a + b
add r5d, r4d ; r5d = c + d
add r2d, r2d ; r2d = a + a
add r4d, r4d ; r4d = c + c
sub r2d, r3d ; r2d = a + a - a - b = a - b
sub r4d, r5d ; r4d = c + c - c - d = c - d
add r5d, r3d ; r5d = (a + b) + (c + d)
add r3d, r3d
sub r3d, r5d ; r3d = (a + b) - (c + d)
add r4d, r2d ; r4d = (a - b) + (c - d)
add r2d, r2d
sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
; store the transform results in xmm3
movd xmm3, r5d
pinsrw xmm3, r3d, 1
pinsrw xmm3, r2d, 2
pinsrw xmm3, r4d, 3
psllw xmm3, 2
; get the satd of H (prediction from the left boundary);
; only xmm2 (copy of xmm6) has the prediction subtracted, xmm4 is used as is
movdqa xmm2, xmm6
movdqa xmm4, xmm7
psubw xmm2, xmm3
WELS_AbsW xmm2, xmm1
WELS_AbsW xmm4, xmm1
paddusw xmm2, xmm4
SSE2_SumWHorizon1 xmm2, xmm1 ; satd of H is stored in xmm2
; DC result is stored in xmm1
; dc = (left sum + top sum + 4) >> 3; xmm5 keeps dc, xmm1 = dc << 4
add r5d, 4
movd xmm1, r5d
paddw xmm1, xmm5
psrlw xmm1, 3
movdqa xmm5, xmm1
psllw xmm1, 4
; get the satd of DC
psubw xmm6, xmm1
WELS_AbsW xmm6, xmm1
WELS_AbsW xmm7, xmm1
paddusw xmm6, xmm7
SSE2_SumWHorizon1 xmm6, xmm1 ; satd of DC is stored in xmm6
%ifdef UNIX64
pop r5
pop r4
%endif
; comparing order: DC H V
mov r4, arg5
movd r2d, xmm6
movd r3d, xmm2
movd r6d, xmm0
; each sum sits in the low word: keep 16 bits, halve, then add the
; caller-supplied per-mode term (arg7 -> DC, arg8 -> H, arg9 -> V)
and r2d, 0xffff
shr r2d, 1
and r3d, 0xffff
shr r3d, 1
and r6d, 0xffff
shr r6d, 1
add r2d, dword arg7
add r3d, dword arg8
add r6d, dword arg9
cmp r2w, r3w
jg near not_dc
cmp r2w, r6w
jg near not_dc_h
; for DC mode
; replicate the dc byte over all 16 bytes and store it as the prediction
movd r3d, xmm5
imul r3d, 0x01010101
movd xmm5, r3d
pshufd xmm5, xmm5, 0
movdqa [r4], xmm5
mov r5, arg6
mov dword [r5], 0x02
mov retrd, r2d
POP_XMM
%ifdef X86_32
pop r6
pop r5
pop r4
pop r3
%endif
ret
not_dc:
cmp r3w, r6w
jg near not_dc_h
; for H mode
; xmm6 = all bytes 1, so pmuludq by it replicates a pixel over 4 bytes.
; r0 was left two rows below the block start; step back to row 0.
SSE_DB_1_2REG xmm6, xmm7
sub r0, r1
sub r0, r1
movzx r6d, byte [r0-1]
movd xmm0, r6d
pmuludq xmm0, xmm6
movzx r6d, byte [r0+r1-1]
movd xmm1, r6d
pmuludq xmm1, xmm6
punpckldq xmm0, xmm1
lea r0, [r0+r1*2]
movzx r6d, byte [r0-1]
movd xmm2, r6d
pmuludq xmm2, xmm6
movzx r6d, byte [r0+r1-1]
movd xmm3, r6d
pmuludq xmm3, xmm6
punpckldq xmm2, xmm3
punpcklqdq xmm0, xmm2
movdqa [r4],xmm0
mov retrd, r3d
mov r5, arg6
mov dword [r5], 0x01
POP_XMM
%ifdef X86_32
pop r6
pop r5
pop r4
pop r3
%endif
ret
not_dc_h:
; for V mode: step r0 back three rows to the row above the block and
; replicate its four pixels over all four prediction rows
sub r0, r1
sub r0, r1
sub r0, r1
movd xmm0, [r0]
pshufd xmm0, xmm0, 0
movdqa [r4],xmm0
mov retrd, r6d
mov r5, arg6
mov dword [r5], 0x00
POP_XMM
%ifdef X86_32
pop r6
pop r5
pop r4
pop r3
%endif
ret
; SSE41_I16x16Get8WSumSub io, tmp1, tmp2
; Expects xmm5 = HSumSubDB1, xmm6 = HSumSubDW1, xmm7 = PDW1 and io holding
; 8 boundary bytes duplicated in both qwords (the callers in this file do
; punpcklqdq io, io first).
; Produces in io eight words: a 4-point sum/difference transform of bytes
; 0-3 in words 0-3 and of bytes 4-7 in words 4-7, each shifted left by 2.
; The low dword of xmm4 additionally accumulates the sum of the 8 bytes
; (used later for the DC value). tmp1 and tmp2 are clobbered.
%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
; byte pairs -> word sums (low qword) and word differences (high qword)
pmaddubsw %1, xmm5
movdqa %2, %1
; word pairs -> dword sums (%1) and dword differences (%2)
pmaddwd %1, xmm7
pmaddwd %2, xmm6
; regroup so that %1 holds the coefficients of bytes 0-3 and %3 those of bytes 4-7
movdqa %3, %1
punpckldq %1, %2
punpckhdq %2, %3
movdqa %3, %1
punpcklqdq %1, %2
punpckhqdq %3, %2
paddd xmm4, %1 ;for dc
paddd xmm4, %3 ;for dc
packssdw %1, %3
psllw %1, 2
%endmacro
; SSE41_ChromaGet8WSumSub io, tmp1, tmp2, sums
; Same transform as SSE41_I16x16Get8WSumSub (xmm5 = HSumSubDB1,
; xmm6 = HSumSubDW1, xmm7 = PDW1, io = 8 bytes duplicated in both qwords),
; but instead of accumulating into xmm4 it returns the partial sums in the
; fourth operand: dword 0 = sum of bytes 0-3, dword 2 = sum of bytes 4-7
; (dwords 1 and 3 hold other coefficients). tmp1 and tmp2 are clobbered.
%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
pmaddubsw %1, xmm5
movdqa %2, %1
pmaddwd %1, xmm7
pmaddwd %2, xmm6
movdqa %3, %1
punpckldq %1, %2
punpckhdq %2, %3
movdqa %3, %1
punpcklqdq %1, %2
punpckhqdq %3, %2
; paddd xmm4, %1 ;for dc
; paddd xmm4, %3 ;for dc
; low qwords of both coefficient groups, kept as dwords for the DC computation
movdqa %4, %1
punpcklqdq %4, %3
packssdw %1, %3
psllw %1, 2
%endmacro
; SSE41_GetX38x4SatdDec
; Loads four rows of 8 bytes from [r2] (stride r3), widens them to words and
; runs butterfly / transpose / butterfly on them. r2 is advanced by four
; rows. Uses xmm0-xmm3 and xmm7; the result registers are listed in the
; original note on the last SSE2_HDMTwo4x4 line.
%macro SSE41_GetX38x4SatdDec 0
pxor xmm7, xmm7
movq xmm0, [r2]
movq xmm1, [r2+r3]
lea r2, [r2+2*r3]
movq xmm2, [r2]
movq xmm3, [r2+r3]
lea r2, [r2+2*r3]
punpcklbw xmm0, xmm7
punpcklbw xmm1, xmm7
punpcklbw xmm2, xmm7
punpcklbw xmm3, xmm7
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
;doesn't need another transpose
%endmacro
; SSE41_GetX38x4SatdV idx, off
; Adds the V-mode cost of the current 8x4 strip to xmm4.
; off (%2) is a byte offset into the coefficient table at [r6]; idx (%1) is
; not used by this macro.
; For each of the four coefficient vectors xmm7, xmm1, xmm3, xmm2 it builds
; a vector that is zero except for word lanes 0 and 4 (table words at
; off+2k and off+2k+8), subtracts the coefficient vector with signed
; saturation, takes absolute values and accumulates. xmm0 is clobbered;
; xmm7/xmm1/xmm3/xmm2 are left unchanged.
%macro SSE41_GetX38x4SatdV 2
pxor xmm0, xmm0
pinsrw xmm0, word[r6+%2], 0
pinsrw xmm0, word[r6+%2+8], 4
psubsw xmm0, xmm7
pabsw xmm0, xmm0
paddw xmm4, xmm0
pxor xmm0, xmm0
pinsrw xmm0, word[r6+%2+2], 0
pinsrw xmm0, word[r6+%2+10], 4
psubsw xmm0, xmm1
pabsw xmm0, xmm0
paddw xmm4, xmm0
pxor xmm0, xmm0
pinsrw xmm0, word[r6+%2+4], 0
pinsrw xmm0, word[r6+%2+12], 4
psubsw xmm0, xmm3
pabsw xmm0, xmm0
paddw xmm4, xmm0
pxor xmm0, xmm0
pinsrw xmm0, word[r6+%2+6], 0
pinsrw xmm0, word[r6+%2+14], 4
psubsw xmm0, xmm2
pabsw xmm0, xmm0
paddw xmm4, xmm0
%endmacro
; SSE41_GetX38x4SatdH idx, off, hbase
; Adds the H-mode cost of the current 8x4 strip to xmm5.
; idx (%1) selects the group of four table words at [r6 + hbase + 8*idx];
; hbase (%3) is the byte offset of the H table inside [r6]; off (%2) is not
; used by this macro.
; Side effects relied on by the DC macros that follow: xmm1, xmm2, xmm3 are
; replaced by their absolute values and xmm2 ends up as |xmm1|+|xmm2|+|xmm3|.
; xmm0 is clobbered.
%macro SSE41_GetX38x4SatdH 3
movq xmm0, [r6+%3+8*%1]
punpcklqdq xmm0, xmm0
psubsw xmm0, xmm7
pabsw xmm0, xmm0
paddw xmm5, xmm0
pabsw xmm1, xmm1
pabsw xmm2, xmm2
pabsw xmm3, xmm3
paddw xmm2, xmm1;for DC
paddw xmm2, xmm3;for DC
paddw xmm5, xmm2
%endmacro
; SSE41_I16X16GetX38x4SatdDC
; Adds the DC-mode cost of the current 8x4 strip to xmm6.
; mm4 holds the scaled DC term; its qword is copied into both halves of
; xmm0, subtracted from xmm7 (signed saturation), and the absolute values
; are accumulated. xmm2 must hold the sum of absolute values left there by
; SSE41_GetX38x4SatdH. xmm0 is clobbered.
; No pxor of xmm0 is needed first: movq2dq writes the whole register
; (low qword from mm4, high qword cleared).
%macro SSE41_I16X16GetX38x4SatdDC 0
movq2dq xmm0, mm4
punpcklqdq xmm0, xmm0
psubsw xmm0, xmm7
pabsw xmm0, xmm0
paddw xmm6, xmm0
paddw xmm6, xmm2
%endmacro
; SSE41_ChromaGetX38x4SatdDC idx
; Adds the DC-mode cost of the current 8x4 chroma strip to xmm6.
; idx (%1) is a register holding the strip index; it is CLOBBERED: it is
; shifted left by 4 to become the byte offset of the 16-byte DC vector at
; [r6 + 32 + 16*idx]. xmm2 must hold the sum of absolute values left there
; by SSE41_GetX38x4SatdH. xmm0 is clobbered.
%macro SSE41_ChromaGetX38x4SatdDC 1
shl %1, 4
movdqa xmm0, [r6+32+%1]
psubsw xmm0, xmm7
pabsw xmm0, xmm0
paddw xmm6, xmm0
paddw xmm6, xmm2
%endmacro
; SSE41_I16x16GetX38x4Satd idx, off
; One 8x4 luma strip: transform the source rows, then accumulate the V cost
; (xmm4), H cost (xmm5) and DC cost (xmm6). The order matters: the H step
; prepares xmm2 for the DC step. The H table starts at byte offset 32 of [r6].
%macro SSE41_I16x16GetX38x4Satd 2
SSE41_GetX38x4SatdDec
SSE41_GetX38x4SatdV %1, %2
SSE41_GetX38x4SatdH %1, %2, 32
SSE41_I16X16GetX38x4SatdDC
%endmacro
; SSE41_ChromaGetX38x4Satd idx, off
; One 8x4 chroma strip: transform the source rows, then accumulate the V
; cost (xmm4), H cost (xmm5) and DC cost (xmm6). The H table starts at byte
; offset 16 of [r6]. Note that the DC step shifts the idx register in place.
%macro SSE41_ChromaGetX38x4Satd 2
SSE41_GetX38x4SatdDec
SSE41_GetX38x4SatdV %1, %2
SSE41_GetX38x4SatdH %1, %2, 16
SSE41_ChromaGetX38x4SatdDC %1
%endmacro
; SSE41_HSum8W acc, mul, tmp
; Multiplies the eight words of acc by the words of mul, adding adjacent
; pairs into dwords (pmaddwd), then adds the four dwords; the total ends up
; in the low dword of acc. The callers in this file pass mul = all-ones
; words, which makes this a plain horizontal sum. tmp is clobbered.
%macro SSE41_HSum8W 3
pmaddwd %1, %2
movhlps %3, %1
paddd %1, %3
pshuflw %3, %1,0Eh
paddd %1, %3
%endmacro
; WelsIntra16x16Combined3Satd_sse41 (7 arguments, loaded by LOAD_7_PARA)
; Evaluates the V, H and DC 16x16 intra predictions with SATD and returns
; the smallest cost in retrd. As used by the code:
;   r0/r1 = pointer/stride of the block whose boundary samples are read
;           (row above at r0-r1, left column at byte offset -1);
;   r2/r3 = pointer/stride of the source block;
;   arg5  = pointer that receives the chosen mode: 2 (DC), 1 (H), 0 (V);
;   r5    = value that is doubled and added to the H and DC costs;
;   r6    = 64-byte scratch buffer, written with aligned stores
;           (V coefficients at +0/+16, H coefficients at +32/+48).
; mm4 is used to carry the DC term; WELSEMMS is invoked before returning.
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
%assign push_num 0
LOAD_7_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
; non-32-bit builds keep the original source pointer in r12
%ifndef X86_32
push r12
mov r12, r2
%endif
INIT_X86_32_PIC r2
pxor xmm4, xmm4
movdqa xmm5, [pic(HSumSubDB1)]
movdqa xmm6, [pic(HSumSubDW1)]
movdqa xmm7, [pic(PDW1)]
DEINIT_X86_32_PIC
; top boundary: 16 bytes of the row above, transformed 8 bytes at a time
sub r0, r1
movdqu xmm0, [r0]
movhlps xmm1, xmm0
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
movdqa [r6], xmm0 ;V
movdqa [r6+16], xmm1
; left boundary: gather the 16 bytes at column -1 of rows 0..15 into xmm0
add r0, r1
pinsrb xmm0, byte[r0-1], 0
pinsrb xmm0, byte[r0+r1-1], 1
lea r0, [r0+2*r1]
pinsrb xmm0, byte[r0-1], 2
pinsrb xmm0, byte[r0+r1-1], 3
lea r0, [r0+2*r1]
pinsrb xmm0, byte[r0-1], 4
pinsrb xmm0, byte[r0+r1-1], 5
lea r0, [r0+2*r1]
pinsrb xmm0, byte[r0-1], 6
pinsrb xmm0, byte[r0+r1-1], 7
lea r0, [r0+2*r1]
pinsrb xmm0, byte[r0-1], 8
pinsrb xmm0, byte[r0+r1-1], 9
lea r0, [r0+2*r1]
pinsrb xmm0, byte[r0-1], 10
pinsrb xmm0, byte[r0+r1-1], 11
lea r0, [r0+2*r1]
pinsrb xmm0, byte[r0-1], 12
pinsrb xmm0, byte[r0+r1-1], 13
lea r0, [r0+2*r1]
pinsrb xmm0, byte[r0-1], 14
pinsrb xmm0, byte[r0+r1-1], 15
movhlps xmm1, xmm0
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
movdqa [r6+32], xmm0 ;H
movdqa [r6+48], xmm1
; low dword of xmm4 = sum of the 32 boundary bytes accumulated by the macro
movd r0d, xmm4 ;dc
add r0d, 16 ;(sum+16)
shr r0d, 5 ;((sum+16)>>5)
shl r0d, 4 ;
movd mm4, r0d ; mm4 copy DC
pxor xmm4, xmm4 ;V
pxor xmm5, xmm5 ;H
pxor xmm6, xmm6 ;DC
; r4 is used as the V-table offset inside the loop; saved here on UNIX64
%ifdef UNIX64
push r4
%endif
mov r0, 0
mov r4, 0
; Outer pass runs twice (r4 = 0 for the left 8 columns, 16 for the right);
; the inner loop walks four 8x4 strips (r0 = 0..3).
.loop16x16_get_satd:
.loopStart1:
SSE41_I16x16GetX38x4Satd r0, r4
inc r0
cmp r0, 4
jl .loopStart1
cmp r4, 16
je .loop16x16_get_satd_end
; second pass: reload the original source pointer and move 8 pixels right
%ifdef X86_32
mov r2, arg3
%else
mov r2, r12
%endif
add r2, 8
mov r0, 0
add r4, 16
jmp .loop16x16_get_satd
.loop16x16_get_satd_end:
; xmm0 = words of 1 for the horizontal sums; halve each word sum first
MMX_DW_1_2REG xmm0, xmm1
psrlw xmm4, 1 ;/2
psrlw xmm5, 1 ;/2
psrlw xmm6, 1 ;/2
SSE41_HSum8W xmm4, xmm0, xmm1
SSE41_HSum8W xmm5, xmm0, xmm1
SSE41_HSum8W xmm6, xmm0, xmm1
%ifdef UNIX64
pop r4
%endif
; comparing order: DC H V
movd r3d, xmm6 ;DC
movd r1d, xmm5 ;H
movd r0d, xmm4 ;V
%ifndef X86_32
pop r12
%endif
; add 2*r5 to the H and DC costs (V gets no extra term)
shl r5d, 1
add r1d, r5d
add r3d, r5d
mov r4, arg5
; DC wins only if strictly smaller than both H and V
cmp r3d, r1d
jge near not_dc_16x16
cmp r3d, r0d
jge near not_dc_h_16x16
; for DC mode
mov dword[r4], 2;I16_PRED_DC
mov retrd, r3d
jmp near return_satd_intra_16x16_x3
not_dc_16x16:
; for H mode
; H wins only if strictly smaller than V
cmp r1d, r0d
jge near not_dc_h_16x16
mov dword[r4], 1;I16_PRED_H
mov retrd, r1d
jmp near return_satd_intra_16x16_x3
not_dc_h_16x16:
; for V mode
mov dword[r4], 0;I16_PRED_V
mov retrd, r0d
return_satd_intra_16x16_x3:
WELSEMMS
POP_XMM
LOAD_7_PARA_POP
ret
; SSE41_ChromaGetX38x8Satd
; V/H/DC SATD accumulation for one 8x8 chroma plane.
; Inputs: r0/r1 = pointer/stride of the block whose boundary samples are
; read, r2/r3 = pointer/stride of the source block, r6 = 64-byte scratch
; buffer (aligned stores). Must be expanded where pic() is usable.
; Outputs: xmm4 = V word sums, xmm5 = H word sums, xmm6 = DC word sums
; (not yet halved or reduced). Clobbers r0, r2, xmm0-xmm7 and the buffer.
%macro SSE41_ChromaGetX38x8Satd 0
movdqa xmm5, [pic(HSumSubDB1)]
movdqa xmm6, [pic(HSumSubDW1)]
movdqa xmm7, [pic(PDW1)]
; top boundary: 8 bytes of the row above; partial sums go to xmm4
sub r0, r1
movq xmm0, [r0]
punpcklqdq xmm0, xmm0
SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
movdqa [r6], xmm0 ;V
; left boundary: the 8 bytes at column -1 of rows 0..7; partial sums go to xmm1
add r0, r1
pinsrb xmm0, byte[r0-1], 0
pinsrb xmm0, byte[r0+r1-1], 1
lea r0, [r0+2*r1]
pinsrb xmm0, byte[r0-1], 2
pinsrb xmm0, byte[r0+r1-1], 3
lea r0, [r0+2*r1]
pinsrb xmm0, byte[r0-1], 4
pinsrb xmm0, byte[r0+r1-1], 5
lea r0, [r0+2*r1]
pinsrb xmm0, byte[r0-1], 6
pinsrb xmm0, byte[r0+r1-1], 7
punpcklqdq xmm0, xmm0
SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
movdqa [r6+16], xmm0 ;H
; DC terms computed from the 4-sample partial sums of the top (xmm4) and
; left (xmm1) boundaries; only the low dword of each qword is meaningful.
;(sum+2)>>2
movdqa xmm6, [pic(PDQ2)]
movdqa xmm5, xmm4
punpckhqdq xmm5, xmm1
paddd xmm5, xmm6
psrld xmm5, 2
;(sum1+sum2+4)>>3
paddd xmm6, xmm6
paddd xmm4, xmm1
paddd xmm4, xmm6
psrld xmm4, 3
;satd *16
pslld xmm5, 4
pslld xmm4, 4
;temp satd
; the psllq/psrlq pairs clear the upper dword of each qword before the
; two DC vectors are stored at [r6+32] and [r6+48]
movdqa xmm6, xmm4
punpcklqdq xmm4, xmm5
psllq xmm4, 32
psrlq xmm4, 32
movdqa [r6+32], xmm4
punpckhqdq xmm5, xmm6
psllq xmm5, 32
psrlq xmm5, 32
movdqa [r6+48], xmm5
pxor xmm4, xmm4 ;V
pxor xmm5, xmm5 ;H
pxor xmm6, xmm6 ;DC
; two 8x4 strips; r0 is the strip index (the DC step shifts it in place,
; which is harmless for index 0 and for the final strip)
mov r0, 0
SSE41_ChromaGetX38x4Satd r0, 0
inc r0
SSE41_ChromaGetX38x4Satd r0, 0
%endmacro
; SSEReg2MMX xmm_src, mm_lo, mm_hi
; Spills a 128-bit register into two MMX registers: low qword to mm_lo,
; high qword to mm_hi. xmm_src is modified (its high qword is copied over
; its low qword).
%macro SSEReg2MMX 3
movdq2q %2, %1
movhlps %1, %1
movdq2q %3, %1
%endmacro
; MMXReg2SSE xmm_dst, xmm_tmp, mm_lo, mm_hi
; Rebuilds a 128-bit value from two MMX registers: mm_lo becomes the low
; qword and mm_hi the high qword of xmm_dst. xmm_tmp is clobbered.
%macro MMXReg2SSE 4
movq2dq %1, %3
movq2dq %2, %4
punpcklqdq %1, %2
%endmacro
; (original note) for reducing the code size of WelsIntraChroma8x8Combined3Satd_sse41
;
; WelsIntraChroma8x8Combined3Satd_sse41 (arguments loaded by LOAD_7_PARA,
; plus arg8 and arg9 read directly)
; Evaluates the DC, H and V 8x8 chroma intra predictions with SATD over two
; planes and returns the smallest combined cost in retrd. As used by the code:
;   r0/r1 = boundary-sample block pointer/stride of the first plane;
;   r2/r3 = source block pointer/stride of the first plane;
;   r4    = pointer that receives the chosen mode: 0 (DC), 1 (H), 2 (V);
;   r5    = value that is doubled and added to the H and V costs;
;   r6    = 64-byte scratch buffer (aligned stores);
;   arg8/arg9 = replacement values for r0/r2 for the second plane
;           (the same strides r1/r3 are reused).
; mm0-mm3, mm5, mm6 hold the first plane's sums; WELSEMMS is invoked before
; returning.
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
%assign push_num 0
LOAD_7_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
; NOTE(review): this label is not referenced in the visible part of the file
loop_chroma_satdx3:
INIT_X86_32_PIC r4
; first plane; park its V/H/DC word sums in MMX registers
SSE41_ChromaGetX38x8Satd
SSEReg2MMX xmm4, mm0,mm1
SSEReg2MMX xmm5, mm2,mm3
SSEReg2MMX xmm6, mm5,mm6
; second plane
mov r0, arg8
mov r2, arg9
SSE41_ChromaGetX38x8Satd
DEINIT_X86_32_PIC
; add the first plane's sums back in
MMXReg2SSE xmm0, xmm3, mm0, mm1
MMXReg2SSE xmm1, xmm3, mm2, mm3
MMXReg2SSE xmm2, xmm3, mm5, mm6
paddw xmm4, xmm0
paddw xmm5, xmm1
paddw xmm6, xmm2
; xmm0 = words of 1 for the horizontal sums; halve each word sum first
MMX_DW_1_2REG xmm0, xmm1
psrlw xmm4, 1 ;/2
psrlw xmm5, 1 ;/2
psrlw xmm6, 1 ;/2
SSE41_HSum8W xmm4, xmm0, xmm1
SSE41_HSum8W xmm5, xmm0, xmm1
SSE41_HSum8W xmm6, xmm0, xmm1
; comparing order: DC H V
movd r3d, xmm6 ;DC
movd r1d, xmm5 ;H
movd r0d, xmm4 ;V
; add 2*r5 to the H and V costs (DC gets no extra term)
shl r5d, 1
add r1d, r5d
add r0d, r5d
; DC wins only if strictly smaller than both H and V
cmp r3d, r1d
jge near not_dc_8x8
cmp r3d, r0d
jge near not_dc_h_8x8
; for DC mode
mov dword[r4], 0;I8_PRED_DC
mov retrd, r3d
jmp near return_satd_intra_8x8_x3
not_dc_8x8:
; for H mode
; H wins only if strictly smaller than V
cmp r1d, r0d
jge near not_dc_h_8x8
mov dword[r4], 1;I8_PRED_H
mov retrd, r1d
jmp near return_satd_intra_8x8_x3
not_dc_h_8x8:
; for V mode
mov dword[r4], 2;I8_PRED_V
mov retrd, r0d
return_satd_intra_8x8_x3:
WELSEMMS
POP_XMM
LOAD_7_PARA_POP
ret
;***********************************************************************
;
;Pixel_satd_intra_sse2 END
;
;***********************************************************************
; SSSE3_Get16BSadHVDC slot, src
; slot (%1): 16-byte memory slot whose first byte is the left-neighbour
;            pixel of this row; it is overwritten with that byte replicated
;            16 times (the H prediction row). Aligned store.
; src  (%2): 16-byte source row in memory (aligned loads).
; Expects xmm1 = 0 (pshufb mask that broadcasts byte 0), xmm5 = V prediction
; row, xmm7 = DC prediction row.
; Accumulates SAD(src, DC) into xmm4, SAD(src, V) into xmm2 and
; SAD(src, H) into xmm3. Clobbers xmm0 and xmm6.
%macro SSSE3_Get16BSadHVDC 2
movd xmm6,%1
pshufb xmm6,xmm1
movdqa %1, xmm6
movdqa xmm0,%2
psadbw xmm0,xmm7
paddw xmm4,xmm0
movdqa xmm0,%2
psadbw xmm0,xmm5
paddw xmm2,xmm0
psadbw xmm6,%2
paddw xmm3,xmm6
%endmacro
; WelsAddDCValue src_byte, tmp, dst, acc
; Loads one byte from src_byte (zero-extended into tmp), stores tmp to dst
; and adds it to acc.
%macro WelsAddDCValue 4
movzx %2, byte %1
mov %3, %2
add %4, %2
%endmacro
;***********************************************************************
;
;Pixel_sad_intra_ssse3 BEGIN
;
;***********************************************************************
; WelsIntra16x16Combined3Sad_ssse3 (7 arguments, loaded by LOAD_7_PARA)
; Evaluates the V, H and DC 16x16 intra predictions with SAD, returns the
; smallest cost in retrd and leaves the winning prediction in the buffer.
; As used by the code:
;   r0/r1 = pointer/stride of the block whose boundary samples are read
;           (the row above is loaded with an aligned 16-byte load);
;   r2/r3 = pointer/stride of the source block (aligned 16-byte rows);
;   r4    = pointer that receives the chosen mode: 2 (DC), 1 (H), 0 (V);
;   r5    = value that is doubled and added to the H and DC costs;
;   r6    = 256-byte prediction buffer, 16 rows of 16 bytes (aligned).
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
%assign push_num 0
LOAD_7_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
; r3, r4 and r5 are reused as scratch below; keep their argument values
push r5
push r4
push r3
; xmm5 = row above the block (V prediction); r5d = sum of its 16 bytes
sub r0, r1
movdqa xmm5,[r0]
pxor xmm0,xmm0
psadbw xmm0,xmm5
movhlps xmm1,xmm0
paddw xmm0,xmm1
movd r5d, xmm0
; walk the left column: store each left pixel as a dword at the start of
; its buffer row and add it to the DC sum
add r0,r1
lea r3,[r1+2*r1] ;ebx r3
WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d ; esi r4d, eax r5d
WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
lea r0, [r0+4*r1]
add r6, 64
WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
lea r0, [r0+4*r1]
add r6, 64
WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
lea r0, [r0+4*r1]
add r6, 64
WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
; rewind the buffer pointer; dc = (sum + 16) >> 5, broadcast to xmm7
sub r6, 192
add r5d,10h
shr r5d,5
movd xmm7,r5d
pxor xmm1,xmm1
pshufb xmm7,xmm1
; accumulators: xmm4 = DC, xmm3 = H, xmm2 = V
pxor xmm4,xmm4
pxor xmm3,xmm3
pxor xmm2,xmm2
;sad begin
; restore the source stride into r3; r4 = 3 * stride
pop r3
lea r4, [r3+2*r3] ;esi r4
SSSE3_Get16BSadHVDC [r6], [r2]
SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
add r6, 64
lea r2, [r2+4*r3]
SSSE3_Get16BSadHVDC [r6], [r2]
SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
add r6, 64
lea r2, [r2+4*r3]
SSSE3_Get16BSadHVDC [r6], [r2]
SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
add r6, 64
lea r2, [r2+4*r3]
SSSE3_Get16BSadHVDC [r6], [r2]
SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
; restore the mode pointer (r4) and the cost term (r5)
pop r4
pop r5
; pack the H sums next to the V sums, then add the two qword halves:
; afterwards dword 0 of xmm3 = V total, dword 1 = H total
pslldq xmm3,4
por xmm3,xmm2
movhlps xmm1,xmm3
paddw xmm3,xmm1
movhlps xmm0,xmm4
paddw xmm4,xmm0
; comparing order: DC H V
movd r1d, xmm4 ;DC ;ebx r1d
movd r0d, xmm3 ;V ;ecx r0d
psrldq xmm3, 4
movd r2d, xmm3 ;H ;esi r2d
;mov eax, [esp+36] ;lambda ;eax r5
; add 2*r5 to the H and DC costs (V gets no extra term)
shl r5d, 1
add r2d, r5d
add r1d, r5d
;mov edx, [esp+32] ;edx r4
; DC wins only if strictly smaller than both H and V
cmp r1d, r2d
jge near not_dc_16x16_sad
cmp r1d, r0d
jge near not_dc_h_16x16_sad
; for DC mode
mov dword[r4], 2;I16_PRED_DC
mov retrd, r1d
; rewind r6 and overwrite all 16 buffer rows with the DC row
sub r6, 192
%assign x 0
%rep 16
movdqa [r6+16*x], xmm7
%assign x x+1
%endrep
jmp near return_sad_intra_16x16_x3
not_dc_16x16_sad:
; for H mode
; H wins only if strictly smaller than V; the buffer already holds the H
; rows written by SSSE3_Get16BSadHVDC, so nothing is stored here
cmp r2d, r0d
jge near not_dc_h_16x16_sad
mov dword[r4], 1;I16_PRED_H
mov retrd, r2d
jmp near return_sad_intra_16x16_x3
not_dc_h_16x16_sad:
; for V mode
mov dword[r4], 0;I16_PRED_V
mov retrd, r0d
; rewind r6 and overwrite all 16 buffer rows with the top row
sub r6, 192
%assign x 0
%rep 16
movdqa [r6+16*x], xmm5
%assign x x+1
%endrep
return_sad_intra_16x16_x3:
POP_XMM
LOAD_7_PARA_POP
ret
;***********************************************************************
;
;Pixel_sad_intra_ssse3 END
;
;***********************************************************************
;***********************************************************************
;
;Pixel_satd_wxh_sse41 BEGIN
;
;***********************************************************************
;SSE4.1
; SSE41_GetSatd8x4
; Accumulates into xmm6 (word lanes) the SATD contribution of four rows of
; 8 pixels from [r0] (stride r1) and [r2] (stride r3).
; Expects r4 = 3*r1, r5 = 3*r3 and xmm7 = HSumSubDB1 (set up by the callers).
; r0/r2 are not advanced. Clobbers xmm0-xmm5.
%macro SSE41_GetSatd8x4 0
; rows 0 and 1: duplicate the 8 bytes in both qwords so pmaddubsw yields
; pair sums (low qword) and pair differences (high qword), then subtract
movq xmm0, [r0]
punpcklqdq xmm0, xmm0
pmaddubsw xmm0, xmm7
movq xmm1, [r0+r1]
punpcklqdq xmm1, xmm1
pmaddubsw xmm1, xmm7
movq xmm2, [r2]
punpcklqdq xmm2, xmm2
pmaddubsw xmm2, xmm7
movq xmm3, [r2+r3]
punpcklqdq xmm3, xmm3
pmaddubsw xmm3, xmm7
psubsw xmm0, xmm2
psubsw xmm1, xmm3
; rows 2 and 3
movq xmm2, [r0+2*r1]
punpcklqdq xmm2, xmm2
pmaddubsw xmm2, xmm7
movq xmm3, [r0+r4]
punpcklqdq xmm3, xmm3
pmaddubsw xmm3, xmm7
movq xmm4, [r2+2*r3]
punpcklqdq xmm4, xmm4
pmaddubsw xmm4, xmm7
movq xmm5, [r2+r5]
punpcklqdq xmm5, xmm5
pmaddubsw xmm5, xmm7
psubsw xmm2, xmm4
psubsw xmm3, xmm5
; butterflies across the four rows, then absolute values
SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
pabsw xmm0, xmm0
pabsw xmm2, xmm2
pabsw xmm1, xmm1
pabsw xmm3, xmm3
; pair up word lanes of two registers (blend vs. shifted/or'ed copy) and
; keep the per-lane unsigned maximum, accumulated into xmm6
movdqa xmm4, xmm3
pblendw xmm3, xmm1, 0xAA
pslld xmm1, 16
psrld xmm4, 16
por xmm1, xmm4
pmaxuw xmm1, xmm3
paddw xmm6, xmm1
movdqa xmm4, xmm0
pblendw xmm0, xmm2, 0xAA
pslld xmm2, 16
psrld xmm4, 16
por xmm2, xmm4
pmaxuw xmm0, xmm2
paddw xmm6, xmm0
%endmacro
; SSSE3_SumWHorizon dst32, src, tmp1, tmp2
; Writes the sum of the eight words of src to the 32-bit register dst32.
; tmp1 is loaded with words of 1 and used as the pmaddwd multiplier; the
; reduction itself is the shared SSE41_HSum8W sequence. src, tmp1 and tmp2
; are clobbered.
%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
MMX_DW_1_2REG %3, %4
SSE41_HSum8W %2, %3, %4
movd %1, %2
%endmacro
;***********************************************************************
;
;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
; 4x4 SATD of two pixel blocks (SSE4.1).
; After LOAD_4_PARA the code uses r0/r1 as pointer/stride of the first
; block and r2/r3 as pointer/stride of the second block.
; The 32-bit result is returned in retrd.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse41
%assign push_num 0
INIT_X86_32_PIC r5
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm4,[pic(HSwapSumSubDB1)]
; second block: xmm2 = rows 0,1 and xmm3 = rows 2,3; shufps with 0
; replicates each 4-pixel row twice (row A in the low qword, row B in the high)
movd xmm2,[r2]
movd xmm5,[r2+r3]
shufps xmm2,xmm5,0
movd xmm3,[r2+r3*2]
lea r2, [r3*2+r2]
movd xmm5,[r2+r3]
shufps xmm3,xmm5,0
; first block: xmm0 = rows 0,1 and xmm1 = rows 2,3, same layout
movd xmm0,[r0]
movd xmm5,[r0+r1]
shufps xmm0,xmm5,0
movd xmm1,[r0+r1*2]
lea r0, [r1*2+r0]
movd xmm5,[r0+r1]
shufps xmm1,xmm5,0
; per row: pair sums for the first copy, pair differences for the second
pmaddubsw xmm0,xmm4
pmaddubsw xmm1,xmm4
pmaddubsw xmm2,xmm4
pmaddubsw xmm3,xmm4
; differences between the two blocks
psubw xmm0,xmm2
psubw xmm1,xmm3
; butterflies across rows
movdqa xmm2,xmm0
paddw xmm0,xmm1
psubw xmm1,xmm2
movdqa xmm2,xmm0
punpcklqdq xmm0,xmm1
punpckhqdq xmm2,xmm1
movdqa xmm1,xmm0
paddw xmm0,xmm2
psubw xmm2,xmm1
; pair up word lanes (blend vs. shifted/or'ed copy), take absolute values
; and keep the per-lane maximum
movdqa xmm1,xmm0
pblendw xmm0,xmm2,0AAh
pslld xmm2,16
psrld xmm1,16
por xmm2,xmm1
pabsw xmm0,xmm0
pabsw xmm2,xmm2
pmaxsw xmm0,xmm2
; horizontal sum of the eight words into the return register
SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
POP_XMM
LOAD_4_PARA_POP
DEINIT_X86_32_PIC
ret
;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
; 8x8 SATD (SSE4.1): two SSE41_GetSatd8x4 passes accumulated in xmm6, then
; reduced to a 32-bit result in retrd.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse41
%ifdef X86_32
push r4
push r5
%endif
; NOTE(review): push_num is set to 2 even when the X86_32 pushes above are
; not assembled -- confirm this is intended for 64-bit builds.
%assign push_num 2
INIT_X86_32_PIC r6
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm7, [pic(HSumSubDB1)]
; r4 = 3 * stride of the first block, r5 = 3 * stride of the second
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
pxor xmm6, xmm6
; rows 0-3, then rows 4-7
SSE41_GetSatd8x4
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
SSE41_GetSatd8x4
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
POP_XMM
LOAD_4_PARA_POP
DEINIT_X86_32_PIC
%ifdef X86_32
pop r5
pop r4
%endif
ret
;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
; 8x16 SATD (SSE4.1): four SSE41_GetSatd8x4 passes (loop counter r6)
; accumulated in xmm6, then reduced to a 32-bit result in retrd.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse41
%ifdef X86_32
push r4
push r5
push r6
%endif
; NOTE(review): push_num is set to 3 even when the X86_32 pushes above are
; not assembled -- confirm this is intended for 64-bit builds.
%assign push_num 3
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
; r4 is only borrowed for the PIC base here; it is overwritten right after
INIT_X86_32_PIC_NOPRESERVE r4
movdqa xmm7, [pic(HSumSubDB1)]
DEINIT_X86_32_PIC
; r4 = 3 * stride of the first block, r5 = 3 * stride of the second
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
pxor xmm6, xmm6
mov r6, 0
; four strips of four rows each
loop_get_satd_8x16:
SSE41_GetSatd8x4
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
inc r6
cmp r6, 4
jl loop_get_satd_8x16
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r6
pop r5
pop r4
%endif
ret
;***********************************************************************
;
;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
; 16x8 SATD (SSE4.1): two SSE41_GetSatd8x4 passes on the left 8 columns and
; two on the right 8 columns, accumulated in xmm6 and reduced into retrd.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse41
%ifdef X86_32
push r4
push r5
%endif
; NOTE(review): push_num is set to 2 even when the X86_32 pushes above are
; not assembled -- confirm this is intended for 64-bit builds.
%assign push_num 2
INIT_X86_32_PIC r6
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
; keep the block start pointers for the right half
push r0
push r2
movdqa xmm7, [pic(HSumSubDB1)]
; r4 = 3 * stride of the first block, r5 = 3 * stride of the second
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
pxor xmm6, xmm6
; left half: rows 0-3, then rows 4-7
SSE41_GetSatd8x4
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
SSE41_GetSatd8x4
; right half: back to the first row, 8 pixels to the right
pop r2
pop r0
add r0, 8
add r2, 8
SSE41_GetSatd8x4
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
SSE41_GetSatd8x4
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
POP_XMM
LOAD_4_PARA_POP
DEINIT_X86_32_PIC
%ifdef X86_32
pop r5
pop r4
%endif
ret
;***********************************************************************
;
;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
; 16x16 SATD (SSE4.1): four SSE41_GetSatd8x4 passes down the left 8 columns
; and four down the right 8 columns (loop counter r6), accumulated in xmm6
; and reduced to a 32-bit result in retrd.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x16_sse41
%ifdef X86_32
push r4
push r5
push r6
%endif
; NOTE(review): push_num is set to 3 even when the X86_32 pushes above are
; not assembled -- confirm this is intended for 64-bit builds.
%assign push_num 3
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
; keep the block start pointers for the right half
push r0
push r2
; r4 is only borrowed for the PIC base here; it is overwritten right after
INIT_X86_32_PIC_NOPRESERVE r4
movdqa xmm7, [pic(HSumSubDB1)]
DEINIT_X86_32_PIC
; r4 = 3 * stride of the first block, r5 = 3 * stride of the second
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
pxor xmm6, xmm6
mov r6, 0
; left 8 columns: four strips of four rows
loop_get_satd_16x16_left:
SSE41_GetSatd8x4
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
inc r6
cmp r6, 4
jl loop_get_satd_16x16_left
; right 8 columns: back to the first row, 8 pixels to the right
pop r2
pop r0
add r0, 8
add r2, 8
mov r6, 0
loop_get_satd_16x16_right:
SSE41_GetSatd8x4
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
inc r6
cmp r6, 4
jl loop_get_satd_16x16_right
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r6
pop r5
pop r4
%endif
ret
;***********************************************************************
;
;Pixel_satd_wxh_sse41 END
;
;***********************************************************************
;***********************************************************************
;
;Pixel_satd_wxh_avx2 BEGIN
;
;***********************************************************************
%ifdef HAVE_AVX2
; AVX2_LoadDiffSatd16x1
; out=%1 pSrcA=%2 pSrcB=%3 HSumSubDB1_256=%4 ymm_clobber=%5
; Loads one 16-byte row from each source, duplicated into both 128-bit
; lanes, applies the pmaddubsw weights in %4 to each and returns the word
; difference (srcA - srcB) in %1. %2 and %3 are address expressions without
; the surrounding brackets.
%macro AVX2_LoadDiffSatd16x1 5
vbroadcasti128 %1, [%2]
vpmaddubsw %1, %1, %4 ; hadamard neighboring horizontal sums and differences
vbroadcasti128 %5, [%3]
vpmaddubsw %5, %5, %4 ; hadamard neighboring horizontal sums and differences
vpsubw %1, %1, %5 ; diff srcA srcB
%endmacro
; AVX2_LoadDiffSatd8x2
; out=%1 pSrcA=%2 pSrcA+4*iStride=%3 pSrcB=%4 pSrcB+4*iStride=%5 HSumSubDB1_128x2=%6 ymm_clobber=%7,%8
; Loads two 8-byte rows per source: the row at %2 (resp. %4) is replicated
; across the low 128-bit lane and the row at %3 (resp. %5) across the high
; lane (vpblendd mask 11110000b takes dwords 4-7 from the second load).
; Applies the pmaddubsw weights in %6 and returns srcA - srcB as words in %1.
%macro AVX2_LoadDiffSatd8x2 8
vpbroadcastq %1, [%2]
vpbroadcastq %7, [%3]
vpblendd %1, %1, %7, 11110000b
vpmaddubsw %1, %1, %6 ; hadamard neighboring horizontal sums and differences
vpbroadcastq %7, [%4]
vpbroadcastq %8, [%5]
vpblendd %7, %7, %8, 11110000b
vpmaddubsw %7, %7, %6 ; hadamard neighboring horizontal sums and differences
vpsubw %1, %1, %7 ; diff srcA srcB
%endmacro
; AVX2_HDMFour4x4
; in/out=%1,%2,%3,%4 clobber=%5
; 4-point butterfly across the four word vectors x0..x3 = %1..%4.
; Results: %1 = y0, %2 = y1, %3 = y2, %4 = y3 as defined by the per-line
; comments below.
%macro AVX2_HDMFour4x4 5
vpsubw %5, %1, %4 ; s3 = x0 - x3
vpaddw %1, %1, %4 ; s0 = x0 + x3
vpsubw %4, %2, %3 ; s2 = x1 - x2
vpaddw %2, %2, %3 ; s1 = x1 + x2
vpsubw %3, %1, %2 ; y2 = s0 - s1
vpaddw %1, %1, %2 ; y0 = s0 + s1
vpaddw %2, %5, %4 ; y1 = s3 + s2
vpsubw %4, %5, %4 ; y3 = s3 - s2
%endmacro
; out=%1 in=%1,%2,%3,%4 clobber=%5
; Applies AVX2_HDMFour4x4 to the four inputs, takes absolute values, then
; finishes the horizontal transform on word pairs with a max instead of an
; explicit add/sub stage. The word-wise total is left in %1; %2..%5 are
; overwritten with intermediates.
%macro AVX2_SatdFour4x4 5
    AVX2_HDMFour4x4  %1, %2, %3, %4, %5
    vpabsw           %4, %4
    vpabsw           %3, %3
    vpabsw           %2, %2
    vpabsw           %1, %1
    ; second stage of horizontal hadamard.
    ; utilizes that |a + b| + |a - b| = 2 * max(|a|, |b|)
    ; first register pair: blend keeps one word of each dword from each input,
    ; the shift/or pair builds the complementary selection
    vpblendw         %5, %1, %2, 10101010b
    vpsrld           %1, %1, 16
    vpslld           %2, %2, 16
    vpor             %2, %1, %2
    vpmaxuw          %2, %5, %2
    ; second register pair, same pattern
    vpblendw         %5, %3, %4, 10101010b
    vpsrld           %3, %3, 16
    vpslld           %4, %4, 16
    vpor             %4, %3, %4
    vpmaxuw          %3, %4, %5
    ; total of both maxima
    vpaddw           %1, %3, %2
%endmacro
; out=%1 pSrcA=%2 iStrideA=%3 3*iStrideA=%4 pSrcB=%5 iStrideB=%6 3*iStrideB=%7 HSumSubDB1_256=%8 ymm_clobber=%9,%10,%11,%12
; Loads the A-minus-B differences of four consecutive 16-pixel rows
; (row 0 -> out, rows 1..3 -> first three clobber registers) and passes them
; to AVX2_SatdFour4x4. Pointers and strides are not modified.
%macro AVX2_GetSatd16x4 12
    ; the four row loads are independent; only the last clobber is shared
    AVX2_LoadDiffSatd16x1 %11, %2 + 1 * %4, %5 + 1 * %7, %8, %12
    AVX2_LoadDiffSatd16x1 %10, %2 + 2 * %3, %5 + 2 * %6, %8, %12
    AVX2_LoadDiffSatd16x1 %9,  %2 + 1 * %3, %5 + 1 * %6, %8, %12
    AVX2_LoadDiffSatd16x1 %1,  %2 + 0 * %3, %5 + 0 * %6, %8, %12
    AVX2_SatdFour4x4 %1, %9, %10, %11, %12
%endmacro
; out=%1 pSrcA=%2 iStrideA=%3 3*iStrideA=%4 pSrcB=%5 iStrideB=%6 3*iStrideB=%7 HSumSubDB1_128x2=%8 ymm_clobber=%9,%10,%11,%12,%13
; Loads an 8x8 block as four registers, each holding row n in the lower lane
; and row n+4 in the upper lane, then passes them to AVX2_SatdFour4x4.
; Side effect: both source pointers are advanced by one stride on exit.
%macro AVX2_GetSatd8x8 13
    ; even rows, addressed from the incoming pointers: (2,6) then (0,4)
    AVX2_LoadDiffSatd8x2 %10, %2 + 2 * %3, %2 + 2 * %4, %5 + 2 * %6, %5 + 2 * %7, %8, %12, %13
    AVX2_LoadDiffSatd8x2 %1,  %2 + 0 * %3, %2 + 4 * %3, %5 + 0 * %6, %5 + 4 * %6, %8, %12, %13
    ; step one row down so the same offsets reach the odd rows
    add              %2, %3
    add              %5, %6
    ; odd rows: (3,7) then (1,5)
    AVX2_LoadDiffSatd8x2 %11, %2 + 2 * %3, %2 + 2 * %4, %5 + 2 * %6, %5 + 2 * %7, %8, %12, %13
    AVX2_LoadDiffSatd8x2 %9,  %2 + 0 * %3, %2 + 4 * %3, %5 + 0 * %6, %5 + 4 * %6, %8, %12, %13
    AVX2_SatdFour4x4 %1, %9, %10, %11, %12
%endmacro
; d_out=%1 mm_in=%2 mm_clobber=%3
; Reduces the contents of y<mm_in> to a single dword written to d_out.
; The two register arguments are given without their leading x/y (for example
; mm6), so the macro can name both the ymm and the xmm view of each register.
; Both registers are overwritten.
; NOTE(review): WELS_DW1_VEX is defined outside this section; presumably it
; fills the register with 16-bit ones so that vpmaddwd produces pairwise word
; sums as dwords - confirm.
%macro AVX2_SumWHorizon 3
WELS_DW1_VEX y%3
vpmaddwd y%2, y%2, y%3
; fold the upper 128-bit lane onto the lower one
vextracti128 x%3, y%2, 1
vpaddd x%2, x%2, x%3
; fold the upper qword onto the lower qword
vpunpckhqdq x%3, x%2, x%2
vpaddd x%2, x%2, x%3
; fold dword 1 onto dword 0
vpsrldq x%3, x%2, 4
vpaddd x%2, x%2, x%3
; dword 0 now holds the total
vmovd %1, x%2
%endmacro
;***********************************************************************
;
;int32_t WelsSampleSatd8x16_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_avx2
; Entry stub: sets the iteration count in r4 and jumps into the shared body at
; WelsSampleSatd8x8N_avx2, which processes one 8x8 block per iteration.
; On X86_32 r4 is pushed here and push_num records that push; the shared body
; adds its own pushes to push_num and its epilogue pops r4 again.
%assign push_num 0
%ifdef X86_32
push r4
%assign push_num 1
%endif
mov r4, 2 ; loop cnt: two vertically stacked 8x8 blocks
jmp WelsSampleSatd8x8N_avx2
;***********************************************************************
;
;int32_t WelsSampleSatd8x8_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_avx2
    ; Single 8x8 block: set the iteration count to one and fall into the
    ; shared body below.
    %assign push_num 0
%ifdef X86_32
    push            r4
    %assign push_num 1
%endif
    mov             r4, 1               ; loop cnt
    ; fall through

; Shared body for the 8-pixel-wide SATD entry points.
; In: r4 = number of vertically stacked 8x8 blocks (already pushed on X86_32,
; with push_num counting that push). Result is returned in retrd.
WelsSampleSatd8x8N_avx2:
%ifdef X86_32
    push            r5
    push            r6
    %assign push_num push_num+2
%endif
    LOAD_4_PARA
    PUSH_XMM        8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    INIT_X86_32_PIC_NOPRESERVE r5       ; r5 is only a temporary PIC base here
    vbroadcasti128  ymm7, [pic(HSumSubDB1)]
    DEINIT_X86_32_PIC
    lea             r5, [r1 + 2 * r1]   ; r5 = 3 * stride of first block
    lea             r6, [r3 + 2 * r3]   ; r6 = 3 * stride of second block
    vpxor           ymm6, ymm6, ymm6    ; word accumulator
.next_8x8:
    AVX2_GetSatd8x8 ymm0, r0, r1, r5, r2, r3, r6, ymm7, ymm1, ymm2, ymm3, ymm4, ymm5
    vpaddw          ymm6, ymm6, ymm0
    sub             r4, 1
    jbe             .reduce
    ; the macro already moved both pointers down one row; 4 + 3 more rows
    ; bring them to the next 8x8 block
    lea             r0, [r0 + 4 * r1]
    lea             r2, [r2 + 4 * r3]
    add             r0, r5
    add             r2, r6
    jmp             .next_8x8
.reduce:
    AVX2_SumWHorizon retrd, mm6, mm5
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop             r6
    pop             r5
    pop             r4
%endif
    ret
;***********************************************************************
;
;int32_t WelsSampleSatd16x16_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x16_avx2
; Entry stub: sets the iteration count in r4 and jumps into the shared body at
; WelsSampleSatd16x4N_avx2, which processes one 16x4 strip per iteration.
; On X86_32 r4 is pushed here and push_num records that push; the shared body
; adds its own pushes to push_num and its epilogue pops r4 again.
%assign push_num 0
%ifdef X86_32
push r4
%assign push_num 1
%endif
mov r4, 4 ; loop cnt: four 16x4 strips
jmp WelsSampleSatd16x4N_avx2
;***********************************************************************
;
;int32_t WelsSampleSatd16x8_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_avx2
    ; 16x8 block: two 16x4 strips, then fall into the shared body below.
    %assign push_num 0
%ifdef X86_32
    push            r4
    %assign push_num 1
%endif
    mov             r4, 2               ; loop cnt
    ; fall through

; Shared body for the 16-pixel-wide SATD entry points.
; In: r4 = number of 16x4 strips (already pushed on X86_32, with push_num
; counting that push). Result is returned in retrd.
WelsSampleSatd16x4N_avx2:
%ifdef X86_32
    push            r5
    push            r6
    %assign push_num push_num+2
%endif
    LOAD_4_PARA
    PUSH_XMM        7
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    INIT_X86_32_PIC_NOPRESERVE r5       ; r5 is only a temporary PIC base here
    ; ymm6: lower lane = first 8 bytes of HSumSubDB1 repeated,
    ;       upper lane = second 8 bytes repeated
    vpbroadcastq    ymm6, [pic(HSumSubDB1 + 8)]
    vpbroadcastq    xmm0, [pic(HSumSubDB1)]
    vpblendd        ymm6, ymm0, ymm6, 11110000b
    DEINIT_X86_32_PIC
    lea             r5, [r1 + 2 * r1]   ; r5 = 3 * stride of first block
    lea             r6, [r3 + 2 * r3]   ; r6 = 3 * stride of second block
    vpxor           ymm5, ymm5, ymm5    ; word accumulator
.next_16x4:
    AVX2_GetSatd16x4 ymm0, r0, r1, r5, r2, r3, r6, ymm6, ymm1, ymm2, ymm3, ymm4
    vpaddw          ymm5, ymm5, ymm0
    lea             r2, [r2 + 4 * r3]   ; lea leaves the flags alone
    lea             r0, [r0 + 4 * r1]
    sub             r4, 1
    ja              .next_16x4
    AVX2_SumWHorizon retrd, mm5, mm0
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop             r6
    pop             r5
    pop             r4
%endif
    ret
%endif
;***********************************************************************
;
;Pixel_satd_wxh_avx2 END
;
;***********************************************************************
;***********************************************************************
;
;Pixel_sad_wxh_sse2 BEGIN
;
;***********************************************************************
; Advances r0 and r2 by two rows, then adds the SADs of the two 16-byte rows
; found there to xmm0. Uses xmm1 and xmm2 as temporaries.
; r0 rows are read with MOVDQ, r2 rows with movdqu.
%macro SSE2_GetSad2x16 0
    lea     r2, [r2+2*r3]
    lea     r0, [r0+2*r1]
    ; first row of the pair; original note: this r0 address must be 16-byte aligned
    MOVDQ   xmm2, [r0]
    movdqu  xmm1, [r2]
    psadbw  xmm1, xmm2
    paddw   xmm0, xmm1
    ; second row of the pair
    MOVDQ   xmm2, [r0+r1]
    movdqu  xmm1, [r2+r3]
    psadbw  xmm1, xmm2
    paddw   xmm0, xmm1
%endmacro
; Adds the SADs of four 16-byte rows at r0 / r2 to xmm7.
; Expects r4 = 3*r1 and r5 = 3*r3. Pointers are not advanced.
; Uses xmm0, xmm1 and xmm2 as temporaries.
%macro SSE2_GetSad4x16 0
    ; row 0
    MOVDQ   xmm2, [r0]
    movdqu  xmm0, [r2]
    psadbw  xmm0, xmm2
    paddw   xmm7, xmm0
    ; row 1
    MOVDQ   xmm2, [r0+r1]
    movdqu  xmm1, [r2+r3]
    psadbw  xmm1, xmm2
    paddw   xmm7, xmm1
    ; row 2; original note: this r0 address must be 16-byte aligned
    MOVDQ   xmm2, [r0+2*r1]
    movdqu  xmm1, [r2+2*r3]
    psadbw  xmm1, xmm2
    paddw   xmm7, xmm1
    ; row 3
    MOVDQ   xmm2, [r0+r4]
    movdqu  xmm1, [r2+r5]
    psadbw  xmm1, xmm2
    paddw   xmm7, xmm1
%endmacro
; Adds the SADs of four 8-byte rows at r0 / r2 to xmm6.
; Rows 0 and 2 share one register (low / high half), as do rows 1 and 3.
; Side effect: r0 and r2 are advanced by two rows. Uses xmm0-xmm3.
%macro SSE2_GetSad8x4 0
    ; rows 0 and 1 into the low halves
    movq    xmm0, [r0]
    movq    xmm2, [r2]
    movq    xmm1, [r0+r1]
    movq    xmm3, [r2+r3]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    ; rows 2 and 3 into the high halves
    movhps  xmm0, [r0]
    movhps  xmm2, [r2]
    movhps  xmm1, [r0+r1]
    movhps  xmm3, [r2+r3]
    psadbw  xmm0, xmm2
    psadbw  xmm1, xmm3
    paddw   xmm6, xmm0
    paddw   xmm6, xmm1
%endmacro
;***********************************************************************
;
;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
;The first parameter can be 16-byte aligned.
;In Wels, the third parameter cannot be assumed to be 16-byte aligned.
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x16_sse2
    ; SAD of a 16x16 block: four SSE2_GetSad4x16 strips accumulated in xmm7,
    ; then the two 64-bit partial sums are added and returned in retrd.
%ifdef X86_32
    push    r4
    push    r5
%endif
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    lea     r4, [3*r1]                  ; row-3 offsets used by SSE2_GetSad4x16
    lea     r5, [3*r3]
    pxor    xmm7, xmm7
    ; strips 0..2, each followed by a four-row pointer advance
%rep 3
    SSE2_GetSad4x16
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
%endrep
    ; strip 3: no advance needed afterwards
    SSE2_GetSad4x16
    movhlps xmm0, xmm7                  ; high partial sum -> low half of xmm0
    paddw   xmm0, xmm7
    movd    retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop     r5
    pop     r4
%endif
    ret
;***********************************************************************
;
;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
;The first parameter can be 16-byte aligned.
;In Wels, the third parameter cannot be assumed to be 16-byte aligned.
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x8_sse2
    ; SAD of a 16x8 block: rows 0-1 are handled inline, rows 2-7 by three
    ; SSE2_GetSad2x16 expansions (each advances the pointers by two rows
    ; itself). Partial sums live in xmm0; the result is returned in retrd.
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    ; row 0 starts the accumulator
    MOVDQ   xmm2, [r0]
    movdqu  xmm0, [r2]
    psadbw  xmm0, xmm2
    ; row 1
    MOVDQ   xmm2, [r0+r1]
    movdqu  xmm1, [r2+r3]
    psadbw  xmm1, xmm2
    paddw   xmm0, xmm1
    ; rows 2..7
%rep 3
    SSE2_GetSad2x16
%endrep
    movhlps xmm1, xmm0                  ; high partial sum -> low half of xmm1
    paddw   xmm0, xmm1
    movd    retrd, xmm0
    LOAD_4_PARA_POP
    ret
WELS_EXTERN WelsSampleSad8x16_sse2
    ; SAD of an 8x16 block: four SSE2_GetSad8x4 strips accumulated in xmm6.
    ; The macro advances the pointers by two rows; the explicit lea pair
    ; supplies the other two rows between strips. Result in retrd.
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor    xmm6, xmm6
%rep 3
    SSE2_GetSad8x4
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
%endrep
    SSE2_GetSad8x4
    movhlps xmm0, xmm6                  ; high partial sum -> low half of xmm0
    paddw   xmm0, xmm6
    movd    retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
    ret
; Masks the address register down to an offset and compares that offset with a
; limit; the caller branches on the resulting flags. The register passed as
; the first argument is destroyed.
; With the arguments used in this file (width 8, cacheline 64) the mask is
; 0x3f and the limit is 56, so "offset <= 56" means an 8-byte access at that
; address stays inside one 64-byte line.
%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
and %1, 0x1f|(%3>>1)
cmp %1, (32-%2)|(%3>>1)
%endmacro
WELS_EXTERN WelsSampleSad8x8_sse21
    ; SAD of an 8x8 block with two code paths, chosen by CACHE_SPLIT_CHECK on
    ; the third argument (the r2 pointer):
    ;  - no split: plain SSE2_GetSad8x4 x 2 on the pointers as given;
    ;  - split:    every r2 row is rebuilt from two 8-byte-aligned loads
    ;              (shift right / shift left / or), so no single load
    ;              straddles the boundary.
    ; Result is returned in retrd.
    %assign push_num 0
    mov     r2, arg3
    push    r2                          ; the check below destroys r2
    CACHE_SPLIT_CHECK r2, 8, 64
    jle near .no_split

    ; ---- split path ------------------------------------------------------
    pop     r2
%ifdef X86_32
    push    r3
    push    r4
    push    r5
%endif
    %assign push_num 3
    PUSH_XMM 8
    mov     r0, arg1
    mov     r1, arg2
    SIGN_EXTENSION r1, r1d
    pxor    xmm7, xmm7                  ; accumulator
    ;ecx r2, edx r4, edi r5
    mov     r5, r2
    and     r5, 0x07                    ; r5 = byte offset inside the aligned qword
    sub     r2, r5                      ; r2 = 8-byte aligned address at or below the row
    mov     r4, 8
    sub     r4, r5                      ; r4 = 8 - offset
    shl     r5, 3                       ; both converted to bit counts
    shl     r4, 3
    movd    xmm5, r5d                   ; right-shift count for the low qword
    movd    xmm6, r4d                   ; left-shift count for the high qword
    lea     r5, [r2+8]                  ; r5 = address of the following aligned qword
    mov     r3, arg4
    SIGN_EXTENSION r3, r3d
    ; row pairs 0..2, each followed by a two-row advance of all three pointers
%rep 3
    movq    xmm1, [r2]
    movhps  xmm1, [r2+r3]
    movq    xmm2, [r5]
    movhps  xmm2, [r5+r3]
    psrlq   xmm1, xmm5
    psllq   xmm2, xmm6
    por     xmm1, xmm2                  ; two reconstructed 8-byte rows
    movq    xmm0, [r0]
    movhps  xmm0, [r0+r1]
    psadbw  xmm0, xmm1
    paddw   xmm7, xmm0
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    lea     r5, [r5+2*r3]
%endrep
    ; row pair 3: no advance afterwards
    movq    xmm1, [r2]
    movhps  xmm1, [r2+r3]
    movq    xmm2, [r5]
    movhps  xmm2, [r5+r3]
    psrlq   xmm1, xmm5
    psllq   xmm2, xmm6
    por     xmm1, xmm2
    movq    xmm0, [r0]
    movhps  xmm0, [r0+r1]
    psadbw  xmm0, xmm1
    paddw   xmm7, xmm0
    movhlps xmm0, xmm7                  ; high partial sum -> low half of xmm0
    paddw   xmm0, xmm7
    movd    retrd, xmm0
    POP_XMM
%ifdef X86_32
    pop     r5
    pop     r4
    pop     r3
%endif
    jmp     .done

    ; ---- no-split path ---------------------------------------------------
.no_split:
    pop     r2                          ; rebalance the stack after the check
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor    xmm6, xmm6
    SSE2_GetSad8x4                      ; rows 0-3 (advances pointers two rows)
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    SSE2_GetSad8x4                      ; rows 4-7
    movhlps xmm0, xmm6
    paddw   xmm0, xmm6
    movd    retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
.done:
    ret
;***********************************************************************
;
;Pixel_sad_wxh_sse2 END
;
;***********************************************************************
;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 BEGIN
;
;***********************************************************************
; One row step of the four-neighbour 16-wide SAD search.
; Arguments (all xmm registers except the last):
;   s-1l    = source row above the current one (destroyed)
;   s       = current source row (preserved)
;   s+1l    = source row below the current one (preserved)
;   d       = reference row at "address", loaded by the caller (destroyed)
;   address = address expression of that reference row
; Accumulators: xmm5 += SAD(s-1l, d), xmm4 += SAD(s+1l, d),
;               xmm6 += SAD(s, row at address-1), xmm7 += SAD(s, row at address+1).
; NOTE(review): the temporaries are assumed not to alias xmm4-xmm7; every
; caller in this file passes xmm0-xmm3.
%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
psadbw %1, %4
paddw xmm5, %1
psadbw %4, %3
paddw xmm4, %4
; reference row shifted one pixel to the left
movdqu %4, [%5-1]
psadbw %4, %2
paddw xmm6, %4
; reference row shifted one pixel to the right
movdqu %4, [%5+1]
psadbw %4, %2
paddw xmm7, %4
%endmacro
WELS_EXTERN WelsSampleSadFour16x16_sse2
    ; Computes four 16x16 SADs of the source block (r0, stride r1) against the
    ; reference block (r2, stride r3) displaced by one line up, one line down,
    ; one pixel left and one pixel right, and stores them as four dwords at
    ; [r4] in that order.
    ; Source rows are read with movdqa and the result is written with movdqa,
    ; so r0 rows and r4 must be 16-byte aligned.
    ; xmm0-xmm2 rotate through the roles previous / current / next source row.
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor    xmm4, xmm4                  ;sad pRefMb-i_stride_ref
    pxor    xmm5, xmm5                  ;sad pRefMb+i_stride_ref
    pxor    xmm6, xmm6                  ;sad pRefMb-1
    pxor    xmm7, xmm7                  ;sad pRefMb+1

    ; ---- source rows 0 and 1 ---------------------------------------------
    sub     r2, r3                      ; r2 -> reference row -1
    movdqa  xmm0, [r0]                  ; source row 0
    movdqa  xmm1, [r0+r1]               ; source row 1
    movdqu  xmm3, [r2]
    psadbw  xmm3, xmm0                  ; row 0 against the line above
    paddw   xmm4, xmm3
    movdqu  xmm3, [r2+r3]
    psadbw  xmm3, xmm1                  ; row 1 against the line above
    paddw   xmm4, xmm3
    movdqu  xmm2, [r2+r3-1]
    psadbw  xmm2, xmm0                  ; row 0 against the left neighbour
    paddw   xmm6, xmm2
    movdqu  xmm3, [r2+r3+1]
    psadbw  xmm3, xmm0                  ; row 0 against the right neighbour
    paddw   xmm7, xmm3

    ; ---- source rows 2..13: the register rotation repeats every six rows ---
%rep 2
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    movdqu  xmm3, [r2]
    movdqa  xmm2, [r0]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
    movdqu  xmm3, [r2+r3]
    movdqa  xmm0, [r0+r1]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3

    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    movdqu  xmm3, [r2]
    movdqa  xmm1, [r0]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
    movdqu  xmm3, [r2+r3]
    movdqa  xmm2, [r0+r1]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3

    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    movdqu  xmm3, [r2]
    movdqa  xmm0, [r0]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
    movdqu  xmm3, [r2+r3]
    movdqa  xmm1, [r0+r1]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
%endrep

    ; ---- source rows 14 and 15 -------------------------------------------
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    movdqu  xmm3, [r2]
    movdqa  xmm2, [r0]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
    movdqu  xmm3, [r2+r3]
    movdqa  xmm0, [r0+r1]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3

    ; ---- tail: terms the row steps above have not covered yet --------------
    ; here xmm2 = source row 14, xmm0 = source row 15
    lea     r2, [r2+2*r3]
    movdqu  xmm3, [r2]
    psadbw  xmm2, xmm3                  ; row 14 against the line below
    paddw   xmm5, xmm2
    movdqu  xmm2, [r2-1]
    psadbw  xmm2, xmm0                  ; row 15 against the left neighbour
    paddw   xmm6, xmm2
    movdqu  xmm3, [r2+1]
    psadbw  xmm3, xmm0                  ; row 15 against the right neighbour
    paddw   xmm7, xmm3
    movdqu  xmm3, [r2+r3]
    psadbw  xmm0, xmm3                  ; row 15 against the line below
    paddw   xmm5, xmm0

    ; ---- add the two 64-bit halves of each accumulator and pack -----------
    movhlps xmm0, xmm4
    paddw   xmm4, xmm0
    movhlps xmm0, xmm5
    paddw   xmm5, xmm0
    movhlps xmm0, xmm6
    paddw   xmm6, xmm0
    movhlps xmm0, xmm7
    paddw   xmm7, xmm0
    punpckldq  xmm4, xmm5               ; { up, down }
    punpckldq  xmm6, xmm7               ; { left, right }
    punpcklqdq xmm4, xmm6               ; { up, down, left, right }
    movdqa  [r4], xmm4
    POP_XMM
    LOAD_5_PARA_POP
    ret
WELS_EXTERN WelsSampleSadFour16x8_sse2
    ; Computes four 16x8 SADs of the source block (r0, stride r1) against the
    ; reference block (r2, stride r3) displaced by one line up, one line down,
    ; one pixel left and one pixel right, and stores them as four dwords at
    ; [r4] in that order.
    ; Source rows are read with movdqa and the result is written with movdqa,
    ; so r0 rows and r4 must be 16-byte aligned.
    ; xmm0-xmm2 rotate through the roles previous / current / next source row.
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor    xmm4, xmm4                  ;sad pRefMb-i_stride_ref
    pxor    xmm5, xmm5                  ;sad pRefMb+i_stride_ref
    pxor    xmm6, xmm6                  ;sad pRefMb-1
    pxor    xmm7, xmm7                  ;sad pRefMb+1

    ; ---- source rows 0 and 1 ---------------------------------------------
    sub     r2, r3                      ; r2 -> reference row -1
    movdqa  xmm0, [r0]                  ; source row 0
    movdqa  xmm1, [r0+r1]               ; source row 1
    movdqu  xmm3, [r2]
    psadbw  xmm3, xmm0                  ; row 0 against the line above
    paddw   xmm4, xmm3
    movdqu  xmm3, [r2+r3]
    psadbw  xmm3, xmm1                  ; row 1 against the line above
    paddw   xmm4, xmm3
    movdqu  xmm2, [r2+r3-1]
    psadbw  xmm2, xmm0                  ; row 0 against the left neighbour
    paddw   xmm6, xmm2
    movdqu  xmm3, [r2+r3+1]
    psadbw  xmm3, xmm0                  ; row 0 against the right neighbour
    paddw   xmm7, xmm3

    ; ---- source rows 2 and 3 ---------------------------------------------
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    movdqu  xmm3, [r2]
    movdqa  xmm2, [r0]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
    movdqu  xmm3, [r2+r3]
    movdqa  xmm0, [r0+r1]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3

    ; ---- source rows 4 and 5 ---------------------------------------------
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    movdqu  xmm3, [r2]
    movdqa  xmm1, [r0]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
    movdqu  xmm3, [r2+r3]
    movdqa  xmm2, [r0+r1]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3

    ; ---- source rows 6 and 7 ---------------------------------------------
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    movdqu  xmm3, [r2]
    movdqa  xmm0, [r0]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
    movdqu  xmm3, [r2+r3]
    movdqa  xmm1, [r0+r1]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3

    ; ---- tail: terms the row steps above have not covered yet --------------
    ; here xmm0 = source row 6, xmm1 = source row 7
    lea     r2, [r2+2*r3]
    movdqu  xmm3, [r2]
    psadbw  xmm0, xmm3                  ; row 6 against the line below
    paddw   xmm5, xmm0
    movdqu  xmm0, [r2-1]
    psadbw  xmm0, xmm1                  ; row 7 against the left neighbour
    paddw   xmm6, xmm0
    movdqu  xmm3, [r2+1]
    psadbw  xmm3, xmm1                  ; row 7 against the right neighbour
    paddw   xmm7, xmm3
    movdqu  xmm3, [r2+r3]
    psadbw  xmm1, xmm3                  ; row 7 against the line below
    paddw   xmm5, xmm1

    ; ---- add the two 64-bit halves of each accumulator and pack -----------
    movhlps xmm0, xmm4
    paddw   xmm4, xmm0
    movhlps xmm0, xmm5
    paddw   xmm5, xmm0
    movhlps xmm0, xmm6
    paddw   xmm6, xmm0
    movhlps xmm0, xmm7
    paddw   xmm7, xmm0
    punpckldq  xmm4, xmm5               ; { up, down }
    punpckldq  xmm6, xmm7               ; { left, right }
    punpcklqdq xmm4, xmm6               ; { up, down, left, right }
    movdqa  [r4], xmm4
    POP_XMM
    LOAD_5_PARA_POP
    ret
WELS_EXTERN WelsSampleSadFour8x16_sse2
    ; Computes four 8x16 SADs of the source block (r0, stride r1) against the
    ; reference block (r2, stride r3) displaced by one line up, one line down,
    ; one pixel left and one pixel right, and stores them as four dwords at
    ; [r4] in that order. The store uses movdqa, so r4 must be 16-byte aligned.
    ; Two 8-byte rows are processed per step: low half = even row, high half =
    ; odd row. xmm0 holds the current source row pair.
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor    xmm4, xmm4                  ;sad pRefMb-i_stride_ref
    pxor    xmm5, xmm5                  ;sad pRefMb+i_stride_ref
    pxor    xmm6, xmm6                  ;sad pRefMb-1
    pxor    xmm7, xmm7                  ;sad pRefMb+1

    ; ---- first source row pair against the lines above --------------------
    sub     r2, r3                      ; r2 -> reference row -1
    movq    xmm0, [r0]
    movhps  xmm0, [r0+r1]
    movq    xmm3, [r2]
    movhps  xmm3, [r2+r3]
    psadbw  xmm3, xmm0
    paddw   xmm4, xmm3

    ; ---- row pairs 0..6: left / right / below for the current pair, then
    ;      load the next pair and take its "above" term from the same
    ;      reference rows ---------------------------------------------------
%rep 7
    movq    xmm1, [r2+r3-1]
    movq    xmm3, [r2+r3+1]
    lea     r2, [r2+2*r3]
    lea     r0, [r0+2*r1]
    movhps  xmm1, [r2-1]
    movhps  xmm3, [r2+1]
    psadbw  xmm1, xmm0
    psadbw  xmm3, xmm0
    paddw   xmm6, xmm1
    paddw   xmm7, xmm3
    movq    xmm3, [r2]
    movhps  xmm3, [r2+r3]
    psadbw  xmm0, xmm3
    paddw   xmm5, xmm0
    movq    xmm0, [r0]
    movhps  xmm0, [r0+r1]
    psadbw  xmm3, xmm0
    paddw   xmm4, xmm3
%endrep

    ; ---- last row pair: left / right / below only ---------------------------
    movq    xmm1, [r2+r3-1]
    movq    xmm3, [r2+r3+1]
    lea     r2, [r2+2*r3]
    lea     r0, [r0+2*r1]
    movhps  xmm1, [r2-1]
    movhps  xmm3, [r2+1]
    psadbw  xmm1, xmm0
    psadbw  xmm3, xmm0
    paddw   xmm6, xmm1
    paddw   xmm7, xmm3
    movq    xmm3, [r2]
    movhps  xmm3, [r2+r3]
    psadbw  xmm0, xmm3
    paddw   xmm5, xmm0

    ; ---- add the two 64-bit halves of each accumulator and pack -----------
    movhlps xmm0, xmm4
    paddw   xmm4, xmm0
    movhlps xmm0, xmm5
    paddw   xmm5, xmm0
    movhlps xmm0, xmm6
    paddw   xmm6, xmm0
    movhlps xmm0, xmm7
    paddw   xmm7, xmm0
    punpckldq  xmm4, xmm5               ; { up, down }
    punpckldq  xmm6, xmm7               ; { left, right }
    punpcklqdq xmm4, xmm6               ; { up, down, left, right }
    movdqa  [r4], xmm4
    POP_XMM
    LOAD_5_PARA_POP
    ret
WELS_EXTERN WelsSampleSadFour8x8_sse2
    ; Computes four 8x8 SADs of the source block (r0, stride r1) against the
    ; reference block (r2, stride r3) displaced by one line up, one line down,
    ; one pixel left and one pixel right, and stores them as four dwords at
    ; [r4] in that order. The store uses movdqa, so r4 must be 16-byte aligned.
    ; Two 8-byte rows are processed per step: low half = even row, high half =
    ; odd row. xmm0 holds the current source row pair.
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor    xmm4, xmm4                  ;sad pRefMb-i_stride_ref
    pxor    xmm5, xmm5                  ;sad pRefMb+i_stride_ref
    pxor    xmm6, xmm6                  ;sad pRefMb-1
    pxor    xmm7, xmm7                  ;sad pRefMb+1

    ; ---- first source row pair against the lines above --------------------
    sub     r2, r3                      ; r2 -> reference row -1
    movq    xmm0, [r0]
    movhps  xmm0, [r0+r1]
    movq    xmm3, [r2]
    movhps  xmm3, [r2+r3]
    psadbw  xmm3, xmm0
    paddw   xmm4, xmm3

    ; ---- row pairs 0..2: left / right / below for the current pair, then
    ;      load the next pair and take its "above" term from the same
    ;      reference rows ---------------------------------------------------
%rep 3
    movq    xmm1, [r2+r3-1]
    movq    xmm3, [r2+r3+1]
    lea     r2, [r2+2*r3]
    lea     r0, [r0+2*r1]
    movhps  xmm1, [r2-1]
    movhps  xmm3, [r2+1]
    psadbw  xmm1, xmm0
    psadbw  xmm3, xmm0
    paddw   xmm6, xmm1
    paddw   xmm7, xmm3
    movq    xmm3, [r2]
    movhps  xmm3, [r2+r3]
    psadbw  xmm0, xmm3
    paddw   xmm5, xmm0
    movq    xmm0, [r0]
    movhps  xmm0, [r0+r1]
    psadbw  xmm3, xmm0
    paddw   xmm4, xmm3
%endrep

    ; ---- last row pair: left / right / below only ---------------------------
    movq    xmm1, [r2+r3-1]
    movq    xmm3, [r2+r3+1]
    lea     r2, [r2+2*r3]
    lea     r0, [r0+2*r1]
    movhps  xmm1, [r2-1]
    movhps  xmm3, [r2+1]
    psadbw  xmm1, xmm0
    psadbw  xmm3, xmm0
    paddw   xmm6, xmm1
    paddw   xmm7, xmm3
    movq    xmm3, [r2]
    movhps  xmm3, [r2+r3]
    psadbw  xmm0, xmm3
    paddw   xmm5, xmm0

    ; ---- add the two 64-bit halves of each accumulator and pack -----------
    movhlps xmm0, xmm4
    paddw   xmm4, xmm0
    movhlps xmm0, xmm5
    paddw   xmm5, xmm0
    movhlps xmm0, xmm6
    paddw   xmm6, xmm0
    movhlps xmm0, xmm7
    paddw   xmm7, xmm0
    punpckldq  xmm4, xmm5               ; { up, down }
    punpckldq  xmm6, xmm7               ; { left, right }
    punpcklqdq xmm4, xmm6               ; { up, down, left, right }
    movdqa  [r4], xmm4
    POP_XMM
    LOAD_5_PARA_POP
    ret
WELS_EXTERN WelsSampleSadFour4x4_sse2
; Computes four 4x4 SADs of the source block (r0, stride r1) against the
; reference block (r2, stride r3) displaced by one line up, one line down, one
; pixel left and one pixel right, and stores them as four dwords at [r4] in
; that order. The store uses movdqa, so r4 must be 16-byte aligned.
; Each 4x4 block is packed into one xmm register (one 4-byte row per dword),
; so a single psadbw per direction covers the whole block.
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
; xmm0 = source rows 0..3
movd xmm0, [r0]
movd xmm1, [r0+r1]
lea r0, [r0+2*r1]
movd xmm2, [r0]
movd xmm3, [r0+r1]
punpckldq xmm0, xmm1
punpckldq xmm2, xmm3
punpcklqdq xmm0, xmm2
; r2 -> reference row -1
sub r2, r3
; xmm1 = reference rows -1, 0
movd xmm1, [r2]
movd xmm2, [r2+r3]
punpckldq xmm1, xmm2
; reference row 0 shifted left (xmm2) and right (xmm3)
movd xmm2, [r2+r3-1]
movd xmm3, [r2+r3+1]
; r2 -> reference row 1
lea r2, [r2+2*r3]
movd xmm4, [r2]
; xmm2 = left-shifted rows 0, 1; xmm3 = right-shifted rows 0, 1
movd xmm5, [r2-1]
punpckldq xmm2, xmm5
movd xmm5, [r2+1]
punpckldq xmm3, xmm5
; xmm4 = reference rows 1, 2
movd xmm5, [r2+r3]
punpckldq xmm4, xmm5
; xmm1 = reference rows -1..2: the block one line up
punpcklqdq xmm1, xmm4 ;-L
; reference row 2 shifted left (xmm5) and right (xmm6)
movd xmm5, [r2+r3-1]
movd xmm6, [r2+r3+1]
; r2 -> reference row 3
lea r2, [r2+2*r3]
movd xmm7, [r2-1]
punpckldq xmm5, xmm7
; xmm2 = left-shifted rows 0..3
punpcklqdq xmm2, xmm5 ;-1
movd xmm7, [r2+1]
punpckldq xmm6, xmm7
; xmm3 = right-shifted rows 0..3
punpcklqdq xmm3, xmm6 ;+1
movd xmm6, [r2]
movd xmm7, [r2+r3]
punpckldq xmm6, xmm7
; xmm4 = reference rows 1..4: the block one line down
punpcklqdq xmm4, xmm6 ;+L
; one SAD per direction against the packed source block
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
psadbw xmm4, xmm0
; add the two 64-bit partial sums of each result
movhlps xmm0, xmm1
paddw xmm1, xmm0
movhlps xmm0, xmm2
paddw xmm2, xmm0
movhlps xmm0, xmm3
paddw xmm3, xmm0
movhlps xmm0, xmm4
paddw xmm4, xmm0
; pack as { up, down, left, right }
punpckldq xmm1, xmm4
punpckldq xmm2, xmm3
punpcklqdq xmm1, xmm2
movdqa [r4],xmm1
POP_XMM
LOAD_5_PARA_POP
ret
;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 END
;
;***********************************************************************
;***********************************************************************
; int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
;***********************************************************************
WELS_EXTERN WelsSampleSad4x4_mmx
    ; SAD of a 4x4 block using MMX registers: two 4-byte rows are packed per
    ; 64-bit register, so the block takes two psadbw operations.
    ; Result is returned in retrd; WELSEMMS is issued before returning.
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    ; rows 0 and 1
    movd        mm3, [r2]
    movd        mm4, [r2+r3]
    punpckldq   mm3, mm4
    movd        mm0, [r0]
    movd        mm1, [r0+r1]
    punpckldq   mm0, mm1
    psadbw      mm0, mm3
    ; rows 2 and 3
    lea         r2, [r2+2*r3]
    lea         r0, [r0+2*r1]
    movd        mm3, [r2]
    movd        mm4, [r2+r3]
    punpckldq   mm3, mm4
    movd        mm1, [r0]
    movd        mm2, [r0+r1]
    punpckldq   mm1, mm2
    psadbw      mm1, mm3
    paddw       mm0, mm1
    movd        retrd, mm0
    WELSEMMS
    LOAD_4_PARA_POP
    ret