Diffstat (limited to 'libavcodec/x86/h264_deblock.asm')
-rw-r--r-- | libavcodec/x86/h264_deblock.asm | 404
1 file changed, 375 insertions(+), 29 deletions(-)
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 33fd5a9..6702ae9 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -7,20 +7,20 @@
;* Fiona Glaser <fiona@x264.com>
;* Oskar Arvidsson <oskar@irock.se>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -37,11 +37,6 @@ cextern pb_0
cextern pb_1
cextern pb_3
-; expands to [base],...,[base+7*stride]
-%define PASS8ROWS(base, base3, stride, stride3) \
- [base], [base+stride], [base+stride*2], [base3], \
- [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
-
%define PASS8ROWS(base, base3, stride, stride3, offset) \
PASS8ROWS(base+offset, base3+offset, stride, stride3)
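
The five-argument form above shifts both base pointers by a common offset and delegates to the four-argument PASS8ROWS, which expands to the eight row addresses [base],...,[base+7*stride]. As a minimal C sketch of those addresses (the pass8rows helper and its signature are illustrative only): x86 addressing can only scale an index register by 1, 2, 4 or 8, which is why rows 3..7 are reached from base3 = base + 3*stride rather than from base directly.

    #include <stdint.h>

    /* Illustrative expansion of PASS8ROWS(base, base3, stride, stride3):
     * eight row pointers covering [base] .. [base + 7*stride]. */
    static void pass8rows(uint8_t *base, uint8_t *base3,
                          intptr_t stride, intptr_t stride3,
                          uint8_t *rows[8])
    {
        rows[0] = base;               /* [base]           */
        rows[1] = base + stride;      /* [base+stride]    */
        rows[2] = base + 2 * stride;  /* [base+stride*2]  */
        rows[3] = base3;              /* [base3]          */
        rows[4] = base3 + stride;     /* [base3+stride]   */
        rows[5] = base3 + 2 * stride; /* [base3+stride*2] */
        rows[6] = base3 + stride3;    /* [base3+stride3]  */
        rows[7] = base3 + 4 * stride; /* [base3+stride*4] */
    }
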
@@ -287,18 +282,18 @@ cextern pb_3
; int8_t *tc0)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
-cglobal deblock_v_luma_8, 5,5,10
+cglobal deblock_v_luma_8, 5,5,10, pix_, stride_, alpha_, beta_, base3_
movd m8, [r4] ; tc0
- lea r4, [r1*3]
- dec r2d ; alpha-1
+ lea r4, [stride_q*3]
+ dec alpha_d ; alpha-1
neg r4
- dec r3d ; beta-1
- add r4, r0 ; pix-3*stride
+ dec beta_d ; beta-1
+ add base3_q, pix_q ; pix-3*stride
- mova m0, [r4+r1] ; p1
- mova m1, [r4+2*r1] ; p0
- mova m2, [r0] ; q0
- mova m3, [r0+r1] ; q1
+ mova m0, [base3_q + stride_q] ; p1
+ mova m1, [base3_q + 2*stride_q] ; p0
+ mova m2, [pix_q] ; q0
+ mova m3, [pix_q + stride_q] ; q1
LOAD_MASK r2d, r3d
punpcklbw m8, m8
@@ -308,24 +303,24 @@ cglobal deblock_v_luma_8, 5,5,10
pandn m9, m7
pand m8, m9
- movdqa m3, [r4] ; p2
+ movdqa m3, [base3_q] ; p2
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
pand m6, m9
psubb m7, m8, m6
pand m6, m8
- LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
+ LUMA_Q1 m0, m3, [base3_q], [base3_q + stride_q], m6, m4
- movdqa m4, [r0+2*r1] ; q2
+ movdqa m4, [pix_q + 2*stride_q] ; q2
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
pand m6, m9
pand m8, m6
psubb m7, m6
- mova m3, [r0+r1]
- LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
+ mova m3, [pix_q + stride_q]
+ LUMA_Q1 m3, m4, [pix_q + 2*stride_q], [pix_q + stride_q], m8, m6
DEBLOCK_P0_Q0
- mova [r4+2*r1], m1
- mova [r0], m2
+ mova [base3_q + 2*stride_q], m1
+ mova [pix_q], m2
RET
;-----------------------------------------------------------------------------
@@ -382,10 +377,101 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
RET
%endmacro
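
DEBLOCK_P0_Q0, defined earlier in this file, vectorizes the standard H.264 p0/q0 update with saturating byte arithmetic. For orientation, a scalar reference sketch of that update (names are illustrative; tc corresponds to tc0, adjusted by the DIFF_GT2 results for the |p2-p0| and |q2-q0| conditions above):

    #include <stdint.h>

    static uint8_t clip_u8(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }

    /* Scalar form of the H.264 p0/q0 deblock update. */
    static void filter_p0_q0(uint8_t *p0, uint8_t *q0,
                             uint8_t p1, uint8_t q1, int tc)
    {
        int delta = ((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3;
        if (delta < -tc) delta = -tc;
        if (delta >  tc) delta =  tc;
        *p0 = clip_u8(*p0 + delta);
        *q0 = clip_u8(*q0 - delta);
    }
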
+%macro DEBLOCK_H_LUMA_MBAFF 0
+
+cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_, base3_, stride3_
+ movsxd stride_q, stride_d
+ dec alpha_d
+ dec beta_d
+ mov base3_q, pix_q
+ lea stride3_q, [3*stride_q]
+ add base3_q, stride3_q
+
+ movq m0, [pix_q - 4]
+ movq m1, [pix_q + stride_q - 4]
+ movq m2, [pix_q + 2*stride_q - 4]
+ movq m3, [base3_q - 4]
+ movq m4, [base3_q + stride_q - 4]
+ movq m5, [base3_q + 2*stride_q - 4]
+ movq m6, [base3_q + stride3_q - 4]
+ movq m7, [base3_q + 4*stride_q - 4]
+
+ TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
+
+ %assign i 0
+ %rep 8
+ movq [rsp + 16*i], m %+ i
+ %assign i i+1
+ %endrep
+
+ ; p2 = m1 [rsp + 16]
+ ; p1 = m2 [rsp + 32]
+ ; p0 = m3 [rsp + 48]
+ ; q0 = m4 [rsp + 64]
+ ; q1 = m5 [rsp + 80]
+ ; q2 = m6 [rsp + 96]
+
+ SWAP 0, 2
+ SWAP 1, 3
+ SWAP 2, 4
+ SWAP 3, 5
+
+ LOAD_MASK alpha_d, beta_d
+ movd m8, [tc0_q]
+ punpcklbw m8, m8
+ pcmpeqb m9, m9
+ pcmpeqb m9, m8
+ pandn m9, m7
+ pand m8, m9
+
+ movdqa m3, [rsp + 16] ; p2
+ DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+ pand m6, m9
+ psubb m7, m8, m6
+ pand m6, m8
+ LUMA_Q1 m0, m3, [rsp + 16], [rsp + 32], m6, m4
+
+ movdqa m4, [rsp + 96] ; q2
+ DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+ pand m6, m9
+ pand m8, m6
+ psubb m7, m6
+ mova m3, [rsp + 80]
+ LUMA_Q1 m3, m4, [rsp + 96], [rsp + 80], m8, m6
+
+ DEBLOCK_P0_Q0
+ SWAP 1, 3
+ SWAP 2, 4
+ movq m0, [rsp]
+ movq m1, [rsp + 16]
+ movq m2, [rsp + 32]
+ movq m5, [rsp + 80]
+ movq m6, [rsp + 96]
+ movq m7, [rsp + 112]
+
+ TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
+ movq [pix_q - 4], m0
+ movq [pix_q + stride_q - 4], m1
+ movq [pix_q + 2*stride_q - 4], m2
+ movq [base3_q - 4], m3
+ movq [base3_q + stride_q - 4], m4
+ movq [base3_q + 2*stride_q - 4], m5
+ movq [base3_q + stride3_q - 4], m6
+ movq [base3_q + 4*stride_q - 4], m7
+
+RET
+
+%endmacro
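
The MBAFF horizontal filter cannot operate on a pixel column directly, so it gathers eight 8-byte rows starting four pixels left of the vertical edge, transposes them so the edge runs horizontally, applies the same filter body as the vertical case to the stack copies, then transposes back and scatters the result. A C sketch of that round trip, assuming a stub filter_core in place of the SIMD body (all names here are illustrative):

    #include <stdint.h>

    /* Stand-in for the shared SIMD filter body (LOAD_MASK, LUMA_Q1,
     * DEBLOCK_P0_Q0 above); does nothing in this sketch. */
    static void filter_core(uint8_t blk[8][8], int alpha, int beta,
                            const int8_t *tc0)
    {
        (void)blk; (void)alpha; (void)beta; (void)tc0;
    }

    static void transpose8x8(uint8_t b[8][8])
    {
        for (int i = 0; i < 8; i++)
            for (int j = i + 1; j < 8; j++) {
                uint8_t t = b[i][j]; b[i][j] = b[j][i]; b[j][i] = t;
            }
    }

    /* Gather, transpose, filter, transpose back, scatter. */
    static void deblock_h_mbaff_ref(uint8_t *pix, intptr_t stride,
                                    int alpha, int beta, const int8_t *tc0)
    {
        uint8_t blk[8][8];
        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 8; j++)
                blk[i][j] = pix[i * stride + j - 4];
        transpose8x8(blk);
        filter_core(blk, alpha - 1, beta - 1, tc0); /* asm pre-decrements */
        transpose8x8(blk);
        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 8; j++)
                pix[i * stride + j - 4] = blk[i][j];
    }
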
+
INIT_XMM sse2
+DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA
+
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
+DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA
+%endif
%else
@@ -499,8 +585,10 @@ INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
+%endif
%endif ; ARCH
@@ -772,8 +860,10 @@ cglobal deblock_h_luma_intra_8, 2,4,8,0x80
INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
+%endif
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
@@ -836,7 +926,11 @@ cglobal deblock_h_chroma_8, 5,7
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
movq buf0, m0
movq buf1, m3
- call ff_chroma_inter_body_mmxext
+ LOAD_MASK r2d, r3d
+ movd m6, [r4] ; tc0
+ punpcklbw m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0
movq m0, buf0
movq m3, buf1
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
@@ -854,7 +948,52 @@ ff_chroma_inter_body_mmxext:
DEBLOCK_P0_Q0
ret
+%define t5 r4
+%define t6 r5
+
+cglobal deblock_h_chroma422_8, 5, 6
+ SUB rsp, (1+ARCH_X86_64*2)*mmsize
+ %if ARCH_X86_64
+ %define buf0 [rsp+16]
+ %define buf1 [rsp+8]
+ %else
+ %define buf0 r0m
+ %define buf1 r2m
+ %endif
+
+ movd m6, [r4]
+ punpcklbw m6, m6
+ movq [rsp], m6
+ CHROMA_H_START
+ TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
+ movq buf0, m0
+ movq buf1, m3
+ LOAD_MASK r2d, r3d
+ movd m6, [rsp]
+ punpcklwd m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0
+ movq m0, buf0
+ movq m3, buf1
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+
+ lea r0, [r0+r1*8]
+ lea t5, [t5+r1*8]
+
+ TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
+ movq buf0, m0
+ movq buf1, m3
+ LOAD_MASK r2d, r3d
+ movd m6, [rsp+4]
+ punpcklwd m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0
+ movq m0, buf0
+ movq m3, buf1
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+ ADD rsp, (1+ARCH_X86_64*2)*mmsize
+RET
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
@@ -867,9 +1006,6 @@ ff_chroma_inter_body_mmxext:
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
-%define t5 r4
-%define t6 r5
-
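
The comment on the pavgb line above relies on an exact identity: pavgb computes the rounding average (a + b + 1) >> 1, and subtracting (p0 ^ q1) & 1 first turns the inner average into a truncating one, so the two averages compose into (p0 + q1 + 2*p1 + 2) >> 2 with no accumulated rounding bias. A short exhaustive check of the identity:

    #include <assert.h>
    #include <stdint.h>

    /* pavgb: rounding byte average. */
    static uint8_t avg(uint8_t a, uint8_t b)
    {
        return (uint8_t)((a + b + 1) >> 1);
    }

    int main(void)
    {
        for (int p0 = 0; p0 < 256; p0++)
            for (int p1 = 0; p1 < 256; p1++)
                for (int q1 = 0; q1 < 256; q1++) {
                    int ref  = (p0 + q1 + 2 * p1 + 2) >> 2;
                    int fast = avg(p1, (uint8_t)(avg(p0, q1) - ((p0 ^ q1) & 1)));
                    assert(ref == fast);
                }
        return 0;
    }
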
;------------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
@@ -894,6 +1030,20 @@ cglobal deblock_h_chroma_intra_8, 4,6
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
RET
+cglobal deblock_h_chroma422_intra_8, 4, 6
+ CHROMA_H_START
+ TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+ call ff_chroma_intra_body_mmxext
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+
+ lea r0, [r0+r1*8]
+ lea t5, [t5+r1*8]
+
+ TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+ call ff_chroma_intra_body_mmxext
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+RET
+
ALIGN 16
ff_chroma_intra_body_mmxext:
LOAD_MASK r2d, r3d
@@ -909,6 +1059,202 @@ ff_chroma_intra_body_mmxext:
paddb m2, m6
ret
+%macro LOAD_8_ROWS 8
+ movd m0, %1
+ movd m1, %2
+ movd m2, %3
+ movd m3, %4
+ movd m4, %5
+ movd m5, %6
+ movd m6, %7
+ movd m7, %8
+%endmacro
+
+%macro STORE_8_ROWS 8
+ movd %1, m0
+ movd %2, m1
+ movd %3, m2
+ movd %4, m3
+ movd %5, m4
+ movd %6, m5
+ movd %7, m6
+ movd %8, m7
+%endmacro
+
+%macro TRANSPOSE_8x4B_XMM 0
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m7
+ punpcklwd m0, m2
+ punpcklwd m4, m6
+ punpckhdq m2, m0, m4
+ punpckldq m0, m4
+ MOVHL m1, m0
+ MOVHL m3, m2
+%endmacro
+
+%macro TRANSPOSE_4x8B_XMM 0
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpckhwd m4, m0, m2
+ punpcklwd m0, m2
+ MOVHL m6, m4
+ MOVHL m2, m0
+ pshufd m1, m0, 1
+ pshufd m3, m2, 1
+ pshufd m5, m4, 1
+ pshufd m7, m6, 1
+%endmacro
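
Both transpose macros are built from unpack instructions: punpcklbw interleaves the low eight bytes of its two operands, and successive interleave rounds at byte, word and dword granularity realize the 8x4 and 4x8 byte transposes. A C model of the one primitive involved (a sketch of the instruction's low-half behavior, not FFmpeg code):

    #include <stdint.h>

    /* punpcklbw dst, a, b: interleave the low 8 bytes of a and b. */
    static void punpcklbw(const uint8_t a[16], const uint8_t b[16],
                          uint8_t dst[16])
    {
        for (int i = 0; i < 8; i++) {
            dst[2 * i]     = a[i];
            dst[2 * i + 1] = b[i];
        }
    }
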
+
+%macro CHROMA_INTER_BODY_XMM 1
+ LOAD_MASK alpha_d, beta_d
+ movd m6, [tc0_q]
+ %rep %1
+ punpcklbw m6, m6
+ %endrep
+ pand m7, m6
+ DEBLOCK_P0_Q0
+%endmacro
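
The %rep count controls how far each tc0 byte is duplicated: n passes of punpcklbw of the register with itself leave every value repeated 2^n times. The callers below pass 1 for the 4:2:0 functions and 2 for the 4:2:2 horizontal case, where one tc0 value has to span four of the transposed rows. A sketch of the duplication:

    #include <stdint.h>

    /* n doubling passes: each of the low bytes ends up repeated 2^n
     * times, mirroring "%rep n / punpcklbw m6, m6 / %endrep". */
    static void dup_bytes(uint8_t v[16], int n)
    {
        for (int pass = 0; pass < n; pass++) {
            uint8_t t[16];
            for (int i = 0; i < 8; i++) {
                t[2 * i]     = v[i];
                t[2 * i + 1] = v[i];
            }
            for (int i = 0; i < 16; i++)
                v[i] = t[i];
        }
    }
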
+
+%macro CHROMA_INTRA_BODY_XMM 0
+ LOAD_MASK alpha_d, beta_d
+ mova m5, m1
+ mova m6, m2
+ pxor m4, m1, m3
+ pand m4, [pb_1]
+ pavgb m1, m3
+ psubusb m1, m4
+ pavgb m1, m0
+ pxor m4, m2, m0
+ pand m4, [pb_1]
+ pavgb m2, m0
+ psubusb m2, m4
+ pavgb m2, m3
+ psubb m1, m5
+ psubb m2, m6
+ pand m1, m7
+ pand m2, m7
+ paddb m1, m5
+ paddb m2, m6
+%endmacro
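
The psubb/pand/paddb tail applies the filtered p1/q1 values only where the LOAD_MASK condition held, using wrapping byte arithmetic. Per byte it computes the following (a sketch; mask is 0x00 or 0xff per lane, and the wrap-around of the byte subtraction cancels out on the add):

    #include <stdint.h>

    /* old + ((new - old) & mask): selects new where mask is 0xff,
     * old where mask is 0x00. */
    static uint8_t masked_update(uint8_t oldv, uint8_t newv, uint8_t mask)
    {
        return (uint8_t)(oldv + (uint8_t)((uint8_t)(newv - oldv) & mask));
    }
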
+
+%macro CHROMA_V_START_XMM 1
+ movsxdifnidn stride_q, stride_d
+ dec alpha_d
+ dec beta_d
+ mov %1, pix_q
+ sub %1, stride_q
+ sub %1, stride_q
+%endmacro
+
+%macro CHROMA_H_START_XMM 2
+ movsxdifnidn stride_q, stride_d
+ dec alpha_d
+ dec beta_d
+ lea %2, [3*stride_q]
+ mov %1, pix_q
+ add %1, %2
+%endmacro
+
+%macro DEBLOCK_CHROMA_XMM 1
+
+INIT_XMM %1
+
+cglobal deblock_v_chroma_8, 5, 6, 8, pix_, stride_, alpha_, beta_, tc0_
+ CHROMA_V_START_XMM r5
+ movq m0, [r5]
+ movq m1, [r5 + stride_q]
+ movq m2, [pix_q]
+ movq m3, [pix_q + stride_q]
+ CHROMA_INTER_BODY_XMM 1
+ movq [r5 + stride_q], m1
+ movq [pix_q], m2
+RET
+
+cglobal deblock_h_chroma_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
+ CHROMA_H_START_XMM r5, r6
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+ TRANSPOSE_8x4B_XMM
+ movq [rsp], m0
+ movq [rsp + 8], m3
+ CHROMA_INTER_BODY_XMM 1
+ movq m0, [rsp]
+ movq m3, [rsp + 8]
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+RET
+
+cglobal deblock_h_chroma422_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
+ CHROMA_H_START_XMM r5, r6
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+ TRANSPOSE_8x4B_XMM
+ movq [rsp], m0
+ movq [rsp + 8], m3
+ CHROMA_INTER_BODY_XMM 2
+ movq m0, [rsp]
+ movq m3, [rsp + 8]
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+
+ lea pix_q, [pix_q + 8*stride_q]
+ lea r5, [r5 + 8*stride_q]
+ add tc0_q, 2
+
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+ TRANSPOSE_8x4B_XMM
+ movq [rsp], m0
+ movq [rsp + 8], m3
+ CHROMA_INTER_BODY_XMM 2
+ movq m0, [rsp]
+ movq m3, [rsp + 8]
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+RET
+
+cglobal deblock_v_chroma_intra_8, 4, 5, 8, pix_, stride_, alpha_, beta_
+ CHROMA_V_START_XMM r4
+ movq m0, [r4]
+ movq m1, [r4 + stride_q]
+ movq m2, [pix_q]
+ movq m3, [pix_q + stride_q]
+ CHROMA_INTRA_BODY_XMM
+ movq [r4 + stride_q], m1
+ movq [pix_q], m2
+RET
+
+cglobal deblock_h_chroma_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
+ CHROMA_H_START_XMM r4, r5
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+ TRANSPOSE_8x4B_XMM
+ CHROMA_INTRA_BODY_XMM
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+RET
+
+cglobal deblock_h_chroma422_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
+ CHROMA_H_START_XMM r4, r5
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+ TRANSPOSE_8x4B_XMM
+ CHROMA_INTRA_BODY_XMM
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+
+ lea pix_q, [pix_q + 8*stride_q]
+ lea r4, [r4 + 8*stride_q]
+
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+ TRANSPOSE_8x4B_XMM
+ CHROMA_INTRA_BODY_XMM
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+RET
+
+%endmacro ; DEBLOCK_CHROMA_XMM
+
+DEBLOCK_CHROMA_XMM sse2
+DEBLOCK_CHROMA_XMM avx
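
Instantiating the macro twice emits one symbol per instruction set via cglobal's cpuflag suffix, e.g. ff_deblock_v_chroma_8_sse2 and ff_deblock_v_chroma_8_avx. A sketch of how the C side would see them, assuming prototypes matching the signature comments in this file; the actual selection lives in the h264dsp init code:

    #include <stdint.h>

    void ff_deblock_v_chroma_8_sse2(uint8_t *pix, int stride,
                                    int alpha, int beta, int8_t *tc0);
    void ff_deblock_v_chroma_8_avx (uint8_t *pix, int stride,
                                    int alpha, int beta, int8_t *tc0);
    /* At init time the dispatcher stores whichever function pointer
     * matches the detected CPU flags into the H.264 DSP context. */
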
+
;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
; int8_t ref[2][40], int16_t mv[2][40][2],