Diffstat (limited to 'libavcodec/x86/h264_deblock.asm')
-rw-r--r--    libavcodec/x86/h264_deblock.asm    404
1 file changed, 375 insertions, 29 deletions
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 33fd5a9..6702ae9 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -7,20 +7,20 @@
 ;* Fiona Glaser <fiona@x264.com>
 ;* Oskar Arvidsson <oskar@irock.se>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
@@ -37,11 +37,6 @@ cextern pb_0
 cextern pb_1
 cextern pb_3
 
-; expands to [base],...,[base+7*stride]
-%define PASS8ROWS(base, base3, stride, stride3) \
-    [base], [base+stride], [base+stride*2], [base3], \
-    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
-
 %define PASS8ROWS(base, base3, stride, stride3, offset) \
     PASS8ROWS(base+offset, base3+offset, stride, stride3)
@@ -287,18 +282,18 @@ cextern pb_3
 ;     int8_t *tc0)
 ;-----------------------------------------------------------------------------
 %macro DEBLOCK_LUMA 0
-cglobal deblock_v_luma_8, 5,5,10
+cglobal deblock_v_luma_8, 5,5,10, pix_, stride_, alpha_, beta_, base3_
     movd    m8, [r4] ; tc0
-    lea     r4, [r1*3]
-    dec     r2d      ; alpha-1
+    lea     r4, [stride_q*3]
+    dec     alpha_d  ; alpha-1
     neg     r4
-    dec     r3d      ; beta-1
-    add     r4, r0   ; pix-3*stride
+    dec     beta_d   ; beta-1
+    add     base3_q, pix_q ; pix-3*stride
 
-    mova    m0, [r4+r1]   ; p1
-    mova    m1, [r4+2*r1] ; p0
-    mova    m2, [r0]      ; q0
-    mova    m3, [r0+r1]   ; q1
+    mova    m0, [base3_q + stride_q]   ; p1
+    mova    m1, [base3_q + 2*stride_q] ; p0
+    mova    m2, [pix_q]                ; q0
+    mova    m3, [pix_q + stride_q]     ; q1
     LOAD_MASK r2d, r3d
 
     punpcklbw m8, m8
@@ -308,24 +303,24 @@ cglobal deblock_v_luma_8, 5,5,10
     pandn   m9, m7
     pand    m8, m9
-    movdqa  m3, [r4] ; p2
+    movdqa  m3, [base3_q] ; p2
     DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
     pand    m6, m9
     psubb   m7, m8, m6
     pand    m6, m8
-    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
+    LUMA_Q1 m0, m3, [base3_q], [base3_q + stride_q], m6, m4
 
-    movdqa  m4, [r0+2*r1] ; q2
+    movdqa  m4, [pix_q + 2*stride_q] ; q2
     DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
     pand    m6, m9
     pand    m8, m6
     psubb   m7, m6
-    mova    m3, [r0+r1]
-    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
+    mova    m3, [pix_q + stride_q]
+    LUMA_Q1 m3, m4, [pix_q + 2*stride_q], [pix_q + stride_q], m8, m6
 
     DEBLOCK_P0_Q0
-    mova    [r4+2*r1], m1
-    mova    [r0], m2
+    mova    [base3_q + 2*stride_q], m1
+    mova    [pix_q], m2
     RET
 
 ;-----------------------------------------------------------------------------
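Note: the register renames above (r0..r4 to pix_q, stride_q, alpha_d, beta_d, base3_q) do not change the arithmetic. What LOAD_MASK, DIFF_GT2, LUMA_Q1 and DEBLOCK_P0_Q0 compute per edge sample is the H.264 normal-mode luma filter. A scalar C model for one column of the edge follows; the function and variable names are illustrative only, and it assumes 8-bit samples with tc0 >= 0 (tc0 < 0 edges are masked off by the asm):

    #include <stdint.h>
    #include <stdlib.h>

    static int clip3(int x, int lo, int hi) { return x < lo ? lo : x > hi ? hi : x; }
    static int clip_u8(int x) { return clip3(x, 0, 255); }

    /* pix points at q0; xstride steps across the edge.
     * The SIMD code runs 16 of these columns at once. */
    static void luma_edge_sample(uint8_t *pix, int xstride,
                                 int alpha, int beta, int tc0)
    {
        int p2 = pix[-3*xstride], p1 = pix[-2*xstride], p0 = pix[-xstride];
        int q0 = pix[0],          q1 = pix[xstride],    q2 = pix[2*xstride];
        int tc = tc0;

        /* LOAD_MASK: all three thresholds must pass for the edge to be filtered */
        if (abs(p0 - q0) >= alpha || abs(p1 - p0) >= beta || abs(q1 - q0) >= beta)
            return;

        if (abs(p2 - p0) < beta) {  /* DIFF_GT2, p side */
            /* LUMA_Q1: filter p1, and widen the final clamp by one */
            pix[-2*xstride] = p1 + clip3(((p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1,
                                         -tc0, tc0);
            tc++;
        }
        if (abs(q2 - q0) < beta) {  /* DIFF_GT2, q side */
            pix[xstride] = q1 + clip3(((q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1,
                                      -tc0, tc0);
            tc++;
        }

        /* DEBLOCK_P0_Q0 */
        int delta = clip3(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
        pix[-xstride] = clip_u8(p0 + delta);
        pix[0]        = clip_u8(q0 - delta);
    }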
@@ -382,10 +377,101 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
     RET
 %endmacro
 
+%macro DEBLOCK_H_LUMA_MBAFF 0
+
+cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_, base3_, stride3_
+    movsxd  stride_q, stride_d
+    dec     alpha_d
+    dec     beta_d
+    mov     base3_q, pix_q
+    lea     stride3_q, [3*stride_q]
+    add     base3_q, stride3_q
+
+    movq    m0, [pix_q - 4]
+    movq    m1, [pix_q + stride_q - 4]
+    movq    m2, [pix_q + 2*stride_q - 4]
+    movq    m3, [base3_q - 4]
+    movq    m4, [base3_q + stride_q - 4]
+    movq    m5, [base3_q + 2*stride_q - 4]
+    movq    m6, [base3_q + stride3_q - 4]
+    movq    m7, [base3_q + 4*stride_q - 4]
+
+    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
+
+    %assign i 0
+    %rep 8
+        movq [rsp + 16*i], m %+ i
+        %assign i i+1
+    %endrep
+
+    ; p2 = m1 [rsp + 16]
+    ; p1 = m2 [rsp + 32]
+    ; p0 = m3 [rsp + 48]
+    ; q0 = m4 [rsp + 64]
+    ; q1 = m5 [rsp + 80]
+    ; q2 = m6 [rsp + 96]
+
+    SWAP 0, 2
+    SWAP 1, 3
+    SWAP 2, 4
+    SWAP 3, 5
+
+    LOAD_MASK alpha_d, beta_d
+    movd    m8, [tc0_q]
+    punpcklbw m8, m8
+    pcmpeqb m9, m9
+    pcmpeqb m9, m8
+    pandn   m9, m7
+    pand    m8, m9
+
+    movdqa  m3, [rsp + 16] ; p2
+    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+    pand    m6, m9
+    psubb   m7, m8, m6
+    pand    m6, m8
+    LUMA_Q1 m0, m3, [rsp + 16], [rsp + 32], m6, m4
+
+    movdqa  m4, [rsp + 96] ; q2
+    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+    pand    m6, m9
+    pand    m8, m6
+    psubb   m7, m6
+    mova    m3, [rsp + 80]
+    LUMA_Q1 m3, m4, [rsp + 96], [rsp + 80], m8, m6
+
+    DEBLOCK_P0_Q0
+    SWAP 1, 3
+    SWAP 2, 4
+    movq    m0, [rsp]
+    movq    m1, [rsp + 16]
+    movq    m2, [rsp + 32]
+    movq    m5, [rsp + 80]
+    movq    m6, [rsp + 96]
+    movq    m7, [rsp + 112]
+
+    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
+    movq    [pix_q - 4], m0
+    movq    [pix_q + stride_q - 4], m1
+    movq    [pix_q + 2*stride_q - 4], m2
+    movq    [base3_q - 4], m3
+    movq    [base3_q + stride_q - 4], m4
+    movq    [base3_q + 2*stride_q - 4], m5
+    movq    [base3_q + stride3_q - 4], m6
+    movq    [base3_q + 4*stride_q - 4], m7
+
+RET
+
+%endmacro
+
 INIT_XMM sse2
+DEBLOCK_H_LUMA_MBAFF
 DEBLOCK_LUMA
+
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
+DEBLOCK_H_LUMA_MBAFF
 DEBLOCK_LUMA
+%endif
 %else
@@ -499,8 +585,10 @@ INIT_MMX mmxext
 DEBLOCK_LUMA v8, 8
 INIT_XMM sse2
 DEBLOCK_LUMA v, 16
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA v, 16
+%endif
 
 %endif ; ARCH
@@ -772,8 +860,10 @@ cglobal deblock_h_luma_intra_8, 2,4,8,0x80
 INIT_XMM sse2
 DEBLOCK_LUMA_INTRA v
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA_INTRA v
+%endif
 %if ARCH_X86_64 == 0
 INIT_MMX mmxext
 DEBLOCK_LUMA_INTRA v8
@@ -836,7 +926,11 @@ cglobal deblock_h_chroma_8, 5,7
     TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
     movq    buf0, m0
     movq    buf1, m3
-    call ff_chroma_inter_body_mmxext
+    LOAD_MASK r2d, r3d
+    movd      m6, [r4] ; tc0
+    punpcklbw m6, m6
+    pand      m7, m6
+    DEBLOCK_P0_Q0
     movq    m0, buf0
     movq    m3, buf1
     TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
@@ -854,7 +948,52 @@ ff_chroma_inter_body_mmxext:
     DEBLOCK_P0_Q0
     ret
 
+%define t5 r4
+%define t6 r5
+
+cglobal deblock_h_chroma422_8, 5, 6
+    SUB   rsp, (1+ARCH_X86_64*2)*mmsize
+    %if ARCH_X86_64
+        %define buf0 [rsp+16]
+        %define buf1 [rsp+8]
+    %else
+        %define buf0 r0m
+        %define buf1 r2m
+    %endif
+
+    movd      m6, [r4]
+    punpcklbw m6, m6
+    movq      [rsp], m6
+    CHROMA_H_START
+    TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
+    movq      buf0, m0
+    movq      buf1, m3
+    LOAD_MASK r2d, r3d
+    movd      m6, [rsp]
+    punpcklwd m6, m6
+    pand      m7, m6
+    DEBLOCK_P0_Q0
+    movq      m0, buf0
+    movq      m3, buf1
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+
+    lea   r0, [r0+r1*8]
+    lea   t5, [t5+r1*8]
+
+    TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
+    movq      buf0, m0
+    movq      buf1, m3
+    LOAD_MASK r2d, r3d
+    movd      m6, [rsp+4]
+    punpcklwd m6, m6
+    pand      m7, m6
+    DEBLOCK_P0_Q0
+    movq      m0, buf0
+    movq      m3, buf1
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+    ADD   rsp, (1+ARCH_X86_64*2)*mmsize
+RET
+
 ; in: %1=p0 %2=p1 %3=q1
 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
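Note: every horizontal (vertical-edge) variant in this patch, from deblock_h_luma_mbaff_8 down to the chroma functions, uses the same strategy: load 8 rows straddling the edge, transpose so that each of p2..q2 occupies one register, run the row-oriented filter core, then transpose back and store. The mbaff function additionally spills the transposed rows to its 8*16-byte stack buffer because the luma core needs all the registers. A scalar sketch of the transpose step, illustrative only:

    #include <stdint.h>
    #include <stddef.h>

    /* Gather 8 rows of 8 bytes around the vertical edge into one block;
     * writing the filtered block back is the same loop in reverse. */
    static void transpose_8x8b(uint8_t out[8][8],
                               const uint8_t *in, ptrdiff_t stride)
    {
        for (int i = 0; i < 8; i++)          /* row in the image       */
            for (int j = 0; j < 8; j++)      /* column across the edge */
                out[j][i] = in[i*stride + j];
    }

The asm performs this with punpck* ladders (TRANSPOSE_8X8B above, TRANSPOSE4x8B_LOAD/TRANSPOSE8x4B_STORE here, and the TRANSPOSE_8x4B_XMM / TRANSPOSE_4x8B_XMM macros added further down) rather than a scalar loop.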
@@ -867,9 +1006,6 @@ ff_chroma_inter_body_mmxext:
     pavgb  %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
 %endmacro
 
-%define t5 r4
-%define t6 r5
-
 ;------------------------------------------------------------------------------
 ; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
 ;------------------------------------------------------------------------------
@@ -894,6 +1030,20 @@ cglobal deblock_h_chroma_intra_8, 4,6
     TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
     RET
 
+cglobal deblock_h_chroma422_intra_8, 4, 6
+    CHROMA_H_START
+    TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+    call ff_chroma_intra_body_mmxext
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+
+    lea   r0, [r0+r1*8]
+    lea   t5, [t5+r1*8]
+
+    TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+    call ff_chroma_intra_body_mmxext
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+RET
+
 ALIGN 16
 ff_chroma_intra_body_mmxext:
     LOAD_MASK r2d, r3d
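Note: the two-pass shape of deblock_h_chroma422_intra_8 above (and of the chroma422 functions below) comes from 4:2:2 subsampling: chroma keeps full vertical resolution, so a macroblock's vertical edge is 16 chroma rows tall rather than 8. The inter variants also step tc0 forward by 2 for the second half, so each tc0 entry covers four rows. A scalar model of the inter case, with illustrative names; per the spec, the chroma filter uses tc0 + 1 and never modifies p1/q1:

    #include <stdint.h>
    #include <stdlib.h>
    #include <stddef.h>

    static int clip3(int x, int lo, int hi) { return x < lo ? lo : x > hi ? hi : x; }

    /* One 8-row half; rows_per_tc is 2 for 4:2:0 edges, 4 for 4:2:2. */
    static void chroma_inter_8rows(uint8_t *pix, ptrdiff_t stride, int alpha,
                                   int beta, const int8_t *tc0, int rows_per_tc)
    {
        for (int i = 0; i < 8; i++, pix += stride) {
            int tc = tc0[i / rows_per_tc];
            if (tc < 0)
                continue;                    /* tc0 == -1: edge not filtered */
            tc++;                            /* chroma uses tc0 + 1 */
            int p1 = pix[-2], p0 = pix[-1], q0 = pix[0], q1 = pix[1];
            if (abs(p0 - q0) >= alpha || abs(p1 - p0) >= beta ||
                abs(q1 - q0) >= beta)
                continue;
            int delta = clip3(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
            pix[-1] = clip3(p0 + delta, 0, 255);
            pix[0]  = clip3(q0 - delta, 0, 255);
        }
    }

    /* 4:2:2 driver: run the core twice, 8 rows apart, advancing tc0 by 2. */
    static void deblock_h_chroma422_model(uint8_t *pix, ptrdiff_t stride,
                                          int alpha, int beta, const int8_t *tc0)
    {
        chroma_inter_8rows(pix,            stride, alpha, beta, tc0,     4);
        chroma_inter_8rows(pix + 8*stride, stride, alpha, beta, tc0 + 2, 4);
    }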
@@ -909,6 +1059,202 @@ ff_chroma_intra_body_mmxext:
     paddb  m2, m6
     ret
 
+%macro LOAD_8_ROWS 8
+    movd m0, %1
+    movd m1, %2
+    movd m2, %3
+    movd m3, %4
+    movd m4, %5
+    movd m5, %6
+    movd m6, %7
+    movd m7, %8
+%endmacro
+
+%macro STORE_8_ROWS 8
+    movd %1, m0
+    movd %2, m1
+    movd %3, m2
+    movd %4, m3
+    movd %5, m4
+    movd %6, m5
+    movd %7, m6
+    movd %8, m7
+%endmacro
+
+%macro TRANSPOSE_8x4B_XMM 0
+    punpcklbw m0, m1
+    punpcklbw m2, m3
+    punpcklbw m4, m5
+    punpcklbw m6, m7
+    punpcklwd m0, m2
+    punpcklwd m4, m6
+    punpckhdq m2, m0, m4
+    punpckldq m0, m4
+    MOVHL m1, m0
+    MOVHL m3, m2
+%endmacro
+
+%macro TRANSPOSE_4x8B_XMM 0
+    punpcklbw m0, m1
+    punpcklbw m2, m3
+    punpckhwd m4, m0, m2
+    punpcklwd m0, m2
+    MOVHL m6, m4
+    MOVHL m2, m0
+    pshufd m1, m0, 1
+    pshufd m3, m2, 1
+    pshufd m5, m4, 1
+    pshufd m7, m6, 1
+%endmacro
+
+%macro CHROMA_INTER_BODY_XMM 1
+    LOAD_MASK alpha_d, beta_d
+    movd m6, [tc0_q]
+    %rep %1
+        punpcklbw m6, m6
+    %endrep
+    pand m7, m6
+    DEBLOCK_P0_Q0
+%endmacro
+
+%macro CHROMA_INTRA_BODY_XMM 0
+    LOAD_MASK alpha_d, beta_d
+    mova    m5, m1
+    mova    m6, m2
+    pxor    m4, m1, m3
+    pand    m4, [pb_1]
+    pavgb   m1, m3
+    psubusb m1, m4
+    pavgb   m1, m0
+    pxor    m4, m2, m0
+    pand    m4, [pb_1]
+    pavgb   m2, m0
+    psubusb m2, m4
+    pavgb   m2, m3
+    psubb   m1, m5
+    psubb   m2, m6
+    pand    m1, m7
+    pand    m2, m7
+    paddb   m1, m5
+    paddb   m2, m6
+%endmacro
+
+%macro CHROMA_V_START_XMM 1
+    movsxdifnidn stride_q, stride_d
+    dec alpha_d
+    dec beta_d
+    mov %1, pix_q
+    sub %1, stride_q
+    sub %1, stride_q
+%endmacro
+
+%macro CHROMA_H_START_XMM 2
+    movsxdifnidn stride_q, stride_d
+    dec alpha_d
+    dec beta_d
+    lea %2, [3*stride_q]
+    mov %1, pix_q
+    add %1, %2
+%endmacro
+
+%macro DEBLOCK_CHROMA_XMM 1
+
+INIT_XMM %1
+
+cglobal deblock_v_chroma_8, 5, 6, 8, pix_, stride_, alpha_, beta_, tc0_
+    CHROMA_V_START_XMM r5
+    movq m0, [r5]
+    movq m1, [r5 + stride_q]
+    movq m2, [pix_q]
+    movq m3, [pix_q + stride_q]
+    CHROMA_INTER_BODY_XMM 1
+    movq [r5 + stride_q], m1
+    movq [pix_q], m2
+RET
+
+cglobal deblock_h_chroma_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
+    CHROMA_H_START_XMM r5, r6
+    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+    TRANSPOSE_8x4B_XMM
+    movq [rsp], m0
+    movq [rsp + 8], m3
+    CHROMA_INTER_BODY_XMM 1
+    movq m0, [rsp]
+    movq m3, [rsp + 8]
+    TRANSPOSE_4x8B_XMM
+    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+RET
+
+cglobal deblock_h_chroma422_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
+    CHROMA_H_START_XMM r5, r6
+    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+    TRANSPOSE_8x4B_XMM
+    movq [rsp], m0
+    movq [rsp + 8], m3
+    CHROMA_INTER_BODY_XMM 2
+    movq m0, [rsp]
+    movq m3, [rsp + 8]
+    TRANSPOSE_4x8B_XMM
+    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+
+    lea pix_q, [pix_q + 8*stride_q]
+    lea r5, [r5 + 8*stride_q]
+    add tc0_q, 2
+
+    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+    TRANSPOSE_8x4B_XMM
+    movq [rsp], m0
+    movq [rsp + 8], m3
+    CHROMA_INTER_BODY_XMM 2
+    movq m0, [rsp]
+    movq m3, [rsp + 8]
+    TRANSPOSE_4x8B_XMM
+    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+RET
+
+cglobal deblock_v_chroma_intra_8, 4, 5, 8, pix_, stride_, alpha_, beta_
+    CHROMA_V_START_XMM r4
+    movq m0, [r4]
+    movq m1, [r4 + stride_q]
+    movq m2, [pix_q]
+    movq m3, [pix_q + stride_q]
+    CHROMA_INTRA_BODY_XMM
+    movq [r4 + stride_q], m1
+    movq [pix_q], m2
+RET
+
+cglobal deblock_h_chroma_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
+    CHROMA_H_START_XMM r4, r5
+    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+    TRANSPOSE_8x4B_XMM
+    CHROMA_INTRA_BODY_XMM
+    TRANSPOSE_4x8B_XMM
+    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+RET
+
+cglobal deblock_h_chroma422_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
+    CHROMA_H_START_XMM r4, r5
+    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+    TRANSPOSE_8x4B_XMM
+    CHROMA_INTRA_BODY_XMM
+    TRANSPOSE_4x8B_XMM
+    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+
+    lea pix_q, [pix_q + 8*stride_q]
+    lea r4, [r4 + 8*stride_q]
+
+    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+    TRANSPOSE_8x4B_XMM
+    CHROMA_INTRA_BODY_XMM
+    TRANSPOSE_4x8B_XMM
+    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+RET
+
+%endmacro ; DEBLOCK_CHROMA_XMM
+
+DEBLOCK_CHROMA_XMM sse2
+DEBLOCK_CHROMA_XMM avx
+
 ;-----------------------------------------------------------------------------
 ; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
 ;                                   int8_t ref[2][40], int16_t mv[2][40][2],
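Note: CHROMA_INTRA_BODY_XMM is essentially ff_chroma_intra_body_mmxext rewritten for the named-register xmm path. It computes the spec's intra chroma update, p0' = (2*p1 + p0 + q1 + 2) >> 2 and symmetrically q0' = (2*q1 + q0 + p1 + 2) >> 2, out of pavgb. Since pavgb rounds up, the pxor / pand [pb_1] / psubusb sequence subtracts the carry bit to turn the first average into a floor, exactly as the CHROMA_P0 comment earlier puts it: dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)). A scalar model with illustrative names:

    #include <stdint.h>
    #include <stdlib.h>

    static int avg_up(int a, int b) { return (a + b + 1) >> 1; }   /* pavgb */

    /* row points at q0 of one row across the edge. */
    static void chroma_intra_row(uint8_t *row, int alpha, int beta)
    {
        int p1 = row[-2], p0 = row[-1], q0 = row[0], q1 = row[1];

        if (abs(p0 - q0) >= alpha || abs(p1 - p0) >= beta || abs(q1 - q0) >= beta)
            return;                          /* LOAD_MASK */

        /* avg_up(x, avg_up(y, z) - ((y^z) & 1)) == (2*x + y + z + 2) >> 2 */
        row[-1] = avg_up(p1, avg_up(p0, q1) - ((p0 ^ q1) & 1));
        row[0]  = avg_up(q1, avg_up(q0, p1) - ((q0 ^ p1) & 1));
    }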