Diffstat (limited to 'libavcodec/x86/huffyuvdsp.asm')
-rw-r--r-- | libavcodec/x86/huffyuvdsp.asm | 217
1 file changed, 108 insertions, 109 deletions
diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm
index 692162b..a1231f1 100644
--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -1,48 +1,117 @@
 ;******************************************************************************
 ;* SIMD-optimized HuffYUV functions
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Christophe Gisquet
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-pb_f: times 16 db 15
-pb_zzzzzzzz77777777: times 8 db -1
-pb_7: times 8 db 7
-pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
-pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
-
 SECTION .text
 
-; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
-;                                     const uint8_t *diff, int w,
-;                                     int *left, int *left_top)
+%include "libavcodec/x86/huffyuvdsp_template.asm"
+
+;------------------------------------------------------------------------------
+; void (*add_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+;------------------------------------------------------------------------------
+
+%macro ADD_INT16 0
+cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
+%if mmsize > 8
+    test    srcq, mmsize-1
+    jnz .unaligned
+    test    dstq, mmsize-1
+    jnz .unaligned
+%endif
+    INT16_LOOP a, add
+%if mmsize > 8
+.unaligned:
+    INT16_LOOP u, add
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+ADD_INT16
+%endif
+
+INIT_XMM sse2
+ADD_INT16
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_INT16
+%endif
+
+; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
+;                               intptr_t w, uint8_t *left)
+%macro LEFT_BGR32 0
+cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
+    shl     wq, 2
+    movd    m0, [leftq]
+    lea     dstq, [dstq + wq]
+    lea     srcq, [srcq + wq]
+    LSHIFT  m0, mmsize-4
+    neg     wq
+.loop:
+    movu    m1, [srcq+wq]
+    mova    m2, m1
+%if mmsize == 8
+    punpckhdq m0, m0
+%endif
+    LSHIFT  m1, 4
+    paddb   m1, m2
+%if mmsize == 16
+    pshufd  m0, m0, q3333
+    mova    m2, m1
+    LSHIFT  m1, 8
+    paddb   m1, m2
+%endif
+    paddb   m0, m1
+    movu [dstq+wq], m0
+    add     wq, mmsize
+    jl .loop
+    movd    m0, [dstq-4]
+    movd [leftq], m0
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+LEFT_BGR32
+%endif
+INIT_XMM sse2
+LEFT_BGR32
+
+; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int mask, int w, int *left, int *left_top)
 INIT_MMX mmxext
-cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
+cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
+    add     wd, wd
+    movd    mm6, maskd
+    SPLATW  mm6, mm6
     movq    mm0, [topq]
     movq    mm2, mm0
     movd    mm4, [left_topq]
-    psllq   mm2, 8
+    psllq   mm2, 16
     movq    mm1, mm0
     por     mm4, mm2
     movd    mm3, [leftq]
-    psubb   mm0, mm4 ; t-tl
+    psubw   mm0, mm4 ; t-tl
     add    dstq, wq
     add    topq, wq
     add   diffq, wq
@@ -51,115 +120,45 @@ cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
 .loop:
     movq    mm4, [topq+wq]
     movq    mm0, mm4
-    psllq   mm4, 8
+    psllq   mm4, 16
     por     mm4, mm1
     movq    mm1, mm0 ; t
-    psubb   mm0, mm4 ; t-tl
+    psubw   mm0, mm4 ; t-tl
 .skip:
     movq    mm2, [diffq+wq]
 %assign i 0
-%rep 8
+%rep 4
     movq    mm4, mm0
-    paddb   mm4, mm3 ; t-tl+l
+    paddw   mm4, mm3 ; t-tl+l
+    pand    mm4, mm6
     movq    mm5, mm3
-    pmaxub  mm3, mm1
-    pminub  mm5, mm1
-    pminub  mm3, mm4
-    pmaxub  mm3, mm5 ; median
-    paddb   mm3, mm2 ; +residual
+    pmaxsw  mm3, mm1
+    pminsw  mm5, mm1
+    pminsw  mm3, mm4
+    pmaxsw  mm3, mm5 ; median
+    paddw   mm3, mm2 ; +residual
+    pand    mm3, mm6
 %if i==0
     movq    mm7, mm3
-    psllq   mm7, 56
+    psllq   mm7, 48
 %else
-    movq    mm6, mm3
-    psrlq   mm7, 8
-    psllq   mm6, 56
-    por     mm7, mm6
+    movq    mm4, mm3
+    psrlq   mm7, 16
+    psllq   mm4, 48
+    por     mm7, mm4
 %endif
-%if i<7
-    psrlq   mm0, 8
-    psrlq   mm1, 8
-    psrlq   mm2, 8
+%if i<3
+    psrlq   mm0, 16
+    psrlq   mm1, 16
+    psrlq   mm2, 16
 %endif
 %assign i i+1
 %endrep
     movq [dstq+wq], mm7
     add     wq, 8
     jl .loop
-    movzx  r2d, byte [dstq-1]
+    movzx  r2d, word [dstq-2]
     mov [leftq], r2d
-    movzx  r2d, byte [topq-1]
+    movzx  r2d, word [topq-2]
     mov [left_topq], r2d
     RET
-
-
-%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
-    add     srcq, wq
-    add     dstq, wq
-    neg     wq
-%%.loop:
-%if %2
-    mova    m1, [srcq+wq]
-%else
-    movu    m1, [srcq+wq]
-%endif
-    mova    m2, m1
-    psllw   m1, 8
-    paddb   m1, m2
-    mova    m2, m1
-    pshufb  m1, m3
-    paddb   m1, m2
-    pshufb  m0, m5
-    mova    m2, m1
-    pshufb  m1, m4
-    paddb   m1, m2
-%if mmsize == 16
-    mova    m2, m1
-    pshufb  m1, m6
-    paddb   m1, m2
-%endif
-    paddb   m0, m1
-%if %1
-    mova [dstq+wq], m0
-%else
-    movq [dstq+wq], m0
-    movhps [dstq+wq+8], m0
-%endif
-    add     wq, mmsize
-    jl %%.loop
-    mov     eax, mmsize-1
-    sub     eax, wd
-    movd    m1, eax
-    pshufb  m0, m1
-    movd    eax, m0
-    RET
-%endmacro
-
-; int ff_add_hfyu_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
-INIT_MMX ssse3
-cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
-.skip_prologue:
-    mova    m5, [pb_7]
-    mova    m4, [pb_zzzz3333zzzzbbbb]
-    mova    m3, [pb_zz11zz55zz99zzdd]
-    movd    m0, leftm
-    psllq   m0, 56
-    ADD_HFYU_LEFT_LOOP 1, 1
-
-INIT_XMM ssse3
-cglobal add_hfyu_left_pred_unaligned, 3,3,7, dst, src, w, left
-    mova    m5, [pb_f]
-    mova    m6, [pb_zzzzzzzz77777777]
-    mova    m4, [pb_zzzz3333zzzzbbbb]
-    mova    m3, [pb_zz11zz55zz99zzdd]
-    movd    m0, leftm
-    pslldq  m0, 15
-    test    srcq, 15
-    jnz .src_unaligned
-    test    dstq, 15
-    jnz .dst_unaligned
-    ADD_HFYU_LEFT_LOOP 1, 1
-.dst_unaligned:
-    ADD_HFYU_LEFT_LOOP 0, 1
-.src_unaligned:
-    ADD_HFYU_LEFT_LOOP 0, 0
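
For context, below is a hedged scalar sketch in C of the two 16-bit operations this diff adds SIMD versions of. The names add_int16_scalar, mid_pred3 and add_median_pred_int16_scalar are hypothetical helpers for illustration only; the real add_int16 loop body comes from huffyuvdsp_template.asm, which is not part of this diff, so its exact semantics (a masked element-wise add) are an assumption inferred from the SIMD code above rather than confirmed by it.

#include <stdint.h>

/* Assumed semantics of add_int16: element-wise add of src into dst, wrapped
 * to the coded bit depth via `mask` (the template providing INT16_LOOP is
 * not shown in this diff). */
static void add_int16_scalar(uint16_t *dst, const uint16_t *src,
                             unsigned mask, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = (dst[i] + src[i]) & mask;
}

/* Median of three values, as used by the HuffYUV median predictor. */
static int mid_pred3(int a, int b, int c)
{
    int mn = a < b ? a : b;
    int mx = a < b ? b : a;
    int m2 = mx < c ? mx : c;
    return mn > m2 ? mn : m2;
}

/* Scalar view of add_hfyu_median_pred_int16: predict each sample as the
 * median of left, top and (left + top - topleft), add the residual from
 * `diff`, and wrap with `mask`; `left`/`left_top` carry state across calls.
 * This mirrors the per-word steps commented in the mmxext loop (t-tl,
 * t-tl+l, median, +residual) but is a sketch, not FFmpeg's own C code. */
static void add_median_pred_int16_scalar(uint16_t *dst, const uint16_t *top,
                                         const uint16_t *diff, unsigned mask,
                                         int w, int *left, int *left_top)
{
    int l  = *left;
    int tl = *left_top;
    for (int i = 0; i < w; i++) {
        int t    = top[i];
        int pred = mid_pred3(l, t, (l + t - tl) & mask);
        l        = (pred + diff[i]) & mask;
        tl       = t;
        dst[i]   = l;
    }
    *left     = l;
    *left_top = tl;
}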