diff options
Diffstat (limited to 'libavcodec/x86/hevc_idct.asm')
-rw-r--r-- | libavcodec/x86/hevc_idct.asm | 122 |
1 files changed, 122 insertions, 0 deletions
diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
new file mode 100644
index 0000000..2edaf9a
--- /dev/null
+++ b/libavcodec/x86/hevc_idct.asm
@@ -0,0 +1,122 @@
+; /*
+; * SIMD optimized idct functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; * Copyright (c) 2014 James Almer
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+; void ff_hevc_idctNxN_dc_{8,10,12}_<opt>(int16_t *coeffs): reads coeffs[0], rounds/scales it, and overwrites the whole NxN block with that value
+; %1 = N, the block width and height (the block is NxN int16 coefficients)
+; %2 = number of loop iterations; each iteration stores 8*mmsize bytes, so %2*8*mmsize must equal N*N*2
+; %3 = bitdepth
+%macro IDCT_DC 3
+cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
+    movsx             tmpd, word [coeffq]        ; sign-extend the DC coefficient to 32 bits
+    add               tmpd, ((1 << 14-%3) + 1)   ; 32-bit add: coeff + 2^(14-bitdepth) + 1 can exceed INT16_MAX and would wrap in 16 bits
+    sar               tmpd, (15-%3)              ; arithmetic shift keeps the sign of negative DC values
+    movd               xm0, tmpd
+    SPLATW              m0, xm0                  ; broadcast the low word to every 16-bit lane of m0
+    DEFINE_ARGS coeff, cnt                       ; tmp is dead from here on; reuse its register as the loop counter
+    mov               cntd, %2
+.loop:
+    mova [coeffq+mmsize*0], m0
+    mova [coeffq+mmsize*1], m0
+    mova [coeffq+mmsize*2], m0
+    mova [coeffq+mmsize*3], m0
+    mova [coeffq+mmsize*4], m0
+    mova [coeffq+mmsize*5], m0
+    mova [coeffq+mmsize*6], m0
+    mova [coeffq+mmsize*7], m0
+    add             coeffq, mmsize*8             ; advance past the 8 vectors just written
+    dec               cntd
+    jg .loop                                     ; keep going while the counter is still positive
+    RET
+%endmacro
+
+; %1 = N, the block width and height; the whole NxN block is written by unrolled stores (4 vectors, or 8 when mmsize == 16)
+; %2 = bitdepth
+%macro IDCT_DC_NL 2 ; No loop
+cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp
+    movsx             tmpd, word [coeffq]        ; sign-extend the DC coefficient to 32 bits
+    add               tmpd, ((1 << 14-%2) + 1)   ; 32-bit add: coeff + 2^(14-bitdepth) + 1 can exceed INT16_MAX and would wrap in 16 bits
+    sar               tmpd, (15-%2)              ; arithmetic shift keeps the sign of negative DC values
+    movd               xm0, tmpd                 ; xm0 rather than m0, matching IDCT_DC
+    SPLATW              m0, xm0                  ; broadcast the low word to every 16-bit lane of m0
+    mova [coeffq+mmsize*0], m0
+    mova [coeffq+mmsize*1], m0
+    mova [coeffq+mmsize*2], m0
+    mova [coeffq+mmsize*3], m0
+%if mmsize == 16
+    mova [coeffq+mmsize*4], m0
+    mova [coeffq+mmsize*5], m0
+    mova [coeffq+mmsize*6], m0
+    mova [coeffq+mmsize*7], m0
+%endif
+    RET
+%endmacro
+
+; 8-bit
+INIT_MMX mmxext
+IDCT_DC_NL  4,      8
+IDCT_DC     8,  2,  8
+
+INIT_XMM sse2
+IDCT_DC_NL  8,      8
+IDCT_DC    16,  4,  8
+IDCT_DC    32, 16,  8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+IDCT_DC    16,  2,  8
+IDCT_DC    32,  8,  8
+%endif ;HAVE_AVX2_EXTERNAL
+
+; 10-bit
+INIT_MMX mmxext
+IDCT_DC_NL  4,     10
+IDCT_DC     8,  2, 10
+
+INIT_XMM sse2
+IDCT_DC_NL  8,     10
+IDCT_DC    16,  4, 10
+IDCT_DC    32, 16, 10
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+IDCT_DC    16,  2, 10
+IDCT_DC    32,  8, 10
+%endif ;HAVE_AVX2_EXTERNAL
+
+; 12-bit
+INIT_MMX mmxext
+IDCT_DC_NL  4,     12
+IDCT_DC     8,  2, 12
+
+INIT_XMM sse2
+IDCT_DC_NL  8,     12
+IDCT_DC    16,  4, 12
+IDCT_DC    32, 16, 12
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+IDCT_DC    16,  2, 12
+IDCT_DC    32,  8, 12
+%endif ;HAVE_AVX2_EXTERNAL