Diffstat (limited to 'libavcodec/x86/exrdsp.asm')
-rw-r--r--   libavcodec/x86/exrdsp.asm   118
1 file changed, 118 insertions(+), 0 deletions(-)
diff --git a/libavcodec/x86/exrdsp.asm b/libavcodec/x86/exrdsp.asm
new file mode 100644
index 0000000..3bf240c
--- /dev/null
+++ b/libavcodec/x86/exrdsp.asm
@@ -0,0 +1,118 @@
+;******************************************************************************
+;* x86-optimized functions for the OpenEXR decoder
+;* Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
+;*
+;* reorder_pixels, predictor based on patch by John Loy
+;* port to ASM by Jokyo Images, supported by CNC - French National Center for Cinema
+;*
+;* predictor AVX/AVX2 by Henrik Gramner
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pb_15
+cextern pb_80
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void ff_reorder_pixels(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
+;------------------------------------------------------------------------------
+
+%macro REORDER_PIXELS 0
+cglobal reorder_pixels, 3,4,3, dst, src1, size, src2
+    lea              src2q, [src1q+sizeq]   ; src2 = src + size (end of the second half)
+    add               dstq, sizeq           ; dst += size (end of dst)
+    shr              sizeq, 1               ; size = half_size
+    add              src1q, sizeq           ; src1 += half_size (end of the first half)
+    neg              sizeq                  ; negative offset for src1/src2 (2x for dst)
+.loop:
+    mova                m0, [src1q+sizeq]   ; load a block from the first half
+    movu                m1, [src2q+sizeq]   ; load a block from the second half
+    SBUTTERFLY bw, 0, 1, 2                  ; byte-interleave the two halves
+    mova [dstq+2*sizeq   ], xm0             ; store interleaved bytes 0..15
+    mova [dstq+2*sizeq+16], xm1             ; store interleaved bytes 16..31
+%if cpuflag(avx2)
+    vperm2i128          m0, m0, m1, q0301   ; combine the high lanes of m0 and m1
+    mova [dstq+2*sizeq+32], m0              ; store interleaved bytes 32..63
+%endif
+    add              sizeq, mmsize
+    jl .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+REORDER_PIXELS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+REORDER_PIXELS
+%endif
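
For readers following the asm, here is a scalar C sketch of what reorder_pixels computes (illustrative only, loosely mirroring the C fallback in libavcodec/exrdsp.c; the function name below is ours, not part of this patch):

    #include <stddef.h>
    #include <stdint.h>

    /* The decompressed EXR buffer stores all even output bytes in its first
     * half and all odd output bytes in its second half; reordering simply
     * interleaves the two halves. size is the total byte count. */
    static void reorder_pixels_scalar(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t size)
    {
        const uint8_t *low  = src;             /* first half  */
        const uint8_t *high = src + size / 2;  /* second half */
        uint8_t *stop       = dst + size;

        while (dst < stop) {
            *dst++ = *low++;   /* even output byte */
            *dst++ = *high++;  /* odd output byte  */
        }
    }

The SSE2 version does the same 16+16 bytes at a time with punpcklbw/punpckhbw (via SBUTTERFLY); the AVX2 version additionally merges the high 128-bit lanes with vperm2i128 so it can store a full 32-byte register per iteration.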
+
+
+;------------------------------------------------------------------------------
+; void ff_predictor(uint8_t *src, ptrdiff_t size);
+;------------------------------------------------------------------------------
+
+%macro PREDICTOR 0
+cglobal predictor, 2,2,5, src, size
+    mova                m0, [pb_80]         ; m0 = 0x80 in every byte (delta bias)
+    mova               xm1, [pb_15]         ; pshufb mask that broadcasts byte 15
+    mova               xm2, xm0             ; running sum, seeded with the bias
+    add               srcq, sizeq
+    neg              sizeq                  ; in place, walked with a negative offset
+.loop:
+    pxor                m3, m0, [srcq + sizeq] ; flip the top bit == subtract 128 mod 256
+    pslldq              m4, m3, 1           ; byte-wise prefix sum within each 16-byte
+    paddb               m3, m4              ; lane: add copies of the running value
+    pslldq              m4, m3, 2           ; shifted left by 1, 2, 4 and 8 bytes
+    paddb               m3, m4
+    pslldq              m4, m3, 4
+    paddb               m3, m4
+    pslldq              m4, m3, 8
+%if mmsize == 32
+    paddb               m3, m4              ; finish the per-lane prefix sums
+    paddb              xm2, xm3             ; add the running sum to the low lane
+    vextracti128       xm4, m3, 1           ; fetch the high-lane prefix sums
+    mova [srcq + sizeq], xm2                ; store output bytes 0..15
+    pshufb             xm2, xm1             ; broadcast the last low-lane output byte
+    paddb              xm2, xm4             ; and carry it into the high lane
+    mova [srcq + sizeq + 16], xm2           ; store output bytes 16..31
+%else
+    paddb               m2, m3              ; running sum + prefix sums
+    paddb               m2, m4              ; (m4 still holds the 8-byte-shift term)
+    mova [srcq + sizeq], m2                 ; store the decoded block in place
+%endif
+    pshufb             xm2, xm1             ; carry the last output byte to the next block
+    add              sizeq, mmsize
+    jl .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+PREDICTOR
+
+INIT_XMM avx
+PREDICTOR
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+PREDICTOR
+%endif
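
For completeness, a scalar C sketch of the predictor as well (again illustrative, following the shape of the C version in libavcodec/exrdsp.c; predictor_scalar is our name for it):

    #include <stddef.h>
    #include <stdint.h>

    /* In-place delta decoding: every byte after the first stores the
     * difference to the previous output byte, biased by 128. */
    static void predictor_scalar(uint8_t *src, ptrdiff_t size)
    {
        uint8_t *p    = src + 1;
        uint8_t *stop = src + size;

        while (p < stop) {
            p[0] = (uint8_t)(p[-1] + p[0] - 128);
            p++;
        }
    }

The SIMD code reaches the same result one 16- or 32-byte block at a time: XOR with pb_80 applies the -128 bias, the pslldq/paddb ladder turns the biased deltas into a byte-wise prefix sum, and pshufb with pb_15 broadcasts the last output byte of each block as the carry into the next one. The labels cglobal emits (ff_reorder_pixels_sse2, ff_predictor_ssse3, and so on) are then selected at runtime by the x86 init glue (not shown here) that fills the ExrDSPContext function pointers.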