libavfilter/x86/vf_convolution.asm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156

;*****************************************************************************
;* x86-optimized functions for convolution filter
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
half:   dd 0.5

SECTION .text

; void filter_3x3_sse4(uint8_t *dst, int width,
;                      float rdiv, float bias, const int *const matrix,
;                      const uint8_t *c[], int peak, int radius,
;                      int dstride, int stride)


%macro PROCESS_V 1
    movss m2, [matrixq + 4 * %1]
    VBROADCASTSS m2, m2
    movss m3, [c%1q + xq]
    punpcklbw m3, m6
    punpcklwd m3, m6
    pmulld m2, m3
    paddd m4, m2
%endmacro

%macro PROCESS_S 1
    movzx ptrd, byte [c%1q + xq]
    imul  ptrd, [matrixq + 4 * %1]
    add   rd, ptrd
%endmacro

%macro FILTER_3X3 0
%if UNIX64
cglobal filter_3x3, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%else
cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%endif

%if WIN64
    SWAP m0, m2
    SWAP m1, m3
    mov  r2q, matrixmp
    mov  r3q, ptrmp
    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%endif
    movsxdifnidn widthq, widthd
    VBROADCASTSS m0, m0
    VBROADCASTSS m1, m1
    pxor  m6, m6
    movss m5, [half]
    VBROADCASTSS m5, m5
    mov   c0q, [ptrq + 0*gprsize]
    mov   c1q, [ptrq + 1*gprsize]
    mov   c2q, [ptrq + 2*gprsize]
    mov   c3q, [ptrq + 3*gprsize]
    mov   c4q, [ptrq + 4*gprsize]
    mov   c5q, [ptrq + 5*gprsize]
    mov   c6q, [ptrq + 6*gprsize]
    mov   c7q, [ptrq + 7*gprsize]
    mov   c8q, [ptrq + 8*gprsize]

    xor   xq, xq
    cmp   widthq, mmsize/4
    jl .loop2

    mov   rq, widthq
    and   rq, mmsize/4-1
    sub   widthq, rq

.loop1:
    pxor m4, m4         ; sum = 0;

    PROCESS_V 0
    PROCESS_V 1
    PROCESS_V 2
    PROCESS_V 3
    PROCESS_V 4
    PROCESS_V 5
    PROCESS_V 6
    PROCESS_V 7
    PROCESS_V 8

    cvtdq2ps  m4, m4
    mulps     m4, m0     ; sum *= rdiv
    addps     m4, m1     ; sum += bias
    addps     m4, m5     ; sum += 0.5
    cvttps2dq m4, m4
    packssdw  m4, m4
    packuswb  m4, m4
    movss     [dstq + xq], m4

    add xq, mmsize/4
    cmp xq, widthq
    jl .loop1

    add widthq, rq
    cmp xq, widthq
    jge .end

.loop2:
    ; reuse r to hold sum, init with zero
    xor rd, rd

    PROCESS_S 0
    PROCESS_S 1
    PROCESS_S 2
    PROCESS_S 3
    PROCESS_S 4
    PROCESS_S 5
    PROCESS_S 6
    PROCESS_S 7
    PROCESS_S 8

    pxor      m4, m4
    cvtsi2ss  m4, rd
    mulss     m4, m0     ; sum *= rdiv
    addss     m4, m1     ; sum += bias
    addss     m4, m5     ; sum += 0.5
    ; we don't have simple scalar instructions to convert
    ; from 32bit to 8bit with saturation, so here
    ; just use packed version SSE instructions for simplicity.
    cvttps2dq m4, m4     ; trunc to integer
    packssdw  m4, m4
    packuswb  m4, m4
    movd      rd, m4
    mov       [dstq + xq], rb

    add xq, 1
    cmp xq, widthq
    jl .loop2
.end:
    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse4
FILTER_3X3
%endif