/*
* Copyright (c) 2017 Meng Wang <wangmeng.kids@bytedance.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "neon.S"
/* HEVC in-loop SAO band filter, 8-bit samples.
 * Presumed C signature (matches the register usage below — TODO confirm
 * against the arm hevcdsp init code):
 *   void ff_hevc_sao_band_filter_neon_8(uint8_t *dst, uint8_t *src,
 *                                       ptrdiff_t stride_dst,
 *                                       ptrdiff_t stride_src,
 *                                       int width, int height,
 *                                       int16_t *offset_table);
 * r0 = dst, r1 = src, r2 = dst stride, r3 = src stride; width, height and
 * the 32-entry 16-bit offset table arrive on the stack (above the 7 pushed
 * core registers = 28 bytes).
 *
 * Per pixel: band index = src >> 3 (32 bands), then
 *   dst = clip_u8(src + offset_table[index]).
 * The 16-bit table entries are gathered with two byte-wise VTBX rounds:
 * each 16-bit lane of q10 is built as (2*idx) | ((2*idx+1) << 8), i.e. the
 * byte offsets of the entry's low and high halves.  Round 1 indexes table
 * bytes 0-31 (entries 0-15, q0-q1); round 2 rebases the indices by -32 and
 * covers bytes 32-63 (entries 16-31, q2-q3).  VTBX leaves out-of-range
 * lanes untouched, so between the two rounds every lane of q12 is written.
 *
 * Outer loop "0:" walks 8-pixel-wide column strips; inner loop "8:" walks
 * rows; "4:" is the narrow path for width == 4.
 */
function ff_hevc_sao_band_filter_neon_8, export=1
        push   {r4-r10}
        ldr    r5, [sp, #28]    // r5 = width   (7 regs * 4 = 28 bytes pushed)
        ldr    r4, [sp, #32]    // r4 = height  (row counter)
        ldr    r8, [sp, #36]    // r8 = offset_table
        vpush  {d8-d15}         // d8-d15 are callee-saved and used below
        mov    r12, r4          // r12 = height, reloaded for each column strip
        mov    r6, r0           // r6 = dst column base
        mov    r7, r1           // r7 = src column base
        vldm   r8, {q0-q3}      // whole 32 x 16-bit offset table -> q0-q3 (64 B)
        vmov.u16 q15, #1        // +1 used to form the high-byte index
        vmov.u8  q14, #32       // rebase amount for look-up round 2
0:      pld    [r1]
        vld1.8 {d16}, [r1], r3  // first row of this strip: 8 src pixels
        cmp    r5, #4
        beq    4f               // width == 4 -> narrow tail path
// 8-pixel-wide row loop.  The NEON ops below do not touch the ARM flags,
// so the trailing "bne 8b" still tests the result of this "subs".
8:      subs   r4, #1
        vshr.u8 d17, d16, #3    // index = src >> 3 (band index, 0-31)
        vshll.u8 q9, d17, #1    // lowIndex = 2*index (byte offset, low half)
        vadd.u16 q11, q9, q15   // 2*index + 1 (byte offset, high half)
        vshl.u16 q10, q11, #8   // move high-byte index into the upper byte
        vadd.u16 q10, q9        // q10 lanes = (2*idx) | ((2*idx+1) << 8)
        // Look-up Table Round 1; entry range 0-15 (table bytes 0-31)
        vtbx.8 d24, {q0-q1}, d20
        vtbx.8 d25, {q0-q1}, d21
        // Look-up Table Round 2; entry range 16-31 (table bytes 32-63)
        vsub.u8 q10, q14        // rebase by -32; round-1 hits go out of range
        vtbx.8 d24, {q2-q3}, d20
        vtbx.8 d25, {q2-q3}, d21
        vaddw.u8 q13, q12, d16  // src (widened to s16) + 16-bit offset
        vqmovun.s16 d8, q13     // saturate back to u8
        vst1.8 d8, [r0], r2
        vld1.8 {d16}, [r1], r3  // preload next row
        bne    8b
        subs   r5, #8           // strip done; any columns left?
        beq    99f
        mov    r4, r12          // reset row counter
        add    r6, #8           // advance column bases by 8 pixels
        mov    r0, r6
        add    r7, #8
        mov    r1, r7
        b      0b
// width == 4 path: identical computation, only 4 lanes loaded/stored.
4:      subs   r4, #1
        vshr.u8 d17, d16, #3    // index = src >> 3
        vshll.u8 q9, d17, #1    // lowIndex = 2*index
        vadd.u16 q11, q9, q15   // 2*index + 1
        vshl.u16 q10, q11, #8   // high-byte index into upper byte
        vadd.u16 q10, q9        // combine low and high byte indices
        // Look-up Table Round 1; entry range 0-15
        vtbx.8 d24, {q0-q1}, d20
        vtbx.8 d25, {q0-q1}, d21
        // Look-up Table Round 2; entry range 16-31
        vsub.u8 q10, q14        // rebase by -32 for the second table half
        vtbx.8 d24, {q2-q3}, d20
        vtbx.8 d25, {q2-q3}, d21
        vaddw.u8 q13, q12, d16  // src + offset
        vqmovun.s16 d14, q13    // saturate to u8
        vst1.32 d14[0], [r0], r2
        vld1.32 {d16[0]}, [r1], r3
        bne    4b
        b      99f              // fall-through target; kept for clarity
99:
        vpop   {d8-d15}
        pop    {r4-r10}
        bx     lr
endfunc
/* HEVC in-loop SAO edge filter, 8-bit samples.
 * Presumed C signature (matches the register usage below — TODO confirm
 * against the arm hevcdsp init code):
 *   void ff_hevc_sao_edge_filter_neon_8(uint8_t *dst, uint8_t *src,
 *                                       ptrdiff_t stride_dst,
 *                                       ptrdiff_t stride_src,
 *                                       int width, int height,
 *                                       int a_stride, int b_stride,
 *                                       int16_t *sao_offset_val,
 *                                       uint8_t *edge_idx);
 * r0 = dst, r1 = src, r2 = dst stride, r3 = src stride; the remaining
 * arguments arrive on the stack (above the 8 pushed core regs = 32 bytes).
 *
 * Per pixel:
 *   e = 2 + sign(src[x] - src[x + a_stride])
 *         + sign(src[x] - src[x + b_stride])          // e in 0..4
 *   dst = clip_u8(src + sao_offset_val[edge_idx[e]])
 * sign() is computed branch-free: VCGT yields 0xFF where greater (then
 * >> 7 gives +1); VCLT yields 0xFF (= -1 as s8) where less; their sum is
 * +1 / -1 / 0 per lane.
 *
 * NOTE(review): r10/r11 first carry the sao_offset_val/edge_idx pointers,
 * then are reused as the a_stride/b_stride row pointers once both tables
 * sit in d0/q1.  The VTBX destinations (d9, q11) may hold stale data, but
 * every index produced here is in range (0-4 into d0, 0-9 into q1), so all
 * lanes get overwritten.
 */
function ff_hevc_sao_edge_filter_neon_8, export=1
        push   {r4-r11}
        ldr    r5, [sp, #32]    // r5 = width   (8 regs * 4 = 32 bytes pushed)
        ldr    r4, [sp, #36]    // r4 = height  (row counter)
        ldr    r8, [sp, #40]    // r8 = a_stride (offset to first neighbour)
        ldr    r9, [sp, #44]    // r9 = b_stride (offset to second neighbour)
        ldr    r10, [sp, #48]   // r10 = sao_offset_val (temporarily)
        ldr    r11, [sp, #52]   // r11 = edge_idx (temporarily)
        vpush  {d8-d15}         // d8-d15 are callee-saved and used below
        mov    r12, r4          // r12 = height, reloaded per column strip
        mov    r6, r0           // r6 = dst column base
        mov    r7, r1           // r7 = src column base
        vld1.8 {d0}, [r11]      // edge_idx table -> d0 (5 x 8-bit used)
        vld1.16 {q1}, [r10]     // sao_offset_val table -> q1 (5 x 16-bit used)
        vmov.u8 d1, #2          // bias: e = diff0 + diff1 + 2
        vmov.u16 q2, #1         // +1 used to form the high-byte index
0:      mov    r10, r1
        add    r10, r8          // r10 -> src[x + a_stride]
        mov    r11, r1
        add    r11, r9          // r11 -> src[x + b_stride]
        pld    [r1]
        vld1.8 {d16}, [r1], r3  // src[x], 8 pixels
        vld1.8 {d17}, [r10], r3 // src[x + a_stride]
        vld1.8 {d18}, [r11], r3 // src[x + b_stride]
        cmp    r5, #4
        beq    4f               // width == 4 -> narrow tail path
// 8-pixel-wide row loop.  The NEON ops below do not touch the ARM flags,
// so the trailing "bne 8b" still tests the result of this "subs".
8:      subs   r4, #1
        vcgt.u8 d8, d16, d17
        vshr.u8 d9, d8, #7      // +1 where src > src[a]
        vclt.u8 d8, d16, d17    // 0xFF (= -1) where src < src[a]
        vadd.u8 d8, d9          // diff0 = sign(src - src[a])
        vcgt.u8 d10, d16, d18
        vshr.u8 d11, d10, #7    // +1 where src > src[b]
        vclt.u8 d10, d16, d18   // -1 where src < src[b]
        vadd.u8 d10, d11        // diff1 = sign(src - src[b])
        vadd.s8 d8, d10
        vadd.s8 d8, d1          // e = diff0 + diff1 + 2, in 0..4
        vtbx.8 d9, {d0}, d8     // d9 = edge_idx[e] (category remap)
        vshll.u8 q6, d9, #1     // 2*i: byte offset of offset low half
        vadd.u16 q7, q6, q2     // 2*i + 1: byte offset of high half
        vshl.u16 q10, q7, #8    // move high-byte index into upper byte
        vadd.u16 q10, q6        // q10 lanes = (2*i) | ((2*i+1) << 8)
        vtbx.8 d22, {q1}, d20   // gather per-pixel 16-bit sao_offset_val
        vtbx.8 d23, {q1}, d21
        vaddw.u8 q12, q11, d16  // src (widened to s16) + offset
        vqmovun.s16 d26, q12    // saturate back to u8
        vst1.8 d26, [r0], r2
        vld1.8 {d16}, [r1], r3  // next row: src[x]
        vld1.8 {d17}, [r10], r3 // src[x + a_stride]
        vld1.8 {d18}, [r11], r3 // src[x + b_stride]
        bne    8b
        subs   r5, #8           // strip done; any columns left?
        beq    99f
        mov    r4, r12          // reset row counter
        add    r6, #8           // advance column bases by 8 pixels
        mov    r0, r6
        add    r7, #8
        mov    r1, r7
        b      0b
// width == 4 path: identical computation, only 4 lanes loaded/stored.
4:      subs   r4, #1
        vcgt.u8 d8, d16, d17
        vshr.u8 d9, d8, #7      // +1 where src > src[a]
        vclt.u8 d8, d16, d17    // -1 where src < src[a]
        vadd.u8 d8, d9          // diff0
        vcgt.u8 d10, d16, d18
        vshr.u8 d11, d10, #7
        vclt.u8 d10, d16, d18
        vadd.u8 d10, d11        // diff1
        vadd.s8 d8, d10
        vadd.s8 d8, d1          // e = diff0 + diff1 + 2
        vtbx.8 d9, {d0}, d8     // edge_idx[e]
        vshll.u8 q6, d9, #1     // 2*i
        vadd.u16 q7, q6, q2     // 2*i + 1
        vshl.u16 q10, q7, #8    // high-byte index into upper byte
        vadd.u16 q10, q6        // combined byte-pair indices
        vtbx.8 d22, {q1}, d20   // gather 16-bit offsets
        vtbx.8 d23, {q1}, d21
        vaddw.u8 q12, q11, d16  // src + offset
        vqmovun.s16 d26, q12    // saturate to u8
        vst1.32 d26[0], [r0], r2
        vld1.32 {d16[0]}, [r1], r3
        vld1.32 {d17[0]}, [r10], r3 // src[x + a_stride]
        vld1.32 {d18[0]}, [r11], r3 // src[x + b_stride]
        bne    4b
        b      99f              // fall-through target; kept for clarity
99:
        vpop   {d8-d15}
        pop    {r4-r11}
        bx     lr
endfunc