libswscale/arm/rgb2yuv_neon_common.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291

/*
 * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * alias name, tgt, set=1
 *
 * Bind or release an assembler register alias.
 *   name - alias identifier
 *   tgt  - register (or previously defined alias) to bind to;
 *          unused when the alias is being released
 *   set  - zero:               emit ".unreq name"
 *          non-zero (default): emit "name .req tgt"
 */
.macro alias name, tgt, set=1
.if \set == 0
    .unreq  \name
.else
    \name   .req    \tgt
.endif
.endm

/*
 * The %(expr) argument form used below is only available in alternate
 * macro mode, so it is switched on for this definition and its single
 * invocation, then switched off again.
 */
.altmacro

/*
 * alias_dw_all qw, dw_l, dw_h
 *
 * Defines q<qw>_l as an alias of d<dw_l> and q<qw>_h as an alias of
 * d<dw_h>, then recurses with qw + 1, dw_l + 2, dw_h + 2 for as long as
 * qw < 15.
 */
.macro alias_dw_all qw, dw_l, dw_h
    alias   q\qw\()_l, d\dw_l
    alias   q\qw\()_h, d\dw_h
    .if \qw < 15
        alias_dw_all  %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
    .endif
.endm

/* Started at (0, 0, 1): q0_l = d0, q0_h = d1, ..., q15_l = d30, q15_h = d31. */
alias_dw_all    0, 0, 1

.noaltmacro

/*
 * alias_qw name, qw, set=1
 *
 * Bind (set != 0) or release (set == 0) three aliases at once:
 *   <name>_l -> <qw>_l   (low half of the quad register)
 *   <name>_h -> <qw>_h   (high half of the quad register)
 *   <name>   -> <qw>     (the quad register itself)
 * The <qw>_l / <qw>_h names are the ones created by alias_dw_all.
 */
.macro alias_qw     name, qw, set=1
    alias   \name\()_l, \qw\()_l, \set
    alias   \name\()_h, \qw\()_h, \set
    alias   \name\(), \qw, \set
.endm

/*
 * prologue
 *
 * Function entry: saves r4-r12 and lr (10 words = 40 bytes), then the
 * NEON registers d8-d15, i.e. q4-q7 (64 bytes).  load_arg depends on this
 * exact 104-byte frame, so the two must be changed together.
 */
.macro prologue
    push            {r4-r12, lr}
    vpush           {d8-d15}
.endm

/*
 * epilogue
 *
 * Function exit: restores what prologue saved, in reverse order.  The
 * NEON registers d8-d15 (q4-q7) come back first, then r4-r12; the saved
 * lr is popped straight into pc, which performs the return.
 */
.macro epilogue
    vpop            {d8-d15}
    pop             {r4-r12, pc}
.endm

/*
 * load_arg reg, ix
 *
 * Loads the ix-th (0-based, ix >= 4) function argument from the stack
 * into reg.  Must be used after prologue and before sp moves again.
 *
 * Offset = frame pushed by prologue + slot of the argument:
 *   4 * 16      four quad registers saved by vpush
 *   10 * 4      ten core registers saved by push
 *   4 * (ix-4)  arguments 0-3 are not on the stack, so slot 0 is ix == 4
 */
.macro  load_arg    reg, ix
    ldr     \reg,   [sp, #((4 * 16 + 10 * 4) + 4 * (\ix - 4))]
.endm


/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma,
 *                  int width, int height,
 *                  int y_stride, int c_stride, int src_stride,
 *                  int32_t coeff_table[9]);
 *
 * alias_loop_420sp set=1
 *
 * Binds (set != 0) or releases (set == 0) the core-register names used
 * by loop_420sp:
 *
 *   r0   src, src0          r7   src_stride
 *   r1   y, y0              r8   y0_end
 *   r2   chroma             r9   src_padding
 *   r3   width, header      r10  y_padding
 *   r4   height             r11  src1
 *   r5   y_stride           r12  y1, coeff_table
 *   r6   c_stride, c_padding
 *
 * Names listed on the same register share it, so writing one overwrites
 * the other (header/width, c_padding/c_stride, y1/coeff_table).
 * An alias defined in terms of another alias (e.g. src0 -> src) must stay
 * below the line that defines its target.
 */
.macro  alias_loop_420sp set=1
    alias   src,        r0, \set
    alias   src0,       src, \set
    alias   y,          r1, \set
    alias   y0,         y, \set
    alias   chroma,     r2, \set
    alias   width,      r3, \set
    alias   header,     width, \set

    alias   height,     r4, \set
    alias   y_stride,   r5, \set
    alias   c_stride,   r6, \set
    alias   c_padding,  c_stride, \set
    alias   src_stride, r7, \set

    alias   y0_end,     r8, \set

    alias   src_padding,r9, \set
    alias   y_padding,  r10, \set

    alias   src1,       r11, \set
    alias   y1,         r12, \set

    alias   coeff_table,r12, \set
.endm


/*
 * loop_420sp s_fmt, d_fmt, init, kernel, precision
 *
 * Emits the exported function <s_fmt>_to_<d_fmt>_neon_<precision>.  The
 * function walks the image two rows at a time and invokes the kernel
 * macro on consecutive 16-pixel column blocks of each row pair.
 *
 *   s_fmt, d_fmt - format names; they form the symbol name and are passed
 *                  through to the kernel
 *   init         - macro invoked once as "init coeff_table"
 *   kernel       - macro invoked as
 *                  "kernel s_fmt, d_fmt, src0, src1, y0, y1, chroma[, count]"
 *   precision    - suffix appended to the symbol name
 *
 * Register roles are those bound by alias_loop_420sp.
 *
 * Per row pair:
 *   - if (width & 15) != 0 the kernel runs once with count = width & 15,
 *     which moves the pointers on by that many pixels only;
 *   - the kernel then runs without count (full 16-pixel step) until y0
 *     reaches y0_end.
 *
 * NOTE(review): the full-step kernel always runs at least once per row
 * pair, so width < 16 touches more than one 16-pixel block; callers are
 * presumably expected to guarantee width >= 16 -- confirm.
 */
.macro  loop_420sp s_fmt, d_fmt, init, kernel, precision

function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
    prologue

    alias_loop_420sp

    /* arguments 4..8 are fetched from the stack */
    load_arg    height,         4
    load_arg    y_stride,       5
    load_arg    c_stride,       6
    load_arg    src_stride,     7
    load_arg    coeff_table,    8

    /* coeff_table shares r12 with y1, so it must be consumed before y1 is set */
    \init       coeff_table

    /* distance from the end of one row to the start of the next */
    sub         y_padding,      y_stride,       width
    sub         c_padding,      c_stride,       width
    sub         src_padding,    src_stride,     width, LSL #2

    add         y0_end,         y0,             width
    /* header shares r3 with width: width is not available past this point */
    and         header,         width,          #15

    add         y1,             y0,             y_stride
    add         src1,           src0,           src_stride

0:
    /* partial leading block, only when width is not a multiple of 16 */
    cmp         header,     #0
    beq         1f

    \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header

1:
    \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma

    /* y0 and y0_end are addresses: compare unsigned (blo), not signed (blt),
     * so a row that straddles 0x80000000 is still processed completely */
    cmp         y0,         y0_end
    blo         1b
2:
    /* y1/src1 now point at the end of the second row; step to the next pair */
    add         y0,         y1,         y_padding
    add         y0_end,     y1,         y_stride
    add         chroma,     chroma,     c_padding
    add         src0,       src1,       src_padding

    add         y1,         y0,         y_stride
    add         src1,       src0,       src_stride

    subs        height,     height,     #2

    bgt         0b

    epilogue

    alias_loop_420sp 0

endfunc
.endm

/*
 * downsample
 *
 * Pairwise add-long (vpaddl.u8) of each colour channel: adjacent unsigned
 * bytes of r8x16 / g8x16 / b8x16 are summed into the 16-bit lanes of
 * r16x8 / g16x8 / b16x8, overwriting their previous contents.
 * The r16x8 / g16x8 / b16x8 aliases are not defined in this file; they
 * must be bound by the file that uses these macros.
 */
.macro downsample
    vpaddl.u8   r16x8,  r8x16
    vpaddl.u8   g16x8,  g8x16
    vpaddl.u8   b16x8,  b8x16
.endm


/* accumulate and right shift by 2
 *
 * vpadal.u8 adds the sums of adjacent byte pairs of r8x16 / g8x16 / b8x16
 * onto the existing 16-bit lanes of r16x8 / g16x8 / b16x8; vrshr.u16 then
 * applies a rounding right shift by 2 to every lane.
 * Used after downsample on the preceding row (as kernel_420_16x2 does),
 * each lane holds the sum of four samples before the shift.
 */
.macro downsample_ars2
    vpadal.u8   r16x8,  r8x16
    vpadal.u8   g16x8,  g8x16
    vpadal.u8   b16x8,  b8x16

    vrshr.u16   r16x8,  r16x8,  #2
    vrshr.u16   g16x8,  g16x8,  #2
    vrshr.u16   b16x8,  b16x8,  #2
.endm

/*
 * store_y8_16x1 dst, count
 *
 * Stores the quad register y8x16 (16 bytes) at [dst].
 *   dst   - destination pointer register, updated by the macro
 *   count - optional register; when given, dst moves on by count bytes
 *           after the store, otherwise by the 16 bytes written
 */
.macro store_y8_16x1            dst, count
.ifnc "\count",""
    /* explicit step: store without writeback, then add count */
    vstmia      \dst,   {y8x16}
    add         \dst,   \dst,           \count
.else
    /* default step: writeback moves dst past the stored register */
    vstmia      \dst!,  {y8x16}
.endif
.endm

/*
 * store_chroma_nv12_8x1 dst, count
 *
 * Stores u8x8 and v8x8 interleaved (U first) at [dst] with vst2.
 *   dst   - destination pointer register, post-incremented
 *   count - optional register; when given, dst is post-incremented by
 *           count, otherwise by the number of bytes written
 */
.macro store_chroma_nv12_8x1    dst, count
.ifnc "\count",""
    vst2.i8     {u8x8, v8x8},   [\dst], \count
.else
    vst2.i8     {u8x8, v8x8},   [\dst]!
.endif
.endm

/*
 * store_chroma_nv21_8x1 dst, count
 *
 * Stores v8x8 and u8x8 interleaved (V first) at [dst] with vst2.
 *   dst   - destination pointer register, post-incremented
 *   count - optional register; when given, dst is post-incremented by
 *           count, otherwise by the number of bytes written
 */
.macro store_chroma_nv21_8x1    dst, count
.ifnc "\count",""
    vst2.i8     {v8x8, u8x8},   [\dst], \count
.else
    vst2.i8     {v8x8, u8x8},   [\dst]!
.endif
.endm

/*
 * load_8888_16x1 a, b, c, d, src, count
 *
 * Loads 16 four-byte pixels from [src] and de-interleaves them with two
 * vld4.8 loads into the channel registers <a>8x16, <b>8x16, <c>8x16 and
 * <d>8x16 (low halves receive pixels 0-7, high halves pixels 8-15).
 *   a, b, c, d - channel name prefixes, in memory byte order
 *   src        - source pointer register, updated by the macro
 *   count      - optional register holding a pixel count; when given, src
 *                ends up count * 4 bytes past its initial value instead
 *                of the full 64 bytes
 */
.macro load_8888_16x1   a, b, c, d, src, count
    /* pixels 0-7: same instruction for both variants, src += 32 */
    vld4.8      {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l},  [\src]!
.ifc "\count",""
    /* pixels 8-15, src += 32 */
    vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h},  [\src]!
.else
    /* pixels 8-15 without writeback, then rewind and step by count pixels */
    vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h},  [\src]
    sub         \src,   \src,   #32
    add         \src,   \src,   \count, LSL #2
.endif
.endm

/*
 * load_rgbx_16x1 src, count
 *
 * load_8888_16x1 with the channel order r, g, b, x: the four bytes of
 * each pixel go to r8x16, g8x16, b8x16 and x8x16 respectively.
 * src and the optional count are passed through unchanged.
 */
.macro load_rgbx_16x1   src, count
    load_8888_16x1  r, g, b, x, \src, \count
.endm

/*
 * load_bgrx_16x1 src, count
 *
 * load_8888_16x1 with the channel order b, g, r, x: the four bytes of
 * each pixel go to b8x16, g8x16, r8x16 and x8x16 respectively.
 * src and the optional count are passed through unchanged.
 */
.macro load_bgrx_16x1   src, count
    load_8888_16x1  b, g, r, x, \src, \count
.endm

/*
 * alias_src_rgbx set
 *
 * Binds the source channel aliases through alias_src_8888 in the order
 * r, g, b, x.  Invoked without an argument it defines the aliases (as
 * kernel_420_16x2 does on entry); with 0 it releases them.
 */
.macro alias_src_rgbx   set
    alias_src_8888  r, g, b, x, \set
.endm

/*
 * alias_src_bgrx set
 *
 * Binds the source channel aliases through alias_src_8888 in the order
 * b, g, r, x.  Invoked without an argument it defines the aliases (as
 * kernel_420_16x2 does on entry); with 0 it releases them.
 */
.macro alias_src_bgrx   set
    alias_src_8888  b, g, r, x, \set
.endm

/*
 * alias_dst_nv12 set
 *
 * Binds u8x8 to c8x8x2_l and v8x8 to c8x8x2_h; with set == 0 both aliases
 * are released instead.  c8x8x2_l / c8x8x2_h are not defined in this
 * file and must be bound by the file that uses these macros.
 */
.macro alias_dst_nv12   set
    alias   u8x8, c8x8x2_l, \set
    alias   v8x8, c8x8x2_h, \set
.endm

/*
 * alias_dst_nv21 set
 *
 * Binds v8x8 to c8x8x2_l and u8x8 to c8x8x2_h (the reverse of
 * alias_dst_nv12); with set == 0 both aliases are released instead.
 * c8x8x2_l / c8x8x2_h are not defined in this file and must be bound by
 * the file that uses these macros.
 */
.macro alias_dst_nv21   set
    alias   v8x8, c8x8x2_l, \set
    alias   u8x8, c8x8x2_h, \set
.endm


// common aliases
//
// d0-d2 hold one vector per input channel (CO_R, CO_G, CO_B); the signed
// 16-bit lanes 0, 1 and 2 of each are additionally named *Y, *U and *V.
// d3 is shared by BIAS_U and BIAS_V, q2 is BIAS_Y.

alias   CO_R,   d0
alias   CO_G,   d1
alias   CO_B,   d2

// lane 0 of each channel vector
CO_RY   .dn     d0.s16[0]
CO_GY   .dn     d1.s16[0]
CO_BY   .dn     d2.s16[0]

// lane 1 of each channel vector
CO_RU   .dn     d0.s16[1]
CO_GU   .dn     d1.s16[1]
CO_BU   .dn     d2.s16[1]

// lane 2 of each channel vector
CO_RV   .dn     d0.s16[2]
CO_GV   .dn     d1.s16[2]
CO_BV   .dn     d2.s16[2]

alias   BIAS_Y, q2

// BIAS_V is bound through BIAS_U, so BIAS_U has to be defined first
alias   BIAS_U, d3
alias   BIAS_V, BIAS_U


/* q3-q6 R8G8B8X8 x16
 *
 * alias_src_8888 a, b, c, d, set
 *
 * Binds (or, with set == 0, releases) the channel aliases <a>8x16,
 * <b>8x16, <c>8x16 and <d>8x16 to q3, q4, q5 and q6 in that order.
 * alias_qw also creates the matching _l / _h half-register aliases.
 */

.macro alias_src_8888   a, b, c, d, set
    alias_qw  \a\()8x16, q3, \set
    alias_qw  \b\()8x16, q4, \set
    alias_qw  \c\()8x16, q5, \set
    alias_qw  \d\()8x16, q6, \set
.endm

/*
 * kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
 *
 * Converts one block of 16 pixels x 2 rows.
 *   rgb_fmt    - source format name; selects alias_src_<rgb_fmt> and
 *                load_<rgb_fmt>_16x1
 *   yuv_fmt    - destination format name; selects alias_dst_<yuv_fmt> and
 *                store_chroma_<yuv_fmt>_8x1
 *   rgb0, rgb1 - source pointer registers for the two rows
 *   y0, y1     - luma destination pointer registers for the two rows
 *   chroma     - chroma destination pointer register
 *   count      - optional register; passed to every load/store so the
 *                pointers advance by count pixels instead of a full block
 *
 * Sequence: load row 0, start the pairwise channel sums (downsample),
 * compute and store its luma; load row 1, finish the sums
 * (downsample_ars2), compute and store its luma; then compute both chroma
 * planes and store them interleaved.
 *
 * compute_y_16x1 and compute_chroma_8x1 are not defined in this file and
 * must be provided by the file that uses this macro.
 * The source and destination aliases are bound on entry and released on
 * exit, so they exist only inside the expansion.
 */
.macro kernel_420_16x2  rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
    alias_src_\rgb_fmt
    alias_dst_\yuv_fmt

    load_\rgb_fmt\()_16x1   \rgb0, \count

    downsample
    compute_y_16x1
    store_y8_16x1   \y0, \count


    load_\rgb_fmt\()_16x1   \rgb1, \count
    downsample_ars2
    compute_y_16x1
    store_y8_16x1   \y1, \count

    compute_chroma_8x1  u, U
    compute_chroma_8x1  v, V

    store_chroma_\yuv_fmt\()_8x1 \chroma, \count

    alias_dst_\yuv_fmt 0
    alias_src_\rgb_fmt 0
.endm