summaryrefslogtreecommitdiffstats
path: root/libavcodec
diff options
context:
space:
mode:
authorMichael Niedermayer <michaelni@gmx.at>2011-07-02 03:07:06 +0200
committerMichael Niedermayer <michaelni@gmx.at>2011-07-02 03:24:32 +0200
commit3074f03a074de3aab79639d261cbd0ccc265b5b4 (patch)
tree9710041e852ee69f6de6ef6e6333af82f6ca5931 /libavcodec
parent392acaedcb052fa64386d5d0aea4931386f72d64 (diff)
parent23ce6e72123a40895baaeefeb27c7c18748bd67e (diff)
downloadffmpeg-streaming-3074f03a074de3aab79639d261cbd0ccc265b5b4.zip
ffmpeg-streaming-3074f03a074de3aab79639d261cbd0ccc265b5b4.tar.gz
Merge remote-tracking branch 'qatar/master'
* qatar/master: get_bits: remove x86 inline asm in A32 bitstream reader doc: Remove outdated information about our issue tracker avidec: Factor out the sync functionality. fate-aac: Expand coverage. ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents(). ac3dsp: simplify extract_exponents() now that it does not need to do clipping. ac3enc: clip coefficients after MDCT. ac3enc: add int32_t array clipping function to DSPUtil, including x86 versions. swscale: for >8bit scaling, read in native bit-depth. matroskadec: matroska_read_seek after EBML_STOP leads to failure. doxygen: fix usage of @file directive in libavutil/{dict,file}.h doxygen: Help doxygen parser to understand the DECLARE_ALIGNED and offsetof macros Conflicts: doc/issue_tracker.txt libavformat/avidec.c libavutil/dict.h libswscale/swscale.c libswscale/utils.c tests/ref/lavfi/pixfmts_scale Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
-rw-r--r--libavcodec/ac3dsp.c15
-rw-r--r--libavcodec/ac3enc.h4
-rw-r--r--libavcodec/ac3enc_fixed.c9
-rw-r--r--libavcodec/ac3enc_float.c9
-rw-r--r--libavcodec/ac3enc_template.c16
-rw-r--r--libavcodec/dsputil.c17
-rw-r--r--libavcodec/dsputil.h16
-rw-r--r--libavcodec/get_bits.h8
-rw-r--r--libavcodec/x86/ac3dsp.asm102
-rw-r--r--libavcodec/x86/ac3dsp_mmx.c9
-rw-r--r--libavcodec/x86/dsputil_mmx.c23
-rw-r--r--libavcodec/x86/dsputil_yasm.asm115
12 files changed, 318 insertions, 25 deletions
diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index 619addc..96bd123 100644
--- a/libavcodec/ac3dsp.c
+++ b/libavcodec/ac3dsp.c
@@ -164,21 +164,8 @@ static void ac3_extract_exponents_c(uint8_t *exp, int32_t *coef, int nb_coefs)
int i;
for (i = 0; i < nb_coefs; i++) {
- int e;
int v = abs(coef[i]);
- if (v == 0)
- e = 24;
- else {
- e = 23 - av_log2(v);
- if (e >= 24) {
- e = 24;
- coef[i] = 0;
- } else if (e < 0) {
- e = 0;
- coef[i] = av_clip(coef[i], -16777215, 16777215);
- }
- }
- exp[i] = e;
+ exp[i] = v ? 23 - av_log2(v) : 24;
}
}
diff --git a/libavcodec/ac3enc.h b/libavcodec/ac3enc.h
index 5f5d2c2..be2767a 100644
--- a/libavcodec/ac3enc.h
+++ b/libavcodec/ac3enc.h
@@ -50,12 +50,16 @@
#if CONFIG_AC3ENC_FLOAT
#define AC3_NAME(x) ff_ac3_float_ ## x
#define MAC_COEF(d,a,b) ((d)+=(a)*(b))
+#define COEF_MIN (-16777215.0/16777216.0)
+#define COEF_MAX ( 16777215.0/16777216.0)
typedef float SampleType;
typedef float CoefType;
typedef float CoefSumType;
#else
#define AC3_NAME(x) ff_ac3_fixed_ ## x
#define MAC_COEF(d,a,b) MAC64(d,a,b)
+#define COEF_MIN -16777215
+#define COEF_MAX 16777215
typedef int16_t SampleType;
typedef int32_t CoefType;
typedef int64_t CoefSumType;
diff --git a/libavcodec/ac3enc_fixed.c b/libavcodec/ac3enc_fixed.c
index b189609..cbe92e1 100644
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@@ -104,6 +104,15 @@ static void scale_coefficients(AC3EncodeContext *s)
}
+/**
+ * Clip MDCT coefficients to allowable range.
+ */
+static void clip_coefficients(DSPContext *dsp, int32_t *coef, unsigned int len)
+{
+ dsp->vector_clip_int32(coef, coef, COEF_MIN, COEF_MAX, len);
+}
+
+
static av_cold int ac3_fixed_encode_init(AVCodecContext *avctx)
{
AC3EncodeContext *s = avctx->priv_data;
diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
index 7d01b18..e21b99d 100644
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -111,6 +111,15 @@ static void scale_coefficients(AC3EncodeContext *s)
}
+/**
+ * Clip MDCT coefficients to allowable range.
+ */
+static void clip_coefficients(DSPContext *dsp, float *coef, unsigned int len)
+{
+ dsp->vector_clipf(coef, coef, COEF_MIN, COEF_MAX, len);
+}
+
+
#if CONFIG_AC3_ENCODER
AVCodec ff_ac3_float_encoder = {
"ac3_float",
diff --git a/libavcodec/ac3enc_template.c b/libavcodec/ac3enc_template.c
index 85eea54..c7243c7 100644
--- a/libavcodec/ac3enc_template.c
+++ b/libavcodec/ac3enc_template.c
@@ -41,6 +41,8 @@ static void apply_window(DSPContext *dsp, SampleType *output,
static int normalize_samples(AC3EncodeContext *s);
+static void clip_coefficients(DSPContext *dsp, CoefType *coef, unsigned int len);
+
int AC3_NAME(allocate_sample_buffers)(AC3EncodeContext *s)
{
@@ -171,8 +173,8 @@ static void apply_channel_coupling(AC3EncodeContext *s)
cpl_coef[i] += ch_coef[i];
}
- /* coefficients must be clipped to +/- 1.0 in order to be encoded */
- s->dsp.vector_clipf(cpl_coef, cpl_coef, -1.0f, 1.0f, num_cpl_coefs);
+ /* coefficients must be clipped in order to be encoded */
+ clip_coefficients(&s->dsp, cpl_coef, num_cpl_coefs);
/* scale coupling coefficients from float to 24-bit fixed-point */
s->ac3dsp.float_to_fixed24(&block->fixed_coef[CPL_CH][cpl_start],
@@ -300,6 +302,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)
if (!block->cpl_in_use || !block->new_cpl_coords)
continue;
+ clip_coefficients(&s->dsp, cpl_coords[blk][1], s->fbw_channels * 16);
s->ac3dsp.float_to_fixed24(fixed_cpl_coords[blk][1],
cpl_coords[blk][1],
s->fbw_channels * 16);
@@ -433,7 +436,11 @@ int AC3_NAME(encode_frame)(AVCodecContext *avctx, unsigned char *frame,
apply_mdct(s);
- scale_coefficients(s);
+ if (s->fixed_point)
+ scale_coefficients(s);
+
+ clip_coefficients(&s->dsp, s->blocks[0].mdct_coef[1],
+ AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
s->cpl_on = s->cpl_enabled;
ff_ac3_compute_coupling_strategy(s);
@@ -443,6 +450,9 @@ int AC3_NAME(encode_frame)(AVCodecContext *avctx, unsigned char *frame,
compute_rematrixing_strategy(s);
+ if (!s->fixed_point)
+ scale_coefficients(s);
+
ff_ac3_apply_rematrixing(s);
ff_ac3_process_exponents(s);
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 0e596b1..bfbe12e 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2664,6 +2664,22 @@ static void apply_window_int16_c(int16_t *output, const int16_t *input,
}
}
+static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
+ int32_t max, unsigned int len)
+{
+ do {
+ *dst++ = av_clip(*src++, min, max);
+ *dst++ = av_clip(*src++, min, max);
+ *dst++ = av_clip(*src++, min, max);
+ *dst++ = av_clip(*src++, min, max);
+ *dst++ = av_clip(*src++, min, max);
+ *dst++ = av_clip(*src++, min, max);
+ *dst++ = av_clip(*src++, min, max);
+ *dst++ = av_clip(*src++, min, max);
+ len -= 8;
+ } while (len > 0);
+}
+
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -3106,6 +3122,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->scalarproduct_int16 = scalarproduct_int16_c;
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
c->apply_window_int16 = apply_window_int16_c;
+ c->vector_clip_int32 = vector_clip_int32_c;
c->scalarproduct_float = scalarproduct_float_c;
c->butterflies_float = butterflies_float_c;
c->vector_fmul_scalar = vector_fmul_scalar_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index f2054a4..401a87a 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -553,6 +553,22 @@ typedef struct DSPContext {
void (*apply_window_int16)(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
+ /**
+ * Clip each element in an array of int32_t to a given minimum and maximum value.
+ * @param dst destination array
+ * constraints: 16-byte aligned
+ * @param src source array
+ * constraints: 16-byte aligned
+ * @param min minimum value
+ * constraints: must be in the range [-(1<<24), 1<<24]
+ * @param max maximum value
+ * constraints: must be in the range [-(1<<24), 1<<24]
+ * @param len number of elements in the array
+ * constraints: multiple of 32 greater than zero
+ */
+ void (*vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min,
+ int32_t max, unsigned int len);
+
/* rv30 functions */
qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];
diff --git a/libavcodec/get_bits.h b/libavcodec/get_bits.h
index 3b09dfd..96d33b3 100644
--- a/libavcodec/get_bits.h
+++ b/libavcodec/get_bits.h
@@ -201,19 +201,11 @@ static inline void skip_bits_long(GetBitContext *s, int n){
} \
} while (0)
-#if ARCH_X86
-# define SKIP_CACHE(name, gb, num) \
- __asm__("shldl %2, %1, %0 \n\t" \
- "shll %2, %1 \n\t" \
- : "+r" (name##_cache0), "+r" (name##_cache1) \
- : "Ic" ((uint8_t)(num)))
-#else
# define SKIP_CACHE(name, gb, num) do { \
name##_cache0 <<= (num); \
name##_cache0 |= NEG_USR32(name##_cache1,num); \
name##_cache1 <<= (num); \
} while (0)
-#endif
# define SKIP_COUNTER(name, gb, num) name##_bit_count += (num)
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 99c5df3..8c958a1 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -32,6 +32,11 @@ cextern ac3_bap_bits
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
+; used in ff_ac3_extract_exponents()
+pd_1: times 4 dd 1
+pd_151: times 4 dd 151
+pb_shuf_4dwb: db 0, 4, 8, 12
+
SECTION .text
;-----------------------------------------------------------------------------
@@ -346,3 +351,100 @@ cglobal ac3_compute_mantissa_size_sse2, 1,2,4, mant_cnt, sum
movd eax, m0
add eax, sumd
RET
+
+;------------------------------------------------------------------------------
+; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
+;------------------------------------------------------------------------------
+
+%macro PABSD_MMX 2 ; src/dst, tmp
+ pxor %2, %2
+ pcmpgtd %2, %1
+ pxor %1, %2
+ psubd %1, %2
+%endmacro
+
+%macro PABSD_SSSE3 1-2 ; src/dst, unused
+ pabsd %1, %1
+%endmacro
+
+%ifdef HAVE_AMD3DNOW
+INIT_MMX
+cglobal ac3_extract_exponents_3dnow, 3,3,0, exp, coef, len
+ add expq, lenq
+ lea coefq, [coefq+4*lenq]
+ neg lenq
+ movq m3, [pd_1]
+ movq m4, [pd_151]
+.loop:
+ movq m0, [coefq+4*lenq ]
+ movq m1, [coefq+4*lenq+8]
+ PABSD_MMX m0, m2
+ PABSD_MMX m1, m2
+ pslld m0, 1
+ por m0, m3
+ pi2fd m2, m0
+ psrld m2, 23
+ movq m0, m4
+ psubd m0, m2
+ pslld m1, 1
+ por m1, m3
+ pi2fd m2, m1
+ psrld m2, 23
+ movq m1, m4
+ psubd m1, m2
+ packssdw m0, m0
+ packuswb m0, m0
+ packssdw m1, m1
+ packuswb m1, m1
+ punpcklwd m0, m1
+ movd [expq+lenq], m0
+ add lenq, 4
+ jl .loop
+ REP_RET
+%endif
+
+%macro AC3_EXTRACT_EXPONENTS 1
+cglobal ac3_extract_exponents_%1, 3,3,5, exp, coef, len
+ add expq, lenq
+ lea coefq, [coefq+4*lenq]
+ neg lenq
+ mova m2, [pd_1]
+ mova m3, [pd_151]
+%ifidn %1, ssse3 ;
+ movd m4, [pb_shuf_4dwb]
+%endif
+.loop:
+ ; move 4 32-bit coefs to xmm0
+ mova m0, [coefq+4*lenq]
+ ; absolute value
+ PABSD m0, m1
+ ; convert to float and extract exponents
+ pslld m0, 1
+ por m0, m2
+ cvtdq2ps m1, m0
+ psrld m1, 23
+ mova m0, m3
+ psubd m0, m1
+ ; move the lowest byte in each of 4 dwords to the low dword
+%ifidn %1, ssse3
+ pshufb m0, m4
+%else
+ packssdw m0, m0
+ packuswb m0, m0
+%endif
+ movd [expq+lenq], m0
+
+ add lenq, 4
+ jl .loop
+ REP_RET
+%endmacro
+
+%ifdef HAVE_SSE
+INIT_XMM
+%define PABSD PABSD_MMX
+AC3_EXTRACT_EXPONENTS sse2
+%ifdef HAVE_SSSE3
+%define PABSD PABSD_SSSE3
+AC3_EXTRACT_EXPONENTS ssse3
+%endif
+%endif
diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
index e853b88..3127570 100644
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@@ -44,6 +44,10 @@ extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned i
extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
+extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
+extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
+extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
+
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
int mm_flags = av_get_cpu_flags();
@@ -56,6 +60,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
}
if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
+ c->extract_exponents = ff_ac3_extract_exponents_3dnow;
if (!bit_exact) {
c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
}
@@ -72,6 +77,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
c->float_to_fixed24 = ff_float_to_fixed24_sse2;
c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
+ c->extract_exponents = ff_ac3_extract_exponents_sse2;
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
@@ -79,6 +85,9 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
}
if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
+ if (!(mm_flags & AV_CPU_FLAG_ATOM)) {
+ c->extract_exponents = ff_ac3_extract_exponents_ssse3;
+ }
}
#endif
}
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 78cad4c..80bb6cd 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2333,6 +2333,15 @@ int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, i
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
+void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, int32_t min,
+ int32_t max, unsigned int len);
+void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, int32_t min,
+ int32_t max, unsigned int len);
+void ff_vector_clip_int32_sse2_int(int32_t *dst, const int32_t *src, int32_t min,
+ int32_t max, unsigned int len);
+void ff_vector_clip_int32_sse41 (int32_t *dst, const int32_t *src, int32_t min,
+ int32_t max, unsigned int len);
+
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
@@ -2473,6 +2482,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
+
+ c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif
if (mm_flags & AV_CPU_FLAG_MMX2) {
@@ -2756,6 +2767,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+ if (mm_flags & AV_CPU_FLAG_ATOM) {
+ c->vector_clip_int32 = ff_vector_clip_int32_sse2_int;
+ } else {
+ c->vector_clip_int32 = ff_vector_clip_int32_sse2;
+ }
if (avctx->flags & CODEC_FLAG_BITEXACT) {
c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
} else {
@@ -2781,6 +2797,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
#endif
}
+
+ if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
+#if HAVE_YASM
+ c->vector_clip_int32 = ff_vector_clip_int32_sse41;
+#endif
+ }
+
#if HAVE_AVX && HAVE_YASM
if (mm_flags & AV_CPU_FLAG_AVX) {
if (bit_depth == 10) {
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 695aba5..1f5a4f6 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1048,3 +1048,118 @@ emu_edge sse
%ifdef ARCH_X86_32
emu_edge mmx
%endif
+
+;-----------------------------------------------------------------------------
+; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
+; int32_t max, unsigned int len)
+;-----------------------------------------------------------------------------
+
+%macro PMINSD_MMX 3 ; dst, src, tmp
+ mova %3, %2
+ pcmpgtd %3, %1
+ pxor %1, %2
+ pand %1, %3
+ pxor %1, %2
+%endmacro
+
+%macro PMAXSD_MMX 3 ; dst, src, tmp
+ mova %3, %1
+ pcmpgtd %3, %2
+ pand %1, %3
+ pandn %3, %2
+ por %1, %3
+%endmacro
+
+%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp
+ PMINSD_MMX %1, %3, %4
+ PMAXSD_MMX %1, %2, %4
+%endmacro
+
+%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
+ cvtdq2ps %1, %1
+ minps %1, %3
+ maxps %1, %2
+ cvtps2dq %1, %1
+%endmacro
+
+%macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused
+ pminsd %1, %3
+ pmaxsd %1, %2
+%endmacro
+
+%macro SPLATD_MMX 1
+ punpckldq %1, %1
+%endmacro
+
+%macro SPLATD_SSE2 1
+ pshufd %1, %1, 0
+%endmacro
+
+%macro VECTOR_CLIP_INT32 4 ; name suffix, xmm regs used, unroll count, also use m7-m10 (0/1)
+cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
+%ifidn %1, sse2
+ cvtsi2ss m4, minm ; float variant: CLIPD_SSE2 compares as floats, so convert the bounds
+ cvtsi2ss m5, maxm
+%else
+ movd m4, minm
+ movd m5, maxm
+%endif
+ SPLATD m4 ; broadcast min to every dword lane
+ SPLATD m5 ; broadcast max to every dword lane
+.loop: ; %%i = index of the first register-sized block handled by the current %rep pass
+%assign %%i 0
+%rep %3
+ mova m0, [srcq+mmsize*(0+%%i)]
+ mova m1, [srcq+mmsize*(1+%%i)]
+ mova m2, [srcq+mmsize*(2+%%i)]
+ mova m3, [srcq+mmsize*(3+%%i)]
+%if %4
+ mova m7, [srcq+mmsize*(4+%%i)]
+ mova m8, [srcq+mmsize*(5+%%i)]
+ mova m9, [srcq+mmsize*(6+%%i)]
+ mova m10, [srcq+mmsize*(7+%%i)]
+%endif
+ CLIPD m0, m4, m5, m6
+ CLIPD m1, m4, m5, m6
+ CLIPD m2, m4, m5, m6
+ CLIPD m3, m4, m5, m6
+%if %4
+ CLIPD m7, m4, m5, m6
+ CLIPD m8, m4, m5, m6
+ CLIPD m9, m4, m5, m6
+ CLIPD m10, m4, m5, m6
+%endif
+ mova [dstq+mmsize*(0+%%i)], m0
+ mova [dstq+mmsize*(1+%%i)], m1
+ mova [dstq+mmsize*(2+%%i)], m2
+ mova [dstq+mmsize*(3+%%i)], m3
+%if %4
+ mova [dstq+mmsize*(4+%%i)], m7
+ mova [dstq+mmsize*(5+%%i)], m8
+ mova [dstq+mmsize*(6+%%i)], m9
+ mova [dstq+mmsize*(7+%%i)], m10
+%endif
+%assign %%i %%i+4*(%4+1)
+%endrep
+ add srcq, mmsize*4*%3*(%4+1) ; bytes consumed: %3 passes of 4*(%4+1) registers
+ add dstq, mmsize*4*%3*(%4+1)
+ sub lend, mmsize*%3*(%4+1) ; elements consumed: each register holds mmsize/4 dwords
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX
+%define SPLATD SPLATD_MMX
+%define CLIPD CLIPD_MMX
+VECTOR_CLIP_INT32 mmx, 0, 1, 0
+INIT_XMM
+%define SPLATD SPLATD_SSE2
+VECTOR_CLIP_INT32 sse2_int, 6, 1, 0
+%define CLIPD CLIPD_SSE2
+VECTOR_CLIP_INT32 sse2, 6, 2, 0
+%define CLIPD CLIPD_SSE41
+%ifdef m8
+VECTOR_CLIP_INT32 sse41, 11, 1, 1
+%else
+VECTOR_CLIP_INT32 sse41, 6, 1, 0
+%endif
OpenPOWER on IntegriCloud