diff options
author | Oskar Arvidsson <oskar@irock.se> | 2011-05-09 11:18:37 -0400 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2011-05-10 07:24:27 -0400 |
commit | de3e760720ecc0c755cc2216ef9bf3dba39450a4 (patch) | |
tree | 90d925afd5b1bb536be6aa803d57cef16cb6514b | |
parent | 325eefa2ca14b487120b164bc232e6c650fb20f3 (diff) | |
download | ffmpeg-streaming-de3e760720ecc0c755cc2216ef9bf3dba39450a4.zip ffmpeg-streaming-de3e760720ecc0c755cc2216ef9bf3dba39450a4.tar.gz |
Preparatory patch for high bit depth h264 decoding support.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
-rw-r--r-- | libavcodec/h264dsp_template.c | 119 | ||||
-rw-r--r-- | libavcodec/h264pred_template.c | 585 |
2 files changed, 406 insertions, 298 deletions
diff --git a/libavcodec/h264dsp_template.c b/libavcodec/h264dsp_template.c index 2fe09ff..7190f4d 100644 --- a/libavcodec/h264dsp_template.c +++ b/libavcodec/h264dsp_template.c @@ -25,12 +25,19 @@ * @author Michael Niedermayer <michaelni@gmx.at> */ -#define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom ) -#define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) +#define BIT_DEPTH 8 +#define pixel uint8_t +#define av_clip_pixel av_clip_uint8 +#define FUNCC(a) a ## _c + +#define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom ) +#define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) #define H264_WEIGHT(W,H) \ -static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ +static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \ int y; \ - offset <<= log2_denom; \ + pixel *block = (pixel*)_block; \ + stride /= sizeof(pixel); \ + offset <<= (log2_denom + (BIT_DEPTH-8)); \ if(log2_denom) offset += 1<<(log2_denom-1); \ for(y=0; y<H; y++, block += stride){ \ op_scale1(0); \ @@ -54,8 +61,11 @@ static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, op_scale1(15); \ } \ } \ -static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ +static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \ int y; \ + pixel *dst = (pixel*)_dst; \ + pixel *src = (pixel*)_src; \ + stride /= sizeof(pixel); \ offset = ((offset + 1) | 1) << log2_denom; \ for(y=0; y<H; y++, dst += stride, src += stride){ \ op_scale2(0); \ @@ -95,11 +105,17 @@ H264_WEIGHT(2,2) #undef op_scale2 #undef H264_WEIGHT -static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0) +static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0) { + pixel *pix = (pixel*)_pix; int i, d; + xstride /= sizeof(pixel); + ystride /= sizeof(pixel); + alpha <<= BIT_DEPTH - 8; + beta <<= BIT_DEPTH - 8; for( i = 0; i < 4; i++ ) { - if( tc0[i] < 0 ) { + const int tc_orig = tc0[i] << (BIT_DEPTH - 8); + if( tc_orig < 0 ) { pix += inner_iters*ystride; continue; } @@ -115,44 +131,49 @@ static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, in FFABS( p1 - p0 ) < beta && FFABS( q1 - q0 ) < beta ) { - int tc = tc0[i]; + int tc = tc_orig; int i_delta; if( FFABS( p2 - p0 ) < beta ) { - if(tc0[i]) - pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] ); + if(tc_orig) + pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc_orig, tc_orig ); tc++; } if( FFABS( q2 - q0 ) < beta ) { - if(tc0[i]) - pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] ); + if(tc_orig) + pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc_orig, tc_orig ); tc++; } i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); - pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */ - pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */ + pix[-xstride] = av_clip_pixel( p0 + i_delta ); /* p0' */ + pix[0] = av_clip_pixel( q0 - i_delta ); /* q0' */ } pix += ystride; } } } -static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +static void FUNCC(h264_v_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { - h264_loop_filter_luma_c(pix, stride, 1, 4, alpha, beta, tc0); + FUNCC(h264_loop_filter_luma)(pix, stride, sizeof(pixel), 4, alpha, beta, tc0); } -static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +static void FUNCC(h264_h_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { - h264_loop_filter_luma_c(pix, 1, stride, 4, alpha, beta, tc0); + FUNCC(h264_loop_filter_luma)(pix, sizeof(pixel), stride, 4, alpha, beta, tc0); } -static void h264_h_loop_filter_luma_mbaff_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +static void FUNCC(h264_h_loop_filter_luma_mbaff)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { - h264_loop_filter_luma_c(pix, 1, stride, 2, alpha, beta, tc0); + FUNCC(h264_loop_filter_luma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0); } -static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int inner_iters, int alpha, int beta) +static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta) { + pixel *pix = (pixel*)_pix; int d; + xstride /= sizeof(pixel); + ystride /= sizeof(pixel); + alpha <<= BIT_DEPTH - 8; + beta <<= BIT_DEPTH - 8; for( d = 0; d < 4 * inner_iters; d++ ) { const int p2 = pix[-3*xstride]; const int p1 = pix[-2*xstride]; @@ -198,24 +219,29 @@ static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *p pix += ystride; } } -static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) +static void FUNCC(h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta) { - h264_loop_filter_luma_intra_c(pix, stride, 1, 4, alpha, beta); + FUNCC(h264_loop_filter_luma_intra)(pix, stride, sizeof(pixel), 4, alpha, beta); } -static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) +static void FUNCC(h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta) { - h264_loop_filter_luma_intra_c(pix, 1, stride, 4, alpha, beta); + FUNCC(h264_loop_filter_luma_intra)(pix, sizeof(pixel), stride, 4, alpha, beta); } -static void h264_h_loop_filter_luma_mbaff_intra_c(uint8_t *pix, int stride, int alpha, int beta) +static void FUNCC(h264_h_loop_filter_luma_mbaff_intra)(uint8_t *pix, int stride, int alpha, int beta) { - h264_loop_filter_luma_intra_c(pix, 1, stride, 2, alpha, beta); + FUNCC(h264_loop_filter_luma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta); } -static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0) +static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0) { + pixel *pix = (pixel*)_pix; int i, d; + xstride /= sizeof(pixel); + ystride /= sizeof(pixel); + alpha <<= BIT_DEPTH - 8; + beta <<= BIT_DEPTH - 8; for( i = 0; i < 4; i++ ) { - const int tc = tc0[i]; + const int tc = ((tc0[i] - 1) << (BIT_DEPTH - 8)) + 1; if( tc <= 0 ) { pix += inner_iters*ystride; continue; @@ -232,29 +258,34 @@ static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); - pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */ - pix[0] = av_clip_uint8( q0 - delta ); /* q0' */ + pix[-xstride] = av_clip_pixel( p0 + delta ); /* p0' */ + pix[0] = av_clip_pixel( q0 - delta ); /* q0' */ } pix += ystride; } } } -static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +static void FUNCC(h264_v_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { - h264_loop_filter_chroma_c(pix, stride, 1, 2, alpha, beta, tc0); + FUNCC(h264_loop_filter_chroma)(pix, stride, sizeof(pixel), 2, alpha, beta, tc0); } -static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +static void FUNCC(h264_h_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { - h264_loop_filter_chroma_c(pix, 1, stride, 2, alpha, beta, tc0); + FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0); } -static void h264_h_loop_filter_chroma_mbaff_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +static void FUNCC(h264_h_loop_filter_chroma_mbaff)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { - h264_loop_filter_chroma_c(pix, 1, stride, 1, alpha, beta, tc0); + FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 1, alpha, beta, tc0); } -static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int inner_iters, int alpha, int beta) +static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta) { + pixel *pix = (pixel*)_pix; int d; + xstride /= sizeof(pixel); + ystride /= sizeof(pixel); + alpha <<= BIT_DEPTH - 8; + beta <<= BIT_DEPTH - 8; for( d = 0; d < 4 * inner_iters; d++ ) { const int p0 = pix[-1*xstride]; const int p1 = pix[-2*xstride]; @@ -271,15 +302,15 @@ static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t pix += ystride; } } -static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) +static void FUNCC(h264_v_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta) { - h264_loop_filter_chroma_intra_c(pix, stride, 1, 2, alpha, beta); + FUNCC(h264_loop_filter_chroma_intra)(pix, stride, sizeof(pixel), 2, alpha, beta); } -static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) +static void FUNCC(h264_h_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta) { - h264_loop_filter_chroma_intra_c(pix, 1, stride, 2, alpha, beta); + FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta); } -static void h264_h_loop_filter_chroma_mbaff_intra_c(uint8_t *pix, int stride, int alpha, int beta) +static void FUNCC(h264_h_loop_filter_chroma_mbaff_intra)(uint8_t *pix, int stride, int alpha, int beta) { - h264_loop_filter_chroma_intra_c(pix, 1, stride, 1, alpha, beta); + FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 1, alpha, beta); } diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c index 7856315..4bd26e2 100644 --- a/libavcodec/h264pred_template.c +++ b/libavcodec/h264pred_template.c @@ -28,68 +28,98 @@ #include "mathops.h" #include "dsputil.h" -static void pred4x4_vertical_c(uint8_t *src, const uint8_t *topright, int stride){ - const uint32_t a= ((uint32_t*)(src-stride))[0]; - ((uint32_t*)(src+0*stride))[0]= a; - ((uint32_t*)(src+1*stride))[0]= a; - ((uint32_t*)(src+2*stride))[0]= a; - ((uint32_t*)(src+3*stride))[0]= a; -} - -static void pred4x4_horizontal_c(uint8_t *src, const uint8_t *topright, int stride){ - ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101; - ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101; - ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101; - ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101; -} - -static void pred4x4_dc_c(uint8_t *src, const uint8_t *topright, int stride){ +#define BIT_DEPTH 8 + +#define pixel uint8_t +#define pixel4 uint32_t +#define dctcoef DCTELEM + +#define INIT_CLIP uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; +#define CLIP(a) cm[a] +#define FUNC(a) a +#define FUNCC(a) a ## _c +#define PIXEL_SPLAT_X4(x) ((x)*0x01010101U) +#define AV_WN4P AV_WN32 +#define AV_WN4PA AV_WN32A + +static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const pixel4 a= ((pixel4*)(src-stride))[0]; + ((pixel4*)(src+0*stride))[0]= a; + ((pixel4*)(src+1*stride))[0]= a; + ((pixel4*)(src+2*stride))[0]= a; + ((pixel4*)(src+3*stride))[0]= a; +} + +static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + ((pixel4*)(src+0*stride))[0]= PIXEL_SPLAT_X4(src[-1+0*stride]); + ((pixel4*)(src+1*stride))[0]= PIXEL_SPLAT_X4(src[-1+1*stride]); + ((pixel4*)(src+2*stride))[0]= PIXEL_SPLAT_X4(src[-1+2*stride]); + ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(src[-1+3*stride]); +} + +static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; + ((pixel4*)(src+0*stride))[0]= + ((pixel4*)(src+1*stride))[0]= + ((pixel4*)(src+2*stride))[0]= + ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc); } -static void pred4x4_left_dc_c(uint8_t *src, const uint8_t *topright, int stride){ +static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2; - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; + ((pixel4*)(src+0*stride))[0]= + ((pixel4*)(src+1*stride))[0]= + ((pixel4*)(src+2*stride))[0]= + ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc); } -static void pred4x4_top_dc_c(uint8_t *src, const uint8_t *topright, int stride){ +static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2; - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; + ((pixel4*)(src+0*stride))[0]= + ((pixel4*)(src+1*stride))[0]= + ((pixel4*)(src+2*stride))[0]= + ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc); } -static void pred4x4_128_dc_c(uint8_t *src, const uint8_t *topright, int stride){ - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U; +static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + ((pixel4*)(src+0*stride))[0]= + ((pixel4*)(src+1*stride))[0]= + ((pixel4*)(src+2*stride))[0]= + ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)); } -static void pred4x4_127_dc_c(uint8_t *src, const uint8_t *topright, int stride){ - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= 127U*0x01010101U; +static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + ((pixel4*)(src+0*stride))[0]= + ((pixel4*)(src+1*stride))[0]= + ((pixel4*)(src+2*stride))[0]= + ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1); } -static void pred4x4_129_dc_c(uint8_t *src, const uint8_t *topright, int stride){ - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= 129U*0x01010101U; +static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + ((pixel4*)(src+0*stride))[0]= + ((pixel4*)(src+1*stride))[0]= + ((pixel4*)(src+2*stride))[0]= + ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1); } @@ -117,7 +147,9 @@ static void pred4x4_129_dc_c(uint8_t *src, const uint8_t *topright, int stride){ const int av_unused t2= src[ 2-1*stride];\ const int av_unused t3= src[ 3-1*stride];\ -static void pred4x4_down_right_c(uint8_t *src, const uint8_t *topright, int stride){ +static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); const int lt= src[-1-1*stride]; LOAD_TOP_EDGE LOAD_LEFT_EDGE @@ -140,7 +172,10 @@ static void pred4x4_down_right_c(uint8_t *src, const uint8_t *topright, int stri src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2; } -static void pred4x4_down_left_c(uint8_t *src, const uint8_t *topright, int stride){ +static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright, int _stride){ + pixel *src = (pixel*)_src; + const pixel *topright = (const pixel*)_topright; + int stride = _stride/sizeof(pixel); LOAD_TOP_EDGE LOAD_TOP_RIGHT_EDGE // LOAD_LEFT_EDGE @@ -163,7 +198,9 @@ static void pred4x4_down_left_c(uint8_t *src, const uint8_t *topright, int strid src[3+3*stride]=(t6 + 3*t7 + 2)>>2; } -static void pred4x4_vertical_right_c(uint8_t *src, const uint8_t *topright, int stride){ +static void FUNCC(pred4x4_vertical_right)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); const int lt= src[-1-1*stride]; LOAD_TOP_EDGE LOAD_LEFT_EDGE @@ -186,7 +223,10 @@ static void pred4x4_vertical_right_c(uint8_t *src, const uint8_t *topright, int src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; } -static void pred4x4_vertical_left_c(uint8_t *src, const uint8_t *topright, int stride){ +static void FUNCC(pred4x4_vertical_left)(uint8_t *_src, const uint8_t *_topright, int _stride){ + pixel *src = (pixel*)_src; + const pixel *topright = (const pixel*)_topright; + int stride = _stride/sizeof(pixel); LOAD_TOP_EDGE LOAD_TOP_RIGHT_EDGE @@ -208,7 +248,9 @@ static void pred4x4_vertical_left_c(uint8_t *src, const uint8_t *topright, int s src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2; } -static void pred4x4_horizontal_up_c(uint8_t *src, const uint8_t *topright, int stride){ +static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); LOAD_LEFT_EDGE src[0+0*stride]=(l0 + l1 + 1)>>1; @@ -229,7 +271,9 @@ static void pred4x4_horizontal_up_c(uint8_t *src, const uint8_t *topright, int s src[3+3*stride]=l3; } -static void pred4x4_horizontal_down_c(uint8_t *src, const uint8_t *topright, int stride){ +static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); const int lt= src[-1-1*stride]; LOAD_TOP_EDGE LOAD_LEFT_EDGE @@ -252,34 +296,50 @@ static void pred4x4_horizontal_down_c(uint8_t *src, const uint8_t *topright, int src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2; } -static void pred16x16_vertical_c(uint8_t *src, int stride){ +static void FUNCC(pred16x16_vertical)(uint8_t *_src, int _stride){ int i; - const uint32_t a= ((uint32_t*)(src-stride))[0]; - const uint32_t b= ((uint32_t*)(src-stride))[1]; - const uint32_t c= ((uint32_t*)(src-stride))[2]; - const uint32_t d= ((uint32_t*)(src-stride))[3]; + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const pixel4 a = ((pixel4*)(src-stride))[0]; + const pixel4 b = ((pixel4*)(src-stride))[1]; + const pixel4 c = ((pixel4*)(src-stride))[2]; + const pixel4 d = ((pixel4*)(src-stride))[3]; for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= a; - ((uint32_t*)(src+i*stride))[1]= b; - ((uint32_t*)(src+i*stride))[2]= c; - ((uint32_t*)(src+i*stride))[3]= d; + ((pixel4*)(src+i*stride))[0] = a; + ((pixel4*)(src+i*stride))[1] = b; + ((pixel4*)(src+i*stride))[2] = c; + ((pixel4*)(src+i*stride))[3] = d; } } -static void pred16x16_horizontal_c(uint8_t *src, int stride){ +static void FUNCC(pred16x16_horizontal)(uint8_t *_src, int stride){ int i; + pixel *src = (pixel*)_src; + stride /= sizeof(pixel); for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101; + ((pixel4*)(src+i*stride))[0] = + ((pixel4*)(src+i*stride))[1] = + ((pixel4*)(src+i*stride))[2] = + ((pixel4*)(src+i*stride))[3] = PIXEL_SPLAT_X4(src[-1+i*stride]); } } -static void pred16x16_dc_c(uint8_t *src, int stride){ +#define PREDICT_16x16_DC(v)\ + for(i=0; i<16; i++){\ + AV_WN4P(src+ 0, v);\ + AV_WN4P(src+ 4, v);\ + AV_WN4P(src+ 8, v);\ + AV_WN4P(src+12, v);\ + src += stride;\ + } + +static void FUNCC(pred16x16_dc)(uint8_t *_src, int stride){ int i, dc=0; + pixel *src = (pixel*)_src; + pixel4 dcsplat; + stride /= sizeof(pixel); for(i=0;i<16; i++){ dc+= src[-1+i*stride]; @@ -289,89 +349,59 @@ static void pred16x16_dc_c(uint8_t *src, int stride){ dc+= src[i-stride]; } - dc= 0x01010101*((dc + 16)>>5); - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= dc; - } + dcsplat = PIXEL_SPLAT_X4((dc+16)>>5); + PREDICT_16x16_DC(dcsplat); } -static void pred16x16_left_dc_c(uint8_t *src, int stride){ +static void FUNCC(pred16x16_left_dc)(uint8_t *_src, int stride){ int i, dc=0; + pixel *src = (pixel*)_src; + pixel4 dcsplat; + stride /= sizeof(pixel); for(i=0;i<16; i++){ dc+= src[-1+i*stride]; } - dc= 0x01010101*((dc + 8)>>4); - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= dc; - } + dcsplat = PIXEL_SPLAT_X4((dc+8)>>4); + PREDICT_16x16_DC(dcsplat); } -static void pred16x16_top_dc_c(uint8_t *src, int stride){ +static void FUNCC(pred16x16_top_dc)(uint8_t *_src, int stride){ int i, dc=0; + pixel *src = (pixel*)_src; + pixel4 dcsplat; + stride /= sizeof(pixel); for(i=0;i<16; i++){ dc+= src[i-stride]; } - dc= 0x01010101*((dc + 8)>>4); - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= dc; - } -} -static void pred16x16_128_dc_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U; - } + dcsplat = PIXEL_SPLAT_X4((dc+8)>>4); + PREDICT_16x16_DC(dcsplat); } -static void pred16x16_127_dc_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= 0x01010101U*127U; - } +#define PRED16x16_X(n, v) \ +static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, int stride){\ + int i;\ + pixel *src = (pixel*)_src;\ + stride /= sizeof(pixel);\ + PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\ } -static void pred16x16_129_dc_c(uint8_t *src, int stride){ - int i; +PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1); +PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0); +PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1); - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= 0x01010101U*129U; - } -} - -static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3, const int rv40){ +static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src, int _stride, const int svq3, const int rv40){ int i, j, k; int a; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - const uint8_t * const src0 = src+7-stride; - const uint8_t *src1 = src+8*stride-1; - const uint8_t *src2 = src1-2*stride; // == src+6*stride-1; + INIT_CLIP + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const pixel * const src0 = src +7-stride; + const pixel * src1 = src +8*stride-1; + const pixel * src2 = src1-2*stride; // == src+6*stride-1; int H = src0[1] - src0[-1]; int V = src1[0] - src2[ 0]; for(k=2; k<=8; ++k) { @@ -398,113 +428,115 @@ static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int int b = a; a += V; for(i=-16; i<0; i+=4) { - src[16+i] = cm[ (b ) >> 5 ]; - src[17+i] = cm[ (b+ H) >> 5 ]; - src[18+i] = cm[ (b+2*H) >> 5 ]; - src[19+i] = cm[ (b+3*H) >> 5 ]; + src[16+i] = CLIP((b ) >> 5); + src[17+i] = CLIP((b+ H) >> 5); + src[18+i] = CLIP((b+2*H) >> 5); + src[19+i] = CLIP((b+3*H) >> 5); b += 4*H; } src += stride; } } -static void pred16x16_plane_c(uint8_t *src, int stride){ - pred16x16_plane_compat_c(src, stride, 0, 0); +static void FUNCC(pred16x16_plane)(uint8_t *src, int stride){ + FUNCC(pred16x16_plane_compat)(src, stride, 0, 0); } -static void pred8x8_vertical_c(uint8_t *src, int stride){ +static void FUNCC(pred8x8_vertical)(uint8_t *_src, int _stride){ int i; - const uint32_t a= ((uint32_t*)(src-stride))[0]; - const uint32_t b= ((uint32_t*)(src-stride))[1]; + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const pixel4 a= ((pixel4*)(src-stride))[0]; + const pixel4 b= ((pixel4*)(src-stride))[1]; for(i=0; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= a; - ((uint32_t*)(src+i*stride))[1]= b; + ((pixel4*)(src+i*stride))[0]= a; + ((pixel4*)(src+i*stride))[1]= b; } } -static void pred8x8_horizontal_c(uint8_t *src, int stride){ +static void FUNCC(pred8x8_horizontal)(uint8_t *_src, int stride){ int i; + pixel *src = (pixel*)_src; + stride /= sizeof(pixel); for(i=0; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101; + ((pixel4*)(src+i*stride))[0]= + ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(src[-1+i*stride]); } } -static void pred8x8_128_dc_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U; - } +#define PRED8x8_X(n, v)\ +static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, int stride){\ + int i;\ + pixel *src = (pixel*)_src;\ + stride /= sizeof(pixel);\ + for(i=0; i<8; i++){\ + ((pixel4*)(src+i*stride))[0]=\ + ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(v);\ + }\ } -static void pred8x8_127_dc_c(uint8_t *src, int stride){ - int i; +PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1); +PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0); +PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1); - for(i=0; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= 0x01010101U*127U; - } -} -static void pred8x8_129_dc_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= 0x01010101U*129U; - } -} - -static void pred8x8_left_dc_c(uint8_t *src, int stride){ +static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int stride){ int i; int dc0, dc2; + pixel4 dc0splat, dc2splat; + pixel *src = (pixel*)_src; + stride /= sizeof(pixel); dc0=dc2=0; for(i=0;i<4; i++){ dc0+= src[-1+i*stride]; dc2+= src[-1+(i+4)*stride]; } - dc0= 0x01010101*((dc0 + 2)>>2); - dc2= 0x01010101*((dc2 + 2)>>2); + dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2); + dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2); for(i=0; i<4; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= dc0; + ((pixel4*)(src+i*stride))[0]= + ((pixel4*)(src+i*stride))[1]= dc0splat; } for(i=4; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= dc2; + ((pixel4*)(src+i*stride))[0]= + ((pixel4*)(src+i*stride))[1]= dc2splat; } } -static void pred8x8_top_dc_c(uint8_t *src, int stride){ +static void FUNCC(pred8x8_top_dc)(uint8_t *_src, int stride){ int i; int dc0, dc1; + pixel4 dc0splat, dc1splat; + pixel *src = (pixel*)_src; + stride /= sizeof(pixel); dc0=dc1=0; for(i=0;i<4; i++){ dc0+= src[i-stride]; dc1+= src[4+i-stride]; } - dc0= 0x01010101*((dc0 + 2)>>2); - dc1= 0x01010101*((dc1 + 2)>>2); + dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2); + dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); for(i=0; i<4; i++){ - ((uint32_t*)(src+i*stride))[0]= dc0; - ((uint32_t*)(src+i*stride))[1]= dc1; + ((pixel4*)(src+i*stride))[0]= dc0splat; + ((pixel4*)(src+i*stride))[1]= dc1splat; } for(i=4; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= dc0; - ((uint32_t*)(src+i*stride))[1]= dc1; + ((pixel4*)(src+i*stride))[0]= dc0splat; + ((pixel4*)(src+i*stride))[1]= dc1splat; } } -static void pred8x8_dc_c(uint8_t *src, int stride){ +static void FUNCC(pred8x8_dc)(uint8_t *_src, int stride){ int i; - int dc0, dc1, dc2, dc3; + int dc0, dc1, dc2; + pixel4 dc0splat, dc1splat, dc2splat, dc3splat; + pixel *src = (pixel*)_src; + stride /= sizeof(pixel); dc0=dc1=dc2=0; for(i=0;i<4; i++){ @@ -512,51 +544,53 @@ static void pred8x8_dc_c(uint8_t *src, int stride){ dc1+= src[4+i-stride]; dc2+= src[-1+(i+4)*stride]; } - dc3= 0x01010101*((dc1 + dc2 + 4)>>3); - dc0= 0x01010101*((dc0 + 4)>>3); - dc1= 0x01010101*((dc1 + 2)>>2); - dc2= 0x01010101*((dc2 + 2)>>2); + dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3); + dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); + dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2); + dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3); for(i=0; i<4; i++){ - ((uint32_t*)(src+i*stride))[0]= dc0; - ((uint32_t*)(src+i*stride))[1]= dc1; + ((pixel4*)(src+i*stride))[0]= dc0splat; + ((pixel4*)(src+i*stride))[1]= dc1splat; } for(i=4; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= dc2; - ((uint32_t*)(src+i*stride))[1]= dc3; + ((pixel4*)(src+i*stride))[0]= dc2splat; + ((pixel4*)(src+i*stride))[1]= dc3splat; } } //the following 4 function should not be optimized! -static void pred8x8_mad_cow_dc_l0t(uint8_t *src, int stride){ - pred8x8_top_dc_c(src, stride); - pred4x4_dc_c(src, NULL, stride); +static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){ + FUNCC(pred8x8_top_dc)(src, stride); + FUNCC(pred4x4_dc)(src, NULL, stride); } -static void pred8x8_mad_cow_dc_0lt(uint8_t *src, int stride){ - pred8x8_dc_c(src, stride); - pred4x4_top_dc_c(src, NULL, stride); +static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){ + FUNCC(pred8x8_dc)(src, stride); + FUNCC(pred4x4_top_dc)(src, NULL, stride); } -static void pred8x8_mad_cow_dc_l00(uint8_t *src, int stride){ - pred8x8_left_dc_c(src, stride); - pred4x4_128_dc_c(src + 4*stride , NULL, stride); - pred4x4_128_dc_c(src + 4*stride + 4, NULL, stride); +static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){ + FUNCC(pred8x8_left_dc)(src, stride); + FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride); + FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride); } -static void pred8x8_mad_cow_dc_0l0(uint8_t *src, int stride){ - pred8x8_left_dc_c(src, stride); - pred4x4_128_dc_c(src , NULL, stride); - pred4x4_128_dc_c(src + 4, NULL, stride); +static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){ + FUNCC(pred8x8_left_dc)(src, stride); + FUNCC(pred4x4_128_dc)(src , NULL, stride); + FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride); } -static void pred8x8_plane_c(uint8_t *src, int stride){ +static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){ int j, k; int a; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - const uint8_t * const src0 = src+3-stride; - const uint8_t *src1 = src+4*stride-1; - const uint8_t *src2 = src1-2*stride; // == src+2*stride-1; + INIT_CLIP + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const pixel * const src0 = src +3-stride; + const pixel * src1 = src +4*stride-1; + const pixel * src2 = src1-2*stride; // == src+2*stride-1; int H = src0[1] - src0[-1]; int V = src1[0] - src2[ 0]; for(k=2; k<=4; ++k) { @@ -571,14 +605,14 @@ static void pred8x8_plane_c(uint8_t *src, int stride){ for(j=8; j>0; --j) { int b = a; a += V; - src[0] = cm[ (b ) >> 5 ]; - src[1] = cm[ (b+ H) >> 5 ]; - src[2] = cm[ (b+2*H) >> 5 ]; - src[3] = cm[ (b+3*H) >> 5 ]; - src[4] = cm[ (b+4*H) >> 5 ]; - src[5] = cm[ (b+5*H) >> 5 ]; - src[6] = cm[ (b+6*H) >> 5 ]; - src[7] = cm[ (b+7*H) >> 5 ]; + src[0] = CLIP((b ) >> 5); + src[1] = CLIP((b+ H) >> 5); + src[2] = CLIP((b+2*H) >> 5); + src[3] = CLIP((b+3*H) >> 5); + src[4] = CLIP((b+4*H) >> 5); + src[5] = CLIP((b+5*H) >> 5); + src[6] = CLIP((b+6*H) >> 5); + src[7] = CLIP((b+7*H) >> 5); src += stride; } } @@ -616,46 +650,64 @@ static void pred8x8_plane_c(uint8_t *src, int stride){ #define PREDICT_8x8_DC(v) \ int y; \ for( y = 0; y < 8; y++ ) { \ - ((uint32_t*)src)[0] = \ - ((uint32_t*)src)[1] = v; \ + ((pixel4*)src)[0] = \ + ((pixel4*)src)[1] = v; \ src += stride; \ } -static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) +static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride) { - PREDICT_8x8_DC(0x80808080); + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + + PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1))); } -static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) +static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride) { + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + PREDICT_8x8_LOAD_LEFT; - const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101; + const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3); PREDICT_8x8_DC(dc); } -static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) +static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride) { + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + PREDICT_8x8_LOAD_TOP; - const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101; + const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3); PREDICT_8x8_DC(dc); } -static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) +static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride) { + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + PREDICT_8x8_LOAD_LEFT; PREDICT_8x8_LOAD_TOP; - const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7 - +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101; + const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7 + +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4); PREDICT_8x8_DC(dc); } -static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride) +static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft, int has_topright, int _stride) { + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + PREDICT_8x8_LOAD_LEFT; -#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\ - ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y +#define ROW(y) ((pixel4*)(src+y*stride))[0] =\ + ((pixel4*)(src+y*stride))[1] = PIXEL_SPLAT_X4(l##y) ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); #undef ROW } -static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride) +static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft, int has_topright, int _stride) { int y; + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + PREDICT_8x8_LOAD_TOP; src[0] = t0; src[1] = t1; @@ -665,11 +717,15 @@ static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, src[5] = t5; src[6] = t6; src[7] = t7; - for( y = 1; y < 8; y++ ) - *(uint64_t*)(src+y*stride) = *(uint64_t*)src; + for( y = 1; y < 8; y++ ) { + ((pixel4*)(src+y*stride))[0] = ((pixel4*)src)[0]; + ((pixel4*)(src+y*stride))[1] = ((pixel4*)src)[1]; + } } -static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride) +static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride) { + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); PREDICT_8x8_LOAD_TOP; PREDICT_8x8_LOAD_TOPRIGHT; SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; @@ -688,8 +744,10 @@ static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2; SRC(7,7)= (t14 + 3*t15 + 2) >> 2; } -static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride) +static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft, int has_topright, int _stride) { + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); PREDICT_8x8_LOAD_TOP; PREDICT_8x8_LOAD_LEFT; PREDICT_8x8_LOAD_TOPLEFT; @@ -708,10 +766,11 @@ static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_toprigh SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2; SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2; SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2; - } -static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride) +static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft, int has_topright, int _stride) { + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); PREDICT_8x8_LOAD_TOP; PREDICT_8x8_LOAD_LEFT; PREDICT_8x8_LOAD_TOPLEFT; @@ -738,8 +797,10 @@ static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_top SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2; SRC(7,0)= (t6 + t7 + 1) >> 1; } -static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride) +static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft, int has_topright, int _stride) { + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); PREDICT_8x8_LOAD_TOP; PREDICT_8x8_LOAD_LEFT; PREDICT_8x8_LOAD_TOPLEFT; @@ -766,8 +827,10 @@ static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_to SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2; SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2; } -static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride) +static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride) { + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); PREDICT_8x8_LOAD_TOP; PREDICT_8x8_LOAD_TOPRIGHT; SRC(0,0)= (t0 + t1 + 1) >> 1; @@ -793,8 +856,10 @@ static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topr SRC(7,6)= (t10 + t11 + 1) >> 1; SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2; } -static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride) +static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft, int has_topright, int _stride) { + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); PREDICT_8x8_LOAD_LEFT; SRC(0,0)= (l0 + l1 + 1) >> 1; SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; @@ -825,11 +890,14 @@ static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topr #undef PL #undef SRC -static void pred4x4_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){ +static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const DCTELEM *_block, int stride){ int i; + pixel *pix = (pixel*)_pix; + const dctcoef *block = (const dctcoef*)_block; + stride /= sizeof(pixel); pix -= stride; for(i=0; i<4; i++){ - uint8_t v = pix[0]; + pixel v = pix[0]; pix[1*stride]= v += block[0]; pix[2*stride]= v += block[4]; pix[3*stride]= v += block[8]; @@ -839,10 +907,13 @@ static void pred4x4_vertical_add_c(uint8_t *pix, const DCTELEM *block, int strid } } -static void pred4x4_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){ +static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const DCTELEM *_block, int stride){ int i; + pixel *pix = (pixel*)_pix; + const dctcoef *block = (const dctcoef*)_block; + stride /= sizeof(pixel); for(i=0; i<4; i++){ - uint8_t v = pix[-1]; + pixel v = pix[-1]; pix[0]= v += block[0]; pix[1]= v += block[1]; pix[2]= v += block[2]; @@ -852,11 +923,14 @@ static void pred4x4_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int str } } -static void pred8x8l_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){ +static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const DCTELEM *_block, int stride){ int i; + pixel *pix = (pixel*)_pix; + const dctcoef *block = (const dctcoef*)_block; + stride /= sizeof(pixel); pix -= stride; for(i=0; i<8; i++){ - uint8_t v = pix[0]; + pixel v = pix[0]; pix[1*stride]= v += block[0]; pix[2*stride]= v += block[8]; pix[3*stride]= v += block[16]; @@ -870,10 +944,13 @@ static void pred8x8l_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stri } } -static void pred8x8l_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){ +static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const DCTELEM *_block, int stride){ int i; + pixel *pix = (pixel*)_pix; + const dctcoef *block = (const dctcoef*)_block; + stride /= sizeof(pixel); for(i=0; i<8; i++){ - uint8_t v = pix[-1]; + pixel v = pix[-1]; pix[0]= v += block[0]; pix[1]= v += block[1]; pix[2]= v += block[2]; @@ -887,26 +964,26 @@ static void pred8x8l_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int st } } -static void pred16x16_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ +static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ int i; for(i=0; i<16; i++) - pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride); + FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); } -static void pred16x16_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ +static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ int i; for(i=0; i<16; i++) - pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride); + FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); } -static void pred8x8_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ +static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ int i; for(i=0; i<4; i++) - pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride); + FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); } -static void pred8x8_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ +static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ int i; for(i=0; i<4; i++) - pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride); + FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); } |