This C function computes weighted values from src and stores them in dst.
/*
 * Scalar reference implementation: weights 16 coefficients from src into dst.
 *
 * src   - 16 input coefficients (dctelem is declared elsewhere; each value is
 *         read into an int here).
 * index - selects the row of the threshold table used for all 16 coefficients.
 * dst   - 16 output slots. A slot is written only when the coefficient passes
 *         the first threshold test below; otherwise it is left untouched.
 *
 * Always returns 0.
 */
static int _medium_c( dctelem *src, int index, int *dst )
{
    int i;

    /* get weighted value */
    for( i = 0; i < 16; i++ )
    {
        unsigned int threshold1 = threshold[index][i]; /* threshold contains constant value */
        unsigned int threshold2 = ( threshold1 << 1 );
        int level = src[i];

        /*
         * level + threshold1 is evaluated in unsigned arithmetic, so one compare
         * covers both signs: for small magnitudes it is true exactly when level
         * lies outside [-threshold1, threshold1].
         */
        if( ( ( unsigned )( level + threshold1 ) ) > threshold2 )
        {
            /* Same trick with the doubled threshold: outside [-2*threshold1, 2*threshold1]. */
            if( ( ( unsigned )( level + 2*threshold1 ) ) > 2*threshold2 )
            {
                dst[i] = level * factor[i];
            }
            else
            {
                /* Between the two thresholds: move level toward zero by threshold1, then double. */
                if( level > 0 )
                {
                    dst[i] = 2*( level - ( int )threshold1 ) * factor[i];
                }
                else
                {
                    dst[i] = 2*( level + ( int )threshold1 ) * factor[i];
                }
            }
        }
    }
    return 0;
}
The intrinsic version is:
/*
 * SSE version of the 16-coefficient weighting: two iterations of 8 lanes each.
 *
 * src   - 16 input coefficients, loaded 8 at a time as 16-bit lanes
 *         (assumes dctelem is a 16-bit type - TODO confirm against its typedef).
 * index - selects the row of the threshold table.
 * dst   - 16 int outputs. Every slot is written; lanes that fail the outer
 *         threshold test are stored as 0.
 *
 * Requires SSSE3 (_mm_shuffle_epi8) and SSE4.1 (_mm_blendv_epi8, _mm_max_epu16).
 * Always returns 0.
 */
int medium_intrinsic16( dctelem *src, int index, int *dst )
{
    const __m128i zero128 = _mm_setzero_si128();
    /*
     * Byte-shuffle control: packs bytes 0-1, 4-5, 8-9 and 12-13 (the low 16 bits
     * of each 32-bit element) into the low 64 bits; 0x80 zeroes the upper bytes.
     */
    const __m128i mask = _mm_set_epi8( 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                       0x0d, 0x0c, 0x09, 0x08, 0x05, 0x04, 0x01, 0x00 );
    int j;

    for( j = 0; j < 2; j++ )
    {
        /* Narrow 8 factors to 16-bit lanes: two 4-element loads, shuffled and merged. */
        __m128i factor_lo = _mm_shuffle_epi8( _mm_loadu_si128( (__m128i*)&factor[8*j] ), mask );
        __m128i factor_hi = _mm_shuffle_epi8( _mm_loadu_si128( (__m128i*)&factor[8*j+4] ), mask );
        __m128i factor_v  = _mm_unpacklo_epi64( factor_lo, factor_hi );

        __m128i level = _mm_loadu_si128( (__m128i*)&src[8*j] );

        /* Same narrowing for the 8 thresholds. */
        __m128i thr_lo = _mm_shuffle_epi8( _mm_loadu_si128( (__m128i*)&threshold[index][8*j] ), mask );
        __m128i thr_hi = _mm_shuffle_epi8( _mm_loadu_si128( (__m128i*)&threshold[index][8*j+4] ), mask );
        __m128i thr1   = _mm_unpacklo_epi64( thr_lo, thr_hi );

        /*
         * Shift per 16-bit lane. A 32-bit shift here would carry bit 15 of each
         * even lane into its odd neighbour.
         */
        __m128i thr2 = _mm_slli_epi16( thr1, 1 );   /* 2*threshold1 */
        __m128i thr4 = _mm_slli_epi16( thr2, 1 );   /* 2*threshold2 */

        __m128i is_positive = _mm_cmpgt_epi16( level, zero128 );

        __m128i level_minus = _mm_sub_epi16( level, thr1 );   /* level - threshold1 */
        __m128i level_plus  = _mm_add_epi16( level, thr1 );   /* level + threshold1 */
        __m128i factor2     = _mm_slli_epi16( factor_v, 1 );  /* 2*factor */

        /*
         * Middle branch: 2*( level -/+ threshold1 ) * factor, kept as separate
         * low and high 16-bit halves of each product; the sign of level picks
         * which variant is used.
         */
        __m128i mid_lo = _mm_blendv_epi8( _mm_mullo_epi16( level_plus,  factor2 ),
                                          _mm_mullo_epi16( level_minus, factor2 ),
                                          is_positive );
        __m128i mid_hi = _mm_blendv_epi8( _mm_mulhi_epi16( level_plus,  factor2 ),
                                          _mm_mulhi_epi16( level_minus, factor2 ),
                                          is_positive );

        /* Inner test as an unsigned compare: max(a, b) == a means a >= b. */
        __m128i inner_sum  = _mm_add_epi16( level, thr2 );   /* level + 2*threshold1 */
        __m128i inner_pass = _mm_cmpeq_epi16( _mm_max_epu16( inner_sum, thr4 ), inner_sum );

        /* Lanes that pass the inner test take level * factor instead. */
        __m128i res_lo = _mm_blendv_epi8( mid_lo, _mm_mullo_epi16( level, factor_v ), inner_pass );
        __m128i res_hi = _mm_blendv_epi8( mid_hi, _mm_mulhi_epi16( level, factor_v ), inner_pass );

        /* Outer test ( level + threshold1 against threshold2 ); failing lanes are zeroed. */
        __m128i outer_pass = _mm_cmpeq_epi16( _mm_max_epu16( level_plus, thr2 ), level_plus );
        res_lo = _mm_and_si128( outer_pass, res_lo );
        res_hi = _mm_and_si128( outer_pass, res_hi );

        /* Interleave the low and high halves into eight 32-bit results and store them. */
        _mm_storeu_si128( (__m128i*)&dst[8*j],   _mm_unpacklo_epi16( res_lo, res_hi ) ); /* will run fast if removed */
        _mm_storeu_si128( (__m128i*)&dst[8*j+4], _mm_unpackhi_epi16( res_lo, res_hi ) ); /* will run fast if removed */
    }
    return 0;
}
The intrinsic version is not faster than the C version. The problem is that if I remove the last two lines of the loop, indicated in the code (`_mm_storeu_si128((__m128i*)&dst[8*j], m2)` and `_mm_storeu_si128((__m128i*)&dst[8*j+4], m3)`), the intrinsic version runs significantly faster than the C version (4 times faster). Can anyone explain why this happens? Does `_mm_storeu_si128()` cost that much time? Thanks.
You're probably bottlenecked on memory bandwidth if it's the same speed as the C version. In that case, yes, storing to memory is the expensive thing in your algorithm.
Or perhaps the compiler optimizes away a lot of the code when the results aren't stored anywhere! You'd have to look at the asm to make sure it only left out the store instructions, instead of optimizing away most of the function.
See http://agner.org/optimize/, and the other links at https://stackoverflow.com/tags/x86/info (especially Ulrich Drepper's paper on caches).
Look into cache blocking, also known as loop tiling.
Comments
Post a Comment