C - Does _mm_storeu_si128 cost too much time?


This C function computes weighted values from src and stores them in dst.

/*
 * Scalar reference implementation: weights 16 coefficients of src into dst.
 *
 * For each i in 0..15, with t = threshold[index][i]:
 *   - level + t > 2*t (unsigned) fails  -> dst[i] is left untouched;
 *   - level + 2*t > 4*t (unsigned) holds -> dst[i] = level * factor[i];
 *   - otherwise                          -> dst[i] = 2 * (level -/+ t) * factor[i],
 *     subtracting t for positive levels and adding it for the rest.
 *
 * The unsigned additions wrap for sufficiently negative levels, so each single
 * comparison acts as a two-sided range test on level (this relies on t being
 * small enough that 4*t does not overflow -- not checked here).
 *
 * threshold and factor are tables defined outside this function.
 *
 * src:   16 input coefficients.
 * index: row of the threshold table to use.
 * dst:   16 output values; entries inside the dead zone are not written.
 * Returns 0 unconditionally.
 */
static int _medium_c( dctelem * src, int index, int *dst )
{
    int i;

    for( i = 0; i < 16; i++ )
    {
        unsigned int threshold1 = threshold[index][i];
        unsigned int threshold2 = ( threshold1<<1 );
        int level = src[i];

        /* Two-sided test: true when level lies outside [-threshold1, threshold1]. */
        if( ( ( unsigned )( level+threshold1 ) ) > threshold2 )
        {
            /* Two-sided test against twice the threshold. */
            if( ( ( unsigned )( level+2*threshold1 ) ) > 2*threshold2 )
            {
                /* Far from zero: plain weighting. */
                dst[i] = level * factor[i];
            }
            else
            {
                /* Intermediate band: pull level towards zero by threshold1,
                 * then apply a doubled weight. */
                if( level>0 )
                {
                    dst[i] = 2*( level - ( int )threshold1 ) * factor[i];
                }
                else
                {
                    dst[i] = 2*( level + ( int )threshold1 ) * factor[i];
                }
            }
        }
    }
    return 0;
}

The intrinsic version is:

/*
 * SSE implementation of the coefficient weighting: handles 16 coefficients as
 * two batches of eight 16-bit lanes.
 *
 * Per lane, with t = threshold[index][i]:
 *   - level + t   >= 2*t (unsigned 16-bit) fails -> result 0;
 *   - level + 2*t >= 4*t (unsigned 16-bit) holds -> level * factor;
 *   - otherwise -> 2 * (level -/+ t) * factor, subtracting t for level > 0.
 * The 32-bit products are rebuilt from mullo/mulhi halves and stored to dst.
 * Every one of the 16 dst entries is written (dead-zone lanes are written as 0).
 *
 * NOTE(review): the range tests here are ">=" (max + compare-equal), and all
 * arithmetic before the multiply is done in 16 bits.
 * NOTE(review): assumes dctelem is a 16-bit type and that factor/threshold
 * hold 32-bit elements whose values fit in 16 bits -- confirm against their
 * declarations, which are outside this function.
 *
 * Uses _mm_shuffle_epi8 (SSSE3) and _mm_blendv_epi8 / _mm_max_epu16 (SSE4.1).
 *
 * src:   16 input coefficients.
 * index: row of the threshold table to use.
 * dst:   16 output values.
 * Returns 0 unconditionally.
 */
int medium_intrinsic16( dctelem * src, int index, int* dst )
{
    int j;

    /* Loop-invariant constants. */
    const __m128i zero128 = _mm_setzero_si128();
    /* Byte-shuffle control: gathers bytes 0,1,4,5,8,9,12,13 (the low 16 bits
     * of each 32-bit element) into the low 64 bits; 0x80 zeroes the rest. */
    const __m128i mask = _mm_set_epi8( 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                       0x0d, 0x0c, 0x09, 0x08, 0x05, 0x04, 0x01, 0x00 );

    for( j = 0; j < 2; j++ )
    {
        /* factor[8*j .. 8*j+7] narrowed to eight 16-bit lanes. */
        __m128i factor_a = _mm_loadu_si128( (__m128i*)&factor[8*j] );
        factor_a = _mm_shuffle_epi8( factor_a, mask );
        __m128i factor_b = _mm_loadu_si128( (__m128i*)&factor[8*j+4] );
        factor_b = _mm_shuffle_epi8( factor_b, mask );
        factor_a = _mm_unpacklo_epi64( factor_a, factor_b );

        __m128i level_a = _mm_loadu_si128( (__m128i*)&src[8*j] );

        /* threshold[index][8*j .. 8*j+7] narrowed the same way. */
        __m128i threshold1_a = _mm_loadu_si128( (__m128i*)&threshold[index][8*j] );
        threshold1_a = _mm_shuffle_epi8( threshold1_a, mask );
        __m128i threshold1_b = _mm_loadu_si128( (__m128i*)&threshold[index][8*j+4] );
        threshold1_b = _mm_shuffle_epi8( threshold1_b, mask );
        threshold1_a = _mm_unpacklo_epi64( threshold1_a, threshold1_b );

        /* 2*threshold1 per 16-bit lane. A 16-bit shift is required here: the
         * lanes are packed words, and a 32-bit shift would carry the top bit
         * of each even lane into the odd lane above it. */
        __m128i threshold2_a = _mm_slli_epi16( threshold1_a, 1 );

        /* Intermediate-band candidates: 2 * (level -/+ threshold1) * factor. */
        __m128i is_pos      = _mm_cmpgt_epi16( level_a, zero128 );
        __m128i lvl_minus_t = _mm_sub_epi16( level_a, threshold1_a ); /* level - threshold1 */
        __m128i lvl_plus_t  = _mm_add_epi16( level_a, threshold1_a ); /* level + threshold1 */
        __m128i factor_x2   = _mm_slli_epi16( factor_a, 1 );

        __m128i pos_lo = _mm_mullo_epi16( lvl_minus_t, factor_x2 );
        __m128i pos_hi = _mm_mulhi_epi16( lvl_minus_t, factor_x2 );
        __m128i neg_lo = _mm_mullo_epi16( lvl_plus_t, factor_x2 );
        __m128i neg_hi = _mm_mulhi_epi16( lvl_plus_t, factor_x2 );

        /* level > 0 takes the (level - threshold1) product, else the other. */
        __m128i shrunk_lo = _mm_blendv_epi8( neg_lo, pos_lo, is_pos );
        __m128i shrunk_hi = _mm_blendv_epi8( neg_hi, pos_hi, is_pos );

        /* Outer test: (level + 2*threshold1) >= 2*threshold2, unsigned. */
        __m128i biased2 = _mm_add_epi16( level_a, threshold2_a );
        __m128i limit2  = _mm_slli_epi16( threshold2_a, 1 );
        __m128i is_far  = _mm_cmpeq_epi16( _mm_max_epu16( biased2, limit2 ), biased2 );

        /* Plain weighting: level * factor. */
        __m128i full_lo = _mm_mullo_epi16( level_a, factor_a );
        __m128i full_hi = _mm_mulhi_epi16( level_a, factor_a );

        __m128i res_lo = _mm_blendv_epi8( shrunk_lo, full_lo, is_far );
        __m128i res_hi = _mm_blendv_epi8( shrunk_hi, full_hi, is_far );

        /* Dead-zone test: (level + threshold1) >= threshold2, unsigned.
         * Lanes that fail are forced to zero. */
        __m128i is_out = _mm_cmpeq_epi16( _mm_max_epu16( lvl_plus_t, threshold2_a ), lvl_plus_t );
        res_lo = _mm_and_si128( is_out, res_lo );
        res_hi = _mm_and_si128( is_out, res_hi );

        /* Interleave low/high product halves into eight 32-bit results. */
        _mm_storeu_si128( (__m128i*)&dst[8*j],   _mm_unpacklo_epi16( res_lo, res_hi ) );
        _mm_storeu_si128( (__m128i*)&dst[8*j+4], _mm_unpackhi_epi16( res_lo, res_hi ) );
    }
    return 0;
}

The intrinsic version is not faster than the C version. The problem is that if I remove the last two lines of the loop (marked in the code), _mm_storeu_si128((__m128i*)&dst[8*j], m2) and _mm_storeu_si128((__m128i*)&dst[8*j+4], m3), the intrinsic version runs significantly faster than the C version (4 times faster). Can anyone explain why this happens? Does _mm_storeu_si128() cost that much time? Thanks.

You're bottlenecked on memory bandwidth if it's the same speed as the C version. In that case, yes, storing to memory is the expensive thing in your algorithm.

Or perhaps the compiler optimizes away a lot of the code when the results aren't stored anywhere! You'd have to look at the asm to make sure it only left out the store instructions, instead of optimizing away most of the function.

See http://agner.org/optimize/, and the other links at https://stackoverflow.com/tags/x86/info (esp. Ulrich Drepper's paper on caches).

Look into cache blocking, aka loop tiling.


Comments