c - How can I optimize my AVX implementation of a dot product?


I've tried to implement the dot product of two arrays using AVX, following https://stackoverflow.com/a/10459028, but my code is slow.

a and xb are arrays of doubles, and n is the number of elements. Can you help me?

const int mask = 0x31;
int sum =0;

for (int i = 0; i < n; i++) {
    int ind = i;
    if (i + 8 > n) // padding
    {
        sum += a[ind] * xb[i].x;
        i++;
        ind = n * j + i;
        sum += a[ind] * xb[i].x;
        continue;
    }

    __declspec(align(32)) double ar[4] = { xb[i].x, xb[i + 1].x, xb[i + 2].x, xb[i + 3].x };
    __m256d x = _mm256_loadu_pd(&a[ind]);
    __m256d y = _mm256_load_pd(ar);
    i+=4; ind = n * j + i;
    __declspec(align(32)) double arr[4] = { xb[i].x, xb[i + 1].x, xb[i + 2].x, xb[i + 3].x };
    __m256d z = _mm256_loadu_pd(&a[ind]);
    __m256d w = _mm256_load_pd(arr);

    __m256d xy = _mm256_mul_pd(x, y);
    __m256d zw = _mm256_mul_pd(z, w);
    __m256d temp = _mm256_hadd_pd(xy, zw);
    __m128d hi128 = _mm256_extractf128_pd(temp, 1);
    __m128d low128 = _mm256_extractf128_pd(temp, 0);
    //__m128d dotproduct = _mm_add_pd((__m128d)temp, hi128);
    __m128d dotproduct = _mm_add_pd(low128, hi128);

    sum += dotproduct.m128d_f64[0]+dotproduct.m128d_f64[1];
    i += 3;
}

There are two big inefficiencies in your loop that are immediately apparent:

(1) these 2 chunks of scalar code:

__declspec(align(32)) double ar[4] = { xb[i].x, xb[i + 1].x, xb[i + 2].x, xb[i + 3].x }; ... __m256d y = _mm256_load_pd(ar); 

and

__declspec(align(32)) double arr[4] = { xb[i].x, xb[i + 1].x, xb[i + 2].x, xb[i + 3].x }; ... __m256d w = _mm256_load_pd(arr); 

should be implemented using SIMD loads and shuffles (or at the very least use _mm256_set_pd and give the compiler a chance to do a half-reasonable job of generating code for a gathered load).

(2) the horizontal summation at the end of the loop:

for (int i = 0; i < n; i++) {
    ...
    __m256d xy = _mm256_mul_pd(x, y);
    __m256d zw = _mm256_mul_pd(z, w);
    __m256d temp = _mm256_hadd_pd(xy, zw);
    __m128d hi128 = _mm256_extractf128_pd(temp, 1);
    __m128d low128 = _mm256_extractf128_pd(temp, 0);
    //__m128d dotproduct = _mm_add_pd((__m128d)temp, hi128);
    __m128d dotproduct = _mm_add_pd(low128, hi128);

    sum += dotproduct.m128d_f64[0]+dotproduct.m128d_f64[1];
    i += 3;
}

should be moved out of the loop:

__m256d xy = _mm256_setzero_pd();
__m256d zw = _mm256_setzero_pd();
...
for (int i = 0; i < n; i++) {
    ...
    xy = _mm256_add_pd(xy, _mm256_mul_pd(x, y));
    zw = _mm256_add_pd(zw, _mm256_mul_pd(z, w));
    i += 3;
}
__m256d temp = _mm256_hadd_pd(xy, zw);
__m128d hi128 = _mm256_extractf128_pd(temp, 1);
__m128d low128 = _mm256_extractf128_pd(temp, 0);
//__m128d dotproduct = _mm_add_pd((__m128d)temp, hi128);
__m128d dotproduct = _mm_add_pd(low128, hi128);

sum += dotproduct.m128d_f64[0]+dotproduct.m128d_f64[1];

Comments

Popular posts from this blog

angularjs - ADAL JS Angular- WebAPI add a new role claim to the token -

php - CakePHP HttpSockets send array of paramms -

node.js - Using Node without global install -