- 论坛徽章:
- 0
|
代码如下:
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <xmmintrin.h>
using namespace std;
typedef union{
__m128 m;
float f[4];
} sseTemp;
float dotWithSSE(float *a, float *b, const int len){
int step = len/4;
__m128* one = (__m128*)a;
__m128* two = (__m128*)b;
if(NULL == one || NULL == two){
printf("%d\n", __LINE__);
exit(1);
}
__m128 result = _mm_setzero_ps();
__m128 temp = _mm_set_ps(0.1f, 0.1f, 0.1f, 0.1f);
for(int i = 0; i < step -1; i++){
temp = _mm_add_ps(*one, *two);
temp = _mm_mul_ps(temp, temp);
result = _mm_add_ps(result, temp);
}
sseTemp s;
s.m = result;
float r = 0.0f;
r += s.f[0] + s.f[1] + s.f[2] + s.f[3];
if(0 != (len & 3)){
for(int i = 4*step; i < len; i++){
r += (a[i] - b[i])*(a[i] - b[i]);
}
}
return r;
}
float dotWithoutSSE(float* __restrict__ a, float* __restrict__ b, const int len){
float result = 0.0f, temp;
for(int i = 0; i < len; i++){
temp = (a[i] - b[i]);
result += temp*temp;
}
return result;
}
int runTest(const int len){
float *a, *b;
a = (float*) malloc(sizeof(float)*len);
b = (float*) malloc(sizeof(float)*len);
if(NULL == a || NULL == b){
printf("malloc fail\n");
exit(1);
}
for(int i = 0; i < len; i++){
a[i] = 1.0f*rand()/RAND_MAX;
b[i] = 1.0f*rand()/RAND_MAX;
}
clock_t start = clock();
float sse = dotWithSSE(a, b, len);
clock_t end = clock();
clock_t timeSSE = end - start;
start = clock();
float withoutSSE = dotWithoutSSE(a, b, len);
end = clock();
clock_t timeWithoutSSE = end - start;
if(fabs(sse - withoutSSE) < 0.00001){
printf("test pass\n");
}
printf("withSSE = %f, wihtoutSSE = %f", sse, withoutSSE);
free(a);
free(b);
}
int main(){
runTest(1000);
return 0;
}
使用gcc测试时,代码 temp = _mm_add_ps(*one, *two);越界,这另我百思不得其解,先谢谢了! |
|