我想利用sse指令计算一个4*4的float矩阵,然后写了一段内嵌汇编的c程序,结果编译没问题,运行时出了问题。
程序在输出“hello”之后就卡在那里不动了,请教各位是什么问题啊~~- #include <stdio.h>
- float a[4][4] __attribute__ ((aligned (16))) = {
- {1.0,2.0,3.0,4.0},
- {1.0,2.0,3.0,4.0},
- {1.0,2.0,3.0,4.0},
- {1.0,2.0,3.0,4.0}
- };
- float b[4][4] __attribute__ ((aligned (16))) = {
- {1.0,2.0,3.0,4.0},
- {1.0,2.0,3.0,4.0},
- {1.0,2.0,3.0,4.0},
- {1.0,2.0,3.0,4.0}
- };
- float c[4][4] __attribute__ ((aligned (16)));
- void matMul(float result[4][4],float left[4][4],float right[4][4])
- {
- printf("hello\n");
- __asm__ __volatile__
- (
- "movaps (%2),%%xmm4 \n\t"
- "movaps 16(%2),%%xmm5 \n\t"
- "movaps 32(%2),%%xmm6 \n\t"
- "movaps 48(%2),%%xmm7 \n\t"
- "movl $0,%%edi \n\t"
- "movl $4,%%ecx \n\t"
- "lab:\n\t" "movups (%1,%2,1),%%xmm0 \n\t"
- "movaps %%xmm0,%%xmm1 \n\t"
- "movaps %%xmm0,%%xmm2 \n\t"
- "movaps %%xmm0,%%xmm3 \n\t"
- "shufps $0x00,%%xmm0,%%xmm0 \n\t"
- "shufps $0x55,%%xmm1,%%xmm1 \n\t"
- "shufps $0xAA,%%xmm2,%%xmm2 \n\t"
- "shufps $0xFF,%%xmm3,%%xmm3 \n\t"
- "mulps %%xmm4,%%xmm0 \n\t"
- "mulps %%xmm5,%%xmm1 \n\t"
- "mulps %%xmm6,%%xmm2 \n\t"
- "mulps %%xmm7,%%xmm3 \n\t"
- "addps %%xmm0,%%xmm2 \n\t"
- "addps %%xmm1,%%xmm3 \n\t"
- "addps %%xmm2,%%xmm3 \n\t"
- "movups %%xmm3,(%0,%2,1) \n\t"
- "addl $16,%%edi \n\t"
- "loop lab \n\t"
- :"=d"(result)
- :"a"(left),"D"(right)
- :"%ecx"
- );
- }
- int main()
- {
- int i,j;
- matMul(c,a,b);
-
- for(i = 0; i<4; i++)
- {
- for(j = 0; j<4; j++)
- printf("%lf ",c[i][j]);
- printf("\n");
- }
- }
复制代码 |