|
|

楼主 |
发表于 2006-11-13 17:58:00
|
显示全部楼层
Re: 懂SSE请进,为什么老是运行报错?(思路是正确的)
typedef struct VECTOR3D_TYP
{
union
{
__declspec(align(16)) float M[4];
struct
{
float x,y,z,w;
};
};
} VECTOR3D, POINT3D, *VECTOR3D_PTR, *POINT3D_PTR;
typedef struct MATRIX4X4_TYP
{
union
{
__declspec(align(16)) float M[4][4];
struct
{
float M00, M01, M02, M03;
float M10, M11, M12, M13;
float M20, M21, M22, M23;
float M30, M31, M32, M33;
};
};
} MATRIX4X4, *MATRIX4X4_PTR;
inline void VECTOR3D_INIT(VECTOR3D &vec, float x, float y, float z)
{
vec.x = x;
vec.y = y;
vec.z = z;
vec.w = 1;
}
void Mat_Mul_VECTOR3D_4X4_SSE(const VECTOR3D &vec,const MATRIX4X4 &mat, VECTOR3D &dst_vec)
{
__asm
{
mov eax, vec;
movaps xmm0, [eax];
movaps xmm4, xmm0;
shufps xmm4, xmm0, 0x00;//xmm4 = {x,x,x,x}
movaps xmm5, xmm0;
shufps xmm5, xmm0, 0x55;//xmm5 = {y,y,y,y}
movaps xmm6, xmm0;
shufps xmm6, xmm0, 0xAA;//xmm6 = {z,z,z,z}
movaps xmm7, xmm0;
shufps xmm7, xmm0, 0xFF;//xmm7 = {w,w,w,w}
mov esi, mat;
mulps xmm4, [esi];
mulps xmm5, [esi + 16];
mulps xmm6, [esi + 32];
mulps xmm7, [esi + 48];
xorps xmm0, xmm0;
addps xmm0, xmm4;
addps xmm0, xmm5;
addps xmm0, xmm6;
addps xmm0, xmm7;
mov edi, dst_vec;
movaps [edi], xmm0;
}
}
//////////////////////////////////////////////////////////////////////////
//SSE-Debug Code/////////////////////////////////////////
VECTOR3D vec3d = {1,2,3,4},vec2,vec1,vec[3];
VECTOR3D_PTR vec_ptr = new VECTOR3D[3];
MATRIX4X4 mat = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
VECTOR3D_INIT(vec[0], 1, 2, 3);
VECTOR3D_INIT(vec_ptr[0], 1, 2, 3);
Mat_Mul_VECTOR3D_4X4_SSE(vec3d, mat, vec1);//ok
Mat_Mul_VECTOR3D_4X4_SSE(vec[0], mat, vec1);//ok
Mat_Mul_VECTOR3D_4X4_SSE(vec_ptr[0], mat, vec1);//err 0xC0000005: 读取位置 0xffffffff 时发生访问冲突 。
注意!__declspec(align(16)) 进行16字对齐即可
另外由于C++使用new动态分配的内存不是16字对齐的,所以抱错!需要自己写内存分配和释放的函数,进行16字对齐即可,测试通过了,没有问题
比较郁闷的就是测试发现我这个代码,好象跟C++的代码速度差不多,哪个大哥能够解释一下啊,小弟谢了先
|
|