几个用内联汇编写的数学运算函数

oz01 · 发表于 2010-5-28 18:57:00

// Quaternion normalization.
//
// Writes QuatIn / |QuatIn| to *QuatOut, where |QuatIn| = sqrt(x*x + y*y + z*z + w*w).
//
// Implemented with SSE intrinsics instead of an MSVC `_asm` block so that it
// also builds for x64 and with non-MSVC compilers; the arithmetic (operations
// and their order) is the same as in the inline-assembly version.
//
// QuatOut : destination; written with an unaligned store, so it does not have
//           to be 16-byte aligned.
// QuatIn  : quaternion to normalize (lanes 0..3 = x, y, z, w).
//
// A zero quaternion is not special-cased: every lane becomes 0/0 = NaN.
inline void Easy_quat_normalize(__m128 * QuatOut,const __m128 QuatIn)
{
    // Per-lane squares: (x*x, y*y, z*z, w*w).
    const __m128 squares = _mm_mul_ps(QuatIn, QuatIn);

    // Broadcast each square to all four lanes and accumulate them in
    // x, y, z, w order, so every lane ends up holding the squared length.
    __m128 lengthSq = _mm_shuffle_ps(squares, squares, 0x00);
    lengthSq = _mm_add_ps(lengthSq, _mm_shuffle_ps(squares, squares, 0x55));
    lengthSq = _mm_add_ps(lengthSq, _mm_shuffle_ps(squares, squares, 0xaa));
    lengthSq = _mm_add_ps(lengthSq, _mm_shuffle_ps(squares, squares, 0xff));

    // Full-precision sqrt and divide (not the rsqrt approximation).
    const __m128 length = _mm_sqrt_ps(lengthSq);
    _mm_storeu_ps((float *)QuatOut, _mm_div_ps(QuatIn, length));
}

// Quaternion -> matrix conversion: helper macros and constants.

// Prefixes declaration x with the MSVC 16-byte alignment attribute.
#define ALIGN16( x ) __declspec(align(16)) x

// Declare a 16-byte aligned static 4-element array X (X = "type name"),
// filled with one value (INIT1) or four values (INIT4).
// Neither macro is used in the visible part of this file.
#define ALIGN4_INIT1( X, I ) ALIGN16( static X[4] = { I, I, I, I } )
#define ALIGN4_INIT4( X, I0,I1,I2,I3 ) ALIGN16( static X[4] = { I0,I1,I2,I3 } )
// Builds the 8-bit shufps/pshufd immediate: x is the source-lane index for
// result lane 0 (bits 1:0), y for lane 1, z for lane 2, w for lane 3 (bits 7:6).
#define R_SHUFFLE_PS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
#define R_SHUFFLE_D R_SHUFFLE_PS
// Bit patterns used as xorps masks: all-zero leaves a lane unchanged, bit 31
// set flips the lane's sign.
// NOTE(review): `1 << 31` is evaluated as int and the masks are stored as
// unsigned long; this assumes both are 32 bits wide (as on MSVC) - confirm
// before building with another compiler.
#define IEEE_SP_ZERO 0
#define IEEE_SP_SIGN ((unsigned long) ( 1 << 31 ))
// Per-lane sign masks (lanes 0..3) applied with xorps in Easy_quat_to_matrix.
// x0 and x2 hold the same values: keep lane 0, negate lanes 1..3.
// x1 negates lanes 0, 2 and 3 and keeps lane 1.
__declspec(align(16)) static const unsigned long SIMD_SP_quat2mat_x0[4] ={IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_SIGN, IEEE_SP_SIGN};
__declspec(align(16)) static const unsigned long SIMD_SP_quat2mat_x1[4] ={IEEE_SP_SIGN, IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_SIGN};
__declspec(align(16)) static const unsigned long SIMD_SP_quat2mat_x2[4] ={IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_SIGN, IEEE_SP_SIGN};
// _mm_set_ps takes its arguments from the highest lane down to lane 0, so the
// lane 0..3 contents are the reverse of the argument lists below.
// SIMD_SP_one  = (1, 0, 0, 0) in lanes 0..3; not referenced in the visible code.
static const __m128 SIMD_SP_one = _mm_set_ps(0.0f,0.0f,0.0f,1.0f);
// SIMD_SP_one1 = (1, 1, 1, 1); only lane 0 is read (by movss/addss).
static const __m128 SIMD_SP_one1 = _mm_set_ps(1.0f,1.0f,1.0f,1.0f);
// SIMD_SP_one3 = (0, 0, 0, 1) in lanes 0..3; supplies the fourth column and
// the last row of the matrix written by Easy_quat_to_matrix.
static const __m128 SIMD_SP_one3 = _mm_set_ps(1.0f,0.0f,0.0f,0.0f);
// SSE_ONE = (1, 1, 1, 1); same value as SIMD_SP_one1, not referenced in the visible code.
static const __m128 SSE_ONE = _mm_set_ps(1.0f,1.0f,1.0f,1.0f);
// Builds a 4x4 matrix from a quaternion (MSVC inline assembly, SSE + SSE2 pshufd).
//
// With QuatIn = (x, y, z, w) in lanes 0..3 and the notation ab2 = 2*a*b,
// the four stores write:
//   MatrixOut[0] = ( 1-yy2-zz2,  xy2+wz2,    xz2-wy2,    0 )
//   MatrixOut[1] = ( xy2-wz2,    1-xx2-zz2,  yz2+wx2,    0 )
//   MatrixOut[2] = ( xz2+wy2,    yz2-wx2,    1-xx2-yy2,  0 )
//   MatrixOut[3] = ( 0,          0,          0,          1 )
//
// MatrixOut : destination rows, written with movaps, so the array must be
//             16-byte aligned.
// QuatIn    : source quaternion; it is not normalized or checked here.
//
// In the register comments "t.x, t.y, t.z, t.w" are the four lanes of xmm6,
// which is loaded from SIMD_SP_one3 = (0, 0, 0, 1).
// Overwrites eax and xmm0-xmm7.
inline void Easy_quat_to_matrix(__m128 MatrixOut[4],const __m128 QuatIn)
{
_asm
{
// PUSHAD
mov eax,MatrixOut
movaps xmm0,QuatIn
movaps xmm6,SIMD_SP_one3 // xmm6 = t = 0, 0, 0, 1

movaps xmm1, xmm0 // xmm1 = x, y, z, w
addps xmm1, xmm1 // xmm1 = x2, y2, z2, w2

pshufd xmm2, xmm0, R_SHUFFLE_D( 1, 0, 0, 1 ) // xmm2 = y, x, x, y
pshufd xmm3, xmm1, R_SHUFFLE_D( 1, 1, 2, 2 ) // xmm3 = y2, y2, z2, z2
mulps xmm2, xmm3 // xmm2 = yy2, xy2, xz2, yz2

pshufd xmm4, xmm0, R_SHUFFLE_D( 2, 3, 3, 3 ) // xmm4 = z, w, w, w
pshufd xmm5, xmm1, R_SHUFFLE_D( 2, 2, 1, 0 ) // xmm5 = z2, z2, y2, x2
mulps xmm4, xmm5 // xmm4 = zz2, wz2, wy2, wx2

mulss xmm0, xmm1 // xmm0 = xx2, y, z, w (scalar multiply: only lane 0 changes, and only lane 0 is read below)

// calculate the last two elements of the third row

movss xmm7, SIMD_SP_one1 // xmm7 = 1, 0, 0, 0 (movss from memory clears lanes 1..3)
subss xmm7, xmm0 // xmm7 = -xx2+1, 0, 0, 0
subss xmm7, xmm2 // xmm7 = -xx2-yy2+1, 0, 0, 0
shufps xmm7, xmm6, R_SHUFFLE_PS( 0, 1, 2, 3 ) // xmm7 = -xx2-yy2+1, 0, t.z, t.w

// calculate first row

xorps xmm2, SIMD_SP_quat2mat_x0 // xmm2 = yy2, -xy2, -xz2, -yz2
xorps xmm4, SIMD_SP_quat2mat_x1 // xmm4 = -zz2, wz2, -wy2, -wx2
addss xmm4, SIMD_SP_one1 // xmm4 = -zz2+1, wz2, -wy2, -wx2
movaps xmm3, xmm4 // xmm3 = -zz2+1, wz2, -wy2, -wx2
subps xmm3, xmm2 // xmm3 = -yy2-zz2+1, xy2+wz2, xz2-wy2, yz2-wx2
movaps [eax+0*16+0*4], xmm3 // row0 = -yy2-zz2+1, xy2+wz2, xz2-wy2, yz2-wx2
movss [eax+0*16+3*4], xmm6 // row0 = -yy2-zz2+1, xy2+wz2, xz2-wy2, t.x (lane 3 overwritten with 0)

// calculate second row

movss xmm2, xmm0 // xmm2 = xx2, -xy2, -xz2, -yz2
xorps xmm4, SIMD_SP_quat2mat_x2 // xmm4 = -zz2+1, -wz2, wy2, wx2
subps xmm4, xmm2 // xmm4 = -xx2-zz2+1, xy2-wz2, xz2+wy2, yz2+wx2
shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 2, 3, 0 ) // xmm6 = t.y, t.z, t.w, t.x
shufps xmm4, xmm4, R_SHUFFLE_PS( 1, 0, 3, 2 ) // xmm4 = xy2-wz2, -xx2-zz2+1, yz2+wx2, xz2+wy2
movaps [eax+1*16+0*4], xmm4 // row1 = xy2-wz2, -xx2-zz2+1, yz2+wx2, xz2+wy2
movss [eax+1*16+3*4], xmm6 // row1 = xy2-wz2, -xx2-zz2+1, yz2+wx2, t.y (lane 3 overwritten with 0)

// calculate third row

movhlps xmm3, xmm4 // xmm3 = yz2+wx2, xz2+wy2, xz2-wy2, yz2-wx2
shufps xmm3, xmm7, R_SHUFFLE_PS( 1, 3, 0, 2 ) // xmm3 = xz2+wy2, yz2-wx2, -xx2-yy2+1, t.z
movaps [eax+2*16+0*4], xmm3 // row2 = xz2+wy2, yz2-wx2, -xx2-yy2+1, t.z
// fourth row: xmm6 was rotated above, so the constant is loaded again
movaps xmm6,SIMD_SP_one3 // xmm6 = 0, 0, 0, 1
movaps [eax+3*16+0*4], xmm6 // row3 = 0, 0, 0, 1

// POPAD
}
}

// Quaternion multiplication (MSVC inline assembly, SSE).
//
// With a = QuatIn1, b = QuatIn2 and S(v, k) meaning "shufps v, v, k", the
// code computes lane-wise:
//   *QuatOut = ( a * S(b, swiz1m) - S(a, swiz2m) * S(b, swiz3m) )
//            + ( ( S(a, swiz6m) * S(b, swiz7m) ) ^ quat_mask
//              + ( S(a, swiz4m) * S(b, swiz5m) ) ^ quat_mask )
//
// swiz1m..swiz7m (shuffle immediates) and quat_mask (xorps operand) are not
// defined in the visible part of this file, so the lane order and sign
// convention of the product cannot be confirmed from here.
// NOTE(review): presumably quat_mask flips the sign of selected lanes - verify
// against its definition.
//
// QuatOut is written with movaps, so it must be 16-byte aligned.
// Overwrites eax, xmm0 and xmm4-xmm7.

inline void Easy_quat_Mult(__m128 * QuatOut,const __m128 QuatIn1,const __m128 QuatIn2)
{
_asm
{
movaps xmm6,QuatIn1 // xmm6 = a
movaps xmm7,QuatIn2 // xmm7 = b

// first half: mul1 - mul2, accumulated in xmm4
movaps xmm4,xmm6
movaps xmm0,xmm7
shufps xmm0,xmm0,swiz1m
mulps  xmm4,xmm0//__m128 mul1=_mm_mul_ps(a,swiz1);
movaps xmm5,xmm6
movaps xmm0,xmm7
shufps xmm5,xmm5,swiz2m
shufps xmm0,xmm0,swiz3m
mulps  xmm5,xmm0//__m128 mul2=_mm_mul_ps(swiz2,swiz3);
subps  xmm4,xmm5//__m128 retVal=_mm_sub_ps(mul1,mul2);

// second half: two more products, each xor-ed with quat_mask, then summed in xmm5
movaps xmm5,xmm6
movaps xmm0,xmm7
shufps xmm5,xmm5,swiz6m
shufps xmm0,xmm0,swiz7m
mulps  xmm5,xmm0//__m128 mul4=_mm_mul_ps(swiz6,swiz7);
// a and b are no longer needed unshuffled, so xmm6/xmm7 are shuffled in place
shufps xmm6,xmm6,swiz4m
shufps xmm7,xmm7,swiz5m
mulps  xmm6,xmm7//__m128 mul3=_mm_mul_ps(swiz4,swiz5);
xorps  xmm5,quat_mask//__m128 flip1=_mm_xor_ps(mul4,quat_mask);
xorps  xmm6,quat_mask//__m128 flip2=_mm_xor_ps(mul3,quat_mask);
addps  xmm5,xmm6//__m128 retVal2=_mm_add_ps(flip1,flip2);

// result = retVal + retVal2
addps  xmm4,xmm5
mov eax,QuatOut
movaps [eax] , xmm4
}
}

oz01 · 发表于 2010-5-28 19:00:00

// 4x4 matrix multiplication.
//
// Each matrix is four __m128 rows. For every row i:
//   MatrixOut[i] = in2[i].x * in1[0] + in2[i].y * in1[1]
//                + in2[i].z * in1[2] + in2[i].w * in1[3]
// i.e. MatrixOut = in2 * in1 when the __m128 values are read as matrix rows.
//
// Implemented with SSE intrinsics and a single row loop instead of four
// copy-pasted MSVC `_asm` kernels, so it also builds for x64 and with
// non-MSVC compilers; the per-lane operations and their order are unchanged.
//
// MatrixOut : destination rows (16-byte aligned, as required by __m128).
// in1, in2  : source matrices. MatrixOut may be the same array as in1 or in2:
//             in1 is copied before the first store, and each row of in2 is
//             read before the matching output row is written.
inline void Easy_matrix_mult(__m128 MatrixOut[4], const __m128 in1[4], const __m128 in2[4])
{
    // Copy every row of in1 up front so later stores cannot disturb them.
    const __m128 a0 = in1[0];
    const __m128 a1 = in1[1];
    const __m128 a2 = in1[2];
    const __m128 a3 = in1[3];

    for (int row = 0; row < 4; ++row)
    {
        const __m128 b = in2[row];

        // Broadcast each component of the in2 row and scale the matching in1
        // row; the sum is accumulated in x, y, z, w order.
        __m128 acc = _mm_mul_ps(_mm_shuffle_ps(b, b, 0x00), a0);
        acc = _mm_add_ps(acc, _mm_mul_ps(_mm_shuffle_ps(b, b, 0x55), a1));
        acc = _mm_add_ps(acc, _mm_mul_ps(_mm_shuffle_ps(b, b, 0xaa), a2));
        acc = _mm_add_ps(acc, _mm_mul_ps(_mm_shuffle_ps(b, b, 0xff), a3));

        MatrixOut[row] = acc;
    }
}

// Multiplies a 4-component vector by a 4x4 matrix stored as four __m128 rows:
//   *vOut = vIN.x * MatrixIn[0] + vIN.y * MatrixIn[1]
//         + vIN.z * MatrixIn[2] + vIN.w * MatrixIn[3]
//
// Implemented with SSE intrinsics instead of an MSVC `_asm` block so that it
// also builds for x64 and with non-MSVC compilers; the per-lane operations and
// their order are unchanged.
//
// vOut     : destination (16-byte aligned, as required by __m128). It is
//            written only after all matrix rows have been read, so it may
//            point at one of the rows of MatrixIn.
// MatrixIn : the four matrix rows.
// vIN      : vector whose lanes 0..3 (x, y, z, w) weight rows 0..3.
inline void Easy_matrix_mult_vector4X4(__m128 * vOut ,const __m128 MatrixIn[4],const __m128 vIN)
{
    // Broadcast each vector component across a full register.
    const __m128 xxxx = _mm_shuffle_ps(vIN, vIN, 0x00);
    const __m128 yyyy = _mm_shuffle_ps(vIN, vIN, 0x55);
    const __m128 zzzz = _mm_shuffle_ps(vIN, vIN, 0xaa);
    const __m128 wwww = _mm_shuffle_ps(vIN, vIN, 0xff);

    // Weighted sum of the rows, accumulated in x, y, z, w order.
    __m128 sum = _mm_mul_ps(xxxx, MatrixIn[0]);
    sum = _mm_add_ps(sum, _mm_mul_ps(yyyy, MatrixIn[1]));
    sum = _mm_add_ps(sum, _mm_mul_ps(zzzz, MatrixIn[2]));
    sum = _mm_add_ps(sum, _mm_mul_ps(wwww, MatrixIn[3]));

    *vOut = sum;
}

instemast · 发表于 2010-5-28 19:03:00

LZ辛苦！

fanatic · 发表于 2010-5-28 22:18:00

dx sdk里都有独立于dx开源的sse/sse2数学库了
不知道lz单贴几个函数出来有什么意思......

oz01 · 发表于 2010-5-28 22:22:00

fanatic: Re:几个用内联汇编写的数学运算函数

dx sdk里都有独立于dx开源的sse/sse2数学库了
不知道lz单贴几个函数出来有什么意思......

你把那东西拿来用一下就明白了

fanatic · 发表于 2010-5-28 22:26:00

人家开源的目的就是让你可以自己优化
而且也说了哪些地方可以优化，应该怎么优化

账号		自动登录	找回密码
密码			立即注册

几个用内联汇编写的数学运算函数

Re:几个用内联汇编写的数学运算函数

Re:几个用内联汇编写的数学运算函数

Re:几个用内联汇编写的数学运算函数

Re: Re:几个用内联汇编写的数学运算函数

Re:几个用内联汇编写的数学运算函数