|
|
看到一份 C++ 代码中内联了一段汇编,用来做 4×4 矩阵乘法。怎么看都觉得像是优化过的,但我弄不明白:这段代码是标准的 float 4×4 矩阵乘法的 SSE 优化代码,还是反汇编之后复制进来的,还是手工优化的,或者是别的什么?如果是手工优化的,优化的策略又是什么?
代码如下,注释是我自己看代码的时候加的:
;-----------------------------------------------------------------------
; 4x4 single-precision matrix product using SSE:  d = m * n
;
; Syntax: Intel operand order, MASM / MSVC inline-asm style, 32-bit x86.
; In:     src1 -> m, src2 -> n, dest -> d. These names come from the
;         enclosing C++ code, which is not shown here; each is presumably
;         a pointer to 16 contiguous floats -- confirm against the caller.
; Layout: row i of a matrix sits at byte offset 10h*(i-1); element mIJ
;         sits at byte offset 10h*(I-1) + 4*(J-1).
; Out:    row i of d = ((mi1*N1 + mi2*N2) + mi3*N3) + mi4*N4
;         where Nk is row k of n as a 4-float vector (nk1 nk2 nk3 nk4)
;         and mIJ*Nk is scalar-times-vector.
; Regs:   edx = m, ecx = n, eax = d
;         xmm1 = N1, xmm3 = N2, xmm4 = N3, xmm5 = N4 (loaded once and
;         never overwritten afterwards)
;         xmm0, xmm2, xmm6, xmm7 = broadcast scalars / row accumulators
; Clobb:  eax, ecx, edx, xmm0-xmm7
; Align:  every 16-byte access is movups or a movlps/movhps pair, so no
;         16-byte alignment of m, n or d is required.
; Alias:  all four rows of n are in registers before the first store,
;         and every element of row i of m is loaded before row i of d is
;         stored, so dest == src1 or dest == src2 gives the same result.
;         Partially overlapping buffers are not covered by this.
; Order:  loads and broadcasts for the next row are interleaved with the
;         tail of the current row -- presumably to hide instruction
;         latency. Do not reorder without re-checking register
;         lifetimes.
;-----------------------------------------------------------------------
mov     edx, src1                       ; edx = m
mov     eax, dest                       ; eax = d
mov     ecx, src2                       ; ecx = n

; ---- row 1 of d; the four rows of n are loaded along the way ----------
movss   xmm0, dword ptr [edx]           ; xmm0[0] = m11, upper lanes zeroed
movups  xmm1, xmmword ptr [ecx]         ; xmm1 = N1
shufps  xmm0, xmm0, 0                   ; broadcast lane 0: xmm0 = m11 x4
movss   xmm2, dword ptr [edx+4]         ; xmm2[0] = m12
mulps   xmm0, xmm1                      ; xmm0 = m11*N1
shufps  xmm2, xmm2, 0                   ; xmm2 = m12 x4
movups  xmm3, xmmword ptr [ecx+10h]     ; xmm3 = N2
movss   xmm7, dword ptr [edx+8]         ; xmm7[0] = m13
mulps   xmm2, xmm3                      ; xmm2 = m12*N2
shufps  xmm7, xmm7, 0                   ; xmm7 = m13 x4
addps   xmm0, xmm2                      ; xmm0 = m11*N1 + m12*N2
movups  xmm4, xmmword ptr [ecx+20h]     ; xmm4 = N3
movss   xmm2, dword ptr [edx+0Ch]       ; xmm2[0] = m14
mulps   xmm7, xmm4                      ; xmm7 = m13*N3
shufps  xmm2, xmm2, 0                   ; xmm2 = m14 x4
addps   xmm0, xmm7                      ; xmm0 = m11*N1 + m12*N2 + m13*N3
movups  xmm5, xmmword ptr [ecx+30h]     ; xmm5 = N4
movss   xmm6, dword ptr [edx+10h]       ; xmm6[0] = m21 (row 2 starts early)
mulps   xmm2, xmm5                      ; xmm2 = m14*N4
movss   xmm7, dword ptr [edx+14h]       ; xmm7[0] = m22
shufps  xmm6, xmm6, 0                   ; xmm6 = m21 x4
addps   xmm0, xmm2                      ; xmm0 = row 1 of d (all four terms)
shufps  xmm7, xmm7, 0                   ; xmm7 = m22 x4
movlps  qword ptr [eax], xmm0           ; d11 d12 = xmm0[0] xmm0[1]
movhps  qword ptr [eax+8], xmm0         ; d13 d14 = xmm0[2] xmm0[3]

; ---- row 2 of d -------------------------------------------------------
mulps   xmm7, xmm3                      ; xmm7 = m22*N2
movss   xmm0, dword ptr [edx+18h]       ; xmm0[0] = m23
mulps   xmm6, xmm1                      ; xmm6 = m21*N1
shufps  xmm0, xmm0, 0                   ; xmm0 = m23 x4
addps   xmm6, xmm7                      ; xmm6 = m21*N1 + m22*N2
mulps   xmm0, xmm4                      ; xmm0 = m23*N3
movss   xmm2, dword ptr [edx+24h]       ; xmm2[0] = m32 (row 3 starts early)
addps   xmm6, xmm0                      ; xmm6 = m21*N1 + m22*N2 + m23*N3
movss   xmm0, dword ptr [edx+1Ch]       ; xmm0[0] = m24
movss   xmm7, dword ptr [edx+20h]       ; xmm7[0] = m31
shufps  xmm0, xmm0, 0                   ; xmm0 = m24 x4
shufps  xmm7, xmm7, 0                   ; xmm7 = m31 x4
mulps   xmm0, xmm5                      ; xmm0 = m24*N4
mulps   xmm7, xmm1                      ; xmm7 = m31*N1
addps   xmm6, xmm0                      ; xmm6 = row 2 of d (all four terms)
shufps  xmm2, xmm2, 0                   ; xmm2 = m32 x4
movlps  qword ptr [eax+10h], xmm6       ; d21 d22 = xmm6[0] xmm6[1]
movhps  qword ptr [eax+18h], xmm6       ; d23 d24 = xmm6[2] xmm6[3]

; ---- row 3 of d -------------------------------------------------------
mulps   xmm2, xmm3                      ; xmm2 = m32*N2
movss   xmm6, dword ptr [edx+28h]       ; xmm6[0] = m33
addps   xmm7, xmm2                      ; xmm7 = m31*N1 + m32*N2
shufps  xmm6, xmm6, 0                   ; xmm6 = m33 x4
movss   xmm2, dword ptr [edx+2Ch]       ; xmm2[0] = m34
mulps   xmm6, xmm4                      ; xmm6 = m33*N3
shufps  xmm2, xmm2, 0                   ; xmm2 = m34 x4
addps   xmm7, xmm6                      ; xmm7 = m31*N1 + m32*N2 + m33*N3
mulps   xmm2, xmm5                      ; xmm2 = m34*N4
movss   xmm0, dword ptr [edx+34h]       ; xmm0[0] = m42 (row 4 starts early)
addps   xmm7, xmm2                      ; xmm7 = row 3 of d (all four terms)
shufps  xmm0, xmm0, 0                   ; xmm0 = m42 x4
movlps  qword ptr [eax+20h], xmm7       ; d31 d32 = xmm7[0] xmm7[1]
movss   xmm2, dword ptr [edx+30h]       ; xmm2[0] = m41
movhps  qword ptr [eax+28h], xmm7       ; d33 d34 = xmm7[2] xmm7[3]

; ---- row 4 of d -------------------------------------------------------
mulps   xmm0, xmm3                      ; xmm0 = m42*N2
shufps  xmm2, xmm2, 0                   ; xmm2 = m41 x4
movss   xmm6, dword ptr [edx+38h]       ; xmm6[0] = m43
mulps   xmm2, xmm1                      ; xmm2 = m41*N1
shufps  xmm6, xmm6, 0                   ; xmm6 = m43 x4
addps   xmm2, xmm0                      ; xmm2 = m41*N1 + m42*N2
mulps   xmm6, xmm4                      ; xmm6 = m43*N3
movss   xmm7, dword ptr [edx+3Ch]       ; xmm7[0] = m44
shufps  xmm7, xmm7, 0                   ; xmm7 = m44 x4
addps   xmm2, xmm6                      ; xmm2 = m41*N1 + m42*N2 + m43*N3
mulps   xmm7, xmm5                      ; xmm7 = m44*N4
addps   xmm2, xmm7                      ; xmm2 = row 4 of d (all four terms)
movups  xmmword ptr [eax+30h], xmm2     ; d41 d42 d43 d44 = xmm2[0..3] |
|