|

楼主 |
发表于 2011-1-1 16:24:00
|
显示全部楼层
Re: 求高速的位图拷贝函数
有个问题搞不太懂：我写了一个图像缩放函数，拷贝像素时有很多需要参与计算的量，不可能全部放在寄存器里，于是我把它们放在了内存里。为了避免缓存未命中，我特意开辟了 128 字节的空间。但不知道为什么，代码里加上 _asm nop 与不加相比，速度会有几十帧的差异，这是为什么？
// Scratch memory for vgSURFACE::Blt. Blt rounds the address of buf up to a
// 64-byte boundary and keeps its loop state at the byte offsets defined below,
// measured from that aligned address.
char buf[128];
// Byte offsets of the fields Blt keeps in the aligned scratch area.
#define X_SCALE_SH16 0 // 32-bit: horizontal step, (srt.w<<16)/drt.w
#define Y_SCALE_SH16 4 // 32-bit: vertical step, (srt.h<<16)/drt.h
#define SRC_PITCH 8 // 32-bit: value added to the source pointer to move down one row
#define DES_PITCH 12 // 32-bit: value added to the destination pointer after each finished row
#define SRC_Y_SH16 16 // 32-bit: running vertical accumulator, starts at 0
#define ESI_PUSH 20 // 32-bit: saved source pointer for the start of the current row
#define DES_W 24 // 16-bit: destination width (inner loop count)
#define DES_H 26 // 16-bit: destination rows remaining (outer loop count)
#define COLOR_KEY 28 // 16-bit: pixel value that is skipped instead of copied
// Stretch-blits 16-bit pixels from rectangle `srt` of this surface into
// rectangle `drt` of `des`, skipping every source pixel equal to ColorKey().
//
// Scaling is nearest-neighbour using 16.16 fixed-point steps
// ((srt.w<<16)/drt.w horizontally, (srt.h<<16)/drt.h vertically). After each
// destination pixel / row the source pointer advances by the integer part of
// the accumulated step, so both enlarging (step < 1.0) and shrinking
// (step >= 1.0) are handled.
//
// Pitch() is treated as a byte count: it is halved to index U16 pixels and
// added unscaled to the byte pointer when moving down a source row.
//
// The loop state lives in the global `buf` (64-byte aligned window, layout
// given by the X_SCALE_SH16 ... COLOR_KEY offsets), so concurrent calls would
// share that state.
//
// Returns without drawing when the destination width or height is outside
// 1..65535: zero would divide by zero and make the dec/jnz counters wrap, and
// the counters are stored in 16-bit slots.
void vgSURFACE::Blt(vgSURFACE &des, vgRECT &srt, vgRECT &drt)
{
	U32 des_w = (U32)drt.w;
	U32 des_h = (U32)drt.h;
	if (des_w == 0 || des_w > 0xFFFF || des_h == 0 || des_h > 0xFFFF)
		return;

	U16 *des_bits = (U16 *)des.Bits() + drt.x + drt.y * (des.Pitch()>>1);
	U16 *src_bits = (U16 *)Bits() + srt.x + srt.y * (Pitch()>>1);
	U32 x_scale_sh16 = (srt.w<<16) / drt.w;
	U32 y_scale_sh16 = (srt.h<<16) / drt.h;
	U32 src_pitch = Pitch();
	// Bytes from the end of one destination row to the start of the next.
	U32 des_pitch = des.Pitch() - (des_w<<1);
	U16 color_key = ColorKey();

	// One single _asm block: register contents (EAX in particular) are not
	// guaranteed to survive from one _asm block to the next.
	_asm
	{
		// eax = first 64-byte aligned address inside buf
		lea   eax, buf
		add   eax, 63
		and   eax, 0xffffffc0

		// Fill the scratch area.
		mov   ecx, x_scale_sh16
		mov   [eax+X_SCALE_SH16], ecx
		mov   ecx, y_scale_sh16
		mov   [eax+Y_SCALE_SH16], ecx
		mov   ecx, src_pitch
		mov   [eax+SRC_PITCH], ecx
		mov   ecx, des_pitch
		mov   [eax+DES_PITCH], ecx
		xor   ecx, ecx
		mov   [eax+SRC_Y_SH16], ecx         // vertical accumulator = 0
		mov   ecx, des_w
		mov   [eax+DES_W], cx
		mov   ecx, des_h
		mov   [eax+DES_H], cx
		mov   cx, color_key
		mov   [eax+COLOR_KEY], cx

		// esi = source pixel, edi = destination pixel, edx = rows remaining
		mov   esi, src_bits
		mov   edi, des_bits
		movzx edx, word ptr [eax+DES_H]

	row_loop:
		mov   [eax+DES_H], dx               // dx is reused for pixels below
		movzx ecx, word ptr [eax+DES_W]     // ecx = pixels remaining in row
		xor   ebx, ebx                      // ebx accumulates the source x (16.16)
		mov   [eax+ESI_PUSH], esi           // remember start of source row

	pixel_loop:
		mov   dx, [esi]
		cmp   dx, [eax+COLOR_KEY]
		jz    skip_pixel                    // colour-keyed: leave destination as is
		mov   [edi], dx
	skip_pixel:
		add   ebx, [eax+X_SCALE_SH16]
		add   edi, 2
		mov   edx, ebx
		shr   edx, 16                       // whole source pixels to advance
		and   ebx, 65535                    // keep only the fraction
		lea   esi, [esi+edx*2]              // 2 bytes per pixel; lea keeps flags untouched
		dec   ecx
		jnz   pixel_loop

		// Next row: restart from the saved row start, then step down by the
		// integer part of the vertical accumulator.
		mov   esi, [eax+ESI_PUSH]
		mov   edx, [eax+SRC_Y_SH16]
		add   edx, [eax+Y_SCALE_SH16]
		mov   ecx, edx
		shr   ecx, 16                       // whole source rows to advance
		and   edx, 65535
		mov   [eax+SRC_Y_SH16], edx
		imul  ecx, dword ptr [eax+SRC_PITCH]
		add   esi, ecx
		add   edi, [eax+DES_PITCH]
		movzx edx, word ptr [eax+DES_H]
		dec   edx
		jnz   row_loop
	}
}
|
|