|
|
我看《游戏编程指南》上说,DirectDraw 提供的带透明色的 BltFast() 此时的工作效率不容乐观,于是就想着自己写个函数试试。
暂时不考虑 RLE 压缩,心想至少用上 MMX 优化,效率应该不会很低吧;可是结果证明,自己写的函数根本没有 DirectDraw 的效率高。
程序贴出来,希望大家能帮忙看看问题出在哪里。检查部分去掉了,代码有点长。
void DDBltCKMMX(LPDIRECTDRAWSURFACE7 lpDDSDest, LPDIRECTDRAWSURFACE7 lpDDSSrc,
int iDestX, int iDestY, int iSrcX, int iSrcY, int W, int H)
{
DDSURFACEDESC2 ddsdSrc, ddsdDest;
RECT rcSrc, rcDest;
BYTE *lpbSrc, *lpbDest;
DWORD dwSrcQuad, dwDestQuad;
DWORD dwRemainder;
DWORD dwColorKey;
// 获取图像宽高
ddsdSrc.dwSize = sizeof(DDSURFACEDESC2);
ddsdSrc.dwFlags = DDSD_WIDTH | DDSD_HEIGHT;
lpDDSSrc->GetSurfaceDesc(&ddsdSrc);
ddsdDest.dwSize = sizeof(DDSURFACEDESC2);
ddsdDest.dwFlags = DDSD_WIDTH | DDSD_HEIGHT;
lpDDSDest->GetSurfaceDesc(&ddsdDest);
DDGetColorKey(lpDDSSrc, &dwColorKey);
// 锁定页面
ddsdSrc.dwSize = sizeof(DDSURFACEDESC2);
ddsdDest.dwSize = sizeof(DDSURFACEDESC2);
MakeRect(&rcSrc, iSrcX, iSrcY, W, H);
MakeRect(&rcDest, iDestX, iDestY, W, H);
// EnterCriticalSection(&gDDrawCS);
if(lpDDSSrc->Lock(&rcSrc, &ddsdSrc, DDLOCK_READONLY | DDLOCK_WAIT, NULL) != DD_OK){
//LeaveCriticalSection(&gDDrawCS);
return;
}
if(lpDDSDest->Lock(&rcDest, &ddsdDest, DDLOCK_WAIT, NULL) != DD_OK){
lpDDSSrc->Unlock(&rcSrc);
//LeaveCriticalSection(&gDDrawCS);
return;
}
// LeaveCriticalSection(&gDDrawCS);
// 执行复制操作
lpbSrc = (BYTE*)ddsdSrc.lpSurface;
lpbDest = (BYTE*)ddsdDest.lpSurface;
switch(ScreenMode){
case RGBMODE_555: // 555, 565代码相同
case RGBMODE_565: // 一次执行4个点
dwSrcQuad = ddsdSrc.lPitch - (W << 1);
dwDestQuad = ddsdDest.lPitch - (W << 1);
dwColorKey = (dwColorKey << 16) | dwColorKey;
dwRemainder = W & 0x03;
W = ((W - dwRemainder) >> 2);
__asm{
mov esi, lpbSrc;
mov edi, lpbDest;
mov ebx, dwColorKey;
movd mm0, ebx;
punpckldq mm0, mm0;// 关键色
mov eax, H;
loop_H16: // 高循环
mov ecx, W;
mov edx, dwRemainder;
loop_W16: // 宽循环
cmp ecx, 0;
jz loop_W_R16;
movq mm1, [esi];
movq mm2, [edi];
movq mm3, mm1;// save original source pixels
pcmpeqw mm3, mm0;// pixels equal to color key are set to 0xffff
pand mm2, mm3; // if not , 0x0000
pcmpeqw mm4, mm4;//mm4 设为全1
pandn mm3, mm4;// 关键色的点将为0
pand mm1, mm3; // 去除关键色
por mm2, mm1;
movq [edi], mm2;
add esi, 8;
add edi, 8;
dec ecx;
jmp loop_W16;
loop_W_R16:
cmp edx, 0;
jz loop_W_End16;
mov cx, [esi];
cmp cx, bx;
jz skip_16;
mov [edi], cx;
add esi, 2;
add edi, 2;
skip_16:
dec edx;
jmp loop_W_R16;
loop_W_End16:
add esi, dwSrcQuad;
add edi, dwDestQuad;
dec eax;
cmp eax, 0;
jnz loop_H16;
emms;
}
break;
case RGBMODE_32: // 32位颜色模式, 一次执行2个点
dwSrcQuad = ddsdSrc.lPitch - (W << 2);
dwDestQuad = ddsdDest.lPitch - (W << 2);
dwRemainder = W & 0x01;
W = ((W - dwRemainder) >> 1);
lpbSrc = (BYTE*)ddsdSrc.lpSurface;
lpbDest = (BYTE*)ddsdDest.lpSurface;
__asm{
mov esi, lpbSrc;
mov edi, lpbDest;
mov ebx, dwColorKey;
movd mm0, ebx;
punpckldq mm0, mm0;
mov eax, H;
loop_H32: // 高循环
mov ecx, W;
mov edx, dwRemainder;
loop_W32: // 宽循环
cmp ecx, 0;
jz loop_W_R32;
movq mm1, [esi];
movq mm2, [edi];
movq mm3, mm1;// save original source pixels
pcmpeqd mm3, mm0;// pixels equal to color key are set to 0xffffffff
pand mm2, mm3; // if not , 0x0
pcmpeqw mm4, mm4;
pandn mm3, mm4;
pand mm1, mm3;
por mm2, mm1;
movq [edi], mm2;
add esi, 8;
add edi, 8;
dec ecx;
jmp loop_W32;
loop_W_R32:
cmp edx, 0;
jz loop_W_End32;
movd mm0, DWORD PTR [esi];
movd DWORD PTR [edi], mm0;
add esi, 4;
add edi, 4;
loop_W_End32:
add esi, dwSrcQuad;
add edi, dwDestQuad;
dec eax;
cmp eax, 0;
jnz loop_H32;
emms;
}
break;
}
// EnterCriticalSection(&gDDrawCS);
lpDDSSrc->Unlock(&rcSrc);
lpDDSDest->Unlock(&rcDest);
// LeaveCriticalSection(&gDDrawCS);
}
|
|