请问这个快速的内存拷贝函数有什么错误?

longg2046 · 发表于 2007-1-1 00:17:00

/*
 * Qmemcpy - copy nQWORDs 8-byte quadwords from src to dst using MMX loads
 * and non-temporal stores (movntq), via MSVC 32-bit inline assembly.
 *
 *   dst      destination buffer, at least nQWORDs * 8 bytes
 *   src      source buffer, at least nQWORDs * 8 bytes
 *   nQWORDs  number of 8-byte units to copy; values <= 0 copy nothing
 *
 * The copy always runs forward and no overlap handling is performed.
 *
 * The work is split into three stages so that every count terminates exactly:
 *   1. whole blocks of CACHEBLOCK QWORDs, each block first touched with plain
 *      loads and then copied 64 bytes per iteration;
 *   2. the remaining whole groups of 8 QWORDs (64 bytes per iteration);
 *   3. the final 0..7 QWORDs, one at a time.
 * Stage 2 steps its counter by 8 and stage 3 by 1, so both counters reach
 * zero exactly; a remainder that is not a multiple of 8 can no longer run
 * past the end of the buffers.
 */
void Qmemcpy(void *dst, void *src, int nQWORDs)
{
#define CACHEBLOCK 1024 // QWORDs per block; tuning this value may give higher speed
    int n;     // QWORDs covered by whole blocks (multiple of CACHEBLOCK)
    int m;     // QWORDs left after the whole blocks (0 .. CACHEBLOCK-1)
    int m8;    // part of m copied 8 QWORDs at a time (multiple of 8)
    int m1;    // last 0..7 QWORDs, copied one by one
    int done;  // QWORD index at which the 8-QWORD group stage ends

    // A zero or negative count would make the negated loop counters
    // non-negative, so the count-up-to-zero loops would never stop.
    if (nQWORDs <= 0)
        return;

    n    = (nQWORDs / CACHEBLOCK) * CACHEBLOCK;
    m    = nQWORDs - n;
    m8   = m & ~7;
    m1   = m & 7;
    done = n + m8;

    if (n)
    {
        _asm // stage 1: whole blocks
        {
            mov esi, src
            mov edi, dst
            mov ecx, n
            lea esi, [esi+ecx*8]      // esi/edi = end of the block region
            lea edi, [edi+ecx*8]
            neg ecx                   // ecx = -n, counts up towards 0
        mainloop:
            mov eax, CACHEBLOCK / 16  // each pass touches 16 QWORDs (128 bytes)
        prefetchloop:
            mov edx, [esi+ecx*8]      // dummy load, value is discarded
            mov edx, [esi+ecx*8+64]   // dummy load 64 bytes further on
            add ecx, 16
            dec eax
            jnz prefetchloop
            sub ecx, CACHEBLOCK       // rewind to the start of this block
            mov eax, CACHEBLOCK / 8   // each pass copies 8 QWORDs (64 bytes)
        writeloop:
            movq mm0, qword ptr [esi+ecx*8   ]
            movq mm1, qword ptr [esi+ecx*8+8 ]
            movq mm2, qword ptr [esi+ecx*8+16]
            movq mm3, qword ptr [esi+ecx*8+24]
            movq mm4, qword ptr [esi+ecx*8+32]
            movq mm5, qword ptr [esi+ecx*8+40]
            movq mm6, qword ptr [esi+ecx*8+48]
            movq mm7, qword ptr [esi+ecx*8+56]

            movntq qword ptr [edi+ecx*8   ], mm0
            movntq qword ptr [edi+ecx*8+8 ], mm1
            movntq qword ptr [edi+ecx*8+16], mm2
            movntq qword ptr [edi+ecx*8+24], mm3
            movntq qword ptr [edi+ecx*8+32], mm4
            movntq qword ptr [edi+ecx*8+40], mm5
            movntq qword ptr [edi+ecx*8+48], mm6
            movntq qword ptr [edi+ecx*8+56], mm7
            add ecx, 8
            dec eax
            jnz writeloop
            or ecx, ecx               // ecx == 0 once every block is copied
            jnz mainloop
        }
    }

    if (m8)
    {
        _asm // stage 2: remaining whole groups of 8 QWORDs
        {
            mov esi, src
            mov edi, dst
            mov eax, done
            mov ecx, m8
            lea esi, [esi+eax*8]      // esi/edi = end of the group region
            lea edi, [edi+eax*8]
            neg ecx                   // ecx = -m8, a multiple of 8
        copyloop:
            prefetchnta [esi+ecx*8+512] // prefetch hint for data ahead of the copy
            movq mm0, qword ptr [esi+ecx*8   ]
            movq mm1, qword ptr [esi+ecx*8+8 ]
            movq mm2, qword ptr [esi+ecx*8+16]
            movq mm3, qword ptr [esi+ecx*8+24]
            movq mm4, qword ptr [esi+ecx*8+32]
            movq mm5, qword ptr [esi+ecx*8+40]
            movq mm6, qword ptr [esi+ecx*8+48]
            movq mm7, qword ptr [esi+ecx*8+56]

            movntq qword ptr [edi+ecx*8   ], mm0
            movntq qword ptr [edi+ecx*8+8 ], mm1
            movntq qword ptr [edi+ecx*8+16], mm2
            movntq qword ptr [edi+ecx*8+24], mm3
            movntq qword ptr [edi+ecx*8+32], mm4
            movntq qword ptr [edi+ecx*8+40], mm5
            movntq qword ptr [edi+ecx*8+48], mm6
            movntq qword ptr [edi+ecx*8+56], mm7
            add ecx, 8                // reaches 0 exactly because m8 % 8 == 0
            jnz copyloop
        }
    }

    if (m1)
    {
        _asm // stage 3: last 1..7 QWORDs
        {
            mov esi, src
            mov edi, dst
            mov eax, nQWORDs
            mov ecx, m1
            lea esi, [esi+eax*8]      // esi/edi = end of the buffers
            lea edi, [edi+eax*8]
            neg ecx                   // ecx = -m1
        tailloop:
            movq mm0, qword ptr [esi+ecx*8]
            movntq qword ptr [edi+ecx*8], mm0
            inc ecx
            jnz tailloop
        }
    }

    _asm
    {
        sfence  // order the preceding non-temporal stores
        emms    // leave MMX state so x87 code can run afterwards
    }
}

FlyMagic · 发表于 2007-1-1 09:46:00

超出机器的位宽（比如说32位、64位），还会有性能上的提升吗？还有，为什么你的汇编语言中还有乘法啊？ecx应该是寄存器，不是常数吧？我不太懂，只是问问。我只写过32位的内存设置，而且还是抄来的。代码很短……
inline void memset32(void * dest, DWORD data, int count)
{
_asm
{
mov edi, dest;
mov ecx, count;
mov eax, data;
rep stosd;
}
}

yh1979 · 发表于 2007-1-2 12:34:00

http://www.programmersheaven.com/c/MsgBoard/read.asp?Board=3&MsgID=270913

Very optimized memcpy() routine for all AMD Athlon and Duron family.
可以参考，人家amd写的，应该够快了。哦哦哦

// memcpy_amd: copies n bytes from src to dest with MMX inline assembly and
// leaves dest in eax as the return value (there is no C return statement).
//
// Register roles inside the asm block:
//   esi = current source pointer, edi = current destination pointer,
//   ebx = bytes still to copy (kept up to date after the alignment step),
//   ecx = working counter (bytes, 64-byte blocks, or a jump offset).
//
// Paths, chosen by size:
//   - tiny      (n < TINY_BLOCK_COPY): straight to the movsd/movsb tail;
//   - in-cache  (blocks < IN_CACHE_COPY/64): movq loads and movq stores;
//   - uncached  (blocks < UNCACHED_COPY/64): movq loads and movntq stores;
//   - block prefetch (otherwise): touch CACHEBLOCK 64-byte lines, then copy
//     them with movntq stores.
//
// TINY_BLOCK_COPY, IN_CACHE_COPY, UNCACHED_COPY and CACHEBLOCK are not defined
// in this block and must be provided elsewhere. Note that CACHEBLOCK is
// compared against a count of 64-byte blocks here.
//
// The two computed jumps (jmp ecx) land ecx bytes before a label, inside a run
// of movsb or movsd instructions; this relies on each of those instructions
// occupying exactly one byte, so nothing may be inserted into those runs.
void * memcpy_amd(void *dest, const void *src, size_t n)
{
  __asm {

mov ecx, [n] ; number of bytes to copy
mov edi, [dest] ; destination
mov esi, [src] ; source
mov ebx, ecx ; keep a copy of count

cld
cmp ecx, TINY_BLOCK_COPY
jb $memcpy_ic_3 ; tiny? skip mmx copy

cmp ecx, 32*1024 ; don't align between 32k-64k because
jbe $memcpy_do_align ;  it appears to be slower
cmp ecx, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
// ecx = (8 - edi) & 7 = bytes needed to bring edi to an 8-byte boundary.
// Jumping ecx bytes before $memcpy_align_done runs exactly ecx movsb's.
mov ecx, 8 ; a trick that's faster than rep movsb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_align_done
jmp ecx ; jump to array of movsb's

align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb

// NOTE(review): this label is reached either after the movsb alignment above
// (mask 111b, i.e. an 8-byte boundary) or directly from the 32k-64k check, in
// which case no alignment was performed.
$memcpy_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to copy
shr ecx, 6 ; get 64-byte block count
jz $memcpy_ic_2 ; finish the last few bytes

cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
jae $memcpy_uc_test

// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time.  It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1: ; 64-byte block copies, in-cache copy

prefetchnta [esi + (200*64/34+192)] ; start reading ahead

movq mm0, [esi+0] ; read 64 bits
movq mm1, [esi+8]
movq [edi+0], mm0 ; write 64 bits
movq [edi+8], mm1 ; note:  the normal movq writes the
movq mm2, [esi+16] ; data to cache; a cache line will be
movq mm3, [esi+24] ; allocated as needed, to store the data
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3

add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memcpy_ic_1 ; last 64-byte block?

// Tail: all 64-byte blocks are done. ecx = (remaining bytes >> 2) & 15 is the
// number of leftover dwords; jump that many movsd's before $memcpy_last_few.
$memcpy_ic_2:
mov ecx, ebx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_last_few
jmp ecx ; jump to array of movsd's

$memcpy_uc_test:
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
jae $memcpy_bp_1

$memcpy_64_test:
or ecx, ecx ; tail end of block prefetch will jump here
jz $memcpy_ic_2 ; no more 64-byte blocks left

// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory.  This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy

prefetchnta [esi + (200*64/34+192)] ; start reading ahead

// The pointer updates are interleaved with the loads, so the later operands
// use negative offsets relative to the already-advanced esi/edi.
movq mm0,[esi+0] ; read 64 bits
add edi,64 ; update destination pointer
movq mm1,[esi+8]
add esi,64 ; update source pointer
movq mm2,[esi-48]
movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
movq mm0,[esi-40] ; note: movntq also prevents the CPU
movntq [edi-56], mm1 ; from READING the destination address
movq mm1,[esi-32] ; into the cache, only to be over-written
movntq [edi-48], mm2 ; so that also helps performance
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec ecx
movntq [edi-8], mm1
jnz $memcpy_uc_1 ; last 64-byte block?

jmp $memcpy_ic_2 ; almost done

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch, in this case.
// The technique is great for getting maximum read bandwidth,
// especially in DDR memory systems.
$memcpy_bp_1: ; large blocks, block prefetch copy

// ecx = 64-byte blocks still to copy. The compare below is signed (jl).
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
jl $memcpy_64_test ; no, back to regular uncached copy

// Walk backwards from the end of the chunk, touching two 64-byte lines per
// iteration; after CACHEBLOCK/2 iterations esi is back at the chunk start.
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
$memcpy_bp_2:
mov edx, [esi-64] ; grab one address per cache line
mov edx, [esi-128] ; grab one address per cache line
sub esi, 128 ; go reverse order
dec eax ; count down the cache lines
jnz $memcpy_bp_2 ; keep grabbing more lines into cache

mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
$memcpy_bp_3:
movq mm0, [esi ] ; read 64 bits
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, 64 ; update source pointer
movntq [edi ], mm0 ; write 64 bits, bypassing cache
movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
movntq [edi+16], mm2 ; from READING the destination address
movntq [edi+24], mm3 ; into the cache, only to be over-written,
movntq [edi+32], mm4 ; so that also helps performance
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, 64 ; update dest pointer

dec eax ; count down

jnz $memcpy_bp_3 ; keep copying
sub ecx, CACHEBLOCK ; update the 64-byte block count
jmp $memcpy_bp_1 ; keep processing chunks

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
// There are 16 movsd's below; the computed jump from $memcpy_ic_3 enters this
// run 0..15 instructions before $memcpy_last_few.
align 4
movsd
movsd ; perform last 1-15 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd ; perform last 1-7 dword copies
movsd
movsd
movsd
movsd
movsd
movsd

$memcpy_last_few: ; dword aligned from before movsd's
mov ecx, ebx ; has valid low 2 bits of the byte count
and ecx, 11b ; the last few cows must come home
jz $memcpy_final ; no more, let's leave
rep movsb ; the last 1, 2, or 3 bytes

$memcpy_final:
emms ; clean up the MMX state
sfence ; flush the write buffer
// The return value is passed in eax; no C-level return statement follows.
mov eax, [dest] ; ret value = destination pointer

}
}

--------------------------------------------------------------------------------

yh1979 · 发表于 2007-1-2 12:35:00

http://www.cs.virginia.edu/stream/FTP/Contrib/AMD/memcpy_amd.cpp

jjjyes · 发表于 2007-1-8 16:12:00

自己写这种东西。。。没考虑过，佩服楼上的各位

账号		自动登录	找回密码
密码			立即注册

请问这个快速的内存拷贝函数有什么错误?

Re:请问这个快速的内存拷贝函数有什么错误?

Re:请问这个快速的内存拷贝函数有什么错误?

Re:请问这个快速的内存拷贝函数有什么错误?

Re:请问这个快速的内存拷贝函数有什么错误?