上一次逆了 ntdll!memcpy_s() 函数,里面最终调用 ntdll!memcpy() 进行复制,这次将 ntdll!memcpy() 作为对象,进行逆向和分析。
下面是 windows 7 x64 里的 ntdll!memcpy() 函数:
代码:
ntdll!memcpy: 00000000`77a7e6d0 4c8bd9 mov r11,rcx 00000000`77a7e6d3 482bd1 sub rdx,rcx 00000000`77a7e6d6 0f829e010000 jb ntdll!memcpy+0x1aa (00000000`77a7e87a) ntdll!memcpy+0xc: 00000000`77a7e6dc 4983f808 cmp r8,8 00000000`77a7e6e0 7262 jb ntdll!memcpy+0x74 (00000000`77a7e744) ntdll!memcpy+0x12: 00000000`77a7e6e2 f6c107 test cl,7 00000000`77a7e6e5 7437 je ntdll!memcpy+0x4e (00000000`77a7e71e) ntdll!memcpy+0x17: 00000000`77a7e6e7 f6c101 test cl,1 00000000`77a7e6ea 740c je ntdll!memcpy+0x28 (00000000`77a7e6f8) ntdll!memcpy+0x1c: 00000000`77a7e6ec 8a040a mov al,byte ptr [rdx+rcx] 00000000`77a7e6ef 49ffc8 dec r8 00000000`77a7e6f2 8801 mov byte ptr [rcx],al 00000000`77a7e6f4 4883c101 add rcx,1 ntdll!memcpy+0x28: 00000000`77a7e6f8 f6c102 test cl,2 00000000`77a7e6fb 740f je ntdll!memcpy+0x3c (00000000`77a7e70c) ntdll!memcpy+0x2d: 00000000`77a7e6fd 668b040a mov ax,word ptr [rdx+rcx] 00000000`77a7e701 4983e802 sub r8,2 00000000`77a7e705 668901 mov word ptr [rcx],ax 00000000`77a7e708 4883c102 add rcx,2 ntdll!memcpy+0x3c: 00000000`77a7e70c f6c104 test cl,4 00000000`77a7e70f 740d je ntdll!memcpy+0x4e (00000000`77a7e71e) ntdll!memcpy+0x41: 00000000`77a7e711 8b0411 mov eax,dword ptr [rcx+rdx] 00000000`77a7e714 4983e804 sub r8,4 00000000`77a7e718 8901 mov dword ptr [rcx],eax 00000000`77a7e71a 4883c104 add rcx,4 ntdll!memcpy+0x4e: 00000000`77a7e71e 4d8bc8 mov r9,r8 00000000`77a7e721 49c1e905 shr r9,5 00000000`77a7e725 7550 jne ntdll!memcpy+0xa7 (00000000`77a7e777) ntdll!memcpy+0x57: 00000000`77a7e727 4d8bc8 mov r9,r8 00000000`77a7e72a 49c1e903 shr r9,3 00000000`77a7e72e 7414 je ntdll!memcpy+0x74 (00000000`77a7e744) ntdll!memcpy+0x60: 00000000`77a7e730 488b040a mov rax,qword ptr [rdx+rcx] 00000000`77a7e734 488901 mov qword ptr [rcx],rax 00000000`77a7e737 4883c108 add rcx,8 00000000`77a7e73b 49ffc9 dec r9 00000000`77a7e73e 75f0 jne ntdll!memcpy+0x60 (00000000`77a7e730) ntdll!memcpy+0x70: 00000000`77a7e740 4983e007 and r8,7 ntdll!memcpy+0x74: 00000000`77a7e744 4d85c0 test r8,r8 
00000000`77a7e747 7507 jne ntdll!memcpy+0x80 (00000000`77a7e750) ntdll!memcpy+0x79: 00000000`77a7e749 498bc3 mov rax,r11 00000000`77a7e74c c3 ret ntdll!memcpy+0x80: 00000000`77a7e750 8a040a mov al,byte ptr [rdx+rcx] 00000000`77a7e753 8801 mov byte ptr [rcx],al 00000000`77a7e755 48ffc1 inc rcx 00000000`77a7e758 49ffc8 dec r8 00000000`77a7e75b 75f3 jne ntdll!memcpy+0x80 (00000000`77a7e750) ntdll!memcpy+0x8d: 00000000`77a7e75d 498bc3 mov rax,r11 00000000`77a7e760 c3 ret ntdll!memcpy+0xa7: 00000000`77a7e777 4981f900200000 cmp r9,2000h 00000000`77a7e77e 7342 jae ntdll!memcpy+0xf2 (00000000`77a7e7c2) ntdll!memcpy+0xb0: 00000000`77a7e780 488b040a mov rax,qword ptr [rdx+rcx] 00000000`77a7e784 4c8b540a08 mov r10,qword ptr [rdx+rcx+8] 00000000`77a7e789 4883c120 add rcx,20h 00000000`77a7e78d 488941e0 mov qword ptr [rcx-20h],rax 00000000`77a7e791 4c8951e8 mov qword ptr [rcx-18h],r10 00000000`77a7e795 488b440af0 mov rax,qword ptr [rdx+rcx-10h] 00000000`77a7e79a 4c8b540af8 mov r10,qword ptr [rdx+rcx-8] 00000000`77a7e79f 49ffc9 dec r9 00000000`77a7e7a2 488941f0 mov qword ptr [rcx-10h],rax 00000000`77a7e7a6 4c8951f8 mov qword ptr [rcx-8],r10 00000000`77a7e7aa 75d4 jne ntdll!memcpy+0xb0 (00000000`77a7e780) ntdll!memcpy+0xdc: 00000000`77a7e7ac 4983e01f and r8,1Fh 00000000`77a7e7b0 e972ffffff jmp ntdll!memcpy+0x57 (00000000`77a7e727) ntdll!memcpy+0xf2: 00000000`77a7e7c2 4881fa00100000 cmp rdx,1000h 00000000`77a7e7c9 72b5 jb ntdll!memcpy+0xb0 (00000000`77a7e780) ntdll!memcpy+0xfb: 00000000`77a7e7cb b820000000 mov eax,20h ntdll!memcpy+0x100: 00000000`77a7e7d0 0f18040a prefetchnta [rdx+rcx] 00000000`77a7e7d4 0f18440a40 prefetchnta [rdx+rcx+40h] 00000000`77a7e7d9 4881c180000000 add rcx,80h 00000000`77a7e7e0 ffc8 dec eax 00000000`77a7e7e2 75ec jne ntdll!memcpy+0x100 (00000000`77a7e7d0) ntdll!memcpy+0x114: 00000000`77a7e7e4 4881e900100000 sub rcx,1000h 00000000`77a7e7eb b840000000 mov eax,40h ntdll!memcpy+0x120: 00000000`77a7e7f0 4c8b0c0a mov r9,qword ptr [rdx+rcx] 00000000`77a7e7f4 
4c8b540a08 mov r10,qword ptr [rdx+rcx+8] 00000000`77a7e7f9 4c0fc309 movnti qword ptr [rcx],r9 00000000`77a7e7fd 4c0fc35108 movnti qword ptr [rcx+8],r10 00000000`77a7e802 4c8b4c0a10 mov r9,qword ptr [rdx+rcx+10h] 00000000`77a7e807 4c8b540a18 mov r10,qword ptr [rdx+rcx+18h] 00000000`77a7e80c 4c0fc34910 movnti qword ptr [rcx+10h],r9 00000000`77a7e811 4c0fc35118 movnti qword ptr [rcx+18h],r10 00000000`77a7e816 4c8b4c0a20 mov r9,qword ptr [rdx+rcx+20h] 00000000`77a7e81b 4c8b540a28 mov r10,qword ptr [rdx+rcx+28h] 00000000`77a7e820 4883c140 add rcx,40h 00000000`77a7e824 4c0fc349e0 movnti qword ptr [rcx-20h],r9 00000000`77a7e829 4c0fc351e8 movnti qword ptr [rcx-18h],r10 00000000`77a7e82e 4c8b4c0af0 mov r9,qword ptr [rdx+rcx-10h] 00000000`77a7e833 4c8b540af8 mov r10,qword ptr [rdx+rcx-8] 00000000`77a7e838 ffc8 dec eax 00000000`77a7e83a 4c0fc349f0 movnti qword ptr [rcx-10h],r9 00000000`77a7e83f 4c0fc351f8 movnti qword ptr [rcx-8],r10 00000000`77a7e844 75aa jne ntdll!memcpy+0x120 (00000000`77a7e7f0) ntdll!memcpy+0x176: 00000000`77a7e846 4981e800100000 sub r8,1000h 00000000`77a7e84d 4981f800100000 cmp r8,1000h 00000000`77a7e854 0f8371ffffff jae ntdll!memcpy+0xfb (00000000`77a7e7cb) ntdll!memcpy+0x18a: 00000000`77a7e85a f0800c2400 lock or byte ptr [rsp],0 00000000`77a7e85f e9bafeffff jmp ntdll!memcpy+0x4e (00000000`77a7e71e) ntdll!memcpy+0x1aa: 00000000`77a7e87a 4903c8 add rcx,r8 00000000`77a7e87d 4983f808 cmp r8,8 00000000`77a7e881 7261 jb ntdll!memcpy+0x214 (00000000`77a7e8e4) ntdll!memcpy+0x1b3: 00000000`77a7e883 f6c107 test cl,7 00000000`77a7e886 7436 je ntdll!memcpy+0x1ee (00000000`77a7e8be) ntdll!memcpy+0x1b8: 00000000`77a7e888 f6c101 test cl,1 00000000`77a7e88b 740b je ntdll!memcpy+0x1c8 (00000000`77a7e898) ntdll!memcpy+0x1bd: 00000000`77a7e88d 48ffc9 dec rcx 00000000`77a7e890 8a040a mov al,byte ptr [rdx+rcx] 00000000`77a7e893 49ffc8 dec r8 00000000`77a7e896 8801 mov byte ptr [rcx],al ntdll!memcpy+0x1c8: 00000000`77a7e898 f6c102 test cl,2 00000000`77a7e89b 740f je 
ntdll!memcpy+0x1dc (00000000`77a7e8ac) ntdll!memcpy+0x1cd: 00000000`77a7e89d 4883e902 sub rcx,2 00000000`77a7e8a1 668b040a mov ax,word ptr [rdx+rcx] 00000000`77a7e8a5 4983e802 sub r8,2 00000000`77a7e8a9 668901 mov word ptr [rcx],ax ntdll!memcpy+0x1dc: 00000000`77a7e8ac f6c104 test cl,4 00000000`77a7e8af 740d je ntdll!memcpy+0x1ee (00000000`77a7e8be) ntdll!memcpy+0x1e1: 00000000`77a7e8b1 4883e904 sub rcx,4 00000000`77a7e8b5 8b0411 mov eax,dword ptr [rcx+rdx] 00000000`77a7e8b8 4983e804 sub r8,4 00000000`77a7e8bc 8901 mov dword ptr [rcx],eax ntdll!memcpy+0x1ee: 00000000`77a7e8be 4d8bc8 mov r9,r8 00000000`77a7e8c1 49c1e905 shr r9,5 00000000`77a7e8c5 7550 jne ntdll!memcpy+0x247 (00000000`77a7e917) ntdll!memcpy+0x1f7: 00000000`77a7e8c7 4d8bc8 mov r9,r8 00000000`77a7e8ca 49c1e903 shr r9,3 00000000`77a7e8ce 7414 je ntdll!memcpy+0x214 (00000000`77a7e8e4) ntdll!memcpy+0x200: 00000000`77a7e8d0 4883e908 sub rcx,8 00000000`77a7e8d4 488b040a mov rax,qword ptr [rdx+rcx] 00000000`77a7e8d8 49ffc9 dec r9 00000000`77a7e8db 488901 mov qword ptr [rcx],rax 00000000`77a7e8de 75f0 jne ntdll!memcpy+0x200 (00000000`77a7e8d0) ntdll!memcpy+0x210: 00000000`77a7e8e0 4983e007 and r8,7 ntdll!memcpy+0x214: 00000000`77a7e8e4 4d85c0 test r8,r8 00000000`77a7e8e7 7507 jne ntdll!memcpy+0x220 (00000000`77a7e8f0) ntdll!memcpy+0x219: 00000000`77a7e8e9 498bc3 mov rax,r11 00000000`77a7e8ec c3 ret ntdll!memcpy+0x220: 00000000`77a7e8f0 48ffc9 dec rcx 00000000`77a7e8f3 8a040a mov al,byte ptr [rdx+rcx] 00000000`77a7e8f6 49ffc8 dec r8 00000000`77a7e8f9 8801 mov byte ptr [rcx],al 00000000`77a7e8fb 75f3 jne ntdll!memcpy+0x220 (00000000`77a7e8f0) ntdll!memcpy+0x22d: 00000000`77a7e8fd 498bc3 mov rax,r11 00000000`77a7e900 c3 ret ntdll!memcpy+0x247: 00000000`77a7e917 4981f900200000 cmp r9,2000h 00000000`77a7e91e 7342 jae ntdll!memcpy+0x292 (00000000`77a7e962) ntdll!memcpy+0x250: 00000000`77a7e920 488b440af8 mov rax,qword ptr [rdx+rcx-8] 00000000`77a7e925 4c8b540af0 mov r10,qword ptr [rdx+rcx-10h] 00000000`77a7e92a 
4883e920 sub rcx,20h 00000000`77a7e92e 48894118 mov qword ptr [rcx+18h],rax 00000000`77a7e932 4c895110 mov qword ptr [rcx+10h],r10 00000000`77a7e936 488b440a08 mov rax,qword ptr [rdx+rcx+8] 00000000`77a7e93b 4c8b140a mov r10,qword ptr [rdx+rcx] 00000000`77a7e93f 49ffc9 dec r9 00000000`77a7e942 48894108 mov qword ptr [rcx+8],rax 00000000`77a7e946 4c8911 mov qword ptr [rcx],r10 00000000`77a7e949 75d5 jne ntdll!memcpy+0x250 (00000000`77a7e920) ntdll!memcpy+0x27b: 00000000`77a7e94b 4983e01f and r8,1Fh 00000000`77a7e94f e973ffffff jmp ntdll!memcpy+0x1f7 (00000000`77a7e8c7) ntdll!memcpy+0x292: 00000000`77a7e962 4881fa00f0ffff cmp rdx,0FFFFFFFFFFFFF000h 00000000`77a7e969 77b5 ja ntdll!memcpy+0x250 (00000000`77a7e920) ntdll!memcpy+0x29b: 00000000`77a7e96b b820000000 mov eax,20h ntdll!memcpy+0x2a0: 00000000`77a7e970 4881e980000000 sub rcx,80h 00000000`77a7e977 0f18040a prefetchnta [rdx+rcx] 00000000`77a7e97b 0f18440a40 prefetchnta [rdx+rcx+40h] 00000000`77a7e980 ffc8 dec eax 00000000`77a7e982 75ec jne ntdll!memcpy+0x2a0 (00000000`77a7e970) ntdll!memcpy+0x2b4: 00000000`77a7e984 4881c100100000 add rcx,1000h 00000000`77a7e98b b840000000 mov eax,40h ntdll!memcpy+0x2c0: 00000000`77a7e990 4c8b4c0af8 mov r9,qword ptr [rdx+rcx-8] 00000000`77a7e995 4c8b540af0 mov r10,qword ptr [rdx+rcx-10h] 00000000`77a7e99a 4c0fc349f8 movnti qword ptr [rcx-8],r9 00000000`77a7e99f 4c0fc351f0 movnti qword ptr [rcx-10h],r10 00000000`77a7e9a4 4c8b4c0ae8 mov r9,qword ptr [rdx+rcx-18h] 00000000`77a7e9a9 4c8b540ae0 mov r10,qword ptr [rdx+rcx-20h] 00000000`77a7e9ae 4c0fc349e8 movnti qword ptr [rcx-18h],r9 00000000`77a7e9b3 4c0fc351e0 movnti qword ptr [rcx-20h],r10 00000000`77a7e9b8 4c8b4c0ad8 mov r9,qword ptr [rdx+rcx-28h] 00000000`77a7e9bd 4c8b540ad0 mov r10,qword ptr [rdx+rcx-30h] 00000000`77a7e9c2 4883e940 sub rcx,40h 00000000`77a7e9c6 4c0fc34918 movnti qword ptr [rcx+18h],r9 00000000`77a7e9cb 4c0fc35110 movnti qword ptr [rcx+10h],r10 00000000`77a7e9d0 4c8b4c0a08 mov r9,qword ptr [rdx+rcx+8] 
00000000`77a7e9d5 4c8b140a mov r10,qword ptr [rdx+rcx] 00000000`77a7e9d9 ffc8 dec eax 00000000`77a7e9db 4c0fc34908 movnti qword ptr [rcx+8],r9 00000000`77a7e9e0 4c0fc311 movnti qword ptr [rcx],r10 00000000`77a7e9e4 75aa jne ntdll!memcpy+0x2c0 (00000000`77a7e990) ntdll!memcpy+0x316: 00000000`77a7e9e6 4981e800100000 sub r8,1000h 00000000`77a7e9ed 4981f800100000 cmp r8,1000h 00000000`77a7e9f4 0f8371ffffff jae ntdll!memcpy+0x29b (00000000`77a7e96b) ntdll!memcpy+0x32a: 00000000`77a7e9fa f0800c2400 lock or byte ptr [rsp],0 00000000`77a7e9ff e9bafeffff jmp ntdll!memcpy+0x1ee (00000000`77a7e8be)
代码:
/* Prototype used throughout this article: in the listing rcx holds dest, rdx holds source, r8 holds count; the entry value of rcx (saved in r11) is what gets returned in rax. */
char *memcpy(char *dest, char *source, unsigned long long count);
1. memcpy() 分两大情况进行处理
memcpy() 会首先判断 dest 和 source 的位置:
00000000`77a7e6d0 4c8bd9 mov r11,rcx
00000000`77a7e6d3 482bd1 sub rdx,rcx
00000000`77a7e6d6 0f829e010000 jb ntdll!memcpy+0x1aa (00000000`77a7e87a)
ntdll!memcpy+0xc:
00000000`77a7e6dc 4983f808 cmp r8,8
00000000`77a7e6e0 7262 jb ntdll!memcpy+0x74 (00000000`77a7e744)
... ...
ntdll!memcpy+0x1aa:
00000000`77a7e87a 4903c8 add rcx,r8
00000000`77a7e87d 4983f808 cmp r8,8
00000000`77a7e881 7261 jb ntdll!memcpy+0x214 (00000000`77a7e8e4)
第1种情况是: source >= dest 的时候,第 2 种情况是:source < dest 的时候,在这里我们得到下面的逻辑:
代码:
/*
 * Step 1 of the reconstruction: ntdll!memcpy first picks a copy direction
 * from the relative position of the two buffers (sub rdx,rcx / jb).
 *
 * dest   - destination buffer
 * source - source buffer
 * count  - number of bytes to copy (not used yet at this step)
 *
 * Returns the original dest.
 */
char *memcpy(char *dest, char *source, unsigned long long count)
{
    char *p = dest;     /* mov r11,rcx: remembered for the return value */

    if (source >= dest) {
        /* forward path (memcpy+0xc): filled in by the following steps */
    } else {
        /* source < dest (memcpy+0x1aa): copies starting from the end of the buffers */
    }
    return p;
}
2. 当 count 小于 8 的时候
首先来看复制数量小于 8 bytes 的情况,这种情况是最简单的:
ntdll!memcpy+0xc:
00000000`77a7e6dc 4983f808 cmp r8,8 ; count < 8 时候
00000000`77a7e6e0 7262 jb ntdll!memcpy+0x74 (00000000`77a7e744)
... ...
ntdll!memcpy+0x74:
00000000`77a7e744 4d85c0 test r8,r8
00000000`77a7e747 7507 jne ntdll!memcpy+0x80 (00000000`77a7e750)
... ...
ntdll!memcpy+0x80:
00000000`77a7e750 8a040a mov al,byte ptr [rdx+rcx]
00000000`77a7e753 8801 mov byte ptr [rcx],al
00000000`77a7e755 48ffc1 inc rcx
00000000`77a7e758 49ffc8 dec r8
00000000`77a7e75b 75f3 jne ntdll!memcpy+0x80 (00000000`77a7e750)
ntdll!memcpy+0x8d:
00000000`77a7e75d 498bc3 mov rax,r11
00000000`77a7e760 c3 ret
当小于 8 时候只是做简单的复制就可以了,于是,我们得到下面的逻辑:
代码:
/*
 * Step 2 of the reconstruction: forward copies of fewer than 8 bytes are
 * done with a plain byte loop (memcpy+0x74 / memcpy+0x80).
 *
 * dest   - destination buffer
 * source - source buffer
 * count  - number of bytes to copy
 *
 * Returns the original dest. Larger counts and the source < dest case are
 * not handled yet at this step.
 */
char *memcpy(char *dest, char *source, unsigned long long count)
{
    char *p = dest;     /* original dest is the return value */

    if (source >= dest) {
        if (count < 8) {
            /* one byte per iteration until count reaches zero */
            while (count--)
                *dest++ = *source++;
        }
    } else {
        /* source < dest (memcpy+0x1aa): not covered at this step */
    }
    return p;
}
3. 处理地址非对齐的情况
我们都知道,在复制的时候,如果源地址和目标地址都是对齐的,效率是最高的。
当然这是一种理想的状态,实际上我们会遇到非对齐的地址,那么 memcpy() 会怎样处理呢:
ntdll!memcpy+0x12:
00000000`77a7e6e2 f6c107 test cl,7
00000000`77a7e6e5 7437 je ntdll!memcpy+0x4e (00000000`77a7e71e)
ntdll!memcpy+0x17:
00000000`77a7e6e7 f6c101 test cl,1
00000000`77a7e6ea 740c je ntdll!memcpy+0x28 (00000000`77a7e6f8)
ntdll!memcpy+0x1c:
00000000`77a7e6ec 8a040a mov al,byte ptr [rdx+rcx]
00000000`77a7e6ef 49ffc8 dec r8
00000000`77a7e6f2 8801 mov byte ptr [rcx],al
00000000`77a7e6f4 4883c101 add rcx,1
ntdll!memcpy+0x28:
00000000`77a7e6f8 f6c102 test cl,2
00000000`77a7e6fb 740f je ntdll!memcpy+0x3c (00000000`77a7e70c)
ntdll!memcpy+0x2d:
00000000`77a7e6fd 668b040a mov ax,word ptr [rdx+rcx]
00000000`77a7e701 4983e802 sub r8,2
00000000`77a7e705 668901 mov word ptr [rcx],ax
00000000`77a7e708 4883c102 add rcx,2
ntdll!memcpy+0x3c:
00000000`77a7e70c f6c104 test cl,4
00000000`77a7e70f 740d je ntdll!memcpy+0x4e (00000000`77a7e71e)
ntdll!memcpy+0x41:
00000000`77a7e711 8b0411 mov eax,dword ptr [rcx+rdx]
00000000`77a7e714 4983e804 sub r8,4
00000000`77a7e718 8901 mov dword ptr [rcx],eax
00000000`77a7e71a 4883c104 add rcx,4
上面一段代码都是在处理目标地址非对齐的情况,memcpy() 的处理手法是:在进行大批量复制之前,先处理掉非对齐的部分(依次按 1、2、4 bytes 复制),使目标地址变成 8 bytes 对齐
上面这段代码的逻辑是:
代码:
/*
 * Bring dest up to an 8-byte boundary before the bulk copy, mirroring
 * memcpy+0x12 .. memcpy+0x4e: test bit 0, then bit 1, then bit 2 of the
 * destination address and copy 1, 2 and 4 bytes respectively.
 * dest is a pointer, so it is converted to an integer before masking.
 */
if ((unsigned long long)dest & 0x07) {
    if ((unsigned long long)dest & 0x01) {      /* memcpy+0x1c: one byte */
        *dest++ = *source++;
        count--;
    }
    if ((unsigned long long)dest & 0x02) {      /* memcpy+0x2d: one word */
        *dest++ = *source++;
        *dest++ = *source++;
        count -= 2;
    }
    if ((unsigned long long)dest & 0x04) {      /* memcpy+0x41: one dword */
        *(int *)dest = *(int *)source;
        dest += 4;                              /* advance by the 4 bytes just copied */
        source += 4;
        count -= 4;
    }
}
代码:
/*
 * Step 3 of the reconstruction: short copies plus destination alignment.
 *
 * dest   - destination buffer
 * source - source buffer
 * count  - number of bytes to copy
 *
 * Returns the original dest. For count >= 8 only the bytes needed to make
 * dest 8-byte aligned are copied at this step; the bulk copy and the
 * source < dest case are added later.
 */
char *memcpy(char *dest, char *source, unsigned long long count)
{
    char *p = dest;     /* original dest is the return value */

    if (source >= dest) {
        if (count < 8) {
            while (count--)
                *dest++ = *source++;
            return p;   /* finished: must not run the alignment code below */
        }

        /*
         * Align dest to 8 bytes. An address with (dest & 7) == k needs
         * 8 - k bytes, so the labels run from 1 (seven bytes) down to
         * 7 (one byte) and each case falls through to the next.
         * count >= 8 here, so at most 7 of the bytes are consumed.
         */
        switch ((unsigned long long)dest & 0x07) {
        case 1: *dest++ = *source++; count--;   /* fallthrough */
        case 2: *dest++ = *source++; count--;   /* fallthrough */
        case 3: *dest++ = *source++; count--;   /* fallthrough */
        case 4: *dest++ = *source++; count--;   /* fallthrough */
        case 5: *dest++ = *source++; count--;   /* fallthrough */
        case 6: *dest++ = *source++; count--;   /* fallthrough */
        case 7: *dest++ = *source++; count--;
            break;
        default:        /* already aligned */
            break;
        }
    } else {
        /* source < dest (memcpy+0x1aa): not covered at this step */
    }
    return p;
}
4. 当 count 小于 32 的情况下,以 8 bytes 作为一个处理单元
接下来会根据 count 的大小分为几种情况,首先判断 count 是否达到 32 bytes(count / 32),不足 32 bytes 时以 8 bytes 作为一个处理单元:
ntdll!memcpy+0x4e:
00000000`77a7e71e 4d8bc8 mov r9,r8 ;
00000000`77a7e721 49c1e905 shr r9,5 ; 以 count / 32 为一个单元
00000000`77a7e725 7550 jne ntdll!memcpy+0xa7 (00000000`77a7e777)
ntdll!memcpy+0x57:
00000000`77a7e727 4d8bc8 mov r9,r8
00000000`77a7e72a 49c1e903 shr r9,3 ; 以 count / 8 作为一个单元
00000000`77a7e72e 7414 je ntdll!memcpy+0x74 (00000000`77a7e744)
ntdll!memcpy+0x60:
00000000`77a7e730 488b040a mov rax,qword ptr [rdx+rcx] ; 一次复制 8 bytes
00000000`77a7e734 488901 mov qword ptr [rcx],rax
00000000`77a7e737 4883c108 add rcx,8
00000000`77a7e73b 49ffc9 dec r9
00000000`77a7e73e 75f0 jne ntdll!memcpy+0x60 (00000000`77a7e730)
ntdll!memcpy+0x70:
00000000`77a7e740 4983e007 and r8,7
ntdll!memcpy+0x74:
00000000`77a7e744 4d85c0 test r8,r8
00000000`77a7e747 7507 jne ntdll!memcpy+0x80 (00000000`77a7e750) ; 一次复制 1 byte
ntdll!memcpy+0x79:
00000000`77a7e749 498bc3 mov rax,r11
00000000`77a7e74c c3 ret
这段代码在 32 bytes 内,先以 8 bytes 为单位进行复制,剩下不足的按 1 byte 复制,我们得到下面逻辑:
代码:
/*
 * Step 4 of the reconstruction: short copies, destination alignment and
 * the "fewer than 32 bytes left" path.
 *
 * dest   - destination buffer
 * source - source buffer
 * count  - number of bytes to copy
 *
 * Returns the original dest. Copies that still have 32 bytes or more left
 * after alignment, and the source < dest case, are not handled yet.
 *
 * The qword accesses through casted pointers mirror the 8-byte mov
 * instructions in the listing; only dest is known to be aligned.
 */
char *memcpy(char *dest, char *source, unsigned long long count)
{
    char *p = dest;     /* original dest is the return value */

    if (source >= dest) {
        if (count < 8) {
            while (count--)
                *dest++ = *source++;
            return p;
        }

        /*
         * Align dest to 8 bytes: (dest & 7) == k needs 8 - k bytes, so the
         * labels run from 1 (seven bytes) down to 7 (one byte).
         */
        switch ((unsigned long long)dest & 0x07) {
        case 1: *dest++ = *source++; count--;   /* fallthrough */
        case 2: *dest++ = *source++; count--;   /* fallthrough */
        case 3: *dest++ = *source++; count--;   /* fallthrough */
        case 4: *dest++ = *source++; count--;   /* fallthrough */
        case 5: *dest++ = *source++; count--;   /* fallthrough */
        case 6: *dest++ = *source++; count--;   /* fallthrough */
        case 7: *dest++ = *source++; count--;
            break;
        default:        /* already aligned */
            break;
        }

        if (count < 32) {
            /* memcpy+0x60: whole qwords, count / 8 of them (possibly none) */
            for (unsigned long long i = count / 8; i; i--) {
                *(long long *)dest = *(long long *)source;
                dest += 8;
                source += 8;
            }
            /* memcpy+0x80: remaining count % 8 bytes, one at a time */
            for (unsigned long long i = count % 8; i; i--)
                *dest++ = *source++;
        } else {
            /* 32 bytes or more: not covered at this step */
        }
    } else {
        /* source < dest (memcpy+0x1aa): not covered at this step */
    }
    return p;
}
5. 当复制数量大于 32 bytes 时的情况
这里的情况有些复杂,memcpy() 还以 256K bytes 作为分界线
5.1 小于 256K bytes 的时候
下面看看小于 256k bytes 时的处理:
ntdll!memcpy+0xa7:
00000000`77a7e777 4981f900200000 cmp r9,2000h ; 8K * 32 = 256K bytes
00000000`77a7e77e 7342 jae ntdll!memcpy+0xf2 (00000000`77a7e7c2)
ntdll!memcpy+0xb0:
00000000`77a7e780 488b040a mov rax,qword ptr [rdx+rcx] ; 一次复制 32 bytes
00000000`77a7e784 4c8b540a08 mov r10,qword ptr [rdx+rcx+8]
00000000`77a7e789 4883c120 add rcx,20h
00000000`77a7e78d 488941e0 mov qword ptr [rcx-20h],rax
00000000`77a7e791 4c8951e8 mov qword ptr [rcx-18h],r10
00000000`77a7e795 488b440af0 mov rax,qword ptr [rdx+rcx-10h]
00000000`77a7e79a 4c8b540af8 mov r10,qword ptr [rdx+rcx-8]
00000000`77a7e79f 49ffc9 dec r9
00000000`77a7e7a2 488941f0 mov qword ptr [rcx-10h],rax
00000000`77a7e7a6 4c8951f8 mov qword ptr [rcx-8],r10
00000000`77a7e7aa 75d4 jne ntdll!memcpy+0xb0 (00000000`77a7e780)
ntdll!memcpy+0xdc:
00000000`77a7e7ac 4983e01f and r8,1Fh
00000000`77a7e7b0 e972ffffff jmp ntdll!memcpy+0x57 (00000000`77a7e727)
这里处理小于 256K bytes 的情况:每次循环复制 32 bytes,循环结束后剩下不足 32 bytes 的部分(count & 1Fh)再交给前面的 8 bytes / 1 byte 流程处理:
代码:
/*
 * Step 5.1 of the reconstruction: adds the 32-bytes-per-iteration loop used
 * when fewer than 8192 32-byte units (256K bytes) remain.
 *
 * dest   - destination buffer
 * source - source buffer
 * count  - number of bytes to copy
 *
 * Returns the original dest. Copies of 256K bytes or more, and the
 * source < dest case, are not handled yet at this step.
 *
 * The qword accesses through casted pointers mirror the 8-byte mov
 * instructions in the listing; only dest is known to be aligned.
 */
char *memcpy(char *dest, char *source, unsigned long long count)
{
    char *p = dest;     /* original dest is the return value */

    if (source >= dest) {
        if (count < 8) {
            while (count--)
                *dest++ = *source++;
            return p;
        }

        /*
         * Align dest to 8 bytes: (dest & 7) == k needs 8 - k bytes, so the
         * labels run from 1 (seven bytes) down to 7 (one byte).
         */
        switch ((unsigned long long)dest & 0x07) {
        case 1: *dest++ = *source++; count--;   /* fallthrough */
        case 2: *dest++ = *source++; count--;   /* fallthrough */
        case 3: *dest++ = *source++; count--;   /* fallthrough */
        case 4: *dest++ = *source++; count--;   /* fallthrough */
        case 5: *dest++ = *source++; count--;   /* fallthrough */
        case 6: *dest++ = *source++; count--;   /* fallthrough */
        case 7: *dest++ = *source++; count--;
            break;
        default:        /* already aligned */
            break;
        }

process_count32:
        if (count < 32) {
            /* memcpy+0x60: whole qwords, count / 8 of them (possibly none) */
            for (unsigned long long i = count / 8; i; i--) {
                *(long long *)dest = *(long long *)source;
                dest += 8;
                source += 8;
            }
            /* memcpy+0x80: remaining count % 8 bytes, one at a time */
            for (unsigned long long i = count % 8; i; i--)
                *dest++ = *source++;
        } else {
            unsigned long long count32 = count / 32;    /* shr r9,5 */

            if (count32 >= 8192) {
                /* 256K bytes or more (cmp r9,2000h / jae): not covered at this step */
            } else {
                /* memcpy+0xb0: four qwords (32 bytes) per iteration */
                for (unsigned long long i = count32; i; i--) {
                    long long *d = (long long *)dest;
                    long long *s = (long long *)source;

                    d[0] = s[0];
                    d[1] = s[1];
                    d[2] = s[2];
                    d[3] = s[3];
                    dest += 32;
                    source += 32;
                }
                count &= 0x1f;          /* and r8,1Fh: keep only the remainder */
                goto process_count32;   /* now < 32: qword and byte tail */
            }
        }
    } else {
        /* source < dest (memcpy+0x1aa): not covered at this step */
    }
    return p;
}
5.2 当复制数量不小于 256K bytes 时
这里为了效率,采用了一系列 Move Non-Temporal 指令来优化,以及 Prefetch 指令。注意入口处还会先检查 rdx(即 source - dest)是否小于 1000h:两块内存相距不足 4K 时,仍然回到上面每次 32 bytes 的普通循环
ntdll!memcpy+0xf2:
00000000`77a7e7c2 4881fa00100000 cmp rdx,1000h
00000000`77a7e7c9 72b5 jb ntdll!memcpy+0xb0 (00000000`77a7e780)
ntdll!memcpy+0xfb:
00000000`77a7e7cb b820000000 mov eax,20h
ntdll!memcpy+0x100:
00000000`77a7e7d0 0f18040a prefetchnta [rdx+rcx]
00000000`77a7e7d4 0f18440a40 prefetchnta [rdx+rcx+40h]
00000000`77a7e7d9 4881c180000000 add rcx,80h
00000000`77a7e7e0 ffc8 dec eax
00000000`77a7e7e2 75ec jne ntdll!memcpy+0x100 (00000000`77a7e7d0)
ntdll!memcpy+0x114:
00000000`77a7e7e4 4881e900100000 sub rcx,1000h
00000000`77a7e7eb b840000000 mov eax,40h
ntdll!memcpy+0x120:
00000000`77a7e7f0 4c8b0c0a mov r9,qword ptr [rdx+rcx]
00000000`77a7e7f4 4c8b540a08 mov r10,qword ptr [rdx+rcx+8]
00000000`77a7e7f9 4c0fc309 movnti qword ptr [rcx],r9
00000000`77a7e7fd 4c0fc35108 movnti qword ptr [rcx+8],r10
00000000`77a7e802 4c8b4c0a10 mov r9,qword ptr [rdx+rcx+10h]
00000000`77a7e807 4c8b540a18 mov r10,qword ptr [rdx+rcx+18h]
00000000`77a7e80c 4c0fc34910 movnti qword ptr [rcx+10h],r9
00000000`77a7e811 4c0fc35118 movnti qword ptr [rcx+18h],r10
00000000`77a7e816 4c8b4c0a20 mov r9,qword ptr [rdx+rcx+20h]
00000000`77a7e81b 4c8b540a28 mov r10,qword ptr [rdx+rcx+28h]
00000000`77a7e820 4883c140 add rcx,40h
00000000`77a7e824 4c0fc349e0 movnti qword ptr [rcx-20h],r9
00000000`77a7e829 4c0fc351e8 movnti qword ptr [rcx-18h],r10
00000000`77a7e82e 4c8b4c0af0 mov r9,qword ptr [rdx+rcx-10h]
00000000`77a7e833 4c8b540af8 mov r10,qword ptr [rdx+rcx-8]
00000000`77a7e838 ffc8 dec eax
00000000`77a7e83a 4c0fc349f0 movnti qword ptr [rcx-10h],r9
00000000`77a7e83f 4c0fc351f8 movnti qword ptr [rcx-8],r10
00000000`77a7e844 75aa jne ntdll!memcpy+0x120 (00000000`77a7e7f0)
ntdll!memcpy+0x176:
00000000`77a7e846 4981e800100000 sub r8,1000h
00000000`77a7e84d 4981f800100000 cmp r8,1000h
00000000`77a7e854 0f8371ffffff jae ntdll!memcpy+0xfb (00000000`77a7e7cb)
在这里以 4K bytes 作为一个处理单元进行复制,首先使用 prefetchnta 指令来加载 4K bytes 到 cache(80h * 20h = 4K)
然后用 movnti 每次循环复制 64 bytes,循环 40h 次(40h * 40h = 4K)完成这 4K bytes 的复制;之后 count 减去 1000h,剩余数量仍不小于 4K 时继续处理下一个 4K 单元
代码:
/*
 * Step 5.2 of the reconstruction: the complete forward path of ntdll!memcpy.
 *
 * dest   - destination buffer
 * source - source buffer
 * count  - number of bytes to copy
 *
 * Returns the original dest. The source < dest case (memcpy+0x1aa) is still
 * not covered here.
 *
 * The qword accesses through casted pointers mirror the 8-byte mov / movnti
 * instructions in the listing; only dest is known to be aligned. The cache
 * hints of the real code (prefetchnta, non-temporal stores) cannot be
 * expressed in portable C and are described in comments only.
 */
char *memcpy(char *dest, char *source, unsigned long long count)
{
    char *p = dest;     /* original dest is the return value */

    if (source >= dest) {
        if (count < 8) {
            while (count--)
                *dest++ = *source++;
            return p;
        }

        /*
         * Align dest to 8 bytes: (dest & 7) == k needs 8 - k bytes, so the
         * labels run from 1 (seven bytes) down to 7 (one byte).
         */
        switch ((unsigned long long)dest & 0x07) {
        case 1: *dest++ = *source++; count--;   /* fallthrough */
        case 2: *dest++ = *source++; count--;   /* fallthrough */
        case 3: *dest++ = *source++; count--;   /* fallthrough */
        case 4: *dest++ = *source++; count--;   /* fallthrough */
        case 5: *dest++ = *source++; count--;   /* fallthrough */
        case 6: *dest++ = *source++; count--;   /* fallthrough */
        case 7: *dest++ = *source++; count--;
            break;
        default:        /* already aligned */
            break;
        }

process_count32:
        if (count < 32) {
            /* memcpy+0x60: whole qwords, count / 8 of them (possibly none) */
            for (unsigned long long i = count / 8; i; i--) {
                *(long long *)dest = *(long long *)source;
                dest += 8;
                source += 8;
            }
            /* memcpy+0x80: remaining count % 8 bytes, one at a time */
            for (unsigned long long i = count % 8; i; i--)
                *dest++ = *source++;
        } else {
            unsigned long long count32 = count / 32;    /* shr r9,5 */

            /*
             * The large-copy path needs both 256K bytes or more
             * (cmp r9,2000h) and buffers at least 4K apart (cmp rdx,1000h);
             * otherwise the ordinary 32-byte loop is used.
             */
            if (count32 >= 8192 &&
                (unsigned long long)(source - dest) >= 4096) {
                do {
                    /*
                     * memcpy+0x100: the real code first issues prefetchnta
                     * over the next 4K of source (20h iterations of 80h
                     * bytes). That has no effect on the result, so nothing
                     * is done for it here.
                     */

                    /* memcpy+0x120: 40h iterations of 64 bytes, stored with movnti */
                    for (int i = 64; i; i--) {
                        long long *d = (long long *)dest;
                        long long *s = (long long *)source;

                        d[0] = s[0];
                        d[1] = s[1];
                        d[2] = s[2];
                        d[3] = s[3];
                        d[4] = s[4];
                        d[5] = s[5];
                        d[6] = s[6];
                        d[7] = s[7];
                        dest += 64;
                        source += 64;
                    }
                    count -= 4096;          /* sub r8,1000h */
                } while (count >= 4096);    /* cmp r8,1000h / jae */
                /*
                 * memcpy+0x18a: "lock or byte ptr [rsp],0" follows here in
                 * the listing, presumably as a fence after the non-temporal
                 * stores.
                 */
            } else {
                /* memcpy+0xb0: four qwords (32 bytes) per iteration */
                for (unsigned long long i = count32; i; i--) {
                    long long *d = (long long *)dest;
                    long long *s = (long long *)source;

                    d[0] = s[0];
                    d[1] = s[1];
                    d[2] = s[2];
                    d[3] = s[3];
                    dest += 32;
                    source += 32;
                }
                count &= 0x1f;              /* and r8,1Fh: keep only the remainder */
            }
            goto process_count32;           /* handle whatever is left */
        }
    } else {
        /* source < dest (memcpy+0x1aa): not covered here */
    }
    return p;
}