上一次逆了 ntdll!memcpy_s() 函数,里面最终调用 ntdll!memcpy() 进行复制,这次将 ntdll!memcpy() 作为对象,进行逆向和分析。

下面是 windows 7 x64 里的 ntdll!memcpy() 函数:

代码:
ntdll!memcpy:
00000000`77a7e6d0 4c8bd9          mov     r11,rcx
00000000`77a7e6d3 482bd1          sub     rdx,rcx
00000000`77a7e6d6 0f829e010000    jb      ntdll!memcpy+0x1aa (00000000`77a7e87a)

ntdll!memcpy+0xc:
00000000`77a7e6dc 4983f808        cmp     r8,8
00000000`77a7e6e0 7262            jb      ntdll!memcpy+0x74 (00000000`77a7e744)

ntdll!memcpy+0x12:
00000000`77a7e6e2 f6c107          test    cl,7
00000000`77a7e6e5 7437            je      ntdll!memcpy+0x4e (00000000`77a7e71e)

ntdll!memcpy+0x17:
00000000`77a7e6e7 f6c101          test    cl,1
00000000`77a7e6ea 740c            je      ntdll!memcpy+0x28 (00000000`77a7e6f8)

ntdll!memcpy+0x1c:
00000000`77a7e6ec 8a040a          mov     al,byte ptr [rdx+rcx]
00000000`77a7e6ef 49ffc8          dec     r8
00000000`77a7e6f2 8801            mov     byte ptr [rcx],al
00000000`77a7e6f4 4883c101        add     rcx,1

ntdll!memcpy+0x28:
00000000`77a7e6f8 f6c102          test    cl,2
00000000`77a7e6fb 740f            je      ntdll!memcpy+0x3c (00000000`77a7e70c)

ntdll!memcpy+0x2d:
00000000`77a7e6fd 668b040a        mov     ax,word ptr [rdx+rcx]
00000000`77a7e701 4983e802        sub     r8,2
00000000`77a7e705 668901          mov     word ptr [rcx],ax
00000000`77a7e708 4883c102        add     rcx,2

ntdll!memcpy+0x3c:
00000000`77a7e70c f6c104          test    cl,4
00000000`77a7e70f 740d            je      ntdll!memcpy+0x4e (00000000`77a7e71e)

ntdll!memcpy+0x41:
00000000`77a7e711 8b0411          mov     eax,dword ptr [rcx+rdx]
00000000`77a7e714 4983e804        sub     r8,4
00000000`77a7e718 8901            mov     dword ptr [rcx],eax
00000000`77a7e71a 4883c104        add     rcx,4

ntdll!memcpy+0x4e:
00000000`77a7e71e 4d8bc8          mov     r9,r8
00000000`77a7e721 49c1e905        shr     r9,5
00000000`77a7e725 7550            jne     ntdll!memcpy+0xa7 (00000000`77a7e777)

ntdll!memcpy+0x57:
00000000`77a7e727 4d8bc8          mov     r9,r8
00000000`77a7e72a 49c1e903        shr     r9,3
00000000`77a7e72e 7414            je      ntdll!memcpy+0x74 (00000000`77a7e744)

ntdll!memcpy+0x60:
00000000`77a7e730 488b040a        mov     rax,qword ptr [rdx+rcx]
00000000`77a7e734 488901          mov     qword ptr [rcx],rax
00000000`77a7e737 4883c108        add     rcx,8
00000000`77a7e73b 49ffc9          dec     r9
00000000`77a7e73e 75f0            jne     ntdll!memcpy+0x60 (00000000`77a7e730)

ntdll!memcpy+0x70:
00000000`77a7e740 4983e007        and     r8,7

ntdll!memcpy+0x74:
00000000`77a7e744 4d85c0          test    r8,r8
00000000`77a7e747 7507            jne     ntdll!memcpy+0x80 (00000000`77a7e750)

ntdll!memcpy+0x79:
00000000`77a7e749 498bc3          mov     rax,r11
00000000`77a7e74c c3              ret

ntdll!memcpy+0x80:
00000000`77a7e750 8a040a          mov     al,byte ptr [rdx+rcx]
00000000`77a7e753 8801            mov     byte ptr [rcx],al
00000000`77a7e755 48ffc1          inc     rcx
00000000`77a7e758 49ffc8          dec     r8
00000000`77a7e75b 75f3            jne     ntdll!memcpy+0x80 (00000000`77a7e750)

ntdll!memcpy+0x8d:
00000000`77a7e75d 498bc3          mov     rax,r11
00000000`77a7e760 c3              ret

ntdll!memcpy+0xa7:
00000000`77a7e777 4981f900200000  cmp     r9,2000h
00000000`77a7e77e 7342            jae     ntdll!memcpy+0xf2 (00000000`77a7e7c2)

ntdll!memcpy+0xb0:
00000000`77a7e780 488b040a        mov     rax,qword ptr [rdx+rcx]
00000000`77a7e784 4c8b540a08      mov     r10,qword ptr [rdx+rcx+8]
00000000`77a7e789 4883c120        add     rcx,20h
00000000`77a7e78d 488941e0        mov     qword ptr [rcx-20h],rax
00000000`77a7e791 4c8951e8        mov     qword ptr [rcx-18h],r10
00000000`77a7e795 488b440af0      mov     rax,qword ptr [rdx+rcx-10h]
00000000`77a7e79a 4c8b540af8      mov     r10,qword ptr [rdx+rcx-8]
00000000`77a7e79f 49ffc9          dec     r9
00000000`77a7e7a2 488941f0        mov     qword ptr [rcx-10h],rax
00000000`77a7e7a6 4c8951f8        mov     qword ptr [rcx-8],r10
00000000`77a7e7aa 75d4            jne     ntdll!memcpy+0xb0 (00000000`77a7e780)

ntdll!memcpy+0xdc:
00000000`77a7e7ac 4983e01f        and     r8,1Fh
00000000`77a7e7b0 e972ffffff      jmp     ntdll!memcpy+0x57 (00000000`77a7e727)

ntdll!memcpy+0xf2:
00000000`77a7e7c2 4881fa00100000  cmp     rdx,1000h
00000000`77a7e7c9 72b5            jb      ntdll!memcpy+0xb0 (00000000`77a7e780)

ntdll!memcpy+0xfb:
00000000`77a7e7cb b820000000      mov     eax,20h

ntdll!memcpy+0x100:
00000000`77a7e7d0 0f18040a        prefetchnta [rdx+rcx]
00000000`77a7e7d4 0f18440a40      prefetchnta [rdx+rcx+40h]
00000000`77a7e7d9 4881c180000000  add     rcx,80h
00000000`77a7e7e0 ffc8            dec     eax
00000000`77a7e7e2 75ec            jne     ntdll!memcpy+0x100 (00000000`77a7e7d0)

ntdll!memcpy+0x114:
00000000`77a7e7e4 4881e900100000  sub     rcx,1000h
00000000`77a7e7eb b840000000      mov     eax,40h

ntdll!memcpy+0x120:
00000000`77a7e7f0 4c8b0c0a        mov     r9,qword ptr [rdx+rcx]
00000000`77a7e7f4 4c8b540a08      mov     r10,qword ptr [rdx+rcx+8]
00000000`77a7e7f9 4c0fc309        movnti  qword ptr [rcx],r9
00000000`77a7e7fd 4c0fc35108      movnti  qword ptr [rcx+8],r10
00000000`77a7e802 4c8b4c0a10      mov     r9,qword ptr [rdx+rcx+10h]
00000000`77a7e807 4c8b540a18      mov     r10,qword ptr [rdx+rcx+18h]
00000000`77a7e80c 4c0fc34910      movnti  qword ptr [rcx+10h],r9
00000000`77a7e811 4c0fc35118      movnti  qword ptr [rcx+18h],r10
00000000`77a7e816 4c8b4c0a20      mov     r9,qword ptr [rdx+rcx+20h]
00000000`77a7e81b 4c8b540a28      mov     r10,qword ptr [rdx+rcx+28h]
00000000`77a7e820 4883c140        add     rcx,40h
00000000`77a7e824 4c0fc349e0      movnti  qword ptr [rcx-20h],r9
00000000`77a7e829 4c0fc351e8      movnti  qword ptr [rcx-18h],r10
00000000`77a7e82e 4c8b4c0af0      mov     r9,qword ptr [rdx+rcx-10h]
00000000`77a7e833 4c8b540af8      mov     r10,qword ptr [rdx+rcx-8]
00000000`77a7e838 ffc8            dec     eax
00000000`77a7e83a 4c0fc349f0      movnti  qword ptr [rcx-10h],r9
00000000`77a7e83f 4c0fc351f8      movnti  qword ptr [rcx-8],r10
00000000`77a7e844 75aa            jne     ntdll!memcpy+0x120 (00000000`77a7e7f0)

ntdll!memcpy+0x176:
00000000`77a7e846 4981e800100000  sub     r8,1000h
00000000`77a7e84d 4981f800100000  cmp     r8,1000h
00000000`77a7e854 0f8371ffffff    jae     ntdll!memcpy+0xfb (00000000`77a7e7cb)

ntdll!memcpy+0x18a:
00000000`77a7e85a f0800c2400      lock or byte ptr [rsp],0
00000000`77a7e85f e9bafeffff      jmp     ntdll!memcpy+0x4e (00000000`77a7e71e)

ntdll!memcpy+0x1aa:
00000000`77a7e87a 4903c8          add     rcx,r8
00000000`77a7e87d 4983f808        cmp     r8,8
00000000`77a7e881 7261            jb      ntdll!memcpy+0x214 (00000000`77a7e8e4)

ntdll!memcpy+0x1b3:
00000000`77a7e883 f6c107          test    cl,7
00000000`77a7e886 7436            je      ntdll!memcpy+0x1ee (00000000`77a7e8be)

ntdll!memcpy+0x1b8:
00000000`77a7e888 f6c101          test    cl,1
00000000`77a7e88b 740b            je      ntdll!memcpy+0x1c8 (00000000`77a7e898)

ntdll!memcpy+0x1bd:
00000000`77a7e88d 48ffc9          dec     rcx
00000000`77a7e890 8a040a          mov     al,byte ptr [rdx+rcx]
00000000`77a7e893 49ffc8          dec     r8
00000000`77a7e896 8801            mov     byte ptr [rcx],al

ntdll!memcpy+0x1c8:
00000000`77a7e898 f6c102          test    cl,2
00000000`77a7e89b 740f            je      ntdll!memcpy+0x1dc (00000000`77a7e8ac)

ntdll!memcpy+0x1cd:
00000000`77a7e89d 4883e902        sub     rcx,2
00000000`77a7e8a1 668b040a        mov     ax,word ptr [rdx+rcx]
00000000`77a7e8a5 4983e802        sub     r8,2
00000000`77a7e8a9 668901          mov     word ptr [rcx],ax

ntdll!memcpy+0x1dc:
00000000`77a7e8ac f6c104          test    cl,4
00000000`77a7e8af 740d            je      ntdll!memcpy+0x1ee (00000000`77a7e8be)

ntdll!memcpy+0x1e1:
00000000`77a7e8b1 4883e904        sub     rcx,4
00000000`77a7e8b5 8b0411          mov     eax,dword ptr [rcx+rdx]
00000000`77a7e8b8 4983e804        sub     r8,4
00000000`77a7e8bc 8901            mov     dword ptr [rcx],eax

ntdll!memcpy+0x1ee:
00000000`77a7e8be 4d8bc8          mov     r9,r8
00000000`77a7e8c1 49c1e905        shr     r9,5
00000000`77a7e8c5 7550            jne     ntdll!memcpy+0x247 (00000000`77a7e917)

ntdll!memcpy+0x1f7:
00000000`77a7e8c7 4d8bc8          mov     r9,r8
00000000`77a7e8ca 49c1e903        shr     r9,3
00000000`77a7e8ce 7414            je      ntdll!memcpy+0x214 (00000000`77a7e8e4)

ntdll!memcpy+0x200:
00000000`77a7e8d0 4883e908        sub     rcx,8
00000000`77a7e8d4 488b040a        mov     rax,qword ptr [rdx+rcx]
00000000`77a7e8d8 49ffc9          dec     r9
00000000`77a7e8db 488901          mov     qword ptr [rcx],rax
00000000`77a7e8de 75f0            jne     ntdll!memcpy+0x200 (00000000`77a7e8d0)

ntdll!memcpy+0x210:
00000000`77a7e8e0 4983e007        and     r8,7

ntdll!memcpy+0x214:
00000000`77a7e8e4 4d85c0          test    r8,r8
00000000`77a7e8e7 7507            jne     ntdll!memcpy+0x220 (00000000`77a7e8f0)

ntdll!memcpy+0x219:
00000000`77a7e8e9 498bc3          mov     rax,r11
00000000`77a7e8ec c3              ret

ntdll!memcpy+0x220:
00000000`77a7e8f0 48ffc9          dec     rcx
00000000`77a7e8f3 8a040a          mov     al,byte ptr [rdx+rcx]
00000000`77a7e8f6 49ffc8          dec     r8
00000000`77a7e8f9 8801            mov     byte ptr [rcx],al
00000000`77a7e8fb 75f3            jne     ntdll!memcpy+0x220 (00000000`77a7e8f0)

ntdll!memcpy+0x22d:
00000000`77a7e8fd 498bc3          mov     rax,r11
00000000`77a7e900 c3              ret

ntdll!memcpy+0x247:
00000000`77a7e917 4981f900200000  cmp     r9,2000h
00000000`77a7e91e 7342            jae     ntdll!memcpy+0x292 (00000000`77a7e962)

ntdll!memcpy+0x250:
00000000`77a7e920 488b440af8      mov     rax,qword ptr [rdx+rcx-8]
00000000`77a7e925 4c8b540af0      mov     r10,qword ptr [rdx+rcx-10h]
00000000`77a7e92a 4883e920        sub     rcx,20h
00000000`77a7e92e 48894118        mov     qword ptr [rcx+18h],rax
00000000`77a7e932 4c895110        mov     qword ptr [rcx+10h],r10
00000000`77a7e936 488b440a08      mov     rax,qword ptr [rdx+rcx+8]
00000000`77a7e93b 4c8b140a        mov     r10,qword ptr [rdx+rcx]
00000000`77a7e93f 49ffc9          dec     r9
00000000`77a7e942 48894108        mov     qword ptr [rcx+8],rax
00000000`77a7e946 4c8911          mov     qword ptr [rcx],r10
00000000`77a7e949 75d5            jne     ntdll!memcpy+0x250 (00000000`77a7e920)

ntdll!memcpy+0x27b:
00000000`77a7e94b 4983e01f        and     r8,1Fh
00000000`77a7e94f e973ffffff      jmp     ntdll!memcpy+0x1f7 (00000000`77a7e8c7)

ntdll!memcpy+0x292:
00000000`77a7e962 4881fa00f0ffff  cmp     rdx,0FFFFFFFFFFFFF000h
00000000`77a7e969 77b5            ja      ntdll!memcpy+0x250 (00000000`77a7e920)

ntdll!memcpy+0x29b:
00000000`77a7e96b b820000000      mov     eax,20h

ntdll!memcpy+0x2a0:
00000000`77a7e970 4881e980000000  sub     rcx,80h
00000000`77a7e977 0f18040a        prefetchnta [rdx+rcx]
00000000`77a7e97b 0f18440a40      prefetchnta [rdx+rcx+40h]
00000000`77a7e980 ffc8            dec     eax
00000000`77a7e982 75ec            jne     ntdll!memcpy+0x2a0 (00000000`77a7e970)

ntdll!memcpy+0x2b4:
00000000`77a7e984 4881c100100000  add     rcx,1000h
00000000`77a7e98b b840000000      mov     eax,40h

ntdll!memcpy+0x2c0:
00000000`77a7e990 4c8b4c0af8      mov     r9,qword ptr [rdx+rcx-8]
00000000`77a7e995 4c8b540af0      mov     r10,qword ptr [rdx+rcx-10h]
00000000`77a7e99a 4c0fc349f8      movnti  qword ptr [rcx-8],r9
00000000`77a7e99f 4c0fc351f0      movnti  qword ptr [rcx-10h],r10
00000000`77a7e9a4 4c8b4c0ae8      mov     r9,qword ptr [rdx+rcx-18h]
00000000`77a7e9a9 4c8b540ae0      mov     r10,qword ptr [rdx+rcx-20h]
00000000`77a7e9ae 4c0fc349e8      movnti  qword ptr [rcx-18h],r9
00000000`77a7e9b3 4c0fc351e0      movnti  qword ptr [rcx-20h],r10
00000000`77a7e9b8 4c8b4c0ad8      mov     r9,qword ptr [rdx+rcx-28h]
00000000`77a7e9bd 4c8b540ad0      mov     r10,qword ptr [rdx+rcx-30h]
00000000`77a7e9c2 4883e940        sub     rcx,40h
00000000`77a7e9c6 4c0fc34918      movnti  qword ptr [rcx+18h],r9
00000000`77a7e9cb 4c0fc35110      movnti  qword ptr [rcx+10h],r10
00000000`77a7e9d0 4c8b4c0a08      mov     r9,qword ptr [rdx+rcx+8]
00000000`77a7e9d5 4c8b140a        mov     r10,qword ptr [rdx+rcx]
00000000`77a7e9d9 ffc8            dec     eax
00000000`77a7e9db 4c0fc34908      movnti  qword ptr [rcx+8],r9
00000000`77a7e9e0 4c0fc311        movnti  qword ptr [rcx],r10
00000000`77a7e9e4 75aa            jne     ntdll!memcpy+0x2c0 (00000000`77a7e990)

ntdll!memcpy+0x316:
00000000`77a7e9e6 4981e800100000  sub     r8,1000h
00000000`77a7e9ed 4981f800100000  cmp     r8,1000h
00000000`77a7e9f4 0f8371ffffff    jae     ntdll!memcpy+0x29b (00000000`77a7e96b)

ntdll!memcpy+0x32a:
00000000`77a7e9fa f0800c2400      lock or byte ptr [rsp],0
00000000`77a7e9ff e9bafeffff      jmp     ntdll!memcpy+0x1ee (00000000`77a7e8be)
这个函数比较长,在 ntdll 模块里,因此对于效率的要求是很高的,它的函数原型,类似下面:
代码:
char *memcpy(char *dest, char *source, unsigned long long count);
这个 count 实际上是 64 位值,在上一篇逆向中用了 unsigned int (32 位值)


1. memcpy() 分两大情况进行处理

memcpy() 会首先判断 dest 和 source 的位置:

00000000`77a7e6d0 4c8bd9          mov     r11,rcx
00000000`77a7e6d3 482bd1          sub     rdx,rcx
00000000`77a7e6d6 0f829e010000    jb      ntdll!memcpy+0x1aa (00000000`77a7e87a)

ntdll!memcpy+0xc:
00000000`77a7e6dc 4983f808        cmp     r8,8
00000000`77a7e6e0 7262            jb      ntdll!memcpy+0x74 (00000000`77a7e744) 

... ... 

ntdll!memcpy+0x1aa:
00000000`77a7e87a 4903c8          add     rcx,r8
00000000`77a7e87d 4983f808        cmp     r8,8
00000000`77a7e881 7261            jb      ntdll!memcpy+0x214 (00000000`77a7e8e4)


第1种情况是: source >= dest 的时候,第 2 种情况是:source < dest 的时候,在这里我们得到下面的逻辑:
代码:
char *memcpy(char *dest, char *source, unsigned long long count)
{
    /* The untouched destination pointer is what the routine hands back. */
    char *const result = dest;

    if (source < dest)
    {
        /* source below dest: second code path (entered at memcpy+0x1aa) */
    }
    else
    {
        /* source at or above dest: first code path (continues at memcpy+0xc) */
    }

    return result;
}
这里,我主要对 source >= dest 这种情况进行分析,后面的 source < dest 处理手法基本一样,区别只是先用 add rcx,r8 把指针移到缓冲区末尾,然后从高地址向低地址复制,这样在两块内存重叠时不会覆盖尚未复制的数据。


2.  当 count 小于 8 的时候

接下来,首先处理当复制数量小于 8 bytes 时,在这种情况肯定是最简单的:

ntdll!memcpy+0xc:
00000000`77a7e6dc 4983f808        cmp     r8,8                          ; count < 8 时候
00000000`77a7e6e0 7262            jb      ntdll!memcpy+0x74 (00000000`77a7e744)

... ...

ntdll!memcpy+0x74:
00000000`77a7e744 4d85c0          test    r8,r8
00000000`77a7e747 7507            jne     ntdll!memcpy+0x80 (00000000`77a7e750)

... ...

ntdll!memcpy+0x80:
00000000`77a7e750 8a040a          mov     al,byte ptr [rdx+rcx]
00000000`77a7e753 8801            mov     byte ptr [rcx],al
00000000`77a7e755 48ffc1          inc     rcx
00000000`77a7e758 49ffc8          dec     r8
00000000`77a7e75b 75f3            jne     ntdll!memcpy+0x80 (00000000`77a7e750)

ntdll!memcpy+0x8d:
00000000`77a7e75d 498bc3          mov     rax,r11
00000000`77a7e760 c3              ret

当小于 8 时候只是做简单的复制就可以了,于是,我们得到下面的逻辑:
代码:
char *memcpy(char *dest, char *source, unsigned long long count)
{
    /* Returned unchanged to the caller. */
    char *const result = dest;
    unsigned long long i;

    if (source < dest)
    {
        /* Backward path: not modelled at this stage. */
        return result;
    }

    /* Fewer than 8 bytes: plain forward byte-by-byte copy. */
    if (count < 8)
    {
        for (i = 0; i < count; i++)
        {
            dest[i] = source[i];
        }
    }

    return result;
}

3. 处理地址非对齐的情况

我们都知道,当复制的时候,源地址和目标地址都是对齐的情况下,效率是最高的。
当然这是一种理想的状态,实际上我们会遇到非对齐的地址,那么 memcpy() 会怎样处理呢:

ntdll!memcpy+0x12:
00000000`77a7e6e2 f6c107          test    cl,7
00000000`77a7e6e5 7437            je      ntdll!memcpy+0x4e (00000000`77a7e71e)

ntdll!memcpy+0x17:
00000000`77a7e6e7 f6c101          test    cl,1
00000000`77a7e6ea 740c            je      ntdll!memcpy+0x28 (00000000`77a7e6f8)

ntdll!memcpy+0x1c:
00000000`77a7e6ec 8a040a          mov     al,byte ptr [rdx+rcx]
00000000`77a7e6ef 49ffc8          dec     r8
00000000`77a7e6f2 8801            mov     byte ptr [rcx],al
00000000`77a7e6f4 4883c101        add     rcx,1

ntdll!memcpy+0x28:
00000000`77a7e6f8 f6c102          test    cl,2
00000000`77a7e6fb 740f            je      ntdll!memcpy+0x3c (00000000`77a7e70c)

ntdll!memcpy+0x2d:
00000000`77a7e6fd 668b040a        mov     ax,word ptr [rdx+rcx]
00000000`77a7e701 4983e802        sub     r8,2
00000000`77a7e705 668901          mov     word ptr [rcx],ax
00000000`77a7e708 4883c102        add     rcx,2

ntdll!memcpy+0x3c:
00000000`77a7e70c f6c104          test    cl,4
00000000`77a7e70f 740d            je      ntdll!memcpy+0x4e (00000000`77a7e71e)

ntdll!memcpy+0x41:
00000000`77a7e711 8b0411          mov     eax,dword ptr [rcx+rdx]
00000000`77a7e714 4983e804        sub     r8,4
00000000`77a7e718 8901            mov     dword ptr [rcx],eax
00000000`77a7e71a 4883c104        add     rcx,4

上面一段代码都是在处理目标地址处于非对齐的情况,memcpy() 的处理手法是:在进行大批量复制之前,先按 1、2、4 bytes 依次复制掉非对齐的部分,使目标地址变成 8 bytes 对齐。

上面这段代码的逻辑是:
代码:
  if (dest & 0x07)
  {
    if (dest & 0x01)
    {
      *dest++ = *source++;
      count--;
    }

    if (dest & 0x02)
    {
      *dest++ = *source++;
      *dest++ = *source++;
      count -= 2;
    }

    if (dest & 0x04)
    {
      *(int *)dest++ = *(int *)source++;
      count -= 4;
    }
  }
这段代码的作用是对非对齐地址进行填补,我们可以把代码写得好看些:
代码:
/*
 * Model of ntdll!memcpy, forward path only (source >= dest), covering:
 *   - copies shorter than 8 bytes, done byte by byte;
 *   - the leading bytes needed to bring dest to an 8-byte boundary.
 * Anything after the alignment step is not modelled at this stage.
 *
 * dest    destination buffer
 * source  source buffer
 * count   number of bytes to copy
 * returns the original dest pointer
 */
char *memcpy(char *dest, char *source, unsigned long long count)
{
    char *p = dest;

    if (source >= dest)
    {

         if (count < 8)
         {
             while (count--)
                 *dest++ = *source++;

             /* Done: without this return the alignment step would run
                with count wrapped around to a huge value. */
             return p;
         }


        /*
         * A destination whose low three bits are k needs 8 - k bytes to
         * reach the next 8-byte boundary, so case 1 falls through all
         * seven copies and case 7 performs just one.  count >= 8 here,
         * so at most 7 bytes are always available.
         */
        switch ((unsigned long long)dest & 0x07)
        {
        case 1:        *dest++ = *source++;  count--;   /* fallthrough */
        case 2:        *dest++ = *source++;  count--;   /* fallthrough */
        case 3:        *dest++ = *source++;  count--;   /* fallthrough */
        case 4:        *dest++ = *source++;  count--;   /* fallthrough */
        case 5:        *dest++ = *source++;  count--;   /* fallthrough */
        case 6:        *dest++ = *source++;  count--;   /* fallthrough */
        case 7:        *dest++ = *source++;  count--;
                       break;
        default:       break;                           /* already aligned */
        }


    }
    else
    {
         /* source < dest: backward path, not modelled here */
    }


    return p;
}

4. 当 count 小于 32 的情况下,以 8 bytes 作为一个处理单元

接下来会根据 count 的大小分为几种情况:首先用 count / 32 判断是否够一个 32 bytes 单元,不够 32 bytes 时则以 8 bytes 作为一个处理单元:

ntdll!memcpy+0x4e:
00000000`77a7e71e 4d8bc8          mov     r9,r8                                  ; 
00000000`77a7e721 49c1e905        shr     r9,5                                   ; 以 count / 32 为一个单元
00000000`77a7e725 7550            jne     ntdll!memcpy+0xa7 (00000000`77a7e777)

ntdll!memcpy+0x57:
00000000`77a7e727 4d8bc8          mov     r9,r8
00000000`77a7e72a 49c1e903        shr     r9,3                                   ; 以 count / 8 作为一个单元
00000000`77a7e72e 7414            je      ntdll!memcpy+0x74 (00000000`77a7e744)

ntdll!memcpy+0x60:
00000000`77a7e730 488b040a        mov     rax,qword ptr [rdx+rcx]                ; 一次复制 8 bytes
00000000`77a7e734 488901          mov     qword ptr [rcx],rax
00000000`77a7e737 4883c108        add     rcx,8
00000000`77a7e73b 49ffc9          dec     r9
00000000`77a7e73e 75f0            jne     ntdll!memcpy+0x60 (00000000`77a7e730)

ntdll!memcpy+0x70:
00000000`77a7e740 4983e007        and     r8,7

ntdll!memcpy+0x74:
00000000`77a7e744 4d85c0          test    r8,r8
00000000`77a7e747 7507            jne     ntdll!memcpy+0x80 (00000000`77a7e750)   ; 一次复制 1 byte

ntdll!memcpy+0x79:
00000000`77a7e749 498bc3          mov     rax,r11
00000000`77a7e74c c3              ret

这段代码在 32 bytes 内,先以 8 bytes 为单位进行复制,剩下不足的按 1 byte 复制,我们得到下面逻辑:
代码:
/*
 * Model of ntdll!memcpy, forward path only (source >= dest), covering:
 *   - copies shorter than 8 bytes;
 *   - alignment of dest to an 8-byte boundary;
 *   - remaining counts below 32: 8-byte units, then single bytes.
 * Remaining counts of 32 or more are not modelled at this stage.
 *
 * dest    destination buffer
 * source  source buffer
 * count   number of bytes to copy
 * returns the original dest pointer
 */
char *memcpy(char *dest, char *source, unsigned long long count)
{
    char *p = dest;
    unsigned long long i;

    if (source >= dest)
    {

         if (count < 8)
         {
             while (count--)
                 *dest++ = *source++;

             return p;
         }

        /*
         * Align dest: low bits k need 8 - k bytes, so case 1 falls
         * through seven copies and case 7 performs one.
         */
        switch ((unsigned long long)dest & 0x07)
        {
        case 1:        *dest++ = *source++;  count--;   /* fallthrough */
        case 2:        *dest++ = *source++;  count--;   /* fallthrough */
        case 3:        *dest++ = *source++;  count--;   /* fallthrough */
        case 4:        *dest++ = *source++;  count--;   /* fallthrough */
        case 5:        *dest++ = *source++;  count--;   /* fallthrough */
        case 6:        *dest++ = *source++;  count--;   /* fallthrough */
        case 7:        *dest++ = *source++;  count--;
                       break;
        default:       break;                           /* already aligned */
        }


        /* fewer than 32 bytes left */

        if (count < 32)
        {
                /* 8 bytes at a time (memcpy+0x60).  The pointers are
                   advanced explicitly: "*(long long *)dest++" would move
                   them by one byte only.  Only dest is known to be
                   aligned; the load may be unaligned, as in the original
                   instruction sequence. */
                for (i = count / 8; i; i--)
                {
                        *(long long *)dest = *(long long *)source;
                        dest += 8;
                        source += 8;
                }

                /* Leftover bytes (memcpy+0x80).  This runs even when fewer
                   than 8 bytes remained after the alignment step. */
                for (i = count % 8; i; i--)
                        *dest++ = *source++;
        }
        else
        {
                /* 32 bytes or more: not modelled at this stage */
        }

    }
    else
    {
         /* source < dest: backward path, not modelled here */
    }


    return p;
}

5. 当复制数量大于等于 32 bytes 时的情况

这里的情况有些复杂,memcpy() 还以 256K bytes 作为分界线

5.1 小于 256K bytes 的时候

下面看看小于 256k bytes 时的处理:

ntdll!memcpy+0xa7:
00000000`77a7e777 4981f900200000  cmp     r9,2000h                                ; 8K * 32 = 256K bytes
00000000`77a7e77e 7342            jae     ntdll!memcpy+0xf2 (00000000`77a7e7c2)

ntdll!memcpy+0xb0:
00000000`77a7e780 488b040a        mov     rax,qword ptr [rdx+rcx]                 ; 一次复制 32 bytes
00000000`77a7e784 4c8b540a08      mov     r10,qword ptr [rdx+rcx+8]
00000000`77a7e789 4883c120        add     rcx,20h
00000000`77a7e78d 488941e0        mov     qword ptr [rcx-20h],rax
00000000`77a7e791 4c8951e8        mov     qword ptr [rcx-18h],r10
00000000`77a7e795 488b440af0      mov     rax,qword ptr [rdx+rcx-10h]
00000000`77a7e79a 4c8b540af8      mov     r10,qword ptr [rdx+rcx-8]
00000000`77a7e79f 49ffc9          dec     r9
00000000`77a7e7a2 488941f0        mov     qword ptr [rcx-10h],rax
00000000`77a7e7a6 4c8951f8        mov     qword ptr [rcx-8],r10
00000000`77a7e7aa 75d4            jne     ntdll!memcpy+0xb0 (00000000`77a7e780)

ntdll!memcpy+0xdc:
00000000`77a7e7ac 4983e01f        and     r8,1Fh
00000000`77a7e7b0 e972ffffff      jmp     ntdll!memcpy+0x57 (00000000`77a7e727)

这里处理的是小于 256K bytes 的情况,每次循环复制 32 bytes,循环结束后不足 32 bytes 的剩余部分(and r8,1Fh)交回前面的 8 bytes / 1 byte 流程处理:
代码:
/*
 * Model of ntdll!memcpy, forward path only (source >= dest), covering:
 *   - copies shorter than 8 bytes;
 *   - alignment of dest to an 8-byte boundary;
 *   - 32-byte units while at least 32 bytes remain and the copy is
 *     below 8192 such units (256K bytes);
 *   - the final 8-byte units and single bytes.
 * Copies of 8192 or more 32-byte units are not modelled at this stage.
 *
 * dest    destination buffer
 * source  source buffer
 * count   number of bytes to copy
 * returns the original dest pointer
 */
char *memcpy(char *dest, char *source, unsigned long long count)
{
    char *p = dest;
    unsigned long long i;

    if (source >= dest)
    {

         if (count < 8)
         {
             while (count--)
                 *dest++ = *source++;

             return p;
         }

        /*
         * Align dest: low bits k need 8 - k bytes, so case 1 falls
         * through seven copies and case 7 performs one.
         */
        switch ((unsigned long long)dest & 0x07)
        {
        case 1:        *dest++ = *source++;  count--;   /* fallthrough */
        case 2:        *dest++ = *source++;  count--;   /* fallthrough */
        case 3:        *dest++ = *source++;  count--;   /* fallthrough */
        case 4:        *dest++ = *source++;  count--;   /* fallthrough */
        case 5:        *dest++ = *source++;  count--;   /* fallthrough */
        case 6:        *dest++ = *source++;  count--;   /* fallthrough */
        case 7:        *dest++ = *source++;  count--;
                       break;
        default:       break;                           /* already aligned */
        }


process_count32:

        if (count < 32)
        {
                /* 8 bytes at a time, pointers advanced explicitly */
                for (i = count / 8; i; i--)
                {
                        *(long long *)dest = *(long long *)source;
                        dest += 8;
                        source += 8;
                }

                /* leftover single bytes, also when fewer than 8 remain */
                for (i = count % 8; i; i--)
                        *dest++ = *source++;
        }
        else
        {
                /* 32 bytes or more; 64-bit so large counts do not truncate */
                unsigned long long count32 = count / 32;

                if (count32 >= 8192)
                {
                        /* 256K bytes or more: not modelled at this stage */
                }
                else
                {
                        /* one iteration per 32-byte unit (memcpy+0xb0) */
                        for (i = 0; i < count32; i++)
                        {
                                *(long long *)(dest +  0) = *(long long *)(source +  0);
                                *(long long *)(dest +  8) = *(long long *)(source +  8);
                                *(long long *)(dest + 16) = *(long long *)(source + 16);
                                *(long long *)(dest + 24) = *(long long *)(source + 24);
                                dest += 32;
                                source += 32;
                        }

                        /* Keep only the remainder ("and r8,1Fh"); without
                           this the jump below would never terminate. */
                        count %= 32;

                        goto process_count32;   /* finish the last < 32 bytes */
                }
        }

    }
    else
    {
         /* source < dest: backward path, not modelled here */
    }


    return p;
}

5.2 当复制大于256K bytes 

这里为了效率,采用了一系列 Move Non-Temporal 指令来优化,以及 Prefetch 指令

ntdll!memcpy+0xf2:
00000000`77a7e7c2 4881fa00100000  cmp     rdx,1000h
00000000`77a7e7c9 72b5            jb      ntdll!memcpy+0xb0 (00000000`77a7e780)

ntdll!memcpy+0xfb:
00000000`77a7e7cb b820000000      mov     eax,20h

ntdll!memcpy+0x100:
00000000`77a7e7d0 0f18040a        prefetchnta [rdx+rcx]
00000000`77a7e7d4 0f18440a40      prefetchnta [rdx+rcx+40h]
00000000`77a7e7d9 4881c180000000  add     rcx,80h
00000000`77a7e7e0 ffc8            dec     eax
00000000`77a7e7e2 75ec            jne     ntdll!memcpy+0x100 (00000000`77a7e7d0)

ntdll!memcpy+0x114:
00000000`77a7e7e4 4881e900100000  sub     rcx,1000h
00000000`77a7e7eb b840000000      mov     eax,40h

ntdll!memcpy+0x120:
00000000`77a7e7f0 4c8b0c0a        mov     r9,qword ptr [rdx+rcx]
00000000`77a7e7f4 4c8b540a08      mov     r10,qword ptr [rdx+rcx+8]
00000000`77a7e7f9 4c0fc309        movnti  qword ptr [rcx],r9
00000000`77a7e7fd 4c0fc35108      movnti  qword ptr [rcx+8],r10
00000000`77a7e802 4c8b4c0a10      mov     r9,qword ptr [rdx+rcx+10h]
00000000`77a7e807 4c8b540a18      mov     r10,qword ptr [rdx+rcx+18h]
00000000`77a7e80c 4c0fc34910      movnti  qword ptr [rcx+10h],r9
00000000`77a7e811 4c0fc35118      movnti  qword ptr [rcx+18h],r10
00000000`77a7e816 4c8b4c0a20      mov     r9,qword ptr [rdx+rcx+20h]
00000000`77a7e81b 4c8b540a28      mov     r10,qword ptr [rdx+rcx+28h]
00000000`77a7e820 4883c140        add     rcx,40h
00000000`77a7e824 4c0fc349e0      movnti  qword ptr [rcx-20h],r9
00000000`77a7e829 4c0fc351e8      movnti  qword ptr [rcx-18h],r10
00000000`77a7e82e 4c8b4c0af0      mov     r9,qword ptr [rdx+rcx-10h]
00000000`77a7e833 4c8b540af8      mov     r10,qword ptr [rdx+rcx-8]
00000000`77a7e838 ffc8            dec     eax
00000000`77a7e83a 4c0fc349f0      movnti  qword ptr [rcx-10h],r9
00000000`77a7e83f 4c0fc351f8      movnti  qword ptr [rcx-8],r10
00000000`77a7e844 75aa            jne     ntdll!memcpy+0x120 (00000000`77a7e7f0)

ntdll!memcpy+0x176:
00000000`77a7e846 4981e800100000  sub     r8,1000h
00000000`77a7e84d 4981f800100000  cmp     r8,1000h
00000000`77a7e854 0f8371ffffff    jae     ntdll!memcpy+0xfb (00000000`77a7e7cb)

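在这里以 4K bytes 作为一个处理单元进行复制,首先使用 prefetchnta 指令来加载 4K bytes 到 cache(80h * 20h = 4K),
然后用 movnti 指令每次循环复制 64 bytes,循环 40h 次正好复制完这 4K bytes(40h * 40h = 4K)。另外,当 source 与 dest 的距离小于 4K(cmp rdx,1000h)时不走这条路径,而是回到前面每次 32 bytes 的普通循环。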
代码:
/*
 * Model of ntdll!memcpy, forward path only (source >= dest):
 *   1. fewer than 8 bytes: byte-by-byte copy;
 *   2. align dest to an 8-byte boundary;
 *   3. 256K bytes or more with source at least 4K above dest: 4K blocks
 *      (the original prefetches each block with prefetchnta and stores
 *      it with movnti; plain C stores stand in for both here);
 *   4. otherwise 32-byte units;
 *   5. final 8-byte units and single bytes.
 * The backward path (source < dest) is not modelled.
 *
 * dest    destination buffer
 * source  source buffer
 * count   number of bytes to copy
 * returns the original dest pointer
 */
char *memcpy(char *dest, char *source, unsigned long long count)
{
    char *p = dest;
    unsigned long long i;

    if (source >= dest)
    {

         if (count < 8)
         {
             while (count--)
                 *dest++ = *source++;

             return p;
         }

        /*
         * Align dest: low bits k need 8 - k bytes, so case 1 falls
         * through seven copies and case 7 performs one.
         */
        switch ((unsigned long long)dest & 0x07)
        {
        case 1:        *dest++ = *source++;  count--;   /* fallthrough */
        case 2:        *dest++ = *source++;  count--;   /* fallthrough */
        case 3:        *dest++ = *source++;  count--;   /* fallthrough */
        case 4:        *dest++ = *source++;  count--;   /* fallthrough */
        case 5:        *dest++ = *source++;  count--;   /* fallthrough */
        case 6:        *dest++ = *source++;  count--;   /* fallthrough */
        case 7:        *dest++ = *source++;  count--;
                       break;
        default:       break;                           /* already aligned */
        }


process_count32:

        if (count < 32)
        {
                /* 8 bytes at a time, pointers advanced explicitly */
                for (i = count / 8; i; i--)
                {
                        *(long long *)dest = *(long long *)source;
                        dest += 8;
                        source += 8;
                }

                /* leftover single bytes, also when fewer than 8 remain */
                for (i = count % 8; i; i--)
                        *dest++ = *source++;
        }
        else
        {
                /* 32 bytes or more; 64-bit so large counts do not truncate */
                unsigned long long count32 = count / 32;

                if (count32 >= 8192 &&
                    (unsigned long long)(source - dest) >= 4096)
                {
                        /*
                         * 256K bytes or more (memcpy+0xfb): whole 4K blocks
                         * while at least 4K remains.  The distance check
                         * mirrors "cmp rdx,1000h": closer buffers use the
                         * ordinary 32-byte loop instead.
                         */
                        do
                        {
                                for (i = 0; i < 4096 / 8; i++)
                                {
                                        *(long long *)dest = *(long long *)source;
                                        dest += 8;
                                        source += 8;
                                }

                                count -= 4096;
                        } while (count >= 4096);

                        /* NOTE(review): the original executes
                           "lock or byte ptr [rsp],0" here, presumably as
                           a fence after the non-temporal stores. */

                        goto process_count32;   /* finish the last < 4K bytes */
                }
                else
                {
                        /* one iteration per 32-byte unit (memcpy+0xb0) */
                        for (i = 0; i < count32; i++)
                        {
                                *(long long *)(dest +  0) = *(long long *)(source +  0);
                                *(long long *)(dest +  8) = *(long long *)(source +  8);
                                *(long long *)(dest + 16) = *(long long *)(source + 16);
                                *(long long *)(dest + 24) = *(long long *)(source + 24);
                                dest += 32;
                                source += 32;
                        }

                        /* Keep only the remainder ("and r8,1Fh"); without
                           this the jump below would never terminate. */
                        count %= 32;

                        goto process_count32;   /* finish the last < 32 bytes */
                }
        }

    }
    else
    {
         /* source < dest: backward path, not modelled here */
    }


    return p;
}