A
Size: a a a
A
E
A
d
memcpy cases
+------------+-----------------------------+
|  pointers  |  destination pointer align  |
| difference +-----+-----+-----+-----+-----+
|   align    |  16 |   8 |   4 |   2 |   1 |
+------------+-----+-----+-----+-----+-----+
|     16     |  16 | 16* | 16* | 16* | 16* |
+------------+-----+-----+-----+-----+-----+
|      8     |   8 |   8 |  8* |  8* |  8* |
+------------+-----+-----+-----+-----+-----+
|      4     |   4 |   4 |   4 |  4* |  4* |
+------------+-----+-----+-----+-----+-----+
|      2     |   2 |   2 |   2 |   2 |  2* |
+------------+-----+-----+-----+-----+-----+
|      1     |   1 |   1 |   1 |   1 |   1 |
+------------+-----+-----+-----+-----+-----+
* - block copy is performed after the non-aligned leading part has been copied
E
A
E
d
A
A
A
A
d
E
scrot --select 1.png
))d
E
movsb
, можно было написать намного лучше... Странно, что это на O3
memcpy:
;-----------------------------------------------------------------------
; memcpy body: 32-bit x86, Intel operand order, arguments read from the stack.
;
; In:    [ebp+8]  -> ecx  destination pointer (all stores go through it)
;        [ebp+12] -> esi  source pointer (all loads go through it)
;        [ebp+16] -> ebx  number of bytes to copy
; Out:   eax = the original destination pointer
; Saved: ebp, edi, esi, ebx are pushed on entry and popped on every exit.
; Local: [ebp-16] (the 4 bytes reserved by "sub esp, 4") holds count-1.
;
; Paths:
;   count == 0                          -> .L2, return immediately
;   (dst | src) & 3 != 0, or count <= 3 -> .L3, byte-at-a-time movsb loop
;   otherwise                           -> .L4 dword loop, then 0..3 tail bytes
;
; The trailing "; AGUn / ALUn x.xx" notes on each instruction appear to be
; hand-made execution-unit / cycle estimates; they are kept verbatim.
;-----------------------------------------------------------------------
; Prologue: build the ebp frame, save the registers used below, reserve one local.
push ebp ; AGU0 1.00
mov ebp, esp ; ALU0 1.00
push edi ; AGU0 2.00
push esi ; ALU0 2.00
push ebx ; AGU0 3.00
sub esp, 4 ; ALU0 3.00
; Load the three arguments: ebx = count, ecx = dst, esi = src.
mov ebx, DWORD PTR [ebp+16] ; AGU0 4.00
mov ecx, DWORD PTR [ebp+8] ; AGU1 1.00
mov esi, DWORD PTR [ebp+12] ; AGU2 1.00
; Nothing to copy when count == 0; ecx already holds the return value.
test ebx, ebx ; ALU0 4.00
je .L2
; eax = dst | src, so its low two bits are clear only if BOTH pointers are
; 4-byte aligned. edi = count-1 is also spilled to [ebp-16] for the tail code.
mov eax, ecx ; ALU1 1.00
lea edi, [ebx-1] ; AGU0 5.00
or eax, esi ; ALU0 5.00
mov DWORD PTR [ebp-16], edi ; AGU0 6.00
test al, 3 ; ALU0 6.00
jne .L3
; count-1 <= 2 (unsigned) means count <= 3: less than one dword, use the byte loop.
cmp edi, 2 ; ALU1 2.00
jbe .L3
; Dword-loop setup: eax = source cursor, edx = destination cursor,
; edi = src + (count & ~3), the source address at which the loop stops.
mov edi, ebx ; ALU0 7.00
mov eax, esi ; ALU0 7.00
mov edx, ecx ; ALU0 7.00
and edi, -4 ; ALU1 3.00
add edi, esi ; ALU1 4.00
.align 16
.L4:
; Copy 4 bytes per iteration. ecx is reused as the data temporary here, which
; is why dst is reloaded from the stack after the loop. The store addresses
; [edx-4] because edx has already been advanced.
mov ecx, DWORD PTR [eax] ; AGU0 9.00
add eax, 4 ; ALU0 8.00
add edx, 4 ; ALU0 8.00
mov DWORD PTR [edx-4], ecx ; AGU0 10.0 / wait EDX
cmp eax, edi ; ALU0 9.00
jne .L4 ; n*9 clock cycles
; Tail setup: restore ecx = dst, eax = count & ~3 (bytes already copied),
; edx / esi = destination / source advanced past the copied part.
mov ecx, DWORD PTR [ebp+8] ; AGU0 13.00
mov eax, ebx ; ALU0 10.00
and eax, -4 ; ALU0 11.00
lea edx, [ecx+eax] ; AGU1 2.00 / wait EAX
add esi, eax ; ALU0 12.00
; Done if count was an exact multiple of 4.
cmp ebx, eax ; ALU0 13.00
je .L2
; Tail byte 0.
movzx ebx, BYTE PTR [esi] ; ALU0 14.00
mov BYTE PTR [edx], bl ; ALU0 18.00 / Wait EBX
; ebx = (count-1) - (count & ~3) = remaining tail bytes after the one just
; copied; zero means the tail was a single byte.
mov ebx, DWORD PTR [ebp-16] ; ALU0 22.00
sub ebx, eax ; ALU0 26.00
je .L2
; Tail byte 1.
movzx eax, BYTE PTR [esi+1] ; ALU0 27.00
mov BYTE PTR [edx+1], al ; ALU0 31.00 / Wait EAX
; Exactly one byte was left after byte 0, so the tail is finished.
cmp ebx, 1 ; ALU1 5.00
je .L2
; Tail byte 2 (the last possible one, since the tail is at most 3 bytes).
movzx eax, BYTE PTR [esi+2] ; ALU0 34.00
mov BYTE PTR [edx+2], al ; ALU0 38.00 / Wait EAX
.L2:
; Common exit: drop the local, return dst in eax, restore saved registers.
add esp, 4 ; ZF.start.13 = ALU0 5.00 ; ZF.start.41 = ALU0 14.00 ; ...
mov eax, ecx ; ALU0 5.00 ; ALU0 14.00 ; ...
pop ebx ; ALU0 5.00 ; ALU0 14.00 ; ...
pop esi ; ALU1 6.00 ; ; ...
pop edi ; ALU2 1.00 ; ; ...
pop ebp ; AGU0 16.0 ; ; ...
ret ; ALU0 6.00 ; ALU0 15.00 ; ...
.align 16
.L3:
; Byte-loop fallback (unaligned pointers or count <= 3). Only reached with
; count != 0, so the first movsb is always wanted.
; ebx = dst + count (one past the last destination byte), edi = dst cursor,
; esi = source cursor (still the src argument).
add ebx, ecx ; ZF.start.19 = ALU0 9.00
mov edi, ecx ; ALU0 10.0
.align 16
.L6:
; movsb copies one byte from [esi] to [edi] and steps both pointers.
; Assumes the direction flag is clear so both advance upward - nothing in
; this listing sets or clears DF; TODO confirm against the calling convention.
; NOTE(review): a bare movsb per byte is generally considered slow on modern
; cores; kept as-is because this listing is being analysed, not tuned.
movsb ; ALU0 11.0 / Wait EDI
cmp ebx, edi ; ALU0 14.0 / Wait EDI
jne .L6 ; n*4 clock cycles + 5 (if no jnz, or +2 if jnz)
; Second copy of the epilogue (same sequence as at .L2): return dst in eax.
add esp, 4 ; ALU0
mov eax, ecx ; ALU0
pop ebx ; ALU0
pop esi ; ALU1
pop edi ; AGU0
pop ebp ; AGU1
ret ; ALU0
Пропускная способность, чтобы далеко не заглядывать:d
E
d