svn commit: r339322 - head/sys/amd64/amd64
Mateusz Guzik
mjg at FreeBSD.org
Thu Oct 11 23:37:58 UTC 2018
Author: mjg
Date: Thu Oct 11 23:37:57 2018
New Revision: 339322
URL: https://svnweb.freebsd.org/changeset/base/339322
Log:
amd64: make memmove and memcpy less slow with mov
The reasoning is the same as with the memset change; see r339205.
Reviewed by: kib (previous version)
Approved by: re (gjb)
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D17441
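
As a rough illustration of the forward path of the new MEMMOVE macro, here is a
C sketch of the same size dispatch (the helper name copy_forward_sketch and the
fixed-size memcpy calls standing in for explicit mov instructions are
illustrative only, not part of the commit): copies under 32 bytes go straight to
a ladder of plain moves, 32-256 bytes are handled in 32-byte chunks of ordinary
8-byte moves, and only copies above 256 bytes fall back to the string
instructions (rep movsb in the erms variants, rep movsq plus a tail in the std
ones), whose startup overhead is what made the old code slow for short copies,
the same reasoning as in r339205.

#include <stddef.h>
#include <string.h>

/*
 * Illustrative C rendering of the forward-copy dispatch in the MEMMOVE
 * macro below.  Overlap handling and the erms/std distinction are left
 * out; libc memcpy stands in for the rep movs path and for the explicit
 * 8-byte moves.
 */
static void
copy_forward_sketch(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (len >= 32) {
		if (len > 256) {
			/* Large copy: the assembly uses rep movsb (erms)
			 * or rep movsq plus a tail (std). */
			memcpy(d, s, len);
			return;
		}
		/* 32..256 bytes: 32-byte chunks of 8-byte moves
		 * (the 1032 loop in the diff). */
		do {
			memcpy(d, s, 32);
			d += 32; s += 32; len -= 32;
		} while (len >= 32);
	}
	/* 0..31 bytes left: at most one move per power of two
	 * (the 1016/1008/1004/1002/1001 ladder). */
	if (len & 16) { memcpy(d, s, 16); d += 16; s += 16; }
	if (len & 8)  { memcpy(d, s, 8);  d += 8;  s += 8;  }
	if (len & 4)  { memcpy(d, s, 4);  d += 4;  s += 4;  }
	if (len & 2)  { memcpy(d, s, 2);  d += 2;  s += 2;  }
	if (len & 1)  { *d = *s; }
}
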
Modified:
head/sys/amd64/amd64/support.S
Modified: head/sys/amd64/amd64/support.S
==============================================================================
--- head/sys/amd64/amd64/support.S Thu Oct 11 23:28:04 2018 (r339321)
+++ head/sys/amd64/amd64/support.S Thu Oct 11 23:37:57 2018 (r339322)
@@ -200,82 +200,236 @@ END(memcmp)
* Adapted from bcopy written by:
* ws at tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
*/
-ENTRY(memmove_std)
- PUSH_FRAME_POINTER
- movq %rdi,%rax
- movq %rdx,%rcx
+/*
+ * Register state at entry is supposed to be as follows:
+ * rdi - destination
+ * rsi - source
+ * rdx - count
+ *
+ * The macro possibly clobbers the above and: rcx, r8.
+ * It does not clobber rax, r10 nor r11.
+ */
+.macro MEMMOVE erms overlap begin end
+ \begin
+.if \overlap == 1
movq %rdi,%r8
subq %rsi,%r8
- cmpq %rcx,%r8 /* overlapping && src < dst? */
+ cmpq %rcx,%r8 /* overlapping && src < dst? */
jb 2f
+.endif
- cmpq $15,%rcx
- jbe 1f
- shrq $3,%rcx /* copy by 64-bit words */
- rep
- movsq
- movq %rdx,%rcx
- andq $7,%rcx /* any bytes left? */
- jne 1f
- POP_FRAME_POINTER
+ cmpq $32,%rcx
+ jb 1016f
+
+ cmpq $256,%rcx
+ ja 1256f
+
+1032:
+ movq (%rsi),%rdx
+ movq %rdx,(%rdi)
+ movq 8(%rsi),%rdx
+ movq %rdx,8(%rdi)
+ movq 16(%rsi),%rdx
+ movq %rdx,16(%rdi)
+ movq 24(%rsi),%rdx
+ movq %rdx,24(%rdi)
+ leaq 32(%rsi),%rsi
+ leaq 32(%rdi),%rdi
+ subq $32,%rcx
+ cmpq $32,%rcx
+ jae 1032b
+ cmpb $0,%cl
+ jne 1016f
+ \end
ret
ALIGN_TEXT
-1:
+1016:
+ cmpb $16,%cl
+ jl 1008f
+ movq (%rsi),%rdx
+ movq %rdx,(%rdi)
+ movq 8(%rsi),%rdx
+ movq %rdx,8(%rdi)
+ subb $16,%cl
+ jz 1000f
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+1008:
+ cmpb $8,%cl
+ jl 1004f
+ movq (%rsi),%rdx
+ movq %rdx,(%rdi)
+ subb $8,%cl
+ jz 1000f
+ leaq 8(%rsi),%rsi
+ leaq 8(%rdi),%rdi
+1004:
+ cmpb $4,%cl
+ jl 1002f
+ movl (%rsi),%edx
+ movl %edx,(%rdi)
+ subb $4,%cl
+ jz 1000f
+ leaq 4(%rsi),%rsi
+ leaq 4(%rdi),%rdi
+1002:
+ cmpb $2,%cl
+ jl 1001f
+ movw (%rsi),%dx
+ movw %dx,(%rdi)
+ subb $2,%cl
+ jz 1000f
+ leaq 2(%rsi),%rsi
+ leaq 2(%rdi),%rdi
+1001:
+ cmpb $1,%cl
+ jl 1000f
+ movb (%rsi),%dl
+ movb %dl,(%rdi)
+1000:
+ \end
+ ret
+
+ ALIGN_TEXT
+1256:
+.if \erms == 1
rep
movsb
- POP_FRAME_POINTER
+.else
+ shrq $3,%rcx /* copy by 64-bit words */
+ rep
+ movsq
+ movq %rdx,%rcx
+ andb $7,%cl /* any bytes left? */
+ jne 1004b
+.endif
+ \end
ret
- /* ALIGN_TEXT */
+.if \overlap == 1
+ /*
+ * Copy backwards.
+ */
+ ALIGN_TEXT
2:
- addq %rcx,%rdi /* copy backwards */
+ addq %rcx,%rdi
addq %rcx,%rsi
+
+ cmpq $32,%rcx
+ jb 2016f
+
+ cmpq $256,%rcx
+ ja 2256f
+
+2032:
+ movq -8(%rsi),%rdx
+ movq %rdx,-8(%rdi)
+ movq -16(%rsi),%rdx
+ movq %rdx,-16(%rdi)
+ movq -24(%rsi),%rdx
+ movq %rdx,-24(%rdi)
+ movq -32(%rsi),%rdx
+ movq %rdx,-32(%rdi)
+ leaq -32(%rsi),%rsi
+ leaq -32(%rdi),%rdi
+ subq $32,%rcx
+ cmpq $32,%rcx
+ jae 2032b
+ cmpb $0,%cl
+ jne 2016f
+ \end
+ ret
+ ALIGN_TEXT
+2016:
+ cmpb $16,%cl
+ jl 2008f
+ movq -8(%rsi),%rdx
+ movq %rdx,-8(%rdi)
+ movq -16(%rsi),%rdx
+ movq %rdx,-16(%rdi)
+ subb $16,%cl
+ jz 2000f
+ leaq -16(%rsi),%rsi
+ leaq -16(%rdi),%rdi
+2008:
+ cmpb $8,%cl
+ jl 2004f
+ movq -8(%rsi),%rdx
+ movq %rdx,-8(%rdi)
+ subb $8,%cl
+ jz 2000f
+ leaq -8(%rsi),%rsi
+ leaq -8(%rdi),%rdi
+2004:
+ cmpb $4,%cl
+ jl 2002f
+ movl -4(%rsi),%edx
+ movl %edx,-4(%rdi)
+ subb $4,%cl
+ jz 2000f
+ leaq -4(%rsi),%rsi
+ leaq -4(%rdi),%rdi
+2002:
+ cmpb $2,%cl
+ jl 2001f
+ movw -2(%rsi),%dx
+ movw %dx,-2(%rdi)
+ subb $2,%cl
+ jz 2000f
+ leaq -2(%rsi),%rsi
+ leaq -2(%rdi),%rdi
+2001:
+ cmpb $1,%cl
+ jl 2000f
+ movb -1(%rsi),%dl
+ movb %dl,-1(%rdi)
+2000:
+ \end
+ ret
+ ALIGN_TEXT
+2256:
decq %rdi
decq %rsi
std
- andq $7,%rcx /* any fractional bytes? */
+.if \erms == 1
+ rep
+ movsb
+.else
+ andq $7,%rcx /* any fractional bytes? */
je 3f
rep
movsb
3:
- movq %rdx,%rcx /* copy remainder by 32-bit words */
+ movq %rdx,%rcx /* copy remainder by 32-bit words */
shrq $3,%rcx
subq $7,%rsi
subq $7,%rdi
rep
movsq
+.endif
cld
- POP_FRAME_POINTER
+ \end
ret
-END(memmove_std)
+.endif
+.endm
-ENTRY(memmove_erms)
+.macro MEMMOVE_BEGIN
PUSH_FRAME_POINTER
movq %rdi,%rax
movq %rdx,%rcx
+.endm
- movq %rdi,%r8
- subq %rsi,%r8
- cmpq %rcx,%r8 /* overlapping && src < dst? */
- jb 1f
-
- rep
- movsb
+.macro MEMMOVE_END
POP_FRAME_POINTER
- ret
+.endm
-1:
- addq %rcx,%rdi /* copy backwards */
- addq %rcx,%rsi
- decq %rdi
- decq %rsi
- std
- rep
- movsb
- cld
- POP_FRAME_POINTER
- ret
+ENTRY(memmove_std)
+ MEMMOVE erms=0 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
+END(memmove_std)
+
+ENTRY(memmove_erms)
+ MEMMOVE erms=1 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
END(memmove_erms)
/*
@@ -285,35 +439,11 @@ END(memmove_erms)
* Note: memcpy does not support overlapping copies
*/
ENTRY(memcpy_std)
- PUSH_FRAME_POINTER
- movq %rdi,%rax
- movq %rdx,%rcx
- cmpq $15,%rcx
- jbe 1f
- shrq $3,%rcx /* copy by 64-bit words */
- rep
- movsq
- movq %rdx,%rcx
- andq $7,%rcx /* any bytes left? */
- jne 1f
- POP_FRAME_POINTER
- ret
- ALIGN_TEXT
-1:
- rep
- movsb
- POP_FRAME_POINTER
- ret
+ MEMMOVE erms=0 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
END(memcpy_std)
ENTRY(memcpy_erms)
- PUSH_FRAME_POINTER
- movq %rdi,%rax
- movq %rdx,%rcx
- rep
- movsb
- POP_FRAME_POINTER
- ret
+ MEMMOVE erms=1 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
END(memcpy_erms)
/*
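
The only non-obvious test in the new code is the overlap check at the top of
the macro, emitted only when overlap=1 (the memmove variants): subtracting the
source from the destination and doing a single unsigned compare against the
length decides whether the copy has to run backwards. A small C sketch of that
predicate (the function name must_copy_backwards is illustrative, not from the
commit):

#include <stddef.h>
#include <stdint.h>

/*
 * Sketch of the "overlapping && src < dst?" test:
 *	movq	%rdi,%r8
 *	subq	%rsi,%r8
 *	cmpq	%rcx,%r8
 *	jb	2f
 * A forward copy is unsafe only when the destination sits ahead of the
 * source by less than the copy length; every other case, including
 * overlaps with src > dst, can be copied forward.
 */
static int
must_copy_backwards(const void *dst, const void *src, size_t len)
{
	/* Unsigned subtraction wraps to a huge value when src > dst. */
	uintptr_t delta = (uintptr_t)dst - (uintptr_t)src;

	return (delta < len);
}

memcpy_std and memcpy_erms instantiate MEMMOVE with overlap=0, so this check
and the whole backwards path are not assembled for them, matching the note in
the source that memcpy does not support overlapping copies.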