Shorter releng/12.0/lib/msun/i387/s_remquo.S, releng/12.0/lib/msun/amd64/s_remquo.S, ...
Stefan Kanthak
stefan.kanthak at nexgo.de
Sun Sep 8 05:55:51 UTC 2019
Hi,
here's a patch to shave 4 instructions (and about 25% code size)
from each of
http://sources.freebsd.org/releng/12.0/lib/msun/i387/s_remquo.S
http://sources.freebsd.org/releng/12.0/lib/msun/i387/s_remquof.S
http://sources.freebsd.org/releng/12.0/lib/msun/i387/s_remquol.S
http://sources.freebsd.org/releng/12.0/lib/msun/amd64/s_remquo.S
http://sources.freebsd.org/releng/12.0/lib/msun/amd64/s_remquof.S
http://sources.freebsd.org/releng/12.0/lib/msun/amd64/s_remquol.S
The negation in particular is rather clumsy:
1. the two arithmetic shifts by 16 that propagate the sign bit to
   all 32 bits can be replaced with a single shift by 31, or with
   CLTD (alias CDQ), which is 2 bytes shorter;
2. the conversion of -1 to +1 via AND plus the following ADD can be
   replaced by subtracting the -1 directly; see the C sketch below.
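For reference, a minimal C model of the old and new negation
sequences (the function and variable names are mine, for
illustration only; it assumes >> on int32_t is an arithmetic
shift, as it is on x86):

    #include <assert.h>
    #include <stdint.h>

    /* q: the 3-bit quotient; sign: any value whose bit 31 is
       sign(x)^sign(y). */
    static int32_t negate_old(int32_t q, int32_t sign)
    {
        int32_t m = sign >> 31;     /* the two SARs by 16 */
        return (q ^ m) + (m & 1);   /* XOR, convert -1 to +1, ADD */
    }

    static int32_t negate_new(int32_t q, int32_t sign)
    {
        int32_t m = sign >> 31;     /* CLTD: sign-extend EAX into EDX */
        return (q ^ m) - m;         /* XOR, then SUBtract the -1 directly */
    }

    int main(void)
    {
        for (int32_t q = 0; q < 8; q++) {
            assert(negate_old(q, +1) == q  && negate_new(q, +1) == q);
            assert(negate_old(q, -1) == -q && negate_new(q, -1) == -q);
        }
        return 0;
    }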
The minor differences between the code for the float, double and
long double functions, as well as between the i387 and amd64
implementations, are intentional; pick the variant you like best.
I prefer and recommend the variant with 3 ADC and 2 SHL instructions
used for the i387 double-precision function
http://sources.freebsd.org/releng/12.0/lib/msun/i387/s_remquo.S,
which comes first.
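To see what that sequence computes: the first ADC captures the
carry flag, which at that point holds C0 (= Q2); SHLL $18 then
shifts C3 (bit 14, = Q1) out into the carry, and SHLL $5 does the
same for C1 (originally bit 9, = Q0). Below is a small C harness
(my own transliteration, not part of the patch) that checks the
original eight-instruction shuffle against this reading for every
16-bit status word; like the original code it assumes the SF and
ES bits (6 and 7) are clear:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t ror32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }
    static uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }

    /* The original eight-instruction shuffle, transliterated from s_remquo.S. */
    static uint32_t quotient_bits_old(uint32_t sw)
    {
        uint32_t eax = sw >> 6, ecx = eax;
        eax &= 0x108;
        eax = ror32(eax, 7);
        ecx |= eax;
        eax = rol32(eax, 4);
        eax |= ecx;
        return eax & 7;
    }

    /* What the ADC chain accumulates: Q2 = C0 (bit 8), Q1 = C3 (bit 14),
       Q0 = C1 (bit 9) of the x87 status word. */
    static uint32_t quotient_bits_new(uint32_t sw)
    {
        return ((sw >> 8 & 1) << 2) | ((sw >> 14 & 1) << 1) | (sw >> 9 & 1);
    }

    int main(void)
    {
        for (uint32_t sw = 0; sw < 0x10000; sw++) {
            if (sw & 0xc0)  /* SF/ES set: the original code assumes these clear */
                continue;
            assert(quotient_bits_old(sw) == quotient_bits_new(sw));
        }
        puts("old and new quotient-bit extraction agree");
        return 0;
    }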
stay tuned
Stefan Kanthak
PS: if you ever need to run these functions on a CPU without a
barrel shifter, replace the first SHL or ROR with BT $14,%eax and
the second SHL or ROL with BT $9,%eax ... and hope that BT doesn't
use a slow shift under the hood.
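In C terms, the BT variant just tests fixed bit positions of the
unshifted status word instead of walking the bits into the carry
flag; a quick sketch (again my own, with names of my choosing):

    #include <assert.h>
    #include <stdint.h>

    /* BT-style extraction: %eax is never shifted; BT $14 and BT $9 copy C3
       resp. C1 straight into the carry flag, and the ADCs accumulate as before. */
    static uint32_t quotient_bits_bt(uint32_t sw)
    {
        uint32_t q = sw >> 8 & 1;      /* first ADC: CF already holds C0 (Q2) */
        q = q * 2 + (sw >> 14 & 1);    /* BT $14,%eax; ADCL %ecx,%ecx -> C3 (Q1) */
        q = q * 2 + (sw >> 9 & 1);     /* BT $9,%eax;  ADCL %ecx,%ecx -> C1 (Q0) */
        return q;
    }

    int main(void)
    {
        assert(quotient_bits_bt(1u << 8)  == 4);  /* C0 alone: Q2 set */
        assert(quotient_bits_bt(1u << 14) == 2);  /* C3 alone: Q1 set */
        assert(quotient_bits_bt(1u << 9)  == 1);  /* C1 alone: Q0 set */
        return 0;
    }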
--- -/releng/12.0/lib/msun/i387/s_remquo.S
+++ +/releng/12.0/lib/msun/i387/s_remquo.S
@@ -34,1 +34,2 @@
ENTRY(remquo)
+ xorl %ecx,%ecx
@@ -42,22 +43,17 @@
/* Extract the three low-order bits of the quotient from C0,C3,C1. */
- shrl $6,%eax
- movl %eax,%ecx
- andl $0x108,%eax
- rorl $7,%eax
- orl %eax,%ecx
- roll $4,%eax
- orl %ecx,%eax
- andl $7,%eax
+ adcl %ecx,%ecx
+ shll $18,%eax
+ adcl %ecx,%ecx
+ shll $5,%eax
+ adcl %ecx,%ecx
/* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
- movl 16(%esp),%ecx
- xorl 8(%esp),%ecx
- sarl $16,%ecx
- sarl $16,%ecx
- xorl %ecx,%eax
- andl $1,%ecx
- addl %ecx,%eax
+ movl 16(%esp),%eax
+ xorl 8(%esp),%eax
+ cltd
+ xorl %edx,%ecx
+ subl %edx,%ecx
/* Store the quotient and return. */
- movl 20(%esp),%ecx
- movl %eax,(%ecx)
+ movl 20(%esp),%eax
+ movl %ecx,(%eax)
ret
END(remquo)
--- -/releng/12.0/lib/msun/i387/s_remquof.S
+++ +/releng/12.0/lib/msun/i387/s_remquof.S
@@ -42,22 +42,18 @@
/* Extract the three low-order bits of the quotient from C0,C3,C1. */
- shrl $6,%eax
- movl %eax,%ecx
- andl $0x108,%eax
- rorl $7,%eax
- orl %eax,%ecx
- roll $4,%eax
- orl %ecx,%eax
- andl $7,%eax
+ sbbl %ecx,%ecx
+ negl %ecx
+ shll $18,%eax
+ adcl %ecx,%ecx
+ shll $5,%eax
+ adcl %ecx,%ecx
/* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
- movl 8(%esp),%ecx
- xorl 4(%esp),%ecx
- sarl $16,%ecx
- sarl $16,%ecx
- xorl %ecx,%eax
- andl $1,%ecx
- addl %ecx,%eax
+ movl 8(%esp),%eax
+ xorl 4(%esp),%eax
+ cltd
+ xorl %edx,%ecx
+ subl %edx,%ecx
/* Store the quotient and return. */
- movl 12(%esp),%ecx
- movl %eax,(%ecx)
+ movl 12(%esp),%eax
+ movl %ecx,(%eax)
ret
END(remquof)
--- -/releng/12.0/lib/msun/i387/s_remquol.S
+++ +/releng/12.0/lib/msun/i387/s_remquol.S
@@ -42,22 +42,19 @@
/* Extract the three low-order bits of the quotient from C0,C3,C1. */
- shrl $6,%eax
- movl %eax,%ecx
- andl $0x108,%eax
- rorl $7,%eax
- orl %eax,%ecx
- roll $4,%eax
- orl %ecx,%eax
- andl $7,%eax
+ setc %cl
+ movzbl %cl,%ecx
+ shll $18,%eax
+ adcl %ecx,%ecx
+ shll $5,%eax
+ adcl %ecx,%ecx
/* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
- movl 24(%esp),%ecx
- xorl 12(%esp),%ecx
- movsx %cx,%ecx
- sarl $16,%ecx
- sarl $16,%ecx
- xorl %ecx,%eax
- andl $1,%ecx
- addl %ecx,%eax
+ movl 24(%esp),%eax
+ xorl 12(%esp),%eax
+ cwtl
+ cltd
+ xorl %edx,%ecx
+ subl %edx,%ecx
/* Store the quotient and return. */
- movl 28(%esp),%ecx
- movl %eax,(%ecx)
+ movl 28(%esp),%eax
+ movl %ecx,(%eax)
ret
+END(remquol)
--- -/releng/12.0/lib/msun/amd64/s_remquo.S
+++ +/releng/12.0/lib/msun/amd64/s_remquo.S
@@ -34,1 +34,2 @@
ENTRY(remquo)
+ xorl %ecx,%ecx
@@ -44,19 +45,14 @@
/* Extract the three low-order bits of the quotient from C0,C3,C1. */
- shrl $6,%eax
- movl %eax,%ecx
- andl $0x108,%eax
- rorl $7,%eax
- orl %eax,%ecx
- roll $4,%eax
- orl %ecx,%eax
- andl $7,%eax
+ adcl %ecx,%ecx
+ rorl $15,%eax
+ adcl %ecx,%ecx
+ roll $6,%eax
+ adcl %ecx,%ecx
/* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
- movl -12(%rsp),%ecx
- xorl -4(%rsp),%ecx
- sarl $16,%ecx
- sarl $16,%ecx
- xorl %ecx,%eax
- andl $1,%ecx
- addl %ecx,%eax
+ movl -12(%rsp),%eax
+ xorl -4(%rsp),%eax
+ cltd
+ xorl %edx,%ecx
+ subl %edx,%ecx
/* Store the quotient and return. */
- movl %eax,(%rdi)
+ movl %ecx,(%rdi)
--- -/releng/12.0/lib/msun/amd64/s_remquof.S
+++ +/releng/12.0/lib/msun/amd64/s_remquof.S
@@ -44,19 +44,15 @@
/* Extract the three low-order bits of the quotient from C0,C3,C1. */
- shrl $6,%eax
- movl %eax,%ecx
- andl $0x108,%eax
- rorl $7,%eax
- orl %eax,%ecx
- roll $4,%eax
- orl %ecx,%eax
- andl $7,%eax
+ sbbl %ecx,%ecx
+ negl %ecx
+ rorl $15,%eax
+ adcl %ecx,%ecx
+ roll $6,%eax
+ adcl %ecx,%ecx
/* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
- movl -8(%rsp),%ecx
- xorl -4(%rsp),%ecx
- sarl $16,%ecx
- sarl $16,%ecx
- xorl %ecx,%eax
- andl $1,%ecx
- addl %ecx,%eax
+ movl -8(%rsp),%eax
+ xorl -4(%rsp),%eax
+ cltd
+ xorl %edx,%ecx
+ subl %edx,%ecx
/* Store the quotient and return. */
- movl %eax,(%rdi)
+ movl %ecx,(%rdi)
--- -/releng/12.0/lib/msun/amd64/s_remquol.S
+++ +/releng/12.0/lib/msun/amd64/s_remquol.S
@@ -42,21 +42,18 @@
/* Extract the three low-order bits of the quotient from C0,C3,C1. */
- shrl $6,%eax
- movl %eax,%ecx
- andl $0x108,%eax
- rorl $7,%eax
- orl %eax,%ecx
- roll $4,%eax
- orl %ecx,%eax
- andl $7,%eax
+ setc %cl
+ movzbl %cl,%ecx
+ rorl $15,%eax
+ adcl %ecx,%ecx
+ roll $6,%eax
+ adcl %ecx,%ecx
/* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
- movl 32(%rsp),%ecx
- xorl 16(%rsp),%ecx
- movsx %cx,%ecx
- sarl $16,%ecx
- sarl $16,%ecx
- xorl %ecx,%eax
- andl $1,%ecx
- addl %ecx,%eax
+ movl 32(%rsp),%eax
+ xorl 16(%rsp),%eax
+ cwtl
+ cltd
+ xorl %edx,%ecx
+ subl %edx,%ecx
/* Store the quotient and return. */
- movl %eax,(%rdi)
+ movl %ecx,(%rdi)
ret
+END(remquol)