Shorter releng/12.0/lib/msun/i387/s_remquo.S, releng/12.0/lib/msun/amd64/s_remquo.S, ...
Stefan Kanthak
stefan.kanthak at nexgo.de
Sun Sep 8 05:55:51 UTC 2019
Hi,
here's a patch to shave 4 instructions (and about 25% code size)
from each of
http://sources.freebsd.org/releng/12.0/lib/msun/i387/s_remquo.S
http://sources.freebsd.org/releng/12.0/lib/msun/i387/s_remquof.S
http://sources.freebsd.org/releng/12.0/lib/msun/i387/s_remquol.S
http://sources.freebsd.org/releng/12.0/lib/msun/amd64/s_remquo.S
http://sources.freebsd.org/releng/12.0/lib/msun/amd64/s_remquof.S
http://sources.freebsd.org/releng/12.0/lib/msun/amd64/s_remquol.S
The negation in particular is rather clumsy:
1. the two arithmetic shifts by 16 that propagate the sign bit to
   all 32 bits can be replaced with a single shift by 31, or with
   CLTD (alias CDQ), which is 2 bytes shorter;
2. the conversion of -1 to +1 via AND plus the following ADD can be
   replaced by subtracting the -1 directly; see the C sketch below.
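For reference, a minimal C model of the old and new negation
sequences (the function and variable names are mine, for
illustration only; it assumes >> on int32_t is an arithmetic
shift, as it is on x86):

    #include <assert.h>
    #include <stdint.h>

    /* q: the 3-bit quotient; sign: any value whose bit 31 is
       sign(x)^sign(y). */
    static int32_t negate_old(int32_t q, int32_t sign)
    {
        int32_t m = sign >> 31;     /* the two SARs by 16 */
        return (q ^ m) + (m & 1);   /* XOR, convert -1 to +1, ADD */
    }

    static int32_t negate_new(int32_t q, int32_t sign)
    {
        int32_t m = sign >> 31;     /* CLTD: sign-extend EAX into EDX */
        return (q ^ m) - m;         /* XOR, then SUBtract the -1 directly */
    }

    int main(void)
    {
        for (int32_t q = 0; q < 8; q++) {
            assert(negate_old(q, +1) == q  && negate_new(q, +1) == q);
            assert(negate_old(q, -1) == -q && negate_new(q, -1) == -q);
        }
        return 0;
    }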
The minor differences between the code for the float, double and
long double functions, as well as between the i387 and amd64
implementations, are intentional; pick the variant you like best.
I prefer and recommend the variant with 3 ADC and 2 SHL instructions
used for the i387 double-precision function
http://sources.freebsd.org/releng/12.0/lib/msun/i387/s_remquo.S,
which comes first.
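To see what that sequence computes: the first ADC captures the
carry flag, which at that point holds C0 (= Q2); SHLL $18 then
shifts C3 (bit 14, = Q1) out into the carry, and SHLL $5 does the
same for C1 (originally bit 9, = Q0). Below is a small C harness
(my own transliteration, not part of the patch) that checks the
original eight-instruction shuffle against this reading for every
16-bit status word; like the original code it assumes the SF and
ES bits (6 and 7) are clear:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t ror32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }
    static uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }

    /* The original eight-instruction shuffle, transliterated from s_remquo.S. */
    static uint32_t quotient_bits_old(uint32_t sw)
    {
        uint32_t eax = sw >> 6, ecx = eax;
        eax &= 0x108;
        eax = ror32(eax, 7);
        ecx |= eax;
        eax = rol32(eax, 4);
        eax |= ecx;
        return eax & 7;
    }

    /* What the ADC chain accumulates: Q2 = C0 (bit 8), Q1 = C3 (bit 14),
       Q0 = C1 (bit 9) of the x87 status word. */
    static uint32_t quotient_bits_new(uint32_t sw)
    {
        return ((sw >> 8 & 1) << 2) | ((sw >> 14 & 1) << 1) | (sw >> 9 & 1);
    }

    int main(void)
    {
        for (uint32_t sw = 0; sw < 0x10000; sw++) {
            if (sw & 0xc0)  /* SF/ES set: the original code assumes these clear */
                continue;
            assert(quotient_bits_old(sw) == quotient_bits_new(sw));
        }
        puts("old and new quotient-bit extraction agree");
        return 0;
    }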
stay tuned
Stefan Kanthak
PS: if you ever need to run these functions on a CPU without a
barrel shifter, replace the first SHL or ROR with BT $14,%eax and
the second SHL or ROL with BT $9,%eax ... and hope that BT doesn't
use a slow shift under the hood.
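In C terms, the BT variant just tests fixed bit positions of the
unshifted status word instead of walking the bits into the carry
flag; a quick sketch (again my own, with names of my choosing):

    #include <assert.h>
    #include <stdint.h>

    /* BT-style extraction: %eax is never shifted; BT $14 and BT $9 copy C3
       resp. C1 straight into the carry flag, and the ADCs accumulate as before. */
    static uint32_t quotient_bits_bt(uint32_t sw)
    {
        uint32_t q = sw >> 8 & 1;      /* first ADC: CF already holds C0 (Q2) */
        q = q * 2 + (sw >> 14 & 1);    /* BT $14,%eax; ADCL %ecx,%ecx -> C3 (Q1) */
        q = q * 2 + (sw >> 9 & 1);     /* BT $9,%eax;  ADCL %ecx,%ecx -> C1 (Q0) */
        return q;
    }

    int main(void)
    {
        assert(quotient_bits_bt(1u << 8)  == 4);  /* C0 alone: Q2 set */
        assert(quotient_bits_bt(1u << 14) == 2);  /* C3 alone: Q1 set */
        assert(quotient_bits_bt(1u << 9)  == 1);  /* C1 alone: Q0 set */
        return 0;
    }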
--- -/releng/12.0/lib/msun/i387/s_remquo.S
+++ +/releng/12.0/lib/msun/i387/s_remquo.S
@@ -34,1 +34,2 @@
ENTRY(remquo)
+ xorl %ecx,%ecx
@@ -42,22 +43,17 @@
/* Extract the three low-order bits of the quotient from C0,C3,C1. */
- shrl $6,%eax
- movl %eax,%ecx
- andl $0x108,%eax
- rorl $7,%eax
- orl %eax,%ecx
- roll $4,%eax
- orl %ecx,%eax
- andl $7,%eax
+ adcl %ecx,%ecx
+ shll $18,%eax
+ adcl %ecx,%ecx
+ shll $5,%eax
+ adcl %ecx,%ecx
/* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
- movl 16(%esp),%ecx
- xorl 8(%esp),%ecx
- sarl $16,%ecx
- sarl $16,%ecx
- xorl %ecx,%eax
- andl $1,%ecx
- addl %ecx,%eax
+ movl 16(%esp),%eax
+ xorl 8(%esp),%eax
+ cltd
+ xorl %edx,%ecx
+ subl %edx,%ecx
/* Store the quotient and return. */
- movl 20(%esp),%ecx
- movl %eax,(%ecx)
+ movl 20(%esp),%eax
+ movl %ecx,(%eax)
ret
END(remquo)
--- -/releng/12.0/lib/msun/i387/s_remquof.S
+++ +/releng/12.0/lib/msun/i387/s_remquof.S
@@ -42,22 +42,18 @@
/* Extract the three low-order bits of the quotient from C0,C3,C1. */
- shrl $6,%eax
- movl %eax,%ecx
- andl $0x108,%eax
- rorl $7,%eax
- orl %eax,%ecx
- roll $4,%eax
- orl %ecx,%eax
- andl $7,%eax
+ sbbl %ecx,%ecx
+ negl %ecx
+ shll $18,%eax
+ adcl %ecx,%ecx
+ shll $5,%eax
+ adcl %ecx,%ecx
/* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
- movl 8(%esp),%ecx
- xorl 4(%esp),%ecx
- sarl $16,%ecx
- sarl $16,%ecx
- xorl %ecx,%eax
- andl $1,%ecx
- addl %ecx,%eax
+ movl 8(%esp),%eax
+ xorl 4(%esp),%eax
+ cltd
+ xorl %edx,%ecx
+ subl %edx,%ecx
/* Store the quotient and return. */
- movl 12(%esp),%ecx
- movl %eax,(%ecx)
+ movl 12(%esp),%eax
+ movl %ecx,(%eax)
ret
END(remquof)
--- -/releng/12.0/lib/msun/i387/s_remquol.S
+++ +/releng/12.0/lib/msun/i387/s_remquol.S
@@ -42,22 +42,19 @@
/* Extract the three low-order bits of the quotient from C0,C3,C1. */
- shrl $6,%eax
- movl %eax,%ecx
- andl $0x108,%eax
- rorl $7,%eax
- orl %eax,%ecx
- roll $4,%eax
- orl %ecx,%eax
- andl $7,%eax
+ setc %cl
+ movzbl %cl,%ecx
+ shll $18,%eax
+ adcl %ecx,%ecx
+ shll $5,%eax
+ adcl %ecx,%ecx
/* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
- movl 24(%esp),%ecx
- xorl 12(%esp),%ecx
- movsx %cx,%ecx
- sarl $16,%ecx
- sarl $16,%ecx
- xorl %ecx,%eax
- andl $1,%ecx
- addl %ecx,%eax
+ movl 24(%esp),%eax
+ xorl 12(%esp),%eax
+ cwtl
+ cltd
+ xorl %edx,%ecx
+ subl %edx,%ecx
/* Store the quotient and return. */
- movl 28(%esp),%ecx
- movl %eax,(%ecx)
+ movl 28(%esp),%eax
+ movl %ecx,(%eax)
ret
+END(remquol)
--- -/releng/12.0/lib/msun/amd64/s_remquo.S
+++ +/releng/12.0/lib/msun/amd64/s_remquo.S
@@ -34,1 +34,2 @@
ENTRY(remquo)
+ xorl %ecx,%ecx
@@ -44,19 +45,14 @@
/* Extract the three low-order bits of the quotient from C0,C3,C1. */
- shrl $6,%eax
- movl %eax,%ecx
- andl $0x108,%eax
- rorl $7,%eax
- orl %eax,%ecx
- roll $4,%eax
- orl %ecx,%eax
- andl $7,%eax
+ adcl %ecx,%ecx
+ rorl $15,%eax
+ adcl %ecx,%ecx
+ roll $6,%eax
+ adcl %ecx,%ecx
/* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
- movl -12(%rsp),%ecx
- xorl -4(%rsp),%ecx
- sarl $16,%ecx
- sarl $16,%ecx
- xorl %ecx,%eax
- andl $1,%ecx
- addl %ecx,%eax
+ movl -12(%rsp),%eax
+ xorl -4(%rsp),%eax
+ cltd
+ xorl %edx,%ecx
+ subl %edx,%ecx
/* Store the quotient and return. */
- movl %eax,(%rdi)
+ movl %ecx,(%rdi)
--- -/releng/12.0/lib/msun/amd64/s_remquof.S
+++ +/releng/12.0/lib/msun/amd64/s_remquof.S
@@ -44,19 +44,15 @@
/* Extract the three low-order bits of the quotient from C0,C3,C1. */
- shrl $6,%eax
- movl %eax,%ecx
- andl $0x108,%eax
- rorl $7,%eax
- orl %eax,%ecx
- roll $4,%eax
- orl %ecx,%eax
- andl $7,%eax
+ sbbl %ecx,%ecx
+ negl %ecx
+ rorl $15,%eax
+ adcl %ecx,%ecx
+ roll $6,%eax
+ adcl %ecx,%ecx
/* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
- movl -8(%rsp),%ecx
- xorl -4(%rsp),%ecx
- sarl $16,%ecx
- sarl $16,%ecx
- xorl %ecx,%eax
- andl $1,%ecx
- addl %ecx,%eax
+ movl -8(%rsp),%eax
+ xorl -4(%rsp),%eax
+ cltd
+ xorl %edx,%ecx
+ subl %edx,%ecx
/* Store the quotient and return. */
- movl %eax,(%rdi)
+ movl %ecx,(%rdi)
--- -/releng/12.0/lib/msun/amd64/s_remquol.S
+++ +/releng/12.0/lib/msun/amd64/s_remquol.S
@@ -42,21 +42,18 @@
/* Extract the three low-order bits of the quotient from C0,C3,C1. */
- shrl $6,%eax
- movl %eax,%ecx
- andl $0x108,%eax
- rorl $7,%eax
- orl %eax,%ecx
- roll $4,%eax
- orl %ecx,%eax
- andl $7,%eax
+ setc %cl
+ movzbl %cl,%ecx
+ rorl $15,%eax
+ adcl %ecx,%ecx
+ roll $6,%eax
+ adcl %ecx,%ecx
/* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
- movl 32(%rsp),%ecx
- xorl 16(%rsp),%ecx
- movsx %cx,%ecx
- sarl $16,%ecx
- sarl $16,%ecx
- xorl %ecx,%eax
- andl $1,%ecx
- addl %ecx,%eax
+ movl 32(%rsp),%eax
+ xorl 16(%rsp),%eax
+ cwtl
+ cltd
+ xorl %edx,%ecx
+ subl %edx,%ecx
/* Store the quotient and return. */
- movl %eax,(%rdi)
+ movl %ecx,(%rdi)
ret
+END(remquol)