Implementation for coshl.
Steve Kargl
sgk at troutmask.apl.washington.edu
Mon Jun 10 00:36:51 UTC 2013
I suspect that there will be some nits with the implementation.
Anyway, testing gives
Arch | Interval | #calls | Time (s) | Max ULP | Compiler | Value
-----------+---------------------+--------+----------+---------+----------+-------
i386 [1] | [ 0.00: 0.35] | 100M | 15.0198 | 0.58583 | gcc | 1
i386 [1] | [ 0.35: 24.00] | 100M | 15.1858 | 1.01504 | gcc | 2
i386 [1] | [ 24.00:11356.52] | 100M | 12.9591 | 0.51198 | gcc | 3
i386 [1] | [11356.52:11357.22] | 100M | 13.3328 | 1.90988 | gcc | 4
-----------+---------------------+--------+----------+---------+----------+-------
amd64 [2]| [ 0.00: 0.35] | 100M | 11.7811 | 0.63075 | clang | 5
amd64 [2]| [ 0.35: 24.00] | 100M | 11.0662 | 1.01535 | clang | 6
amd64 [2]| [ 24.00:11356.52] | 100M | 9.97704 | 0.50852 | clang | 7
amd64 [2]| [11356.52:11357.22] | 100M | 10.8221 | 1.90931 | clang | 8
-----------+---------------------+--------+----------+---------+----------+-------
amd64 [2]| [ 0.00: 0.35] | 100M | 10.5930 | 0.63075 | gcc | 9
amd64 [2]| [ 0.35: 24.00] | 100M | 10.0606 | 1.01535 | gcc | 10
amd64 [2]| [ 24.00:11356.52] | 100M | 8.76182 | 0.50852 | gcc | 11
amd64 [2]| [11356.52:11357.22] | 100M | 9.23632 | 1.90931 | gcc | 12
-----------+---------------------+--------+----------+---------+----------+-------
sparc64 [3]| [ 0.00: 0.35] | 1M | 62.7154 | 0.57638 | gcc | 13
sparc64 [3]| [ 0.35: 41.00] | 1M | 43.2052 | 1.00709 | gcc | 14
sparc64 [3]| [ 41.00:11356.52] | 1M | 40.5336 | 0.50587 | gcc | 15
sparc64 [3]| [11356.52:11357.22] | 1M | 44.7029 | 1.90400 | gcc | 16
1 3.3932642386627052e-01, 0x1.5b7862d4b272b9c6p-2
2 1.2802453843180306e+00, 0x1.47be2958803a846cp+0
3 2.4087260438954509e+01, 0x1.81656b33b8b51fcp+4
4 1.1357216248489914e+04, 0x1.62e9bae07cffe94ap+13
5 3.4657359027997265e-01, 0x1.62e42fefa39ef35ap-2
6 1.2651166512735005e+00, 0x1.43deaf52d83fa6c4p+0
7 2.4019265291717229e+01, 0x1.804ee91f5df97166p+4
8 1.1357215893598522e+04, 0x1.62e9ba266c488adcp+13
9 3.4657359027997265e-01, 0x1.62e42fefa39ef35ap-2
10 1.2651166512735005e+00, 0x1.43deaf52d83fa6c4p+0
11 2.4019265291717229e+01, 0x1.804ee91f5df97166p+4
12 1.1357215893598522e+04, 0x1.62e9ba266c488adcp+13
13 3.35979864752159202758685843670565218e-01, 0x1.580b1b0ce66d750dad9e5773bc5bp-2
14 1.28794527359513292307645171506249973e+00, 0x1.49b6c80d20fd82749b4ffd5d2b24p+0
15 1.03131066340595824072037883331972584e+04, 0x1.4248da62f5345e862cf6d17602dcp+13
16 1.13572063870748402222198225060228912e+04, 0x1.62e9a6ae44460bffbacfe7a589c2p+13
[1] Intel(R) Core(TM)2 Duo CPU T7250 at 2.00GHz (1995.04-MHz 686-class CPU)
[2] AMD Opteron(tm) Processor 248 (2191.60-MHz K8-class CPU)
[3] Sun Microsystems UltraSparc-IIe Processor (648.00 MHz CPU)
Two observations:
1. The max ulp for the intervals [0.35:24] and [0.35:41] of 1.xxx is
due to the division in the expression half*exp(x) + half/exp(x).
Bruce and I exchanged emails a long time ago about possible ways
to reduce the ulp in this range by either computer exp(x) with
extra precision or using a table with cosh(x) = cosh(x_i) * cosh(d)
+ sinh(x_i) * sinh(d) with d = x - x_i. I tried the latter with
disappointing results. The former would require a refactoring of
s_expl.c into a kernel __kernel_expl(x, hi, lo). I have no plans on
pursuing this at the this time.
2. gcc produces faster code than clang on amd64.
Comments welcomed.
--
Steve
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunSoft, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*
* Converted to long double by Steven G. Kargl
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* See comments in src/e_cosh.c for a description of the algorithm.
*/
#include <float.h>
#ifdef __i386__
#include <ieeefp.h>
#endif
#include "fpmath.h"
#include "math.h"
#include "math_private.h"
#if LDBL_MANT_DIG == 64
static const union IEEEl2bits
#define EXP_TINY -32
#define s_threshold 24
/* log(2) / 2 */
log2o2u = LD80C(0xb17217f7d1cf79ac, -2, 0.346573590279972654714L),
#define log2o2 (log2o2u.e)
/* x = log(LDBL_MAX - 0.5) */
o_threshold1u = LD80C(0xb17217f7d1cf79ac, 13, 11356.5234062941439497L),
#define o_threshold1 (o_threshold1u.e)
/* log(LDBL_MAX - 0.5) + log(2) */
o_threshold2u = LD80C(0xb174ddc031aec0ea, 13, 11357.2165534747038951L);
#define o_threshold2 (o_threshold2u.e)
#elif LDBL_MANT_DIG == 113
#define EXP_TINY -56
#define s_threshold 41
static long double
log2o2 = 0.346573590279972654708616060729088288L,
o_threshold1 = 11356.5234062941439494919310779707650L,
o_threshold2 = 11357.2165534747038948013483100922230L;
#else
#error "Unsupported long double format"
#endif
static const long double huge = 0x1.p10000L;
static const double half = 0.5;
#define BIAS (LDBL_MAX_EXP - 1)
long double
coshl(long double x)
{
long double t, w;
uint16_t hx, ix;
ENTERI();
GET_LDBL_EXPSIGN(hx, x);
ix = hx & 0x7fff;
SET_LDBL_EXPSIGN(x, ix);
/* x is +-Inf or NaN. */
if (ix == BIAS + LDBL_MAX_EXP)
RETURNI(x * x);
if (x < log2o2) {
if (ix < BIAS + EXP_TINY) { /* |x| < 0x1pEXP_TINY */
/* cosh(x) = 1 exactly iff x = +-0. */
if ((int)x == 0)
RETURNI(1.0L);
}
t = expm1l(x);
w = 1 + t;
RETURNI(1 + t * t / (w + w));
}
if (x < s_threshold) {
t = expl(x);
RETURNI(half * t + half / t);
}
if (x < o_threshold1)
RETURNI(half * expl(x));
if (x < o_threshold2) {
t = expl(half * x);
RETURNI(half * t * t);
}
RETURNI(huge * huge);
}
More information about the freebsd-numerics
mailing list