Implementation for coshl.

Mon Jun 10 00:36:51 UTC 2013

I suspect that there will be some nits with the implementation.
Anyway, testing gives

    Arch   |     Interval        | #calls | Time (s) | Max ULP | Compiler | Value
-----------+---------------------+--------+----------+---------+----------+-------
  i386 [1] | [    0.00:    0.35] |  100M  | 15.0198  | 0.58583 |  gcc     |  1
  i386 [1] | [    0.35:   24.00] |  100M  | 15.1858  | 1.01504 |  gcc     |  2
  i386 [1] | [   24.00:11356.52] |  100M  | 12.9591  | 0.51198 |  gcc     |  3
  i386 [1] | [11356.52:11357.22] |  100M  | 13.3328  | 1.90988 |  gcc     |  4
-----------+---------------------+--------+----------+---------+----------+-------
  amd64 [2]| [    0.00:    0.35] |  100M  | 11.7811  | 0.63075 |  clang   |  5
  amd64 [2]| [    0.35:   24.00] |  100M  | 11.0662  | 1.01535 |  clang   |  6
  amd64 [2]| [   24.00:11356.52] |  100M  | 9.97704  | 0.50852 |  clang   |  7
  amd64 [2]| [11356.52:11357.22] |  100M  | 10.8221  | 1.90931 |  clang   |  8
-----------+---------------------+--------+----------+---------+----------+-------
  amd64 [2]| [    0.00:    0.35] |  100M  | 10.5930  | 0.63075 |  gcc     |  9
  amd64 [2]| [    0.35:   24.00] |  100M  | 10.0606  | 1.01535 |  gcc     | 10
  amd64 [2]| [   24.00:11356.52] |  100M  | 8.76182  | 0.50852 |  gcc     | 11
  amd64 [2]| [11356.52:11357.22] |  100M  | 9.23632  | 1.90931 |  gcc     | 12
-----------+---------------------+--------+----------+---------+----------+-------
sparc64 [3]| [    0.00:    0.35] |    1M  | 62.7154  | 0.57638 |  gcc     | 13
sparc64 [3]| [    0.35:   41.00] |    1M  | 43.2052  | 1.00709 |  gcc     | 14
sparc64 [3]| [   41.00:11356.52] |    1M  | 40.5336  | 0.50587 |  gcc     | 15
sparc64 [3]| [11356.52:11357.22] |    1M  | 44.7029  | 1.90400 |  gcc     | 16

 1  3.3932642386627052e-01, 0x1.5b7862d4b272b9c6p-2
 2  1.2802453843180306e+00, 0x1.47be2958803a846cp+0
 3  2.4087260438954509e+01, 0x1.81656b33b8b51fcp+4
 4  1.1357216248489914e+04, 0x1.62e9bae07cffe94ap+13
 5  3.4657359027997265e-01, 0x1.62e42fefa39ef35ap-2
 6  1.2651166512735005e+00, 0x1.43deaf52d83fa6c4p+0
 7  2.4019265291717229e+01, 0x1.804ee91f5df97166p+4
 8  1.1357215893598522e+04, 0x1.62e9ba266c488adcp+13
 9  3.4657359027997265e-01, 0x1.62e42fefa39ef35ap-2
10  1.2651166512735005e+00, 0x1.43deaf52d83fa6c4p+0
11  2.4019265291717229e+01, 0x1.804ee91f5df97166p+4
12  1.1357215893598522e+04, 0x1.62e9ba266c488adcp+13
13  3.35979864752159202758685843670565218e-01, 0x1.580b1b0ce66d750dad9e5773bc5bp-2
14  1.28794527359513292307645171506249973e+00, 0x1.49b6c80d20fd82749b4ffd5d2b24p+0
15  1.03131066340595824072037883331972584e+04, 0x1.4248da62f5345e862cf6d17602dcp+13
16  1.13572063870748402222198225060228912e+04, 0x1.62e9a6ae44460bffbacfe7a589c2p+13

[1] Intel(R) Core(TM)2 Duo CPU T7250 at 2.00GHz (1995.04-MHz 686-class CPU)
[2] AMD Opteron(tm) Processor 248 (2191.60-MHz K8-class CPU)
[3] Sun Microsystems UltraSparc-IIe Processor (648.00 MHz CPU)

Two observations:

1. The max ulp for the intervals [0.35:24] and [0.35:41] of 1.xxx is
due to the division in the expression half*exp(x) + half/exp(x).
Bruce and I exchanged emails a long time ago about possible ways
to reduce the ulp in this range by either computer exp(x) with
extra precision or using a table with cosh(x) = cosh(x_i) * cosh(d)
+ sinh(x_i) * sinh(d) with d = x - x_i.  I tried the latter with
disappointing results.  The former would require a refactoring of
s_expl.c into a kernel __kernel_expl(x, hi, lo).  I have no plans on
pursuing this at the this time.

2. gcc produces faster code than clang on amd64.

Comments welcomed.

-- 
Steve

/*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunSoft, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice 
 * is preserved.
 * ====================================================
 *
 * Converted to long double by Steven G. Kargl
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * See comments in src/e_cosh.c for a description of the algorithm. 
 */
#include <float.h>
#ifdef __i386__
#include <ieeefp.h>
#endif
#include "fpmath.h"
#include "math.h"
#include "math_private.h"

#if LDBL_MANT_DIG == 64
static const union IEEEl2bits
#define	EXP_TINY	-32
#define	s_threshold	24
/* log(2) / 2 */
log2o2u = LD80C(0xb17217f7d1cf79ac, -2, 0.346573590279972654714L),
#define	log2o2	(log2o2u.e)
/* x = log(LDBL_MAX - 0.5) */
o_threshold1u = LD80C(0xb17217f7d1cf79ac, 13, 11356.5234062941439497L),
#define	o_threshold1	(o_threshold1u.e)
/* log(LDBL_MAX - 0.5) + log(2) */
o_threshold2u = LD80C(0xb174ddc031aec0ea, 13, 11357.2165534747038951L);
#define	o_threshold2	(o_threshold2u.e)
#elif LDBL_MANT_DIG == 113
#define	EXP_TINY	-56
#define	s_threshold	41
static long double
log2o2 = 0.346573590279972654708616060729088288L,
o_threshold1 = 11356.5234062941439494919310779707650L,
o_threshold2 = 11357.2165534747038948013483100922230L;
#else
#error "Unsupported long double format"
#endif

static const long double huge = 0x1.p10000L;
static const double half = 0.5;

#define BIAS	(LDBL_MAX_EXP - 1)

long double
coshl(long double x)
{
	long double t, w;
	uint16_t hx, ix;

	ENTERI();

	GET_LDBL_EXPSIGN(hx, x);
	ix = hx & 0x7fff;
	SET_LDBL_EXPSIGN(x, ix);

	/* x is +-Inf or NaN. */
	if (ix == BIAS + LDBL_MAX_EXP)
		RETURNI(x * x);

	if (x < log2o2) {
		if (ix < BIAS + EXP_TINY) {	/* |x| < 0x1pEXP_TINY */
			/* cosh(x) = 1 exactly iff x = +-0. */
			if ((int)x == 0)
				RETURNI(1.0L);
		}
		t = expm1l(x);
		w = 1 + t;
		RETURNI(1 + t * t / (w + w));
	}

	if (x < s_threshold) {
		t = expl(x);
		RETURNI(half * t + half / t);
	}

	if (x < o_threshold1)
		RETURNI(half * expl(x));

	if (x < o_threshold2) {
		t = expl(half * x);
		RETURNI(half * t * t);
	}

	RETURNI(huge * huge);
}