git: 35f6b83049da - stable/14 - Update the Arm Optimized Routine library to v24.01
Date: Mon, 08 Apr 2024 13:15:43 UTC
The branch stable/14 has been updated by andrew:

URL: https://cgit.FreeBSD.org/src/commit/?id=35f6b83049dabe18277ac0fcc73ede6ed7f3a1a6

commit 35f6b83049dabe18277ac0fcc73ede6ed7f3a1a6
Author:     Andrew Turner <andrew@FreeBSD.org>
AuthorDate: 2024-02-29 11:39:12 +0000
Commit:     Andrew Turner <andrew@FreeBSD.org>
CommitDate: 2024-04-08 13:15:18 +0000

    Update the Arm Optimized Routine library to v24.01

    Sponsored by: Arm Ltd

    (cherry picked from commit 5a02ffc32e777041dd2dad4e651ed2a0865a0a5d)
--- contrib/arm-optimized-routines/README | 2 +- contrib/arm-optimized-routines/config.mk.dist | 13 +- contrib/arm-optimized-routines/math/Dir.mk | 6 +- .../arm-optimized-routines/math/aarch64/v_cos.c | 87 + .../arm-optimized-routines/math/aarch64/v_cosf.c | 82 + .../arm-optimized-routines/math/aarch64/v_exp.c | 125 + .../arm-optimized-routines/math/aarch64/v_exp2f.c | 113 + .../math/aarch64/v_exp2f_1u.c | 72 + .../math/aarch64/v_exp_data.c | 146 + .../arm-optimized-routines/math/aarch64/v_expf.c | 122 + .../math/aarch64/v_expf_1u.c | 77 + .../arm-optimized-routines/math/aarch64/v_log.c | 100 + .../math/aarch64/v_log_data.c | 156 + .../arm-optimized-routines/math/aarch64/v_logf.c | 74 + .../arm-optimized-routines/math/aarch64/v_math.h | 135 + .../arm-optimized-routines/math/aarch64/v_pow.c | 22 + .../arm-optimized-routines/math/aarch64/v_powf.c | 148 + .../arm-optimized-routines/math/aarch64/v_sin.c | 97 + .../arm-optimized-routines/math/aarch64/v_sinf.c | 82 + contrib/arm-optimized-routines/math/exp10.c | 129 + contrib/arm-optimized-routines/math/exp_data.c | 23 +- .../arm-optimized-routines/math/include/mathlib.h | 67 +- contrib/arm-optimized-routines/math/math_config.h | 61 +- contrib/arm-optimized-routines/math/s_cos.c | 6 - contrib/arm-optimized-routines/math/s_cosf.c | 6 - contrib/arm-optimized-routines/math/s_exp.c | 6 - contrib/arm-optimized-routines/math/s_exp2f.c | 6 - contrib/arm-optimized-routines/math/s_exp2f_1u.c | 6 - contrib/arm-optimized-routines/math/s_expf.c | 6 - contrib/arm-optimized-routines/math/s_expf_1u.c | 6 - contrib/arm-optimized-routines/math/s_log.c | 6 - contrib/arm-optimized-routines/math/s_logf.c | 6 - contrib/arm-optimized-routines/math/s_pow.c | 6 - contrib/arm-optimized-routines/math/s_powf.c | 6 - contrib/arm-optimized-routines/math/s_sin.c | 6 - contrib/arm-optimized-routines/math/s_sinf.c | 6 - .../arm-optimized-routines/math/test/mathbench.c | 152 +- .../math/test/mathbench_funcs.h | 50 +- .../math/test/mathbench_wrappers.h | 42 +- .../arm-optimized-routines/math/test/mathtest.c | 9 +- contrib/arm-optimized-routines/math/test/runulp.sh | 112 +- .../math/test/testcases/directed/exp10.tst | 15 + contrib/arm-optimized-routines/math/test/ulp.c | 81 +- contrib/arm-optimized-routines/math/test/ulp.h | 29 +- .../arm-optimized-routines/math/test/ulp_funcs.h | 50 +- .../math/test/ulp_wrappers.h | 36 +- contrib/arm-optimized-routines/math/tgamma128.c | 356 ++ contrib/arm-optimized-routines/math/tgamma128.h | 141 + .../math/tools/tgamma128_gen.jl | 212 ++ contrib/arm-optimized-routines/math/v_cos.c | 95 - contrib/arm-optimized-routines/math/v_cosf.c | 84 - contrib/arm-optimized-routines/math/v_exp.c | 128 - contrib/arm-optimized-routines/math/v_exp.h | 14 - contrib/arm-optimized-routines/math/v_exp2f.c | 117 - contrib/arm-optimized-routines/math/v_exp2f_1u.c | 75 - contrib/arm-optimized-routines/math/v_expf.c | 122 - contrib/arm-optimized-routines/math/v_expf_1u.c | 80 - contrib/arm-optimized-routines/math/v_log.c | 104 - contrib/arm-optimized-routines/math/v_log.h | 18 - 
contrib/arm-optimized-routines/math/v_log_data.c | 158 - contrib/arm-optimized-routines/math/v_logf.c | 73 - contrib/arm-optimized-routines/math/v_math.h | 661 ---- contrib/arm-optimized-routines/math/v_pow.c | 27 - contrib/arm-optimized-routines/math/v_powf.c | 235 -- contrib/arm-optimized-routines/math/v_sin.c | 103 - contrib/arm-optimized-routines/math/v_sinf.c | 88 - contrib/arm-optimized-routines/math/vn_cos.c | 12 - contrib/arm-optimized-routines/math/vn_cosf.c | 12 - contrib/arm-optimized-routines/math/vn_exp.c | 12 - contrib/arm-optimized-routines/math/vn_exp2f.c | 12 - contrib/arm-optimized-routines/math/vn_exp2f_1u.c | 11 - contrib/arm-optimized-routines/math/vn_expf.c | 12 - contrib/arm-optimized-routines/math/vn_expf_1u.c | 11 - contrib/arm-optimized-routines/math/vn_log.c | 12 - contrib/arm-optimized-routines/math/vn_logf.c | 12 - contrib/arm-optimized-routines/math/vn_pow.c | 12 - contrib/arm-optimized-routines/math/vn_powf.c | 12 - contrib/arm-optimized-routines/math/vn_sin.c | 12 - contrib/arm-optimized-routines/math/vn_sinf.c | 12 - contrib/arm-optimized-routines/pl/math/Dir.mk | 89 +- contrib/arm-optimized-routines/pl/math/acos_2u.c | 100 + contrib/arm-optimized-routines/pl/math/acosf_1u4.c | 99 + contrib/arm-optimized-routines/pl/math/asin_3u.c | 106 + contrib/arm-optimized-routines/pl/math/asin_data.c | 19 + contrib/arm-optimized-routines/pl/math/asinf_2u5.c | 100 + .../arm-optimized-routines/pl/math/asinf_data.c | 16 + contrib/arm-optimized-routines/pl/math/asinh_2u5.c | 5 +- .../arm-optimized-routines/pl/math/asinhf_3u5.c | 6 +- .../arm-optimized-routines/pl/math/atan_common.h | 40 +- contrib/arm-optimized-routines/pl/math/atanf_2u9.c | 12 +- .../arm-optimized-routines/pl/math/atanf_common.h | 33 +- contrib/arm-optimized-routines/pl/math/atanh_3u.c | 15 +- .../arm-optimized-routines/pl/math/atanhf_3u1.c | 12 +- contrib/arm-optimized-routines/pl/math/cbrt_2u.c | 5 +- contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c | 9 +- contrib/arm-optimized-routines/pl/math/cosh_2u.c | 9 +- contrib/arm-optimized-routines/pl/math/coshf_1u9.c | 9 +- contrib/arm-optimized-routines/pl/math/cospi_3u1.c | 89 + .../arm-optimized-routines/pl/math/cospif_2u6.c | 84 + contrib/arm-optimized-routines/pl/math/erf_2u5.c | 102 + contrib/arm-optimized-routines/pl/math/erf_data.c | 788 +++++ contrib/arm-optimized-routines/pl/math/erfc_1u8.c | 153 + contrib/arm-optimized-routines/pl/math/erfc_4u5.c | 155 - contrib/arm-optimized-routines/pl/math/erfc_data.c | 3628 +++++++++++++++++++- contrib/arm-optimized-routines/pl/math/erfcf.h | 38 - contrib/arm-optimized-routines/pl/math/erfcf_1u7.c | 103 + contrib/arm-optimized-routines/pl/math/erfcf_2u.c | 133 - .../arm-optimized-routines/pl/math/erfcf_data.c | 703 +++- contrib/arm-optimized-routines/pl/math/erff_1u5.c | 108 - contrib/arm-optimized-routines/pl/math/erff_2u.c | 82 + contrib/arm-optimized-routines/pl/math/erff_data.c | 532 ++- .../arm-optimized-routines/pl/math/erfinv_24u5.c | 81 + .../arm-optimized-routines/pl/math/erfinvf_4u7.c | 74 + contrib/arm-optimized-routines/pl/math/erfinvl.c | 114 + contrib/arm-optimized-routines/pl/math/estrin.h | 16 - .../arm-optimized-routines/pl/math/estrin_wrap.h | 48 - contrib/arm-optimized-routines/pl/math/estrinf.h | 14 - contrib/arm-optimized-routines/pl/math/expf.c | 4 +- contrib/arm-optimized-routines/pl/math/expm1_2u5.c | 19 +- .../arm-optimized-routines/pl/math/expm1f_1u6.c | 11 +- .../arm-optimized-routines/pl/math/finite_pow.h | 365 ++ contrib/arm-optimized-routines/pl/math/horner.h | 14 - 
.../arm-optimized-routines/pl/math/horner_wrap.h | 34 - contrib/arm-optimized-routines/pl/math/hornerf.h | 14 - .../pl/math/include/mathlib.h | 238 +- .../pl/math/include/pl_test.h | 8 +- contrib/arm-optimized-routines/pl/math/log1p_2u.c | 17 +- .../arm-optimized-routines/pl/math/log1pf_2u1.c | 16 +- .../arm-optimized-routines/pl/math/math_config.h | 252 +- contrib/arm-optimized-routines/pl/math/math_err.c | 4 +- contrib/arm-optimized-routines/pl/math/math_errf.c | 4 +- .../pl/math/pairwise_horner.h | 14 - .../pl/math/pairwise_horner_wrap.h | 48 - .../pl/math/pairwise_hornerf.h | 14 - contrib/arm-optimized-routines/pl/math/pl_sig.h | 56 +- .../pl/math/poly_advsimd_f32.h | 24 + .../pl/math/poly_advsimd_f64.h | 24 + .../arm-optimized-routines/pl/math/poly_generic.h | 277 ++ .../pl/math/poly_scalar_f32.h | 24 + .../pl/math/poly_scalar_f64.h | 24 + .../arm-optimized-routines/pl/math/poly_sve_f32.h | 26 + .../arm-optimized-routines/pl/math/poly_sve_f64.h | 26 + .../pl/math/poly_sve_generic.h | 301 ++ .../arm-optimized-routines/pl/math/s_acosh_3u5.c | 6 - .../arm-optimized-routines/pl/math/s_acoshf_3u1.c | 6 - .../arm-optimized-routines/pl/math/s_asinh_3u5.c | 6 - .../arm-optimized-routines/pl/math/s_asinhf_2u7.c | 6 - .../arm-optimized-routines/pl/math/s_atan2_3u.c | 6 - .../arm-optimized-routines/pl/math/s_atan2f_3u.c | 6 - .../arm-optimized-routines/pl/math/s_atan_2u5.c | 6 - .../arm-optimized-routines/pl/math/s_atanf_3u.c | 6 - .../arm-optimized-routines/pl/math/s_atanh_3u5.c | 6 - .../arm-optimized-routines/pl/math/s_atanhf_3u1.c | 6 - contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c | 6 - .../arm-optimized-routines/pl/math/s_cbrtf_1u5.c | 6 - contrib/arm-optimized-routines/pl/math/s_cosh_2u.c | 6 - .../arm-optimized-routines/pl/math/s_coshf_2u4.c | 6 - contrib/arm-optimized-routines/pl/math/s_erf_2u.c | 6 - contrib/arm-optimized-routines/pl/math/s_erfc_4u.c | 6 - .../arm-optimized-routines/pl/math/s_erfcf_1u.c | 6 - .../arm-optimized-routines/pl/math/s_erff_1u5.c | 6 - .../arm-optimized-routines/pl/math/s_exp_tail.c | 6 - contrib/arm-optimized-routines/pl/math/s_expf.c | 6 - .../arm-optimized-routines/pl/math/s_expm1_2u5.c | 6 - .../arm-optimized-routines/pl/math/s_expm1f_1u6.c | 6 - .../arm-optimized-routines/pl/math/s_log10_2u5.c | 6 - .../arm-optimized-routines/pl/math/s_log10f_3u5.c | 6 - .../arm-optimized-routines/pl/math/s_log1p_2u5.c | 6 - .../arm-optimized-routines/pl/math/s_log1pf_2u1.c | 6 - contrib/arm-optimized-routines/pl/math/s_log2_3u.c | 6 - .../arm-optimized-routines/pl/math/s_log2f_2u5.c | 6 - contrib/arm-optimized-routines/pl/math/s_sinh_3u.c | 6 - .../arm-optimized-routines/pl/math/s_sinhf_2u3.c | 6 - contrib/arm-optimized-routines/pl/math/s_tan_3u5.c | 6 - .../arm-optimized-routines/pl/math/s_tanf_3u5.c | 6 - contrib/arm-optimized-routines/pl/math/s_tanh_3u.c | 6 - .../arm-optimized-routines/pl/math/s_tanhf_2u6.c | 6 - contrib/arm-optimized-routines/pl/math/sinh_3u.c | 9 +- contrib/arm-optimized-routines/pl/math/sinhf_2u3.c | 9 +- contrib/arm-optimized-routines/pl/math/sinpi_3u.c | 90 + .../arm-optimized-routines/pl/math/sinpif_2u5.c | 83 + .../arm-optimized-routines/pl/math/sv_acos_2u.c | 91 + .../arm-optimized-routines/pl/math/sv_acosf_1u4.c | 84 + .../arm-optimized-routines/pl/math/sv_acosh_3u5.c | 50 + .../arm-optimized-routines/pl/math/sv_acoshf_2u8.c | 47 + .../arm-optimized-routines/pl/math/sv_asin_3u.c | 84 + .../arm-optimized-routines/pl/math/sv_asinf_2u5.c | 76 + .../arm-optimized-routines/pl/math/sv_asinh_3u0.c | 129 + 
.../arm-optimized-routines/pl/math/sv_asinhf_2u5.c | 55 + .../arm-optimized-routines/pl/math/sv_atan2_2u5.c | 111 +- .../arm-optimized-routines/pl/math/sv_atan2f_3u.c | 112 +- .../arm-optimized-routines/pl/math/sv_atan_2u5.c | 77 +- .../pl/math/sv_atan_common.h | 61 - .../arm-optimized-routines/pl/math/sv_atanf_2u9.c | 69 +- .../pl/math/sv_atanf_common.h | 47 - .../arm-optimized-routines/pl/math/sv_atanh_3u3.c | 60 + .../arm-optimized-routines/pl/math/sv_atanhf_2u8.c | 56 + .../arm-optimized-routines/pl/math/sv_cbrt_2u.c | 122 + .../arm-optimized-routines/pl/math/sv_cbrtf_1u7.c | 116 + .../arm-optimized-routines/pl/math/sv_cexpi_3u5.c | 45 + .../arm-optimized-routines/pl/math/sv_cexpif_1u8.c | 47 + .../arm-optimized-routines/pl/math/sv_cos_2u5.c | 104 +- .../arm-optimized-routines/pl/math/sv_cosf_2u1.c | 94 +- .../arm-optimized-routines/pl/math/sv_cosh_2u.c | 100 + .../arm-optimized-routines/pl/math/sv_coshf_2u.c | 56 + .../arm-optimized-routines/pl/math/sv_cospi_3u2.c | 63 + .../arm-optimized-routines/pl/math/sv_cospif_2u6.c | 59 + .../arm-optimized-routines/pl/math/sv_erf_2u5.c | 111 + contrib/arm-optimized-routines/pl/math/sv_erf_3u.c | 103 - .../arm-optimized-routines/pl/math/sv_erf_data.c | 1558 +++++++++ .../arm-optimized-routines/pl/math/sv_erfc_1u8.c | 164 + .../arm-optimized-routines/pl/math/sv_erfc_4u.c | 146 - .../arm-optimized-routines/pl/math/sv_erfcf_1u7.c | 111 + .../arm-optimized-routines/pl/math/sv_erff_1u3.c | 104 - .../arm-optimized-routines/pl/math/sv_erff_2u.c | 90 + .../arm-optimized-routines/pl/math/sv_erff_data.c | 1046 ++++++ .../arm-optimized-routines/pl/math/sv_exp10_1u5.c | 122 + .../arm-optimized-routines/pl/math/sv_exp10f_1u5.c | 87 + .../arm-optimized-routines/pl/math/sv_exp2_2u.c | 107 + .../arm-optimized-routines/pl/math/sv_exp2f_1u6.c | 80 + .../arm-optimized-routines/pl/math/sv_exp_1u5.c | 137 + .../arm-optimized-routines/pl/math/sv_exp_tail.h | 79 - .../arm-optimized-routines/pl/math/sv_expf_2u.c | 180 +- .../arm-optimized-routines/pl/math/sv_expf_data.c | 12 - .../pl/math/sv_expf_inline.h | 66 + .../arm-optimized-routines/pl/math/sv_expm1_2u5.c | 95 + .../arm-optimized-routines/pl/math/sv_expm1f_1u6.c | 93 + .../pl/math/sv_expm1f_inline.h | 73 + .../arm-optimized-routines/pl/math/sv_hypot_1u5.c | 51 + .../arm-optimized-routines/pl/math/sv_hypotf_1u5.c | 45 + .../arm-optimized-routines/pl/math/sv_log10_2u5.c | 94 +- .../arm-optimized-routines/pl/math/sv_log10f_3u5.c | 119 +- .../arm-optimized-routines/pl/math/sv_log1p_2u5.c | 116 + .../pl/math/sv_log1p_inline.h | 96 + .../arm-optimized-routines/pl/math/sv_log1pf_1u3.c | 97 + .../pl/math/sv_log1pf_inline.h | 65 + .../arm-optimized-routines/pl/math/sv_log2_3u.c | 94 +- .../arm-optimized-routines/pl/math/sv_log2f_2u5.c | 99 +- .../arm-optimized-routines/pl/math/sv_log_2u5.c | 101 +- .../arm-optimized-routines/pl/math/sv_log_data.c | 146 - .../arm-optimized-routines/pl/math/sv_logf_3u4.c | 99 +- .../arm-optimized-routines/pl/math/sv_logf_data.c | 12 - contrib/arm-optimized-routines/pl/math/sv_math.h | 220 +- .../arm-optimized-routines/pl/math/sv_pow_1u5.c | 444 +++ .../arm-optimized-routines/pl/math/sv_powf_2u6.c | 360 ++ contrib/arm-optimized-routines/pl/math/sv_powi.c | 25 +- contrib/arm-optimized-routines/pl/math/sv_powif.c | 26 +- contrib/arm-optimized-routines/pl/math/sv_sin_3u.c | 89 - .../arm-optimized-routines/pl/math/sv_sin_3u5.c | 96 + .../arm-optimized-routines/pl/math/sv_sincos_3u5.c | 61 + .../pl/math/sv_sincos_common.h | 85 + .../pl/math/sv_sincosf_1u8.c | 62 + .../pl/math/sv_sincosf_common.h | 
81 + .../arm-optimized-routines/pl/math/sv_sinf_1u9.c | 103 +- .../pl/math/sv_sinf_poly_data.c | 19 - .../arm-optimized-routines/pl/math/sv_sinh_3u.c | 103 + .../arm-optimized-routines/pl/math/sv_sinhf_2u3.c | 64 + .../arm-optimized-routines/pl/math/sv_sinpi_3u1.c | 57 + .../arm-optimized-routines/pl/math/sv_sinpif_2u5.c | 53 + .../arm-optimized-routines/pl/math/sv_tan_3u5.c | 99 + .../arm-optimized-routines/pl/math/sv_tanf_3u5.c | 141 +- .../arm-optimized-routines/pl/math/sv_tanh_3u.c | 96 + .../arm-optimized-routines/pl/math/sv_tanhf_2u6.c | 59 + contrib/arm-optimized-routines/pl/math/tanf_3u3.c | 27 +- contrib/arm-optimized-routines/pl/math/tanh_3u.c | 22 +- contrib/arm-optimized-routines/pl/math/tanhf_2u6.c | 9 +- .../pl/math/test/mathbench_funcs.h | 55 +- .../pl/math/test/mathbench_wrappers.h | 159 +- .../arm-optimized-routines/pl/math/test/pl_test.h | 24 +- .../arm-optimized-routines/pl/math/test/runulp.sh | 56 +- .../pl/math/test/testcases/directed/acos.tst | 17 + .../pl/math/test/testcases/directed/acosf.tst | 21 + .../pl/math/test/testcases/directed/asin.tst | 24 + .../pl/math/test/testcases/directed/asinf.tst | 24 + .../pl/math/test/ulp_funcs.h | 54 +- .../pl/math/test/ulp_wrappers.h | 78 +- .../pl/math/tools/asin.sollya | 29 + .../pl/math/tools/asinf.sollya | 36 + .../pl/math/tools/erf.sollya | 25 + .../pl/math/tools/erfc.sollya | 60 +- .../pl/math/tools/erfcf.sollya | 41 +- .../pl/math/tools/erff.sollya | 20 + .../pl/math/tools/exp10.sollya | 55 + .../pl/math/tools/sincos.sollya | 33 + .../pl/math/tools/sincosf.sollya | 33 + .../pl/math/tools/sinpi.sollya | 33 + .../pl/math/trigpi_references.c | 57 + contrib/arm-optimized-routines/pl/math/v_acos_2u.c | 122 + .../arm-optimized-routines/pl/math/v_acosf_1u4.c | 113 + .../arm-optimized-routines/pl/math/v_acosh_3u5.c | 63 +- .../arm-optimized-routines/pl/math/v_acoshf_3u1.c | 70 +- contrib/arm-optimized-routines/pl/math/v_asin_3u.c | 113 + .../arm-optimized-routines/pl/math/v_asinf_2u5.c | 104 + .../arm-optimized-routines/pl/math/v_asinh_3u5.c | 176 +- .../arm-optimized-routines/pl/math/v_asinhf_2u7.c | 78 +- .../arm-optimized-routines/pl/math/v_atan2_3u.c | 117 +- .../arm-optimized-routines/pl/math/v_atan2f_3u.c | 112 +- .../arm-optimized-routines/pl/math/v_atan_2u5.c | 98 +- .../arm-optimized-routines/pl/math/v_atanf_3u.c | 96 +- .../arm-optimized-routines/pl/math/v_atanh_3u5.c | 69 +- .../arm-optimized-routines/pl/math/v_atanhf_3u1.c | 73 +- contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c | 100 +- .../arm-optimized-routines/pl/math/v_cbrtf_1u5.c | 96 - .../arm-optimized-routines/pl/math/v_cbrtf_1u7.c | 116 + .../arm-optimized-routines/pl/math/v_cexpi_3u5.c | 45 + .../arm-optimized-routines/pl/math/v_cexpif_1u8.c | 47 + contrib/arm-optimized-routines/pl/math/v_cosh_2u.c | 130 +- .../arm-optimized-routines/pl/math/v_coshf_2u4.c | 76 +- .../arm-optimized-routines/pl/math/v_cospi_3u1.c | 86 + .../arm-optimized-routines/pl/math/v_cospif_3u2.c | 83 + contrib/arm-optimized-routines/pl/math/v_erf_2u.c | 116 - contrib/arm-optimized-routines/pl/math/v_erf_2u5.c | 158 + .../arm-optimized-routines/pl/math/v_erf_data.c | 119 - .../arm-optimized-routines/pl/math/v_erfc_1u8.c | 198 ++ contrib/arm-optimized-routines/pl/math/v_erfc_4u.c | 168 - .../arm-optimized-routines/pl/math/v_erfc_data.c | 96 - .../arm-optimized-routines/pl/math/v_erfcf_1u.c | 183 - .../arm-optimized-routines/pl/math/v_erfcf_1u7.c | 166 + .../arm-optimized-routines/pl/math/v_erff_1u5.c | 116 - contrib/arm-optimized-routines/pl/math/v_erff_2u.c | 118 + 
.../arm-optimized-routines/pl/math/v_erff_data.c | 18 - .../arm-optimized-routines/pl/math/v_erfinv_25u.c | 161 + .../arm-optimized-routines/pl/math/v_erfinvf_5u.c | 163 + .../arm-optimized-routines/pl/math/v_exp10_2u.c | 144 + .../arm-optimized-routines/pl/math/v_exp10f_2u4.c | 138 + contrib/arm-optimized-routines/pl/math/v_exp2_2u.c | 128 + .../arm-optimized-routines/pl/math/v_exp_data.c | 55 + .../arm-optimized-routines/pl/math/v_exp_tail.c | 75 - .../pl/math/v_exp_tail_data.c | 179 +- .../pl/math/v_exp_tail_inline.h | 102 + contrib/arm-optimized-routines/pl/math/v_expf.c | 83 - .../arm-optimized-routines/pl/math/v_expf_inline.h | 60 + .../arm-optimized-routines/pl/math/v_expm1_2u5.c | 139 +- .../arm-optimized-routines/pl/math/v_expm1f_1u6.c | 123 +- .../pl/math/v_expm1f_inline.h | 56 +- .../arm-optimized-routines/pl/math/v_hypot_1u5.c | 95 + .../arm-optimized-routines/pl/math/v_hypotf_1u5.c | 94 + .../arm-optimized-routines/pl/math/v_log10_2u5.c | 140 +- .../arm-optimized-routines/pl/math/v_log10_data.c | 298 +- .../arm-optimized-routines/pl/math/v_log10f_3u5.c | 114 +- .../arm-optimized-routines/pl/math/v_log10f_data.c | 13 - .../arm-optimized-routines/pl/math/v_log1p_2u5.c | 144 +- .../pl/math/v_log1p_inline.h | 82 +- .../arm-optimized-routines/pl/math/v_log1pf_2u1.c | 174 +- .../pl/math/v_log1pf_inline.h | 74 +- contrib/arm-optimized-routines/pl/math/v_log2_3u.c | 133 +- .../arm-optimized-routines/pl/math/v_log2_data.c | 278 +- .../arm-optimized-routines/pl/math/v_log2f_2u5.c | 93 +- .../arm-optimized-routines/pl/math/v_log2f_data.c | 15 - .../arm-optimized-routines/pl/math/v_log_data.c | 161 + .../arm-optimized-routines/pl/math/v_log_inline.h | 104 + .../arm-optimized-routines/pl/math/v_logf_inline.h | 59 + contrib/arm-optimized-routines/pl/math/v_math.h | 874 +---- contrib/arm-optimized-routines/pl/math/v_pow_1u5.c | 259 ++ .../v_exp_data.c => pl/math/v_pow_exp_data.c} | 164 +- .../pl/math/v_pow_log_data.c | 174 + .../arm-optimized-routines/pl/math/v_powf_data.c | 89 + .../arm-optimized-routines/pl/math/v_sincos_3u5.c | 57 + .../pl/math/v_sincos_common.h | 86 + .../arm-optimized-routines/pl/math/v_sincosf_1u8.c | 58 + .../pl/math/v_sincosf_common.h | 84 + contrib/arm-optimized-routines/pl/math/v_sinh_3u.c | 120 +- .../arm-optimized-routines/pl/math/v_sinhf_2u3.c | 91 +- .../arm-optimized-routines/pl/math/v_sinpi_3u1.c | 86 + .../arm-optimized-routines/pl/math/v_sinpif_3u.c | 81 + contrib/arm-optimized-routines/pl/math/v_tan_3u5.c | 124 +- .../arm-optimized-routines/pl/math/v_tan_data.c | 15 - .../arm-optimized-routines/pl/math/v_tanf_3u5.c | 134 +- contrib/arm-optimized-routines/pl/math/v_tanh_3u.c | 112 +- .../arm-optimized-routines/pl/math/v_tanhf_2u6.c | 80 +- .../arm-optimized-routines/pl/math/vn_acosh_3u5.c | 12 - .../arm-optimized-routines/pl/math/vn_acoshf_3u1.c | 12 - .../arm-optimized-routines/pl/math/vn_asinh_3u5.c | 12 - .../arm-optimized-routines/pl/math/vn_asinhf_2u7.c | 12 - .../arm-optimized-routines/pl/math/vn_atan2_3u.c | 12 - .../arm-optimized-routines/pl/math/vn_atan2f_3u.c | 12 - .../arm-optimized-routines/pl/math/vn_atan_2u5.c | 12 - .../arm-optimized-routines/pl/math/vn_atanf_3u.c | 12 - .../arm-optimized-routines/pl/math/vn_atanh_3u5.c | 12 - .../arm-optimized-routines/pl/math/vn_atanhf_3u1.c | 12 - .../arm-optimized-routines/pl/math/vn_cbrt_2u.c | 12 - .../arm-optimized-routines/pl/math/vn_cbrtf_1u5.c | 12 - .../arm-optimized-routines/pl/math/vn_cosh_2u.c | 12 - .../arm-optimized-routines/pl/math/vn_coshf_2u4.c | 12 - 
contrib/arm-optimized-routines/pl/math/vn_erf_2u.c | 12 - .../arm-optimized-routines/pl/math/vn_erfc_4u.c | 12 - .../arm-optimized-routines/pl/math/vn_erfcf_1u.c | 12 - .../arm-optimized-routines/pl/math/vn_erff_1u5.c | 12 - .../arm-optimized-routines/pl/math/vn_exp_tail.c | 11 - contrib/arm-optimized-routines/pl/math/vn_expf.c | 12 - .../arm-optimized-routines/pl/math/vn_expm1_2u5.c | 12 - .../arm-optimized-routines/pl/math/vn_expm1f_1u6.c | 12 - .../arm-optimized-routines/pl/math/vn_log10_2u5.c | 12 - .../arm-optimized-routines/pl/math/vn_log10f_3u5.c | 12 - .../arm-optimized-routines/pl/math/vn_log1p_2u5.c | 12 - .../arm-optimized-routines/pl/math/vn_log1pf_2u1.c | 12 - .../arm-optimized-routines/pl/math/vn_log2_3u.c | 12 - .../arm-optimized-routines/pl/math/vn_log2f_2u5.c | 12 - .../arm-optimized-routines/pl/math/vn_sinh_3u.c | 12 - .../arm-optimized-routines/pl/math/vn_sinhf_2u3.c | 12 - .../arm-optimized-routines/pl/math/vn_tan_3u5.c | 12 - .../arm-optimized-routines/pl/math/vn_tanf_3u5.c | 12 - .../arm-optimized-routines/pl/math/vn_tanh_3u.c | 12 - .../arm-optimized-routines/pl/math/vn_tanhf_2u6.c | 12 - .../string/aarch64/asmdefs.h | 14 + .../string/aarch64/memcpy-advsimd.S | 62 +- .../string/aarch64/memcpy-mops.S | 21 + .../string/aarch64/memmove-mops.S | 21 + .../string/aarch64/memset-mops.S | 20 + .../arm-optimized-routines/string/bench/memcpy.c | 5 +- .../string/include/stringlib.h | 7 +- .../arm-optimized-routines/string/test/memcpy.c | 5 +- .../arm-optimized-routines/string/test/memmove.c | 5 +- .../arm-optimized-routines/string/test/memset.c | 5 +- 414 files changed, 26613 insertions(+), 10731 deletions(-) diff --git a/contrib/arm-optimized-routines/README b/contrib/arm-optimized-routines/README index a2143a28488a..651ebdc84bc8 100644 --- a/contrib/arm-optimized-routines/README +++ b/contrib/arm-optimized-routines/README @@ -12,7 +12,7 @@ contribution requirements are documented in README.contributors of the appropriate subdirectory. Regular quarterly releases are tagged as vYY.MM, the latest -release is v23.01. +release is v24.01. Source code layout: diff --git a/contrib/arm-optimized-routines/config.mk.dist b/contrib/arm-optimized-routines/config.mk.dist index 7a8497507a81..03fb54db52fa 100644 --- a/contrib/arm-optimized-routines/config.mk.dist +++ b/contrib/arm-optimized-routines/config.mk.dist @@ -1,6 +1,6 @@ # Example config.mk # -# Copyright (c) 2018-2022, Arm Limited. +# Copyright (c) 2018-2023, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception # Subprojects to build @@ -59,13 +59,14 @@ math-cflags += -ffp-contract=fast -fno-math-errno # Use with clang. #math-cflags += -ffp-contract=fast -# Disable vector math code -#math-cflags += -DWANT_VMATH=0 - -# Disable/enable SVE vector math code and tests +# Disable/enable SVE vector math code and tests. +# If WANT_SVE_MATH is enabled, math-sve-cflags is added for SVE +# routines only so that SVE code does not leak into scalar +# routines. It is also necessary to add it for tools (e.g. 
ulp, +# mathbench) WANT_SVE_MATH = 0 ifeq ($(WANT_SVE_MATH), 1) - math-cflags += -march=armv8.2-a+sve + math-sve-cflags = -march=armv8-a+sve endif math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH) diff --git a/contrib/arm-optimized-routines/math/Dir.mk b/contrib/arm-optimized-routines/math/Dir.mk index 2a9cad10d96a..5e9494a7bd3c 100644 --- a/contrib/arm-optimized-routines/math/Dir.mk +++ b/contrib/arm-optimized-routines/math/Dir.mk @@ -1,12 +1,14 @@ # Makefile fragment - requires GNU make # -# Copyright (c) 2019-2022, Arm Limited. +# Copyright (c) 2019-2023, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/math B := build/math math-lib-srcs := $(wildcard $(S)/*.[cS]) +math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS]) + math-test-srcs := \ $(S)/test/mathtest.c \ $(S)/test/mathbench.c \ @@ -65,6 +67,8 @@ build/lib/libmathlib.a: $(math-lib-objs) $(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc $(math-tools): LDLIBS += $(math-ldlibs) -lm +# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled +$(math-tools): CFLAGS_ALL += $(math-sve-cflags) build/bin/rtest: $(math-host-objs) $(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS) diff --git a/contrib/arm-optimized-routines/math/aarch64/v_cos.c b/contrib/arm-optimized-routines/math/aarch64/v_cos.c new file mode 100644 index 000000000000..9a73575bce89 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_cos.c @@ -0,0 +1,87 @@ +/* + * Double-precision vector cos function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float64x2_t poly[7]; + float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3; +} data = { + /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */ + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + .inv_pi = V2 (0x1.45f306dc9c883p-2), + .half_pi = V2 (0x1.921fb54442d18p+0), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), + .shift = V2 (0x1.8p52), + .range_val = V2 (0x1p23) +}; + +#define C(i) d->poly[i] + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (cos, x, y, cmp); +} + +float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t n, r, r2, r3, r4, t1, t2, t3, y; + uint64x2_t odd, cmp; + +#if WANT_SIMD_EXCEPT + r = vabsq_f64 (x); + cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r), + vreinterpretq_u64_f64 (d->range_val)); + if (unlikely (v_any_u64 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f64 (cmp, v_f64 (1.0), r); +#else + cmp = vcageq_f64 (x, d->range_val); + r = x; +#endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); + odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); + n = vsubq_f64 (n, d->shift); + n = vsubq_f64 (n, v_f64 (0.5)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). 
*/ + r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (r, d->pi_2, n); + r = vfmsq_f64 (r, d->pi_3, n); + + /* sin(r) poly approx. */ + r2 = vmulq_f64 (r, r); + r3 = vmulq_f64 (r2, r); + r4 = vmulq_f64 (r2, r2); + + t1 = vfmaq_f64 (C (4), C (5), r2); + t2 = vfmaq_f64 (C (2), C (3), r2); + t3 = vfmaq_f64 (C (0), C (1), r2); + + y = vfmaq_f64 (t1, C (6), r4); + y = vfmaq_f64 (t2, y, r4); + y = vfmaq_f64 (t3, y, r4); + y = vfmaq_f64 (r, y, r3); + + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_cosf.c b/contrib/arm-optimized-routines/math/aarch64/v_cosf.c new file mode 100644 index 000000000000..b9890b2998ad --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_cosf.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector cos function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[4]; + float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; +} data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), + V4 (0x1.5b2e76p-19f) }, + + .pi_1 = V4 (0x1.921fb6p+1f), + .pi_2 = V4 (-0x1.777a5cp-24f), + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), + .shift = V4 (0x1.8p+23f), + .half_pi = V4 (0x1.921fb6p0f), + .range_val = V4 (0x1p20f) +}; + +#define C(i) d->poly[i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (cosf, x, y, cmp); +} + +float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, r3, y; + uint32x4_t odd, cmp; + +#if WANT_SIMD_EXCEPT + r = vabsq_f32 (x); + cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r), + vreinterpretq_u32_f32 (d->range_val)); + if (unlikely (v_any_u32 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f32 (cmp, v_f32 (1.0f), r); +#else + cmp = vcageq_f32 (x, d->range_val); + r = x; +#endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi)); + odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); + n = vsubq_f32 (n, d->shift); + n = vsubq_f32 (n, v_f32 (0.5f)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (r, d->pi_2, n); + r = vfmsq_f32 (r, d->pi_3, n); + + /* y = sin(r). */ + r2 = vmulq_f32 (r, r); + r3 = vmulq_f32 (r2, r); + y = vfmaq_f32 (C (2), C (3), r2); + y = vfmaq_f32 (C (1), y, r2); + y = vfmaq_f32 (C (0), y, r2); + y = vfmaq_f32 (r, y, r3); + + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp.c b/contrib/arm-optimized-routines/math/aarch64/v_exp.c new file mode 100644 index 000000000000..bc5609faf4fc --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp.c @@ -0,0 +1,125 @@ +/* + * Double-precision vector e^x function. 
+ * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask (N - 1) + +const static volatile struct +{ + float64x2_t poly[3]; + float64x2_t inv_ln2, ln2_hi, ln2_lo, shift; +#if !WANT_SIMD_EXCEPT + float64x2_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.88 +0.5 ulp + rel error: 1.4337*2^-53 + abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ + .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3), + V2 (0x1.55555da646206p-5) }, +#if !WANT_SIMD_EXCEPT + .scale_thresh = V2 (163840.0), /* 1280.0 * N. */ + .special_bound = V2 (704.0), +#endif + .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */ + .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */ + .ln2_lo = V2 (0x1.abc9e3b39803f3p-63), + .shift = V2 (0x1.8p+52) +}; + +#define C(i) data.poly[i] +#define Tab __v_exp_data + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */ +# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */ +# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */ + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f64 (exp, x, y, cmp); +} + +#else + +# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ +# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */ + +static inline float64x2_t VPCS_ATTR +special_case (float64x2_t s, float64x2_t y, float64x2_t n) +{ + /* 2^(n/N) may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); + float64x2_t s2 = vreinterpretq_f64_u64 ( + vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); + uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); + return vbslq_f64 (cmp, r1, r0); +} + +#endif + +float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x) +{ + float64x2_t n, r, r2, s, y, z; + uint64x2_t cmp, u, e; + +#if WANT_SIMD_EXCEPT + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + float64x2_t xm = x; + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound); + if (unlikely (v_any_u64 (cmp))) + x = vbslq_f64 (cmp, v_f64 (1), x); +#else + cmp = vcagtq_f64 (x, data.special_bound); +#endif + + /* n = round(x/(ln2/N)). */ + z = vfmaq_f64 (data.shift, x, data.inv_ln2); + u = vreinterpretq_u64_f64 (z); + n = vsubq_f64 (z, data.shift); + + /* r = x - n*ln2/N. */ + r = x; + r = vfmsq_f64 (r, data.ln2_hi, n); + r = vfmsq_f64 (r, data.ln2_lo, n); + + e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */ + r2 = vmulq_f64 (r, r); + y = vfmaq_f64 (C (0), C (1), r); + y = vfmaq_f64 (y, C (2), r2); + y = vfmaq_f64 (r, y, r2); + + /* s = 2^(n/N). 
*/ + u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] }; + s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + if (unlikely (v_any_u64 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f64 (s, y, s), cmp); +#else + return special_case (s, y, n); +#endif + + return vfmaq_f64 (s, y, s); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c b/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c new file mode 100644 index 000000000000..e402205e98e6 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c @@ -0,0 +1,113 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[5]; + uint32x4_t exponent_bias; +#if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.962 ulp. */ + .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), + V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, + .exponent_bias = V4 (0x3f800000), +#if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), +#endif +}; + +#define C(i) d->poly[i] + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ +# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine for special lanes. */ + return v_call_f32 (exp2f, x, y, cmp); +} + +#else + +# define SpecialOffset v_u32 (0x82000000) +# define SpecialBias v_u32 (0x7f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, scale, p, q, poly; + uint32x4_t cmp, e; + +#if WANT_SIMD_EXCEPT + /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); + cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. 
*/ + n = vrndaq_f32 (x); + r = vsubq_f32 (x, n); + e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); + scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + +#if !WANT_SIMD_EXCEPT + cmp = vcagtq_f32 (n, d->special_bound); +#endif + + r2 = vmulq_f32 (r, r); + p = vfmaq_f32 (C (1), C (0), r); + q = vfmaq_f32 (C (3), C (2), r); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (C (4), r); + poly = vfmaq_f32 (p, q, r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c b/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c new file mode 100644 index 000000000000..ba6b02fbb4bc --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c @@ -0,0 +1,72 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const float Poly[] = { + /* maxerr: 0.878 ulp. */ + 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) +#define C5 v_f32 (Poly[5]) + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define Ln2hi v_f32 (0x1.62e4p-1f) +#define Ln2lo v_f32 (0x1.7f7d1cp-20f) + +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); + float32x4_t s2 = vreinterpretq_f32_u32 (e - b); + uint32x4_t cmp = absn > v_f32 (192.0f); + float32x4_t r1 = s1 * s1; + float32x4_t r0 = poly * s1 * s2; + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); +} + +float32x4_t VPCS_ATTR +_ZGVnN4v_exp2f_1u (float32x4_t x) +{ *** 42574 LINES SKIPPED ***