git: 35f6b83049da - stable/14 - Update the Arm Optimized Routine library to v24.01

From: Andrew Turner <andrew@FreeBSD.org>
Date: Mon, 08 Apr 2024 13:15:43 UTC
The branch stable/14 has been updated by andrew:

URL: https://cgit.FreeBSD.org/src/commit/?id=35f6b83049dabe18277ac0fcc73ede6ed7f3a1a6

commit 35f6b83049dabe18277ac0fcc73ede6ed7f3a1a6
Author:     Andrew Turner <andrew@FreeBSD.org>
AuthorDate: 2024-02-29 11:39:12 +0000
Commit:     Andrew Turner <andrew@FreeBSD.org>
CommitDate: 2024-04-08 13:15:18 +0000

    Update the Arm Optimized Routine library to v24.01
    
    Sponsored by:   Arm Ltd
    
    (cherry picked from commit 5a02ffc32e777041dd2dad4e651ed2a0865a0a5d)
---
 contrib/arm-optimized-routines/README              |    2 +-
 contrib/arm-optimized-routines/config.mk.dist      |   13 +-
 contrib/arm-optimized-routines/math/Dir.mk         |    6 +-
 .../arm-optimized-routines/math/aarch64/v_cos.c    |   87 +
 .../arm-optimized-routines/math/aarch64/v_cosf.c   |   82 +
 .../arm-optimized-routines/math/aarch64/v_exp.c    |  125 +
 .../arm-optimized-routines/math/aarch64/v_exp2f.c  |  113 +
 .../math/aarch64/v_exp2f_1u.c                      |   72 +
 .../math/aarch64/v_exp_data.c                      |  146 +
 .../arm-optimized-routines/math/aarch64/v_expf.c   |  122 +
 .../math/aarch64/v_expf_1u.c                       |   77 +
 .../arm-optimized-routines/math/aarch64/v_log.c    |  100 +
 .../math/aarch64/v_log_data.c                      |  156 +
 .../arm-optimized-routines/math/aarch64/v_logf.c   |   74 +
 .../arm-optimized-routines/math/aarch64/v_math.h   |  135 +
 .../arm-optimized-routines/math/aarch64/v_pow.c    |   22 +
 .../arm-optimized-routines/math/aarch64/v_powf.c   |  148 +
 .../arm-optimized-routines/math/aarch64/v_sin.c    |   97 +
 .../arm-optimized-routines/math/aarch64/v_sinf.c   |   82 +
 contrib/arm-optimized-routines/math/exp10.c        |  129 +
 contrib/arm-optimized-routines/math/exp_data.c     |   23 +-
 .../arm-optimized-routines/math/include/mathlib.h  |   67 +-
 contrib/arm-optimized-routines/math/math_config.h  |   61 +-
 contrib/arm-optimized-routines/math/s_cos.c        |    6 -
 contrib/arm-optimized-routines/math/s_cosf.c       |    6 -
 contrib/arm-optimized-routines/math/s_exp.c        |    6 -
 contrib/arm-optimized-routines/math/s_exp2f.c      |    6 -
 contrib/arm-optimized-routines/math/s_exp2f_1u.c   |    6 -
 contrib/arm-optimized-routines/math/s_expf.c       |    6 -
 contrib/arm-optimized-routines/math/s_expf_1u.c    |    6 -
 contrib/arm-optimized-routines/math/s_log.c        |    6 -
 contrib/arm-optimized-routines/math/s_logf.c       |    6 -
 contrib/arm-optimized-routines/math/s_pow.c        |    6 -
 contrib/arm-optimized-routines/math/s_powf.c       |    6 -
 contrib/arm-optimized-routines/math/s_sin.c        |    6 -
 contrib/arm-optimized-routines/math/s_sinf.c       |    6 -
 .../arm-optimized-routines/math/test/mathbench.c   |  152 +-
 .../math/test/mathbench_funcs.h                    |   50 +-
 .../math/test/mathbench_wrappers.h                 |   42 +-
 .../arm-optimized-routines/math/test/mathtest.c    |    9 +-
 contrib/arm-optimized-routines/math/test/runulp.sh |  112 +-
 .../math/test/testcases/directed/exp10.tst         |   15 +
 contrib/arm-optimized-routines/math/test/ulp.c     |   81 +-
 contrib/arm-optimized-routines/math/test/ulp.h     |   29 +-
 .../arm-optimized-routines/math/test/ulp_funcs.h   |   50 +-
 .../math/test/ulp_wrappers.h                       |   36 +-
 contrib/arm-optimized-routines/math/tgamma128.c    |  356 ++
 contrib/arm-optimized-routines/math/tgamma128.h    |  141 +
 .../math/tools/tgamma128_gen.jl                    |  212 ++
 contrib/arm-optimized-routines/math/v_cos.c        |   95 -
 contrib/arm-optimized-routines/math/v_cosf.c       |   84 -
 contrib/arm-optimized-routines/math/v_exp.c        |  128 -
 contrib/arm-optimized-routines/math/v_exp.h        |   14 -
 contrib/arm-optimized-routines/math/v_exp2f.c      |  117 -
 contrib/arm-optimized-routines/math/v_exp2f_1u.c   |   75 -
 contrib/arm-optimized-routines/math/v_expf.c       |  122 -
 contrib/arm-optimized-routines/math/v_expf_1u.c    |   80 -
 contrib/arm-optimized-routines/math/v_log.c        |  104 -
 contrib/arm-optimized-routines/math/v_log.h        |   18 -
 contrib/arm-optimized-routines/math/v_log_data.c   |  158 -
 contrib/arm-optimized-routines/math/v_logf.c       |   73 -
 contrib/arm-optimized-routines/math/v_math.h       |  661 ----
 contrib/arm-optimized-routines/math/v_pow.c        |   27 -
 contrib/arm-optimized-routines/math/v_powf.c       |  235 --
 contrib/arm-optimized-routines/math/v_sin.c        |  103 -
 contrib/arm-optimized-routines/math/v_sinf.c       |   88 -
 contrib/arm-optimized-routines/math/vn_cos.c       |   12 -
 contrib/arm-optimized-routines/math/vn_cosf.c      |   12 -
 contrib/arm-optimized-routines/math/vn_exp.c       |   12 -
 contrib/arm-optimized-routines/math/vn_exp2f.c     |   12 -
 contrib/arm-optimized-routines/math/vn_exp2f_1u.c  |   11 -
 contrib/arm-optimized-routines/math/vn_expf.c      |   12 -
 contrib/arm-optimized-routines/math/vn_expf_1u.c   |   11 -
 contrib/arm-optimized-routines/math/vn_log.c       |   12 -
 contrib/arm-optimized-routines/math/vn_logf.c      |   12 -
 contrib/arm-optimized-routines/math/vn_pow.c       |   12 -
 contrib/arm-optimized-routines/math/vn_powf.c      |   12 -
 contrib/arm-optimized-routines/math/vn_sin.c       |   12 -
 contrib/arm-optimized-routines/math/vn_sinf.c      |   12 -
 contrib/arm-optimized-routines/pl/math/Dir.mk      |   89 +-
 contrib/arm-optimized-routines/pl/math/acos_2u.c   |  100 +
 contrib/arm-optimized-routines/pl/math/acosf_1u4.c |   99 +
 contrib/arm-optimized-routines/pl/math/asin_3u.c   |  106 +
 contrib/arm-optimized-routines/pl/math/asin_data.c |   19 +
 contrib/arm-optimized-routines/pl/math/asinf_2u5.c |  100 +
 .../arm-optimized-routines/pl/math/asinf_data.c    |   16 +
 contrib/arm-optimized-routines/pl/math/asinh_2u5.c |    5 +-
 .../arm-optimized-routines/pl/math/asinhf_3u5.c    |    6 +-
 .../arm-optimized-routines/pl/math/atan_common.h   |   40 +-
 contrib/arm-optimized-routines/pl/math/atanf_2u9.c |   12 +-
 .../arm-optimized-routines/pl/math/atanf_common.h  |   33 +-
 contrib/arm-optimized-routines/pl/math/atanh_3u.c  |   15 +-
 .../arm-optimized-routines/pl/math/atanhf_3u1.c    |   12 +-
 contrib/arm-optimized-routines/pl/math/cbrt_2u.c   |    5 +-
 contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c |    9 +-
 contrib/arm-optimized-routines/pl/math/cosh_2u.c   |    9 +-
 contrib/arm-optimized-routines/pl/math/coshf_1u9.c |    9 +-
 contrib/arm-optimized-routines/pl/math/cospi_3u1.c |   89 +
 .../arm-optimized-routines/pl/math/cospif_2u6.c    |   84 +
 contrib/arm-optimized-routines/pl/math/erf_2u5.c   |  102 +
 contrib/arm-optimized-routines/pl/math/erf_data.c  |  788 +++++
 contrib/arm-optimized-routines/pl/math/erfc_1u8.c  |  153 +
 contrib/arm-optimized-routines/pl/math/erfc_4u5.c  |  155 -
 contrib/arm-optimized-routines/pl/math/erfc_data.c | 3628 +++++++++++++++++++-
 contrib/arm-optimized-routines/pl/math/erfcf.h     |   38 -
 contrib/arm-optimized-routines/pl/math/erfcf_1u7.c |  103 +
 contrib/arm-optimized-routines/pl/math/erfcf_2u.c  |  133 -
 .../arm-optimized-routines/pl/math/erfcf_data.c    |  703 +++-
 contrib/arm-optimized-routines/pl/math/erff_1u5.c  |  108 -
 contrib/arm-optimized-routines/pl/math/erff_2u.c   |   82 +
 contrib/arm-optimized-routines/pl/math/erff_data.c |  532 ++-
 .../arm-optimized-routines/pl/math/erfinv_24u5.c   |   81 +
 .../arm-optimized-routines/pl/math/erfinvf_4u7.c   |   74 +
 contrib/arm-optimized-routines/pl/math/erfinvl.c   |  114 +
 contrib/arm-optimized-routines/pl/math/estrin.h    |   16 -
 .../arm-optimized-routines/pl/math/estrin_wrap.h   |   48 -
 contrib/arm-optimized-routines/pl/math/estrinf.h   |   14 -
 contrib/arm-optimized-routines/pl/math/expf.c      |    4 +-
 contrib/arm-optimized-routines/pl/math/expm1_2u5.c |   19 +-
 .../arm-optimized-routines/pl/math/expm1f_1u6.c    |   11 +-
 .../arm-optimized-routines/pl/math/finite_pow.h    |  365 ++
 contrib/arm-optimized-routines/pl/math/horner.h    |   14 -
 .../arm-optimized-routines/pl/math/horner_wrap.h   |   34 -
 contrib/arm-optimized-routines/pl/math/hornerf.h   |   14 -
 .../pl/math/include/mathlib.h                      |  238 +-
 .../pl/math/include/pl_test.h                      |    8 +-
 contrib/arm-optimized-routines/pl/math/log1p_2u.c  |   17 +-
 .../arm-optimized-routines/pl/math/log1pf_2u1.c    |   16 +-
 .../arm-optimized-routines/pl/math/math_config.h   |  252 +-
 contrib/arm-optimized-routines/pl/math/math_err.c  |    4 +-
 contrib/arm-optimized-routines/pl/math/math_errf.c |    4 +-
 .../pl/math/pairwise_horner.h                      |   14 -
 .../pl/math/pairwise_horner_wrap.h                 |   48 -
 .../pl/math/pairwise_hornerf.h                     |   14 -
 contrib/arm-optimized-routines/pl/math/pl_sig.h    |   56 +-
 .../pl/math/poly_advsimd_f32.h                     |   24 +
 .../pl/math/poly_advsimd_f64.h                     |   24 +
 .../arm-optimized-routines/pl/math/poly_generic.h  |  277 ++
 .../pl/math/poly_scalar_f32.h                      |   24 +
 .../pl/math/poly_scalar_f64.h                      |   24 +
 .../arm-optimized-routines/pl/math/poly_sve_f32.h  |   26 +
 .../arm-optimized-routines/pl/math/poly_sve_f64.h  |   26 +
 .../pl/math/poly_sve_generic.h                     |  301 ++
 .../arm-optimized-routines/pl/math/s_acosh_3u5.c   |    6 -
 .../arm-optimized-routines/pl/math/s_acoshf_3u1.c  |    6 -
 .../arm-optimized-routines/pl/math/s_asinh_3u5.c   |    6 -
 .../arm-optimized-routines/pl/math/s_asinhf_2u7.c  |    6 -
 .../arm-optimized-routines/pl/math/s_atan2_3u.c    |    6 -
 .../arm-optimized-routines/pl/math/s_atan2f_3u.c   |    6 -
 .../arm-optimized-routines/pl/math/s_atan_2u5.c    |    6 -
 .../arm-optimized-routines/pl/math/s_atanf_3u.c    |    6 -
 .../arm-optimized-routines/pl/math/s_atanh_3u5.c   |    6 -
 .../arm-optimized-routines/pl/math/s_atanhf_3u1.c  |    6 -
 contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c |    6 -
 .../arm-optimized-routines/pl/math/s_cbrtf_1u5.c   |    6 -
 contrib/arm-optimized-routines/pl/math/s_cosh_2u.c |    6 -
 .../arm-optimized-routines/pl/math/s_coshf_2u4.c   |    6 -
 contrib/arm-optimized-routines/pl/math/s_erf_2u.c  |    6 -
 contrib/arm-optimized-routines/pl/math/s_erfc_4u.c |    6 -
 .../arm-optimized-routines/pl/math/s_erfcf_1u.c    |    6 -
 .../arm-optimized-routines/pl/math/s_erff_1u5.c    |    6 -
 .../arm-optimized-routines/pl/math/s_exp_tail.c    |    6 -
 contrib/arm-optimized-routines/pl/math/s_expf.c    |    6 -
 .../arm-optimized-routines/pl/math/s_expm1_2u5.c   |    6 -
 .../arm-optimized-routines/pl/math/s_expm1f_1u6.c  |    6 -
 .../arm-optimized-routines/pl/math/s_log10_2u5.c   |    6 -
 .../arm-optimized-routines/pl/math/s_log10f_3u5.c  |    6 -
 .../arm-optimized-routines/pl/math/s_log1p_2u5.c   |    6 -
 .../arm-optimized-routines/pl/math/s_log1pf_2u1.c  |    6 -
 contrib/arm-optimized-routines/pl/math/s_log2_3u.c |    6 -
 .../arm-optimized-routines/pl/math/s_log2f_2u5.c   |    6 -
 contrib/arm-optimized-routines/pl/math/s_sinh_3u.c |    6 -
 .../arm-optimized-routines/pl/math/s_sinhf_2u3.c   |    6 -
 contrib/arm-optimized-routines/pl/math/s_tan_3u5.c |    6 -
 .../arm-optimized-routines/pl/math/s_tanf_3u5.c    |    6 -
 contrib/arm-optimized-routines/pl/math/s_tanh_3u.c |    6 -
 .../arm-optimized-routines/pl/math/s_tanhf_2u6.c   |    6 -
 contrib/arm-optimized-routines/pl/math/sinh_3u.c   |    9 +-
 contrib/arm-optimized-routines/pl/math/sinhf_2u3.c |    9 +-
 contrib/arm-optimized-routines/pl/math/sinpi_3u.c  |   90 +
 .../arm-optimized-routines/pl/math/sinpif_2u5.c    |   83 +
 .../arm-optimized-routines/pl/math/sv_acos_2u.c    |   91 +
 .../arm-optimized-routines/pl/math/sv_acosf_1u4.c  |   84 +
 .../arm-optimized-routines/pl/math/sv_acosh_3u5.c  |   50 +
 .../arm-optimized-routines/pl/math/sv_acoshf_2u8.c |   47 +
 .../arm-optimized-routines/pl/math/sv_asin_3u.c    |   84 +
 .../arm-optimized-routines/pl/math/sv_asinf_2u5.c  |   76 +
 .../arm-optimized-routines/pl/math/sv_asinh_3u0.c  |  129 +
 .../arm-optimized-routines/pl/math/sv_asinhf_2u5.c |   55 +
 .../arm-optimized-routines/pl/math/sv_atan2_2u5.c  |  111 +-
 .../arm-optimized-routines/pl/math/sv_atan2f_3u.c  |  112 +-
 .../arm-optimized-routines/pl/math/sv_atan_2u5.c   |   77 +-
 .../pl/math/sv_atan_common.h                       |   61 -
 .../arm-optimized-routines/pl/math/sv_atanf_2u9.c  |   69 +-
 .../pl/math/sv_atanf_common.h                      |   47 -
 .../arm-optimized-routines/pl/math/sv_atanh_3u3.c  |   60 +
 .../arm-optimized-routines/pl/math/sv_atanhf_2u8.c |   56 +
 .../arm-optimized-routines/pl/math/sv_cbrt_2u.c    |  122 +
 .../arm-optimized-routines/pl/math/sv_cbrtf_1u7.c  |  116 +
 .../arm-optimized-routines/pl/math/sv_cexpi_3u5.c  |   45 +
 .../arm-optimized-routines/pl/math/sv_cexpif_1u8.c |   47 +
 .../arm-optimized-routines/pl/math/sv_cos_2u5.c    |  104 +-
 .../arm-optimized-routines/pl/math/sv_cosf_2u1.c   |   94 +-
 .../arm-optimized-routines/pl/math/sv_cosh_2u.c    |  100 +
 .../arm-optimized-routines/pl/math/sv_coshf_2u.c   |   56 +
 .../arm-optimized-routines/pl/math/sv_cospi_3u2.c  |   63 +
 .../arm-optimized-routines/pl/math/sv_cospif_2u6.c |   59 +
 .../arm-optimized-routines/pl/math/sv_erf_2u5.c    |  111 +
 contrib/arm-optimized-routines/pl/math/sv_erf_3u.c |  103 -
 .../arm-optimized-routines/pl/math/sv_erf_data.c   | 1558 +++++++++
 .../arm-optimized-routines/pl/math/sv_erfc_1u8.c   |  164 +
 .../arm-optimized-routines/pl/math/sv_erfc_4u.c    |  146 -
 .../arm-optimized-routines/pl/math/sv_erfcf_1u7.c  |  111 +
 .../arm-optimized-routines/pl/math/sv_erff_1u3.c   |  104 -
 .../arm-optimized-routines/pl/math/sv_erff_2u.c    |   90 +
 .../arm-optimized-routines/pl/math/sv_erff_data.c  | 1046 ++++++
 .../arm-optimized-routines/pl/math/sv_exp10_1u5.c  |  122 +
 .../arm-optimized-routines/pl/math/sv_exp10f_1u5.c |   87 +
 .../arm-optimized-routines/pl/math/sv_exp2_2u.c    |  107 +
 .../arm-optimized-routines/pl/math/sv_exp2f_1u6.c  |   80 +
 .../arm-optimized-routines/pl/math/sv_exp_1u5.c    |  137 +
 .../arm-optimized-routines/pl/math/sv_exp_tail.h   |   79 -
 .../arm-optimized-routines/pl/math/sv_expf_2u.c    |  180 +-
 .../arm-optimized-routines/pl/math/sv_expf_data.c  |   12 -
 .../pl/math/sv_expf_inline.h                       |   66 +
 .../arm-optimized-routines/pl/math/sv_expm1_2u5.c  |   95 +
 .../arm-optimized-routines/pl/math/sv_expm1f_1u6.c |   93 +
 .../pl/math/sv_expm1f_inline.h                     |   73 +
 .../arm-optimized-routines/pl/math/sv_hypot_1u5.c  |   51 +
 .../arm-optimized-routines/pl/math/sv_hypotf_1u5.c |   45 +
 .../arm-optimized-routines/pl/math/sv_log10_2u5.c  |   94 +-
 .../arm-optimized-routines/pl/math/sv_log10f_3u5.c |  119 +-
 .../arm-optimized-routines/pl/math/sv_log1p_2u5.c  |  116 +
 .../pl/math/sv_log1p_inline.h                      |   96 +
 .../arm-optimized-routines/pl/math/sv_log1pf_1u3.c |   97 +
 .../pl/math/sv_log1pf_inline.h                     |   65 +
 .../arm-optimized-routines/pl/math/sv_log2_3u.c    |   94 +-
 .../arm-optimized-routines/pl/math/sv_log2f_2u5.c  |   99 +-
 .../arm-optimized-routines/pl/math/sv_log_2u5.c    |  101 +-
 .../arm-optimized-routines/pl/math/sv_log_data.c   |  146 -
 .../arm-optimized-routines/pl/math/sv_logf_3u4.c   |   99 +-
 .../arm-optimized-routines/pl/math/sv_logf_data.c  |   12 -
 contrib/arm-optimized-routines/pl/math/sv_math.h   |  220 +-
 .../arm-optimized-routines/pl/math/sv_pow_1u5.c    |  444 +++
 .../arm-optimized-routines/pl/math/sv_powf_2u6.c   |  360 ++
 contrib/arm-optimized-routines/pl/math/sv_powi.c   |   25 +-
 contrib/arm-optimized-routines/pl/math/sv_powif.c  |   26 +-
 contrib/arm-optimized-routines/pl/math/sv_sin_3u.c |   89 -
 .../arm-optimized-routines/pl/math/sv_sin_3u5.c    |   96 +
 .../arm-optimized-routines/pl/math/sv_sincos_3u5.c |   61 +
 .../pl/math/sv_sincos_common.h                     |   85 +
 .../pl/math/sv_sincosf_1u8.c                       |   62 +
 .../pl/math/sv_sincosf_common.h                    |   81 +
 .../arm-optimized-routines/pl/math/sv_sinf_1u9.c   |  103 +-
 .../pl/math/sv_sinf_poly_data.c                    |   19 -
 .../arm-optimized-routines/pl/math/sv_sinh_3u.c    |  103 +
 .../arm-optimized-routines/pl/math/sv_sinhf_2u3.c  |   64 +
 .../arm-optimized-routines/pl/math/sv_sinpi_3u1.c  |   57 +
 .../arm-optimized-routines/pl/math/sv_sinpif_2u5.c |   53 +
 .../arm-optimized-routines/pl/math/sv_tan_3u5.c    |   99 +
 .../arm-optimized-routines/pl/math/sv_tanf_3u5.c   |  141 +-
 .../arm-optimized-routines/pl/math/sv_tanh_3u.c    |   96 +
 .../arm-optimized-routines/pl/math/sv_tanhf_2u6.c  |   59 +
 contrib/arm-optimized-routines/pl/math/tanf_3u3.c  |   27 +-
 contrib/arm-optimized-routines/pl/math/tanh_3u.c   |   22 +-
 contrib/arm-optimized-routines/pl/math/tanhf_2u6.c |    9 +-
 .../pl/math/test/mathbench_funcs.h                 |   55 +-
 .../pl/math/test/mathbench_wrappers.h              |  159 +-
 .../arm-optimized-routines/pl/math/test/pl_test.h  |   24 +-
 .../arm-optimized-routines/pl/math/test/runulp.sh  |   56 +-
 .../pl/math/test/testcases/directed/acos.tst       |   17 +
 .../pl/math/test/testcases/directed/acosf.tst      |   21 +
 .../pl/math/test/testcases/directed/asin.tst       |   24 +
 .../pl/math/test/testcases/directed/asinf.tst      |   24 +
 .../pl/math/test/ulp_funcs.h                       |   54 +-
 .../pl/math/test/ulp_wrappers.h                    |   78 +-
 .../pl/math/tools/asin.sollya                      |   29 +
 .../pl/math/tools/asinf.sollya                     |   36 +
 .../pl/math/tools/erf.sollya                       |   25 +
 .../pl/math/tools/erfc.sollya                      |   60 +-
 .../pl/math/tools/erfcf.sollya                     |   41 +-
 .../pl/math/tools/erff.sollya                      |   20 +
 .../pl/math/tools/exp10.sollya                     |   55 +
 .../pl/math/tools/sincos.sollya                    |   33 +
 .../pl/math/tools/sincosf.sollya                   |   33 +
 .../pl/math/tools/sinpi.sollya                     |   33 +
 .../pl/math/trigpi_references.c                    |   57 +
 contrib/arm-optimized-routines/pl/math/v_acos_2u.c |  122 +
 .../arm-optimized-routines/pl/math/v_acosf_1u4.c   |  113 +
 .../arm-optimized-routines/pl/math/v_acosh_3u5.c   |   63 +-
 .../arm-optimized-routines/pl/math/v_acoshf_3u1.c  |   70 +-
 contrib/arm-optimized-routines/pl/math/v_asin_3u.c |  113 +
 .../arm-optimized-routines/pl/math/v_asinf_2u5.c   |  104 +
 .../arm-optimized-routines/pl/math/v_asinh_3u5.c   |  176 +-
 .../arm-optimized-routines/pl/math/v_asinhf_2u7.c  |   78 +-
 .../arm-optimized-routines/pl/math/v_atan2_3u.c    |  117 +-
 .../arm-optimized-routines/pl/math/v_atan2f_3u.c   |  112 +-
 .../arm-optimized-routines/pl/math/v_atan_2u5.c    |   98 +-
 .../arm-optimized-routines/pl/math/v_atanf_3u.c    |   96 +-
 .../arm-optimized-routines/pl/math/v_atanh_3u5.c   |   69 +-
 .../arm-optimized-routines/pl/math/v_atanhf_3u1.c  |   73 +-
 contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c |  100 +-
 .../arm-optimized-routines/pl/math/v_cbrtf_1u5.c   |   96 -
 .../arm-optimized-routines/pl/math/v_cbrtf_1u7.c   |  116 +
 .../arm-optimized-routines/pl/math/v_cexpi_3u5.c   |   45 +
 .../arm-optimized-routines/pl/math/v_cexpif_1u8.c  |   47 +
 contrib/arm-optimized-routines/pl/math/v_cosh_2u.c |  130 +-
 .../arm-optimized-routines/pl/math/v_coshf_2u4.c   |   76 +-
 .../arm-optimized-routines/pl/math/v_cospi_3u1.c   |   86 +
 .../arm-optimized-routines/pl/math/v_cospif_3u2.c  |   83 +
 contrib/arm-optimized-routines/pl/math/v_erf_2u.c  |  116 -
 contrib/arm-optimized-routines/pl/math/v_erf_2u5.c |  158 +
 .../arm-optimized-routines/pl/math/v_erf_data.c    |  119 -
 .../arm-optimized-routines/pl/math/v_erfc_1u8.c    |  198 ++
 contrib/arm-optimized-routines/pl/math/v_erfc_4u.c |  168 -
 .../arm-optimized-routines/pl/math/v_erfc_data.c   |   96 -
 .../arm-optimized-routines/pl/math/v_erfcf_1u.c    |  183 -
 .../arm-optimized-routines/pl/math/v_erfcf_1u7.c   |  166 +
 .../arm-optimized-routines/pl/math/v_erff_1u5.c    |  116 -
 contrib/arm-optimized-routines/pl/math/v_erff_2u.c |  118 +
 .../arm-optimized-routines/pl/math/v_erff_data.c   |   18 -
 .../arm-optimized-routines/pl/math/v_erfinv_25u.c  |  161 +
 .../arm-optimized-routines/pl/math/v_erfinvf_5u.c  |  163 +
 .../arm-optimized-routines/pl/math/v_exp10_2u.c    |  144 +
 .../arm-optimized-routines/pl/math/v_exp10f_2u4.c  |  138 +
 contrib/arm-optimized-routines/pl/math/v_exp2_2u.c |  128 +
 .../arm-optimized-routines/pl/math/v_exp_data.c    |   55 +
 .../arm-optimized-routines/pl/math/v_exp_tail.c    |   75 -
 .../pl/math/v_exp_tail_data.c                      |  179 +-
 .../pl/math/v_exp_tail_inline.h                    |  102 +
 contrib/arm-optimized-routines/pl/math/v_expf.c    |   83 -
 .../arm-optimized-routines/pl/math/v_expf_inline.h |   60 +
 .../arm-optimized-routines/pl/math/v_expm1_2u5.c   |  139 +-
 .../arm-optimized-routines/pl/math/v_expm1f_1u6.c  |  123 +-
 .../pl/math/v_expm1f_inline.h                      |   56 +-
 .../arm-optimized-routines/pl/math/v_hypot_1u5.c   |   95 +
 .../arm-optimized-routines/pl/math/v_hypotf_1u5.c  |   94 +
 .../arm-optimized-routines/pl/math/v_log10_2u5.c   |  140 +-
 .../arm-optimized-routines/pl/math/v_log10_data.c  |  298 +-
 .../arm-optimized-routines/pl/math/v_log10f_3u5.c  |  114 +-
 .../arm-optimized-routines/pl/math/v_log10f_data.c |   13 -
 .../arm-optimized-routines/pl/math/v_log1p_2u5.c   |  144 +-
 .../pl/math/v_log1p_inline.h                       |   82 +-
 .../arm-optimized-routines/pl/math/v_log1pf_2u1.c  |  174 +-
 .../pl/math/v_log1pf_inline.h                      |   74 +-
 contrib/arm-optimized-routines/pl/math/v_log2_3u.c |  133 +-
 .../arm-optimized-routines/pl/math/v_log2_data.c   |  278 +-
 .../arm-optimized-routines/pl/math/v_log2f_2u5.c   |   93 +-
 .../arm-optimized-routines/pl/math/v_log2f_data.c  |   15 -
 .../arm-optimized-routines/pl/math/v_log_data.c    |  161 +
 .../arm-optimized-routines/pl/math/v_log_inline.h  |  104 +
 .../arm-optimized-routines/pl/math/v_logf_inline.h |   59 +
 contrib/arm-optimized-routines/pl/math/v_math.h    |  874 +----
 contrib/arm-optimized-routines/pl/math/v_pow_1u5.c |  259 ++
 .../v_exp_data.c => pl/math/v_pow_exp_data.c}      |  164 +-
 .../pl/math/v_pow_log_data.c                       |  174 +
 .../arm-optimized-routines/pl/math/v_powf_data.c   |   89 +
 .../arm-optimized-routines/pl/math/v_sincos_3u5.c  |   57 +
 .../pl/math/v_sincos_common.h                      |   86 +
 .../arm-optimized-routines/pl/math/v_sincosf_1u8.c |   58 +
 .../pl/math/v_sincosf_common.h                     |   84 +
 contrib/arm-optimized-routines/pl/math/v_sinh_3u.c |  120 +-
 .../arm-optimized-routines/pl/math/v_sinhf_2u3.c   |   91 +-
 .../arm-optimized-routines/pl/math/v_sinpi_3u1.c   |   86 +
 .../arm-optimized-routines/pl/math/v_sinpif_3u.c   |   81 +
 contrib/arm-optimized-routines/pl/math/v_tan_3u5.c |  124 +-
 .../arm-optimized-routines/pl/math/v_tan_data.c    |   15 -
 .../arm-optimized-routines/pl/math/v_tanf_3u5.c    |  134 +-
 contrib/arm-optimized-routines/pl/math/v_tanh_3u.c |  112 +-
 .../arm-optimized-routines/pl/math/v_tanhf_2u6.c   |   80 +-
 .../arm-optimized-routines/pl/math/vn_acosh_3u5.c  |   12 -
 .../arm-optimized-routines/pl/math/vn_acoshf_3u1.c |   12 -
 .../arm-optimized-routines/pl/math/vn_asinh_3u5.c  |   12 -
 .../arm-optimized-routines/pl/math/vn_asinhf_2u7.c |   12 -
 .../arm-optimized-routines/pl/math/vn_atan2_3u.c   |   12 -
 .../arm-optimized-routines/pl/math/vn_atan2f_3u.c  |   12 -
 .../arm-optimized-routines/pl/math/vn_atan_2u5.c   |   12 -
 .../arm-optimized-routines/pl/math/vn_atanf_3u.c   |   12 -
 .../arm-optimized-routines/pl/math/vn_atanh_3u5.c  |   12 -
 .../arm-optimized-routines/pl/math/vn_atanhf_3u1.c |   12 -
 .../arm-optimized-routines/pl/math/vn_cbrt_2u.c    |   12 -
 .../arm-optimized-routines/pl/math/vn_cbrtf_1u5.c  |   12 -
 .../arm-optimized-routines/pl/math/vn_cosh_2u.c    |   12 -
 .../arm-optimized-routines/pl/math/vn_coshf_2u4.c  |   12 -
 contrib/arm-optimized-routines/pl/math/vn_erf_2u.c |   12 -
 .../arm-optimized-routines/pl/math/vn_erfc_4u.c    |   12 -
 .../arm-optimized-routines/pl/math/vn_erfcf_1u.c   |   12 -
 .../arm-optimized-routines/pl/math/vn_erff_1u5.c   |   12 -
 .../arm-optimized-routines/pl/math/vn_exp_tail.c   |   11 -
 contrib/arm-optimized-routines/pl/math/vn_expf.c   |   12 -
 .../arm-optimized-routines/pl/math/vn_expm1_2u5.c  |   12 -
 .../arm-optimized-routines/pl/math/vn_expm1f_1u6.c |   12 -
 .../arm-optimized-routines/pl/math/vn_log10_2u5.c  |   12 -
 .../arm-optimized-routines/pl/math/vn_log10f_3u5.c |   12 -
 .../arm-optimized-routines/pl/math/vn_log1p_2u5.c  |   12 -
 .../arm-optimized-routines/pl/math/vn_log1pf_2u1.c |   12 -
 .../arm-optimized-routines/pl/math/vn_log2_3u.c    |   12 -
 .../arm-optimized-routines/pl/math/vn_log2f_2u5.c  |   12 -
 .../arm-optimized-routines/pl/math/vn_sinh_3u.c    |   12 -
 .../arm-optimized-routines/pl/math/vn_sinhf_2u3.c  |   12 -
 .../arm-optimized-routines/pl/math/vn_tan_3u5.c    |   12 -
 .../arm-optimized-routines/pl/math/vn_tanf_3u5.c   |   12 -
 .../arm-optimized-routines/pl/math/vn_tanh_3u.c    |   12 -
 .../arm-optimized-routines/pl/math/vn_tanhf_2u6.c  |   12 -
 .../string/aarch64/asmdefs.h                       |   14 +
 .../string/aarch64/memcpy-advsimd.S                |   62 +-
 .../string/aarch64/memcpy-mops.S                   |   21 +
 .../string/aarch64/memmove-mops.S                  |   21 +
 .../string/aarch64/memset-mops.S                   |   20 +
 .../arm-optimized-routines/string/bench/memcpy.c   |    5 +-
 .../string/include/stringlib.h                     |    7 +-
 .../arm-optimized-routines/string/test/memcpy.c    |    5 +-
 .../arm-optimized-routines/string/test/memmove.c   |    5 +-
 .../arm-optimized-routines/string/test/memset.c    |    5 +-
 414 files changed, 26613 insertions(+), 10731 deletions(-)

diff --git a/contrib/arm-optimized-routines/README b/contrib/arm-optimized-routines/README
index a2143a28488a..651ebdc84bc8 100644
--- a/contrib/arm-optimized-routines/README
+++ b/contrib/arm-optimized-routines/README
@@ -12,7 +12,7 @@ contribution requirements are documented in README.contributors of
 the appropriate subdirectory.
 
 Regular quarterly releases are tagged as vYY.MM, the latest
-release is v23.01.
+release is v24.01.
 
 Source code layout:
 
diff --git a/contrib/arm-optimized-routines/config.mk.dist b/contrib/arm-optimized-routines/config.mk.dist
index 7a8497507a81..03fb54db52fa 100644
--- a/contrib/arm-optimized-routines/config.mk.dist
+++ b/contrib/arm-optimized-routines/config.mk.dist
@@ -1,6 +1,6 @@
 # Example config.mk
 #
-# Copyright (c) 2018-2022, Arm Limited.
+# Copyright (c) 2018-2023, Arm Limited.
 # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 # Subprojects to build
@@ -59,13 +59,14 @@ math-cflags += -ffp-contract=fast -fno-math-errno
 # Use with clang.
 #math-cflags += -ffp-contract=fast
 
-# Disable vector math code
-#math-cflags += -DWANT_VMATH=0
-
-# Disable/enable SVE vector math code and tests
+# Disable/enable SVE vector math code and tests.
+# If WANT_SVE_MATH is enabled, math-sve-cflags is added for SVE
+# routines only so that SVE code does not leak into scalar
+# routines. It is also necessary to add it for tools (e.g. ulp,
+# mathbench)
 WANT_SVE_MATH = 0
 ifeq ($(WANT_SVE_MATH), 1)
-  math-cflags += -march=armv8.2-a+sve
+  math-sve-cflags = -march=armv8-a+sve
 endif
 math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH)
 
diff --git a/contrib/arm-optimized-routines/math/Dir.mk b/contrib/arm-optimized-routines/math/Dir.mk
index 2a9cad10d96a..5e9494a7bd3c 100644
--- a/contrib/arm-optimized-routines/math/Dir.mk
+++ b/contrib/arm-optimized-routines/math/Dir.mk
@@ -1,12 +1,14 @@
 # Makefile fragment - requires GNU make
 #
-# Copyright (c) 2019-2022, Arm Limited.
+# Copyright (c) 2019-2023, Arm Limited.
 # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 S := $(srcdir)/math
 B := build/math
 
 math-lib-srcs := $(wildcard $(S)/*.[cS])
+math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS])
+
 math-test-srcs := \
 	$(S)/test/mathtest.c \
 	$(S)/test/mathbench.c \
@@ -65,6 +67,8 @@ build/lib/libmathlib.a: $(math-lib-objs)
 
 $(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
 $(math-tools): LDLIBS += $(math-ldlibs) -lm
+# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
+$(math-tools): CFLAGS_ALL += $(math-sve-cflags)
 
 build/bin/rtest: $(math-host-objs)
 	$(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS)
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_cos.c b/contrib/arm-optimized-routines/math/aarch64/v_cos.c
new file mode 100644
index 000000000000..9a73575bce89
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_cos.c
@@ -0,0 +1,87 @@
+/*
+ * Double-precision vector cos function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+  float64x2_t poly[7];
+  float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
+} data = {
+  /* Worst-case error is 3.3 ulp in [-pi/2, pi/2].  */
+  .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+	    V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+	    V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+	    V2 (-0x1.9e9540300a1p-41) },
+  .inv_pi = V2 (0x1.45f306dc9c883p-2),
+  .half_pi = V2 (0x1.921fb54442d18p+0),
+  .pi_1 = V2 (0x1.921fb54442d18p+1),
+  .pi_2 = V2 (0x1.1a62633145c06p-53),
+  .pi_3 = V2 (0x1.c1cd129024e09p-106),
+  .shift = V2 (0x1.8p52),
+  .range_val = V2 (0x1p23)
+};
+
+#define C(i) d->poly[i]
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+  y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+  return v_call_f64 (cos, x, y, cmp);
+}
+
+float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float64x2_t n, r, r2, r3, r4, t1, t2, t3, y;
+  uint64x2_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+  r = vabsq_f64 (x);
+  cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r),
+		   vreinterpretq_u64_f64 (d->range_val));
+  if (unlikely (v_any_u64 (cmp)))
+    /* If fenv exceptions are to be triggered correctly, set any special lanes
+       to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by the
+       special-case handler later.  */
+    r = vbslq_f64 (cmp, v_f64 (1.0), r);
+#else
+  cmp = vcageq_f64 (x, d->range_val);
+  r = x;
+#endif
+
+  /* n = rint((|x|+pi/2)/pi) - 0.5.  */
+  n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
+  odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
+  n = vsubq_f64 (n, d->shift);
+  n = vsubq_f64 (n, v_f64 (0.5));
+
+  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
+  r = vfmsq_f64 (r, d->pi_1, n);
+  r = vfmsq_f64 (r, d->pi_2, n);
+  r = vfmsq_f64 (r, d->pi_3, n);
+
+  /* sin(r) poly approx.  */
+  r2 = vmulq_f64 (r, r);
+  r3 = vmulq_f64 (r2, r);
+  r4 = vmulq_f64 (r2, r2);
+
+  t1 = vfmaq_f64 (C (4), C (5), r2);
+  t2 = vfmaq_f64 (C (2), C (3), r2);
+  t3 = vfmaq_f64 (C (0), C (1), r2);
+
+  y = vfmaq_f64 (t1, C (6), r4);
+  y = vfmaq_f64 (t2, y, r4);
+  y = vfmaq_f64 (t3, y, r4);
+  y = vfmaq_f64 (r, y, r3);
+
+  if (unlikely (v_any_u64 (cmp)))
+    return special_case (x, y, odd, cmp);
+  return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
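
For readers skimming the new file: v_cos.c evaluates cos(x) as a
sign-corrected sin of a reduced argument, using cos(|x|) = sin(|x| + pi/2).
The scalar sketch below is an illustration, not part of the commit: libm's
sin stands in for the 7-term odd polynomial and a one-part pi for the
pi_1/pi_2/pi_3 split, so its accuracy is demo-grade only.

#include <math.h>
#include <stdio.h>

/* Demo-grade scalar model of the v_cos.c scheme.  */
static double
cos_sketch (double x)
{
  double ax = fabs (x);
  /* n is a half-integer; n + 0.5 counts the folded half-turns.  */
  double n = rint ((ax + M_PI_2) / M_PI) - 0.5;
  double r = ax - n * M_PI;   /* in [-pi/2, pi/2]; the commit uses 3-part pi */
  int odd = (long long) (n + 0.5) & 1;
  double y = sin (r);         /* the commit uses a polynomial here */
  return odd ? -y : y;        /* the commit XORs the sign bit instead */
}

int
main (void)
{
  for (double x = 0.0; x < 10.0; x += 1.25)
    printf ("cos(%4.2f) = %+.17g vs %+.17g\n", x, cos_sketch (x), cos (x));
  return 0;
}
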
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_cosf.c b/contrib/arm-optimized-routines/math/aarch64/v_cosf.c
new file mode 100644
index 000000000000..b9890b2998ad
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_cosf.c
@@ -0,0 +1,82 @@
+/*
+ * Single-precision vector cos function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+  float32x4_t poly[4];
+  float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
+} data = {
+  /* 1.886 ulp error.  */
+  .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
+	    V4 (0x1.5b2e76p-19f) },
+
+  .pi_1 = V4 (0x1.921fb6p+1f),
+  .pi_2 = V4 (-0x1.777a5cp-24f),
+  .pi_3 = V4 (-0x1.ee59dap-49f),
+
+  .inv_pi = V4 (0x1.45f306p-2f),
+  .shift = V4 (0x1.8p+23f),
+  .half_pi = V4 (0x1.921fb6p0f),
+  .range_val = V4 (0x1p20f)
+};
+
+#define C(i) d->poly[i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+  /* Fall back to scalar code.  */
+  y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+  return v_call_f32 (cosf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t n, r, r2, r3, y;
+  uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+  r = vabsq_f32 (x);
+  cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
+		   vreinterpretq_u32_f32 (d->range_val));
+  if (unlikely (v_any_u32 (cmp)))
+    /* If fenv exceptions are to be triggered correctly, set any special lanes
+       to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by the
+       special-case handler later.  */
+    r = vbslq_f32 (cmp, v_f32 (1.0f), r);
+#else
+  cmp = vcageq_f32 (x, d->range_val);
+  r = x;
+#endif
+
+  /* n = rint((|x|+pi/2)/pi) - 0.5.  */
+  n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
+  odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
+  n = vsubq_f32 (n, d->shift);
+  n = vsubq_f32 (n, v_f32 (0.5f));
+
+  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
+  r = vfmsq_f32 (r, d->pi_1, n);
+  r = vfmsq_f32 (r, d->pi_2, n);
+  r = vfmsq_f32 (r, d->pi_3, n);
+
+  /* y = sin(r).  */
+  r2 = vmulq_f32 (r, r);
+  r3 = vmulq_f32 (r2, r);
+  y = vfmaq_f32 (C (2), C (3), r2);
+  y = vfmaq_f32 (C (1), y, r2);
+  y = vfmaq_f32 (C (0), y, r2);
+  y = vfmaq_f32 (r, y, r3);
+
+  if (unlikely (v_any_u32 (cmp)))
+    return special_case (x, y, odd, cmp);
+  return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
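
The single-precision version leans on the same Shift = 0x1.8p23f trick that
also yields the quadrant-parity mask: adding 1.5*2^23 to a float smaller
than 2^22 lands the sum in a binade with unit spacing, so the addition
rounds to the nearest integer and leaves that integer in the low mantissa
bits; bit 0 is then the parity of round(t), and shifting it to bit 31 gives
a ready-made sign mask. A standalone demo of the trick (my construction,
not from the commit; it assumes the default round-to-nearest mode and true
single-precision evaluation):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  float shift = 0x1.8p23f;    /* 1.5 * 2^23 */
  float t = 5.3f;
  float n = t + shift;        /* rounds t to the nearest integer */
  uint32_t bits;
  memcpy (&bits, &n, sizeof bits);
  uint32_t odd = bits << 31;  /* parity of round(t) as a sign mask */
  printf ("round(%g) = %g, sign mask = 0x%08x\n",
          (double) t, (double) (n - shift), (unsigned) odd);
  /* prints: round(5.3) = 5, sign mask = 0x80000000 */
  return 0;
}
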
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp.c b/contrib/arm-optimized-routines/math/aarch64/v_exp.c
new file mode 100644
index 000000000000..bc5609faf4fc
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_exp.c
@@ -0,0 +1,125 @@
+/*
+ * Double-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+
+const static volatile struct
+{
+  float64x2_t poly[3];
+  float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
+#if !WANT_SIMD_EXCEPT
+  float64x2_t special_bound, scale_thresh;
+#endif
+} data = {
+  /* maxerr: 1.88 +0.5 ulp
+     rel error: 1.4337*2^-53
+     abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ].  */
+  .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
+	    V2 (0x1.55555da646206p-5) },
+#if !WANT_SIMD_EXCEPT
+  .scale_thresh = V2 (163840.0), /* 1280.0 * N.  */
+  .special_bound = V2 (704.0),
+#endif
+  .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2.  */
+  .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N.  */
+  .ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
+  .shift = V2 (0x1.8p+52)
+};
+
+#define C(i) data.poly[i]
+#define Tab __v_exp_data
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511).  */
+# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9).  */
+# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound.  */
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+  /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+     routine for special lanes.  */
+  return v_call_f64 (exp, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513.  */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0).  */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769.  */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254.  */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n)
+{
+  /* 2^(n/N) may overflow, break it up into s1*s2.  */
+  uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
+  float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+  float64x2_t s2 = vreinterpretq_f64_u64 (
+      vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+  uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh);
+  float64x2_t r1 = vmulq_f64 (s1, s1);
+  float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+  return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
+{
+  float64x2_t n, r, r2, s, y, z;
+  uint64x2_t cmp, u, e;
+
+#if WANT_SIMD_EXCEPT
+  /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+     special_case to fix special lanes later. This is only necessary if fenv
+     exceptions are to be triggered correctly.  */
+  float64x2_t xm = x;
+  uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+  cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound);
+  if (unlikely (v_any_u64 (cmp)))
+    x = vbslq_f64 (cmp, v_f64 (1), x);
+#else
+  cmp = vcagtq_f64 (x, data.special_bound);
+#endif
+
+  /* n = round(x/(ln2/N)).  */
+  z = vfmaq_f64 (data.shift, x, data.inv_ln2);
+  u = vreinterpretq_u64_f64 (z);
+  n = vsubq_f64 (z, data.shift);
+
+  /* r = x - n*ln2/N.  */
+  r = x;
+  r = vfmsq_f64 (r, data.ln2_hi, n);
+  r = vfmsq_f64 (r, data.ln2_lo, n);
+
+  e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+
+  /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4.  */
+  r2 = vmulq_f64 (r, r);
+  y = vfmaq_f64 (C (0), C (1), r);
+  y = vfmaq_f64 (y, C (2), r2);
+  y = vfmaq_f64 (r, y, r2);
+
+  /* s = 2^(n/N).  */
+  u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] };
+  s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+  if (unlikely (v_any_u64 (cmp)))
+#if WANT_SIMD_EXCEPT
+    return special_case (xm, vfmaq_f64 (s, y, s), cmp);
+#else
+    return special_case (s, y, n);
+#endif
+
+  return vfmaq_f64 (s, y, s);
+}
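
The double-precision exp is the standard table-driven scheme: with
n = round(x * N/ln2) (computed via the same shift trick), x = n*(ln2/N) + r
with |r| <= ln2/(2N), so exp(x) = 2^(n/N) * exp(r), where the low bits of n
index the shared __v_exp_data table, the high bits are folded into the
exponent field, and exp(r) needs only the short polynomial above. A scalar
sketch of the reduction (an illustration, not the committed code: exp2
stands in for the table lookup, a Taylor polynomial for the fitted one):

#include <math.h>
#include <stdio.h>

#define N 128   /* 1 << V_EXP_TABLE_BITS */

/* Demo-grade scalar model of the v_exp.c reduction.  */
static double
exp_sketch (double x)
{
  double n = rint (x * (N / M_LN2));    /* x = n*ln2/N + r */
  double r = x - n * (M_LN2 / N);       /* |r| <= ln2/(2N); the commit
                                           splits ln2/N into hi + lo */
  double poly = 1 + r + r * r / 2 + r * r * r / 6;  /* ~ exp(r) */
  return exp2 (n / N) * poly;   /* the commit reads 2^(n/N) from a table */
}

int
main (void)
{
  printf ("%.17g vs %.17g\n", exp_sketch (2.5), exp (2.5));
  return 0;
}
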
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c b/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c
new file mode 100644
index 000000000000..e402205e98e6
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c
@@ -0,0 +1,113 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+  float32x4_t poly[5];
+  uint32x4_t exponent_bias;
+#if !WANT_SIMD_EXCEPT
+  float32x4_t special_bound, scale_thresh;
+#endif
+} data = {
+  /* maxerr: 1.962 ulp.  */
+  .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
+	    V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
+  .exponent_bias = V4 (0x3f800000),
+#if !WANT_SIMD_EXCEPT
+  .special_bound = V4 (126.0f),
+  .scale_thresh = V4 (192.0f),
+#endif
+};
+
+#define C(i) d->poly[i]
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u32 (0x20000000)	  /* asuint (0x1p-63).  */
+# define BigBound v_u32 (0x42800000)	  /* asuint (0x1p6).  */
+# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound.  */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+  /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+     routine for special lanes.  */
+  return v_call_f32 (exp2f, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u32 (0x82000000)
+# define SpecialBias v_u32 (0x7f000000)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+	      float32x4_t scale, const struct data *d)
+{
+  /* 2^n may overflow, break it up into s1*s2.  */
+  uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
+  float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+  float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+  uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+  float32x4_t r2 = vmulq_f32 (s1, s1);
+  float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+  /* Similar to r1 but avoids double rounding in the subnormal range.  */
+  float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+  float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+  return vbslq_f32 (cmp2, r2, r);
+}
+
+#endif
+
+float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t n, r, r2, scale, p, q, poly;
+  uint32x4_t cmp, e;
+
+#if WANT_SIMD_EXCEPT
+  /* asuint(|x|) - TinyBound >= BigBound - TinyBound.  */
+  uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+  cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
+  float32x4_t xm = x;
+  /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+     special_case to fix special lanes later. This is only necessary if fenv
+     exceptions are to be triggered correctly.  */
+  if (unlikely (v_any_u32 (cmp)))
+    x = vbslq_f32 (cmp, v_f32 (1), x);
+#endif
+
+    /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+       x = n + r, with r in [-1/2, 1/2].  */
+  n = vrndaq_f32 (x);
+  r = vsubq_f32 (x, n);
+  e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
+  scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+#if !WANT_SIMD_EXCEPT
+  cmp = vcagtq_f32 (n, d->special_bound);
+#endif
+
+  r2 = vmulq_f32 (r, r);
+  p = vfmaq_f32 (C (1), C (0), r);
+  q = vfmaq_f32 (C (3), C (2), r);
+  q = vfmaq_f32 (q, p, r2);
+  p = vmulq_f32 (C (4), r);
+  poly = vfmaq_f32 (p, q, r2);
+
+  if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+    return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
+#else
+    return special_case (poly, n, e, cmp, scale, d);
+#endif
+
+  return vfmaq_f32 (scale, poly, scale);
+}
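
The interesting corner of v_exp2f.c is special_case: for n just past the
exponent range the scale 2^n is not a finite float even though
2^n * (1 + poly) may be, so the scale is split as 2^n = s1 * s2, with
s1 = 2^127 for n > 0 (or 2^-125 for n <= 0, selected via
SpecialOffset/SpecialBias), and the polynomial is applied to s2 first. A
standalone demo of the n > 0 side (my construction, not from the commit):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float
asfloat (uint32_t u)
{
  float f;
  memcpy (&f, &u, sizeof f);
  return f;
}

int
main (void)
{
  /* exp2f(127.5): n = 128, r = -0.5. The ideal scale 2^128 is not a
     finite float, but the result 2^127.5 is.  */
  uint32_t e = (uint32_t) 128 << 23;    /* unbiased exponent bits of 2^n */
  float poly = 0x1.6a09e6p-1f - 1.0f;   /* 2^r - 1 for r = -0.5 */

  float naive = asfloat (e + 0x3f800000);   /* "2^128" overflows to inf */
  float s1 = asfloat (0x7f000000);          /* SpecialBias: 2^127 */
  float s2 = asfloat (e);                   /* b = 0 for n > 0: 2^(n-127) */
  float split = s1 * (s2 + poly * s2);      /* finite: ~2.4e38 = 2^127.5 */

  printf ("naive scale = %g, split result = %g\n",
          (double) naive, (double) split);
  return 0;
}
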
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c b/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c
new file mode 100644
index 000000000000..ba6b02fbb4bc
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c
@@ -0,0 +1,72 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const float Poly[] = {
+  /*  maxerr: 0.878 ulp.  */
+  0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f
+};
+#define C0 v_f32 (Poly[0])
+#define C1 v_f32 (Poly[1])
+#define C2 v_f32 (Poly[2])
+#define C3 v_f32 (Poly[3])
+#define C4 v_f32 (Poly[4])
+#define C5 v_f32 (Poly[5])
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define Ln2hi v_f32 (0x1.62e4p-1f)
+#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn)
+{
+  /* 2^n may overflow, break it up into s1*s2.  */
+  uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
+  float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b);
+  float32x4_t s2 = vreinterpretq_f32_u32 (e - b);
+  uint32x4_t cmp = absn > v_f32 (192.0f);
+  float32x4_t r1 = s1 * s1;
+  float32x4_t r0 = poly * s1 * s2;
+  return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+				| (~cmp & vreinterpretq_u32_f32 (r0)));
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_exp2f_1u (float32x4_t x)
+{
*** 42574 LINES SKIPPED ***