git: bc8f866013bf - main - misc/pytorch: update 2.3.1 → 2.4.1

From: Yuri Victorovich <yuri_at_FreeBSD.org>
Date: Fri, 04 Oct 2024 09:15:31 UTC
The branch main has been updated by yuri:

URL: https://cgit.FreeBSD.org/ports/commit/?id=bc8f866013bfe38b5661ba7a4988dd47eeac3d4f

commit bc8f866013bfe38b5661ba7a4988dd47eeac3d4f
Author:     Yuri Victorovich <yuri@FreeBSD.org>
AuthorDate: 2024-10-04 09:15:10 +0000
Commit:     Yuri Victorovich <yuri@FreeBSD.org>
CommitDate: 2024-10-04 09:15:24 +0000

    misc/pytorch: update 2.3.1 → 2.4.1
---
 misc/pytorch/Makefile                              |   3 +-
 misc/pytorch/distinfo                              |   6 +-
 misc/pytorch/files/patch-CMakeLists.txt            |  24 +-
 ...aten_src_ATen_cpu_vec_vec256_vec256__bfloat16.h |  11 -
 ...aten_src_ATen_cpu_vec_vec512_vec512__bfloat16.h |  11 -
 .../files/patch-caffe2_proto_CMakeLists.txt        |   8 -
 misc/pytorch/pkg-plist                             | 855 ++++-----------------
 7 files changed, 181 insertions(+), 737 deletions(-)

diff --git a/misc/pytorch/Makefile b/misc/pytorch/Makefile
index 508353364222..fc2e2e49f8c7 100644
--- a/misc/pytorch/Makefile
+++ b/misc/pytorch/Makefile
@@ -1,7 +1,6 @@
 PORTNAME=	pytorch
 DISTVERSIONPREFIX=	v
-DISTVERSION=	2.3.1
-PORTREVISION=	6
+DISTVERSION=	2.4.1
 CATEGORIES=	misc # machine-learning
 MASTER_SITES=	https://github.com/pytorch/pytorch/releases/download/v${DISTVERSION}/
 DIST_SUBDIR=	${PORTNAME}
diff --git a/misc/pytorch/distinfo b/misc/pytorch/distinfo
index d0547799e54e..084af2ec13c7 100644
--- a/misc/pytorch/distinfo
+++ b/misc/pytorch/distinfo
@@ -1,3 +1,3 @@
-TIMESTAMP = 1718421830
-SHA256 (pytorch/pytorch-v2.3.1.tar.gz) = 6c66b59345091907cd62a693b647cee224558e7f15a9b04f4f322f4f6ffeb75b
-SIZE (pytorch/pytorch-v2.3.1.tar.gz) = 277997681
+TIMESTAMP = 1727986762
+SHA256 (pytorch/pytorch-v2.4.1.tar.gz) = 39666a43c0c10f5fd46c1a7ca95dc74d3bc39de2678b70066481cbf02e58384f
+SIZE (pytorch/pytorch-v2.4.1.tar.gz) = 296932555
diff --git a/misc/pytorch/files/patch-CMakeLists.txt b/misc/pytorch/files/patch-CMakeLists.txt
index 078637ac581d..4018e370063c 100644
--- a/misc/pytorch/files/patch-CMakeLists.txt
+++ b/misc/pytorch/files/patch-CMakeLists.txt
@@ -1,6 +1,6 @@
---- CMakeLists.txt.orig	2024-01-31 00:58:01 UTC
+--- CMakeLists.txt.orig	2024-09-04 20:01:18 UTC
 +++ CMakeLists.txt
-@@ -145,7 +145,7 @@ set(CPU_INTEL OFF)
+@@ -181,7 +181,7 @@ set(CPU_INTEL OFF)
  set(CPU_AARCH64 OFF)
  set(CPU_INTEL OFF)
  
@@ -9,16 +9,16 @@
    set(CPU_INTEL ON)
  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64)")
    set(CPU_AARCH64 ON)
-@@ -170,7 +170,7 @@ option(BUILD_DOCS "Build Caffe2 documentation" OFF)
- option(ATEN_NO_TEST "Do not build ATen test binaries" OFF)
+@@ -210,7 +210,7 @@ option(BUILD_CUSTOM_PROTOBUF
  option(BUILD_BINARY "Build C++ binaries" OFF)
  option(BUILD_DOCS "Build Caffe2 documentation" OFF)
--option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON)
-+option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" OFF)
+ option(BUILD_CUSTOM_PROTOBUF
+-       "Build and use Caffe2's own protobuf under third_party" ON)
++       "Build and use Caffe2's own protobuf under third_party" OFF)
  option(BUILD_PYTHON "Build Python binaries" ON)
- option(BUILD_CAFFE2 "Master flag to build Caffe2" OFF)
  option(BUILD_LITE_INTERPRETER "Master flag to build Lite Interpreter" OFF)
-@@ -405,15 +405,15 @@ option(USE_SYSTEM_CPUINFO "Use system-provided cpuinfo
+ option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON)
+@@ -451,15 +451,15 @@ option(USE_SYSTEM_CPUINFO "Use system-provided cpuinfo
  # USE_SYSTEM_LIBS being "OFF".
  option(USE_SYSTEM_LIBS "Use all available system-provided libraries." OFF)
  option(USE_SYSTEM_CPUINFO "Use system-provided cpuinfo." OFF)
@@ -35,9 +35,9 @@
 -option(USE_SYSTEM_ONNX "Use system-provided onnx." OFF)
 +option(USE_SYSTEM_ONNX "Use system-provided onnx." ON)
  option(USE_SYSTEM_XNNPACK "Use system-provided xnnpack." OFF)
- option(USE_SYSTEM_ZSTD "Use system-provided zstd." OFF)
  option(USE_GOLD_LINKER "Use ld.gold to link" OFF)
-@@ -838,11 +838,11 @@ if(NOT MSVC)
+ if(USE_SYSTEM_LIBS)
+@@ -971,11 +971,11 @@ if(NOT MSVC)
    # Details at http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1459
    string(APPEND CMAKE_CXX_FLAGS " -Wall")
    string(APPEND CMAKE_CXX_FLAGS " -Wextra")
@@ -52,5 +52,5 @@
 +  #append_cxx_flag_if_supported("-Werror=range-loop-construct" CMAKE_CXX_FLAGS)
 +  #append_cxx_flag_if_supported("-Werror=bool-operation" CMAKE_CXX_FLAGS)
    append_cxx_flag_if_supported("-Wnarrowing" CMAKE_CXX_FLAGS)
-   append_cxx_flag_if_supported("-Wno-missing-field-initializers" CMAKE_CXX_FLAGS)
-   append_cxx_flag_if_supported("-Wno-type-limits" CMAKE_CXX_FLAGS)
+   append_cxx_flag_if_supported("-Wno-missing-field-initializers"
+                                CMAKE_CXX_FLAGS)
diff --git a/misc/pytorch/files/patch-aten_src_ATen_cpu_vec_vec256_vec256__bfloat16.h b/misc/pytorch/files/patch-aten_src_ATen_cpu_vec_vec256_vec256__bfloat16.h
deleted file mode 100644
index e03ac51d837b..000000000000
--- a/misc/pytorch/files/patch-aten_src_ATen_cpu_vec_vec256_vec256__bfloat16.h
+++ /dev/null
@@ -1,11 +0,0 @@
---- aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h.orig	2024-03-27 22:28:51 UTC
-+++ aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
-@@ -265,7 +265,7 @@ static_assert( (public)
-     }
-     return b;
-   }
--  Vectorized<T> map(const __m256 (*const vop)(__m256)) const {
-+  Vectorized<T> map(__m256 (*const vop)(__m256)) const {
-     __m256 lo, hi;
-     cvt_to_fp32<T>(values, lo, hi);
-     const auto o1 = vop(lo);
diff --git a/misc/pytorch/files/patch-aten_src_ATen_cpu_vec_vec512_vec512__bfloat16.h b/misc/pytorch/files/patch-aten_src_ATen_cpu_vec_vec512_vec512__bfloat16.h
deleted file mode 100644
index 80e0b1832434..000000000000
--- a/misc/pytorch/files/patch-aten_src_ATen_cpu_vec_vec512_vec512__bfloat16.h
+++ /dev/null
@@ -1,11 +0,0 @@
---- aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h.orig	2023-10-12 12:54:40 UTC
-+++ aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
-@@ -345,7 +345,7 @@ static_assert( (public)
-   }
-   #pragma clang diagnostic push
-   #pragma clang diagnostic ignored "-Wignored-qualifiers"
--  Vectorized<T> map(const __m512 (*const vop)(__m512)) const {
-+  Vectorized<T> map(__m512 (*const vop)(__m512)) const {
-     __m512 lo, hi;
-     cvt_to_fp32<T>(values, lo, hi);
-     const auto o1 = vop(lo);
diff --git a/misc/pytorch/files/patch-caffe2_proto_CMakeLists.txt b/misc/pytorch/files/patch-caffe2_proto_CMakeLists.txt
deleted file mode 100644
index 1df1849c0556..000000000000
--- a/misc/pytorch/files/patch-caffe2_proto_CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
---- caffe2/proto/CMakeLists.txt.orig	2023-05-08 19:58:16 UTC
-+++ caffe2/proto/CMakeLists.txt
-@@ -1,3 +1,5 @@
-+set(CMAKE_CXX_STANDARD 17)
-+
- if(BUILD_CAFFE2)
-   file(GLOB Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.proto")
- else()
diff --git a/misc/pytorch/pkg-plist b/misc/pytorch/pkg-plist
index d54b7b628740..8bd2a42d05ce 100644
--- a/misc/pytorch/pkg-plist
+++ b/misc/pytorch/pkg-plist
@@ -4,6 +4,7 @@ include/ATen/AccumulateType.h
 include/ATen/ArrayRef.h
 include/ATen/Backend.h
 include/ATen/Backtrace.h
+include/ATen/BlasBackend.h
 include/ATen/CPUApplyUtils.h
 include/ATen/CPUFixedAllocator.h
 include/ATen/CPUFunctions.h
@@ -70,7 +71,6 @@ include/ATen/Parallel-inl.h
 include/ATen/Parallel.h
 include/ATen/ParallelFuture.h
 include/ATen/ParallelNative.h
-include/ATen/ParallelNativeTBB.h
 include/ATen/ParallelOpenMP.h
 include/ATen/PythonTorchFunctionTLS.h
 include/ATen/RedispatchFunctions.h
@@ -116,6 +116,7 @@ include/ATen/core/ATen_fwd.h
 include/ATen/core/ATen_pch.h
 include/ATen/core/Array.h
 include/ATen/core/Backtrace.h
+include/ATen/core/CachingHostAllocator.h
 include/ATen/core/CheckMemoryFormat.h
 include/ATen/core/DeprecatedTypeProperties.h
 include/ATen/core/DeprecatedTypePropertiesRegistry.h
@@ -218,10 +219,13 @@ include/ATen/cpu/vec/vec256/vec256.h
 include/ATen/cpu/vec/vec256/vec256_bfloat16.h
 include/ATen/cpu/vec/vec256/vec256_complex_double.h
 include/ATen/cpu/vec/vec256/vec256_complex_float.h
+include/ATen/cpu/vec/vec256/vec256_convert.h
 include/ATen/cpu/vec/vec256/vec256_double.h
 include/ATen/cpu/vec/vec256/vec256_float.h
 include/ATen/cpu/vec/vec256/vec256_float_neon.h
+include/ATen/cpu/vec/vec256/vec256_half_neon.h
 include/ATen/cpu/vec/vec256/vec256_int.h
+include/ATen/cpu/vec/vec256/vec256_mask.h
 include/ATen/cpu/vec/vec256/vec256_qint.h
 include/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h
 include/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h
@@ -241,12 +245,16 @@ include/ATen/cpu/vec/vec512/vec512.h
 include/ATen/cpu/vec/vec512/vec512_bfloat16.h
 include/ATen/cpu/vec/vec512/vec512_complex_double.h
 include/ATen/cpu/vec/vec512/vec512_complex_float.h
+include/ATen/cpu/vec/vec512/vec512_convert.h
 include/ATen/cpu/vec/vec512/vec512_double.h
 include/ATen/cpu/vec/vec512/vec512_float.h
 include/ATen/cpu/vec/vec512/vec512_int.h
+include/ATen/cpu/vec/vec512/vec512_mask.h
 include/ATen/cpu/vec/vec512/vec512_qint.h
 include/ATen/cpu/vec/vec_base.h
+include/ATen/cpu/vec/vec_convert.h
 include/ATen/cpu/vec/vec_half.h
+include/ATen/cpu/vec/vec_mask.h
 include/ATen/cpu/vec/vec_n.h
 include/ATen/cpu/vml.h
 include/ATen/cuda/ATenCUDAGeneral.h
@@ -315,9 +323,9 @@ include/ATen/detail/CUDAHooksInterface.h
 include/ATen/detail/FunctionTraits.h
 include/ATen/detail/HIPHooksInterface.h
 include/ATen/detail/IPUHooksInterface.h
+include/ATen/detail/MAIAHooksInterface.h
 include/ATen/detail/MPSHooksInterface.h
 include/ATen/detail/MTIAHooksInterface.h
-include/ATen/detail/ORTHooksInterface.h
 include/ATen/detail/PrivateUse1HooksInterface.h
 include/ATen/detail/XPUHooksInterface.h
 include/ATen/div_rtn.h
@@ -385,6 +393,9 @@ include/ATen/native/Fill.h
 include/ATen/native/ForeachUtils.h
 include/ATen/native/FractionalMaxPooling.h
 include/ATen/native/FunctionOfAMatrixUtils.h
+include/ATen/native/FusedAdagrad.h
+include/ATen/native/FusedAdam.h
+include/ATen/native/FusedSGD.h
 include/ATen/native/GridSampler.h
 include/ATen/native/GridSamplerUtils.h
 include/ATen/native/Histogram.h
@@ -572,6 +583,51 @@ include/ATen/native/quantized/cpu/init_qnnpack.h
 include/ATen/native/quantized/cpu/qembeddingbag.h
 include/ATen/native/quantized/cpu/qembeddingbag_prepack.h
 include/ATen/native/quantized/cudnn/utils.h
+include/ATen/native/transformers/attention.h
+include/ATen/native/transformers/cuda/flash_attn/alibi.h
+include/ATen/native/transformers/cuda/flash_attn/block_info.h
+include/ATen/native/transformers/cuda/flash_attn/dropout.h
+include/ATen/native/transformers/cuda/flash_attn/flash.h
+include/ATen/native/transformers/cuda/flash_attn/flash_api.h
+include/ATen/native/transformers/cuda/flash_attn/flash_bwd_kernel.h
+include/ATen/native/transformers/cuda/flash_attn/flash_bwd_launch_template.h
+include/ATen/native/transformers/cuda/flash_attn/flash_bwd_preprocess_kernel.h
+include/ATen/native/transformers/cuda/flash_attn/flash_fwd_kernel.h
+include/ATen/native/transformers/cuda/flash_attn/flash_fwd_launch_template.h
+include/ATen/native/transformers/cuda/flash_attn/kernel_traits.h
+include/ATen/native/transformers/cuda/flash_attn/mask.h
+include/ATen/native/transformers/cuda/flash_attn/rotary.h
+include/ATen/native/transformers/cuda/flash_attn/softmax.h
+include/ATen/native/transformers/cuda/flash_attn/static_switch.h
+include/ATen/native/transformers/cuda/flash_attn/utils.h
+include/ATen/native/transformers/cuda/mem_eff_attention/debug_utils.h
+include/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_pipelined.h
+include/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_rescale_output.h
+include/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h
+include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma.h
+include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_base.h
+include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_multistage.h
+include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_pipelined.h
+include/ATen/native/transformers/cuda/mem_eff_attention/gemm/find_default_mma.h
+include/ATen/native/transformers/cuda/mem_eff_attention/gemm/mma_accum_lambda_iterator.h
+include/ATen/native/transformers/cuda/mem_eff_attention/gemm/mma_from_smem.h
+include/ATen/native/transformers/cuda/mem_eff_attention/gemm_kernel_utils.h
+include/ATen/native/transformers/cuda/mem_eff_attention/iterators/default_warp_iterator_from_smem.h
+include/ATen/native/transformers/cuda/mem_eff_attention/iterators/epilogue_predicated_tile_iterator.h
+include/ATen/native/transformers/cuda/mem_eff_attention/iterators/make_residual_last.h
+include/ATen/native/transformers/cuda/mem_eff_attention/iterators/predicated_tile_access_iterator_residual_last.h
+include/ATen/native/transformers/cuda/mem_eff_attention/iterators/predicated_tile_iterator_residual_last.h
+include/ATen/native/transformers/cuda/mem_eff_attention/iterators/transpose_warp_iterator.h
+include/ATen/native/transformers/cuda/mem_eff_attention/iterators/warp_iterator_from_smem.h
+include/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h
+include/ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h
+include/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassB.h
+include/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassF.h
+include/ATen/native/transformers/cuda/mem_eff_attention/pytorch_utils.h
+include/ATen/native/transformers/cuda/mem_eff_attention/transform/tile_smem_loader.h
+include/ATen/native/transformers/cuda/sdp_utils.h
+include/ATen/native/transformers/hip/aotriton_adapter.h
+include/ATen/native/transformers/sdp_utils_cpp.h
 include/ATen/native/utils/Factory.h
 include/ATen/native/utils/ParamUtils.h
 include/ATen/native/utils/ParamsHash.h
@@ -671,6 +727,16 @@ include/ATen/ops/_batch_norm_impl_index_backward_ops.h
 include/ATen/ops/_batch_norm_impl_index_compositeimplicitautograd_dispatch.h
 include/ATen/ops/_batch_norm_impl_index_native.h
 include/ATen/ops/_batch_norm_impl_index_ops.h
+include/ATen/ops/_batch_norm_no_update.h
+include/ATen/ops/_batch_norm_no_update_compositeexplicitautograd_dispatch.h
+include/ATen/ops/_batch_norm_no_update_native.h
+include/ATen/ops/_batch_norm_no_update_ops.h
+include/ATen/ops/_batch_norm_with_update.h
+include/ATen/ops/_batch_norm_with_update_compositeexplicitautograd_dispatch.h
+include/ATen/ops/_batch_norm_with_update_cpu_dispatch.h
+include/ATen/ops/_batch_norm_with_update_cuda_dispatch.h
+include/ATen/ops/_batch_norm_with_update_native.h
+include/ATen/ops/_batch_norm_with_update_ops.h
 include/ATen/ops/_cast_Byte.h
 include/ATen/ops/_cast_Byte_compositeimplicitautograd_dispatch.h
 include/ATen/ops/_cast_Byte_native.h
@@ -727,6 +793,7 @@ include/ATen/ops/_choose_qparams_per_tensor_native.h
 include/ATen/ops/_choose_qparams_per_tensor_ops.h
 include/ATen/ops/_chunk_cat.h
 include/ATen/ops/_chunk_cat_compositeexplicitautograd_dispatch.h
+include/ATen/ops/_chunk_cat_cuda_dispatch.h
 include/ATen/ops/_chunk_cat_native.h
 include/ATen/ops/_chunk_cat_ops.h
 include/ATen/ops/_coalesce.h
@@ -1022,265 +1089,226 @@ include/ATen/ops/_foobar_native.h
 include/ATen/ops/_foobar_ops.h
 include/ATen/ops/_foreach_abs.h
 include/ATen/ops/_foreach_abs_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_abs_cpu_dispatch.h
 include/ATen/ops/_foreach_abs_cuda_dispatch.h
 include/ATen/ops/_foreach_abs_native.h
 include/ATen/ops/_foreach_abs_ops.h
 include/ATen/ops/_foreach_acos.h
 include/ATen/ops/_foreach_acos_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_acos_cpu_dispatch.h
 include/ATen/ops/_foreach_acos_cuda_dispatch.h
 include/ATen/ops/_foreach_acos_native.h
 include/ATen/ops/_foreach_acos_ops.h
 include/ATen/ops/_foreach_add.h
 include/ATen/ops/_foreach_add_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_add_cpu_dispatch.h
 include/ATen/ops/_foreach_add_cuda_dispatch.h
 include/ATen/ops/_foreach_add_native.h
 include/ATen/ops/_foreach_add_ops.h
 include/ATen/ops/_foreach_addcdiv.h
 include/ATen/ops/_foreach_addcdiv_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_addcdiv_cpu_dispatch.h
 include/ATen/ops/_foreach_addcdiv_cuda_dispatch.h
 include/ATen/ops/_foreach_addcdiv_native.h
 include/ATen/ops/_foreach_addcdiv_ops.h
 include/ATen/ops/_foreach_addcmul.h
 include/ATen/ops/_foreach_addcmul_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_addcmul_cpu_dispatch.h
 include/ATen/ops/_foreach_addcmul_cuda_dispatch.h
 include/ATen/ops/_foreach_addcmul_native.h
 include/ATen/ops/_foreach_addcmul_ops.h
 include/ATen/ops/_foreach_asin.h
 include/ATen/ops/_foreach_asin_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_asin_cpu_dispatch.h
 include/ATen/ops/_foreach_asin_cuda_dispatch.h
 include/ATen/ops/_foreach_asin_native.h
 include/ATen/ops/_foreach_asin_ops.h
 include/ATen/ops/_foreach_atan.h
 include/ATen/ops/_foreach_atan_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_atan_cpu_dispatch.h
 include/ATen/ops/_foreach_atan_cuda_dispatch.h
 include/ATen/ops/_foreach_atan_native.h
 include/ATen/ops/_foreach_atan_ops.h
 include/ATen/ops/_foreach_ceil.h
 include/ATen/ops/_foreach_ceil_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_ceil_cpu_dispatch.h
 include/ATen/ops/_foreach_ceil_cuda_dispatch.h
 include/ATen/ops/_foreach_ceil_native.h
 include/ATen/ops/_foreach_ceil_ops.h
 include/ATen/ops/_foreach_clamp_max.h
 include/ATen/ops/_foreach_clamp_max_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_clamp_max_cpu_dispatch.h
 include/ATen/ops/_foreach_clamp_max_cuda_dispatch.h
 include/ATen/ops/_foreach_clamp_max_native.h
 include/ATen/ops/_foreach_clamp_max_ops.h
 include/ATen/ops/_foreach_clamp_min.h
 include/ATen/ops/_foreach_clamp_min_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_clamp_min_cpu_dispatch.h
 include/ATen/ops/_foreach_clamp_min_cuda_dispatch.h
 include/ATen/ops/_foreach_clamp_min_native.h
 include/ATen/ops/_foreach_clamp_min_ops.h
 include/ATen/ops/_foreach_copy.h
 include/ATen/ops/_foreach_copy_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_copy_cpu_dispatch.h
 include/ATen/ops/_foreach_copy_cuda_dispatch.h
 include/ATen/ops/_foreach_copy_native.h
 include/ATen/ops/_foreach_copy_ops.h
 include/ATen/ops/_foreach_cos.h
 include/ATen/ops/_foreach_cos_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_cos_cpu_dispatch.h
 include/ATen/ops/_foreach_cos_cuda_dispatch.h
 include/ATen/ops/_foreach_cos_native.h
 include/ATen/ops/_foreach_cos_ops.h
 include/ATen/ops/_foreach_cosh.h
 include/ATen/ops/_foreach_cosh_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_cosh_cpu_dispatch.h
 include/ATen/ops/_foreach_cosh_cuda_dispatch.h
 include/ATen/ops/_foreach_cosh_native.h
 include/ATen/ops/_foreach_cosh_ops.h
 include/ATen/ops/_foreach_div.h
 include/ATen/ops/_foreach_div_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_div_cpu_dispatch.h
 include/ATen/ops/_foreach_div_cuda_dispatch.h
 include/ATen/ops/_foreach_div_native.h
 include/ATen/ops/_foreach_div_ops.h
 include/ATen/ops/_foreach_erf.h
 include/ATen/ops/_foreach_erf_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_erf_cpu_dispatch.h
 include/ATen/ops/_foreach_erf_cuda_dispatch.h
 include/ATen/ops/_foreach_erf_native.h
 include/ATen/ops/_foreach_erf_ops.h
 include/ATen/ops/_foreach_erfc.h
 include/ATen/ops/_foreach_erfc_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_erfc_cpu_dispatch.h
 include/ATen/ops/_foreach_erfc_cuda_dispatch.h
 include/ATen/ops/_foreach_erfc_native.h
 include/ATen/ops/_foreach_erfc_ops.h
 include/ATen/ops/_foreach_exp.h
 include/ATen/ops/_foreach_exp_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_exp_cpu_dispatch.h
 include/ATen/ops/_foreach_exp_cuda_dispatch.h
 include/ATen/ops/_foreach_exp_native.h
 include/ATen/ops/_foreach_exp_ops.h
 include/ATen/ops/_foreach_expm1.h
 include/ATen/ops/_foreach_expm1_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_expm1_cpu_dispatch.h
 include/ATen/ops/_foreach_expm1_cuda_dispatch.h
 include/ATen/ops/_foreach_expm1_native.h
 include/ATen/ops/_foreach_expm1_ops.h
 include/ATen/ops/_foreach_floor.h
 include/ATen/ops/_foreach_floor_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_floor_cpu_dispatch.h
 include/ATen/ops/_foreach_floor_cuda_dispatch.h
 include/ATen/ops/_foreach_floor_native.h
 include/ATen/ops/_foreach_floor_ops.h
 include/ATen/ops/_foreach_frac.h
 include/ATen/ops/_foreach_frac_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_frac_cpu_dispatch.h
 include/ATen/ops/_foreach_frac_cuda_dispatch.h
 include/ATen/ops/_foreach_frac_native.h
 include/ATen/ops/_foreach_frac_ops.h
 include/ATen/ops/_foreach_lerp.h
 include/ATen/ops/_foreach_lerp_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_lerp_cpu_dispatch.h
 include/ATen/ops/_foreach_lerp_cuda_dispatch.h
 include/ATen/ops/_foreach_lerp_native.h
 include/ATen/ops/_foreach_lerp_ops.h
 include/ATen/ops/_foreach_lgamma.h
 include/ATen/ops/_foreach_lgamma_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_lgamma_cpu_dispatch.h
 include/ATen/ops/_foreach_lgamma_cuda_dispatch.h
 include/ATen/ops/_foreach_lgamma_native.h
 include/ATen/ops/_foreach_lgamma_ops.h
 include/ATen/ops/_foreach_log.h
 include/ATen/ops/_foreach_log10.h
 include/ATen/ops/_foreach_log10_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_log10_cpu_dispatch.h
 include/ATen/ops/_foreach_log10_cuda_dispatch.h
 include/ATen/ops/_foreach_log10_native.h
 include/ATen/ops/_foreach_log10_ops.h
 include/ATen/ops/_foreach_log1p.h
 include/ATen/ops/_foreach_log1p_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_log1p_cpu_dispatch.h
 include/ATen/ops/_foreach_log1p_cuda_dispatch.h
 include/ATen/ops/_foreach_log1p_native.h
 include/ATen/ops/_foreach_log1p_ops.h
 include/ATen/ops/_foreach_log2.h
 include/ATen/ops/_foreach_log2_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_log2_cpu_dispatch.h
 include/ATen/ops/_foreach_log2_cuda_dispatch.h
 include/ATen/ops/_foreach_log2_native.h
 include/ATen/ops/_foreach_log2_ops.h
 include/ATen/ops/_foreach_log_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_log_cpu_dispatch.h
 include/ATen/ops/_foreach_log_cuda_dispatch.h
 include/ATen/ops/_foreach_log_native.h
 include/ATen/ops/_foreach_log_ops.h
+include/ATen/ops/_foreach_max.h
+include/ATen/ops/_foreach_max_compositeexplicitautograd_dispatch.h
+include/ATen/ops/_foreach_max_cuda_dispatch.h
+include/ATen/ops/_foreach_max_native.h
+include/ATen/ops/_foreach_max_ops.h
 include/ATen/ops/_foreach_maximum.h
 include/ATen/ops/_foreach_maximum_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_maximum_cpu_dispatch.h
 include/ATen/ops/_foreach_maximum_cuda_dispatch.h
 include/ATen/ops/_foreach_maximum_native.h
 include/ATen/ops/_foreach_maximum_ops.h
 include/ATen/ops/_foreach_minimum.h
 include/ATen/ops/_foreach_minimum_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_minimum_cpu_dispatch.h
 include/ATen/ops/_foreach_minimum_cuda_dispatch.h
 include/ATen/ops/_foreach_minimum_native.h
 include/ATen/ops/_foreach_minimum_ops.h
 include/ATen/ops/_foreach_mul.h
 include/ATen/ops/_foreach_mul_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_mul_cpu_dispatch.h
 include/ATen/ops/_foreach_mul_cuda_dispatch.h
 include/ATen/ops/_foreach_mul_native.h
 include/ATen/ops/_foreach_mul_ops.h
 include/ATen/ops/_foreach_neg.h
 include/ATen/ops/_foreach_neg_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_neg_cpu_dispatch.h
 include/ATen/ops/_foreach_neg_cuda_dispatch.h
 include/ATen/ops/_foreach_neg_native.h
 include/ATen/ops/_foreach_neg_ops.h
 include/ATen/ops/_foreach_norm.h
 include/ATen/ops/_foreach_norm_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_norm_cpu_dispatch.h
 include/ATen/ops/_foreach_norm_cuda_dispatch.h
 include/ATen/ops/_foreach_norm_native.h
 include/ATen/ops/_foreach_norm_ops.h
 include/ATen/ops/_foreach_pow.h
 include/ATen/ops/_foreach_pow_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_pow_cpu_dispatch.h
 include/ATen/ops/_foreach_pow_cuda_dispatch.h
 include/ATen/ops/_foreach_pow_native.h
 include/ATen/ops/_foreach_pow_ops.h
 include/ATen/ops/_foreach_reciprocal.h
 include/ATen/ops/_foreach_reciprocal_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_reciprocal_cpu_dispatch.h
 include/ATen/ops/_foreach_reciprocal_cuda_dispatch.h
 include/ATen/ops/_foreach_reciprocal_native.h
 include/ATen/ops/_foreach_reciprocal_ops.h
 include/ATen/ops/_foreach_round.h
 include/ATen/ops/_foreach_round_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_round_cpu_dispatch.h
 include/ATen/ops/_foreach_round_cuda_dispatch.h
 include/ATen/ops/_foreach_round_native.h
 include/ATen/ops/_foreach_round_ops.h
 include/ATen/ops/_foreach_sigmoid.h
 include/ATen/ops/_foreach_sigmoid_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_sigmoid_cpu_dispatch.h
 include/ATen/ops/_foreach_sigmoid_cuda_dispatch.h
 include/ATen/ops/_foreach_sigmoid_native.h
 include/ATen/ops/_foreach_sigmoid_ops.h
 include/ATen/ops/_foreach_sign.h
 include/ATen/ops/_foreach_sign_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_sign_cpu_dispatch.h
 include/ATen/ops/_foreach_sign_cuda_dispatch.h
 include/ATen/ops/_foreach_sign_native.h
 include/ATen/ops/_foreach_sign_ops.h
 include/ATen/ops/_foreach_sin.h
 include/ATen/ops/_foreach_sin_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_sin_cpu_dispatch.h
 include/ATen/ops/_foreach_sin_cuda_dispatch.h
 include/ATen/ops/_foreach_sin_native.h
 include/ATen/ops/_foreach_sin_ops.h
 include/ATen/ops/_foreach_sinh.h
 include/ATen/ops/_foreach_sinh_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_sinh_cpu_dispatch.h
 include/ATen/ops/_foreach_sinh_cuda_dispatch.h
 include/ATen/ops/_foreach_sinh_native.h
 include/ATen/ops/_foreach_sinh_ops.h
 include/ATen/ops/_foreach_sqrt.h
 include/ATen/ops/_foreach_sqrt_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_sqrt_cpu_dispatch.h
 include/ATen/ops/_foreach_sqrt_cuda_dispatch.h
 include/ATen/ops/_foreach_sqrt_native.h
 include/ATen/ops/_foreach_sqrt_ops.h
 include/ATen/ops/_foreach_sub.h
 include/ATen/ops/_foreach_sub_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_sub_cpu_dispatch.h
 include/ATen/ops/_foreach_sub_cuda_dispatch.h
 include/ATen/ops/_foreach_sub_native.h
 include/ATen/ops/_foreach_sub_ops.h
 include/ATen/ops/_foreach_tan.h
 include/ATen/ops/_foreach_tan_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_tan_cpu_dispatch.h
 include/ATen/ops/_foreach_tan_cuda_dispatch.h
 include/ATen/ops/_foreach_tan_native.h
 include/ATen/ops/_foreach_tan_ops.h
 include/ATen/ops/_foreach_tanh.h
 include/ATen/ops/_foreach_tanh_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_tanh_cpu_dispatch.h
 include/ATen/ops/_foreach_tanh_cuda_dispatch.h
 include/ATen/ops/_foreach_tanh_native.h
 include/ATen/ops/_foreach_tanh_ops.h
 include/ATen/ops/_foreach_trunc.h
 include/ATen/ops/_foreach_trunc_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_trunc_cpu_dispatch.h
 include/ATen/ops/_foreach_trunc_cuda_dispatch.h
 include/ATen/ops/_foreach_trunc_native.h
 include/ATen/ops/_foreach_trunc_ops.h
 include/ATen/ops/_foreach_zero.h
 include/ATen/ops/_foreach_zero_compositeexplicitautograd_dispatch.h
-include/ATen/ops/_foreach_zero_cpu_dispatch.h
 include/ATen/ops/_foreach_zero_cuda_dispatch.h
 include/ATen/ops/_foreach_zero_native.h
 include/ATen/ops/_foreach_zero_ops.h
@@ -1300,13 +1328,20 @@ include/ATen/ops/_functional_sym_constrain_range_for_size_native.h
 include/ATen/ops/_functional_sym_constrain_range_for_size_ops.h
 include/ATen/ops/_functional_sym_constrain_range_native.h
 include/ATen/ops/_functional_sym_constrain_range_ops.h
+include/ATen/ops/_fused_adagrad.h
+include/ATen/ops/_fused_adagrad_compositeexplicitautograd_dispatch.h
+include/ATen/ops/_fused_adagrad_cpu_dispatch.h
+include/ATen/ops/_fused_adagrad_native.h
+include/ATen/ops/_fused_adagrad_ops.h
 include/ATen/ops/_fused_adam.h
 include/ATen/ops/_fused_adam_compositeexplicitautograd_dispatch.h
+include/ATen/ops/_fused_adam_cpu_dispatch.h
 include/ATen/ops/_fused_adam_cuda_dispatch.h
 include/ATen/ops/_fused_adam_native.h
 include/ATen/ops/_fused_adam_ops.h
 include/ATen/ops/_fused_adamw.h
 include/ATen/ops/_fused_adamw_compositeexplicitautograd_dispatch.h
+include/ATen/ops/_fused_adamw_cpu_dispatch.h
 include/ATen/ops/_fused_adamw_cuda_dispatch.h
 include/ATen/ops/_fused_adamw_native.h
 include/ATen/ops/_fused_adamw_ops.h
@@ -1329,6 +1364,7 @@ include/ATen/ops/_fused_sdp_choice_native.h
 include/ATen/ops/_fused_sdp_choice_ops.h
 include/ATen/ops/_fused_sgd.h
 include/ATen/ops/_fused_sgd_compositeexplicitautograd_dispatch.h
+include/ATen/ops/_fused_sgd_cpu_dispatch.h
 include/ATen/ops/_fused_sgd_cuda_dispatch.h
 include/ATen/ops/_fused_sgd_native.h
 include/ATen/ops/_fused_sgd_ops.h
@@ -1392,6 +1428,7 @@ include/ATen/ops/_indices_copy_ops.h
 include/ATen/ops/_indices_native.h
 include/ATen/ops/_indices_ops.h
 include/ATen/ops/_int_mm.h
+include/ATen/ops/_int_mm_cpu_dispatch.h
 include/ATen/ops/_int_mm_cuda_dispatch.h
 include/ATen/ops/_int_mm_native.h
 include/ATen/ops/_int_mm_ops.h
@@ -1407,6 +1444,10 @@ include/ATen/ops/_is_zerotensor.h
 include/ATen/ops/_is_zerotensor_compositeimplicitautograd_dispatch.h
 include/ATen/ops/_is_zerotensor_native.h
 include/ATen/ops/_is_zerotensor_ops.h
+include/ATen/ops/_jagged_to_padded_dense_forward.h
+include/ATen/ops/_jagged_to_padded_dense_forward_cuda_dispatch.h
+include/ATen/ops/_jagged_to_padded_dense_forward_native.h
+include/ATen/ops/_jagged_to_padded_dense_forward_ops.h
 include/ATen/ops/_lazy_clone.h
 include/ATen/ops/_lazy_clone_compositeexplicitautograd_dispatch.h
 include/ATen/ops/_lazy_clone_native.h
@@ -1582,6 +1623,11 @@ include/ATen/ops/_neg_view_copy_native.h
 include/ATen/ops/_neg_view_copy_ops.h
 include/ATen/ops/_neg_view_native.h
 include/ATen/ops/_neg_view_ops.h
+include/ATen/ops/_nested_compute_contiguous_strides_offsets.h
+include/ATen/ops/_nested_compute_contiguous_strides_offsets_cpu_dispatch.h
+include/ATen/ops/_nested_compute_contiguous_strides_offsets_cuda_dispatch.h
+include/ATen/ops/_nested_compute_contiguous_strides_offsets_native.h
+include/ATen/ops/_nested_compute_contiguous_strides_offsets_ops.h
 include/ATen/ops/_nested_from_padded.h
 include/ATen/ops/_nested_from_padded_and_nested_example.h
 include/ATen/ops/_nested_from_padded_and_nested_example_compositeexplicitautograd_dispatch.h
@@ -1701,6 +1747,10 @@ include/ATen/ops/_pad_packed_sequence.h
 include/ATen/ops/_pad_packed_sequence_compositeimplicitautograd_dispatch.h
 include/ATen/ops/_pad_packed_sequence_native.h
 include/ATen/ops/_pad_packed_sequence_ops.h
+include/ATen/ops/_padded_dense_to_jagged_forward.h
+include/ATen/ops/_padded_dense_to_jagged_forward_cuda_dispatch.h
+include/ATen/ops/_padded_dense_to_jagged_forward_native.h
+include/ATen/ops/_padded_dense_to_jagged_forward_ops.h
 include/ATen/ops/_pdist_backward.h
 include/ATen/ops/_pdist_backward_compositeexplicitautograd_dispatch.h
 include/ATen/ops/_pdist_backward_cpu_dispatch.h
@@ -1783,6 +1833,10 @@ include/ATen/ops/_scaled_dot_product_attention_math_compositeimplicitautograd_di
 include/ATen/ops/_scaled_dot_product_attention_math_native.h
 include/ATen/ops/_scaled_dot_product_attention_math_ops.h
 include/ATen/ops/_scaled_dot_product_cudnn_attention.h
+include/ATen/ops/_scaled_dot_product_cudnn_attention_backward.h
+include/ATen/ops/_scaled_dot_product_cudnn_attention_backward_cuda_dispatch.h
+include/ATen/ops/_scaled_dot_product_cudnn_attention_backward_native.h
+include/ATen/ops/_scaled_dot_product_cudnn_attention_backward_ops.h
 include/ATen/ops/_scaled_dot_product_cudnn_attention_cuda_dispatch.h
 include/ATen/ops/_scaled_dot_product_cudnn_attention_native.h
 include/ATen/ops/_scaled_dot_product_cudnn_attention_ops.h
@@ -1891,6 +1945,10 @@ include/ATen/ops/_sparse_compressed_tensor_unsafe.h
 include/ATen/ops/_sparse_compressed_tensor_unsafe_compositeimplicitautograd_dispatch.h
 include/ATen/ops/_sparse_compressed_tensor_unsafe_native.h
 include/ATen/ops/_sparse_compressed_tensor_unsafe_ops.h
+include/ATen/ops/_sparse_compressed_tensor_with_dims.h
+include/ATen/ops/_sparse_compressed_tensor_with_dims_compositeexplicitautograd_dispatch.h
+include/ATen/ops/_sparse_compressed_tensor_with_dims_native.h
+include/ATen/ops/_sparse_compressed_tensor_with_dims_ops.h
 include/ATen/ops/_sparse_coo_tensor_unsafe.h
 include/ATen/ops/_sparse_coo_tensor_unsafe_compositeimplicitautograd_dispatch.h
 include/ATen/ops/_sparse_coo_tensor_unsafe_native.h
@@ -1944,10 +2002,30 @@ include/ATen/ops/_sparse_mm_reduce_impl_backward_native.h
 include/ATen/ops/_sparse_mm_reduce_impl_backward_ops.h
 include/ATen/ops/_sparse_mm_reduce_impl_native.h
 include/ATen/ops/_sparse_mm_reduce_impl_ops.h
+include/ATen/ops/_sparse_semi_structured_addmm.h
+include/ATen/ops/_sparse_semi_structured_addmm_cuda_dispatch.h
+include/ATen/ops/_sparse_semi_structured_addmm_native.h
+include/ATen/ops/_sparse_semi_structured_addmm_ops.h
+include/ATen/ops/_sparse_semi_structured_apply.h
+include/ATen/ops/_sparse_semi_structured_apply_cuda_dispatch.h
+include/ATen/ops/_sparse_semi_structured_apply_dense.h
+include/ATen/ops/_sparse_semi_structured_apply_dense_cuda_dispatch.h
+include/ATen/ops/_sparse_semi_structured_apply_dense_native.h
+include/ATen/ops/_sparse_semi_structured_apply_dense_ops.h
+include/ATen/ops/_sparse_semi_structured_apply_native.h
+include/ATen/ops/_sparse_semi_structured_apply_ops.h
 include/ATen/ops/_sparse_semi_structured_linear.h
 include/ATen/ops/_sparse_semi_structured_linear_cuda_dispatch.h
 include/ATen/ops/_sparse_semi_structured_linear_native.h
 include/ATen/ops/_sparse_semi_structured_linear_ops.h
+include/ATen/ops/_sparse_semi_structured_mm.h
+include/ATen/ops/_sparse_semi_structured_mm_cuda_dispatch.h
+include/ATen/ops/_sparse_semi_structured_mm_native.h
+include/ATen/ops/_sparse_semi_structured_mm_ops.h
+include/ATen/ops/_sparse_semi_structured_tile.h
+include/ATen/ops/_sparse_semi_structured_tile_cuda_dispatch.h
+include/ATen/ops/_sparse_semi_structured_tile_native.h
+include/ATen/ops/_sparse_semi_structured_tile_ops.h
 include/ATen/ops/_sparse_softmax.h
 include/ATen/ops/_sparse_softmax_backward_data.h
 include/ATen/ops/_sparse_softmax_backward_data_compositeexplicitautograd_dispatch.h
@@ -2756,11 +2834,16 @@ include/ATen/ops/bartlett_window_compositeexplicitautograd_dispatch.h
 include/ATen/ops/bartlett_window_native.h
 include/ATen/ops/bartlett_window_ops.h
 include/ATen/ops/batch_norm.h
+include/ATen/ops/batch_norm_backward.h
+include/ATen/ops/batch_norm_backward_cpu_dispatch.h
+include/ATen/ops/batch_norm_backward_cuda_dispatch.h
 include/ATen/ops/batch_norm_backward_elemt.h
 include/ATen/ops/batch_norm_backward_elemt_compositeexplicitautograd_dispatch.h
 include/ATen/ops/batch_norm_backward_elemt_cuda_dispatch.h
 include/ATen/ops/batch_norm_backward_elemt_native.h
 include/ATen/ops/batch_norm_backward_elemt_ops.h
+include/ATen/ops/batch_norm_backward_native.h
+include/ATen/ops/batch_norm_backward_ops.h
 include/ATen/ops/batch_norm_backward_reduce.h
 include/ATen/ops/batch_norm_backward_reduce_compositeexplicitautograd_dispatch.h
 include/ATen/ops/batch_norm_backward_reduce_cuda_dispatch.h
@@ -3148,6 +3231,7 @@ include/ATen/ops/convolution_overrideable_ops.h
 include/ATen/ops/copy.h
 include/ATen/ops/copy_compositeexplicitautograd_dispatch.h
 include/ATen/ops/copy_compositeexplicitautogradnonfunctional_dispatch.h
+include/ATen/ops/copy_meta_dispatch.h
 include/ATen/ops/copy_native.h
 include/ATen/ops/copy_ops.h
 include/ATen/ops/copy_sparse_to_sparse.h
@@ -3325,8 +3409,7 @@ include/ATen/ops/deg2rad_compositeexplicitautograd_dispatch.h
 include/ATen/ops/deg2rad_native.h
 include/ATen/ops/deg2rad_ops.h
 include/ATen/ops/dense_dim.h
-include/ATen/ops/dense_dim_cpu_dispatch.h
-include/ATen/ops/dense_dim_cuda_dispatch.h
+include/ATen/ops/dense_dim_compositeexplicitautograd_dispatch.h
 include/ATen/ops/dense_dim_native.h
 include/ATen/ops/dense_dim_ops.h
 include/ATen/ops/dequantize.h
@@ -5977,6 +6060,10 @@ include/ATen/ops/retains_grad.h
 include/ATen/ops/retains_grad_compositeimplicitautograd_dispatch.h
 include/ATen/ops/retains_grad_native.h
 include/ATen/ops/retains_grad_ops.h
+include/ATen/ops/rms_norm.h
+include/ATen/ops/rms_norm_compositeimplicitautograd_dispatch.h
+include/ATen/ops/rms_norm_native.h
+include/ATen/ops/rms_norm_ops.h
 include/ATen/ops/rnn_relu.h
 include/ATen/ops/rnn_relu_cell.h
 include/ATen/ops/rnn_relu_cell_compositeimplicitautograd_dispatch.h
@@ -6383,8 +6470,7 @@ include/ATen/ops/sparse_csr_tensor_compositeimplicitautograd_dispatch.h
 include/ATen/ops/sparse_csr_tensor_native.h
 include/ATen/ops/sparse_csr_tensor_ops.h
 include/ATen/ops/sparse_dim.h
-include/ATen/ops/sparse_dim_cpu_dispatch.h
-include/ATen/ops/sparse_dim_cuda_dispatch.h
+include/ATen/ops/sparse_dim_compositeexplicitautograd_dispatch.h
 include/ATen/ops/sparse_dim_native.h
 include/ATen/ops/sparse_dim_ops.h
 include/ATen/ops/sparse_mask.h
@@ -7442,6 +7528,8 @@ include/ATen/ops/zeros_ops.h
 include/ATen/quantized/QTensorImpl.h
 include/ATen/quantized/Quantizer.h
 include/ATen/record_function.h
+include/ATen/xpu/CachingHostAllocator.h
+include/ATen/xpu/PinnedMemoryAllocator.h
 include/ATen/xpu/XPUContext.h
 include/ATen/xpu/XPUDevice.h
 include/ATen/xpu/XPUEvent.h
@@ -7570,6 +7658,7 @@ include/c10/util/FunctionRef.h
 include/c10/util/Half-inl.h
 include/c10/util/Half.h
 include/c10/util/IdWrapper.h
+include/c10/util/Lazy.h
 include/c10/util/LeftRight.h
 include/c10/util/Load.h
 include/c10/util/Logging.h
@@ -7645,507 +7734,9 @@ include/c10/xpu/XPUMacros.h
 include/c10/xpu/XPUStream.h
 include/c10/xpu/impl/XPUGuardImpl.h
 include/c10/xpu/test/impl/XPUTest.h
-include/caffe2/contrib/aten/aten_op.h
-include/caffe2/contrib/aten/aten_op_template.h
-include/caffe2/contrib/fakelowp/batch_matmul_fp16_fake_op.h
-include/caffe2/contrib/fakelowp/common.h
-include/caffe2/contrib/fakelowp/fp16_fc_acc_op.h
-include/caffe2/contrib/fakelowp/fp16_fma.h
-include/caffe2/contrib/fakelowp/fp16_gemm_utils.h
-include/caffe2/contrib/fakelowp/int8_dequantize_op_nnpi.h
-include/caffe2/contrib/fakelowp/int8_quantize_op_nnpi.h
-include/caffe2/contrib/fakelowp/int8_swish_op_nnpi.h
-include/caffe2/contrib/fakelowp/layernorm_fp16_fake_op.h
-include/caffe2/contrib/fakelowp/lengths_reducer_fused_4bit_rowwise_fp16_fake_op.h
-include/caffe2/contrib/fakelowp/lengths_reducer_fused_8bit_rowwise_fp16_fake_op.h
-include/caffe2/contrib/fakelowp/lengths_reducer_ops.h
-include/caffe2/contrib/fakelowp/quant_lut_fp16_fake_op.h
-include/caffe2/contrib/fakelowp/spatial_batch_norm_fp16_fake_op.h
-include/caffe2/contrib/fakelowp/sum_fp16_fake_op.h
-include/caffe2/contrib/fakelowp/unary_fp16_fake_op.h
-include/caffe2/contrib/gloo/allgather_ops.h
-include/caffe2/contrib/gloo/allreduce_ops.h
-include/caffe2/contrib/gloo/barrier_ops.h
-include/caffe2/contrib/gloo/broadcast_ops.h
-include/caffe2/contrib/gloo/common.h
-include/caffe2/contrib/gloo/common_world_ops.h
-include/caffe2/contrib/gloo/context.h
-include/caffe2/contrib/gloo/reduce_scatter_ops.h
-include/caffe2/contrib/gloo/store_handler.h
-include/caffe2/contrib/nccl/cuda_nccl_gpu.h
-include/caffe2/contrib/opencl/context.h
-include/caffe2/contrib/prof/prof_dag_stats_op.h
-include/caffe2/contrib/shm_mutex/shm_mutex.h
-include/caffe2/contrib/tensorrt/tensorrt_op_trt.h
-include/caffe2/contrib/tensorrt/tensorrt_tranformer.h
-include/caffe2/contrib/tensorrt/trt_utils.h
-include/caffe2/contrib/warpctc/ctc_op.h
-include/caffe2/core/allocator.h
-include/caffe2/core/blob.h
-include/caffe2/core/blob_serialization.h
-include/caffe2/core/blob_serializer_base.h
-include/caffe2/core/blob_stats.h
 include/caffe2/core/common.h
-include/caffe2/core/common_cudnn.h
-include/caffe2/core/common_gpu.h
-include/caffe2/core/common_omp.h
-include/caffe2/core/context.h
-include/caffe2/core/context_base.h
-include/caffe2/core/context_gpu.h
-include/caffe2/core/cudnn_wrappers.h
-include/caffe2/core/db.h
-include/caffe2/core/distributions_stubs.h
-include/caffe2/core/event.h
-include/caffe2/core/event_cpu.h
-include/caffe2/core/export_c10_op_to_caffe2.h
-include/caffe2/core/export_caffe2_op_to_c10.h
-include/caffe2/core/flags.h
-include/caffe2/core/graph.h
-include/caffe2/core/hip/common_miopen.h
-include/caffe2/core/hip/miopen_wrapper.h
-include/caffe2/core/init.h
-include/caffe2/core/logging.h
 include/caffe2/core/macros.h
-include/caffe2/core/memonger.h
-include/caffe2/core/module.h
-include/caffe2/core/net.h
-include/caffe2/core/net_async_base.h
-include/caffe2/core/net_async_scheduling.h
-include/caffe2/core/net_async_task.h
-include/caffe2/core/net_async_task_future.h
-include/caffe2/core/net_async_task_graph.h
-include/caffe2/core/net_async_tracing.h
-include/caffe2/core/net_dag_utils.h
-include/caffe2/core/net_parallel.h
-include/caffe2/core/net_simple.h
-include/caffe2/core/net_simple_refcount.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Converters/Dot.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Graph/Algorithms.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Graph/BinaryMatchImpl.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Graph/TarjansImpl.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Graph/TopoSort.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Support/Casting.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Transformations/Match.h
-include/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h
-include/caffe2/core/nomnigraph/tests/test_util.h
-include/caffe2/core/numa.h
-include/caffe2/core/observer.h
-include/caffe2/core/operator.h
-include/caffe2/core/operator_gradient.h
-include/caffe2/core/operator_schema.h
-include/caffe2/core/plan_executor.h
-include/caffe2/core/prof_dag_counters.h
-include/caffe2/core/qtensor.h
-include/caffe2/core/qtensor_serialization.h
-include/caffe2/core/scope_guard.h
-include/caffe2/core/stats.h
-include/caffe2/core/storage.h
-include/caffe2/core/tensor.h
-include/caffe2/core/tensor_impl.h
-include/caffe2/core/tensor_int8.h
-include/caffe2/core/test_utils.h
 include/caffe2/core/timer.h
-include/caffe2/core/transform.h
-include/caffe2/core/types.h
-include/caffe2/core/workspace.h
-include/caffe2/cuda_rtc/common_rtc.h
-include/caffe2/db/create_db_op.h
-include/caffe2/distributed/file_store_handler.h
-include/caffe2/distributed/file_store_handler_op.h
-include/caffe2/distributed/redis_store_handler.h
-include/caffe2/distributed/redis_store_handler_op.h
-include/caffe2/distributed/store_handler.h
-include/caffe2/distributed/store_ops.h
-include/caffe2/experiments/operators/fully_connected_op_decomposition.h
-include/caffe2/experiments/operators/fully_connected_op_prune.h
-include/caffe2/experiments/operators/fully_connected_op_sparse.h
-include/caffe2/experiments/operators/funhash_op.h
-include/caffe2/experiments/operators/sparse_funhash_op.h
-include/caffe2/experiments/operators/sparse_matrix_reshape_op.h
-include/caffe2/experiments/operators/tt_contraction_op.h
-include/caffe2/experiments/operators/tt_pad_op.h
-include/caffe2/ideep/ideep_utils.h
-include/caffe2/ideep/operators/conv_pool_base_op.h
-include/caffe2/ideep/operators/conv_transpose_unpool_base_op.h
-include/caffe2/ideep/operators/operator_fallback_ideep.h
-include/caffe2/ideep/utils/ideep_context.h
-include/caffe2/ideep/utils/ideep_operator.h
-include/caffe2/image/image_input_op.h
-include/caffe2/image/transform_gpu.h
-include/caffe2/mobile/contrib/ios/ios_caffe.h
-include/caffe2/mobile/contrib/ios/ios_caffe_defines.h
-include/caffe2/mobile/contrib/ios/ios_caffe_predictor.h
-include/caffe2/mobile/contrib/ios/mpscnn/mpscnn.h
-include/caffe2/mobile/contrib/ios/mpscnn/mpscnn_context.h
-include/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph_mask.h
-include/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h
-include/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.h
-include/caffe2/mobile/contrib/libopencl-stub/include/CL/cl.h
-include/caffe2/mobile/contrib/libopencl-stub/include/CL/cl_ext.h
-include/caffe2/mobile/contrib/libopencl-stub/include/CL/cl_gl.h
-include/caffe2/mobile/contrib/libopencl-stub/include/CL/cl_gl_ext.h
-include/caffe2/mobile/contrib/libopencl-stub/include/CL/cl_platform.h
-include/caffe2/mobile/contrib/libopencl-stub/include/CL/opencl.h
-include/caffe2/mobile/contrib/libopencl-stub/include/libopencl.h
-include/caffe2/mobile/contrib/libvulkan-stub/include/libvulkan-stub.h
-include/caffe2/mobile/contrib/libvulkan-stub/include/vulkan/vk_platform.h
-include/caffe2/mobile/contrib/libvulkan-stub/include/vulkan/vulkan.h
-include/caffe2/mobile/contrib/nnapi/NeuralNetworks.h
-include/caffe2/mobile/contrib/nnapi/dlnnapi.h
-include/caffe2/mobile/contrib/nnapi/nnapi.h
-include/caffe2/mobile/contrib/snpe/snpe_ffi.h
-include/caffe2/mobile/contrib/ulp2/ulp.h
-include/caffe2/mobile/contrib/ulp2/ulp_neon.h
-include/caffe2/mpi/mpi_common.h
-include/caffe2/mpi/mpi_ops.h
-include/caffe2/observers/operator_attaching_net_observer.h
-include/caffe2/observers/profile_observer.h
-include/caffe2/observers/runcnt_observer.h
-include/caffe2/observers/time_observer.h
-include/caffe2/onnx/backend.h
-include/caffe2/onnx/backend_rep.h
-include/caffe2/onnx/device.h
-include/caffe2/onnx/helper.h
-include/caffe2/onnx/offline_tensor.h
-include/caffe2/onnx/onnx_exporter.h
-include/caffe2/onnx/onnxifi_graph_info.h
-include/caffe2/onnx/onnxifi_init.h
-include/caffe2/onnx/torch_ops/constants.h
-include/caffe2/onnx/torch_ops/operator_sets.h
-include/caffe2/onnx/torch_ops/schema.h
-include/caffe2/operators/abs_op.h
-include/caffe2/operators/accumulate_op.h
-include/caffe2/operators/accuracy_op.h
-include/caffe2/operators/acos_op.h
-include/caffe2/operators/activation_ops_cudnn.h
-include/caffe2/operators/affine_channel_op.h
-include/caffe2/operators/alias_with_name.h
-include/caffe2/operators/apmeter_op.h
-include/caffe2/operators/arg_ops.h
-include/caffe2/operators/asin_op.h
-include/caffe2/operators/assert_op.h
-include/caffe2/operators/async_net_barrier_op.h
-include/caffe2/operators/atan_op.h
-include/caffe2/operators/batch_box_cox_op.h
-include/caffe2/operators/batch_bucketize_op.h
-include/caffe2/operators/batch_gather_ops.h
-include/caffe2/operators/batch_matmul_op.h
-include/caffe2/operators/batch_moments_op.h
-include/caffe2/operators/batch_permutation_op.h
-include/caffe2/operators/batch_sparse_to_dense_op.h
-include/caffe2/operators/bbox_transform_op.h
-include/caffe2/operators/bisect_percentile_op.h
-include/caffe2/operators/boolean_mask_ops.h
-include/caffe2/operators/boolean_unmask_ops.h
-include/caffe2/operators/box_with_nms_limit_op.h
-include/caffe2/operators/bucketize_op.h
-include/caffe2/operators/byte_weight_dequant_op.h
-include/caffe2/operators/cast_op.h
-include/caffe2/operators/cbrt_op.h
-include/caffe2/operators/cc_bmm_bg_op.h
-include/caffe2/operators/ceil_op.h
-include/caffe2/operators/channel_backprop_stats_op.h
*** 572 LINES SKIPPED ***