Skip to content

Commit 9bd1a5b

Browse files
ezyang and mansiag05
authored and committed
[RELAND] Always build USE_DISTRIBUTED (pytorch#160449) and Make distributed modules importable even when backend not built (pytorch#159889) (pytorch#162594)
Summary: Original: D81957844 and D81957923 Also, pytorch#162142 is patched in as well #buildall Test Plan: sandcastle and oss ci Rollback Plan: Reviewed By: H-Huang Pull Request resolved: pytorch#162594 Approved by: https://github.com/H-Huang, https://github.com/dcci
1 parent dbe9cce commit 9bd1a5b

52 files changed

Lines changed: 778 additions & 458 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.ci/pytorch/macos-build.sh

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,10 @@ fi
3535

3636
print_cmake_info
3737
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
38-
# Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
39-
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
38+
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
4039
else
41-
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
42-
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
40+
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
41+
# backends (specifically the gloo backend), so test that this case works too
4342
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
4443
fi
4544
if which sccache > /dev/null; then

.ci/pytorch/macos-test.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,13 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
1313
fi
1414
popd
1515

16+
python -mpip install -r requirements.txt
17+
1618
# enable debug asserts in serialization
1719
export TORCH_SERIALIZATION_DEBUG=1
1820

21+
python -mpip install --no-input -r requirements.txt
22+
1923
setup_test_python() {
2024
# The CircleCI worker hostname doesn't resolve to an address.
2125
# This environment variable makes ProcessGroupGloo default to

.ci/wheel/build_wheel.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,8 @@ source ~/${desired_python}-build/bin/activate
177177
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
178178
retry brew install libomp
179179

180-
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
180+
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
181+
# is build as part of tensorpipe submodule
181182
export USE_DISTRIBUTED=1
182183

183184
export USE_MKLDNN=OFF

BUILD.bazel

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ COMMON_COPTS = [
2222
"-DHAVE_SHM_UNLINK=1",
2323
"-D_FILE_OFFSET_BITS=64",
2424
"-DUSE_FBGEMM",
25-
"-DUSE_DISTRIBUTED",
2625
"-DAT_PER_OPERATOR_HEADERS",
2726
"-DATEN_THREADING=NATIVE",
2827
"-DNO_CUDNN_DESTROY_HANDLE",
@@ -811,7 +810,7 @@ cc_library(
811810
name = "torch_python",
812811
srcs = libtorch_python_core_sources
813812
+ if_cuda(libtorch_python_cuda_sources)
814-
+ if_cuda(libtorch_python_distributed_sources)
813+
+ libtorch_python_distributed_sources
815814
+ GENERATED_AUTOGRAD_PYTHON,
816815
hdrs = glob([
817816
"torch/csrc/generic/*.cpp",

CMakeLists.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
181181
set(CPU_POWER ON)
182182
endif()
183183

184-
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
185-
# tested and likely won't work without additional changes.
184+
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
185+
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
186+
# still gets built
186187
if(NOT LINUX AND NOT WIN32)
187188
set(USE_DISTRIBUTED
188189
OFF
@@ -262,11 +263,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
262263
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
263264
option(USE_NATIVE_ARCH "Use -march=native" OFF)
264265
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
265-
option(USE_DISTRIBUTED "Use distributed" ON)
266+
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
266267
cmake_dependent_option(USE_NCCL "Use NCCL" ON
267268
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
268269
cmake_dependent_option(USE_XCCL "Use XCCL" ON
269-
"USE_XPU;UNIX;NOT APPLE" OFF)
270+
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
270271
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
271272
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
272273
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@@ -431,11 +432,10 @@ if(WIN32)
431432
PATH_SUFFIXES lib
432433
NO_DEFAULT_PATH)
433434
if(NOT libuv_tmp_LIBRARY)
434-
set(USE_DISTRIBUTED OFF)
435435
set(USE_GLOO OFF)
436436
message(
437437
WARNING
438-
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
438+
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
439439
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
440440
)
441441
else()

buckbuild.bzl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ ROOT = "//" if IS_OSS else "//xplat/caffe2"
156156
# for targets in subfolders
157157
ROOT_PATH = "//" if IS_OSS else "//xplat/caffe2/"
158158

159-
C10 = "//c10:c10" if IS_OSS else "//xplat/caffe2/c10:c10"
159+
C10 = "//c10:c10" if IS_OSS else ("//xplat/caffe2/c10:c10_ovrsource" if is_arvr_mode() else "//xplat/caffe2/c10:c10")
160160

161161
# a dictionary maps third party library name to fbsource and oss target
162162
THIRD_PARTY_LIBS = {
@@ -948,6 +948,7 @@ def define_buck_targets(
948948
[
949949
("torch/csrc/api/include", "torch/**/*.h"),
950950
("", "torch/csrc/**/*.h"),
951+
("", "torch/csrc/**/*.hpp"),
951952
("", "torch/nativert/**/*.h"),
952953
("", "torch/headeronly/**/*.h"),
953954
("", "torch/script.h"),
@@ -2033,6 +2034,7 @@ def define_buck_targets(
20332034
("", "caffe2/utils/*.h"),
20342035
("", "caffe2/core/*.h"),
20352036
("", "torch/csrc/*.h"),
2037+
("", "torch/csrc/*.hpp"),
20362038
("", "torch/csrc/api/include/torch/*.h"),
20372039
("", "torch/csrc/autograd/*.h"),
20382040
("", "torch/csrc/autograd/*/*.h"),

c10/ovrsource_defs.bzl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ cuda_supported_platforms = [
1818

1919
def define_c10_ovrsource(name, is_mobile):
2020
if is_mobile:
21-
pp_flags = ["-DC10_MOBILE=1"]
21+
pp_flags = ["-DC10_MOBILE=1", "-DC10_USE_GLOG"]
2222
else:
23-
pp_flags = []
23+
pp_flags = ["-DC10_USE_GLOG"]
2424

2525
oxx_static_library(
2626
name = name,

caffe2/CMakeLists.txt

Lines changed: 65 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -540,11 +540,9 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
540540
${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
541541
)
542542

543-
if(USE_DISTRIBUTED)
544-
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
545-
if(NOT WIN32)
546-
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
547-
endif()
543+
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
544+
if(NOT WIN32)
545+
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
548546
endif()
549547
endif()
550548

@@ -573,32 +571,30 @@ if(USE_CUDA)
573571
list(APPEND Caffe2_GPU_SRCS
574572
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
575573
endif()
576-
if(USE_DISTRIBUTED)
577-
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
578-
if(NOT WIN32)
579-
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
580-
set_source_files_properties(
581-
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
582-
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
583-
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
584-
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
585-
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
586-
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
587-
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
588-
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
589-
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
590-
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
591-
)
592-
endif()
574+
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
575+
if(NOT WIN32)
576+
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
577+
set_source_files_properties(
578+
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
579+
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
580+
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
581+
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
582+
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
583+
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
584+
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
585+
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
586+
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
587+
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
588+
)
589+
endif()
593590

594-
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
595-
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
596-
if(CMAKE_COMPILER_IS_GNUCXX)
597-
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
598-
endif()
599-
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
600-
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
601-
endif()
591+
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
592+
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
593+
if(CMAKE_COMPILER_IS_GNUCXX)
594+
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
595+
endif()
596+
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
597+
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
602598
endif()
603599
set_source_files_properties(
604600
${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
@@ -631,11 +627,9 @@ if(USE_ROCM)
631627
list(APPEND Caffe2_HIP_SRCS
632628
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
633629
endif()
634-
if(USE_DISTRIBUTED)
635-
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
636-
if(NOT WIN32)
637-
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
638-
endif()
630+
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
631+
if(NOT WIN32)
632+
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
639633
endif()
640634
# caffe2_nvrtc's stubs to driver APIs are useful for HIP.
641635
# See NOTE [ ATen NVRTC Stub and HIP ]
@@ -1356,12 +1350,10 @@ if(BUILD_TEST)
13561350
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
13571351
add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
13581352
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
1359-
if(USE_DISTRIBUTED)
1360-
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
1361-
if(NOT WIN32)
1362-
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
1363-
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
1364-
endif()
1353+
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
1354+
if(NOT WIN32)
1355+
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
1356+
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
13651357
endif()
13661358
if(NOT NO_API)
13671359
add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api)
@@ -1466,47 +1458,41 @@ if(BUILD_LITE_INTERPRETER)
14661458
endif()
14671459
endif()
14681460

1469-
1470-
# Pass USE_DISTRIBUTED to torch_cpu, as some codes in jit/pickler.cpp and
1471-
# jit/unpickler.cpp need to be compiled only when USE_DISTRIBUTED is set
1472-
if(USE_DISTRIBUTED)
1473-
target_compile_definitions(torch_cpu PUBLIC USE_DISTRIBUTED)
1474-
if(USE_GLOO AND USE_C10D_GLOO)
1475-
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
1476-
endif()
1477-
if(USE_UCC AND USE_C10D_UCC)
1478-
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
1479-
if(USE_CUDA)
1480-
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
1481-
endif()
1482-
endif()
1483-
if(USE_NCCL AND USE_C10D_NCCL)
1484-
if(USE_ROCM)
1485-
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
1486-
else()
1487-
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
1488-
endif()
1489-
endif()
1490-
if(USE_MPI AND USE_C10D_MPI)
1491-
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
1492-
set_source_files_properties(
1493-
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
1494-
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
1495-
endif()
1496-
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
1497-
endif()
1498-
# Pass USE_RPC in order to reduce use of
1499-
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
1500-
# need to be removed when RPC is supported
1501-
if(NOT WIN32)
1502-
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
1461+
if(USE_GLOO AND USE_C10D_GLOO)
1462+
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
1463+
endif()
1464+
if(USE_UCC AND USE_C10D_UCC)
1465+
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
1466+
if(USE_CUDA)
1467+
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
15031468
endif()
1504-
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
1505-
# can only be compiled with USE_TENSORPIPE is set.
1506-
if(USE_TENSORPIPE)
1507-
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
1469+
endif()
1470+
if(USE_NCCL AND USE_C10D_NCCL)
1471+
if(USE_ROCM)
1472+
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
1473+
else()
1474+
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
15081475
endif()
15091476
endif()
1477+
if(USE_MPI AND USE_C10D_MPI)
1478+
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
1479+
set_source_files_properties(
1480+
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
1481+
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
1482+
endif()
1483+
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
1484+
endif()
1485+
# Pass USE_RPC in order to reduce use of
1486+
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
1487+
# need to be removed when RPC is supported
1488+
if(NOT WIN32)
1489+
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
1490+
endif()
1491+
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
1492+
# can only be compiled with USE_TENSORPIPE is set.
1493+
if(USE_TENSORPIPE)
1494+
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
1495+
endif()
15101496

15111497
if(NOT INTERN_BUILD_MOBILE)
15121498
if(${CAFFE2_LINK_LOCAL_PROTOBUF})

cmake/Dependencies.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1134,7 +1134,7 @@ if(USE_CUDA AND CUDA_VERSION VERSION_LESS 13.0)
11341134
include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
11351135
endif()
11361136

1137-
if(USE_DISTRIBUTED AND USE_TENSORPIPE)
1137+
if(USE_TENSORPIPE)
11381138
if(MSVC)
11391139
message(WARNING "Tensorpipe cannot be used on Windows.")
11401140
else()

cmake/Summary.cmake

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -192,13 +192,11 @@ function(caffe2_print_configuration_summary)
192192
message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}")
193193
message(STATUS " USE_XNNPACK : ${USE_XNNPACK}")
194194
message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}")
195-
if(${USE_DISTRIBUTED})
196-
message(STATUS " USE_MPI : ${USE_MPI}")
197-
message(STATUS " USE_GLOO : ${USE_GLOO}")
198-
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
199-
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
200-
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
201-
endif()
195+
message(STATUS " USE_MPI : ${USE_MPI}")
196+
message(STATUS " USE_GLOO : ${USE_GLOO}")
197+
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
198+
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
199+
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
202200
if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
203201
message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}")
204202
endif()

0 commit comments

Comments
 (0)