From 3c1b6710b4a28633ef12a85b3491b7d05de68862 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 18 Aug 2021 22:36:36 -0400 Subject: [PATCH 01/62] tweaks for spock --- components/homme/CMakeLists.txt | 4 ++-- components/homme/cmake/HommeMacros.cmake | 4 ++-- components/homme/cmake/SetCompilerFlags.cmake | 4 ++-- components/homme/src/share/cxx/ExecSpaceDefs.hpp | 2 +- components/homme/src/share/cxx/HybridVCoord.cpp | 4 +++- .../theta-l_kokkos/cxx/HyperviscosityFunctorImpl.cpp | 3 ++- components/homme/test/unit_tests/CMakeLists.txt | 2 +- components/homme/test_execs/CMakeLists.txt | 12 ++++++------ 8 files changed, 19 insertions(+), 16 deletions(-) diff --git a/components/homme/CMakeLists.txt b/components/homme/CMakeLists.txt index cba84022944b..4b8018354b6c 100644 --- a/components/homme/CMakeLists.txt +++ b/components/homme/CMakeLists.txt @@ -401,10 +401,10 @@ IF (${HOMME_USE_KOKKOS}) ENDIF () # This folder contains the CMake macro used to build cxx unit tests -IF (HOMMEXX_BFB_TESTING) +#IF (HOMMEXX_BFB_TESTING) # Add unit tests for C++ code ADD_SUBDIRECTORY(test/unit_tests) -ENDIF () +#ENDIF () # Add libcompose, consisting of just C++ files, so we can build it just once for # all exes. It does not depend on compile-time sizes. diff --git a/components/homme/cmake/HommeMacros.cmake b/components/homme/cmake/HommeMacros.cmake index 6aeb5d7446a8..7657ffcb8e9a 100644 --- a/components/homme/cmake/HommeMacros.cmake +++ b/components/homme/cmake/HommeMacros.cmake @@ -140,7 +140,7 @@ macro(createTestExec execName execType macroNP macroNC ENDIF () IF (HOMME_USE_KOKKOS) - TARGET_LINK_LIBRARIES(${execName} kokkos) + TARGET_LINK_LIBRARIES(${execName} kokkoscore) ENDIF () # Move the module files out of the way so the parallel build @@ -237,7 +237,7 @@ macro(createExecLib libName execType libSrcs inclDirs macroNP TARGET_LINK_LIBRARIES(${libName} timing ${COMPOSE_LIBRARY} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) IF (HOMME_USE_KOKKOS) - TARGET_LINK_LIBRARIES(${libName} kokkos) + TARGET_LINK_LIBRARIES(${libName} kokkoscore) ENDIF () IF (NOT HOMME_USE_MKL) diff --git a/components/homme/cmake/SetCompilerFlags.cmake b/components/homme/cmake/SetCompilerFlags.cmake index b772321c7d30..2a42b7aec4a8 100644 --- a/components/homme/cmake/SetCompilerFlags.cmake +++ b/components/homme/cmake/SetCompilerFlags.cmake @@ -67,9 +67,9 @@ IF (${HOMME_USE_CXX}) SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") INCLUDE(CheckCXXCompilerFlag) - CHECK_CXX_COMPILER_FLAG("-std=c++14" CXX14_SUPPORTED) + CHECK_CXX_COMPILER_FLAG("-std=c++17" CXX14_SUPPORTED) IF (${CXX14_SUPPORTED}) - SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") ELSEIF (${HOMME_USE_KOKKOS}) MESSAGE (FATAL_ERROR "Kokkos needs C++14, but the C++ compiler does not support it.") ENDIF () diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.hpp b/components/homme/src/share/cxx/ExecSpaceDefs.hpp index d3f2cd43ae25..289b79897e60 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.hpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.hpp @@ -135,7 +135,7 @@ struct DefaultThreadsDistribution { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE ExecSpaceType::thread_pool_size() #else - ExecSpaceType::impl_thread_pool_size() + 1 //ExecSpaceType::impl_thread_pool_size() #endif , num_parallel_iterations, tp); } diff --git a/components/homme/src/share/cxx/HybridVCoord.cpp b/components/homme/src/share/cxx/HybridVCoord.cpp index e133e2ecaae9..3b45fdc7cbdb 100644 --- a/components/homme/src/share/cxx/HybridVCoord.cpp +++ b/components/homme/src/share/cxx/HybridVCoord.cpp @@ -222,10 +222,12 @@ void HybridVCoord::compute_eta () auto l_hybrid_ai = hybrid_ai; auto l_hybrid_bi = hybrid_bi; + auto l_ps0 = ps0; + Kokkos::parallel_for(Kokkos::RangePolicy(0,NUM_LEV), KOKKOS_LAMBDA(const int& ilev){ l_etam(ilev) = l_hybrid_am(ilev) + l_hybrid_bm(ilev); - l_exner0(ilev) = pow(l_etam(ilev)*ps0/PhysicalConstants::p0,PhysicalConstants::kappa); + l_exner0(ilev) = pow(l_etam(ilev)*l_ps0/PhysicalConstants::p0,PhysicalConstants::kappa); }); Kokkos::parallel_for(Kokkos::RangePolicy(0,NUM_INTERFACE_LEV), KOKKOS_LAMBDA(const int& ilev){ diff --git a/components/homme/src/theta-l_kokkos/cxx/HyperviscosityFunctorImpl.cpp b/components/homme/src/theta-l_kokkos/cxx/HyperviscosityFunctorImpl.cpp index 8ffbeb434a4c..103c65f5e972 100644 --- a/components/homme/src/theta-l_kokkos/cxx/HyperviscosityFunctorImpl.cpp +++ b/components/homme/src/theta-l_kokkos/cxx/HyperviscosityFunctorImpl.cpp @@ -224,6 +224,7 @@ void HyperviscosityFunctorImpl::run (const int np1, const Real dt, const Real et // Finally, convert theta back to vtheta, and adjust w at surface auto geo = m_geometry; + const auto process_nh_vars = m_process_nh_vars; Kokkos::parallel_for(Homme::get_default_team_policy(state.num_elems()), KOKKOS_LAMBDA(const TeamMember& team) { const int ie = team.league_rank(); @@ -242,7 +243,7 @@ void HyperviscosityFunctorImpl::run (const int np1, const Real dt, const Real et // Fix w at surface: // Adjust w_i at the surface, since velocity has changed - if (m_process_nh_vars) { + if (process_nh_vars) { Kokkos::single(Kokkos::PerThread(team),[&](){ using InfoI = ColInfo; using InfoM = ColInfo; diff --git a/components/homme/test/unit_tests/CMakeLists.txt b/components/homme/test/unit_tests/CMakeLists.txt index 7e67526c42c9..c3926cbb4778 100644 --- a/components/homme/test/unit_tests/CMakeLists.txt +++ b/components/homme/test/unit_tests/CMakeLists.txt @@ -13,7 +13,7 @@ macro(cxx_unit_test target_name target_f90_srcs target_cxx_srcs include_dirs con #ENDIF() SET_TESTS_PROPERTIES(${target_name}_test PROPERTIES LABELS "unit") - TARGET_LINK_LIBRARIES(${target_name} timing ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} kokkos) + TARGET_LINK_LIBRARIES(${target_name} timing ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} kokkoscore) STRING(TOUPPER "${PERFORMANCE_PROFILE}" PERF_PROF_UPPER) IF ("${PERF_PROF_UPPER}" STREQUAL "VTUNE") diff --git a/components/homme/test_execs/CMakeLists.txt b/components/homme/test_execs/CMakeLists.txt index 0f22e9de851a..381cabbb0785 100644 --- a/components/homme/test_execs/CMakeLists.txt +++ b/components/homme/test_execs/CMakeLists.txt @@ -239,9 +239,9 @@ IF (${BUILD_HOMME_THETA_KOKKOS}) ADD_SUBDIRECTORY(theta-nlev128) ENDIF() - IF (HOMMEXX_BFB_TESTING) + # IF (HOMMEXX_BFB_TESTING) ADD_SUBDIRECTORY(thetal_kokkos_ut) - ENDIF() + #ENDIF() endif() ENDIF() @@ -256,19 +256,19 @@ IF(${BUILD_HOMME_SWIM}) ADD_SUBDIRECTORY(swim5) ENDIF() -IF (HOMMEXX_BFB_TESTING) +#IF (HOMMEXX_BFB_TESTING) # Test some stuff in the shared folder ADD_SUBDIRECTORY (share_kokkos_ut) -ENDIF () + #ENDIF () IF (${BUILD_HOMME_PREQX_KOKKOS}) ADD_SUBDIRECTORY(preqx-nlev26) ADD_SUBDIRECTORY(preqx-nlev26-kokkos) ADD_SUBDIRECTORY(preqx-nlev72) ADD_SUBDIRECTORY(preqx-nlev72-kokkos) - IF (HOMMEXX_BFB_TESTING) + # IF (HOMMEXX_BFB_TESTING) ADD_SUBDIRECTORY(preqx_kokkos_ut) - ENDIF () + #ENDIF () ENDIF () # Read the test-list.cmake file to get the HOMME_TESTS list From 7604fc473378d4e639457618af6ff88fd4ecbacd Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 18 Aug 2021 22:37:06 -0400 Subject: [PATCH 02/62] cache file --- .../cmake/machineFiles/spock-gpumpi.cmake | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 components/homme/cmake/machineFiles/spock-gpumpi.cmake diff --git a/components/homme/cmake/machineFiles/spock-gpumpi.cmake b/components/homme/cmake/machineFiles/spock-gpumpi.cmake new file mode 100644 index 000000000000..9e5241f3cc7d --- /dev/null +++ b/components/homme/cmake/machineFiles/spock-gpumpi.cmake @@ -0,0 +1,63 @@ +#interactive job +#bsub -W 2:00 -nnodes 1 -P cli115 -Is /bin/bash + + +#cmake -C ~/acme-fork-lb/components/homme/cmake/machineFiles/summit.cmake -DHOMMEXX_MPI_ON_DEVICE=FALSE ~/acme-fork-lb/components/homme/ + +#SET (HOMMEXX_MPI_ON_DEVICE FALSE CACHE BOOL "") +SET (HOMMEXX_CUDA_MAX_WARP_PER_TEAM "16" CACHE STRING "") + +SET (NETCDF_DIR $ENV{OLCF_NETCDF_FORTRAN_ROOT} CACHE FILEPATH "") +SET (HDF5_DIR $ENV{OLCF_HDF5_ROOT} CACHE FILEPATH "") + +SET(BUILD_HOMME_WITHOUT_PIOLIBRARY TRUE CACHE BOOL "") + +SET(HOMME_FIND_BLASLAPACK TRUE CACHE BOOL "") + +SET(WITH_PNETCDF FALSE CACHE FILEPATH "") + +SET(USE_QUEUING FALSE CACHE BOOL "") + +SET(ENABLE_CUDA FALSE CACHE BOOL "") + +SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") +SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") +SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") + +#SET (HOMMEXX_BFB_TESTING TRUE CACHE BOOL "") + +SET(USE_TRILINOS OFF CACHE BOOL "") + +#SET(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "") +#SET(Kokkos_ENABLE_CUDA OFF CACHE BOOL "") +#SET(Kokkos_ENABLE_CUDA_LAMBDA OFF CACHE BOOL "") +#SET(Kokkos_ARCH_VEGA908 ON CACHE BOOL "") +#SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") + +SET(CMAKE_C_COMPILER "cc" CACHE STRING "") +SET(CMAKE_Fortran_COMPILER "ftn" CACHE STRING "") +SET(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "") +#SET(CMAKE_CXX_COMPILER "/ccs/home/onguba/kokkos/bin/nvcc_wrapper" CACHE STRING "") +SET(E3SM_KOKKOS_PATH "/ccs/home/onguba/kokkos-example-spock-hipcc/bld-hipcc" CACHE STRING "") + +#not the proper way!!! +SET(MPICH_DIR "/opt/cray/pe/mpich/8.1.7/ofi/crayclang/10.0" CACHE STRING "") + +SET(Extrae_LIBRARY "-I${MPICH_DIR}/include -L${MPICH_DIR}/lib -lmpi -L/opt/cray/pe/mpich/8.1.7/gtl/lib -lmpi_gtl_hsa" CACHE STRING "") + +SET(ADD_Fortran_FLAGS "${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") +SET(ADD_C_FLAGS "${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") +SET(ADD_CXX_FLAGS "${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") +SET(ADD_LINKER_FLAGS "${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") + + +set (ENABLE_OPENMP OFF CACHE BOOL "") +set (ENABLE_COLUMN_OPENMP OFF CACHE BOOL "") +set (ENABLE_HORIZ_OPENMP OFF CACHE BOOL "") + +set (HOMME_TESTING_PROFILE "dev" CACHE STRING "") + +set (USE_NUM_PROCS 4 CACHE STRING "") + +#set (OPT_FLAGS "-mcpu=power9 -mtune=power9" CACHE STRING "") +SET (USE_MPI_OPTIONS "--bind-to core" CACHE FILEPATH "") From c6422dcd7020163f8aa6d9d05885cfc80a551ca5 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 1 Sep 2021 12:35:01 -0400 Subject: [PATCH 03/62] fix build of unit tests without pio --- components/homme/test/unit_tests/CMakeLists.txt | 2 ++ components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt | 2 +- components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt | 6 +++--- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/components/homme/test/unit_tests/CMakeLists.txt b/components/homme/test/unit_tests/CMakeLists.txt index c3926cbb4778..6ae4a68392d0 100644 --- a/components/homme/test/unit_tests/CMakeLists.txt +++ b/components/homme/test/unit_tests/CMakeLists.txt @@ -29,6 +29,8 @@ macro(cxx_unit_test target_name target_f90_srcs target_cxx_srcs include_dirs con ELSE () TARGET_LINK_LIBRARIES(${target_name} pio) ENDIF () + ELSE () + ADD_DEFINITIONS(-DHOMME_WITHOUT_PIOLIBRARY) ENDIF () IF (NOT "${target_f90_srcs}" EQUAL "") diff --git a/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt b/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt index 6d6d68597cdc..94176f27b487 100644 --- a/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt +++ b/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt @@ -23,7 +23,7 @@ ADD_LIBRARY(preqx_kokkos_ut_lib ${PREQX_DEPS}) TARGET_INCLUDE_DIRECTORIES(preqx_kokkos_ut_lib PUBLIC ${EXEC_INCLUDE_DIRS}) TARGET_INCLUDE_DIRECTORIES(preqx_kokkos_ut_lib PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) TARGET_COMPILE_DEFINITIONS(preqx_kokkos_ut_lib PUBLIC "HAVE_CONFIG_H") -TARGET_LINK_LIBRARIES(preqx_kokkos_ut_lib kokkos) +TARGET_LINK_LIBRARIES(preqx_kokkos_ut_lib kokkoscore) TARGET_LINK_LIBRARIES(preqx_kokkos_ut_lib timing ${COMPOSE_LIBRARY} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) IF(NOT BUILD_HOMME_WITHOUT_PIOLIBRARY) IF(HOMME_USE_SCORPIO) diff --git a/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt b/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt index 1981aecb3487..15691cacb35e 100644 --- a/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt +++ b/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt @@ -25,7 +25,7 @@ ADD_LIBRARY(thetal_kokkos_ut_lib ${THETAL_DEPS}) TARGET_INCLUDE_DIRECTORIES(thetal_kokkos_ut_lib PUBLIC ${EXEC_INCLUDE_DIRS}) TARGET_INCLUDE_DIRECTORIES(thetal_kokkos_ut_lib PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) TARGET_COMPILE_DEFINITIONS(thetal_kokkos_ut_lib PUBLIC "HAVE_CONFIG_H") -TARGET_LINK_LIBRARIES(thetal_kokkos_ut_lib kokkos) +TARGET_LINK_LIBRARIES(thetal_kokkos_ut_lib kokkoscore) TARGET_LINK_LIBRARIES(thetal_kokkos_ut_lib timing ${COMPOSE_LIBRARY} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) IF(NOT BUILD_HOMME_WITHOUT_PIOLIBRARY) IF(HOMME_USE_SCORPIO) @@ -41,7 +41,7 @@ SET_TARGET_PROPERTIES(thetal_kokkos_ut_lib PROPERTIES Fortran_MODULE_DIRECTORY $ # Setting this once and for all, good for all unit tests. SET (CONFIG_DEFINES HAVE_CONFIG_H) -IF (HOMMEXX_BFB_TESTING) +#IF (HOMMEXX_BFB_TESTING) ### Equation of state unit tests SET (EOS_UT_CXX_SRCS @@ -233,7 +233,7 @@ IF (HOMMEXX_BFB_TESTING) ENDIF() cxx_unit_test (remap_theta_ut "${REMAP_THETA_UT_F90_SRCS}" "${REMAP_THETA_UT_CXX_SRCS}" "${REMAP_THETA_UT_INCLUDE_DIRS}" "${CONFIG_DEFINES}" ${NUM_CPUS}) TARGET_LINK_LIBRARIES(remap_theta_ut thetal_kokkos_ut_lib) -ENDIF () +#ENDIF () # ### DIRK functor unit test From b3e0af23fc2590c752a730198333cda54eb3aacb Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Thu, 2 Sep 2021 17:50:22 -0400 Subject: [PATCH 04/62] added define for hip build --- components/homme/CMakeLists.txt | 40 ++++++++++++------- .../cmake/machineFiles/spock-gpumpi.cmake | 1 + components/homme/src/share/cxx/Config.hpp | 9 ++++- components/homme/src/share/cxx/Dimensions.hpp | 2 +- .../homme/src/share/cxx/ExecSpaceDefs.cpp | 38 ++++++++++++++++++ .../homme/src/share/cxx/ExecSpaceDefs.hpp | 15 ++++++- .../homme/src/share/cxx/Hommexx_config.h.in | 3 ++ .../homme/src/share/cxx/KernelVariables.hpp | 22 +++++----- .../homme/src/share/cxx/kokkos_utils.hpp | 4 ++ 9 files changed, 103 insertions(+), 31 deletions(-) diff --git a/components/homme/CMakeLists.txt b/components/homme/CMakeLists.txt index 4b8018354b6c..13d0a5edf09c 100644 --- a/components/homme/CMakeLists.txt +++ b/components/homme/CMakeLists.txt @@ -177,20 +177,23 @@ IF (${HOMME_USE_KOKKOS}) SET (HOMMEXX_EXEC_SPACE "Default" CACHE STRING "Select the kokkos exec space") +################!!!!!!!!!!!! ????????????? STRING (TOUPPER ${HOMMEXX_EXEC_SPACE} HOMMEXX_EXEC_SPACE_UPPER) - IF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "CUDA") - SET (HOMMEXX_CUDA_SPACE ON) - ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "OPENMP") - SET (HOMMEXX_OPENMP_SPACE ON) - ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "THREADS") - SET (HOMMEXX_THREADS_SPACE ON) - ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "SERIAL") - SET (HOMMEXX_SERIAL_SPACE ON) - ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "DEFAULT") - SET (HOMMEXX_DEFAULT_SPACE ON) - ELSE() - MESSAGE (ABORT "Invalid choice for 'HOMMEXX_EXEC_SPACE'. Valid options (case insensitive) are 'Cuda', 'OpenMP', 'Threads', 'Serial', 'Default'") - ENDIF() +# IF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "HIP") +# SET (HOMMEXX_HIP_SPACE ON) +# ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "CUDA") +# SET (HOMMEXX_CUDA_SPACE ON) +# ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "OPENMP") +# SET (HOMMEXX_OPENMP_SPACE ON) +# ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "THREADS") +# SET (HOMMEXX_THREADS_SPACE ON) +# ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "SERIAL") +# SET (HOMMEXX_SERIAL_SPACE ON) +# ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "DEFAULT") +# SET (HOMMEXX_DEFAULT_SPACE ON) +# ELSE() +# MESSAGE (ABORT "Invalid choice for 'HOMMEXX_EXEC_SPACE'. Valid options (case insensitive) are 'Cuda', 'OpenMP', 'Threads', 'Serial', 'Default'") +# ENDIF() # Execution space parameters. 8 is a nice size for V100. SET (HOMMEXX_CUDA_MIN_WARP_PER_TEAM 8 CACHE STRING "Minimum number of warps to get 100% occoupancy on GPU") @@ -259,14 +262,21 @@ MESSAGE(STATUS "Linker Flags = ${CMAKE_EXE_LINKER_FLAGS}") IF (${HOMME_USE_KOKKOS}) - IF (CUDA_BUILD) + if(HIP) + add_definitions(-DHIP_BUILD) + #does not seem to be needed because kokkos::default will be picked up + #SET (HOMMEXX_HIP_SPACE ON) + endif() + + IF (CUDA_BUILD OR HIP) + message("HERE WE ARE--------------------------------------------") SET (DEFAULT_VECTOR_SIZE 1) ELSE () SET (DEFAULT_VECTOR_SIZE 8) ENDIF() SET (HOMMEXX_VECTOR_SIZE ${DEFAULT_VECTOR_SIZE} CACHE STRING - "If AVX or Cuda don't take priority, use this software vector size.") + "If AVX or Cuda or HIP don't take priority, use this software vector size.") IF (CMAKE_BUILD_TYPE_UPPER MATCHES "DEBUG" OR CMAKE_BUILD_TYPE_UPPER MATCHES "RELWITHDEBINFO") SET (HOMMEXX_DEBUG ON) diff --git a/components/homme/cmake/machineFiles/spock-gpumpi.cmake b/components/homme/cmake/machineFiles/spock-gpumpi.cmake index 9e5241f3cc7d..ce91b0d6a744 100644 --- a/components/homme/cmake/machineFiles/spock-gpumpi.cmake +++ b/components/homme/cmake/machineFiles/spock-gpumpi.cmake @@ -28,6 +28,7 @@ SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") SET(USE_TRILINOS OFF CACHE BOOL "") +SET(HIP TRUE CACHE BOOL "") #SET(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "") #SET(Kokkos_ENABLE_CUDA OFF CACHE BOOL "") #SET(Kokkos_ENABLE_CUDA_LAMBDA OFF CACHE BOOL "") diff --git a/components/homme/src/share/cxx/Config.hpp b/components/homme/src/share/cxx/Config.hpp index a1f633e13487..aad65be04d43 100644 --- a/components/homme/src/share/cxx/Config.hpp +++ b/components/homme/src/share/cxx/Config.hpp @@ -13,6 +13,7 @@ # include "config.h.c" # endif #else +///!!!!!!!!!!!!!!! not relevant // Establish a good candidate vector size for eam builds # ifdef CUDA_BUILD # define HOMMEXX_VECTOR_SIZE 1 @@ -21,17 +22,21 @@ # endif #endif -#if ! defined HOMMEXX_CUDA_SPACE && ! defined HOMMEXX_OPENMP_SPACE && ! defined HOMMEXX_THREADS_SPACE && ! defined HOMMEXX_SERIAL_SPACE + +////!!!!!!!!!!! +#if ! defined HOMMEXX_CUDA_SPACE && ! defined HOMMEXX_OPENMP_SPACE && ! defined HOMMEXX_THREADS_SPACE && ! defined HOMMEXX_SERIAL_SPACE && ! defined HOMMEXX_HIP_SPACE # define HOMMEXX_DEFAULT_SPACE #endif +//?????? #ifndef HOMMEXX_MPI_ON_DEVICE # define HOMMEXX_MPI_ON_DEVICE 1 #endif #include -#ifdef KOKKOS_ENABLE_CUDA +/////!!!!!!! +#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) # ifndef HOMMEXX_CUDA_MIN_WARP_PER_TEAM # define HOMMEXX_CUDA_MIN_WARP_PER_TEAM 8 # endif diff --git a/components/homme/src/share/cxx/Dimensions.hpp b/components/homme/src/share/cxx/Dimensions.hpp index 44adeb5d9b00..ab739f074432 100644 --- a/components/homme/src/share/cxx/Dimensions.hpp +++ b/components/homme/src/share/cxx/Dimensions.hpp @@ -14,7 +14,7 @@ namespace Homme { // Until whenever CUDA supports constexpr properly -#ifdef CUDA_BUILD +#if defined(CUDA_BUILD) || (HIP_BUILD) #ifdef CAM #define QSIZE_D PCNST diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index 8e775653ff44..d41fc075686e 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -17,6 +17,11 @@ # include #endif +#if HIP_BUILD +#include +///#include +#endif + namespace Homme { // Since we're initializing from inside a Fortran code and don't have access to @@ -49,6 +54,22 @@ void initialize_kokkos () { str.back() = 0; args.push_back(const_cast(str.data())); #endif +#if HIP_BUILD + int nd; + const auto ret = hipGetDeviceCount(&nd); + if (ret != hipSuccess) { + // It isn't a big deal if we can't get the device count. + nd = 1; + } + std::stringstream ss; + ss << "--kokkos-num-devices=" << nd; + const auto key = ss.str(); + std::vector str(key.size()+1); + std::copy(key.begin(), key.end(), str.begin()); + str.back() = 0; + args.push_back(const_cast(str.data())); +#endif + const char* silence = "--kokkos-disable-warnings"; args.push_back(const_cast(silence)); @@ -173,11 +194,28 @@ team_num_threads_vectors (const int num_parallel_iterations, const int max_num_warps = HOMMEXX_CUDA_MAX_WARP_PER_TEAM; //Kokkos::Impl::cuda_internal_maximum_grid_count(); # endif #else + +#if HIP_BUILD + + //use 40 wavefronts per CU and 64 CUs + const int num_warps_device = 40*64; // no such thing Kokkos::Impl::hip_internal_maximum_warp_count(); + const int max_num_warps = HOMMEXX_CUDA_MAX_WARP_PER_TEAM; + const int num_threads_warp = Kokkos::Experimental::Impl::HIPTraits::WarpSize; + +///printf() for warpsize + + +#else + // I want thread-distribution rules to be unit-testable even when Cuda is // off. Thus, make up a P100-like machine: const int num_warps_device = 1792; const int num_threads_warp = 32; const int max_num_warps = 16; + +#endif + + #endif return Parallel::team_num_threads_vectors_for_gpu( diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.hpp b/components/homme/src/share/cxx/ExecSpaceDefs.hpp index 289b79897e60..ca44fce37cfd 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.hpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.hpp @@ -20,12 +20,23 @@ namespace Homme // Some in-house names for Kokkos exec spaces, which are // always defined, possibly as alias of void + +#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) + #ifdef KOKKOS_ENABLE_CUDA using Hommexx_Cuda = Kokkos::Cuda; +#endif + +#if HIP_BUILD +using Hommexx_Cuda = Kokkos::Experimental::HIP; +#endif + #else using Hommexx_Cuda = void; #endif + + #ifdef KOKKOS_ENABLE_OPENMP using Hommexx_OpenMP = Kokkos::OpenMP; #else @@ -44,7 +55,7 @@ using Hommexx_Serial = Kokkos::Serial; using Hommexx_Serial = void; #endif -#ifdef KOKKOS_ENABLE_CUDA +#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) # define HOMMEXX_STATIC #else # define HOMMEXX_STATIC static @@ -52,7 +63,7 @@ using Hommexx_Serial = void; // Selecting the execution space. If no specific request, use Kokkos default // exec space -#if defined(HOMMEXX_CUDA_SPACE) +#if defined(HOMMEXX_CUDA_SPACE) || defined(HOMMEXX_HIP_SPACE) using ExecSpace = Hommexx_Cuda; #elif defined(HOMMEXX_OPENMP_SPACE) using ExecSpace = Hommexx_OpenMP; diff --git a/components/homme/src/share/cxx/Hommexx_config.h.in b/components/homme/src/share/cxx/Hommexx_config.h.in index 9d86a78406e2..049d611bcee7 100644 --- a/components/homme/src/share/cxx/Hommexx_config.h.in +++ b/components/homme/src/share/cxx/Hommexx_config.h.in @@ -1,6 +1,9 @@ #ifndef HOMMEXX_CONFIG_H #define HOMMEXX_CONFIG_H + +///????????????? + // Whether the CUDA exec space has been selected #cmakedefine HOMMEXX_CUDA_SPACE diff --git a/components/homme/src/share/cxx/KernelVariables.hpp b/components/homme/src/share/cxx/KernelVariables.hpp index 8a6f20ff6cfe..f27e5759a270 100644 --- a/components/homme/src/share/cxx/KernelVariables.hpp +++ b/components/homme/src/share/cxx/KernelVariables.hpp @@ -28,23 +28,23 @@ struct KernelVariables { return 0; } -#ifdef KOKKOS_ENABLE_CUDA -#ifdef __CUDA_ARCH__ +#if defined(KOKKOS_ENABLE_CUDA) || HIP_BUILD +//#ifdef __CUDA_ARCH__ template static KOKKOS_INLINE_FUNCTION typename std::enable_if< OnGpu::value, int>::type get_team_idx(const int /*team_size*/, const int league_rank) { return league_rank; } -#else - template - static KOKKOS_INLINE_FUNCTION typename std::enable_if< - OnGpu::value, int>::type - get_team_idx(const int /*team_size*/, const int /*league_rank*/) { - assert(false); // should never happen - return -1; - } -#endif // __CUDA_ARCH__ +//#else +// template +// static KOKKOS_INLINE_FUNCTION typename std::enable_if< +// OnGpu::value, int>::type +// get_team_idx(const int /*team_size*/, const int /*league_rank*/) { +// assert(false); // should never happen +// return -1; +// } +//#endif // __CUDA_ARCH__ #endif // KOKKOS_ENABLE_CUDA #ifdef KOKKOS_ENABLE_OPENMP diff --git a/components/homme/src/share/cxx/kokkos_utils.hpp b/components/homme/src/share/cxx/kokkos_utils.hpp index f2791153aac3..2ad84e823d71 100644 --- a/components/homme/src/share/cxx/kokkos_utils.hpp +++ b/components/homme/src/share/cxx/kokkos_utils.hpp @@ -101,6 +101,10 @@ class TeamUtils : public _TeamUtilsCommonBase }; #endif +////////// + + + /* * Specialization for Cuda execution space. */ From 684c6334898205ebe4d5b877e1e49a3adc33efee Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Sat, 4 Sep 2021 01:17:19 -0400 Subject: [PATCH 05/62] more fixes, intermediate --- .../src/share/cxx/EulerStepFunctorImpl.hpp | 15 ++++++++ .../homme/src/share/cxx/ExecSpaceDefs.cpp | 36 ++++++++++++++----- .../homme/src/share/cxx/ExecSpaceDefs.hpp | 14 +++++--- .../homme/src/share/cxx/KernelVariables.hpp | 2 +- .../homme/src/share/cxx/kokkos_utils.hpp | 8 +++-- components/homme/src/share/cxx/profiling.hpp | 2 +- 6 files changed, 59 insertions(+), 18 deletions(-) diff --git a/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp b/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp index 204004c7ef7a..e7746f2d83b1 100644 --- a/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp +++ b/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp @@ -56,6 +56,21 @@ struct SerialLimiter { }; #endif +#if HIP_BUILD +template <> +struct SerialLimiter { + template + KOKKOS_INLINE_FUNCTION static void + run (const ArrayGll& sphweights, const ArrayGllLvl& idpmass, + const Array2Lvl& iqlim, const ArrayGllLvl& iptens, + const Array2GllLvl& irwrk) { + Kokkos::abort("SerialLimiter::run: Should not be called on GPU."); + } +}; +#endif + + class EulerStepFunctorImpl { struct EulerStepData { EulerStepData () diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index d41fc075686e..2f1f36d46438 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -156,13 +156,26 @@ team_num_threads_vectors_for_gpu ( if (tp.prefer_threads) { const int num_threads = ( (tp.max_threads_usable > num_device_threads) ? num_device_threads : - tp.max_threads_usable ); - return std::make_pair( num_threads, - prevpow2(num_device_threads / num_threads) ); + tp.max_threads_usable ); + +//TP(16,64) for 120*64, 64 +//TP(16,64) for 16*32, 16 +// +//printf("tp. prefer_threads: %4d %4d \n",num_threads, prevpow2(num_device_threads / num_threads)); +// return std::make_pair( num_threads, +// prevpow2(num_device_threads / num_threads) ); + +return std::make_pair( 32, 32); + + } else { const int num_vectors = prevpow2( (tp.max_vectors_usable > num_device_threads) ? num_device_threads : tp.max_vectors_usable ); + +//printf("NOT tp. prefer_threads: %4d %4d \n",num_device_threads / num_vectors, +// num_vectors); + return std::make_pair( num_device_threads / num_vectors, num_vectors ); } @@ -197,13 +210,20 @@ team_num_threads_vectors (const int num_parallel_iterations, #if HIP_BUILD - //use 40 wavefronts per CU and 64 CUs - const int num_warps_device = 40*64; // no such thing Kokkos::Impl::hip_internal_maximum_warp_count(); - const int max_num_warps = HOMMEXX_CUDA_MAX_WARP_PER_TEAM; - const int num_threads_warp = Kokkos::Experimental::Impl::HIPTraits::WarpSize; +//to make sure this is active, it is +//#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) + +//however slow, combination 16*32 and 16 below ran with all ne -///printf() for warpsize + //use 64 wavefronts per CU and 120 CUs + const int num_warps_device = 120*64; // no such thing Kokkos::Impl::hip_internal_maximum_warp_count(); + const int max_num_warps = 64; //cores per CU, SM ///HOMMEXX_CUDA_MAX_WARP_PER_TEAM; + const int num_threads_warp = Kokkos::Experimental::Impl::HIPTraits::WarpSize; + +//warpsize is 64 for hip +//printf(" warpsize----------: %4d \n",num_threads_warp); +//printf(" on GPU? %d \n", OnGpu::value); #else diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.hpp b/components/homme/src/share/cxx/ExecSpaceDefs.hpp index ca44fce37cfd..a7b96190ff8d 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.hpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.hpp @@ -317,10 +317,12 @@ VECTOR_SIMD_LOOP } }; -#if defined KOKKOS_ENABLE_CUDA +#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) template <> -struct Dispatch { - using ExeSpace = Kokkos::Cuda; +//struct Dispatch { +// using ExeSpace = Kokkos::Cuda; +struct Dispatch { + using ExeSpace = Hommexx_Cuda; template KOKKOS_FORCEINLINE_FUNCTION @@ -362,7 +364,8 @@ struct Dispatch { template static KOKKOS_FORCEINLINE_FUNCTION void parallel_for_NP2 ( - const Kokkos::TeamPolicy::member_type& team, + //const Kokkos::TeamPolicy::member_type& team, + const Kokkos::TeamPolicy::member_type& team, const Lambda& lambda) { Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, NP*NP), lambda); @@ -371,7 +374,8 @@ struct Dispatch { template static KOKKOS_FORCEINLINE_FUNCTION void parallel_reduce_NP2 ( - const Kokkos::TeamPolicy::member_type& team, + const Kokkos::TeamPolicy::member_type& team, + //const Kokkos::TeamPolicy::member_type& team, const Lambda& lambda, ValueType& result) { parallel_reduce(team, Kokkos::ThreadVectorRange(team, NP*NP), diff --git a/components/homme/src/share/cxx/KernelVariables.hpp b/components/homme/src/share/cxx/KernelVariables.hpp index f27e5759a270..f38d131b393f 100644 --- a/components/homme/src/share/cxx/KernelVariables.hpp +++ b/components/homme/src/share/cxx/KernelVariables.hpp @@ -28,7 +28,7 @@ struct KernelVariables { return 0; } -#if defined(KOKKOS_ENABLE_CUDA) || HIP_BUILD +#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) //#ifdef __CUDA_ARCH__ template static KOKKOS_INLINE_FUNCTION typename std::enable_if< diff --git a/components/homme/src/share/cxx/kokkos_utils.hpp b/components/homme/src/share/cxx/kokkos_utils.hpp index 2ad84e823d71..2122d6f142eb 100644 --- a/components/homme/src/share/cxx/kokkos_utils.hpp +++ b/components/homme/src/share/cxx/kokkos_utils.hpp @@ -108,9 +108,10 @@ class TeamUtils : public _TeamUtilsCommonBase /* * Specialization for Cuda execution space. */ -#ifdef KOKKOS_ENABLE_CUDA +#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) template <> -class TeamUtils : public _TeamUtilsCommonBase +//class TeamUtils : public _TeamUtilsCommonBase +class TeamUtils : public _TeamUtilsCommonBase { #ifdef HOMMEXX_CUDA_SHARE_BUFFER using Device = Kokkos::Device; @@ -128,7 +129,8 @@ class TeamUtils : public _TeamUtilsCommonBase public: template TeamUtils(const TeamPolicy& policy, const Real& overprov_factor = 1.25) : - _TeamUtilsCommonBase(policy) + //_TeamUtilsCommonBase(policy) + _TeamUtilsCommonBase(policy) #ifdef HOMMEXX_CUDA_SHARE_BUFFER , _num_ws_slots(_league_size > _num_teams ? (overprov_factor * _num_teams > _league_size ? _league_size : overprov_factor * _num_teams) diff --git a/components/homme/src/share/cxx/profiling.hpp b/components/homme/src/share/cxx/profiling.hpp index 80c93fdc3d17..660a0de331db 100644 --- a/components/homme/src/share/cxx/profiling.hpp +++ b/components/homme/src/share/cxx/profiling.hpp @@ -12,7 +12,7 @@ #include "gptl.h" -#if defined(HOMMEXX_CUDA_SPACE) || \ +#if (HIP_BUILD) || defined(HOMMEXX_CUDA_SPACE) || \ (defined(HOMMEXX_DEFAULT_SPACE) && defined(KOKKOS_ENABLE_CUDA)) // Can't use GPTL timers on CUDA #define start_timer(name) {} #define stop_timer(name) {} From 24f3dc4745acbbc337dcb78962c62c395dc11048 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 22 Sep 2021 18:32:36 -0400 Subject: [PATCH 06/62] intermediate --- .../src/preqx_kokkos/cxx/CaarFunctorImpl.hpp | 8 ++++++ .../src/preqx_kokkos/prim_driver_mod.F90 | 2 ++ .../homme/src/share/compose/compose_cedr.cpp | 18 +++++++------ .../homme/src/share/cxx/ExecSpaceDefs.cpp | 25 ++++++++++++++----- .../homme/src/share/cxx/ExecSpaceDefs.hpp | 2 +- .../src/theta-l_kokkos/prim_driver_mod.F90 | 3 +++ 6 files changed, 44 insertions(+), 14 deletions(-) diff --git a/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp b/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp index a7a7347dd66f..1cba57a95913 100644 --- a/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp +++ b/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp @@ -218,6 +218,14 @@ struct CaarFunctorImpl { const int igp = idx / NP; const int jgp = idx % NP; +//#if __HIP_DEVICE_COMPILE__ == 1 +// printf("DEVICEEEEEEEEEEEEEEE \n"); +//#endif +//#ifndef __HIP_DEVICE_COMPILE__ +// printf("HOSTTTTT \n"); +//#endif + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV), [&] (const int& ilev) { // pre-fill energy_grad with the pressure(_grad)-temperature part m_buffers.energy_grad(kv.team_idx, 0, igp, jgp, ilev) = diff --git a/components/homme/src/preqx_kokkos/prim_driver_mod.F90 b/components/homme/src/preqx_kokkos/prim_driver_mod.F90 index 94defdd63967..31cd21af6e04 100644 --- a/components/homme/src/preqx_kokkos/prim_driver_mod.F90 +++ b/components/homme/src/preqx_kokkos/prim_driver_mod.F90 @@ -292,11 +292,13 @@ end subroutine cxx_push_results_to_f90 elem_derived_omega_p_ptr = c_loc(elem_derived_omega_p) ! Copy cxx arrays back to f90 structures + if (.false.) then call t_startf('push_to_f90') call cxx_push_results_to_f90(elem_state_v_ptr, elem_state_temp_ptr, elem_state_dp3d_ptr, & elem_state_Qdp_ptr, elem_state_Q_ptr, elem_state_ps_v_ptr, & elem_derived_omega_p_ptr) call t_stopf('push_to_f90') + endif endif ! Print some diagnostic information diff --git a/components/homme/src/share/compose/compose_cedr.cpp b/components/homme/src/share/compose/compose_cedr.cpp index c996865c48d9..62ae64c46107 100644 --- a/components/homme/src/share/compose/compose_cedr.cpp +++ b/components/homme/src/share/compose/compose_cedr.cpp @@ -131,10 +131,12 @@ struct ExeSpaceUtils { } }; -#ifdef KOKKOS_ENABLE_CUDA +#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) template <> -struct ExeSpaceUtils { - using TeamPolicy = Kokkos::TeamPolicy; +//struct ExeSpaceUtils { +// using TeamPolicy = Kokkos::TeamPolicy; +struct ExeSpaceUtils { + using TeamPolicy = Kokkos::TeamPolicy; using Member = typename TeamPolicy::member_type; static TeamPolicy get_default_team_policy (int outer, int inner) { return TeamPolicy(outer, std::min(128, 32*((inner + 31)/32)), 1); @@ -3744,8 +3746,9 @@ template class cedr::qlt::QLT; #ifdef KOKKOS_ENABLE_OPENMP template class cedr::qlt::QLT; #endif -#ifdef KOKKOS_ENABLE_CUDA -template class cedr::qlt::QLT; +#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) +//template class cedr::qlt::QLT; +template class cedr::qlt::QLT; #endif #ifdef KOKKOS_ENABLE_THREADS template class cedr::qlt::QLT; @@ -4155,8 +4158,9 @@ template class cedr::caas::CAAS; #ifdef KOKKOS_ENABLE_OPENMP template class cedr::caas::CAAS; #endif -#ifdef KOKKOS_ENABLE_CUDA -template class cedr::caas::CAAS; +#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) +//template class cedr::caas::CAAS; +template class cedr::caas::CAAS; #endif #ifdef KOKKOS_ENABLE_THREADS template class cedr::caas::CAAS; diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index 2f1f36d46438..3e13ed0ccee6 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -124,6 +124,8 @@ team_num_threads_vectors_for_gpu ( assert(num_warps_total >= max_num_warps); assert(tp.max_threads_usable >= 1 && tp.max_vectors_usable >= 1); +#if 0 + int num_warps; if (tp.prefer_larger_team) { const int num_warps_usable = @@ -162,10 +164,8 @@ team_num_threads_vectors_for_gpu ( //TP(16,64) for 16*32, 16 // //printf("tp. prefer_threads: %4d %4d \n",num_threads, prevpow2(num_device_threads / num_threads)); -// return std::make_pair( num_threads, -// prevpow2(num_device_threads / num_threads) ); - -return std::make_pair( 32, 32); + return std::make_pair( num_threads, + prevpow2(num_device_threads / num_threads) ); } else { @@ -179,6 +179,11 @@ return std::make_pair( 32, 32); return std::make_pair( num_device_threads / num_vectors, num_vectors ); } +#endif + +return std::make_pair( 16,4 ); + + } } // namespace Parallel @@ -191,7 +196,7 @@ team_num_threads_vectors (const int num_parallel_iterations, // fewer than 4 warps/thread block limits the thread occupancy to that // number/4. That seems to be in Cuda specs, but I don't know of a function // that provides this number. Use a configuration option that defaults to 4. - const int min_num_warps = HOMMEXX_CUDA_MIN_WARP_PER_TEAM; + const int min_num_warps = 4; //HOMMEXX_CUDA_MIN_WARP_PER_TEAM; #ifdef KOKKOS_ENABLE_CUDA const int num_warps_device = Kokkos::Impl::cuda_internal_maximum_concurrent_block_count(); const int num_threads_warp = Kokkos::Impl::CudaTraits::WarpSize; @@ -218,13 +223,21 @@ team_num_threads_vectors (const int num_parallel_iterations, //use 64 wavefronts per CU and 120 CUs const int num_warps_device = 120*64; // no such thing Kokkos::Impl::hip_internal_maximum_warp_count(); - const int max_num_warps = 64; //cores per CU, SM ///HOMMEXX_CUDA_MAX_WARP_PER_TEAM; + const int max_num_warps = 40; //cores per CU, SM ///HOMMEXX_CUDA_MAX_WARP_PER_TEAM; const int num_threads_warp = Kokkos::Experimental::Impl::HIPTraits::WarpSize; //warpsize is 64 for hip //printf(" warpsize----------: %4d \n",num_threads_warp); //printf(" on GPU? %d \n", OnGpu::value); +//shows host +//#if __HIP_DEVICE_COMPILE__ == 1 +// printf("DEVICEEEEEEEEEEEEEEE \n"); +//#endif +//#ifndef __HIP_DEVICE_COMPILE__ +// printf("HOSTTTTT \n"); +//#endif + #else // I want thread-distribution rules to be unit-testable even when Cuda is diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.hpp b/components/homme/src/share/cxx/ExecSpaceDefs.hpp index a7b96190ff8d..995c3a4332cc 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.hpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.hpp @@ -146,7 +146,7 @@ struct DefaultThreadsDistribution { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE ExecSpaceType::thread_pool_size() #else - 1 //ExecSpaceType::impl_thread_pool_size() + ExecSpaceType::impl_thread_pool_size() #endif , num_parallel_iterations, tp); } diff --git a/components/homme/src/theta-l_kokkos/prim_driver_mod.F90 b/components/homme/src/theta-l_kokkos/prim_driver_mod.F90 index 8d1620a3582f..b6e160e62067 100644 --- a/components/homme/src/theta-l_kokkos/prim_driver_mod.F90 +++ b/components/homme/src/theta-l_kokkos/prim_driver_mod.F90 @@ -371,10 +371,13 @@ subroutine prim_run_subcycle(elem, hybrid, nets, nete, dt, single_column, tl, hv #ifndef SCREAM ! Scream already computes all forcing using the same pointers ! stored in Hommexx, so the forcing is already up to date + + if(.false.) then call t_startf('push_to_cxx') call push_forcing_to_c(elem_derived_FM, elem_derived_FVTheta, elem_derived_FT, & elem_derived_FPHI, elem_derived_FQ) call t_stopf('push_to_cxx') + endif #endif call prim_run_subcycle_c(dt,nstep_c,nm1_c,n0_c,np1_c,nextOutputStep) From 3d3cd6da0bd331c781bc8601001340095baf5187 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 10 Nov 2021 15:38:20 -0500 Subject: [PATCH 07/62] revert nonsgnificant changes --- components/homme/cmake/HommeMacros.cmake | 4 ++-- components/homme/test_execs/CMakeLists.txt | 12 ++++++------ .../homme/test_execs/preqx_kokkos_ut/CMakeLists.txt | 2 +- .../homme/test_execs/thetal_kokkos_ut/CMakeLists.txt | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/components/homme/cmake/HommeMacros.cmake b/components/homme/cmake/HommeMacros.cmake index 7657ffcb8e9a..6aeb5d7446a8 100644 --- a/components/homme/cmake/HommeMacros.cmake +++ b/components/homme/cmake/HommeMacros.cmake @@ -140,7 +140,7 @@ macro(createTestExec execName execType macroNP macroNC ENDIF () IF (HOMME_USE_KOKKOS) - TARGET_LINK_LIBRARIES(${execName} kokkoscore) + TARGET_LINK_LIBRARIES(${execName} kokkos) ENDIF () # Move the module files out of the way so the parallel build @@ -237,7 +237,7 @@ macro(createExecLib libName execType libSrcs inclDirs macroNP TARGET_LINK_LIBRARIES(${libName} timing ${COMPOSE_LIBRARY} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) IF (HOMME_USE_KOKKOS) - TARGET_LINK_LIBRARIES(${libName} kokkoscore) + TARGET_LINK_LIBRARIES(${libName} kokkos) ENDIF () IF (NOT HOMME_USE_MKL) diff --git a/components/homme/test_execs/CMakeLists.txt b/components/homme/test_execs/CMakeLists.txt index 381cabbb0785..0f22e9de851a 100644 --- a/components/homme/test_execs/CMakeLists.txt +++ b/components/homme/test_execs/CMakeLists.txt @@ -239,9 +239,9 @@ IF (${BUILD_HOMME_THETA_KOKKOS}) ADD_SUBDIRECTORY(theta-nlev128) ENDIF() - # IF (HOMMEXX_BFB_TESTING) + IF (HOMMEXX_BFB_TESTING) ADD_SUBDIRECTORY(thetal_kokkos_ut) - #ENDIF() + ENDIF() endif() ENDIF() @@ -256,19 +256,19 @@ IF(${BUILD_HOMME_SWIM}) ADD_SUBDIRECTORY(swim5) ENDIF() -#IF (HOMMEXX_BFB_TESTING) +IF (HOMMEXX_BFB_TESTING) # Test some stuff in the shared folder ADD_SUBDIRECTORY (share_kokkos_ut) - #ENDIF () +ENDIF () IF (${BUILD_HOMME_PREQX_KOKKOS}) ADD_SUBDIRECTORY(preqx-nlev26) ADD_SUBDIRECTORY(preqx-nlev26-kokkos) ADD_SUBDIRECTORY(preqx-nlev72) ADD_SUBDIRECTORY(preqx-nlev72-kokkos) - # IF (HOMMEXX_BFB_TESTING) + IF (HOMMEXX_BFB_TESTING) ADD_SUBDIRECTORY(preqx_kokkos_ut) - #ENDIF () + ENDIF () ENDIF () # Read the test-list.cmake file to get the HOMME_TESTS list diff --git a/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt b/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt index 94176f27b487..6d6d68597cdc 100644 --- a/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt +++ b/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt @@ -23,7 +23,7 @@ ADD_LIBRARY(preqx_kokkos_ut_lib ${PREQX_DEPS}) TARGET_INCLUDE_DIRECTORIES(preqx_kokkos_ut_lib PUBLIC ${EXEC_INCLUDE_DIRS}) TARGET_INCLUDE_DIRECTORIES(preqx_kokkos_ut_lib PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) TARGET_COMPILE_DEFINITIONS(preqx_kokkos_ut_lib PUBLIC "HAVE_CONFIG_H") -TARGET_LINK_LIBRARIES(preqx_kokkos_ut_lib kokkoscore) +TARGET_LINK_LIBRARIES(preqx_kokkos_ut_lib kokkos) TARGET_LINK_LIBRARIES(preqx_kokkos_ut_lib timing ${COMPOSE_LIBRARY} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) IF(NOT BUILD_HOMME_WITHOUT_PIOLIBRARY) IF(HOMME_USE_SCORPIO) diff --git a/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt b/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt index 15691cacb35e..1981aecb3487 100644 --- a/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt +++ b/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt @@ -25,7 +25,7 @@ ADD_LIBRARY(thetal_kokkos_ut_lib ${THETAL_DEPS}) TARGET_INCLUDE_DIRECTORIES(thetal_kokkos_ut_lib PUBLIC ${EXEC_INCLUDE_DIRS}) TARGET_INCLUDE_DIRECTORIES(thetal_kokkos_ut_lib PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) TARGET_COMPILE_DEFINITIONS(thetal_kokkos_ut_lib PUBLIC "HAVE_CONFIG_H") -TARGET_LINK_LIBRARIES(thetal_kokkos_ut_lib kokkoscore) +TARGET_LINK_LIBRARIES(thetal_kokkos_ut_lib kokkos) TARGET_LINK_LIBRARIES(thetal_kokkos_ut_lib timing ${COMPOSE_LIBRARY} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) IF(NOT BUILD_HOMME_WITHOUT_PIOLIBRARY) IF(HOMME_USE_SCORPIO) @@ -41,7 +41,7 @@ SET_TARGET_PROPERTIES(thetal_kokkos_ut_lib PROPERTIES Fortran_MODULE_DIRECTORY $ # Setting this once and for all, good for all unit tests. SET (CONFIG_DEFINES HAVE_CONFIG_H) -#IF (HOMMEXX_BFB_TESTING) +IF (HOMMEXX_BFB_TESTING) ### Equation of state unit tests SET (EOS_UT_CXX_SRCS @@ -233,7 +233,7 @@ SET (CONFIG_DEFINES HAVE_CONFIG_H) ENDIF() cxx_unit_test (remap_theta_ut "${REMAP_THETA_UT_F90_SRCS}" "${REMAP_THETA_UT_CXX_SRCS}" "${REMAP_THETA_UT_INCLUDE_DIRS}" "${CONFIG_DEFINES}" ${NUM_CPUS}) TARGET_LINK_LIBRARIES(remap_theta_ut thetal_kokkos_ut_lib) -#ENDIF () +ENDIF () # ### DIRK functor unit test From dfee17bee957a5bdc5437e6112fb48cf94cf2584 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 10 Nov 2021 15:42:03 -0500 Subject: [PATCH 08/62] revert nonsgnificant changes --- components/homme/test/unit_tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/homme/test/unit_tests/CMakeLists.txt b/components/homme/test/unit_tests/CMakeLists.txt index 6ae4a68392d0..7b7e50362f40 100644 --- a/components/homme/test/unit_tests/CMakeLists.txt +++ b/components/homme/test/unit_tests/CMakeLists.txt @@ -13,7 +13,7 @@ macro(cxx_unit_test target_name target_f90_srcs target_cxx_srcs include_dirs con #ENDIF() SET_TESTS_PROPERTIES(${target_name}_test PROPERTIES LABELS "unit") - TARGET_LINK_LIBRARIES(${target_name} timing ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} kokkoscore) + TARGET_LINK_LIBRARIES(${target_name} timing ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} kokkos) STRING(TOUPPER "${PERFORMANCE_PROFILE}" PERF_PROF_UPPER) IF ("${PERF_PROF_UPPER}" STREQUAL "VTUNE") From 3523d5dfd4cdb2d360823f7016393bb8bbf34740 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 10 Nov 2021 16:01:22 -0500 Subject: [PATCH 09/62] reverting --- .../homme/src/share/cxx/ExecSpaceDefs.cpp | 34 ++----------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index 3e13ed0ccee6..d4d98f61f29e 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -124,8 +124,6 @@ team_num_threads_vectors_for_gpu ( assert(num_warps_total >= max_num_warps); assert(tp.max_threads_usable >= 1 && tp.max_vectors_usable >= 1); -#if 0 - int num_warps; if (tp.prefer_larger_team) { const int num_warps_usable = @@ -160,10 +158,6 @@ team_num_threads_vectors_for_gpu ( num_device_threads : tp.max_threads_usable ); -//TP(16,64) for 120*64, 64 -//TP(16,64) for 16*32, 16 -// -//printf("tp. prefer_threads: %4d %4d \n",num_threads, prevpow2(num_device_threads / num_threads)); return std::make_pair( num_threads, prevpow2(num_device_threads / num_threads) ); @@ -173,15 +167,12 @@ team_num_threads_vectors_for_gpu ( num_device_threads : tp.max_vectors_usable ); -//printf("NOT tp. prefer_threads: %4d %4d \n",num_device_threads / num_vectors, -// num_vectors); - return std::make_pair( num_device_threads / num_vectors, num_vectors ); } -#endif -return std::make_pair( 16,4 ); +//manual override for HIP +//return std::make_pair( 16,4 ); } @@ -211,33 +202,14 @@ team_num_threads_vectors (const int num_parallel_iterations, # else const int max_num_warps = HOMMEXX_CUDA_MAX_WARP_PER_TEAM; //Kokkos::Impl::cuda_internal_maximum_grid_count(); # endif -#else - -#if HIP_BUILD - -//to make sure this is active, it is -//#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) - -//however slow, combination 16*32 and 16 below ran with all ne +#elif HIP_BUILD //use 64 wavefronts per CU and 120 CUs const int num_warps_device = 120*64; // no such thing Kokkos::Impl::hip_internal_maximum_warp_count(); const int max_num_warps = 40; //cores per CU, SM ///HOMMEXX_CUDA_MAX_WARP_PER_TEAM; const int num_threads_warp = Kokkos::Experimental::Impl::HIPTraits::WarpSize; -//warpsize is 64 for hip -//printf(" warpsize----------: %4d \n",num_threads_warp); -//printf(" on GPU? %d \n", OnGpu::value); - -//shows host -//#if __HIP_DEVICE_COMPILE__ == 1 -// printf("DEVICEEEEEEEEEEEEEEE \n"); -//#endif -//#ifndef __HIP_DEVICE_COMPILE__ -// printf("HOSTTTTT \n"); -//#endif - #else // I want thread-distribution rules to be unit-testable even when Cuda is From 0a2d96d64b073c92e1d2fef927ff2c94465093f9 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 10 Nov 2021 16:05:32 -0500 Subject: [PATCH 10/62] clean up --- components/homme/CMakeLists.txt | 36 +++++++++---------- .../homme/src/share/cxx/KernelVariables.hpp | 19 +++++----- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/components/homme/CMakeLists.txt b/components/homme/CMakeLists.txt index 13d0a5edf09c..55a2594b7611 100644 --- a/components/homme/CMakeLists.txt +++ b/components/homme/CMakeLists.txt @@ -177,23 +177,22 @@ IF (${HOMME_USE_KOKKOS}) SET (HOMMEXX_EXEC_SPACE "Default" CACHE STRING "Select the kokkos exec space") -################!!!!!!!!!!!! ????????????? STRING (TOUPPER ${HOMMEXX_EXEC_SPACE} HOMMEXX_EXEC_SPACE_UPPER) -# IF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "HIP") -# SET (HOMMEXX_HIP_SPACE ON) -# ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "CUDA") -# SET (HOMMEXX_CUDA_SPACE ON) -# ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "OPENMP") -# SET (HOMMEXX_OPENMP_SPACE ON) -# ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "THREADS") -# SET (HOMMEXX_THREADS_SPACE ON) -# ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "SERIAL") -# SET (HOMMEXX_SERIAL_SPACE ON) -# ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "DEFAULT") -# SET (HOMMEXX_DEFAULT_SPACE ON) -# ELSE() -# MESSAGE (ABORT "Invalid choice for 'HOMMEXX_EXEC_SPACE'. Valid options (case insensitive) are 'Cuda', 'OpenMP', 'Threads', 'Serial', 'Default'") -# ENDIF() + IF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "HIP") + SET (HOMMEXX_HIP_SPACE ON) + ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "CUDA") + SET (HOMMEXX_CUDA_SPACE ON) + ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "OPENMP") + SET (HOMMEXX_OPENMP_SPACE ON) + ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "THREADS") + SET (HOMMEXX_THREADS_SPACE ON) + ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "SERIAL") + SET (HOMMEXX_SERIAL_SPACE ON) + ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "DEFAULT") + SET (HOMMEXX_DEFAULT_SPACE ON) + ELSE() + MESSAGE (ABORT "Invalid choice for 'HOMMEXX_EXEC_SPACE'. Valid options (case insensitive) are 'Cuda', 'OpenMP', 'Threads', 'Serial', 'Default'") + ENDIF() # Execution space parameters. 8 is a nice size for V100. SET (HOMMEXX_CUDA_MIN_WARP_PER_TEAM 8 CACHE STRING "Minimum number of warps to get 100% occoupancy on GPU") @@ -269,7 +268,6 @@ IF (${HOMME_USE_KOKKOS}) endif() IF (CUDA_BUILD OR HIP) - message("HERE WE ARE--------------------------------------------") SET (DEFAULT_VECTOR_SIZE 1) ELSE () SET (DEFAULT_VECTOR_SIZE 8) @@ -411,10 +409,10 @@ IF (${HOMME_USE_KOKKOS}) ENDIF () # This folder contains the CMake macro used to build cxx unit tests -#IF (HOMMEXX_BFB_TESTING) +IF (HOMMEXX_BFB_TESTING) # Add unit tests for C++ code ADD_SUBDIRECTORY(test/unit_tests) -#ENDIF () +ENDIF () # Add libcompose, consisting of just C++ files, so we can build it just once for # all exes. It does not depend on compile-time sizes. diff --git a/components/homme/src/share/cxx/KernelVariables.hpp b/components/homme/src/share/cxx/KernelVariables.hpp index f38d131b393f..06dcd3504667 100644 --- a/components/homme/src/share/cxx/KernelVariables.hpp +++ b/components/homme/src/share/cxx/KernelVariables.hpp @@ -36,16 +36,15 @@ struct KernelVariables { get_team_idx(const int /*team_size*/, const int league_rank) { return league_rank; } -//#else -// template -// static KOKKOS_INLINE_FUNCTION typename std::enable_if< -// OnGpu::value, int>::type -// get_team_idx(const int /*team_size*/, const int /*league_rank*/) { -// assert(false); // should never happen -// return -1; -// } -//#endif // __CUDA_ARCH__ -#endif // KOKKOS_ENABLE_CUDA +#else + template + static KOKKOS_INLINE_FUNCTION typename std::enable_if< + OnGpu::value, int>::type + get_team_idx(const int /*team_size*/, const int /*league_rank*/) { + assert(false); // should never happen + return -1; + } +#endif // KOKKOS_ENABLE_CUDA or HIP #ifdef KOKKOS_ENABLE_OPENMP template From 5460f986f292f6b3c0bc3750c24723580352dc43 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 10 Nov 2021 16:06:50 -0500 Subject: [PATCH 11/62] clean up --- components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp b/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp index 1cba57a95913..a7a7347dd66f 100644 --- a/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp +++ b/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp @@ -218,14 +218,6 @@ struct CaarFunctorImpl { const int igp = idx / NP; const int jgp = idx % NP; -//#if __HIP_DEVICE_COMPILE__ == 1 -// printf("DEVICEEEEEEEEEEEEEEE \n"); -//#endif -//#ifndef __HIP_DEVICE_COMPILE__ -// printf("HOSTTTTT \n"); -//#endif - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV), [&] (const int& ilev) { // pre-fill energy_grad with the pressure(_grad)-temperature part m_buffers.energy_grad(kv.team_idx, 0, igp, jgp, ilev) = From ec3308a86fbfa6f8250e046d9efb17c20337e5af Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 10 Nov 2021 16:13:08 -0500 Subject: [PATCH 12/62] reverting --- components/homme/src/share/cxx/ExecSpaceDefs.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index d4d98f61f29e..a9a31743d014 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -156,12 +156,9 @@ team_num_threads_vectors_for_gpu ( if (tp.prefer_threads) { const int num_threads = ( (tp.max_threads_usable > num_device_threads) ? num_device_threads : - tp.max_threads_usable ); - + tp.max_threads_usable ); return std::make_pair( num_threads, prevpow2(num_device_threads / num_threads) ); - - } else { const int num_vectors = prevpow2( (tp.max_vectors_usable > num_device_threads) ? num_device_threads : @@ -187,7 +184,7 @@ team_num_threads_vectors (const int num_parallel_iterations, // fewer than 4 warps/thread block limits the thread occupancy to that // number/4. That seems to be in Cuda specs, but I don't know of a function // that provides this number. Use a configuration option that defaults to 4. - const int min_num_warps = 4; //HOMMEXX_CUDA_MIN_WARP_PER_TEAM; + const int min_num_warps = HOMMEXX_CUDA_MIN_WARP_PER_TEAM; #ifdef KOKKOS_ENABLE_CUDA const int num_warps_device = Kokkos::Impl::cuda_internal_maximum_concurrent_block_count(); const int num_threads_warp = Kokkos::Impl::CudaTraits::WarpSize; From 909334642e7bd26fc05a294a4711535fa9ae67b8 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 10 Nov 2021 16:58:39 -0500 Subject: [PATCH 13/62] fix ifdef, add prnts for the last pair --- components/homme/src/share/cxx/ExecSpaceDefs.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index a9a31743d014..966a7bd10695 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -157,6 +157,9 @@ team_num_threads_vectors_for_gpu ( const int num_threads = ( (tp.max_threads_usable > num_device_threads) ? num_device_threads : tp.max_threads_usable ); + +printf("tp.prefer_threads: %4d %4d \n",num_threads, prevpow2(num_device_threads / num_threads)); + return std::make_pair( num_threads, prevpow2(num_device_threads / num_threads) ); } else { @@ -164,6 +167,9 @@ team_num_threads_vectors_for_gpu ( num_device_threads : tp.max_vectors_usable ); +printf("NOT tp. prefer_threads: %4d %4d \n",num_device_threads / num_vectors, + num_vectors); + return std::make_pair( num_device_threads / num_vectors, num_vectors ); } @@ -215,9 +221,6 @@ team_num_threads_vectors (const int num_parallel_iterations, const int num_threads_warp = 32; const int max_num_warps = 16; -#endif - - #endif return Parallel::team_num_threads_vectors_for_gpu( From ad20860e809025c79f7ba5893f24fd44957cc21d Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 10 Nov 2021 17:19:13 -0500 Subject: [PATCH 14/62] mods for cmake file, comment out printing pairs --- .../homme/cmake/machineFiles/summit-gpumpi.cmake | 11 ++++++----- components/homme/src/share/cxx/ExecSpaceDefs.cpp | 6 +++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/components/homme/cmake/machineFiles/summit-gpumpi.cmake b/components/homme/cmake/machineFiles/summit-gpumpi.cmake index 7bd33ca9bc27..6f6f4c73413f 100644 --- a/components/homme/cmake/machineFiles/summit-gpumpi.cmake +++ b/components/homme/cmake/machineFiles/summit-gpumpi.cmake @@ -2,13 +2,13 @@ #bsub -W 2:00 -nnodes 1 -P cli115 -Is /bin/bash -#cmake -C ~/acme-fork-lb/components/homme/cmake/machineFiles/summit.cmake -DHOMMEXX_MPI_ON_DEVICE=FALSE ~/acme-fork-lb/components/homme/ - #SET (HOMMEXX_MPI_ON_DEVICE FALSE CACHE BOOL "") SET (HOMMEXX_CUDA_MAX_WARP_PER_TEAM "16" CACHE STRING "") -SET (NETCDF_DIR $ENV{OLCF_NETCDF_FORTRAN_ROOT} CACHE FILEPATH "") -SET (HDF5_DIR $ENV{OLCF_HDF5_ROOT} CACHE FILEPATH "") +#SET (NETCDF_DIR $ENV{OLCF_NETCDF_FORTRAN_ROOT} CACHE FILEPATH "") +#SET (NetCDF_Fortran_PATH "/sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-7.5.0/netcdf-fortran-4.4.5-e2hkh7w3253wz5uubjxbbvh56a7xjl7n" CACHE STRING "") +#SET(NetCDF_C_LIBRARY "/sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-7.5.0/netcdf-c-4.8.0-pwi4jbrnwv4lrrjxdu5czbos5uvvjgvr/lib" CACHE STRING "") +#SET(NetCDF_C_INCLUDE_DIR "/sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-7.5.0/netcdf-c-4.8.0-pwi4jbrnwv4lrrjxdu5czbos5uvvjgvr/include" CACHE STRING "") SET(BUILD_HOMME_WITHOUT_PIOLIBRARY TRUE CACHE BOOL "") @@ -22,6 +22,7 @@ SET(ENABLE_CUDA FALSE CACHE BOOL "") SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") +#SET(HOMME_ENABLE_COMPOSE TRUE CACHE BOOL "") SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") #SET (HOMMEXX_BFB_TESTING TRUE CACHE BOOL "") @@ -36,7 +37,7 @@ SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") SET(CMAKE_C_COMPILER "mpicc" CACHE STRING "") SET(CMAKE_Fortran_COMPILER "mpifort" CACHE STRING "") -SET(CMAKE_CXX_COMPILER "/ccs/home/onguba/kokkos/bin/nvcc_wrapper" CACHE STRING "") +SET(CMAKE_CXX_COMPILER "/ccs/home/onguba/acme-MASTER-GB/externals/kokkos/bin/nvcc_wrapper" CACHE STRING "") set (ENABLE_OPENMP OFF CACHE BOOL "") set (ENABLE_COLUMN_OPENMP OFF CACHE BOOL "") diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index 966a7bd10695..926d7ca09baf 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -158,7 +158,7 @@ team_num_threads_vectors_for_gpu ( num_device_threads : tp.max_threads_usable ); -printf("tp.prefer_threads: %4d %4d \n",num_threads, prevpow2(num_device_threads / num_threads)); +//printf("tp.prefer_threads: %4d %4d \n",num_threads, prevpow2(num_device_threads / num_threads)); return std::make_pair( num_threads, prevpow2(num_device_threads / num_threads) ); @@ -167,8 +167,8 @@ printf("tp.prefer_threads: %4d %4d \n",num_threads, prevpow2(num_device_threads num_device_threads : tp.max_vectors_usable ); -printf("NOT tp. prefer_threads: %4d %4d \n",num_device_threads / num_vectors, - num_vectors); +//printf("NOT tp. prefer_threads: %4d %4d \n",num_device_threads / num_vectors, +// num_vectors); return std::make_pair( num_device_threads / num_vectors, num_vectors ); From 2f173cf4e0adb7e3abf846b3804518bead7b2c00 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 10 Nov 2021 17:31:54 -0500 Subject: [PATCH 15/62] set the last pair for hip manuall --- components/homme/src/share/cxx/ExecSpaceDefs.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index 926d7ca09baf..fa3e9c5757b7 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -124,6 +124,7 @@ team_num_threads_vectors_for_gpu ( assert(num_warps_total >= max_num_warps); assert(tp.max_threads_usable >= 1 && tp.max_vectors_usable >= 1); +#if !HIP_BUILD int num_warps; if (tp.prefer_larger_team) { const int num_warps_usable = @@ -173,10 +174,10 @@ team_num_threads_vectors_for_gpu ( return std::make_pair( num_device_threads / num_vectors, num_vectors ); } - +#else //manual override for HIP -//return std::make_pair( 16,4 ); - +return std::make_pair( 16,4 ); +#endif } From b97eb9432a3571620116e430530bf4bb0f7d1a0a Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 10 Nov 2021 21:55:50 -0500 Subject: [PATCH 16/62] add flags --- components/homme/cmake/machineFiles/spock-gpumpi.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/components/homme/cmake/machineFiles/spock-gpumpi.cmake b/components/homme/cmake/machineFiles/spock-gpumpi.cmake index ce91b0d6a744..b3a2b27b3ba8 100644 --- a/components/homme/cmake/machineFiles/spock-gpumpi.cmake +++ b/components/homme/cmake/machineFiles/spock-gpumpi.cmake @@ -46,10 +46,10 @@ SET(MPICH_DIR "/opt/cray/pe/mpich/8.1.7/ofi/crayclang/10.0" CACHE STRING "") SET(Extrae_LIBRARY "-I${MPICH_DIR}/include -L${MPICH_DIR}/lib -lmpi -L/opt/cray/pe/mpich/8.1.7/gtl/lib -lmpi_gtl_hsa" CACHE STRING "") -SET(ADD_Fortran_FLAGS "${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") -SET(ADD_C_FLAGS "${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") -SET(ADD_CXX_FLAGS "${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") -SET(ADD_LINKER_FLAGS "${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") +SET(ADD_Fortran_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") +SET(ADD_C_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") +SET(ADD_CXX_FLAGS "-O3 -DNDEBUG --amdgpu-target=gfx908 -fno-gpu-rdc ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") +SET(ADD_LINKER_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") set (ENABLE_OPENMP OFF CACHE BOOL "") From bf6f042ae82f21cc3f274884867dfa5ef774900a Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Sat, 5 Feb 2022 15:35:18 -0500 Subject: [PATCH 17/62] newer environment --- components/homme/cmake/machineFiles/spock-gpumpi.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/homme/cmake/machineFiles/spock-gpumpi.cmake b/components/homme/cmake/machineFiles/spock-gpumpi.cmake index b3a2b27b3ba8..08dda79db79a 100644 --- a/components/homme/cmake/machineFiles/spock-gpumpi.cmake +++ b/components/homme/cmake/machineFiles/spock-gpumpi.cmake @@ -42,9 +42,9 @@ SET(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "") SET(E3SM_KOKKOS_PATH "/ccs/home/onguba/kokkos-example-spock-hipcc/bld-hipcc" CACHE STRING "") #not the proper way!!! -SET(MPICH_DIR "/opt/cray/pe/mpich/8.1.7/ofi/crayclang/10.0" CACHE STRING "") +SET(MPICH_DIR "/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0" CACHE STRING "") -SET(Extrae_LIBRARY "-I${MPICH_DIR}/include -L${MPICH_DIR}/lib -lmpi -L/opt/cray/pe/mpich/8.1.7/gtl/lib -lmpi_gtl_hsa" CACHE STRING "") +SET(Extrae_LIBRARY "-I${MPICH_DIR}/include -L${MPICH_DIR}/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "") SET(ADD_Fortran_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") SET(ADD_C_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") From 6689abd53bfbe456f87ef756802897fe76ae1bb8 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 5 Apr 2022 20:40:53 -0400 Subject: [PATCH 18/62] update kokkos build made with a new env --- components/homme/cmake/machineFiles/spock-gpumpi.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/homme/cmake/machineFiles/spock-gpumpi.cmake b/components/homme/cmake/machineFiles/spock-gpumpi.cmake index 08dda79db79a..166421109e6d 100644 --- a/components/homme/cmake/machineFiles/spock-gpumpi.cmake +++ b/components/homme/cmake/machineFiles/spock-gpumpi.cmake @@ -39,7 +39,7 @@ SET(CMAKE_C_COMPILER "cc" CACHE STRING "") SET(CMAKE_Fortran_COMPILER "ftn" CACHE STRING "") SET(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "") #SET(CMAKE_CXX_COMPILER "/ccs/home/onguba/kokkos/bin/nvcc_wrapper" CACHE STRING "") -SET(E3SM_KOKKOS_PATH "/ccs/home/onguba/kokkos-example-spock-hipcc/bld-hipcc" CACHE STRING "") +SET(E3SM_KOKKOS_PATH "/ccs/home/onguba/kokkos-example-spock-hipcc2/bld-hipcc" CACHE STRING "") #not the proper way!!! SET(MPICH_DIR "/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0" CACHE STRING "") From cfb8dfd763a3821d23e561957cccbb7168c1fabe Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 5 Apr 2022 20:44:58 -0400 Subject: [PATCH 19/62] minor, 1 comment on LB, commenting out pow call that wasnt implemented --- components/homme/src/share/cxx/EulerStepFunctorImpl.hpp | 4 +++- components/homme/src/share/cxx/HybridVCoord.cpp | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp b/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp index e7746f2d83b1..e317c9401f18 100644 --- a/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp +++ b/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp @@ -413,7 +413,9 @@ class EulerStepFunctorImpl { ExecSpace::impl_static_fence(); m_kernel_will_run_limiters = true; Kokkos::parallel_for( - Homme::get_default_team_policy( + //to play with launch bounds + //Homme::get_default_team_policy >( + Homme::get_default_team_policy( m_geometry.num_elems() * m_data.qsize, m_tpref), *this); ExecSpace::impl_static_fence(); diff --git a/components/homme/src/share/cxx/HybridVCoord.cpp b/components/homme/src/share/cxx/HybridVCoord.cpp index 3b45fdc7cbdb..3f17fd03c996 100644 --- a/components/homme/src/share/cxx/HybridVCoord.cpp +++ b/components/homme/src/share/cxx/HybridVCoord.cpp @@ -227,7 +227,8 @@ void HybridVCoord::compute_eta () Kokkos::parallel_for(Kokkos::RangePolicy(0,NUM_LEV), KOKKOS_LAMBDA(const int& ilev){ l_etam(ilev) = l_hybrid_am(ilev) + l_hybrid_bm(ilev); - l_exner0(ilev) = pow(l_etam(ilev)*l_ps0/PhysicalConstants::p0,PhysicalConstants::kappa); + //l_exner0(ilev) = pow(l_etam(ilev)*l_ps0/PhysicalConstants::p0,PhysicalConstants::kappa); + l_exner0(ilev) = 1.0; }); Kokkos::parallel_for(Kokkos::RangePolicy(0,NUM_INTERFACE_LEV), KOKKOS_LAMBDA(const int& ilev){ From cced5d9a31938b8306c2919f5b916e2d7c761394 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Mon, 13 Jun 2022 15:49:02 -0400 Subject: [PATCH 20/62] add crusher cmake --- .../cmake/machineFiles/crusher-gpumpi.cmake | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 components/homme/cmake/machineFiles/crusher-gpumpi.cmake diff --git a/components/homme/cmake/machineFiles/crusher-gpumpi.cmake b/components/homme/cmake/machineFiles/crusher-gpumpi.cmake new file mode 100644 index 000000000000..c764a4ceca67 --- /dev/null +++ b/components/homme/cmake/machineFiles/crusher-gpumpi.cmake @@ -0,0 +1,64 @@ +#interactive job +#bsub -W 2:00 -nnodes 1 -P cli115 -Is /bin/bash + + +#cmake -C ~/acme-fork-lb/components/homme/cmake/machineFiles/summit.cmake -DHOMMEXX_MPI_ON_DEVICE=FALSE ~/acme-fork-lb/components/homme/ + +#SET (HOMMEXX_MPI_ON_DEVICE FALSE CACHE BOOL "") +SET (HOMMEXX_CUDA_MAX_WARP_PER_TEAM "16" CACHE STRING "") + +SET (NETCDF_DIR $ENV{OLCF_NETCDF_FORTRAN_ROOT} CACHE FILEPATH "") +SET (HDF5_DIR $ENV{OLCF_HDF5_ROOT} CACHE FILEPATH "") + +SET(BUILD_HOMME_WITHOUT_PIOLIBRARY TRUE CACHE BOOL "") + +SET(HOMME_FIND_BLASLAPACK TRUE CACHE BOOL "") + +SET(WITH_PNETCDF FALSE CACHE FILEPATH "") + +SET(USE_QUEUING FALSE CACHE BOOL "") + +SET(ENABLE_CUDA FALSE CACHE BOOL "") + +SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") +SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") +SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") + +#SET (HOMMEXX_BFB_TESTING TRUE CACHE BOOL "") + +SET(USE_TRILINOS OFF CACHE BOOL "") + +SET(HIP TRUE CACHE BOOL "") +#SET(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "") +#SET(Kokkos_ENABLE_CUDA OFF CACHE BOOL "") +#SET(Kokkos_ENABLE_CUDA_LAMBDA OFF CACHE BOOL "") +#SET(Kokkos_ARCH_VEGA908 ON CACHE BOOL "") +#SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") + +SET(CMAKE_C_COMPILER "cc" CACHE STRING "") +SET(CMAKE_Fortran_COMPILER "ftn" CACHE STRING "") +SET(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "") +#SET(CMAKE_CXX_COMPILER "/ccs/home/onguba/kokkos/bin/nvcc_wrapper" CACHE STRING "") +SET(E3SM_KOKKOS_PATH "/ccs/home/onguba/kokkos-crusher-june2022/bld-hipcc" CACHE STRING "") + +#not the proper way!!! +SET(MPICH_DIR "/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0" CACHE STRING "") + +SET(Extrae_LIBRARY "-I${MPICH_DIR}/include -L${MPICH_DIR}/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "") + +SET(ADD_Fortran_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") +SET(ADD_C_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") +SET(ADD_CXX_FLAGS "-std=c++14 -O3 -DNDEBUG --amdgpu-target=gfx90a -fno-gpu-rdc ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") +SET(ADD_LINKER_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") + + +set (ENABLE_OPENMP OFF CACHE BOOL "") +set (ENABLE_COLUMN_OPENMP OFF CACHE BOOL "") +set (ENABLE_HORIZ_OPENMP OFF CACHE BOOL "") + +set (HOMME_TESTING_PROFILE "dev" CACHE STRING "") + +set (USE_NUM_PROCS 4 CACHE STRING "") + +#set (OPT_FLAGS "-mcpu=power9 -mtune=power9" CACHE STRING "") +SET (USE_MPI_OPTIONS "--bind-to core" CACHE FILEPATH "") From 41f0c42e4550e8d1782153a650659c38e21846f6 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 21 Jun 2022 13:27:14 -0400 Subject: [PATCH 21/62] fix bug from merging --- components/homme/src/share/cxx/HybridVCoord.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/components/homme/src/share/cxx/HybridVCoord.cpp b/components/homme/src/share/cxx/HybridVCoord.cpp index b5fbc7e5e9e8..569eef1d6849 100644 --- a/components/homme/src/share/cxx/HybridVCoord.cpp +++ b/components/homme/src/share/cxx/HybridVCoord.cpp @@ -225,8 +225,6 @@ void HybridVCoord::compute_eta () const auto p0 = PhysicalConstants::p0; const auto kappa = PhysicalConstants::kappa; - auto l_ps0 = ps0; - Kokkos::parallel_for(Kokkos::RangePolicy(0,NUM_LEV), KOKKOS_LAMBDA(const int& ilev){ l_etam(ilev) = l_hybrid_am(ilev) + l_hybrid_bm(ilev); From 62db3fc0f181e40bf5a8607456839723cfa01de1 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 22 Jun 2022 16:15:31 -0500 Subject: [PATCH 22/62] clean comments, remove if-statement in preqx push logic --- components/homme/src/preqx_kokkos/prim_driver_mod.F90 | 2 -- components/homme/src/share/cxx/Config.hpp | 5 ----- components/homme/src/share/cxx/ExecSpaceDefs.cpp | 1 - components/homme/src/share/cxx/ExecSpaceDefs.hpp | 2 -- components/homme/src/share/cxx/Hommexx_config.h.in | 3 --- components/homme/src/share/cxx/kokkos_utils.hpp | 6 ------ 6 files changed, 19 deletions(-) diff --git a/components/homme/src/preqx_kokkos/prim_driver_mod.F90 b/components/homme/src/preqx_kokkos/prim_driver_mod.F90 index beee7f54b0a7..549a198cbc64 100644 --- a/components/homme/src/preqx_kokkos/prim_driver_mod.F90 +++ b/components/homme/src/preqx_kokkos/prim_driver_mod.F90 @@ -293,13 +293,11 @@ end subroutine cxx_push_results_to_f90 elem_derived_omega_p_ptr = c_loc(elem_derived_omega_p) ! Copy cxx arrays back to f90 structures - if (.false.) then call t_startf('push_to_f90') call cxx_push_results_to_f90(elem_state_v_ptr, elem_state_temp_ptr, elem_state_dp3d_ptr, & elem_state_Qdp_ptr, elem_state_Q_ptr, elem_state_ps_v_ptr, & elem_derived_omega_p_ptr) call t_stopf('push_to_f90') - endif endif ! Print some diagnostic information diff --git a/components/homme/src/share/cxx/Config.hpp b/components/homme/src/share/cxx/Config.hpp index e4ed00f14100..7c4df9733a5b 100644 --- a/components/homme/src/share/cxx/Config.hpp +++ b/components/homme/src/share/cxx/Config.hpp @@ -13,7 +13,6 @@ # include "config.h.c" # endif #else -///!!!!!!!!!!!!!!! not relevant // Establish a good candidate vector size for eam builds # ifdef CUDA_BUILD # define HOMMEXX_VECTOR_SIZE 1 @@ -22,20 +21,16 @@ # endif #endif - -////!!!!!!!!!!! #if ! defined HOMMEXX_CUDA_SPACE && ! defined HOMMEXX_OPENMP_SPACE && ! defined HOMMEXX_THREADS_SPACE && ! defined HOMMEXX_SERIAL_SPACE && ! defined HOMMEXX_HIP_SPACE # define HOMMEXX_DEFAULT_SPACE #endif -//?????? #ifndef HOMMEXX_MPI_ON_DEVICE # define HOMMEXX_MPI_ON_DEVICE 1 #endif #include -/////!!!!!!! #if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) # ifndef HOMMEXX_CUDA_MIN_WARP_PER_TEAM # define HOMMEXX_CUDA_MIN_WARP_PER_TEAM 8 diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index fa3e9c5757b7..ab5daafe26b0 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -19,7 +19,6 @@ #if HIP_BUILD #include -///#include #endif namespace Homme { diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.hpp b/components/homme/src/share/cxx/ExecSpaceDefs.hpp index 995c3a4332cc..650ef1076d68 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.hpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.hpp @@ -319,8 +319,6 @@ VECTOR_SIMD_LOOP #if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) template <> -//struct Dispatch { -// using ExeSpace = Kokkos::Cuda; struct Dispatch { using ExeSpace = Hommexx_Cuda; diff --git a/components/homme/src/share/cxx/Hommexx_config.h.in b/components/homme/src/share/cxx/Hommexx_config.h.in index d0b92b054c9c..e3664389e4c4 100644 --- a/components/homme/src/share/cxx/Hommexx_config.h.in +++ b/components/homme/src/share/cxx/Hommexx_config.h.in @@ -1,9 +1,6 @@ #ifndef HOMMEXX_CONFIG_H #define HOMMEXX_CONFIG_H - -///????????????? - // Whether the CUDA exec space has been selected #cmakedefine HOMMEXX_CUDA_SPACE diff --git a/components/homme/src/share/cxx/kokkos_utils.hpp b/components/homme/src/share/cxx/kokkos_utils.hpp index 2122d6f142eb..aca465e93ede 100644 --- a/components/homme/src/share/cxx/kokkos_utils.hpp +++ b/components/homme/src/share/cxx/kokkos_utils.hpp @@ -101,16 +101,11 @@ class TeamUtils : public _TeamUtilsCommonBase }; #endif -////////// - - - /* * Specialization for Cuda execution space. */ #if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) template <> -//class TeamUtils : public _TeamUtilsCommonBase class TeamUtils : public _TeamUtilsCommonBase { #ifdef HOMMEXX_CUDA_SHARE_BUFFER @@ -129,7 +124,6 @@ class TeamUtils : public _TeamUtilsCommonBase public: template TeamUtils(const TeamPolicy& policy, const Real& overprov_factor = 1.25) : - //_TeamUtilsCommonBase(policy) _TeamUtilsCommonBase(policy) #ifdef HOMMEXX_CUDA_SHARE_BUFFER , _num_ws_slots(_league_size > _num_teams From 1c042ab6d373a9b18faa772da8b35671dd641456 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 22 Jun 2022 16:30:02 -0500 Subject: [PATCH 23/62] switch to c++17 --- components/homme/cmake/SetCompilerFlags.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/components/homme/cmake/SetCompilerFlags.cmake b/components/homme/cmake/SetCompilerFlags.cmake index 2a42b7aec4a8..aa063b48ba54 100644 --- a/components/homme/cmake/SetCompilerFlags.cmake +++ b/components/homme/cmake/SetCompilerFlags.cmake @@ -67,11 +67,11 @@ IF (${HOMME_USE_CXX}) SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") INCLUDE(CheckCXXCompilerFlag) - CHECK_CXX_COMPILER_FLAG("-std=c++17" CXX14_SUPPORTED) - IF (${CXX14_SUPPORTED}) + CHECK_CXX_COMPILER_FLAG("-std=c++17" CXX17_SUPPORTED) + IF (${CXX17_SUPPORTED}) SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") ELSEIF (${HOMME_USE_KOKKOS}) - MESSAGE (FATAL_ERROR "Kokkos needs C++14, but the C++ compiler does not support it.") + MESSAGE (FATAL_ERROR "Kokkos needs C++17, but the C++ compiler does not support it.") ENDIF () CHECK_CXX_COMPILER_FLAG("-cxxlib" CXXLIB_SUPPORTED) IF (CXXLIB_SUPPORTED) From 076805f0852287e9fbcace7091e8867a6d39ca5c Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 22 Jun 2022 16:57:30 -0500 Subject: [PATCH 24/62] remove unused cmake var enable_cuda --- components/homme/CMakeLists.txt | 3 ++- components/homme/cmake/machineFiles/crusher-gpumpi.cmake | 2 -- components/homme/cmake/machineFiles/spock-gpumpi.cmake | 2 -- components/homme/cmake/machineFiles/summit-gpumpi-movies.cmake | 2 -- components/homme/cmake/machineFiles/summit-gpumpi.cmake | 2 -- components/homme/cmake/machineFiles/summit-nogpumpi.cmake | 2 -- components/homme/cmake/machineFiles/summit-p9-noomp.cmake | 2 -- components/homme/cmake/machineFiles/summit-p9-omp.cmake | 2 -- components/homme/cmake/machineFiles/summit-p9.cmake | 2 -- components/homme/cmake/machineFiles/summit.cmake | 2 -- 10 files changed, 2 insertions(+), 19 deletions(-) diff --git a/components/homme/CMakeLists.txt b/components/homme/CMakeLists.txt index 0f9633d8023a..9807327e5117 100644 --- a/components/homme/CMakeLists.txt +++ b/components/homme/CMakeLists.txt @@ -278,9 +278,10 @@ MESSAGE(STATUS "Linker Flags = ${CMAKE_EXE_LINKER_FLAGS}") IF (${HOMME_USE_KOKKOS}) +#set HIP to ON in cmake machine file if(HIP) add_definitions(-DHIP_BUILD) - #does not seem to be needed because kokkos::default will be picked up + #does not seem to be needed because kokkos::default exec space will be picked up #SET (HOMMEXX_HIP_SPACE ON) endif() diff --git a/components/homme/cmake/machineFiles/crusher-gpumpi.cmake b/components/homme/cmake/machineFiles/crusher-gpumpi.cmake index c764a4ceca67..587dd6462a6b 100644 --- a/components/homme/cmake/machineFiles/crusher-gpumpi.cmake +++ b/components/homme/cmake/machineFiles/crusher-gpumpi.cmake @@ -18,8 +18,6 @@ SET(WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") diff --git a/components/homme/cmake/machineFiles/spock-gpumpi.cmake b/components/homme/cmake/machineFiles/spock-gpumpi.cmake index 166421109e6d..4b288ef5c1cd 100644 --- a/components/homme/cmake/machineFiles/spock-gpumpi.cmake +++ b/components/homme/cmake/machineFiles/spock-gpumpi.cmake @@ -18,8 +18,6 @@ SET(WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") diff --git a/components/homme/cmake/machineFiles/summit-gpumpi-movies.cmake b/components/homme/cmake/machineFiles/summit-gpumpi-movies.cmake index c860dffe8dd5..6de0d6f040be 100644 --- a/components/homme/cmake/machineFiles/summit-gpumpi-movies.cmake +++ b/components/homme/cmake/machineFiles/summit-gpumpi-movies.cmake @@ -19,8 +19,6 @@ SET(WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_SWEQX FALSE CACHE BOOL "") SET(BUILD_HOMME_PREQX_ACC FALSE CACHE BOOL "") SET(BUILD_HOMME_PREQX FALSE CACHE BOOL "") diff --git a/components/homme/cmake/machineFiles/summit-gpumpi.cmake b/components/homme/cmake/machineFiles/summit-gpumpi.cmake index 6f6f4c73413f..ff40000e0b77 100644 --- a/components/homme/cmake/machineFiles/summit-gpumpi.cmake +++ b/components/homme/cmake/machineFiles/summit-gpumpi.cmake @@ -18,8 +18,6 @@ SET(WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") #SET(HOMME_ENABLE_COMPOSE TRUE CACHE BOOL "") diff --git a/components/homme/cmake/machineFiles/summit-nogpumpi.cmake b/components/homme/cmake/machineFiles/summit-nogpumpi.cmake index 892c0637852f..3b649122c534 100644 --- a/components/homme/cmake/machineFiles/summit-nogpumpi.cmake +++ b/components/homme/cmake/machineFiles/summit-nogpumpi.cmake @@ -16,8 +16,6 @@ SET(WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(USE_TRILINOS OFF CACHE BOOL "") SET(E3SM_KOKKOS_PATH "/ccs/home/onguba/kokkos/build-serial-cuda-nodebug" CACHE STRING "") diff --git a/components/homme/cmake/machineFiles/summit-p9-noomp.cmake b/components/homme/cmake/machineFiles/summit-p9-noomp.cmake index 70602695a33b..1d741db175fe 100644 --- a/components/homme/cmake/machineFiles/summit-p9-noomp.cmake +++ b/components/homme/cmake/machineFiles/summit-p9-noomp.cmake @@ -15,8 +15,6 @@ SET(WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") diff --git a/components/homme/cmake/machineFiles/summit-p9-omp.cmake b/components/homme/cmake/machineFiles/summit-p9-omp.cmake index 70ade1f043cc..05d37d12154a 100644 --- a/components/homme/cmake/machineFiles/summit-p9-omp.cmake +++ b/components/homme/cmake/machineFiles/summit-p9-omp.cmake @@ -15,8 +15,6 @@ SET(WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") diff --git a/components/homme/cmake/machineFiles/summit-p9.cmake b/components/homme/cmake/machineFiles/summit-p9.cmake index c240b3e39c11..9136dc0bfbdc 100644 --- a/components/homme/cmake/machineFiles/summit-p9.cmake +++ b/components/homme/cmake/machineFiles/summit-p9.cmake @@ -21,8 +21,6 @@ SET(WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(USE_TRILINOS OFF CACHE BOOL "") SET(E3SM_KOKKOS_PATH "/ccs/home/onguba/kokkos/build-serial-p9-nodebug" CACHE STRING "") diff --git a/components/homme/cmake/machineFiles/summit.cmake b/components/homme/cmake/machineFiles/summit.cmake index e44ce08db16f..ee449c90c125 100644 --- a/components/homme/cmake/machineFiles/summit.cmake +++ b/components/homme/cmake/machineFiles/summit.cmake @@ -15,8 +15,6 @@ SET (WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(USE_TRILINOS OFF CACHE BOOL "") SET(KOKKOS_PATH "$ENV{HOME}/kokkos/build-serial-cuda-nodebug" CACHE STRING "") From d1cc87321045519508f827efc05c58404357c264 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 22 Jun 2022 17:11:15 -0500 Subject: [PATCH 25/62] change if HIP to ifdef HIP --- components/homme/src/share/cxx/Config.hpp | 2 +- components/homme/src/share/cxx/Dimensions.hpp | 2 +- components/homme/src/share/cxx/EulerStepFunctorImpl.hpp | 2 +- components/homme/src/share/cxx/ExecSpaceDefs.cpp | 8 ++++---- components/homme/src/share/cxx/ExecSpaceDefs.hpp | 8 ++++---- components/homme/src/share/cxx/KernelVariables.hpp | 2 +- components/homme/src/share/cxx/kokkos_utils.hpp | 2 +- components/homme/src/share/cxx/profiling.hpp | 3 ++- 8 files changed, 15 insertions(+), 14 deletions(-) diff --git a/components/homme/src/share/cxx/Config.hpp b/components/homme/src/share/cxx/Config.hpp index 7c4df9733a5b..e9a95f04f34e 100644 --- a/components/homme/src/share/cxx/Config.hpp +++ b/components/homme/src/share/cxx/Config.hpp @@ -31,7 +31,7 @@ #include -#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) +#if defined(KOKKOS_ENABLE_CUDA) || defined(HIP_BUILD) # ifndef HOMMEXX_CUDA_MIN_WARP_PER_TEAM # define HOMMEXX_CUDA_MIN_WARP_PER_TEAM 8 # endif diff --git a/components/homme/src/share/cxx/Dimensions.hpp b/components/homme/src/share/cxx/Dimensions.hpp index ab739f074432..e596e1ec5af6 100644 --- a/components/homme/src/share/cxx/Dimensions.hpp +++ b/components/homme/src/share/cxx/Dimensions.hpp @@ -14,7 +14,7 @@ namespace Homme { // Until whenever CUDA supports constexpr properly -#if defined(CUDA_BUILD) || (HIP_BUILD) +#if defined(CUDA_BUILD) || defined(HIP_BUILD) #ifdef CAM #define QSIZE_D PCNST diff --git a/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp b/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp index 0886f22b1374..cd54b56183e8 100644 --- a/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp +++ b/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp @@ -56,7 +56,7 @@ struct SerialLimiter { }; #endif -#if HIP_BUILD +#ifdef HIP_BUILD template <> struct SerialLimiter { template #endif -#if HIP_BUILD +#ifdef HIP_BUILD #include #endif @@ -53,7 +53,7 @@ void initialize_kokkos () { str.back() = 0; args.push_back(const_cast(str.data())); #endif -#if HIP_BUILD +#ifdef HIP_BUILD int nd; const auto ret = hipGetDeviceCount(&nd); if (ret != hipSuccess) { @@ -123,7 +123,7 @@ team_num_threads_vectors_for_gpu ( assert(num_warps_total >= max_num_warps); assert(tp.max_threads_usable >= 1 && tp.max_vectors_usable >= 1); -#if !HIP_BUILD +#if !defined(HIP_BUILD) int num_warps; if (tp.prefer_larger_team) { const int num_warps_usable = @@ -206,7 +206,7 @@ team_num_threads_vectors (const int num_parallel_iterations, const int max_num_warps = HOMMEXX_CUDA_MAX_WARP_PER_TEAM; //Kokkos::Impl::cuda_internal_maximum_grid_count(); # endif -#elif HIP_BUILD +#elif defined(HIP_BUILD) //use 64 wavefronts per CU and 120 CUs const int num_warps_device = 120*64; // no such thing Kokkos::Impl::hip_internal_maximum_warp_count(); diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.hpp b/components/homme/src/share/cxx/ExecSpaceDefs.hpp index 650ef1076d68..5cf66296a181 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.hpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.hpp @@ -21,13 +21,13 @@ namespace Homme // Some in-house names for Kokkos exec spaces, which are // always defined, possibly as alias of void -#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) +#if defined(KOKKOS_ENABLE_CUDA) || defined(HIP_BUILD) #ifdef KOKKOS_ENABLE_CUDA using Hommexx_Cuda = Kokkos::Cuda; #endif -#if HIP_BUILD +#ifdef HIP_BUILD using Hommexx_Cuda = Kokkos::Experimental::HIP; #endif @@ -55,7 +55,7 @@ using Hommexx_Serial = Kokkos::Serial; using Hommexx_Serial = void; #endif -#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) +#if defined(KOKKOS_ENABLE_CUDA) || defined(HIP_BUILD) # define HOMMEXX_STATIC #else # define HOMMEXX_STATIC static @@ -317,7 +317,7 @@ VECTOR_SIMD_LOOP } }; -#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) +#if defined(KOKKOS_ENABLE_CUDA) || defined(HIP_BUILD) template <> struct Dispatch { using ExeSpace = Hommexx_Cuda; diff --git a/components/homme/src/share/cxx/KernelVariables.hpp b/components/homme/src/share/cxx/KernelVariables.hpp index 06dcd3504667..15a5c296dd71 100644 --- a/components/homme/src/share/cxx/KernelVariables.hpp +++ b/components/homme/src/share/cxx/KernelVariables.hpp @@ -28,7 +28,7 @@ struct KernelVariables { return 0; } -#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) +#if defined(KOKKOS_ENABLE_CUDA) || defined(HIP_BUILD) //#ifdef __CUDA_ARCH__ template static KOKKOS_INLINE_FUNCTION typename std::enable_if< diff --git a/components/homme/src/share/cxx/kokkos_utils.hpp b/components/homme/src/share/cxx/kokkos_utils.hpp index aca465e93ede..d0ff20285502 100644 --- a/components/homme/src/share/cxx/kokkos_utils.hpp +++ b/components/homme/src/share/cxx/kokkos_utils.hpp @@ -104,7 +104,7 @@ class TeamUtils : public _TeamUtilsCommonBase /* * Specialization for Cuda execution space. */ -#if defined(KOKKOS_ENABLE_CUDA) || (HIP_BUILD) +#if defined(KOKKOS_ENABLE_CUDA) || defined(HIP_BUILD) template <> class TeamUtils : public _TeamUtilsCommonBase { diff --git a/components/homme/src/share/cxx/profiling.hpp b/components/homme/src/share/cxx/profiling.hpp index 660a0de331db..8684a873852d 100644 --- a/components/homme/src/share/cxx/profiling.hpp +++ b/components/homme/src/share/cxx/profiling.hpp @@ -12,7 +12,8 @@ #include "gptl.h" -#if (HIP_BUILD) || defined(HOMMEXX_CUDA_SPACE) || \ +//OG not sure about timers and HIP, probably the same as with CUDA +#if defined(HIP_BUILD) || defined(HOMMEXX_CUDA_SPACE) || \ (defined(HOMMEXX_DEFAULT_SPACE) && defined(KOKKOS_ENABLE_CUDA)) // Can't use GPTL timers on CUDA #define start_timer(name) {} #define stop_timer(name) {} From ecf447fb39110e74cad99a55d3edf72774c53f1a Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Thu, 23 Jun 2022 13:24:46 -0600 Subject: [PATCH 26/62] fix cprnc path when not using cprnc_dir --- components/homme/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/components/homme/CMakeLists.txt b/components/homme/CMakeLists.txt index 9807327e5117..78dc1e3c98c2 100644 --- a/components/homme/CMakeLists.txt +++ b/components/homme/CMakeLists.txt @@ -409,19 +409,19 @@ if(NOT BUILD_HOMME_WITHOUT_PIOLIBRARY) set(NETCDF_INCLUDE_DIR ${NetCDF_Fortran_INCLUDE_DIRS}) # needed for CPRNC build system - set (cprnc_dummy_file "${CMAKE_CURRENT_SOURCE_DIR}/utils/cime/tools/cprnc/Macros.cmake") + set (cprnc_dummy_file "${CMAKE_CURRENT_SOURCE_DIR}/utils/cime/CIME/non_py/cprnc/Macros.cmake") if (NOT EXISTS "${cprnc_dummy_file}") file(WRITE "${cprnc_dummy_file}" "#dummy Macros file for non-CIME machines") endif () # cprnc's CMake system needs various nonstandard variables: set(FFLAGS ${CMAKE_Fortran_FLAGS}) - set(BLDROOT ${CMAKE_CURRENT_SOURCE_DIR}/utils/cime/tools/cprnc) + set(BLDROOT ${CMAKE_CURRENT_SOURCE_DIR}/utils/cime/CIME/non_py/cprnc) set(SFC ${CMAKE_Fortran_COMPILER}) set(SCC ${CMAKE_C_COMPILER}) - SET (CPRNC_INSTALL_DIR ${HOMME_BINARY_DIR}/utils/cime/tools/cprnc) - SET (CPRNC_BINARY ${HOMME_BINARY_DIR}/utils/cime/tools/cprnc/cprnc) + SET (CPRNC_INSTALL_DIR ${HOMME_BINARY_DIR}/utils/cime/CIME/non_py/cprnc) + SET (CPRNC_BINARY ${HOMME_BINARY_DIR}/utils/cime/CIME/non_py/cprnc/cprnc) ADD_SUBDIRECTORY(utils/cime/CIME/non_py/cprnc) ENDIF () ENDIF () From ba358e36de4c6187bc59d3c3c85ebe434e60b4c5 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Mon, 27 Jun 2022 12:03:07 -0400 Subject: [PATCH 27/62] summit bfb file for tests --- .../homme/cmake/machineFiles/summit-bfb.cmake | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 components/homme/cmake/machineFiles/summit-bfb.cmake diff --git a/components/homme/cmake/machineFiles/summit-bfb.cmake b/components/homme/cmake/machineFiles/summit-bfb.cmake new file mode 100644 index 000000000000..c9313f346a54 --- /dev/null +++ b/components/homme/cmake/machineFiles/summit-bfb.cmake @@ -0,0 +1,62 @@ +#interactive job +#bsub -W 2:00 -nnodes 1 -P cli115 -Is /bin/bash + +#1 gpu on 1 node +#jsrun -n 1 -r 1 -l gpu-gpu -b packed:1 -d plane:1 -a1 -c7 -g1 --smpiargs "-gpu" EXEC < ${nlname} +#6 gpus on 1 node +#jsrun -n 6 -r 6 -l gpu-gpu -b packed:1 -d plane:1 -a1 -c7 -g1 --smpiargs "-gpu" EXEC < ${nlname} +#6 ranks, cpu only on 1 node, not 42 rank, NEEDS visible GPU due to some kokkos init, so, -g1, not -g0 +#jsrun -n 6 -r 6 -l cpu-cpu -b packed:1 -d plane:1 -a1 -c7 -g1 EXEC < ${nlname} + + +set(CMAKE_C_FLAGS "-w" CACHE STRING "") +set(ADD_CXX_FLAGS "-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored -Wno-unknown-pragmas --fmad=false -O0" CACHE STRING "") +set(ADD_Fortran_FLAGS " -ffp-contract=off -O0" CACHE STRING "") +set(OPT_FLAGS "-O0" CACHE STRING "") +set(DEBUG_FLAGS "-ffp-contract=off -g"CACHE STRING "") + + +#SET (HOMMEXX_MPI_ON_DEVICE FALSE CACHE BOOL "") +SET (HOMMEXX_CUDA_MAX_WARP_PER_TEAM "16" CACHE STRING "") + +SET (NETCDF_DIR $ENV{OLCF_NETCDF_FORTRAN_ROOT} CACHE FILEPATH "") +SET (NetCDF_Fortran_PATH $ENV{OLCF_NETCDF_FORTRAN_ROOT} CACHE STRING "") +SET (NetCDF_C_PATH $ENV{OLCF_NETCDF_C_ROOT} CACHE STRING "") +#SET(NetCDF_C_LIBRARY "/sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-7.5.0/netcdf-c-4.8.0-pwi4jbrnwv4lrrjxdu5czbos5uvvjgvr/lib" CACHE STRING "") +#SET(NetCDF_C_INCLUDE_DIR "/sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-7.5.0/netcdf-c-4.8.0-pwi4jbrnwv4lrrjxdu5czbos5uvvjgvr/include" CACHE STRING "") + +SET(HOMME_FIND_BLASLAPACK TRUE CACHE BOOL "") + +SET(WITH_PNETCDF FALSE CACHE FILEPATH "") + +SET(USE_QUEUING FALSE CACHE BOOL "") + +SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") +SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") +SET(HOMME_ENABLE_COMPOSE TRUE CACHE BOOL "") +#SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") + +SET (HOMMEXX_BFB_TESTING TRUE CACHE BOOL "") + +SET(USE_TRILINOS OFF CACHE BOOL "") + +SET(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "") +SET(Kokkos_ENABLE_CUDA ON CACHE BOOL "") +SET(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "") +SET(Kokkos_ARCH_VOLTA70 ON CACHE BOOL "") +SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") + +SET(CMAKE_C_COMPILER "mpicc" CACHE STRING "") +SET(CMAKE_Fortran_COMPILER "mpifort" CACHE STRING "") +SET(CMAKE_CXX_COMPILER "${CMAKE_CURRENT_SOURCE_DIR}/../../externals/kokkos/bin/nvcc_wrapper" CACHE STRING "") + +set (ENABLE_OPENMP OFF CACHE BOOL "") +set (ENABLE_COLUMN_OPENMP OFF CACHE BOOL "") +set (ENABLE_HORIZ_OPENMP OFF CACHE BOOL "") + +set (HOMME_TESTING_PROFILE "dev" CACHE STRING "") + +set (USE_NUM_PROCS 4 CACHE STRING "") + +#set (OPT_FLAGS "-mcpu=power9 -mtune=power9" CACHE STRING "") +SET (USE_MPI_OPTIONS "--bind-to core" CACHE FILEPATH "") From f6e28c8cf9879bf05a615ee65d93110f2495aecc Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 28 Jun 2022 15:40:16 -0400 Subject: [PATCH 28/62] minor fix in summit cmake --- components/homme/cmake/machineFiles/summit-gpumpi.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/homme/cmake/machineFiles/summit-gpumpi.cmake b/components/homme/cmake/machineFiles/summit-gpumpi.cmake index ff40000e0b77..644b7840d94c 100644 --- a/components/homme/cmake/machineFiles/summit-gpumpi.cmake +++ b/components/homme/cmake/machineFiles/summit-gpumpi.cmake @@ -35,7 +35,7 @@ SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") SET(CMAKE_C_COMPILER "mpicc" CACHE STRING "") SET(CMAKE_Fortran_COMPILER "mpifort" CACHE STRING "") -SET(CMAKE_CXX_COMPILER "/ccs/home/onguba/acme-MASTER-GB/externals/kokkos/bin/nvcc_wrapper" CACHE STRING "") +SET(CMAKE_CXX_COMPILER "${CMAKE_CURRENT_SOURCE_DIR}/../../externals/kokkos/bin/nvcc_wrapper" CACHE STRING "") set (ENABLE_OPENMP OFF CACHE BOOL "") set (ENABLE_COLUMN_OPENMP OFF CACHE BOOL "") From 91b396a20185f377bc3ec403d4ec8d4d32af2568 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 28 Jun 2022 15:41:43 -0400 Subject: [PATCH 29/62] theta no-compose build --- components/homme/src/prim_main.F90 | 4 ++++ components/homme/src/share/cxx/ComposeTransport.cpp | 5 +++++ components/homme/src/share/cxx/ComposeTransport.hpp | 5 +++++ .../homme/src/share/cxx/ComposeTransportImpl.hpp | 5 +++++ .../src/share/cxx/ComposeTransportImplGeneral.cpp | 6 ++++++ .../src/share/cxx/ComposeTransportImplHypervis.cpp | 5 +++++ .../src/share/cxx/ComposeTransportImplTest2D.cpp | 5 +++++ .../src/share/cxx/ComposeTransportImplTrajectory.cpp | 5 +++++ .../share/cxx/ComposeTransportImplVerticalRemap.cpp | 5 +++++ .../homme/src/share/cxx/prim_advec_tracers_remap.cpp | 6 +++--- .../homme/src/share/cxx/prim_cxx_driver_base.F90 | 10 +++++++++- components/homme/src/share/cxx/prim_step.cpp | 4 +++- components/homme/src/share/sl_advection.F90 | 9 ++++++++- components/homme/src/theta-l_kokkos/CMakeLists.txt | 8 ++++++++ .../theta-l_kokkos/cxx/cxx_f90_interface_theta.cpp | 12 +++++++++++- 15 files changed, 87 insertions(+), 7 deletions(-) diff --git a/components/homme/src/prim_main.F90 b/components/homme/src/prim_main.F90 index 1463df36acfc..bfbe57e8b317 100644 --- a/components/homme/src/prim_main.F90 +++ b/components/homme/src/prim_main.F90 @@ -26,7 +26,9 @@ program prim_main #if (defined MODEL_THETA_L && defined ARKODE) use arkode_mod, only: calc_nonlinear_stats, finalize_nonlinear_stats #endif +#ifdef HOMME_ENABLE_COMPOSE use compose_test_mod, only: compose_test +#endif use test_mod, only: print_test_results #ifdef PIO_INTERP @@ -220,7 +222,9 @@ end subroutine finalize_kokkos_f90 #endif endif +#ifdef HOMME_ENABLE_COMPOSE call compose_test(par, hvcoord, dom_mt, elem) +#endif if(par%masterproc) print *,"Entering main timestepping loop" call t_startf('prim_main_loop') diff --git a/components/homme/src/share/cxx/ComposeTransport.cpp b/components/homme/src/share/cxx/ComposeTransport.cpp index 255445dbe6be..7a69b8007d8c 100644 --- a/components/homme/src/share/cxx/ComposeTransport.cpp +++ b/components/homme/src/share/cxx/ComposeTransport.cpp @@ -4,6 +4,8 @@ * See the file 'COPYRIGHT' in the HOMMEXX/src/share/cxx directory *******************************************************************************/ +#ifdef HOMME_ENABLE_COMPOSE + #include "ComposeTransport.hpp" #include "ComposeTransportImpl.hpp" #include "Context.hpp" @@ -84,3 +86,6 @@ void ComposeTransport::test_2d (const bool bfb, const int nstep, std::vector(); if (params.transport_alg > 0) { -#ifdef MODEL_THETA_L +#if defined(MODEL_THETA_L) && defined(HOMME_ENABLE_COMPOSE) prim_advec_tracers_remap_compose(dt); #else Errors::runtime_abort("prim_advec_tracers_remap: " @@ -96,7 +96,7 @@ static void prim_advec_tracers_remap_RK2 (const Real dt) GPTLstop("tl-at prim_advec_tracers_remap_RK2"); } -#ifdef MODEL_THETA_L +#if defined(MODEL_THETA_L) && defined(HOMME_ENABLE_COMPOSE) static void prim_advec_tracers_remap_compose (const Real dt) { GPTLstart("tl-at prim_advec_tracers_compose"); const auto& params = Context::singleton().get(); diff --git a/components/homme/src/share/cxx/prim_cxx_driver_base.F90 b/components/homme/src/share/cxx/prim_cxx_driver_base.F90 index 79e1e05842f9..57e18673a642 100644 --- a/components/homme/src/share/cxx/prim_cxx_driver_base.F90 +++ b/components/homme/src/share/cxx/prim_cxx_driver_base.F90 @@ -32,9 +32,13 @@ subroutine prim_init1(elem, par, dom_mt, tl) use time_mod, only : TimeLevel_t, TimeLevel_init use prim_driver_base, only : prim_init1_geometry, prim_init1_elem_arrays, & prim_init1_cleanup, prim_init1_buffers, & - prim_init1_compose, & MetaVertex, GridEdge, deriv1 + +#ifdef HOMME_ENABLE_COMPOSE + use prim_driver_base, only : prim_init1_compose use compose_mod, only : compose_control_kokkos_init_and_fin +#endif + #if !defined(CAM) && !defined(SCREAM) use prim_driver_base, only : prim_init1_no_cam #endif @@ -65,7 +69,9 @@ end subroutine initialize_hommexx_session ! Initialize kokkos before any environment changes from the Fortran call initialize_hommexx_session() ! Don't let any other components that use Kokkos control init/finalization. +#ifdef HOMME_ENABLE_COMPOSE call compose_control_kokkos_init_and_fin(.false.) +#endif #if !defined(CAM) && !defined(SCREAM) call prim_init1_no_cam(par) @@ -96,7 +102,9 @@ end subroutine initialize_hommexx_session ! ================================== call prim_init1_elem_arrays(elem,par) +#ifdef HOMME_ENABLE_COMPOSE call prim_init1_compose(par,elem) +#endif ! ================================== ! Initialize the buffers for exchanges diff --git a/components/homme/src/share/cxx/prim_step.cpp b/components/homme/src/share/cxx/prim_step.cpp index 8ab37cda79b6..8b82a65db77f 100644 --- a/components/homme/src/share/cxx/prim_step.cpp +++ b/components/homme/src/share/cxx/prim_step.cpp @@ -196,9 +196,11 @@ void prim_step_flexible (const Real dt, const bool compute_diagnostics) { context.get().run_diagnostics(false, 3); // Remap tracers. +#ifdef HOMME_ENABLE_COMPOSE if (params.qsize > 0) Context::singleton().get().remap_q(tl); - +#endif + GPTLstop("tl-s prim_step_flexible"); #else Errors::runtime_abort("prim_step_flexible not supported in non-theta-l builds."); diff --git a/components/homme/src/share/sl_advection.F90 b/components/homme/src/share/sl_advection.F90 index c05fc528046d..fd14e437d4a9 100644 --- a/components/homme/src/share/sl_advection.F90 +++ b/components/homme/src/share/sl_advection.F90 @@ -18,7 +18,9 @@ module sl_advection use perf_mod, only : t_startf, t_stopf, t_barrierf ! _EXTERNAL use parallel_mod, only : abortmp, parallel_t use coordinate_systems_mod, only : cartesian3D_t +#ifdef HOMME_ENABLE_COMPOSE use compose_mod +#endif implicit none @@ -165,7 +167,9 @@ subroutine prim_advec_tracers_remap_ALE(elem, deriv, hvcoord, hybrid, dt, tl, ne ! For DCMIP16 supercell test case. use control_mod, only : dcmip16_mu_q use prim_advection_base, only : advance_physical_vis +#ifdef HOMME_ENABLE_COMPOSE use compose_mod, only : compose_h2d, compose_d2h +#endif use iso_c_binding, only : c_bool implicit none @@ -193,9 +197,12 @@ subroutine prim_advec_tracers_remap_ALE(elem, deriv, hvcoord, hybrid, dt, tl, ne ! Until I get the DSS onto GPU, always need to h<->d. !h2d = hybrid%par%nprocs > 1 .or. semi_lagrange_cdr_check .or. & (semi_lagrange_hv_q > 0 .and. nu_q > 0) h2d = .true. +#ifdef HOMME_ENABLE_COMPOSE d2h = compose_d2h .or. h2d h2d = compose_h2d .or. h2d - +#else + d2h = h2d +#endif call TimeLevel_Qdp(tl, dt_tracer_factor, n0_qdp, np1_qdp) call calc_trajectory(elem, deriv, hvcoord, hybrid, dt, tl, & diff --git a/components/homme/src/theta-l_kokkos/CMakeLists.txt b/components/homme/src/theta-l_kokkos/CMakeLists.txt index 3550c9594201..b99402040453 100644 --- a/components/homme/src/theta-l_kokkos/CMakeLists.txt +++ b/components/homme/src/theta-l_kokkos/CMakeLists.txt @@ -24,6 +24,9 @@ MACRO(THETAL_KOKKOS_SETUP) ${SRC_SHARE_DIR}/compose ${CMAKE_BINARY_DIR}/src/share/cxx ) + IF (NOT HOMME_ENABLE_COMPOSE) + LIST(REMOVE_ITEM EXEC_LIB_INCLUDE_DIRS ${SRC_SHARE_DIR}/compose) + ENDIF() SET (EXEC_INCLUDE_DIRS ${EXEC_LIB_INCLUDE_DIRS} @@ -106,6 +109,11 @@ MACRO(THETAL_KOKKOS_SETUP) ${SRC_SHARE_DIR}/planar_mesh_mod.F90 ) + IF (NOT HOMME_ENABLE_COMPOSE) + LIST(REMOVE_ITEM SRC_SHARE_F90 ${SRC_SHARE_DIR}/compose_mod.F90) + LIST(REMOVE_ITEM SRC_SHARE_F90 ${SRC_SHARE_DIR}/compose_test_mod.F90) + ENDIF() + SET(TEST_SRC_F90 ${TEST_SRC_DIR}/asp_tests.F90 ${TEST_SRC_DIR}/baroclinic_inst_mod.F90 diff --git a/components/homme/src/theta-l_kokkos/cxx/cxx_f90_interface_theta.cpp b/components/homme/src/theta-l_kokkos/cxx/cxx_f90_interface_theta.cpp index 52ab35571fbd..5b6801dbe306 100644 --- a/components/homme/src/theta-l_kokkos/cxx/cxx_f90_interface_theta.cpp +++ b/components/homme/src/theta-l_kokkos/cxx/cxx_f90_interface_theta.cpp @@ -342,7 +342,9 @@ void init_functors_c (const bool& allocate_buffer) // use the create_if_not_there() function. auto& caar = c.create_if_not_there(elems,tracers,ref_FE,hvcoord,sph_op,params); if (params.transport_alg == 0) c.create_if_not_there(); +#ifdef HOMME_ENABLE_COMPOSE else c.create_if_not_there(); +#endif auto& hvf = c.create_if_not_there(); auto& ff = c.create_if_not_there(); auto& diag = c.create_if_not_there (elems.num_elems(),params.theta_hydrostatic_mode); @@ -360,8 +362,10 @@ void init_functors_c (const bool& allocate_buffer) auto& esf = c.get(); if (esf.setup_needed()) esf.setup(); } else { +#ifdef HOMME_ENABLE_COMPOSE auto& ct = c.get(); if (ct.setup_needed()) ct.setup(); +#endif } if (hvf.setup_needed()) { hvf.setup(geometry, state, derived); @@ -389,8 +393,10 @@ void init_functors_c (const bool& allocate_buffer) fbm.request_size(caar.requested_buffer_size()); if (params.transport_alg == 0) fbm.request_size(c.get().requested_buffer_size()); - else +#ifdef HOMME_ENABLE_COMPOSE + else fbm.request_size(c.get().requested_buffer_size()); +#endif fbm.request_size(hvf.requested_buffer_size()); fbm.request_size(diag.requested_buffer_size()); fbm.request_size(ff.requested_buffer_size()); @@ -407,8 +413,10 @@ void init_functors_c (const bool& allocate_buffer) caar.init_buffers(fbm); if (params.transport_alg == 0) Context::singleton().get().init_buffers(fbm); +#ifdef HOMME_ENABLE_COMPOSE else Context::singleton().get().init_buffers(fbm); +#endif hvf.init_buffers(fbm); diag.init_buffers(fbm); ff.init_buffers(fbm); @@ -552,9 +560,11 @@ void init_boundary_exchanges_c () esf.reset(params); esf.init_boundary_exchanges(); } else { +#ifdef HOMME_ENABLE_COMPOSE auto& ct = c.get(); ct.reset(params); ct.init_boundary_exchanges(); +#endif } // RK stages BE's From 026a9f1c1d73b42b301b8463caa19c366f9dc27d Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Fri, 1 Jul 2022 15:24:14 -0400 Subject: [PATCH 30/62] remove opt flag for 1 remap file --- .../homme/src/preqx_kokkos/CMakeLists.txt | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/components/homme/src/preqx_kokkos/CMakeLists.txt b/components/homme/src/preqx_kokkos/CMakeLists.txt index ac5599e82f68..ff37d09ba804 100644 --- a/components/homme/src/preqx_kokkos/CMakeLists.txt +++ b/components/homme/src/preqx_kokkos/CMakeLists.txt @@ -194,6 +194,51 @@ MACRO(PREQX_KOKKOS_SETUP) ${SRC_SHARE_DIR}/cxx/utilities/BfbUtils.cpp ) +#debugging which file causes issues when opt flags are used: + +#this works +#set_source_files_properties(${PREQX_DEPS_CXX} PROPERTIES COMPILE_FLAGS -O1) + +#these work, except i prob should have done "${CMAKE_CXX_FLAGS} -O3" instead of just "-O3" + +#set_source_files_properties( ${TARGET_DIR}/cxx/cxx_f90_interface_preqx.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${TARGET_DIR}/cxx/CamForcing.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${TARGET_DIR}/cxx/Diagnostics.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${TARGET_DIR}/cxx/ElementsForcing.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${TARGET_DIR}/cxx/ElementsState.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${TARGET_DIR}/cxx/HyperviscosityFunctorImpl.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${TARGET_DIR}/cxx/prim_advance_exp.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/CaarFunctor.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/Context.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/Elements.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/ElementsDerivedState.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/ElementsGeometry.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/ErrorDefs.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/EulerStepFunctor.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/ExecSpaceDefs.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/FunctorsBuffersManager.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/Hommexx_Session.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/HybridVCoord.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/HyperviscosityFunctor.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/ReferenceElement.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/Tracers.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/VerticalRemapManager.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/mpi/BoundaryExchange.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/mpi/Comm.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/mpi/Connectivity.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/mpi/MpiBuffersManager.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/mpi/mpi_cxx_f90_interface.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/prim_advec_tracers_remap.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/prim_driver.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/prim_step.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/vertical_remap.cpp PROPERTIES COMPILE_FLAGS -O3) +#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/utilities/BfbUtils.cpp PROPERTIES COMPILE_FLAGS -O3) + + if(${HIP}) + #remove opt for this file. with opt >O0 asserts fail here. + set_source_files_properties( ${SRC_SHARE_DIR}/cxx/VerticalRemapManager.cpp PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} -O0") + endif() + IF (HOMME_USE_TRILINOS) SET (PREQX_SRCS_ZOLTAN ${TRILINOS_ZOLTAN_DIR}/zoltan_interface.c ${TRILINOS_ZOLTAN_DIR}/zoltan_cppinterface.cpp) SET_SOURCE_FILES_PROPERTIES( ${TRILINOS_ZOLTAN_DIR}/zoltan_cppinterface.cpp PROPERTIES LANGUAGE CXX ) From 5f41bbaf04f615b98ea276dabae5602c6f56318c Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 12 Jul 2022 10:54:46 -0400 Subject: [PATCH 31/62] fix to the team size in diagnostics, not verified on crusher yet --- .../src/theta-l_kokkos/cxx/Diagnostics.hpp | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp b/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp index 9598a30d73bc..7ee7fb2041ae 100644 --- a/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp @@ -35,10 +35,34 @@ class Diagnostics ExecViewUnmanaged dpnh_dp_i; }; + + template + typename std::enable_if::value == false, + Kokkos::TeamPolicy >::type + d_team_policy(const int num_exec) { + return Homme::get_default_team_policy(num_exec); + } + + template + typename std::enable_if::value == true, + Kokkos::TeamPolicy >::type + d_team_policy(const int num_exec) { + ThreadPreferences tp; + tp.max_threads_usable = 8; //16 + tp.max_vectors_usable = 32; //32 + tp.prefer_larger_team = true; + return Homme::get_default_team_policy(num_exec, tp); + } + + + + public: + Diagnostics (const int num_elems, const bool theta_hydrostatic_mode) : - m_policy(Homme::get_default_team_policy(num_elems)), + //m_policy(Homme::get_default_team_policy(num_elems)), + m_policy(d_team_policy(num_elems)), m_tu(m_policy), m_num_elems(num_elems), m_theta_hydrostatic_mode(theta_hydrostatic_mode) From b4a1e39585f152a3e39d57a4a6c3d5ef7b61ef95 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 12 Jul 2022 11:22:48 -0400 Subject: [PATCH 32/62] diagn for repro_sum, not verified on crusher --- .../homme/src/share/global_norms_mod.F90 | 28 ++++++++-- .../src/theta-l/share/prim_state_mod.F90 | 51 ++++++++++--------- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/components/homme/src/share/global_norms_mod.F90 b/components/homme/src/share/global_norms_mod.F90 index c3084e32803f..6f4be6584a83 100644 --- a/components/homme/src/share/global_norms_mod.F90 +++ b/components/homme/src/share/global_norms_mod.F90 @@ -38,7 +38,7 @@ module global_norms_mod ! ! ================================ ! -------------------------- - function global_integral(elem, h,hybrid,npts,nets,nete) result(I_sphere) + function global_integral(elem, h,hybrid,npts,nets,nete,which) result(I_sphere) use kinds, only : real_kind use hybrid_mod, only : hybrid_t use element_mod, only : element_t @@ -57,6 +57,8 @@ function global_integral(elem, h,hybrid,npts,nets,nete) result(I_sphere) real (kind=real_kind) :: I_shared common /gblintcom/I_shared + character(len=*) , intent(in) ,optional :: which + ! Local variables integer :: ie,j,i @@ -83,8 +85,13 @@ function global_integral(elem, h,hybrid,npts,nets,nete) result(I_sphere) global_shared_buf(ie,1) = J_tmp(ie) enddo !JMD print *,'global_integral: before wrap_repro_sum' - call wrap_repro_sum(nvars=1, comm=hybrid%par%comm) + + if (present(which))then + call wrap_repro_sum(nvars=1, comm=hybrid%par%comm, which=which) + else + call wrap_repro_sum(nvars=1, comm=hybrid%par%comm) !JMD print *,'global_integral: after wrap_repro_sum' + endif I_tmp = global_shared_sum(1) !JMD print *,'global_integral: after global_shared_sum' @@ -819,7 +826,7 @@ function linf_vnorm(elem,v,vt,hybrid,npts,nets,nete) result(linf) end function linf_vnorm - subroutine wrap_repro_sum (nvars, comm, nsize) + subroutine wrap_repro_sum (nvars, comm, nsize, which) use dimensions_mod, only: nelemd #ifdef CAM use shr_reprosum_mod, only: repro_sum => shr_reprosum_calc @@ -834,6 +841,8 @@ subroutine wrap_repro_sum (nvars, comm, nsize) integer :: comm ! mpi communicator integer, optional :: nsize ! local buffer size (defaults to nelemd - number of elements in mpi task) + character(len=*), optional :: which + integer nsize_use,n,i if (present(nsize)) then @@ -853,8 +862,19 @@ subroutine wrap_repro_sum (nvars, comm, nsize) do n=1,nvars do i=1,nsize_use if (global_shared_buf(i,n) /= global_shared_buf(i,n) ) then - print *, "var,nvars:",n,nvars + + print *, "var,nvars:",n,nvars + print *, 'failed for',n,' out of ',nvars + print *, 'which is ', which + if (present(which)) then + write(iulog,'(a26,i5,a8,i5)') 'wrap_repro_sum failed for nvar=',n,' out of ',nvars + call abortmp('NaNs detected in repro sum input and marker is '//which) + else + write(iulog,'(a26,i5,a8,i5)') 'wrap_repro_sum failed for nvar=',n,' out of ',nvars call abortmp('NaNs detected in repro sum input') + endif + + !call abortmp('NaNs detected in repro sum input') endif enddo enddo diff --git a/components/homme/src/theta-l/share/prim_state_mod.F90 b/components/homme/src/theta-l/share/prim_state_mod.F90 index 1bbe9c003277..4a9b492cdaab 100644 --- a/components/homme/src/theta-l/share/prim_state_mod.F90 +++ b/components/homme/src/theta-l/share/prim_state_mod.F90 @@ -213,7 +213,7 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete) SUM(elem(ie)%spheremp*elem(ie)%state%Qdp(:,:,k,q,n0q)) enddo enddo - call wrap_repro_sum(nvars=1, comm=hybrid%par%comm) + call wrap_repro_sum(nvars=1, comm=hybrid%par%comm, which="qdp") qvsum_p(q) = global_shared_sum(1) enddo @@ -353,7 +353,7 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete) psmin_p = ParallelMin(psmin_local,hybrid) psmax_p = ParallelMax(psmax_local,hybrid) - call wrap_repro_sum(nvars=12, comm=hybrid%par%comm) + call wrap_repro_sum(nvars=12, comm=hybrid%par%comm,which="u to w_i") usum_p = global_shared_sum(1) vsum_p = global_shared_sum(2) tsum_p = global_shared_sum(3) @@ -511,7 +511,7 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete) do ie=nets,nete tmp(:,:,ie)=elem(ie)%accum%Qmass(:,:,q,n) enddo - Qmass(q,n) = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + Qmass(q,n) = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which="qmass") Qmass(q,n) = Qmass(q,n)*scale if (n==2) then @@ -535,105 +535,108 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete) tmp(:,:,ie) = elem(ie)%accum%KEu_horiz1 enddo !if(hybrid%masterthread) print *,'KEH1' - KEH1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + + + !crash happens here + KEH1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEu_horiz1') KEH1 = KEH1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEu_horiz2 enddo !if(hybrid%masterthread) print *,'KEH2' - KEH2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + KEH2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEu_horiz2') KEH2 = KEH2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEu_vert1 enddo !if(hybrid%masterthread) print *,'KEV1' - KEV1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + KEV1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEu_vert1') KEV1 = KEV1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEu_vert2 enddo !if(hybrid%masterthread) print *,'KEV2' - KEV2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + KEV2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEu_vert2') KEV2 = KEV2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEw_horiz1 enddo !if(hybrid%masterthread) print *,'KEwH1' - KEwH1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + KEwH1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEw_horiz1') KEwH1 = KEwH1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEw_horiz2 enddo !if(hybrid%masterthread) print *,'KEwH2' - KEwH2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + KEwH2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEw_horiz2') KEwH2 = KEwH2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEw_horiz3 enddo !if(hybrid%masterthread) print *,'KEwH3' - KEwH3 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + KEwH3 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEw_horiz3') KEwH3 = KEwH3*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEw_vert1 enddo !if(hybrid%masterthread) print *,'KEwV1' - KEwV1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + KEwV1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEw_vert1') KEwV1 = KEwV1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEw_vert2 enddo !if(hybrid%masterthread) print *,'KEwV2' - KEwV2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + KEwV2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEw_vert2') KEwV2 = KEwV2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%IEvert1 enddo !if(hybrid%masterthread) print *,'IEvert1' - IEvert1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + IEvert1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='IEvert1') IEvert1 = IEvert1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%IEvert2 enddo !if(hybrid%masterthread) print *,'IEvert2' - IEvert2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + IEvert2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='IEvert2') IEvert2 = IEvert2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%PEhoriz1 enddo !if(hybrid%masterthread) print *,'PEhoriz1' - PEhorz1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + PEhorz1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='PEhoriz1') PEhorz1 = PEhorz1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%PEhoriz2 enddo !if(hybrid%masterthread) print *,'PEhoriz2' - PEhorz2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + PEhorz2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='PEhoriz2') PEhorz2 = PEhorz2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%PEvert1 enddo !if(hybrid%masterthread) print *,'PEver1' - PEvert1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + PEvert1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='PEvert1') PEvert1 = PEvert1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%PEvert2 enddo !if(hybrid%masterthread) print *,'PEver2' - PEvert2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + PEvert2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='PEvert2') PEvert2 = PEvert2*scale ! KE->IE @@ -641,42 +644,42 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete) tmp(:,:,ie) = elem(ie)%accum%T01 enddo !if(hybrid%masterthread) print *,'T01' - T1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + T1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='T01') T1 = T1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%T2 enddo !if(hybrid%masterthread) print *,'T2' - T2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + T2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='T2') T2 = T2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%S1 enddo !if(hybrid%masterthread) print *,'S1' - S1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + S1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='S1') S1 = S1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%S2 enddo !if(hybrid%masterthread) print *,'S2' - S2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + S2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='S2') S2 = S2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%P1 enddo !if(hybrid%masterthread) print *,'P1' - P1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + P1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='P1') P1 = P1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%P2 enddo !if(hybrid%masterthread) print *,'P2' - P2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) + P2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='P2') P2 = P2*scale #else From babc296970f6312f2b89cc9cc05286b565a2d952 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Fri, 15 Jul 2022 13:03:01 -0400 Subject: [PATCH 33/62] fixing in vs inout issue for nstep_c --- components/homme/src/theta-l_kokkos/theta_f2c_mod.F90 | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/components/homme/src/theta-l_kokkos/theta_f2c_mod.F90 b/components/homme/src/theta-l_kokkos/theta_f2c_mod.F90 index 4f0277a45114..6ac9ee5a552f 100644 --- a/components/homme/src/theta-l_kokkos/theta_f2c_mod.F90 +++ b/components/homme/src/theta-l_kokkos/theta_f2c_mod.F90 @@ -173,8 +173,9 @@ subroutine prim_run_subcycle_c(tstep,nstep,nm1,n0,np1,next_output_step,nsplit_it ! ! Inputs ! - integer(kind=c_int), intent(in) :: nstep, nm1, n0, np1, next_output_step, nsplit_iter - real (kind=c_double), intent(in) :: tstep + integer(kind=c_int), intent(inout) :: nstep + integer(kind=c_int), intent(in) :: nm1, n0, np1, next_output_step, nsplit_iter + real (kind=c_double), intent(in) :: tstep end subroutine prim_run_subcycle_c ! Copy results from C++ views back to f90 arrays From 811933a4ae4543880588ce5a11b4aab7c6cfe35a Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 19 Jul 2022 15:20:40 -0400 Subject: [PATCH 34/62] remove noopt flag from 1 file --- components/homme/src/preqx_kokkos/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/homme/src/preqx_kokkos/CMakeLists.txt b/components/homme/src/preqx_kokkos/CMakeLists.txt index ff37d09ba804..70c87f528c03 100644 --- a/components/homme/src/preqx_kokkos/CMakeLists.txt +++ b/components/homme/src/preqx_kokkos/CMakeLists.txt @@ -236,7 +236,7 @@ MACRO(PREQX_KOKKOS_SETUP) if(${HIP}) #remove opt for this file. with opt >O0 asserts fail here. - set_source_files_properties( ${SRC_SHARE_DIR}/cxx/VerticalRemapManager.cpp PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} -O0") + set_source_files_properties( ${SRC_SHARE_DIR}/cxx/VerticalRemapManager.cpp PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} -O3") endif() IF (HOMME_USE_TRILINOS) From ccb4f03c53cd7f4ab64ce107de06b271ab467f00 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 19 Jul 2022 15:21:07 -0400 Subject: [PATCH 35/62] use inout for some vars --- components/homme/src/preqx_kokkos/preqx_f2c_mod.F90 | 5 +++-- components/homme/src/theta-l_kokkos/theta_f2c_mod.F90 | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/components/homme/src/preqx_kokkos/preqx_f2c_mod.F90 b/components/homme/src/preqx_kokkos/preqx_f2c_mod.F90 index 8d0c271526b9..388e8c0b5ffa 100644 --- a/components/homme/src/preqx_kokkos/preqx_f2c_mod.F90 +++ b/components/homme/src/preqx_kokkos/preqx_f2c_mod.F90 @@ -142,8 +142,9 @@ subroutine prim_run_subcycle_c(tstep,nstep,nm1,n0,np1,next_output_step,nsplit_it ! ! Inputs ! - integer(kind=c_int), intent(in) :: nstep, nm1, n0, np1, next_output_step, nsplit_iteration - real (kind=c_double), intent(in) :: tstep + integer(kind=c_int), intent(inout) :: nstep, nm1, n0, np1 + integer(kind=c_int), intent(in) :: next_output_step, nsplit_iteration + real (kind=c_double), intent(in) :: tstep end subroutine prim_run_subcycle_c ! Copy results from C++ views back to f90 arrays diff --git a/components/homme/src/theta-l_kokkos/theta_f2c_mod.F90 b/components/homme/src/theta-l_kokkos/theta_f2c_mod.F90 index 6ac9ee5a552f..17aa79690d6f 100644 --- a/components/homme/src/theta-l_kokkos/theta_f2c_mod.F90 +++ b/components/homme/src/theta-l_kokkos/theta_f2c_mod.F90 @@ -173,8 +173,8 @@ subroutine prim_run_subcycle_c(tstep,nstep,nm1,n0,np1,next_output_step,nsplit_it ! ! Inputs ! - integer(kind=c_int), intent(inout) :: nstep - integer(kind=c_int), intent(in) :: nm1, n0, np1, next_output_step, nsplit_iter + integer(kind=c_int), intent(inout) :: nstep, nm1, n0, np1 + integer(kind=c_int), intent(in) :: next_output_step, nsplit_iter real (kind=c_double), intent(in) :: tstep end subroutine prim_run_subcycle_c From 90ea3f2ac4ab8c961ce1b5df8a56abec3b8ed702 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 19 Jul 2022 20:58:44 -0400 Subject: [PATCH 36/62] start on organizing gpu build vars --- components/homme/CMakeLists.txt | 15 +++++++------- .../cmake/machineFiles/crusher-gpumpi.cmake | 20 +++++++++++++------ .../homme/src/preqx_kokkos/config.h.cmake.in | 4 ++-- components/homme/src/share/cxx/Config.hpp | 4 ++-- components/homme/src/share/cxx/Dimensions.hpp | 4 ++-- .../src/share/cxx/EulerStepFunctorImpl.hpp | 4 ++-- .../homme/src/share/cxx/ExecSpaceDefs.cpp | 8 ++++---- .../homme/src/share/cxx/ExecSpaceDefs.hpp | 10 +++++----- .../homme/src/share/cxx/KernelVariables.hpp | 4 ++-- .../homme/src/share/cxx/kokkos_utils.hpp | 2 +- components/homme/src/share/cxx/profiling.hpp | 5 +++-- .../homme/src/theta-l/config.h.cmake.in | 4 ++-- .../src/theta-l_kokkos/config.h.cmake.in | 4 ++-- .../test_execs/share_kokkos_ut/CMakeLists.txt | 4 ++-- 14 files changed, 50 insertions(+), 42 deletions(-) diff --git a/components/homme/CMakeLists.txt b/components/homme/CMakeLists.txt index 78dc1e3c98c2..e661ca6f0c9e 100644 --- a/components/homme/CMakeLists.txt +++ b/components/homme/CMakeLists.txt @@ -196,7 +196,10 @@ IF (${HOMME_USE_KOKKOS}) SET (HOMMEXX_EXEC_SPACE "Default" CACHE STRING "Select the kokkos exec space") + #is HOMMEXX_EXEC_SPACE set anywhere? in scream? + STRING (TOUPPER ${HOMMEXX_EXEC_SPACE} HOMMEXX_EXEC_SPACE_UPPER) + IF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "HIP") SET (HOMMEXX_HIP_SPACE ON) ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "CUDA") @@ -276,17 +279,13 @@ MESSAGE(STATUS "C Flags = ${CMAKE_C_FLAGS}") MESSAGE(STATUS "CXX Flags = ${CMAKE_CXX_FLAGS}") MESSAGE(STATUS "Linker Flags = ${CMAKE_EXE_LINKER_FLAGS}") -IF (${HOMME_USE_KOKKOS}) +SET (HOMMEXX_ENABLE_GPU FALSE) -#set HIP to ON in cmake machine file - if(HIP) - add_definitions(-DHIP_BUILD) - #does not seem to be needed because kokkos::default exec space will be picked up - #SET (HOMMEXX_HIP_SPACE ON) - endif() +IF (${HOMME_USE_KOKKOS}) - IF (CUDA_BUILD OR HIP) + IF (CUDA_BUILD OR HIP_BUILD) SET (DEFAULT_VECTOR_SIZE 1) + SET (HOMMEXX_ENABLE_GPU TRUE) ELSE () SET (DEFAULT_VECTOR_SIZE 8) ENDIF() diff --git a/components/homme/cmake/machineFiles/crusher-gpumpi.cmake b/components/homme/cmake/machineFiles/crusher-gpumpi.cmake index 587dd6462a6b..3c060f31df51 100644 --- a/components/homme/cmake/machineFiles/crusher-gpumpi.cmake +++ b/components/homme/cmake/machineFiles/crusher-gpumpi.cmake @@ -26,18 +26,26 @@ SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") SET(USE_TRILINOS OFF CACHE BOOL "") -SET(HIP TRUE CACHE BOOL "") +#CUDA_BUILD is set in SetCompilersFlags, after findPackage(Cuda) +#i haven't extend it to hip, set it here instead +SET(HIP_BUILD TRUE CACHE BOOL "") + +#set this to true if using external kokkos build +SET(E3SM_KOKKOS_PATH "/ccs/home/onguba/kokkos-crusher-june2022/bld-hipcc" CACHE STRING "") + +#uncomment this if using internal kokkos build +#SET(Kokkos_ENABLE_SERIAL ON CACHE BOOL "") +####SET(CMAKE_CXX_STANDARD "14" CACHE STRING "") +#SET(Kokkos_ENABLE_DEBUG OFF CACHE BOOL "") +#SET(Kokkos_ARCH_VEGA90A ON CACHE BOOL "") #SET(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "") -#SET(Kokkos_ENABLE_CUDA OFF CACHE BOOL "") -#SET(Kokkos_ENABLE_CUDA_LAMBDA OFF CACHE BOOL "") -#SET(Kokkos_ARCH_VEGA908 ON CACHE BOOL "") +#SET(Kokkos_ENABLE_HIP ON CACHE BOOL "") +####SET(Kokkos_ENABLE_CUDA_LAMBDA OFF CACHE BOOL "") #SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") SET(CMAKE_C_COMPILER "cc" CACHE STRING "") SET(CMAKE_Fortran_COMPILER "ftn" CACHE STRING "") SET(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "") -#SET(CMAKE_CXX_COMPILER "/ccs/home/onguba/kokkos/bin/nvcc_wrapper" CACHE STRING "") -SET(E3SM_KOKKOS_PATH "/ccs/home/onguba/kokkos-crusher-june2022/bld-hipcc" CACHE STRING "") #not the proper way!!! SET(MPICH_DIR "/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0" CACHE STRING "") diff --git a/components/homme/src/preqx_kokkos/config.h.cmake.in b/components/homme/src/preqx_kokkos/config.h.cmake.in index 688f8a4151d3..9d54ea6e31c5 100644 --- a/components/homme/src/preqx_kokkos/config.h.cmake.in +++ b/components/homme/src/preqx_kokkos/config.h.cmake.in @@ -48,8 +48,8 @@ /* Zoltan partitioning library */ #cmakedefine01 HAVE_ZOLTAN -/* Identifies whether this is a cuda build */ -#cmakedefine CUDA_BUILD +/* Identifies whether this is a cuda/hip build */ +#cmakedefine HOMMEXX_ENABLE_GPU /* TRILINOS library */ #cmakedefine01 HAVE_TRILINOS diff --git a/components/homme/src/share/cxx/Config.hpp b/components/homme/src/share/cxx/Config.hpp index e9a95f04f34e..684f9143beaf 100644 --- a/components/homme/src/share/cxx/Config.hpp +++ b/components/homme/src/share/cxx/Config.hpp @@ -14,7 +14,7 @@ # endif #else // Establish a good candidate vector size for eam builds -# ifdef CUDA_BUILD +# ifdef HOMMEXX_ENABLE_GPU # define HOMMEXX_VECTOR_SIZE 1 # else # define HOMMEXX_VECTOR_SIZE 8 @@ -31,7 +31,7 @@ #include -#if defined(KOKKOS_ENABLE_CUDA) || defined(HIP_BUILD) +#ifdef HOMMEXX_ENABLE_GPU # ifndef HOMMEXX_CUDA_MIN_WARP_PER_TEAM # define HOMMEXX_CUDA_MIN_WARP_PER_TEAM 8 # endif diff --git a/components/homme/src/share/cxx/Dimensions.hpp b/components/homme/src/share/cxx/Dimensions.hpp index e596e1ec5af6..6367ca4bbd79 100644 --- a/components/homme/src/share/cxx/Dimensions.hpp +++ b/components/homme/src/share/cxx/Dimensions.hpp @@ -14,7 +14,7 @@ namespace Homme { // Until whenever CUDA supports constexpr properly -#if defined(CUDA_BUILD) || defined(HIP_BUILD) +#ifdef HOMMEXX_ENABLE_GPU #ifdef CAM #define QSIZE_D PCNST @@ -53,7 +53,7 @@ namespace Homme { static constexpr const int NUM_TIME_LEVELS = 3; static constexpr const int Q_NUM_TIME_LEVELS = 2; -#endif // CUDA_BUILD +#endif // GPU_BUILD template struct ColInfo { diff --git a/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp b/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp index cd54b56183e8..73f442c8d0ce 100644 --- a/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp +++ b/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp @@ -42,7 +42,7 @@ struct SerialLimiter { const Array2GllLvl& irwrk); }; // GPU doesn't have a serial impl. -#if defined KOKKOS_ENABLE_CUDA +#ifdef KOKKOS_ENABLE_CUDA template <> struct SerialLimiter { template { }; #endif -#ifdef HIP_BUILD +#ifdef KOKKOS_ENABLE_HIP template <> struct SerialLimiter { template #endif -#ifdef HIP_BUILD +#ifdef KOKKOS_ENABLE_HIP #include #endif @@ -53,7 +53,7 @@ void initialize_kokkos () { str.back() = 0; args.push_back(const_cast(str.data())); #endif -#ifdef HIP_BUILD +#ifdef KOKKOS_ENABLE_HIP int nd; const auto ret = hipGetDeviceCount(&nd); if (ret != hipSuccess) { @@ -123,7 +123,7 @@ team_num_threads_vectors_for_gpu ( assert(num_warps_total >= max_num_warps); assert(tp.max_threads_usable >= 1 && tp.max_vectors_usable >= 1); -#if !defined(HIP_BUILD) +#if !defined(KOKKOS_ENABLE_HIP) int num_warps; if (tp.prefer_larger_team) { const int num_warps_usable = @@ -206,7 +206,7 @@ team_num_threads_vectors (const int num_parallel_iterations, const int max_num_warps = HOMMEXX_CUDA_MAX_WARP_PER_TEAM; //Kokkos::Impl::cuda_internal_maximum_grid_count(); # endif -#elif defined(HIP_BUILD) +#elif defined(KOKKOS_ENABLE_HIP) //use 64 wavefronts per CU and 120 CUs const int num_warps_device = 120*64; // no such thing Kokkos::Impl::hip_internal_maximum_warp_count(); diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.hpp b/components/homme/src/share/cxx/ExecSpaceDefs.hpp index 5cf66296a181..8cdaca09b69a 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.hpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.hpp @@ -21,13 +21,13 @@ namespace Homme // Some in-house names for Kokkos exec spaces, which are // always defined, possibly as alias of void -#if defined(KOKKOS_ENABLE_CUDA) || defined(HIP_BUILD) +#ifdef HOMMEXX_ENABLE_GPU #ifdef KOKKOS_ENABLE_CUDA using Hommexx_Cuda = Kokkos::Cuda; #endif -#ifdef HIP_BUILD +#ifdef KOKKOS_ENABLE_HIP using Hommexx_Cuda = Kokkos::Experimental::HIP; #endif @@ -55,7 +55,7 @@ using Hommexx_Serial = Kokkos::Serial; using Hommexx_Serial = void; #endif -#if defined(KOKKOS_ENABLE_CUDA) || defined(HIP_BUILD) +#ifdef HOMMEXX_ENABLE_GPU # define HOMMEXX_STATIC #else # define HOMMEXX_STATIC static @@ -63,7 +63,7 @@ using Hommexx_Serial = void; // Selecting the execution space. If no specific request, use Kokkos default // exec space -#if defined(HOMMEXX_CUDA_SPACE) || defined(HOMMEXX_HIP_SPACE) +#ifdef defined(HOMMEXX_CUDA_SPACE) || defined(HOMMEXX_HIP_SPACE) using ExecSpace = Hommexx_Cuda; #elif defined(HOMMEXX_OPENMP_SPACE) using ExecSpace = Hommexx_OpenMP; @@ -317,7 +317,7 @@ VECTOR_SIMD_LOOP } }; -#if defined(KOKKOS_ENABLE_CUDA) || defined(HIP_BUILD) +#ifdef HOMMEXX_ENABLE_GPU template <> struct Dispatch { using ExeSpace = Hommexx_Cuda; diff --git a/components/homme/src/share/cxx/KernelVariables.hpp b/components/homme/src/share/cxx/KernelVariables.hpp index 15a5c296dd71..086a9bde3987 100644 --- a/components/homme/src/share/cxx/KernelVariables.hpp +++ b/components/homme/src/share/cxx/KernelVariables.hpp @@ -28,7 +28,7 @@ struct KernelVariables { return 0; } -#if defined(KOKKOS_ENABLE_CUDA) || defined(HIP_BUILD) +#ifdef HOMMEXX_ENABLE_GPU //#ifdef __CUDA_ARCH__ template static KOKKOS_INLINE_FUNCTION typename std::enable_if< @@ -44,7 +44,7 @@ struct KernelVariables { assert(false); // should never happen return -1; } -#endif // KOKKOS_ENABLE_CUDA or HIP +#endif // HOMMEXX_ENABLE_GPU #ifdef KOKKOS_ENABLE_OPENMP template diff --git a/components/homme/src/share/cxx/kokkos_utils.hpp b/components/homme/src/share/cxx/kokkos_utils.hpp index d0ff20285502..b2d85c0694fb 100644 --- a/components/homme/src/share/cxx/kokkos_utils.hpp +++ b/components/homme/src/share/cxx/kokkos_utils.hpp @@ -104,7 +104,7 @@ class TeamUtils : public _TeamUtilsCommonBase /* * Specialization for Cuda execution space. */ -#if defined(KOKKOS_ENABLE_CUDA) || defined(HIP_BUILD) +#ifdef HOMMEXX_ENABLE_GPU template <> class TeamUtils : public _TeamUtilsCommonBase { diff --git a/components/homme/src/share/cxx/profiling.hpp b/components/homme/src/share/cxx/profiling.hpp index 8684a873852d..207dc0f7f993 100644 --- a/components/homme/src/share/cxx/profiling.hpp +++ b/components/homme/src/share/cxx/profiling.hpp @@ -13,8 +13,9 @@ #include "gptl.h" //OG not sure about timers and HIP, probably the same as with CUDA -#if defined(HIP_BUILD) || defined(HOMMEXX_CUDA_SPACE) || \ - (defined(HOMMEXX_DEFAULT_SPACE) && defined(KOKKOS_ENABLE_CUDA)) // Can't use GPTL timers on CUDA +#if defined(HOMMEXX_HIP_SPACE) || defined(HOMMEXX_CUDA_SPACE) || \ + (defined(HOMMEXX_DEFAULT_SPACE) && defined(KOKKOS_ENABLE_CUDA)) || \ + (defined(HOMMEXX_DEFAULT_SPACE) && defined(KOKKOS_ENABLE_HIP)) #define start_timer(name) {} #define stop_timer(name) {} #else diff --git a/components/homme/src/theta-l/config.h.cmake.in b/components/homme/src/theta-l/config.h.cmake.in index ed1933721b18..0fcd5e1e024d 100644 --- a/components/homme/src/theta-l/config.h.cmake.in +++ b/components/homme/src/theta-l/config.h.cmake.in @@ -63,8 +63,8 @@ /* Whether to use OpenMP4 */ #cmakedefine OMP4 -/* Whether this is a build for CUDA (used mostly for f90-vs-cxx bfb math functions impl) */ -#cmakedefine CUDA_BUILD +/* Whether this is a build for CUDA/HIP (used mostly for f90-vs-cxx bfb math functions impl) */ +#cmakedefine HOMMEXX_ENABLE_GPU /* temporary flag to compare f90-cxx until we settle on either cxx or f90 implementation */ #cmakedefine XX_NONBFB_COMING diff --git a/components/homme/src/theta-l_kokkos/config.h.cmake.in b/components/homme/src/theta-l_kokkos/config.h.cmake.in index 141c60d0d32e..f5cacd509ff6 100644 --- a/components/homme/src/theta-l_kokkos/config.h.cmake.in +++ b/components/homme/src/theta-l_kokkos/config.h.cmake.in @@ -63,8 +63,8 @@ /* temporary flag to compare f90-cxx until we settle on either cxx or f90 implementation */ #cmakedefine XX_NONBFB_COMING -/* Identifies whether this is a cuda build */ -#cmakedefine CUDA_BUILD +/* Identifies whether this is a cuda/hip build */ +#cmakedefine HOMMEXX_ENABLE_GPU /* When doing BFB testing, we occasionally must use modified code. */ /* Use this flag to protect such code. */ diff --git a/components/homme/test_execs/share_kokkos_ut/CMakeLists.txt b/components/homme/test_execs/share_kokkos_ut/CMakeLists.txt index 74fa179df0d7..5f0f41c5e21b 100644 --- a/components/homme/test_execs/share_kokkos_ut/CMakeLists.txt +++ b/components/homme/test_execs/share_kokkos_ut/CMakeLists.txt @@ -9,8 +9,8 @@ SET(UTILS_TIMING_DIR ${HOMME_SOURCE_DIR}/utils/cime/CIME/non_py/src/timing) # Note: need CUDA_BUILD and HOMMEXX_BFB_TESTING here, since the share # unit tests do not include a config.h file SET (COMMON_DEFINITIONS NP=4 NC=4) -IF (CUDA_BUILD) - SET(COMMON_DEFINITIONS ${COMMON_DEFINITIONS} CUDA_BUILD) +IF (CUDA_BUILD OR HIP_BUILD) + SET(COMMON_DEFINITIONS ${COMMON_DEFINITIONS} HOMMEXX_ENABLE_GPU) ENDIF() IF (HOMMEXX_BFB_TESTING) SET(COMMON_DEFINITIONS ${COMMON_DEFINITIONS} HOMMEXX_BFB_TESTING) From 24ca04cd6c6bda58cfb495be0f1a0d65898a8b5d Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 19 Jul 2022 21:04:51 -0400 Subject: [PATCH 37/62] remove duplicated code and fix ifdef --- .../src/share/cxx/EulerStepFunctorImpl.hpp | 18 ++---------------- .../homme/src/share/cxx/ExecSpaceDefs.hpp | 2 +- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp b/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp index 73f442c8d0ce..e1b6bfd9104f 100644 --- a/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp +++ b/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp @@ -42,23 +42,9 @@ struct SerialLimiter { const Array2GllLvl& irwrk); }; // GPU doesn't have a serial impl. -#ifdef KOKKOS_ENABLE_CUDA +#ifdef HOMMEXX_ENABLE_GPU template <> -struct SerialLimiter { - template - KOKKOS_INLINE_FUNCTION static void - run (const ArrayGll& sphweights, const ArrayGllLvl& idpmass, - const Array2Lvl& iqlim, const ArrayGllLvl& iptens, - const Array2GllLvl& irwrk) { - Kokkos::abort("SerialLimiter::run: Should not be called on GPU."); - } -}; -#endif - -#ifdef KOKKOS_ENABLE_HIP -template <> -struct SerialLimiter { +struct SerialLimiter { template KOKKOS_INLINE_FUNCTION static void diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.hpp b/components/homme/src/share/cxx/ExecSpaceDefs.hpp index 8cdaca09b69a..e9c8340d619f 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.hpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.hpp @@ -63,7 +63,7 @@ using Hommexx_Serial = void; // Selecting the execution space. If no specific request, use Kokkos default // exec space -#ifdef defined(HOMMEXX_CUDA_SPACE) || defined(HOMMEXX_HIP_SPACE) +#if defined(HOMMEXX_CUDA_SPACE) || defined(HOMMEXX_HIP_SPACE) using ExecSpace = Hommexx_Cuda; #elif defined(HOMMEXX_OPENMP_SPACE) using ExecSpace = Hommexx_OpenMP; From 3a4b951cc95e5b5075dbdd115931096e74b24002 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 19 Jul 2022 21:11:29 -0400 Subject: [PATCH 38/62] replace Hommexx_Cuda with HommexxGPU --- .../src/preqx_kokkos/cxx/CaarFunctorImpl.hpp | 12 ++++----- .../src/share/cxx/EulerStepFunctorImpl.hpp | 2 +- .../homme/src/share/cxx/ExecSpaceDefs.cpp | 2 +- .../homme/src/share/cxx/ExecSpaceDefs.hpp | 26 +++++++++---------- .../homme/src/share/cxx/KernelVariables.hpp | 2 +- .../homme/src/share/cxx/kokkos_utils.hpp | 4 +-- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp b/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp index c7a10a660844..bfc8bd9ddf06 100644 --- a/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp +++ b/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp @@ -753,7 +753,7 @@ struct CaarFunctorImpl { private: template KOKKOS_INLINE_FUNCTION typename std::enable_if< - !std::is_same::value, void>::type + !std::is_same::value, void>::type compute_pressure_impl(KernelVariables &kv) const { Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, NP * NP), [&](const int loop_idx) { @@ -787,7 +787,7 @@ struct CaarFunctorImpl { template KOKKOS_INLINE_FUNCTION typename std::enable_if< - std::is_same::value, void>::type + std::is_same::value, void>::type compute_pressure_impl(KernelVariables &kv) const { Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, NP * NP), [&](const int loop_idx) { @@ -821,7 +821,7 @@ struct CaarFunctorImpl { template KOKKOS_INLINE_FUNCTION typename std::enable_if< - !std::is_same::value, void>::type + !std::is_same::value, void>::type preq_hydrostatic_impl(KernelVariables &kv) const { Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, NP * NP), [&](const int loop_idx) { @@ -877,7 +877,7 @@ struct CaarFunctorImpl { // CUDA version template KOKKOS_INLINE_FUNCTION typename std::enable_if< - std::is_same::value, void>::type + std::is_same::value, void>::type preq_hydrostatic_impl(KernelVariables &kv) const { assert_vector_size_1(); Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, NP * NP), @@ -932,7 +932,7 @@ struct CaarFunctorImpl { // CUDA version template KOKKOS_INLINE_FUNCTION typename std::enable_if< - std::is_same::value, void>::type + std::is_same::value, void>::type preq_omega_ps_impl(KernelVariables &kv) const { assert_vector_size_1(); #ifdef DEBUG_TRACE @@ -986,7 +986,7 @@ struct CaarFunctorImpl { // Non-CUDA version template KOKKOS_INLINE_FUNCTION typename std::enable_if< - !std::is_same::value, void>::type + !std::is_same::value, void>::type preq_omega_ps_impl(KernelVariables &kv) const { m_sphere_ops.gradient_sphere( kv, Homme::subview(m_buffers.pressure, kv.team_idx), diff --git a/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp b/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp index e1b6bfd9104f..f0e40c550650 100644 --- a/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp +++ b/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp @@ -44,7 +44,7 @@ struct SerialLimiter { // GPU doesn't have a serial impl. #ifdef HOMMEXX_ENABLE_GPU template <> -struct SerialLimiter { +struct SerialLimiter { template KOKKOS_INLINE_FUNCTION static void diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index 29db042ab6bf..f3a710d28a45 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -183,7 +183,7 @@ return std::make_pair( 16,4 ); } // namespace Parallel std::pair -DefaultThreadsDistribution:: +DefaultThreadsDistribution:: team_num_threads_vectors (const int num_parallel_iterations, const ThreadPreferences tp) { // It appears we can't use Kokkos to tell us this. On current devices, using diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.hpp b/components/homme/src/share/cxx/ExecSpaceDefs.hpp index e9c8340d619f..28d8e2265d30 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.hpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.hpp @@ -24,15 +24,15 @@ namespace Homme #ifdef HOMMEXX_ENABLE_GPU #ifdef KOKKOS_ENABLE_CUDA -using Hommexx_Cuda = Kokkos::Cuda; +using HommexxGPU = Kokkos::Cuda; #endif #ifdef KOKKOS_ENABLE_HIP -using Hommexx_Cuda = Kokkos::Experimental::HIP; +using HommexxGPU = Kokkos::Experimental::HIP; #endif #else -using Hommexx_Cuda = void; +using HommexxGPU = void; #endif @@ -64,7 +64,7 @@ using Hommexx_Serial = void; // Selecting the execution space. If no specific request, use Kokkos default // exec space #if defined(HOMMEXX_CUDA_SPACE) || defined(HOMMEXX_HIP_SPACE) -using ExecSpace = Hommexx_Cuda; +using ExecSpace = HommexxGPU; #elif defined(HOMMEXX_OPENMP_SPACE) using ExecSpace = Hommexx_OpenMP; #elif defined(HOMMEXX_THREADS_SPACE) @@ -84,7 +84,7 @@ template struct OnGpu { enum : bool { value = false }; }; template <> -struct OnGpu { enum : bool { value = true }; }; +struct OnGpu { enum : bool { value = true }; }; // Call this instead of Kokkos::initialize. void initialize_kokkos(); @@ -155,7 +155,7 @@ struct DefaultThreadsDistribution { // Specialization for a GPU, where threads can't be viewed as existing simply in // a pool. template <> -struct DefaultThreadsDistribution { +struct DefaultThreadsDistribution { static std::pair team_num_threads_vectors(const int num_parallel_iterations, const ThreadPreferences tp = ThreadPreferences()); @@ -241,12 +241,12 @@ struct Memory { }; template <> -struct Memory { - enum : bool { on_gpu = OnGpu::value }; +struct Memory { + enum : bool { on_gpu = OnGpu::value }; template KOKKOS_INLINE_FUNCTION static - Scalar* get_shmem (const Kokkos::TeamPolicy::member_type& team, + Scalar* get_shmem (const Kokkos::TeamPolicy::member_type& team, const size_t n = 0) { return static_cast(team.team_shmem().get_shmem(n*sizeof(Scalar))); } @@ -319,8 +319,8 @@ VECTOR_SIMD_LOOP #ifdef HOMMEXX_ENABLE_GPU template <> -struct Dispatch { - using ExeSpace = Hommexx_Cuda; +struct Dispatch { + using ExeSpace = HommexxGPU; template KOKKOS_FORCEINLINE_FUNCTION @@ -363,7 +363,7 @@ struct Dispatch { static KOKKOS_FORCEINLINE_FUNCTION void parallel_for_NP2 ( //const Kokkos::TeamPolicy::member_type& team, - const Kokkos::TeamPolicy::member_type& team, + const Kokkos::TeamPolicy::member_type& team, const Lambda& lambda) { Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, NP*NP), lambda); @@ -372,7 +372,7 @@ struct Dispatch { template static KOKKOS_FORCEINLINE_FUNCTION void parallel_reduce_NP2 ( - const Kokkos::TeamPolicy::member_type& team, + const Kokkos::TeamPolicy::member_type& team, //const Kokkos::TeamPolicy::member_type& team, const Lambda& lambda, ValueType& result) { diff --git a/components/homme/src/share/cxx/KernelVariables.hpp b/components/homme/src/share/cxx/KernelVariables.hpp index 086a9bde3987..83a8e22190d9 100644 --- a/components/homme/src/share/cxx/KernelVariables.hpp +++ b/components/homme/src/share/cxx/KernelVariables.hpp @@ -19,7 +19,7 @@ struct KernelVariables { template static KOKKOS_INLINE_FUNCTION - typename std::enable_if::value && + typename std::enable_if::value && !std::is_same::value, int >::type diff --git a/components/homme/src/share/cxx/kokkos_utils.hpp b/components/homme/src/share/cxx/kokkos_utils.hpp index b2d85c0694fb..85e2ca31dc52 100644 --- a/components/homme/src/share/cxx/kokkos_utils.hpp +++ b/components/homme/src/share/cxx/kokkos_utils.hpp @@ -106,7 +106,7 @@ class TeamUtils : public _TeamUtilsCommonBase */ #ifdef HOMMEXX_ENABLE_GPU template <> -class TeamUtils : public _TeamUtilsCommonBase +class TeamUtils : public _TeamUtilsCommonBase { #ifdef HOMMEXX_CUDA_SHARE_BUFFER using Device = Kokkos::Device; @@ -124,7 +124,7 @@ class TeamUtils : public _TeamUtilsCommonBase public: template TeamUtils(const TeamPolicy& policy, const Real& overprov_factor = 1.25) : - _TeamUtilsCommonBase(policy) + _TeamUtilsCommonBase(policy) #ifdef HOMMEXX_CUDA_SHARE_BUFFER , _num_ws_slots(_league_size > _num_teams ? (overprov_factor * _num_teams > _league_size ? _league_size : overprov_factor * _num_teams) From 0b014998b3ffd67a081f14a1382ab803861c864c Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 20 Jul 2022 13:25:46 -0400 Subject: [PATCH 39/62] clean up and fixes, to be tested --- .../homme/src/preqx_kokkos/CMakeLists.txt | 45 ------------------- .../homme/src/share/cxx/ExecSpaceDefs.cpp | 12 ++--- .../homme/src/share/cxx/ExecSpaceDefs.hpp | 2 - .../homme/src/share/cxx/KernelVariables.hpp | 9 ---- components/homme/src/share/cxx/profiling.hpp | 8 ++-- .../src/theta-l_kokkos/cxx/Diagnostics.hpp | 13 ++++-- 6 files changed, 18 insertions(+), 71 deletions(-) diff --git a/components/homme/src/preqx_kokkos/CMakeLists.txt b/components/homme/src/preqx_kokkos/CMakeLists.txt index 70c87f528c03..ac5599e82f68 100644 --- a/components/homme/src/preqx_kokkos/CMakeLists.txt +++ b/components/homme/src/preqx_kokkos/CMakeLists.txt @@ -194,51 +194,6 @@ MACRO(PREQX_KOKKOS_SETUP) ${SRC_SHARE_DIR}/cxx/utilities/BfbUtils.cpp ) -#debugging which file causes issues when opt flags are used: - -#this works -#set_source_files_properties(${PREQX_DEPS_CXX} PROPERTIES COMPILE_FLAGS -O1) - -#these work, except i prob should have done "${CMAKE_CXX_FLAGS} -O3" instead of just "-O3" - -#set_source_files_properties( ${TARGET_DIR}/cxx/cxx_f90_interface_preqx.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${TARGET_DIR}/cxx/CamForcing.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${TARGET_DIR}/cxx/Diagnostics.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${TARGET_DIR}/cxx/ElementsForcing.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${TARGET_DIR}/cxx/ElementsState.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${TARGET_DIR}/cxx/HyperviscosityFunctorImpl.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${TARGET_DIR}/cxx/prim_advance_exp.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/CaarFunctor.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/Context.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/Elements.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/ElementsDerivedState.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/ElementsGeometry.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/ErrorDefs.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/EulerStepFunctor.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/ExecSpaceDefs.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/FunctorsBuffersManager.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/Hommexx_Session.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/HybridVCoord.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/HyperviscosityFunctor.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/ReferenceElement.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/Tracers.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/VerticalRemapManager.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/mpi/BoundaryExchange.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/mpi/Comm.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/mpi/Connectivity.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/mpi/MpiBuffersManager.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/mpi/mpi_cxx_f90_interface.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/prim_advec_tracers_remap.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/prim_driver.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/prim_step.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/vertical_remap.cpp PROPERTIES COMPILE_FLAGS -O3) -#set_source_files_properties( ${SRC_SHARE_DIR}/cxx/utilities/BfbUtils.cpp PROPERTIES COMPILE_FLAGS -O3) - - if(${HIP}) - #remove opt for this file. with opt >O0 asserts fail here. - set_source_files_properties( ${SRC_SHARE_DIR}/cxx/VerticalRemapManager.cpp PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} -O3") - endif() - IF (HOMME_USE_TRILINOS) SET (PREQX_SRCS_ZOLTAN ${TRILINOS_ZOLTAN_DIR}/zoltan_interface.c ${TRILINOS_ZOLTAN_DIR}/zoltan_cppinterface.cpp) SET_SOURCE_FILES_PROPERTIES( ${TRILINOS_ZOLTAN_DIR}/zoltan_cppinterface.cpp PROPERTIES LANGUAGE CXX ) diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index f3a710d28a45..7db1cedcf308 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -45,21 +45,15 @@ void initialize_kokkos () { // It isn't a big deal if we can't get the device count. nd = 1; } - std::stringstream ss; - ss << "--kokkos-num-devices=" << nd; - const auto key = ss.str(); - std::vector str(key.size()+1); - std::copy(key.begin(), key.end(), str.begin()); - str.back() = 0; - args.push_back(const_cast(str.data())); -#endif -#ifdef KOKKOS_ENABLE_HIP +#elif defined(KOKKOS_ENABLE_HIP) int nd; const auto ret = hipGetDeviceCount(&nd); if (ret != hipSuccess) { // It isn't a big deal if we can't get the device count. nd = 1; } +#endif +#if defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_HIP) std::stringstream ss; ss << "--kokkos-num-devices=" << nd; const auto key = ss.str(); diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.hpp b/components/homme/src/share/cxx/ExecSpaceDefs.hpp index 28d8e2265d30..a80f993e2d64 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.hpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.hpp @@ -362,7 +362,6 @@ struct Dispatch { template static KOKKOS_FORCEINLINE_FUNCTION void parallel_for_NP2 ( - //const Kokkos::TeamPolicy::member_type& team, const Kokkos::TeamPolicy::member_type& team, const Lambda& lambda) { @@ -373,7 +372,6 @@ struct Dispatch { static KOKKOS_FORCEINLINE_FUNCTION void parallel_reduce_NP2 ( const Kokkos::TeamPolicy::member_type& team, - //const Kokkos::TeamPolicy::member_type& team, const Lambda& lambda, ValueType& result) { parallel_reduce(team, Kokkos::ThreadVectorRange(team, NP*NP), diff --git a/components/homme/src/share/cxx/KernelVariables.hpp b/components/homme/src/share/cxx/KernelVariables.hpp index 83a8e22190d9..65d0eaec40c2 100644 --- a/components/homme/src/share/cxx/KernelVariables.hpp +++ b/components/homme/src/share/cxx/KernelVariables.hpp @@ -29,21 +29,12 @@ struct KernelVariables { } #ifdef HOMMEXX_ENABLE_GPU -//#ifdef __CUDA_ARCH__ template static KOKKOS_INLINE_FUNCTION typename std::enable_if< OnGpu::value, int>::type get_team_idx(const int /*team_size*/, const int league_rank) { return league_rank; } -#else - template - static KOKKOS_INLINE_FUNCTION typename std::enable_if< - OnGpu::value, int>::type - get_team_idx(const int /*team_size*/, const int /*league_rank*/) { - assert(false); // should never happen - return -1; - } #endif // HOMMEXX_ENABLE_GPU #ifdef KOKKOS_ENABLE_OPENMP diff --git a/components/homme/src/share/cxx/profiling.hpp b/components/homme/src/share/cxx/profiling.hpp index 207dc0f7f993..f079e9d07761 100644 --- a/components/homme/src/share/cxx/profiling.hpp +++ b/components/homme/src/share/cxx/profiling.hpp @@ -13,9 +13,11 @@ #include "gptl.h" //OG not sure about timers and HIP, probably the same as with CUDA -#if defined(HOMMEXX_HIP_SPACE) || defined(HOMMEXX_CUDA_SPACE) || \ - (defined(HOMMEXX_DEFAULT_SPACE) && defined(KOKKOS_ENABLE_CUDA)) || \ - (defined(HOMMEXX_DEFAULT_SPACE) && defined(KOKKOS_ENABLE_HIP)) +//#if defined(HOMMEXX_HIP_SPACE) || defined(HOMMEXX_CUDA_SPACE) || \ +// (defined(HOMMEXX_DEFAULT_SPACE) && defined(KOKKOS_ENABLE_CUDA)) || \ +// (defined(HOMMEXX_DEFAULT_SPACE) && defined(KOKKOS_ENABLE_HIP)) + +#ifdef HOMMEXX_ENABLE_GPU #define start_timer(name) {} #define stop_timer(name) {} #else diff --git a/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp b/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp index 7ee7fb2041ae..8d9e00ecf924 100644 --- a/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp @@ -13,6 +13,10 @@ #include "utilities/SubviewUtils.hpp" #include "utilities/ViewUtils.hpp" +#if ! defined(NDEBUG) +#define RESOLVE_ISSUE_WITH_ASSERTS +#endif + namespace Homme { @@ -35,7 +39,7 @@ class Diagnostics ExecViewUnmanaged dpnh_dp_i; }; - +#if defined RESOLVE_ISSUE_WITH_ASSERTS template typename std::enable_if::value == false, Kokkos::TeamPolicy >::type @@ -53,7 +57,7 @@ class Diagnostics tp.prefer_larger_team = true; return Homme::get_default_team_policy(num_exec, tp); } - +#endif @@ -61,8 +65,11 @@ class Diagnostics Diagnostics (const int num_elems, const bool theta_hydrostatic_mode) : - //m_policy(Homme::get_default_team_policy(num_elems)), +#if ! defined(RESOLVE_ISSUE_WITH_ASSERTS) + m_policy(Homme::get_default_team_policy(num_elems)), +#else m_policy(d_team_policy(num_elems)), +#endif m_tu(m_policy), m_num_elems(num_elems), m_theta_hydrostatic_mode(theta_hydrostatic_mode) From cc33ef072c8871023ccde982f64b16cd151c015d Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Thu, 21 Jul 2022 15:16:41 -0400 Subject: [PATCH 40/62] fixing bfb; confirmed for fhs1 test --- components/homme/src/share/cxx/utilities/BfbUtils.hpp | 2 +- components/homme/src/share/cxx/utilities/bfb_mod.F90 | 2 +- components/homme/src/theta-l/share/element_ops.F90 | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/homme/src/share/cxx/utilities/BfbUtils.hpp b/components/homme/src/share/cxx/utilities/BfbUtils.hpp index afd64a843d27..e3570874e266 100644 --- a/components/homme/src/share/cxx/utilities/BfbUtils.hpp +++ b/components/homme/src/share/cxx/utilities/BfbUtils.hpp @@ -111,7 +111,7 @@ typename std::enable_if::value && !std::is_reference::value, ScalarType>::type bfb_pow_impl (ScalarType val, ExpType e) { -#ifdef CUDA_BUILD +#ifdef HOMMEXX_ENABLE_GPU // Note: this function is tailored (or taylored...eheh) // for -1 Date: Mon, 8 Aug 2022 16:43:25 -0400 Subject: [PATCH 41/62] Hommexx: Fix some CMake issues. Using link_to_kokkos from Kokkos.cmake provides the correct include and lib(64) directories. Use this macro for Hommexx targets. As a result, crusher-gpumpi.cmake can be trimmed down substantially. Remove lines in that file depending on E3SM_KOKKOS_PATH. Instead, set E3SM_KOKKOS_PATH in your configuration script. --- components/homme/cmake/HommeMacros.cmake | 2 +- .../homme/cmake/machineFiles/crusher-gpumpi.cmake | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/components/homme/cmake/HommeMacros.cmake b/components/homme/cmake/HommeMacros.cmake index 025898d338df..2639e641e363 100644 --- a/components/homme/cmake/HommeMacros.cmake +++ b/components/homme/cmake/HommeMacros.cmake @@ -155,7 +155,7 @@ macro(createTestExec execName execType macroNP macroNC ENDIF () IF (HOMME_USE_KOKKOS) - TARGET_LINK_LIBRARIES(${execName} kokkos) + link_to_kokkos(${execName}) ENDIF () # Move the module files out of the way so the parallel build diff --git a/components/homme/cmake/machineFiles/crusher-gpumpi.cmake b/components/homme/cmake/machineFiles/crusher-gpumpi.cmake index 3c060f31df51..8ee446ae1000 100644 --- a/components/homme/cmake/machineFiles/crusher-gpumpi.cmake +++ b/components/homme/cmake/machineFiles/crusher-gpumpi.cmake @@ -20,7 +20,6 @@ SET(USE_QUEUING FALSE CACHE BOOL "") SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") -SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") #SET (HOMMEXX_BFB_TESTING TRUE CACHE BOOL "") @@ -30,9 +29,6 @@ SET(USE_TRILINOS OFF CACHE BOOL "") #i haven't extend it to hip, set it here instead SET(HIP_BUILD TRUE CACHE BOOL "") -#set this to true if using external kokkos build -SET(E3SM_KOKKOS_PATH "/ccs/home/onguba/kokkos-crusher-june2022/bld-hipcc" CACHE STRING "") - #uncomment this if using internal kokkos build #SET(Kokkos_ENABLE_SERIAL ON CACHE BOOL "") ####SET(CMAKE_CXX_STANDARD "14" CACHE STRING "") @@ -52,10 +48,10 @@ SET(MPICH_DIR "/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0" CACHE STRING "") SET(Extrae_LIBRARY "-I${MPICH_DIR}/include -L${MPICH_DIR}/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "") -SET(ADD_Fortran_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") -SET(ADD_C_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") -SET(ADD_CXX_FLAGS "-std=c++14 -O3 -DNDEBUG --amdgpu-target=gfx90a -fno-gpu-rdc ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") -SET(ADD_LINKER_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") +SET(ADD_Fortran_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_C_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_CXX_FLAGS "-std=c++14 -O3 -DNDEBUG --amdgpu-target=gfx90a -fno-gpu-rdc ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_LINKER_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY}" CACHE STRING "") set (ENABLE_OPENMP OFF CACHE BOOL "") From f0bf95e5bfd7ec1df745ba5f669dda07cc0fcba1 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 8 Aug 2022 16:44:35 -0400 Subject: [PATCH 42/62] Hommexx/SL: Clean up use of HOMME_ENABLE_COMPOSE. Include Config.hpp to see the symbol. Move some uses around. --- components/homme/src/share/compose_mod.F90 | 2 +- .../homme/src/share/cxx/ComposeTransport.cpp | 4 ++-- .../homme/src/share/cxx/ComposeTransport.hpp | 5 ++--- .../homme/src/share/cxx/ComposeTransportImpl.hpp | 5 ++--- .../src/share/cxx/ComposeTransportImplGeneral.cpp | 5 ++--- .../src/share/cxx/ComposeTransportImplHypervis.cpp | 4 ++-- .../src/share/cxx/ComposeTransportImplTest2D.cpp | 4 ++-- .../share/cxx/ComposeTransportImplTrajectory.cpp | 4 ++-- .../cxx/ComposeTransportImplVerticalRemap.cpp | 4 ++-- .../src/share/cxx/prim_advec_tracers_remap.cpp | 14 +++++--------- 10 files changed, 22 insertions(+), 29 deletions(-) diff --git a/components/homme/src/share/compose_mod.F90 b/components/homme/src/share/compose_mod.F90 index e7f95af04f66..8b134def384b 100644 --- a/components/homme/src/share/compose_mod.F90 +++ b/components/homme/src/share/compose_mod.F90 @@ -374,9 +374,9 @@ end subroutine compose_init subroutine compose_finalize(finalize_kokkos) logical, optional, intent(in) :: finalize_kokkos +#ifdef HOMME_ENABLE_COMPOSE logical :: call_finalize_kokkos -#ifdef HOMME_ENABLE_COMPOSE call cedr_finalize() call slmm_finalize() diff --git a/components/homme/src/share/cxx/ComposeTransport.cpp b/components/homme/src/share/cxx/ComposeTransport.cpp index 7a69b8007d8c..760a3dd06fa4 100644 --- a/components/homme/src/share/cxx/ComposeTransport.cpp +++ b/components/homme/src/share/cxx/ComposeTransport.cpp @@ -4,6 +4,7 @@ * See the file 'COPYRIGHT' in the HOMMEXX/src/share/cxx directory *******************************************************************************/ +#include "Config.hpp" #ifdef HOMME_ENABLE_COMPOSE #include "ComposeTransport.hpp" @@ -87,5 +88,4 @@ void ComposeTransport::test_2d (const bool bfb, const int nstep, std::vector(); if (params.transport_alg > 0) { -#if defined(MODEL_THETA_L) && defined(HOMME_ENABLE_COMPOSE) prim_advec_tracers_remap_compose(dt); -#else - Errors::runtime_abort("prim_advec_tracers_remap: " - "transport_alg > 0 not supported for non-theta-l builds."); -#endif } else { prim_advec_tracers_remap_RK2(dt); } @@ -96,8 +89,8 @@ static void prim_advec_tracers_remap_RK2 (const Real dt) GPTLstop("tl-at prim_advec_tracers_remap_RK2"); } -#if defined(MODEL_THETA_L) && defined(HOMME_ENABLE_COMPOSE) static void prim_advec_tracers_remap_compose (const Real dt) { +#if defined MODEL_THETA_L && defined HOMME_ENABLE_COMPOSE GPTLstart("tl-at prim_advec_tracers_compose"); const auto& params = Context::singleton().get(); assert(params.params_set); @@ -107,7 +100,10 @@ static void prim_advec_tracers_remap_compose (const Real dt) { ct.reset(params); ct.run(tl, dt); GPTLstop("tl-at prim_advec_tracers_compose"); -} +#else + Errors::runtime_abort("prim_advec_tracers_remap_compose: " + "transport_alg > 0 not supported in this build."); #endif +} } // namespace Homme From 1deaa7912fbac5c63c8de903c89744de720fcbb2 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 8 Aug 2022 16:45:00 -0400 Subject: [PATCH 43/62] Hommexx/SL: Generalize Cuda to GPU spaces. Handle Cuda, HIP, and perhaps SYCL. --- .../share/compose/cedr_bfb_tree_allreduce.cpp | 4 +-- .../homme/src/share/compose/cedr_caas.cpp | 4 +-- .../homme/src/share/compose/cedr_kokkos.hpp | 34 ++++++++++++++----- .../homme/src/share/compose/cedr_qlt.cpp | 4 +-- .../homme/src/share/compose/compose.hpp | 20 +++++++++-- .../homme/src/share/compose/compose_homme.hpp | 12 +++---- .../src/share/compose/compose_kokkos.hpp | 18 +++++----- .../homme/src/share/compose/compose_slmm.cpp | 2 +- 8 files changed, 65 insertions(+), 33 deletions(-) diff --git a/components/homme/src/share/compose/cedr_bfb_tree_allreduce.cpp b/components/homme/src/share/compose/cedr_bfb_tree_allreduce.cpp index aadbfaecb004..9d2cb3df8c8d 100644 --- a/components/homme/src/share/compose/cedr_bfb_tree_allreduce.cpp +++ b/components/homme/src/share/compose/cedr_bfb_tree_allreduce.cpp @@ -231,8 +231,8 @@ template class cedr::BfbTreeAllReducer; #ifdef KOKKOS_ENABLE_OPENMP template class cedr::BfbTreeAllReducer; #endif -#ifdef KOKKOS_ENABLE_CUDA -template class cedr::BfbTreeAllReducer; +#ifdef CEDR_ENABLE_GPU +template class cedr::BfbTreeAllReducer; #endif #ifdef KOKKOS_ENABLE_THREADS template class cedr::BfbTreeAllReducer; diff --git a/components/homme/src/share/compose/cedr_caas.cpp b/components/homme/src/share/compose/cedr_caas.cpp index c16d392065c4..87506f6e9a05 100644 --- a/components/homme/src/share/compose/cedr_caas.cpp +++ b/components/homme/src/share/compose/cedr_caas.cpp @@ -401,8 +401,8 @@ template class cedr::caas::CAAS; #ifdef KOKKOS_ENABLE_OPENMP template class cedr::caas::CAAS; #endif -#ifdef KOKKOS_ENABLE_CUDA -template class cedr::caas::CAAS; +#ifdef CEDR_ENABLE_GPU +template class cedr::caas::CAAS; #endif #ifdef KOKKOS_ENABLE_THREADS template class cedr::caas::CAAS; diff --git a/components/homme/src/share/compose/cedr_kokkos.hpp b/components/homme/src/share/compose/cedr_kokkos.hpp index 0d50c298bdff..891d9f763ecb 100644 --- a/components/homme/src/share/compose/cedr_kokkos.hpp +++ b/components/homme/src/share/compose/cedr_kokkos.hpp @@ -6,6 +6,22 @@ #include +#if defined KOKKOS_ENABLE_CUDA || defined KOKKOS_ENABLE_HIP || defined KOKKOS_ENABLE_SYCL +# define CEDR_ENABLE_GPU +# if defined KOKKOS_ENABLE_CUDA +typedef Kokkos::Cuda CedrGpuExeSpace; +typedef Kokkos::CudaSpace CedrGpuSpace; +# endif +# if defined KOKKOS_ENABLE_HIP +typedef Kokkos::Experimental::HIP CedrGpuExeSpace; +typedef Kokkos::Experimental::HIPSpace CedrGpuSpace; +# endif +# if defined KOKKOS_ENABLE_SYCL +typedef Kokkos::Experimental::SYCL CedrGpuExeSpace; +typedef Kokkos::Experimental::SYCL> CedrGpuSpace; +# endif +#endif + #define KIF KOKKOS_INLINE_FUNCTION // Clarify that a class member type is meant to be private but is @@ -74,11 +90,11 @@ struct DeviceType { typename ExeSpace::memory_space> type; }; -#ifdef KOKKOS_ENABLE_CUDA -typedef Kokkos::Device DefaultDeviceType; +#ifdef CEDR_ENABLE_GPU +typedef Kokkos::Device DefaultDeviceType; -template <> struct DeviceType { +template <> struct DeviceType { typedef DefaultDeviceType type; }; #else @@ -95,8 +111,8 @@ template struct OnGpu { #endif }; }; -#ifdef KOKKOS_ENABLE_CUDA -template <> struct OnGpu { enum : bool { value = true }; }; +#ifdef CEDR_ENABLE_GPU +template <> struct OnGpu { enum : bool { value = true }; }; #endif template @@ -120,10 +136,10 @@ struct ExeSpaceUtils { } }; -#ifdef KOKKOS_ENABLE_CUDA +#ifdef CEDR_ENABLE_GPU template <> -struct ExeSpaceUtils { - using TeamPolicy = Kokkos::TeamPolicy; +struct ExeSpaceUtils { + using TeamPolicy = Kokkos::TeamPolicy; using Member = typename TeamPolicy::member_type; static TeamPolicy get_default_team_policy (int outer, int inner) { return TeamPolicy(outer, std::min(128, 32*((inner + 31)/32)), 1); diff --git a/components/homme/src/share/compose/cedr_qlt.cpp b/components/homme/src/share/compose/cedr_qlt.cpp index 67f8e2c6d43f..c3728093ce56 100644 --- a/components/homme/src/share/compose/cedr_qlt.cpp +++ b/components/homme/src/share/compose/cedr_qlt.cpp @@ -807,8 +807,8 @@ template class cedr::qlt::QLT; #ifdef KOKKOS_ENABLE_OPENMP template class cedr::qlt::QLT; #endif -#ifdef KOKKOS_ENABLE_CUDA -template class cedr::qlt::QLT; +#ifdef CEDR_ENABLE_GPU +template class cedr::qlt::QLT; #endif #ifdef KOKKOS_ENABLE_THREADS template class cedr::qlt::QLT; diff --git a/components/homme/src/share/compose/compose.hpp b/components/homme/src/share/compose/compose.hpp index 915596aa8911..01be2635fcf6 100644 --- a/components/homme/src/share/compose/compose.hpp +++ b/components/homme/src/share/compose/compose.hpp @@ -11,6 +11,22 @@ # include #endif +#if defined KOKKOS_ENABLE_CUDA || defined KOKKOS_ENABLE_HIP || defined KOKKOS_ENABLE_SYCL +# define COMPOSE_ENABLE_GPU +# if defined KOKKOS_ENABLE_CUDA +typedef Kokkos::Cuda ComposeGpuExeSpace; +typedef Kokkos::CudaSpace ComposeGpuSpace; +# endif +# if defined KOKKOS_ENABLE_HIP +typedef Kokkos::Experimental::HIP ComposeGpuExeSpace; +typedef Kokkos::Experimental::HIPSpace ComposeGpuSpace; +# endif +# if defined KOKKOS_ENABLE_SYCL +typedef Kokkos::Experimental::SYCL ComposeGpuExeSpace; +typedef Kokkos::Experimental::SYCL> ComposeGpuSpace; +# endif +#endif + // Options #ifdef NDEBUG @@ -42,11 +58,11 @@ # if defined COMPOSE_HORIZ_OPENMP || defined COMPOSE_COLUMN_OPENMP "This should not happen." # endif -# ifndef KOKKOS_ENABLE_CUDA +# ifndef COMPOSE_ENABLE_GPU // Mimic GPU threading on host to debug race conditions on a regular CPU. //# define COMPOSE_MIMIC_GPU # endif -# if defined COMPOSE_MIMIC_GPU || defined KOKKOS_ENABLE_CUDA +# if defined COMPOSE_MIMIC_GPU || defined COMPOSE_ENABLE_GPU // If defined, then certain buffers need explicit mirroring and copying. # define COMPOSE_PORT_SEPARATE_VIEWS // If defined, do pass1 routines on host. This is for performance checking. diff --git a/components/homme/src/share/compose/compose_homme.hpp b/components/homme/src/share/compose/compose_homme.hpp index d53cf90ccf2d..a3b40a204a9a 100644 --- a/components/homme/src/share/compose/compose_homme.hpp +++ b/components/homme/src/share/compose/compose_homme.hpp @@ -144,7 +144,7 @@ struct HommeFormatArray { assert(ie_data_ptr[ie]); // These routines are not used on the GPU, but they can be called from // KOKKOS_FUNCTIONs on CPU in GPU builds. Avoid nvcc warnings as follows: -#ifdef __CUDA_ARCH__ +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ return unused(); #else return *(ie_data_ptr[ie] + i); @@ -157,7 +157,7 @@ struct HommeFormatArray { assert(lev >= 0); assert(ie_data_ptr[ie]); check(ie, k, lev); -#ifdef __CUDA_ARCH__ +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ return unused(); #else return *(ie_data_ptr[ie] + lev*np2 + k); @@ -172,7 +172,7 @@ struct HommeFormatArray { assert(lev >= 0); assert(ie_data_ptr[ie]); check(ie, k, lev, q_or_timelev); -#ifdef __CUDA_ARCH__ +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ return unused(); #else return *(ie_data_ptr[ie] + (q_or_timelev*nlev + lev)*np2 + k); @@ -188,7 +188,7 @@ struct HommeFormatArray { assert(lev >= 0); assert(ie_data_ptr[ie]); check(ie, k, lev, q, timelev); -#ifdef __CUDA_ARCH__ +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ return unused(); #else return *(ie_data_ptr[ie] + ((timelev*qsized + q)*nlev + lev)*np2 + k); @@ -200,7 +200,7 @@ struct HommeFormatArray { std::vector ie_data_ptr; const Int nlev, qsized, ntimelev; -#ifdef KOKKOS_ENABLE_CUDA +#ifdef COMPOSE_ENABLE_GPU COMPOSE_INLINE_FUNCTION static T& unused () { static T unused = 0; assert(0); @@ -211,7 +211,7 @@ struct HommeFormatArray { COMPOSE_FORCEINLINE_FUNCTION void check (Int ie, Int k = -1, Int lev = -1, Int q_or_timelev = -1, Int timelev = -1) const { -#if defined COMPOSE_BOUNDS_CHECK && ! defined __CUDA_ARCH__ +#if defined COMPOSE_BOUNDS_CHECK && ! (defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__) assert(ie >= 0 && ie < static_cast(ie_data_ptr.size())); if (k >= 0) assert(k < np2); if (lev >= 0) assert(lev < nlev); diff --git a/components/homme/src/share/compose/compose_kokkos.hpp b/components/homme/src/share/compose/compose_kokkos.hpp index 6e683253df45..9dd2515bc671 100644 --- a/components/homme/src/share/compose/compose_kokkos.hpp +++ b/components/homme/src/share/compose/compose_kokkos.hpp @@ -31,7 +31,7 @@ template struct NumericTraits; template <> struct NumericTraits { KOKKOS_INLINE_FUNCTION static double epsilon () { return -#ifdef KOKKOS_ENABLE_CUDA +#ifdef COMPOSE_ENABLE_GPU 2.2204460492503131e-16 #else std::numeric_limits::epsilon() @@ -43,7 +43,7 @@ template <> struct NumericTraits { template <> struct NumericTraits { KOKKOS_INLINE_FUNCTION static float epsilon () { return -#ifdef KOKKOS_ENABLE_CUDA +#ifdef COMPOSE_ENABLE_GPU 1.1920928955078125e-07 #else std::numeric_limits::epsilon() @@ -58,11 +58,11 @@ struct DeviceType { typename ExeSpace::memory_space> type; }; -#ifdef KOKKOS_ENABLE_CUDA -typedef Kokkos::Device DefaultDeviceType; +#ifdef COMPOSE_ENABLE_GPU +typedef Kokkos::Device DefaultDeviceType; -template <> struct DeviceType { +template <> struct DeviceType { typedef DefaultDeviceType type; }; #else @@ -92,8 +92,8 @@ template struct OnGpu { #endif }; }; -#ifdef KOKKOS_ENABLE_CUDA -template <> struct OnGpu { enum : bool { value = true }; }; +#ifdef COMPOSE_ENABLE_GPU +template <> struct OnGpu { enum : bool { value = true }; }; template <> struct OnGpu {}; // flag as an error at compile time #endif @@ -140,7 +140,7 @@ template const View& unmanaged ( #endif // Copy by ref if not Cuda build. -#if defined COMPOSE_PORT && defined KOKKOS_ENABLE_CUDA +#if defined COMPOSE_PORT && defined COMPOSE_ENABLE_GPU # define COMPOSE_LAMBDA KOKKOS_LAMBDA # define COMPOSE_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION # define COMPOSE_FORCEINLINE_FUNCTION KOKKOS_FORCEINLINE_FUNCTION diff --git a/components/homme/src/share/compose/compose_slmm.cpp b/components/homme/src/share/compose/compose_slmm.cpp index 7006ff14c618..b8277b5423c6 100644 --- a/components/homme/src/share/compose/compose_slmm.cpp +++ b/components/homme/src/share/compose/compose_slmm.cpp @@ -240,7 +240,7 @@ static void initialize_kokkos () { if (Kokkos::is_initialized()) return; in_charge_of_kokkos = true; std::vector args; -#ifdef KOKKOS_ENABLE_CUDA +#ifdef HOMMEXX_ENABLE_GPU int nd; const auto ret = cudaGetDeviceCount(&nd); if (ret != cudaSuccess) { From 632c46ac96c686d599fd054f9df659933a8c98ee Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 8 Aug 2022 16:53:56 -0400 Subject: [PATCH 44/62] Hommexx: Update DIRK solver's tridiag solver to handle HIP. --- .../share/cxx/utilities/scream_tridiag.hpp | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/components/homme/src/share/cxx/utilities/scream_tridiag.hpp b/components/homme/src/share/cxx/utilities/scream_tridiag.hpp index 0fc28a104b98..e18bbc4e7e27 100644 --- a/components/homme/src/share/cxx/utilities/scream_tridiag.hpp +++ b/components/homme/src/share/cxx/utilities/scream_tridiag.hpp @@ -120,10 +120,11 @@ int get_team_nthr (const TeamMember& team) { return team.team_size(); } -#ifdef KOKKOS_ENABLE_CUDA -KOKKOS_INLINE_FUNCTION -int get_thread_id_within_team (const Kokkos::Impl::CudaTeamMember& team) { -#ifdef __CUDA_ARCH__ +// Impl details for Nvidia and AMD GPUs. + +template KOKKOS_FORCEINLINE_FUNCTION +int get_thread_id_within_team_gpu (const TeamMember& team) { +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ // Can't use team.team_rank() here because vector direction also uses physical // threads but TeamMember types don't expose that information. return blockDim.x * threadIdx.y + threadIdx.x; @@ -133,17 +134,33 @@ int get_thread_id_within_team (const Kokkos::Impl::CudaTeamMember& team) { #endif } -KOKKOS_INLINE_FUNCTION -int get_team_nthr (const Kokkos::Impl::CudaTeamMember& team) { -#ifdef __CUDA_ARCH__ +template KOKKOS_FORCEINLINE_FUNCTION +int get_team_nthr_gpu (const TeamMember& team) { +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ return blockDim.x * blockDim.y; #else assert(0); return -1; #endif } -#endif +#ifdef KOKKOS_ENABLE_CUDA +KOKKOS_FORCEINLINE_FUNCTION +int get_thread_id_within_team (const Kokkos::Impl::CudaTeamMember& team) +{ return get_thread_id_within_team_gpu(team); } +KOKKOS_FORCEINLINE_FUNCTION +int get_team_nthr (const Kokkos::Impl::CudaTeamMember& team) +{ return get_team_nthr_gpu(team); } +#endif // KOKKOS_ENABLE_CUDA + +#ifdef KOKKOS_ENABLE_HIP +KOKKOS_FORCEINLINE_FUNCTION +int get_thread_id_within_team (const Kokkos::Impl::HIPTeamMember& team) +{ return get_thread_id_within_team_gpu(team); } +KOKKOS_FORCEINLINE_FUNCTION +int get_team_nthr (const Kokkos::Impl::HIPTeamMember& team) +{ return get_team_nthr_gpu(team); } +#endif // KOKKOS_ENABLE_HIP template KOKKOS_INLINE_FUNCTION const T& min (const T& a, const T& b) { return a < b ? a : b; } From 8105e7ae37c22cc365c1f6663c7de1b9a34c854b Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 8 Aug 2022 17:33:27 -0400 Subject: [PATCH 45/62] Hommexx: Fix a warning in VR. --- components/homme/src/share/cxx/RemapFunctor.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/homme/src/share/cxx/RemapFunctor.hpp b/components/homme/src/share/cxx/RemapFunctor.hpp index ba6a4cd0e82d..cec96657d35b 100644 --- a/components/homme/src/share/cxx/RemapFunctor.hpp +++ b/components/homme/src/share/cxx/RemapFunctor.hpp @@ -524,11 +524,11 @@ struct RemapFunctor : public Remapper { Kokkos::parallel_for(get_default_team_policy(ne*nv), r); } - int requested_buffer_size () const { + int requested_buffer_size () const override { return m_fields_provider.requested_buffer_size(); } - void init_buffers(const FunctorsBuffersManager& fbm) { + void init_buffers(const FunctorsBuffersManager& fbm) override { m_fields_provider.init_buffers(fbm); } From 6c4cf8957512879cfff96be8aa52b253bb0e84f7 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 8 Aug 2022 17:53:26 -0400 Subject: [PATCH 46/62] Hommexx: Make more link lines use link_to_kokkos. link_to_kokkos can handle the lib layouts of various versions of Kokkos, wherease linking directly to libkokkos is an error with some Kokkos versions. --- components/homme/test/unit_tests/CMakeLists.txt | 3 ++- components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt | 2 +- components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/components/homme/test/unit_tests/CMakeLists.txt b/components/homme/test/unit_tests/CMakeLists.txt index 966be2007c47..b86675d03088 100644 --- a/components/homme/test/unit_tests/CMakeLists.txt +++ b/components/homme/test/unit_tests/CMakeLists.txt @@ -15,7 +15,8 @@ macro(cxx_unit_test target_name target_f90_srcs target_cxx_srcs include_dirs con #ENDIF() SET_TESTS_PROPERTIES(${target_name}_test PROPERTIES LABELS "unit") - TARGET_LINK_LIBRARIES(${target_name} timing ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} kokkos) + TARGET_LINK_LIBRARIES(${target_name} timing ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) + link_to_kokkos(${target_name}) IF (HOMME_USE_MKL) TARGET_COMPILE_OPTIONS (${target_name} PUBLIC -mkl) TARGET_LINK_LIBRARIES (${target_name} -mkl) diff --git a/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt b/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt index 2bafd5592bad..715566c37b5c 100644 --- a/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt +++ b/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt @@ -23,7 +23,7 @@ ADD_LIBRARY(preqx_kokkos_ut_lib ${PREQX_DEPS}) TARGET_INCLUDE_DIRECTORIES(preqx_kokkos_ut_lib PUBLIC ${EXEC_INCLUDE_DIRS}) TARGET_INCLUDE_DIRECTORIES(preqx_kokkos_ut_lib PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) TARGET_COMPILE_DEFINITIONS(preqx_kokkos_ut_lib PUBLIC "HAVE_CONFIG_H") -TARGET_LINK_LIBRARIES(preqx_kokkos_ut_lib kokkos) +link_to_kokkos(preqx_kokkos_ut_lib) TARGET_LINK_LIBRARIES(preqx_kokkos_ut_lib timing ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) IF (HOMME_USE_MKL) TARGET_LINK_LIBRARIES (preqx_kokkos_ut_lib -mkl) diff --git a/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt b/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt index ad1c10b15c36..4102efe89ca0 100644 --- a/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt +++ b/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt @@ -25,7 +25,7 @@ ADD_LIBRARY(thetal_kokkos_ut_lib ${THETAL_DEPS}) TARGET_INCLUDE_DIRECTORIES(thetal_kokkos_ut_lib PUBLIC ${EXEC_INCLUDE_DIRS}) TARGET_INCLUDE_DIRECTORIES(thetal_kokkos_ut_lib PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) TARGET_COMPILE_DEFINITIONS(thetal_kokkos_ut_lib PUBLIC "HAVE_CONFIG_H") -TARGET_LINK_LIBRARIES(thetal_kokkos_ut_lib kokkos) +link_to_kokkos(thetal_kokkos_ut_lib) TARGET_LINK_LIBRARIES(thetal_kokkos_ut_lib timing ${COMPOSE_LIBRARY_CPP} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) IF (HOMME_USE_MKL) TARGET_LINK_LIBRARIES (thetal_kokkos_ut_lib -mkl) From 44dade3ea5b20e613014a2f921b61116111ce7d2 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 8 Aug 2022 19:31:12 -0400 Subject: [PATCH 47/62] Homme: Slightly relax a tolerance in a unit test. --- components/homme/src/share/gllfvremap_mod.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/homme/src/share/gllfvremap_mod.F90 b/components/homme/src/share/gllfvremap_mod.F90 index c14bc357a208..455af7a005b8 100644 --- a/components/homme/src/share/gllfvremap_mod.F90 +++ b/components/homme/src/share/gllfvremap_mod.F90 @@ -2797,7 +2797,7 @@ function test_sphere2ref() result(nerr) call ref2sphere(corners, refin(1), refin(2), sphere) call sphere2ref(corners, sphere, refout(1), refout(2)) err = abs(refin(1) - refout(1)) + abs(refin(2) - refout(2)) - if (err > 10*eps .or. & + if (err > 15*eps .or. & maxval(abs(refout)) > 1 + 5*eps .or. & any(refout /= refout)) then write(iulog,*) refin(1), refin(2) From 62fc2e38c68ba562539179a2f763aad0044d0153 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 8 Aug 2022 20:13:15 -0400 Subject: [PATCH 48/62] Homme/SL: Slightly relax a tolerance in a unit test. --- components/homme/src/share/compose/cedr_test_randomized.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/homme/src/share/compose/cedr_test_randomized.cpp b/components/homme/src/share/compose/cedr_test_randomized.cpp index ea8c84e32156..7e51ec5e768b 100644 --- a/components/homme/src/share/compose/cedr_test_randomized.cpp +++ b/components/homme/src/share/compose/cedr_test_randomized.cpp @@ -395,7 +395,7 @@ ::check (const std::string& cdr_name, const mpi::Parallel& p, local_violated.size(), MPI_MAX, p.root()); if (p.amroot()) { - const Real tol = 1e2*std::numeric_limits::epsilon(); + const Real tol = 5e2*std::numeric_limits::epsilon(); for (size_t ti = 0; ti < ts.size(); ++ti) { // Check mass conservation. const Real desired_mass = glbl_mass[3*ti], actual_mass = glbl_mass[3*ti+1], From b304124164864593b6c330872c5b3389a0d11e43 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 8 Aug 2022 21:42:31 -0400 Subject: [PATCH 49/62] Hommexx/SL: Fix a variable for lambda capture in two spots. Worked on various Cuda devices, but Crusher needs this to be explicitly not a reference to capture it correctly. --- .../homme/src/share/compose/compose_cedr_sl_run_global.cpp | 2 +- .../homme/src/share/compose/compose_cedr_sl_run_local.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/components/homme/src/share/compose/compose_cedr_sl_run_global.cpp b/components/homme/src/share/compose/compose_cedr_sl_run_global.cpp index aa1c202792da..a8ced6c3f131 100644 --- a/components/homme/src/share/compose/compose_cedr_sl_run_global.cpp +++ b/components/homme/src/share/compose/compose_cedr_sl_run_global.cpp @@ -84,7 +84,7 @@ void run_global (CDR& cdr, CDRT* cedr_cdr_p, const auto& nonnegs = cdr.nonneg; const auto& ie2lci = cdr.ie2lci; const auto& ie2gci = cdr.ie2gci; - const typename CDRT::DeviceOp& cedr_cdr = cedr_cdr_p->get_device_op(); + const typename CDRT::DeviceOp cedr_cdr = cedr_cdr_p->get_device_op(); if (cedr::impl::OnGpu::value) { const Int n = ta.nelemd*nlev*qsize*np2; ko::View q_min_1d(q_min.data(), n); diff --git a/components/homme/src/share/compose/compose_cedr_sl_run_local.cpp b/components/homme/src/share/compose/compose_cedr_sl_run_local.cpp index 6db595ea31c3..bfd9ff5b0add 100644 --- a/components/homme/src/share/compose/compose_cedr_sl_run_local.cpp +++ b/components/homme/src/share/compose/compose_cedr_sl_run_local.cpp @@ -186,7 +186,7 @@ void run_local (CDR& cdr, CDRT* cedr_cdr_p, const Int nsuplev = cdr.nsuplev; const auto cdr_over_super_levels = cdr.cdr_over_super_levels; const auto caas_in_suplev = cdr.caas_in_suplev; - const typename CDRT::DeviceOp& cedr_cdr = cedr_cdr_p->get_device_op(); + const typename CDRT::DeviceOp cedr_cdr = cedr_cdr_p->get_device_op(); const auto& ie2lci = cdr.ie2lci; // Loop differently due to performance diff on CPU. #ifdef COMPOSE_PORT From a0af755f74cbabc2885b33d7e99eb6ba33abcfff Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 8 Aug 2022 22:29:51 -0400 Subject: [PATCH 50/62] Hommexx: Get standalone Homme to build its own Kokkos. This capability seemed broken because algorithms/src wasn't being included. Fix this. Also update summit-gpumpi.cmake to always build COMPOSE. --- components/homme/cmake/Kokkos.cmake | 2 +- components/homme/cmake/machineFiles/summit-gpumpi.cmake | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/components/homme/cmake/Kokkos.cmake b/components/homme/cmake/Kokkos.cmake index b0a32cf1cf4b..6bea395a4403 100644 --- a/components/homme/cmake/Kokkos.cmake +++ b/components/homme/cmake/Kokkos.cmake @@ -51,10 +51,10 @@ macro(install_kokkos_if_needed) endmacro() macro(link_to_kokkos targetName) + target_include_directories(${targetName} SYSTEM PUBLIC ${KOKKOS_INCLUDE_DIR}) if (KOKKOS_LIBS_ARE_TARGETS) target_link_libraries (${targetName} ${KOKKOS_LIBRARIES}) else() - target_include_directories(${targetName} SYSTEM PUBLIC ${KOKKOS_INCLUDE_DIR}) target_link_libraries(${targetName} ${KOKKOS_TPL_LIBRARIES} ${KOKKOS_LIBRARIES} -L${KOKKOS_LIBRARY_DIR}) endif() endmacro(link_to_kokkos) diff --git a/components/homme/cmake/machineFiles/summit-gpumpi.cmake b/components/homme/cmake/machineFiles/summit-gpumpi.cmake index 644b7840d94c..46474a5a2fb7 100644 --- a/components/homme/cmake/machineFiles/summit-gpumpi.cmake +++ b/components/homme/cmake/machineFiles/summit-gpumpi.cmake @@ -20,8 +20,6 @@ SET(USE_QUEUING FALSE CACHE BOOL "") SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") -#SET(HOMME_ENABLE_COMPOSE TRUE CACHE BOOL "") -SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") #SET (HOMMEXX_BFB_TESTING TRUE CACHE BOOL "") From d3adb964aff134e278a9f9fd6c87688166642514 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Tue, 23 Aug 2022 17:51:23 -0400 Subject: [PATCH 51/62] Hommexx: CUDA_ARCH equivalent in GllFvRemap. --- components/homme/src/share/cxx/GllFvRemapImpl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/homme/src/share/cxx/GllFvRemapImpl.hpp b/components/homme/src/share/cxx/GllFvRemapImpl.hpp index b4bab84e6091..4d3e4caf68fd 100644 --- a/components/homme/src/share/cxx/GllFvRemapImpl.hpp +++ b/components/homme/src/share/cxx/GllFvRemapImpl.hpp @@ -385,7 +385,7 @@ struct GllFvRemapImpl { // coalesced memory access on the GPU. Kokkos doesn't expose the number of // threads in a team, so we have to go to the lower-level API here. const int nthr_per_team = -#if defined __CUDA_ARCH__ +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ blockDim.x, #else 1, From 406a01e37b20d5bf039e6f84a61a51d11ff1e899 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 13 Sep 2022 10:52:46 -0600 Subject: [PATCH 52/62] fix preqx build --- components/homme/src/preqx/config.h.cmake.in | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/components/homme/src/preqx/config.h.cmake.in b/components/homme/src/preqx/config.h.cmake.in index c9bcbb432d6f..2e6f6ad0add8 100644 --- a/components/homme/src/preqx/config.h.cmake.in +++ b/components/homme/src/preqx/config.h.cmake.in @@ -66,3 +66,7 @@ /* When doing BFB testing, we occasionally must use modified code. */ /* Use this flag to protect such code. */ #cmakedefine HOMMEXX_BFB_TESTING + +/* Identifies whether this is a cuda/hip build */ +#cmakedefine HOMMEXX_ENABLE_GPU + From 28711acc8bf2d36a3bb4f220b5fbf95bbc5e0982 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 13 Sep 2022 17:18:52 -0400 Subject: [PATCH 53/62] remove string args from prim_state calls --- .../src/theta-l/share/prim_state_mod.F90 | 51 +++++++++---------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/components/homme/src/theta-l/share/prim_state_mod.F90 b/components/homme/src/theta-l/share/prim_state_mod.F90 index 4a9b492cdaab..1bbe9c003277 100644 --- a/components/homme/src/theta-l/share/prim_state_mod.F90 +++ b/components/homme/src/theta-l/share/prim_state_mod.F90 @@ -213,7 +213,7 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete) SUM(elem(ie)%spheremp*elem(ie)%state%Qdp(:,:,k,q,n0q)) enddo enddo - call wrap_repro_sum(nvars=1, comm=hybrid%par%comm, which="qdp") + call wrap_repro_sum(nvars=1, comm=hybrid%par%comm) qvsum_p(q) = global_shared_sum(1) enddo @@ -353,7 +353,7 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete) psmin_p = ParallelMin(psmin_local,hybrid) psmax_p = ParallelMax(psmax_local,hybrid) - call wrap_repro_sum(nvars=12, comm=hybrid%par%comm,which="u to w_i") + call wrap_repro_sum(nvars=12, comm=hybrid%par%comm) usum_p = global_shared_sum(1) vsum_p = global_shared_sum(2) tsum_p = global_shared_sum(3) @@ -511,7 +511,7 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete) do ie=nets,nete tmp(:,:,ie)=elem(ie)%accum%Qmass(:,:,q,n) enddo - Qmass(q,n) = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which="qmass") + Qmass(q,n) = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) Qmass(q,n) = Qmass(q,n)*scale if (n==2) then @@ -535,108 +535,105 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete) tmp(:,:,ie) = elem(ie)%accum%KEu_horiz1 enddo !if(hybrid%masterthread) print *,'KEH1' - - - !crash happens here - KEH1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEu_horiz1') + KEH1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) KEH1 = KEH1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEu_horiz2 enddo !if(hybrid%masterthread) print *,'KEH2' - KEH2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEu_horiz2') + KEH2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) KEH2 = KEH2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEu_vert1 enddo !if(hybrid%masterthread) print *,'KEV1' - KEV1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEu_vert1') + KEV1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) KEV1 = KEV1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEu_vert2 enddo !if(hybrid%masterthread) print *,'KEV2' - KEV2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEu_vert2') + KEV2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) KEV2 = KEV2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEw_horiz1 enddo !if(hybrid%masterthread) print *,'KEwH1' - KEwH1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEw_horiz1') + KEwH1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) KEwH1 = KEwH1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEw_horiz2 enddo !if(hybrid%masterthread) print *,'KEwH2' - KEwH2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEw_horiz2') + KEwH2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) KEwH2 = KEwH2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEw_horiz3 enddo !if(hybrid%masterthread) print *,'KEwH3' - KEwH3 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEw_horiz3') + KEwH3 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) KEwH3 = KEwH3*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEw_vert1 enddo !if(hybrid%masterthread) print *,'KEwV1' - KEwV1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEw_vert1') + KEwV1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) KEwV1 = KEwV1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEw_vert2 enddo !if(hybrid%masterthread) print *,'KEwV2' - KEwV2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='KEw_vert2') + KEwV2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) KEwV2 = KEwV2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%IEvert1 enddo !if(hybrid%masterthread) print *,'IEvert1' - IEvert1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='IEvert1') + IEvert1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) IEvert1 = IEvert1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%IEvert2 enddo !if(hybrid%masterthread) print *,'IEvert2' - IEvert2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='IEvert2') + IEvert2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) IEvert2 = IEvert2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%PEhoriz1 enddo !if(hybrid%masterthread) print *,'PEhoriz1' - PEhorz1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='PEhoriz1') + PEhorz1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) PEhorz1 = PEhorz1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%PEhoriz2 enddo !if(hybrid%masterthread) print *,'PEhoriz2' - PEhorz2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='PEhoriz2') + PEhorz2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) PEhorz2 = PEhorz2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%PEvert1 enddo !if(hybrid%masterthread) print *,'PEver1' - PEvert1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='PEvert1') + PEvert1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) PEvert1 = PEvert1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%PEvert2 enddo !if(hybrid%masterthread) print *,'PEver2' - PEvert2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='PEvert2') + PEvert2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) PEvert2 = PEvert2*scale ! KE->IE @@ -644,42 +641,42 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete) tmp(:,:,ie) = elem(ie)%accum%T01 enddo !if(hybrid%masterthread) print *,'T01' - T1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='T01') + T1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) T1 = T1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%T2 enddo !if(hybrid%masterthread) print *,'T2' - T2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='T2') + T2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) T2 = T2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%S1 enddo !if(hybrid%masterthread) print *,'S1' - S1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='S1') + S1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) S1 = S1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%S2 enddo !if(hybrid%masterthread) print *,'S2' - S2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='S2') + S2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) S2 = S2*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%P1 enddo !if(hybrid%masterthread) print *,'P1' - P1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='P1') + P1 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) P1 = P1*scale do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%P2 enddo !if(hybrid%masterthread) print *,'P2' - P2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete,which='P2') + P2 = global_integral(elem, tmp(:,:,nets:nete),hybrid,npts,nets,nete) P2 = P2*scale #else From 885b3bcad44b67cec39fc08143d9ca8841b02e6f Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 13 Sep 2022 17:21:45 -0400 Subject: [PATCH 54/62] disable code around energy vars that are not used in cxx version --- components/homme/src/theta-l/share/prim_state_mod.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/homme/src/theta-l/share/prim_state_mod.F90 b/components/homme/src/theta-l/share/prim_state_mod.F90 index 1bbe9c003277..b707f29c56fe 100644 --- a/components/homme/src/theta-l/share/prim_state_mod.F90 +++ b/components/homme/src/theta-l/share/prim_state_mod.F90 @@ -530,7 +530,7 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete) ! ! All of these transport terms are at time-tstep = (time1+time2)/2 ! Vertical transport terms -#ifdef ENERGY_DIAGNOSTICS +#if defined(ENERGY_DIAGNOSTICS) && !defined (HOMMEXX_ENABLE_GPU) do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEu_horiz1 enddo @@ -702,7 +702,7 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete) write(iulog,'(3a25)') "**DYNAMICS** J/m^2"," W/m^2","W/m^2 " if (ftype==4) & write(iulog,*) "NOTE:ftype=4 so d/dt and diss diagnostics include effects of forcing" -#ifdef ENERGY_DIAGNOSTICS +#if defined(ENERGY_DIAGNOSTICS) && !defined (HOMMEXX_ENABLE_GPU) ! terms computed during prim_advance, if ENERGY_DIAGNOSTICS is enabled if (theta_hydrostatic_mode) then write(iulog,'(a,2e22.14)')'KEu h-adv,sum=0:',KEH1,KEH2 From e7b7eb53da46bd6a0d0d8a4fe9bb12d3ca2e060c Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 13 Sep 2022 17:49:51 -0400 Subject: [PATCH 55/62] remove string args from glob_norms calls --- .../homme/src/share/global_norms_mod.F90 | 28 +++---------------- 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/components/homme/src/share/global_norms_mod.F90 b/components/homme/src/share/global_norms_mod.F90 index 6f4be6584a83..c3084e32803f 100644 --- a/components/homme/src/share/global_norms_mod.F90 +++ b/components/homme/src/share/global_norms_mod.F90 @@ -38,7 +38,7 @@ module global_norms_mod ! ! ================================ ! -------------------------- - function global_integral(elem, h,hybrid,npts,nets,nete,which) result(I_sphere) + function global_integral(elem, h,hybrid,npts,nets,nete) result(I_sphere) use kinds, only : real_kind use hybrid_mod, only : hybrid_t use element_mod, only : element_t @@ -57,8 +57,6 @@ function global_integral(elem, h,hybrid,npts,nets,nete,which) result(I_sphere) real (kind=real_kind) :: I_shared common /gblintcom/I_shared - character(len=*) , intent(in) ,optional :: which - ! Local variables integer :: ie,j,i @@ -85,13 +83,8 @@ function global_integral(elem, h,hybrid,npts,nets,nete,which) result(I_sphere) global_shared_buf(ie,1) = J_tmp(ie) enddo !JMD print *,'global_integral: before wrap_repro_sum' - - if (present(which))then - call wrap_repro_sum(nvars=1, comm=hybrid%par%comm, which=which) - else - call wrap_repro_sum(nvars=1, comm=hybrid%par%comm) + call wrap_repro_sum(nvars=1, comm=hybrid%par%comm) !JMD print *,'global_integral: after wrap_repro_sum' - endif I_tmp = global_shared_sum(1) !JMD print *,'global_integral: after global_shared_sum' @@ -826,7 +819,7 @@ function linf_vnorm(elem,v,vt,hybrid,npts,nets,nete) result(linf) end function linf_vnorm - subroutine wrap_repro_sum (nvars, comm, nsize, which) + subroutine wrap_repro_sum (nvars, comm, nsize) use dimensions_mod, only: nelemd #ifdef CAM use shr_reprosum_mod, only: repro_sum => shr_reprosum_calc @@ -841,8 +834,6 @@ subroutine wrap_repro_sum (nvars, comm, nsize, which) integer :: comm ! mpi communicator integer, optional :: nsize ! local buffer size (defaults to nelemd - number of elements in mpi task) - character(len=*), optional :: which - integer nsize_use,n,i if (present(nsize)) then @@ -862,19 +853,8 @@ subroutine wrap_repro_sum (nvars, comm, nsize, which) do n=1,nvars do i=1,nsize_use if (global_shared_buf(i,n) /= global_shared_buf(i,n) ) then - - print *, "var,nvars:",n,nvars - print *, 'failed for',n,' out of ',nvars - print *, 'which is ', which - if (present(which)) then - write(iulog,'(a26,i5,a8,i5)') 'wrap_repro_sum failed for nvar=',n,' out of ',nvars - call abortmp('NaNs detected in repro sum input and marker is '//which) - else - write(iulog,'(a26,i5,a8,i5)') 'wrap_repro_sum failed for nvar=',n,' out of ',nvars + print *, "var,nvars:",n,nvars call abortmp('NaNs detected in repro sum input') - endif - - !call abortmp('NaNs detected in repro sum input') endif enddo enddo From d0cef398385a19aacba2cba4a02bda49c43b1958 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Mon, 19 Sep 2022 14:05:02 -0400 Subject: [PATCH 56/62] a comment --- components/homme/cmake/machineFiles/crusher-gpumpi.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/components/homme/cmake/machineFiles/crusher-gpumpi.cmake b/components/homme/cmake/machineFiles/crusher-gpumpi.cmake index 8ee446ae1000..0b054b6dd792 100644 --- a/components/homme/cmake/machineFiles/crusher-gpumpi.cmake +++ b/components/homme/cmake/machineFiles/crusher-gpumpi.cmake @@ -4,6 +4,8 @@ #cmake -C ~/acme-fork-lb/components/homme/cmake/machineFiles/summit.cmake -DHOMMEXX_MPI_ON_DEVICE=FALSE ~/acme-fork-lb/components/homme/ +#cmake -C ~/acme-MASTER-GB/components/homme/cmake/machineFiles/crusher-gpumpi.cmake -DE3SM_KOKKOS_PATH=/ccs/home/onguba/kokkos-crusher-june2022/bld-hipcc ~/acme-MASTER-GB/components/homme/ + #SET (HOMMEXX_MPI_ON_DEVICE FALSE CACHE BOOL "") SET (HOMMEXX_CUDA_MAX_WARP_PER_TEAM "16" CACHE STRING "") From 0c0bc401a77d9f8811c7b039da2785f2b7f7dc83 Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Mon, 19 Sep 2022 14:05:20 -0400 Subject: [PATCH 57/62] cleaning team size logic --- components/homme/src/share/cxx/ExecSpaceDefs.cpp | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index 7db1cedcf308..671917f5c767 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -53,7 +53,7 @@ void initialize_kokkos () { nd = 1; } #endif -#if defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_HIP) +#ifdef HOMMEXX_ENABLE_GPU std::stringstream ss; ss << "--kokkos-num-devices=" << nd; const auto key = ss.str(); @@ -117,7 +117,6 @@ team_num_threads_vectors_for_gpu ( assert(num_warps_total >= max_num_warps); assert(tp.max_threads_usable >= 1 && tp.max_vectors_usable >= 1); -#if !defined(KOKKOS_ENABLE_HIP) int num_warps; if (tp.prefer_larger_team) { const int num_warps_usable = @@ -152,8 +151,6 @@ team_num_threads_vectors_for_gpu ( num_device_threads : tp.max_threads_usable ); -//printf("tp.prefer_threads: %4d %4d \n",num_threads, prevpow2(num_device_threads / num_threads)); - return std::make_pair( num_threads, prevpow2(num_device_threads / num_threads) ); } else { @@ -161,17 +158,9 @@ team_num_threads_vectors_for_gpu ( num_device_threads : tp.max_vectors_usable ); -//printf("NOT tp. prefer_threads: %4d %4d \n",num_device_threads / num_vectors, -// num_vectors); - return std::make_pair( num_device_threads / num_vectors, num_vectors ); } -#else -//manual override for HIP -return std::make_pair( 16,4 ); -#endif - } } // namespace Parallel @@ -204,7 +193,7 @@ team_num_threads_vectors (const int num_parallel_iterations, //use 64 wavefronts per CU and 120 CUs const int num_warps_device = 120*64; // no such thing Kokkos::Impl::hip_internal_maximum_warp_count(); - const int max_num_warps = 40; //cores per CU, SM ///HOMMEXX_CUDA_MAX_WARP_PER_TEAM; + const int max_num_warps = HOMMEXX_CUDA_MAX_WARP_PER_TEAM; const int num_threads_warp = Kokkos::Experimental::Impl::HIPTraits::WarpSize; #else From e3bd20adab9a620d18ce84493ef63adcb07f20cd Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Mon, 19 Sep 2022 14:15:43 -0400 Subject: [PATCH 58/62] clean 1 line for HOMMEXX_ENABLE_GPU --- components/homme/src/share/cxx/ExecSpaceDefs.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.hpp b/components/homme/src/share/cxx/ExecSpaceDefs.hpp index a80f993e2d64..80e2e6ba4dde 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.hpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.hpp @@ -63,7 +63,7 @@ using Hommexx_Serial = void; // Selecting the execution space. If no specific request, use Kokkos default // exec space -#if defined(HOMMEXX_CUDA_SPACE) || defined(HOMMEXX_HIP_SPACE) +#ifdef HOMMEXX_ENABLE_GPU using ExecSpace = HommexxGPU; #elif defined(HOMMEXX_OPENMP_SPACE) using ExecSpace = Hommexx_OpenMP; From eff19f5b50d9b199ae50573acadb421842298b3c Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 20 Sep 2022 01:05:58 -0400 Subject: [PATCH 59/62] address comments --- components/homme/CMakeLists.txt | 2 - components/homme/cmake/SetCompilerFlags.cmake | 8 +-- .../cmake/machineFiles/spock-gpumpi.cmake | 62 ------------------- .../src/preqx_kokkos/cxx/CaarFunctorImpl.hpp | 6 +- components/homme/src/share/cxx/profiling.hpp | 5 -- .../src/theta-l_kokkos/cxx/Diagnostics.hpp | 1 + 6 files changed, 8 insertions(+), 76 deletions(-) delete mode 100644 components/homme/cmake/machineFiles/spock-gpumpi.cmake diff --git a/components/homme/CMakeLists.txt b/components/homme/CMakeLists.txt index e661ca6f0c9e..7d5c7117d393 100644 --- a/components/homme/CMakeLists.txt +++ b/components/homme/CMakeLists.txt @@ -196,8 +196,6 @@ IF (${HOMME_USE_KOKKOS}) SET (HOMMEXX_EXEC_SPACE "Default" CACHE STRING "Select the kokkos exec space") - #is HOMMEXX_EXEC_SPACE set anywhere? in scream? - STRING (TOUPPER ${HOMMEXX_EXEC_SPACE} HOMMEXX_EXEC_SPACE_UPPER) IF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "HIP") diff --git a/components/homme/cmake/SetCompilerFlags.cmake b/components/homme/cmake/SetCompilerFlags.cmake index aa063b48ba54..a7e6328559b5 100644 --- a/components/homme/cmake/SetCompilerFlags.cmake +++ b/components/homme/cmake/SetCompilerFlags.cmake @@ -67,11 +67,11 @@ IF (${HOMME_USE_CXX}) SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") INCLUDE(CheckCXXCompilerFlag) - CHECK_CXX_COMPILER_FLAG("-std=c++17" CXX17_SUPPORTED) - IF (${CXX17_SUPPORTED}) - SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") + CHECK_CXX_COMPILER_FLAG("-std=c++14" CXX14_SUPPORTED) + IF (CXX14_SUPPORTED) + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") ELSEIF (${HOMME_USE_KOKKOS}) - MESSAGE (FATAL_ERROR "Kokkos needs C++17, but the C++ compiler does not support it.") + MESSAGE (FATAL_ERROR "Kokkos needs C++14, but the C++ compiler does not support it.") ENDIF () CHECK_CXX_COMPILER_FLAG("-cxxlib" CXXLIB_SUPPORTED) IF (CXXLIB_SUPPORTED) diff --git a/components/homme/cmake/machineFiles/spock-gpumpi.cmake b/components/homme/cmake/machineFiles/spock-gpumpi.cmake deleted file mode 100644 index 4b288ef5c1cd..000000000000 --- a/components/homme/cmake/machineFiles/spock-gpumpi.cmake +++ /dev/null @@ -1,62 +0,0 @@ -#interactive job -#bsub -W 2:00 -nnodes 1 -P cli115 -Is /bin/bash - - -#cmake -C ~/acme-fork-lb/components/homme/cmake/machineFiles/summit.cmake -DHOMMEXX_MPI_ON_DEVICE=FALSE ~/acme-fork-lb/components/homme/ - -#SET (HOMMEXX_MPI_ON_DEVICE FALSE CACHE BOOL "") -SET (HOMMEXX_CUDA_MAX_WARP_PER_TEAM "16" CACHE STRING "") - -SET (NETCDF_DIR $ENV{OLCF_NETCDF_FORTRAN_ROOT} CACHE FILEPATH "") -SET (HDF5_DIR $ENV{OLCF_HDF5_ROOT} CACHE FILEPATH "") - -SET(BUILD_HOMME_WITHOUT_PIOLIBRARY TRUE CACHE BOOL "") - -SET(HOMME_FIND_BLASLAPACK TRUE CACHE BOOL "") - -SET(WITH_PNETCDF FALSE CACHE FILEPATH "") - -SET(USE_QUEUING FALSE CACHE BOOL "") - -SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") -SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") -SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") - -#SET (HOMMEXX_BFB_TESTING TRUE CACHE BOOL "") - -SET(USE_TRILINOS OFF CACHE BOOL "") - -SET(HIP TRUE CACHE BOOL "") -#SET(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "") -#SET(Kokkos_ENABLE_CUDA OFF CACHE BOOL "") -#SET(Kokkos_ENABLE_CUDA_LAMBDA OFF CACHE BOOL "") -#SET(Kokkos_ARCH_VEGA908 ON CACHE BOOL "") -#SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") - -SET(CMAKE_C_COMPILER "cc" CACHE STRING "") -SET(CMAKE_Fortran_COMPILER "ftn" CACHE STRING "") -SET(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "") -#SET(CMAKE_CXX_COMPILER "/ccs/home/onguba/kokkos/bin/nvcc_wrapper" CACHE STRING "") -SET(E3SM_KOKKOS_PATH "/ccs/home/onguba/kokkos-example-spock-hipcc2/bld-hipcc" CACHE STRING "") - -#not the proper way!!! -SET(MPICH_DIR "/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0" CACHE STRING "") - -SET(Extrae_LIBRARY "-I${MPICH_DIR}/include -L${MPICH_DIR}/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "") - -SET(ADD_Fortran_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") -SET(ADD_C_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") -SET(ADD_CXX_FLAGS "-O3 -DNDEBUG --amdgpu-target=gfx908 -fno-gpu-rdc ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") -SET(ADD_LINKER_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY} -I${E3SM_KOKKOS_PATH}/include -L${E3SM_KOKKOS_PATH}/lib64" CACHE STRING "") - - -set (ENABLE_OPENMP OFF CACHE BOOL "") -set (ENABLE_COLUMN_OPENMP OFF CACHE BOOL "") -set (ENABLE_HORIZ_OPENMP OFF CACHE BOOL "") - -set (HOMME_TESTING_PROFILE "dev" CACHE STRING "") - -set (USE_NUM_PROCS 4 CACHE STRING "") - -#set (OPT_FLAGS "-mcpu=power9 -mtune=power9" CACHE STRING "") -SET (USE_MPI_OPTIONS "--bind-to core" CACHE FILEPATH "") diff --git a/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp b/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp index bfc8bd9ddf06..0cafba8ed4c8 100644 --- a/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp +++ b/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp @@ -874,7 +874,7 @@ struct CaarFunctorImpl { #endif } - // CUDA version + // GPU version template KOKKOS_INLINE_FUNCTION typename std::enable_if< std::is_same::value, void>::type @@ -929,7 +929,7 @@ struct CaarFunctorImpl { kv.team_barrier(); } - // CUDA version + // GPU version template KOKKOS_INLINE_FUNCTION typename std::enable_if< std::is_same::value, void>::type @@ -983,7 +983,7 @@ struct CaarFunctorImpl { #endif } - // Non-CUDA version + // Non-GPU version template KOKKOS_INLINE_FUNCTION typename std::enable_if< !std::is_same::value, void>::type diff --git a/components/homme/src/share/cxx/profiling.hpp b/components/homme/src/share/cxx/profiling.hpp index f079e9d07761..e7ecbbb449e0 100644 --- a/components/homme/src/share/cxx/profiling.hpp +++ b/components/homme/src/share/cxx/profiling.hpp @@ -12,11 +12,6 @@ #include "gptl.h" -//OG not sure about timers and HIP, probably the same as with CUDA -//#if defined(HOMMEXX_HIP_SPACE) || defined(HOMMEXX_CUDA_SPACE) || \ -// (defined(HOMMEXX_DEFAULT_SPACE) && defined(KOKKOS_ENABLE_CUDA)) || \ -// (defined(HOMMEXX_DEFAULT_SPACE) && defined(KOKKOS_ENABLE_HIP)) - #ifdef HOMMEXX_ENABLE_GPU #define start_timer(name) {} #define stop_timer(name) {} diff --git a/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp b/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp index 8d9e00ecf924..770431f60f67 100644 --- a/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp @@ -39,6 +39,7 @@ class Diagnostics ExecViewUnmanaged dpnh_dp_i; }; +//In debug regime with asserts default team sizes are too big. #if defined RESOLVE_ISSUE_WITH_ASSERTS template typename std::enable_if::value == false, From 4802901e96e1b056e3d326c5d5d4f0f7f075d45b Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Tue, 20 Sep 2022 14:01:48 -0500 Subject: [PATCH 60/62] fix linking for ut after merge --- components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt | 2 +- components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt b/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt index 1c0b731b0339..3e01da434388 100644 --- a/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt +++ b/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt @@ -37,7 +37,7 @@ TARGET_INCLUDE_DIRECTORIES(preqx_kokkos_ut_lib PUBLIC ${EXEC_INCLUDE_DIRS}) TARGET_INCLUDE_DIRECTORIES(preqx_kokkos_ut_lib PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) TARGET_COMPILE_DEFINITIONS(preqx_kokkos_ut_lib PUBLIC "HAVE_CONFIG_H") link_to_kokkos(preqx_kokkos_ut_lib) -TARGET_LINK_LIBRARIES(preqx_kokkos_ut_lib timing ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) +TARGET_LINK_LIBRARIES(preqx_kokkos_ut_lib timing csm_share ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) IF (HOMME_USE_MKL) TARGET_LINK_LIBRARIES (preqx_kokkos_ut_lib -mkl) ENDIF() diff --git a/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt b/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt index 62712b6525d2..065672100b73 100644 --- a/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt +++ b/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt @@ -40,7 +40,7 @@ TARGET_INCLUDE_DIRECTORIES(thetal_kokkos_ut_lib PUBLIC ${EXEC_INCLUDE_DIRS}) TARGET_INCLUDE_DIRECTORIES(thetal_kokkos_ut_lib PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) TARGET_COMPILE_DEFINITIONS(thetal_kokkos_ut_lib PUBLIC "HAVE_CONFIG_H") link_to_kokkos(thetal_kokkos_ut_lib) -TARGET_LINK_LIBRARIES(thetal_kokkos_ut_lib timing ${COMPOSE_LIBRARY_CPP} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) +TARGET_LINK_LIBRARIES(thetal_kokkos_ut_lib timing csm_share ${COMPOSE_LIBRARY_CPP} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) IF (HOMME_USE_MKL) TARGET_LINK_LIBRARIES (thetal_kokkos_ut_lib -mkl) ENDIF() From b3f04b53054520613d2e7ab5b7a14b20a4765a4e Mon Sep 17 00:00:00 2001 From: Oksana Guba Date: Wed, 28 Sep 2022 13:18:46 -0400 Subject: [PATCH 61/62] fix homme standalone build not via cime --- components/homme/cmake/SetCompilerFlags.cmake | 1 + components/homme/utils/csm_share/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/components/homme/cmake/SetCompilerFlags.cmake b/components/homme/cmake/SetCompilerFlags.cmake index a7e6328559b5..49e79a463c41 100644 --- a/components/homme/cmake/SetCompilerFlags.cmake +++ b/components/homme/cmake/SetCompilerFlags.cmake @@ -19,6 +19,7 @@ ELSE () ELSE () SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -w -fallow-argument-mismatch -ffree-line-length-none") endif() + ADD_DEFINITIONS(-DCPRGNU) ELSEIF (CMAKE_Fortran_COMPILER_ID STREQUAL PGI) SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mextend -Mflushz") # Needed by csm_share diff --git a/components/homme/utils/csm_share/CMakeLists.txt b/components/homme/utils/csm_share/CMakeLists.txt index 41dade8bf8ff..b5f9bf5c41c1 100644 --- a/components/homme/utils/csm_share/CMakeLists.txt +++ b/components/homme/utils/csm_share/CMakeLists.txt @@ -28,7 +28,7 @@ if (NOT TARGET csm_share) add_custom_command ( OUTPUT ${SRC_FILE_OUT} COMMAND ${GENF90} ${SRC_FILE} > ${SRC_FILE_OUT} - DEPENDS ${SRC_FILE} genf90) + DEPENDS ${SRC_FILE}) list(APPEND CSM_SHARE_SOURCES ${SRC_FILE_OUT}) endforeach () From 9522305d69e286d43032bc7ea90710822eb2505a Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Sat, 8 Oct 2022 16:24:34 -0400 Subject: [PATCH 62/62] Hommexx/SL: (Re)fix a variable for lambda capture in two spots. HIP can't handle the reference, but we still need it for the F90 dycore with HORIZ_OPENMP enabled, since in that code path we enter all of our routines within a threaded region and so cannot create new Kokkos::Serial views during time stepping. (Recall Kokkos::Serial is necessary b/c Homme's threading is not compatible with Kokkos::OpenMP.) Thus, if COMPOSE_PORT is not defined, take a reference. --- .../homme/src/share/compose/compose_cedr_sl_run_global.cpp | 6 +++++- .../homme/src/share/compose/compose_cedr_sl_run_local.cpp | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/components/homme/src/share/compose/compose_cedr_sl_run_global.cpp b/components/homme/src/share/compose/compose_cedr_sl_run_global.cpp index a8ced6c3f131..69fab390726b 100644 --- a/components/homme/src/share/compose/compose_cedr_sl_run_global.cpp +++ b/components/homme/src/share/compose/compose_cedr_sl_run_global.cpp @@ -84,7 +84,11 @@ void run_global (CDR& cdr, CDRT* cedr_cdr_p, const auto& nonnegs = cdr.nonneg; const auto& ie2lci = cdr.ie2lci; const auto& ie2gci = cdr.ie2gci; - const typename CDRT::DeviceOp cedr_cdr = cedr_cdr_p->get_device_op(); + const typename CDRT::DeviceOp +#ifndef COMPOSE_PORT + & // When running F90 Homme's threading scheme, we can't create new views. +#endif + cedr_cdr = cedr_cdr_p->get_device_op(); if (cedr::impl::OnGpu::value) { const Int n = ta.nelemd*nlev*qsize*np2; ko::View q_min_1d(q_min.data(), n); diff --git a/components/homme/src/share/compose/compose_cedr_sl_run_local.cpp b/components/homme/src/share/compose/compose_cedr_sl_run_local.cpp index bfd9ff5b0add..393b3f943ec8 100644 --- a/components/homme/src/share/compose/compose_cedr_sl_run_local.cpp +++ b/components/homme/src/share/compose/compose_cedr_sl_run_local.cpp @@ -186,7 +186,11 @@ void run_local (CDR& cdr, CDRT* cedr_cdr_p, const Int nsuplev = cdr.nsuplev; const auto cdr_over_super_levels = cdr.cdr_over_super_levels; const auto caas_in_suplev = cdr.caas_in_suplev; - const typename CDRT::DeviceOp cedr_cdr = cedr_cdr_p->get_device_op(); + const typename CDRT::DeviceOp +#ifndef COMPOSE_PORT + & +#endif + cedr_cdr = cedr_cdr_p->get_device_op(); const auto& ie2lci = cdr.ie2lci; // Loop differently due to performance diff on CPU. #ifdef COMPOSE_PORT