diff --git a/components/homme/CMakeLists.txt b/components/homme/CMakeLists.txt index d07789535f2c..0d56dbfe2dba 100644 --- a/components/homme/CMakeLists.txt +++ b/components/homme/CMakeLists.txt @@ -197,7 +197,10 @@ IF (${HOMME_USE_KOKKOS}) SET (HOMMEXX_EXEC_SPACE "Default" CACHE STRING "Select the kokkos exec space") STRING (TOUPPER ${HOMMEXX_EXEC_SPACE} HOMMEXX_EXEC_SPACE_UPPER) - IF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "CUDA") + + IF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "HIP") + SET (HOMMEXX_HIP_SPACE ON) + ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "CUDA") SET (HOMMEXX_CUDA_SPACE ON) ELSEIF (${HOMMEXX_EXEC_SPACE_UPPER} STREQUAL "OPENMP") SET (HOMMEXX_OPENMP_SPACE ON) @@ -287,16 +290,19 @@ MESSAGE(STATUS "C Flags = ${CMAKE_C_FLAGS}") MESSAGE(STATUS "CXX Flags = ${CMAKE_CXX_FLAGS}") MESSAGE(STATUS "Linker Flags = ${CMAKE_EXE_LINKER_FLAGS}") +SET (HOMMEXX_ENABLE_GPU FALSE) + IF (${HOMME_USE_KOKKOS}) - IF (CUDA_BUILD) + IF (CUDA_BUILD OR HIP_BUILD) SET (DEFAULT_VECTOR_SIZE 1) + SET (HOMMEXX_ENABLE_GPU TRUE) ELSE () SET (DEFAULT_VECTOR_SIZE 8) ENDIF() SET (HOMMEXX_VECTOR_SIZE ${DEFAULT_VECTOR_SIZE} CACHE STRING - "If AVX or Cuda don't take priority, use this software vector size.") + "If AVX or Cuda or HIP don't take priority, use this software vector size.") IF (CMAKE_BUILD_TYPE_UPPER MATCHES "DEBUG" OR CMAKE_BUILD_TYPE_UPPER MATCHES "RELWITHDEBINFO") SET (HOMMEXX_DEBUG ON) diff --git a/components/homme/cmake/HommeMacros.cmake b/components/homme/cmake/HommeMacros.cmake index 5db6812d33a6..92b52fcce4a1 100644 --- a/components/homme/cmake/HommeMacros.cmake +++ b/components/homme/cmake/HommeMacros.cmake @@ -156,7 +156,7 @@ macro(createTestExec execName execType macroNP macroNC ENDIF () IF (HOMME_USE_KOKKOS) - TARGET_LINK_LIBRARIES(${execName} kokkos) + link_to_kokkos(${execName}) ENDIF () # Move the module files out of the way so the parallel build diff --git a/components/homme/cmake/Kokkos.cmake b/components/homme/cmake/Kokkos.cmake index b0a32cf1cf4b..6bea395a4403 100644 --- a/components/homme/cmake/Kokkos.cmake +++ b/components/homme/cmake/Kokkos.cmake @@ -51,10 +51,10 @@ macro(install_kokkos_if_needed) endmacro() macro(link_to_kokkos targetName) + target_include_directories(${targetName} SYSTEM PUBLIC ${KOKKOS_INCLUDE_DIR}) if (KOKKOS_LIBS_ARE_TARGETS) target_link_libraries (${targetName} ${KOKKOS_LIBRARIES}) else() - target_include_directories(${targetName} SYSTEM PUBLIC ${KOKKOS_INCLUDE_DIR}) target_link_libraries(${targetName} ${KOKKOS_TPL_LIBRARIES} ${KOKKOS_LIBRARIES} -L${KOKKOS_LIBRARY_DIR}) endif() endmacro(link_to_kokkos) diff --git a/components/homme/cmake/SetCompilerFlags.cmake b/components/homme/cmake/SetCompilerFlags.cmake index b772321c7d30..49e79a463c41 100644 --- a/components/homme/cmake/SetCompilerFlags.cmake +++ b/components/homme/cmake/SetCompilerFlags.cmake @@ -19,6 +19,7 @@ ELSE () ELSE () SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -w -fallow-argument-mismatch -ffree-line-length-none") endif() + ADD_DEFINITIONS(-DCPRGNU) ELSEIF (CMAKE_Fortran_COMPILER_ID STREQUAL PGI) SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mextend -Mflushz") # Needed by csm_share @@ -68,7 +69,7 @@ IF (${HOMME_USE_CXX}) INCLUDE(CheckCXXCompilerFlag) CHECK_CXX_COMPILER_FLAG("-std=c++14" CXX14_SUPPORTED) - IF (${CXX14_SUPPORTED}) + IF (CXX14_SUPPORTED) SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") ELSEIF (${HOMME_USE_KOKKOS}) MESSAGE (FATAL_ERROR "Kokkos needs C++14, but the C++ compiler does not support it.") diff --git a/components/homme/cmake/machineFiles/crusher-gpumpi.cmake b/components/homme/cmake/machineFiles/crusher-gpumpi.cmake new file mode 100644 index 000000000000..0b054b6dd792 --- /dev/null +++ b/components/homme/cmake/machineFiles/crusher-gpumpi.cmake @@ -0,0 +1,68 @@ +#interactive job +#bsub -W 2:00 -nnodes 1 -P cli115 -Is /bin/bash + + +#cmake -C ~/acme-fork-lb/components/homme/cmake/machineFiles/summit.cmake -DHOMMEXX_MPI_ON_DEVICE=FALSE ~/acme-fork-lb/components/homme/ + +#cmake -C ~/acme-MASTER-GB/components/homme/cmake/machineFiles/crusher-gpumpi.cmake -DE3SM_KOKKOS_PATH=/ccs/home/onguba/kokkos-crusher-june2022/bld-hipcc ~/acme-MASTER-GB/components/homme/ + +#SET (HOMMEXX_MPI_ON_DEVICE FALSE CACHE BOOL "") +SET (HOMMEXX_CUDA_MAX_WARP_PER_TEAM "16" CACHE STRING "") + +SET (NETCDF_DIR $ENV{OLCF_NETCDF_FORTRAN_ROOT} CACHE FILEPATH "") +SET (HDF5_DIR $ENV{OLCF_HDF5_ROOT} CACHE FILEPATH "") + +SET(BUILD_HOMME_WITHOUT_PIOLIBRARY TRUE CACHE BOOL "") + +SET(HOMME_FIND_BLASLAPACK TRUE CACHE BOOL "") + +SET(WITH_PNETCDF FALSE CACHE FILEPATH "") + +SET(USE_QUEUING FALSE CACHE BOOL "") + +SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") +SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") + +#SET (HOMMEXX_BFB_TESTING TRUE CACHE BOOL "") + +SET(USE_TRILINOS OFF CACHE BOOL "") + +#CUDA_BUILD is set in SetCompilersFlags, after findPackage(Cuda) +#i haven't extend it to hip, set it here instead +SET(HIP_BUILD TRUE CACHE BOOL "") + +#uncomment this if using internal kokkos build +#SET(Kokkos_ENABLE_SERIAL ON CACHE BOOL "") +####SET(CMAKE_CXX_STANDARD "14" CACHE STRING "") +#SET(Kokkos_ENABLE_DEBUG OFF CACHE BOOL "") +#SET(Kokkos_ARCH_VEGA90A ON CACHE BOOL "") +#SET(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "") +#SET(Kokkos_ENABLE_HIP ON CACHE BOOL "") +####SET(Kokkos_ENABLE_CUDA_LAMBDA OFF CACHE BOOL "") +#SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") + +SET(CMAKE_C_COMPILER "cc" CACHE STRING "") +SET(CMAKE_Fortran_COMPILER "ftn" CACHE STRING "") +SET(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "") + +#not the proper way!!! +SET(MPICH_DIR "/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0" CACHE STRING "") + +SET(Extrae_LIBRARY "-I${MPICH_DIR}/include -L${MPICH_DIR}/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "") + +SET(ADD_Fortran_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_C_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_CXX_FLAGS "-std=c++14 -O3 -DNDEBUG --amdgpu-target=gfx90a -fno-gpu-rdc ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_LINKER_FLAGS "-O3 -DNDEBUG ${Extrae_LIBRARY}" CACHE STRING "") + + +set (ENABLE_OPENMP OFF CACHE BOOL "") +set (ENABLE_COLUMN_OPENMP OFF CACHE BOOL "") +set (ENABLE_HORIZ_OPENMP OFF CACHE BOOL "") + +set (HOMME_TESTING_PROFILE "dev" CACHE STRING "") + +set (USE_NUM_PROCS 4 CACHE STRING "") + +#set (OPT_FLAGS "-mcpu=power9 -mtune=power9" CACHE STRING "") +SET (USE_MPI_OPTIONS "--bind-to core" CACHE FILEPATH "") diff --git a/components/homme/cmake/machineFiles/summit-bfb.cmake b/components/homme/cmake/machineFiles/summit-bfb.cmake new file mode 100644 index 000000000000..c9313f346a54 --- /dev/null +++ b/components/homme/cmake/machineFiles/summit-bfb.cmake @@ -0,0 +1,62 @@ +#interactive job +#bsub -W 2:00 -nnodes 1 -P cli115 -Is /bin/bash + +#1 gpu on 1 node +#jsrun -n 1 -r 1 -l gpu-gpu -b packed:1 -d plane:1 -a1 -c7 -g1 --smpiargs "-gpu" EXEC < ${nlname} +#6 gpus on 1 node +#jsrun -n 6 -r 6 -l gpu-gpu -b packed:1 -d plane:1 -a1 -c7 -g1 --smpiargs "-gpu" EXEC < ${nlname} +#6 ranks, cpu only on 1 node, not 42 rank, NEEDS visible GPU due to some kokkos init, so, -g1, not -g0 +#jsrun -n 6 -r 6 -l cpu-cpu -b packed:1 -d plane:1 -a1 -c7 -g1 EXEC < ${nlname} + + +set(CMAKE_C_FLAGS "-w" CACHE STRING "") +set(ADD_CXX_FLAGS "-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored -Wno-unknown-pragmas --fmad=false -O0" CACHE STRING "") +set(ADD_Fortran_FLAGS " -ffp-contract=off -O0" CACHE STRING "") +set(OPT_FLAGS "-O0" CACHE STRING "") +set(DEBUG_FLAGS "-ffp-contract=off -g"CACHE STRING "") + + +#SET (HOMMEXX_MPI_ON_DEVICE FALSE CACHE BOOL "") +SET (HOMMEXX_CUDA_MAX_WARP_PER_TEAM "16" CACHE STRING "") + +SET (NETCDF_DIR $ENV{OLCF_NETCDF_FORTRAN_ROOT} CACHE FILEPATH "") +SET (NetCDF_Fortran_PATH $ENV{OLCF_NETCDF_FORTRAN_ROOT} CACHE STRING "") +SET (NetCDF_C_PATH $ENV{OLCF_NETCDF_C_ROOT} CACHE STRING "") +#SET(NetCDF_C_LIBRARY "/sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-7.5.0/netcdf-c-4.8.0-pwi4jbrnwv4lrrjxdu5czbos5uvvjgvr/lib" CACHE STRING "") +#SET(NetCDF_C_INCLUDE_DIR "/sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-7.5.0/netcdf-c-4.8.0-pwi4jbrnwv4lrrjxdu5czbos5uvvjgvr/include" CACHE STRING "") + +SET(HOMME_FIND_BLASLAPACK TRUE CACHE BOOL "") + +SET(WITH_PNETCDF FALSE CACHE FILEPATH "") + +SET(USE_QUEUING FALSE CACHE BOOL "") + +SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") +SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") +SET(HOMME_ENABLE_COMPOSE TRUE CACHE BOOL "") +#SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") + +SET (HOMMEXX_BFB_TESTING TRUE CACHE BOOL "") + +SET(USE_TRILINOS OFF CACHE BOOL "") + +SET(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "") +SET(Kokkos_ENABLE_CUDA ON CACHE BOOL "") +SET(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "") +SET(Kokkos_ARCH_VOLTA70 ON CACHE BOOL "") +SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") + +SET(CMAKE_C_COMPILER "mpicc" CACHE STRING "") +SET(CMAKE_Fortran_COMPILER "mpifort" CACHE STRING "") +SET(CMAKE_CXX_COMPILER "${CMAKE_CURRENT_SOURCE_DIR}/../../externals/kokkos/bin/nvcc_wrapper" CACHE STRING "") + +set (ENABLE_OPENMP OFF CACHE BOOL "") +set (ENABLE_COLUMN_OPENMP OFF CACHE BOOL "") +set (ENABLE_HORIZ_OPENMP OFF CACHE BOOL "") + +set (HOMME_TESTING_PROFILE "dev" CACHE STRING "") + +set (USE_NUM_PROCS 4 CACHE STRING "") + +#set (OPT_FLAGS "-mcpu=power9 -mtune=power9" CACHE STRING "") +SET (USE_MPI_OPTIONS "--bind-to core" CACHE FILEPATH "") diff --git a/components/homme/cmake/machineFiles/summit-gpumpi-movies.cmake b/components/homme/cmake/machineFiles/summit-gpumpi-movies.cmake index c860dffe8dd5..6de0d6f040be 100644 --- a/components/homme/cmake/machineFiles/summit-gpumpi-movies.cmake +++ b/components/homme/cmake/machineFiles/summit-gpumpi-movies.cmake @@ -19,8 +19,6 @@ SET(WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_SWEQX FALSE CACHE BOOL "") SET(BUILD_HOMME_PREQX_ACC FALSE CACHE BOOL "") SET(BUILD_HOMME_PREQX FALSE CACHE BOOL "") diff --git a/components/homme/cmake/machineFiles/summit-gpumpi.cmake b/components/homme/cmake/machineFiles/summit-gpumpi.cmake index 7bd33ca9bc27..46474a5a2fb7 100644 --- a/components/homme/cmake/machineFiles/summit-gpumpi.cmake +++ b/components/homme/cmake/machineFiles/summit-gpumpi.cmake @@ -2,13 +2,13 @@ #bsub -W 2:00 -nnodes 1 -P cli115 -Is /bin/bash -#cmake -C ~/acme-fork-lb/components/homme/cmake/machineFiles/summit.cmake -DHOMMEXX_MPI_ON_DEVICE=FALSE ~/acme-fork-lb/components/homme/ - #SET (HOMMEXX_MPI_ON_DEVICE FALSE CACHE BOOL "") SET (HOMMEXX_CUDA_MAX_WARP_PER_TEAM "16" CACHE STRING "") -SET (NETCDF_DIR $ENV{OLCF_NETCDF_FORTRAN_ROOT} CACHE FILEPATH "") -SET (HDF5_DIR $ENV{OLCF_HDF5_ROOT} CACHE FILEPATH "") +#SET (NETCDF_DIR $ENV{OLCF_NETCDF_FORTRAN_ROOT} CACHE FILEPATH "") +#SET (NetCDF_Fortran_PATH "/sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-7.5.0/netcdf-fortran-4.4.5-e2hkh7w3253wz5uubjxbbvh56a7xjl7n" CACHE STRING "") +#SET(NetCDF_C_LIBRARY "/sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-7.5.0/netcdf-c-4.8.0-pwi4jbrnwv4lrrjxdu5czbos5uvvjgvr/lib" CACHE STRING "") +#SET(NetCDF_C_INCLUDE_DIR "/sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-7.5.0/netcdf-c-4.8.0-pwi4jbrnwv4lrrjxdu5czbos5uvvjgvr/include" CACHE STRING "") SET(BUILD_HOMME_WITHOUT_PIOLIBRARY TRUE CACHE BOOL "") @@ -18,11 +18,8 @@ SET(WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") -SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") #SET (HOMMEXX_BFB_TESTING TRUE CACHE BOOL "") @@ -36,7 +33,7 @@ SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") SET(CMAKE_C_COMPILER "mpicc" CACHE STRING "") SET(CMAKE_Fortran_COMPILER "mpifort" CACHE STRING "") -SET(CMAKE_CXX_COMPILER "/ccs/home/onguba/kokkos/bin/nvcc_wrapper" CACHE STRING "") +SET(CMAKE_CXX_COMPILER "${CMAKE_CURRENT_SOURCE_DIR}/../../externals/kokkos/bin/nvcc_wrapper" CACHE STRING "") set (ENABLE_OPENMP OFF CACHE BOOL "") set (ENABLE_COLUMN_OPENMP OFF CACHE BOOL "") diff --git a/components/homme/cmake/machineFiles/summit-nogpumpi.cmake b/components/homme/cmake/machineFiles/summit-nogpumpi.cmake index 892c0637852f..3b649122c534 100644 --- a/components/homme/cmake/machineFiles/summit-nogpumpi.cmake +++ b/components/homme/cmake/machineFiles/summit-nogpumpi.cmake @@ -16,8 +16,6 @@ SET(WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(USE_TRILINOS OFF CACHE BOOL "") SET(E3SM_KOKKOS_PATH "/ccs/home/onguba/kokkos/build-serial-cuda-nodebug" CACHE STRING "") diff --git a/components/homme/cmake/machineFiles/summit-p9-noomp.cmake b/components/homme/cmake/machineFiles/summit-p9-noomp.cmake index 70602695a33b..1d741db175fe 100644 --- a/components/homme/cmake/machineFiles/summit-p9-noomp.cmake +++ b/components/homme/cmake/machineFiles/summit-p9-noomp.cmake @@ -15,8 +15,6 @@ SET(WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") diff --git a/components/homme/cmake/machineFiles/summit-p9-omp.cmake b/components/homme/cmake/machineFiles/summit-p9-omp.cmake index 70ade1f043cc..05d37d12154a 100644 --- a/components/homme/cmake/machineFiles/summit-p9-omp.cmake +++ b/components/homme/cmake/machineFiles/summit-p9-omp.cmake @@ -15,8 +15,6 @@ SET(WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") SET(HOMME_ENABLE_COMPOSE FALSE CACHE BOOL "") diff --git a/components/homme/cmake/machineFiles/summit-p9.cmake b/components/homme/cmake/machineFiles/summit-p9.cmake index c240b3e39c11..9136dc0bfbdc 100644 --- a/components/homme/cmake/machineFiles/summit-p9.cmake +++ b/components/homme/cmake/machineFiles/summit-p9.cmake @@ -21,8 +21,6 @@ SET(WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(USE_TRILINOS OFF CACHE BOOL "") SET(E3SM_KOKKOS_PATH "/ccs/home/onguba/kokkos/build-serial-p9-nodebug" CACHE STRING "") diff --git a/components/homme/cmake/machineFiles/summit.cmake b/components/homme/cmake/machineFiles/summit.cmake index e44ce08db16f..ee449c90c125 100644 --- a/components/homme/cmake/machineFiles/summit.cmake +++ b/components/homme/cmake/machineFiles/summit.cmake @@ -15,8 +15,6 @@ SET (WITH_PNETCDF FALSE CACHE FILEPATH "") SET(USE_QUEUING FALSE CACHE BOOL "") -SET(ENABLE_CUDA FALSE CACHE BOOL "") - SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") SET(USE_TRILINOS OFF CACHE BOOL "") SET(KOKKOS_PATH "$ENV{HOME}/kokkos/build-serial-cuda-nodebug" CACHE STRING "") diff --git a/components/homme/src/preqx/config.h.cmake.in b/components/homme/src/preqx/config.h.cmake.in index c9bcbb432d6f..2e6f6ad0add8 100644 --- a/components/homme/src/preqx/config.h.cmake.in +++ b/components/homme/src/preqx/config.h.cmake.in @@ -66,3 +66,7 @@ /* When doing BFB testing, we occasionally must use modified code. */ /* Use this flag to protect such code. */ #cmakedefine HOMMEXX_BFB_TESTING + +/* Identifies whether this is a cuda/hip build */ +#cmakedefine HOMMEXX_ENABLE_GPU + diff --git a/components/homme/src/preqx_kokkos/config.h.cmake.in b/components/homme/src/preqx_kokkos/config.h.cmake.in index 688f8a4151d3..9d54ea6e31c5 100644 --- a/components/homme/src/preqx_kokkos/config.h.cmake.in +++ b/components/homme/src/preqx_kokkos/config.h.cmake.in @@ -48,8 +48,8 @@ /* Zoltan partitioning library */ #cmakedefine01 HAVE_ZOLTAN -/* Identifies whether this is a cuda build */ -#cmakedefine CUDA_BUILD +/* Identifies whether this is a cuda/hip build */ +#cmakedefine HOMMEXX_ENABLE_GPU /* TRILINOS library */ #cmakedefine01 HAVE_TRILINOS diff --git a/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp b/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp index c7a10a660844..0cafba8ed4c8 100644 --- a/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp +++ b/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl.hpp @@ -753,7 +753,7 @@ struct CaarFunctorImpl { private: template KOKKOS_INLINE_FUNCTION typename std::enable_if< - !std::is_same::value, void>::type + !std::is_same::value, void>::type compute_pressure_impl(KernelVariables &kv) const { Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, NP * NP), [&](const int loop_idx) { @@ -787,7 +787,7 @@ struct CaarFunctorImpl { template KOKKOS_INLINE_FUNCTION typename std::enable_if< - std::is_same::value, void>::type + std::is_same::value, void>::type compute_pressure_impl(KernelVariables &kv) const { Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, NP * NP), [&](const int loop_idx) { @@ -821,7 +821,7 @@ struct CaarFunctorImpl { template KOKKOS_INLINE_FUNCTION typename std::enable_if< - !std::is_same::value, void>::type + !std::is_same::value, void>::type preq_hydrostatic_impl(KernelVariables &kv) const { Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, NP * NP), [&](const int loop_idx) { @@ -874,10 +874,10 @@ struct CaarFunctorImpl { #endif } - // CUDA version + // GPU version template KOKKOS_INLINE_FUNCTION typename std::enable_if< - std::is_same::value, void>::type + std::is_same::value, void>::type preq_hydrostatic_impl(KernelVariables &kv) const { assert_vector_size_1(); Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, NP * NP), @@ -929,10 +929,10 @@ struct CaarFunctorImpl { kv.team_barrier(); } - // CUDA version + // GPU version template KOKKOS_INLINE_FUNCTION typename std::enable_if< - std::is_same::value, void>::type + std::is_same::value, void>::type preq_omega_ps_impl(KernelVariables &kv) const { assert_vector_size_1(); #ifdef DEBUG_TRACE @@ -983,10 +983,10 @@ struct CaarFunctorImpl { #endif } - // Non-CUDA version + // Non-GPU version template KOKKOS_INLINE_FUNCTION typename std::enable_if< - !std::is_same::value, void>::type + !std::is_same::value, void>::type preq_omega_ps_impl(KernelVariables &kv) const { m_sphere_ops.gradient_sphere( kv, Homme::subview(m_buffers.pressure, kv.team_idx), diff --git a/components/homme/src/preqx_kokkos/preqx_f2c_mod.F90 b/components/homme/src/preqx_kokkos/preqx_f2c_mod.F90 index 8d0c271526b9..388e8c0b5ffa 100644 --- a/components/homme/src/preqx_kokkos/preqx_f2c_mod.F90 +++ b/components/homme/src/preqx_kokkos/preqx_f2c_mod.F90 @@ -142,8 +142,9 @@ subroutine prim_run_subcycle_c(tstep,nstep,nm1,n0,np1,next_output_step,nsplit_it ! ! Inputs ! - integer(kind=c_int), intent(in) :: nstep, nm1, n0, np1, next_output_step, nsplit_iteration - real (kind=c_double), intent(in) :: tstep + integer(kind=c_int), intent(inout) :: nstep, nm1, n0, np1 + integer(kind=c_int), intent(in) :: next_output_step, nsplit_iteration + real (kind=c_double), intent(in) :: tstep end subroutine prim_run_subcycle_c ! Copy results from C++ views back to f90 arrays diff --git a/components/homme/src/prim_main.F90 b/components/homme/src/prim_main.F90 index 1463df36acfc..bfbe57e8b317 100644 --- a/components/homme/src/prim_main.F90 +++ b/components/homme/src/prim_main.F90 @@ -26,7 +26,9 @@ program prim_main #if (defined MODEL_THETA_L && defined ARKODE) use arkode_mod, only: calc_nonlinear_stats, finalize_nonlinear_stats #endif +#ifdef HOMME_ENABLE_COMPOSE use compose_test_mod, only: compose_test +#endif use test_mod, only: print_test_results #ifdef PIO_INTERP @@ -220,7 +222,9 @@ end subroutine finalize_kokkos_f90 #endif endif +#ifdef HOMME_ENABLE_COMPOSE call compose_test(par, hvcoord, dom_mt, elem) +#endif if(par%masterproc) print *,"Entering main timestepping loop" call t_startf('prim_main_loop') diff --git a/components/homme/src/share/compose/cedr_bfb_tree_allreduce.cpp b/components/homme/src/share/compose/cedr_bfb_tree_allreduce.cpp index aadbfaecb004..9d2cb3df8c8d 100644 --- a/components/homme/src/share/compose/cedr_bfb_tree_allreduce.cpp +++ b/components/homme/src/share/compose/cedr_bfb_tree_allreduce.cpp @@ -231,8 +231,8 @@ template class cedr::BfbTreeAllReducer; #ifdef KOKKOS_ENABLE_OPENMP template class cedr::BfbTreeAllReducer; #endif -#ifdef KOKKOS_ENABLE_CUDA -template class cedr::BfbTreeAllReducer; +#ifdef CEDR_ENABLE_GPU +template class cedr::BfbTreeAllReducer; #endif #ifdef KOKKOS_ENABLE_THREADS template class cedr::BfbTreeAllReducer; diff --git a/components/homme/src/share/compose/cedr_caas.cpp b/components/homme/src/share/compose/cedr_caas.cpp index c16d392065c4..87506f6e9a05 100644 --- a/components/homme/src/share/compose/cedr_caas.cpp +++ b/components/homme/src/share/compose/cedr_caas.cpp @@ -401,8 +401,8 @@ template class cedr::caas::CAAS; #ifdef KOKKOS_ENABLE_OPENMP template class cedr::caas::CAAS; #endif -#ifdef KOKKOS_ENABLE_CUDA -template class cedr::caas::CAAS; +#ifdef CEDR_ENABLE_GPU +template class cedr::caas::CAAS; #endif #ifdef KOKKOS_ENABLE_THREADS template class cedr::caas::CAAS; diff --git a/components/homme/src/share/compose/cedr_kokkos.hpp b/components/homme/src/share/compose/cedr_kokkos.hpp index 0d50c298bdff..891d9f763ecb 100644 --- a/components/homme/src/share/compose/cedr_kokkos.hpp +++ b/components/homme/src/share/compose/cedr_kokkos.hpp @@ -6,6 +6,22 @@ #include +#if defined KOKKOS_ENABLE_CUDA || defined KOKKOS_ENABLE_HIP || defined KOKKOS_ENABLE_SYCL +# define CEDR_ENABLE_GPU +# if defined KOKKOS_ENABLE_CUDA +typedef Kokkos::Cuda CedrGpuExeSpace; +typedef Kokkos::CudaSpace CedrGpuSpace; +# endif +# if defined KOKKOS_ENABLE_HIP +typedef Kokkos::Experimental::HIP CedrGpuExeSpace; +typedef Kokkos::Experimental::HIPSpace CedrGpuSpace; +# endif +# if defined KOKKOS_ENABLE_SYCL +typedef Kokkos::Experimental::SYCL CedrGpuExeSpace; +typedef Kokkos::Experimental::SYCL> CedrGpuSpace; +# endif +#endif + #define KIF KOKKOS_INLINE_FUNCTION // Clarify that a class member type is meant to be private but is @@ -74,11 +90,11 @@ struct DeviceType { typename ExeSpace::memory_space> type; }; -#ifdef KOKKOS_ENABLE_CUDA -typedef Kokkos::Device DefaultDeviceType; +#ifdef CEDR_ENABLE_GPU +typedef Kokkos::Device DefaultDeviceType; -template <> struct DeviceType { +template <> struct DeviceType { typedef DefaultDeviceType type; }; #else @@ -95,8 +111,8 @@ template struct OnGpu { #endif }; }; -#ifdef KOKKOS_ENABLE_CUDA -template <> struct OnGpu { enum : bool { value = true }; }; +#ifdef CEDR_ENABLE_GPU +template <> struct OnGpu { enum : bool { value = true }; }; #endif template @@ -120,10 +136,10 @@ struct ExeSpaceUtils { } }; -#ifdef KOKKOS_ENABLE_CUDA +#ifdef CEDR_ENABLE_GPU template <> -struct ExeSpaceUtils { - using TeamPolicy = Kokkos::TeamPolicy; +struct ExeSpaceUtils { + using TeamPolicy = Kokkos::TeamPolicy; using Member = typename TeamPolicy::member_type; static TeamPolicy get_default_team_policy (int outer, int inner) { return TeamPolicy(outer, std::min(128, 32*((inner + 31)/32)), 1); diff --git a/components/homme/src/share/compose/cedr_qlt.cpp b/components/homme/src/share/compose/cedr_qlt.cpp index 67f8e2c6d43f..c3728093ce56 100644 --- a/components/homme/src/share/compose/cedr_qlt.cpp +++ b/components/homme/src/share/compose/cedr_qlt.cpp @@ -807,8 +807,8 @@ template class cedr::qlt::QLT; #ifdef KOKKOS_ENABLE_OPENMP template class cedr::qlt::QLT; #endif -#ifdef KOKKOS_ENABLE_CUDA -template class cedr::qlt::QLT; +#ifdef CEDR_ENABLE_GPU +template class cedr::qlt::QLT; #endif #ifdef KOKKOS_ENABLE_THREADS template class cedr::qlt::QLT; diff --git a/components/homme/src/share/compose/cedr_test_randomized.cpp b/components/homme/src/share/compose/cedr_test_randomized.cpp index ea8c84e32156..7e51ec5e768b 100644 --- a/components/homme/src/share/compose/cedr_test_randomized.cpp +++ b/components/homme/src/share/compose/cedr_test_randomized.cpp @@ -395,7 +395,7 @@ ::check (const std::string& cdr_name, const mpi::Parallel& p, local_violated.size(), MPI_MAX, p.root()); if (p.amroot()) { - const Real tol = 1e2*std::numeric_limits::epsilon(); + const Real tol = 5e2*std::numeric_limits::epsilon(); for (size_t ti = 0; ti < ts.size(); ++ti) { // Check mass conservation. const Real desired_mass = glbl_mass[3*ti], actual_mass = glbl_mass[3*ti+1], diff --git a/components/homme/src/share/compose/compose.hpp b/components/homme/src/share/compose/compose.hpp index 915596aa8911..01be2635fcf6 100644 --- a/components/homme/src/share/compose/compose.hpp +++ b/components/homme/src/share/compose/compose.hpp @@ -11,6 +11,22 @@ # include #endif +#if defined KOKKOS_ENABLE_CUDA || defined KOKKOS_ENABLE_HIP || defined KOKKOS_ENABLE_SYCL +# define COMPOSE_ENABLE_GPU +# if defined KOKKOS_ENABLE_CUDA +typedef Kokkos::Cuda ComposeGpuExeSpace; +typedef Kokkos::CudaSpace ComposeGpuSpace; +# endif +# if defined KOKKOS_ENABLE_HIP +typedef Kokkos::Experimental::HIP ComposeGpuExeSpace; +typedef Kokkos::Experimental::HIPSpace ComposeGpuSpace; +# endif +# if defined KOKKOS_ENABLE_SYCL +typedef Kokkos::Experimental::SYCL ComposeGpuExeSpace; +typedef Kokkos::Experimental::SYCL> ComposeGpuSpace; +# endif +#endif + // Options #ifdef NDEBUG @@ -42,11 +58,11 @@ # if defined COMPOSE_HORIZ_OPENMP || defined COMPOSE_COLUMN_OPENMP "This should not happen." # endif -# ifndef KOKKOS_ENABLE_CUDA +# ifndef COMPOSE_ENABLE_GPU // Mimic GPU threading on host to debug race conditions on a regular CPU. //# define COMPOSE_MIMIC_GPU # endif -# if defined COMPOSE_MIMIC_GPU || defined KOKKOS_ENABLE_CUDA +# if defined COMPOSE_MIMIC_GPU || defined COMPOSE_ENABLE_GPU // If defined, then certain buffers need explicit mirroring and copying. # define COMPOSE_PORT_SEPARATE_VIEWS // If defined, do pass1 routines on host. This is for performance checking. diff --git a/components/homme/src/share/compose/compose_cedr_sl_run_global.cpp b/components/homme/src/share/compose/compose_cedr_sl_run_global.cpp index aa1c202792da..69fab390726b 100644 --- a/components/homme/src/share/compose/compose_cedr_sl_run_global.cpp +++ b/components/homme/src/share/compose/compose_cedr_sl_run_global.cpp @@ -84,7 +84,11 @@ void run_global (CDR& cdr, CDRT* cedr_cdr_p, const auto& nonnegs = cdr.nonneg; const auto& ie2lci = cdr.ie2lci; const auto& ie2gci = cdr.ie2gci; - const typename CDRT::DeviceOp& cedr_cdr = cedr_cdr_p->get_device_op(); + const typename CDRT::DeviceOp +#ifndef COMPOSE_PORT + & // When running F90 Homme's threading scheme, we can't create new views. +#endif + cedr_cdr = cedr_cdr_p->get_device_op(); if (cedr::impl::OnGpu::value) { const Int n = ta.nelemd*nlev*qsize*np2; ko::View q_min_1d(q_min.data(), n); diff --git a/components/homme/src/share/compose/compose_cedr_sl_run_local.cpp b/components/homme/src/share/compose/compose_cedr_sl_run_local.cpp index 6db595ea31c3..393b3f943ec8 100644 --- a/components/homme/src/share/compose/compose_cedr_sl_run_local.cpp +++ b/components/homme/src/share/compose/compose_cedr_sl_run_local.cpp @@ -186,7 +186,11 @@ void run_local (CDR& cdr, CDRT* cedr_cdr_p, const Int nsuplev = cdr.nsuplev; const auto cdr_over_super_levels = cdr.cdr_over_super_levels; const auto caas_in_suplev = cdr.caas_in_suplev; - const typename CDRT::DeviceOp& cedr_cdr = cedr_cdr_p->get_device_op(); + const typename CDRT::DeviceOp +#ifndef COMPOSE_PORT + & +#endif + cedr_cdr = cedr_cdr_p->get_device_op(); const auto& ie2lci = cdr.ie2lci; // Loop differently due to performance diff on CPU. #ifdef COMPOSE_PORT diff --git a/components/homme/src/share/compose/compose_homme.hpp b/components/homme/src/share/compose/compose_homme.hpp index d53cf90ccf2d..a3b40a204a9a 100644 --- a/components/homme/src/share/compose/compose_homme.hpp +++ b/components/homme/src/share/compose/compose_homme.hpp @@ -144,7 +144,7 @@ struct HommeFormatArray { assert(ie_data_ptr[ie]); // These routines are not used on the GPU, but they can be called from // KOKKOS_FUNCTIONs on CPU in GPU builds. Avoid nvcc warnings as follows: -#ifdef __CUDA_ARCH__ +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ return unused(); #else return *(ie_data_ptr[ie] + i); @@ -157,7 +157,7 @@ struct HommeFormatArray { assert(lev >= 0); assert(ie_data_ptr[ie]); check(ie, k, lev); -#ifdef __CUDA_ARCH__ +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ return unused(); #else return *(ie_data_ptr[ie] + lev*np2 + k); @@ -172,7 +172,7 @@ struct HommeFormatArray { assert(lev >= 0); assert(ie_data_ptr[ie]); check(ie, k, lev, q_or_timelev); -#ifdef __CUDA_ARCH__ +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ return unused(); #else return *(ie_data_ptr[ie] + (q_or_timelev*nlev + lev)*np2 + k); @@ -188,7 +188,7 @@ struct HommeFormatArray { assert(lev >= 0); assert(ie_data_ptr[ie]); check(ie, k, lev, q, timelev); -#ifdef __CUDA_ARCH__ +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ return unused(); #else return *(ie_data_ptr[ie] + ((timelev*qsized + q)*nlev + lev)*np2 + k); @@ -200,7 +200,7 @@ struct HommeFormatArray { std::vector ie_data_ptr; const Int nlev, qsized, ntimelev; -#ifdef KOKKOS_ENABLE_CUDA +#ifdef COMPOSE_ENABLE_GPU COMPOSE_INLINE_FUNCTION static T& unused () { static T unused = 0; assert(0); @@ -211,7 +211,7 @@ struct HommeFormatArray { COMPOSE_FORCEINLINE_FUNCTION void check (Int ie, Int k = -1, Int lev = -1, Int q_or_timelev = -1, Int timelev = -1) const { -#if defined COMPOSE_BOUNDS_CHECK && ! defined __CUDA_ARCH__ +#if defined COMPOSE_BOUNDS_CHECK && ! (defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__) assert(ie >= 0 && ie < static_cast(ie_data_ptr.size())); if (k >= 0) assert(k < np2); if (lev >= 0) assert(lev < nlev); diff --git a/components/homme/src/share/compose/compose_kokkos.hpp b/components/homme/src/share/compose/compose_kokkos.hpp index 6e683253df45..9dd2515bc671 100644 --- a/components/homme/src/share/compose/compose_kokkos.hpp +++ b/components/homme/src/share/compose/compose_kokkos.hpp @@ -31,7 +31,7 @@ template struct NumericTraits; template <> struct NumericTraits { KOKKOS_INLINE_FUNCTION static double epsilon () { return -#ifdef KOKKOS_ENABLE_CUDA +#ifdef COMPOSE_ENABLE_GPU 2.2204460492503131e-16 #else std::numeric_limits::epsilon() @@ -43,7 +43,7 @@ template <> struct NumericTraits { template <> struct NumericTraits { KOKKOS_INLINE_FUNCTION static float epsilon () { return -#ifdef KOKKOS_ENABLE_CUDA +#ifdef COMPOSE_ENABLE_GPU 1.1920928955078125e-07 #else std::numeric_limits::epsilon() @@ -58,11 +58,11 @@ struct DeviceType { typename ExeSpace::memory_space> type; }; -#ifdef KOKKOS_ENABLE_CUDA -typedef Kokkos::Device DefaultDeviceType; +#ifdef COMPOSE_ENABLE_GPU +typedef Kokkos::Device DefaultDeviceType; -template <> struct DeviceType { +template <> struct DeviceType { typedef DefaultDeviceType type; }; #else @@ -92,8 +92,8 @@ template struct OnGpu { #endif }; }; -#ifdef KOKKOS_ENABLE_CUDA -template <> struct OnGpu { enum : bool { value = true }; }; +#ifdef COMPOSE_ENABLE_GPU +template <> struct OnGpu { enum : bool { value = true }; }; template <> struct OnGpu {}; // flag as an error at compile time #endif @@ -140,7 +140,7 @@ template const View& unmanaged ( #endif // Copy by ref if not Cuda build. -#if defined COMPOSE_PORT && defined KOKKOS_ENABLE_CUDA +#if defined COMPOSE_PORT && defined COMPOSE_ENABLE_GPU # define COMPOSE_LAMBDA KOKKOS_LAMBDA # define COMPOSE_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION # define COMPOSE_FORCEINLINE_FUNCTION KOKKOS_FORCEINLINE_FUNCTION diff --git a/components/homme/src/share/compose/compose_slmm.cpp b/components/homme/src/share/compose/compose_slmm.cpp index 7006ff14c618..b8277b5423c6 100644 --- a/components/homme/src/share/compose/compose_slmm.cpp +++ b/components/homme/src/share/compose/compose_slmm.cpp @@ -240,7 +240,7 @@ static void initialize_kokkos () { if (Kokkos::is_initialized()) return; in_charge_of_kokkos = true; std::vector args; -#ifdef KOKKOS_ENABLE_CUDA +#ifdef HOMMEXX_ENABLE_GPU int nd; const auto ret = cudaGetDeviceCount(&nd); if (ret != cudaSuccess) { diff --git a/components/homme/src/share/compose_mod.F90 b/components/homme/src/share/compose_mod.F90 index e7f95af04f66..8b134def384b 100644 --- a/components/homme/src/share/compose_mod.F90 +++ b/components/homme/src/share/compose_mod.F90 @@ -374,9 +374,9 @@ end subroutine compose_init subroutine compose_finalize(finalize_kokkos) logical, optional, intent(in) :: finalize_kokkos +#ifdef HOMME_ENABLE_COMPOSE logical :: call_finalize_kokkos -#ifdef HOMME_ENABLE_COMPOSE call cedr_finalize() call slmm_finalize() diff --git a/components/homme/src/share/cxx/ComposeTransport.cpp b/components/homme/src/share/cxx/ComposeTransport.cpp index 255445dbe6be..760a3dd06fa4 100644 --- a/components/homme/src/share/cxx/ComposeTransport.cpp +++ b/components/homme/src/share/cxx/ComposeTransport.cpp @@ -4,6 +4,9 @@ * See the file 'COPYRIGHT' in the HOMMEXX/src/share/cxx directory *******************************************************************************/ +#include "Config.hpp" +#ifdef HOMME_ENABLE_COMPOSE + #include "ComposeTransport.hpp" #include "ComposeTransportImpl.hpp" #include "Context.hpp" @@ -84,3 +87,5 @@ void ComposeTransport::test_2d (const bool bfb, const int nstep, std::vector -#ifdef KOKKOS_ENABLE_CUDA +#ifdef HOMMEXX_ENABLE_GPU # ifndef HOMMEXX_CUDA_MIN_WARP_PER_TEAM # define HOMMEXX_CUDA_MIN_WARP_PER_TEAM 8 # endif diff --git a/components/homme/src/share/cxx/Dimensions.hpp b/components/homme/src/share/cxx/Dimensions.hpp index 44adeb5d9b00..6367ca4bbd79 100644 --- a/components/homme/src/share/cxx/Dimensions.hpp +++ b/components/homme/src/share/cxx/Dimensions.hpp @@ -14,7 +14,7 @@ namespace Homme { // Until whenever CUDA supports constexpr properly -#ifdef CUDA_BUILD +#ifdef HOMMEXX_ENABLE_GPU #ifdef CAM #define QSIZE_D PCNST @@ -53,7 +53,7 @@ namespace Homme { static constexpr const int NUM_TIME_LEVELS = 3; static constexpr const int Q_NUM_TIME_LEVELS = 2; -#endif // CUDA_BUILD +#endif // GPU_BUILD template struct ColInfo { diff --git a/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp b/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp index f723633d3856..f0e40c550650 100644 --- a/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp +++ b/components/homme/src/share/cxx/EulerStepFunctorImpl.hpp @@ -42,9 +42,9 @@ struct SerialLimiter { const Array2GllLvl& irwrk); }; // GPU doesn't have a serial impl. -#if defined KOKKOS_ENABLE_CUDA +#ifdef HOMMEXX_ENABLE_GPU template <> -struct SerialLimiter { +struct SerialLimiter { template KOKKOS_INLINE_FUNCTION static void @@ -56,6 +56,7 @@ struct SerialLimiter { }; #endif + class EulerStepFunctorImpl { struct EulerStepData { EulerStepData () @@ -423,7 +424,9 @@ class EulerStepFunctorImpl { ExecSpace::impl_static_fence(); m_kernel_will_run_limiters = true; Kokkos::parallel_for( - Homme::get_default_team_policy( + //to play with launch bounds + //Homme::get_default_team_policy >( + Homme::get_default_team_policy( m_geometry.num_elems() * m_data.qsize, m_tpref), *this); ExecSpace::impl_static_fence(); diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index 8e775653ff44..671917f5c767 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -17,6 +17,10 @@ # include #endif +#ifdef KOKKOS_ENABLE_HIP +#include +#endif + namespace Homme { // Since we're initializing from inside a Fortran code and don't have access to @@ -41,6 +45,15 @@ void initialize_kokkos () { // It isn't a big deal if we can't get the device count. nd = 1; } +#elif defined(KOKKOS_ENABLE_HIP) + int nd; + const auto ret = hipGetDeviceCount(&nd); + if (ret != hipSuccess) { + // It isn't a big deal if we can't get the device count. + nd = 1; + } +#endif +#ifdef HOMMEXX_ENABLE_GPU std::stringstream ss; ss << "--kokkos-num-devices=" << nd; const auto key = ss.str(); @@ -50,6 +63,7 @@ void initialize_kokkos () { args.push_back(const_cast(str.data())); #endif + const char* silence = "--kokkos-disable-warnings"; args.push_back(const_cast(silence)); @@ -136,12 +150,14 @@ team_num_threads_vectors_for_gpu ( const int num_threads = ( (tp.max_threads_usable > num_device_threads) ? num_device_threads : tp.max_threads_usable ); + return std::make_pair( num_threads, prevpow2(num_device_threads / num_threads) ); } else { const int num_vectors = prevpow2( (tp.max_vectors_usable > num_device_threads) ? num_device_threads : tp.max_vectors_usable ); + return std::make_pair( num_device_threads / num_vectors, num_vectors ); } @@ -150,7 +166,7 @@ team_num_threads_vectors_for_gpu ( } // namespace Parallel std::pair -DefaultThreadsDistribution:: +DefaultThreadsDistribution:: team_num_threads_vectors (const int num_parallel_iterations, const ThreadPreferences tp) { // It appears we can't use Kokkos to tell us this. On current devices, using @@ -172,12 +188,22 @@ team_num_threads_vectors (const int num_parallel_iterations, # else const int max_num_warps = HOMMEXX_CUDA_MAX_WARP_PER_TEAM; //Kokkos::Impl::cuda_internal_maximum_grid_count(); # endif + +#elif defined(KOKKOS_ENABLE_HIP) + + //use 64 wavefronts per CU and 120 CUs + const int num_warps_device = 120*64; // no such thing Kokkos::Impl::hip_internal_maximum_warp_count(); + const int max_num_warps = HOMMEXX_CUDA_MAX_WARP_PER_TEAM; + const int num_threads_warp = Kokkos::Experimental::Impl::HIPTraits::WarpSize; + #else + // I want thread-distribution rules to be unit-testable even when Cuda is // off. Thus, make up a P100-like machine: const int num_warps_device = 1792; const int num_threads_warp = 32; const int max_num_warps = 16; + #endif return Parallel::team_num_threads_vectors_for_gpu( diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.hpp b/components/homme/src/share/cxx/ExecSpaceDefs.hpp index d3f2cd43ae25..80e2e6ba4dde 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.hpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.hpp @@ -20,12 +20,23 @@ namespace Homme // Some in-house names for Kokkos exec spaces, which are // always defined, possibly as alias of void + +#ifdef HOMMEXX_ENABLE_GPU + #ifdef KOKKOS_ENABLE_CUDA -using Hommexx_Cuda = Kokkos::Cuda; +using HommexxGPU = Kokkos::Cuda; +#endif + +#ifdef KOKKOS_ENABLE_HIP +using HommexxGPU = Kokkos::Experimental::HIP; +#endif + #else -using Hommexx_Cuda = void; +using HommexxGPU = void; #endif + + #ifdef KOKKOS_ENABLE_OPENMP using Hommexx_OpenMP = Kokkos::OpenMP; #else @@ -44,7 +55,7 @@ using Hommexx_Serial = Kokkos::Serial; using Hommexx_Serial = void; #endif -#ifdef KOKKOS_ENABLE_CUDA +#ifdef HOMMEXX_ENABLE_GPU # define HOMMEXX_STATIC #else # define HOMMEXX_STATIC static @@ -52,8 +63,8 @@ using Hommexx_Serial = void; // Selecting the execution space. If no specific request, use Kokkos default // exec space -#if defined(HOMMEXX_CUDA_SPACE) -using ExecSpace = Hommexx_Cuda; +#ifdef HOMMEXX_ENABLE_GPU +using ExecSpace = HommexxGPU; #elif defined(HOMMEXX_OPENMP_SPACE) using ExecSpace = Hommexx_OpenMP; #elif defined(HOMMEXX_THREADS_SPACE) @@ -73,7 +84,7 @@ template struct OnGpu { enum : bool { value = false }; }; template <> -struct OnGpu { enum : bool { value = true }; }; +struct OnGpu { enum : bool { value = true }; }; // Call this instead of Kokkos::initialize. void initialize_kokkos(); @@ -144,7 +155,7 @@ struct DefaultThreadsDistribution { // Specialization for a GPU, where threads can't be viewed as existing simply in // a pool. template <> -struct DefaultThreadsDistribution { +struct DefaultThreadsDistribution { static std::pair team_num_threads_vectors(const int num_parallel_iterations, const ThreadPreferences tp = ThreadPreferences()); @@ -230,12 +241,12 @@ struct Memory { }; template <> -struct Memory { - enum : bool { on_gpu = OnGpu::value }; +struct Memory { + enum : bool { on_gpu = OnGpu::value }; template KOKKOS_INLINE_FUNCTION static - Scalar* get_shmem (const Kokkos::TeamPolicy::member_type& team, + Scalar* get_shmem (const Kokkos::TeamPolicy::member_type& team, const size_t n = 0) { return static_cast(team.team_shmem().get_shmem(n*sizeof(Scalar))); } @@ -306,10 +317,10 @@ VECTOR_SIMD_LOOP } }; -#if defined KOKKOS_ENABLE_CUDA +#ifdef HOMMEXX_ENABLE_GPU template <> -struct Dispatch { - using ExeSpace = Kokkos::Cuda; +struct Dispatch { + using ExeSpace = HommexxGPU; template KOKKOS_FORCEINLINE_FUNCTION @@ -351,7 +362,7 @@ struct Dispatch { template static KOKKOS_FORCEINLINE_FUNCTION void parallel_for_NP2 ( - const Kokkos::TeamPolicy::member_type& team, + const Kokkos::TeamPolicy::member_type& team, const Lambda& lambda) { Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, NP*NP), lambda); @@ -360,7 +371,7 @@ struct Dispatch { template static KOKKOS_FORCEINLINE_FUNCTION void parallel_reduce_NP2 ( - const Kokkos::TeamPolicy::member_type& team, + const Kokkos::TeamPolicy::member_type& team, const Lambda& lambda, ValueType& result) { parallel_reduce(team, Kokkos::ThreadVectorRange(team, NP*NP), diff --git a/components/homme/src/share/cxx/GllFvRemapImpl.hpp b/components/homme/src/share/cxx/GllFvRemapImpl.hpp index 9ea0a8c863b6..11738b2bf455 100644 --- a/components/homme/src/share/cxx/GllFvRemapImpl.hpp +++ b/components/homme/src/share/cxx/GllFvRemapImpl.hpp @@ -389,7 +389,7 @@ struct GllFvRemapImpl { // coalesced memory access on the GPU. Kokkos doesn't expose the number of // threads in a team, so we have to go to the lower-level API here. const int nthr_per_team = -#if defined __CUDA_ARCH__ +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ blockDim.x, #else 1, diff --git a/components/homme/src/share/cxx/KernelVariables.hpp b/components/homme/src/share/cxx/KernelVariables.hpp index 8a6f20ff6cfe..65d0eaec40c2 100644 --- a/components/homme/src/share/cxx/KernelVariables.hpp +++ b/components/homme/src/share/cxx/KernelVariables.hpp @@ -19,7 +19,7 @@ struct KernelVariables { template static KOKKOS_INLINE_FUNCTION - typename std::enable_if::value && + typename std::enable_if::value && !std::is_same::value, int >::type @@ -28,24 +28,14 @@ struct KernelVariables { return 0; } -#ifdef KOKKOS_ENABLE_CUDA -#ifdef __CUDA_ARCH__ +#ifdef HOMMEXX_ENABLE_GPU template static KOKKOS_INLINE_FUNCTION typename std::enable_if< OnGpu::value, int>::type get_team_idx(const int /*team_size*/, const int league_rank) { return league_rank; } -#else - template - static KOKKOS_INLINE_FUNCTION typename std::enable_if< - OnGpu::value, int>::type - get_team_idx(const int /*team_size*/, const int /*league_rank*/) { - assert(false); // should never happen - return -1; - } -#endif // __CUDA_ARCH__ -#endif // KOKKOS_ENABLE_CUDA +#endif // HOMMEXX_ENABLE_GPU #ifdef KOKKOS_ENABLE_OPENMP template diff --git a/components/homme/src/share/cxx/RemapFunctor.hpp b/components/homme/src/share/cxx/RemapFunctor.hpp index ba6a4cd0e82d..cec96657d35b 100644 --- a/components/homme/src/share/cxx/RemapFunctor.hpp +++ b/components/homme/src/share/cxx/RemapFunctor.hpp @@ -524,11 +524,11 @@ struct RemapFunctor : public Remapper { Kokkos::parallel_for(get_default_team_policy(ne*nv), r); } - int requested_buffer_size () const { + int requested_buffer_size () const override { return m_fields_provider.requested_buffer_size(); } - void init_buffers(const FunctorsBuffersManager& fbm) { + void init_buffers(const FunctorsBuffersManager& fbm) override { m_fields_provider.init_buffers(fbm); } diff --git a/components/homme/src/share/cxx/kokkos_utils.hpp b/components/homme/src/share/cxx/kokkos_utils.hpp index f2791153aac3..85e2ca31dc52 100644 --- a/components/homme/src/share/cxx/kokkos_utils.hpp +++ b/components/homme/src/share/cxx/kokkos_utils.hpp @@ -104,9 +104,9 @@ class TeamUtils : public _TeamUtilsCommonBase /* * Specialization for Cuda execution space. */ -#ifdef KOKKOS_ENABLE_CUDA +#ifdef HOMMEXX_ENABLE_GPU template <> -class TeamUtils : public _TeamUtilsCommonBase +class TeamUtils : public _TeamUtilsCommonBase { #ifdef HOMMEXX_CUDA_SHARE_BUFFER using Device = Kokkos::Device; @@ -124,7 +124,7 @@ class TeamUtils : public _TeamUtilsCommonBase public: template TeamUtils(const TeamPolicy& policy, const Real& overprov_factor = 1.25) : - _TeamUtilsCommonBase(policy) + _TeamUtilsCommonBase(policy) #ifdef HOMMEXX_CUDA_SHARE_BUFFER , _num_ws_slots(_league_size > _num_teams ? (overprov_factor * _num_teams > _league_size ? _league_size : overprov_factor * _num_teams) diff --git a/components/homme/src/share/cxx/prim_advec_tracers_remap.cpp b/components/homme/src/share/cxx/prim_advec_tracers_remap.cpp index 9f5944e326e2..99e9ca73b57b 100644 --- a/components/homme/src/share/cxx/prim_advec_tracers_remap.cpp +++ b/components/homme/src/share/cxx/prim_advec_tracers_remap.cpp @@ -15,9 +15,7 @@ namespace Homme { static void prim_advec_tracers_remap_RK2 (const Real dt); -#ifdef MODEL_THETA_L static void prim_advec_tracers_remap_compose (const Real dt); -#endif // ----------- IMPLEMENTATION ---------- // @@ -25,12 +23,7 @@ void prim_advec_tracers_remap (const Real dt) { SimulationParams& params = Context::singleton().get(); if (params.transport_alg > 0) { -#ifdef MODEL_THETA_L prim_advec_tracers_remap_compose(dt); -#else - Errors::runtime_abort("prim_advec_tracers_remap: " - "transport_alg > 0 not supported for non-theta-l builds."); -#endif } else { prim_advec_tracers_remap_RK2(dt); } @@ -96,8 +89,8 @@ static void prim_advec_tracers_remap_RK2 (const Real dt) GPTLstop("tl-at prim_advec_tracers_remap_RK2"); } -#ifdef MODEL_THETA_L static void prim_advec_tracers_remap_compose (const Real dt) { +#if defined MODEL_THETA_L && defined HOMME_ENABLE_COMPOSE GPTLstart("tl-at prim_advec_tracers_compose"); const auto& params = Context::singleton().get(); assert(params.params_set); @@ -107,7 +100,10 @@ static void prim_advec_tracers_remap_compose (const Real dt) { ct.reset(params); ct.run(tl, dt); GPTLstop("tl-at prim_advec_tracers_compose"); -} +#else + Errors::runtime_abort("prim_advec_tracers_remap_compose: " + "transport_alg > 0 not supported in this build."); #endif +} } // namespace Homme diff --git a/components/homme/src/share/cxx/prim_cxx_driver_base.F90 b/components/homme/src/share/cxx/prim_cxx_driver_base.F90 index 79e1e05842f9..57e18673a642 100644 --- a/components/homme/src/share/cxx/prim_cxx_driver_base.F90 +++ b/components/homme/src/share/cxx/prim_cxx_driver_base.F90 @@ -32,9 +32,13 @@ subroutine prim_init1(elem, par, dom_mt, tl) use time_mod, only : TimeLevel_t, TimeLevel_init use prim_driver_base, only : prim_init1_geometry, prim_init1_elem_arrays, & prim_init1_cleanup, prim_init1_buffers, & - prim_init1_compose, & MetaVertex, GridEdge, deriv1 + +#ifdef HOMME_ENABLE_COMPOSE + use prim_driver_base, only : prim_init1_compose use compose_mod, only : compose_control_kokkos_init_and_fin +#endif + #if !defined(CAM) && !defined(SCREAM) use prim_driver_base, only : prim_init1_no_cam #endif @@ -65,7 +69,9 @@ end subroutine initialize_hommexx_session ! Initialize kokkos before any environment changes from the Fortran call initialize_hommexx_session() ! Don't let any other components that use Kokkos control init/finalization. +#ifdef HOMME_ENABLE_COMPOSE call compose_control_kokkos_init_and_fin(.false.) +#endif #if !defined(CAM) && !defined(SCREAM) call prim_init1_no_cam(par) @@ -96,7 +102,9 @@ end subroutine initialize_hommexx_session ! ================================== call prim_init1_elem_arrays(elem,par) +#ifdef HOMME_ENABLE_COMPOSE call prim_init1_compose(par,elem) +#endif ! ================================== ! Initialize the buffers for exchanges diff --git a/components/homme/src/share/cxx/prim_step.cpp b/components/homme/src/share/cxx/prim_step.cpp index 8ab37cda79b6..8b82a65db77f 100644 --- a/components/homme/src/share/cxx/prim_step.cpp +++ b/components/homme/src/share/cxx/prim_step.cpp @@ -196,9 +196,11 @@ void prim_step_flexible (const Real dt, const bool compute_diagnostics) { context.get().run_diagnostics(false, 3); // Remap tracers. +#ifdef HOMME_ENABLE_COMPOSE if (params.qsize > 0) Context::singleton().get().remap_q(tl); - +#endif + GPTLstop("tl-s prim_step_flexible"); #else Errors::runtime_abort("prim_step_flexible not supported in non-theta-l builds."); diff --git a/components/homme/src/share/cxx/profiling.hpp b/components/homme/src/share/cxx/profiling.hpp index 80c93fdc3d17..e7ecbbb449e0 100644 --- a/components/homme/src/share/cxx/profiling.hpp +++ b/components/homme/src/share/cxx/profiling.hpp @@ -12,8 +12,7 @@ #include "gptl.h" -#if defined(HOMMEXX_CUDA_SPACE) || \ - (defined(HOMMEXX_DEFAULT_SPACE) && defined(KOKKOS_ENABLE_CUDA)) // Can't use GPTL timers on CUDA +#ifdef HOMMEXX_ENABLE_GPU #define start_timer(name) {} #define stop_timer(name) {} #else diff --git a/components/homme/src/share/cxx/utilities/BfbUtils.hpp b/components/homme/src/share/cxx/utilities/BfbUtils.hpp index afd64a843d27..e3570874e266 100644 --- a/components/homme/src/share/cxx/utilities/BfbUtils.hpp +++ b/components/homme/src/share/cxx/utilities/BfbUtils.hpp @@ -111,7 +111,7 @@ typename std::enable_if::value && !std::is_reference::value, ScalarType>::type bfb_pow_impl (ScalarType val, ExpType e) { -#ifdef CUDA_BUILD +#ifdef HOMMEXX_ENABLE_GPU // Note: this function is tailored (or taylored...eheh) // for -1 KOKKOS_FORCEINLINE_FUNCTION +int get_thread_id_within_team_gpu (const TeamMember& team) { +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ // Can't use team.team_rank() here because vector direction also uses physical // threads but TeamMember types don't expose that information. return blockDim.x * threadIdx.y + threadIdx.x; @@ -133,17 +134,33 @@ int get_thread_id_within_team (const Kokkos::Impl::CudaTeamMember& team) { #endif } -KOKKOS_INLINE_FUNCTION -int get_team_nthr (const Kokkos::Impl::CudaTeamMember& team) { -#ifdef __CUDA_ARCH__ +template KOKKOS_FORCEINLINE_FUNCTION +int get_team_nthr_gpu (const TeamMember& team) { +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ return blockDim.x * blockDim.y; #else assert(0); return -1; #endif } -#endif +#ifdef KOKKOS_ENABLE_CUDA +KOKKOS_FORCEINLINE_FUNCTION +int get_thread_id_within_team (const Kokkos::Impl::CudaTeamMember& team) +{ return get_thread_id_within_team_gpu(team); } +KOKKOS_FORCEINLINE_FUNCTION +int get_team_nthr (const Kokkos::Impl::CudaTeamMember& team) +{ return get_team_nthr_gpu(team); } +#endif // KOKKOS_ENABLE_CUDA + +#ifdef KOKKOS_ENABLE_HIP +KOKKOS_FORCEINLINE_FUNCTION +int get_thread_id_within_team (const Kokkos::Impl::HIPTeamMember& team) +{ return get_thread_id_within_team_gpu(team); } +KOKKOS_FORCEINLINE_FUNCTION +int get_team_nthr (const Kokkos::Impl::HIPTeamMember& team) +{ return get_team_nthr_gpu(team); } +#endif // KOKKOS_ENABLE_HIP template KOKKOS_INLINE_FUNCTION const T& min (const T& a, const T& b) { return a < b ? a : b; } diff --git a/components/homme/src/share/gllfvremap_mod.F90 b/components/homme/src/share/gllfvremap_mod.F90 index 35d8efb9ba2c..a62dc126057b 100644 --- a/components/homme/src/share/gllfvremap_mod.F90 +++ b/components/homme/src/share/gllfvremap_mod.F90 @@ -2798,7 +2798,7 @@ function test_sphere2ref() result(nerr) call ref2sphere(corners, refin(1), refin(2), sphere) call sphere2ref(corners, sphere, refout(1), refout(2)) err = abs(refin(1) - refout(1)) + abs(refin(2) - refout(2)) - if (err > 10*eps .or. & + if (err > 15*eps .or. & maxval(abs(refout)) > 1 + 5*eps .or. & any(refout /= refout)) then write(iulog,*) refin(1), refin(2) diff --git a/components/homme/src/share/sl_advection.F90 b/components/homme/src/share/sl_advection.F90 index c05fc528046d..fd14e437d4a9 100644 --- a/components/homme/src/share/sl_advection.F90 +++ b/components/homme/src/share/sl_advection.F90 @@ -18,7 +18,9 @@ module sl_advection use perf_mod, only : t_startf, t_stopf, t_barrierf ! _EXTERNAL use parallel_mod, only : abortmp, parallel_t use coordinate_systems_mod, only : cartesian3D_t +#ifdef HOMME_ENABLE_COMPOSE use compose_mod +#endif implicit none @@ -165,7 +167,9 @@ subroutine prim_advec_tracers_remap_ALE(elem, deriv, hvcoord, hybrid, dt, tl, ne ! For DCMIP16 supercell test case. use control_mod, only : dcmip16_mu_q use prim_advection_base, only : advance_physical_vis +#ifdef HOMME_ENABLE_COMPOSE use compose_mod, only : compose_h2d, compose_d2h +#endif use iso_c_binding, only : c_bool implicit none @@ -193,9 +197,12 @@ subroutine prim_advec_tracers_remap_ALE(elem, deriv, hvcoord, hybrid, dt, tl, ne ! Until I get the DSS onto GPU, always need to h<->d. !h2d = hybrid%par%nprocs > 1 .or. semi_lagrange_cdr_check .or. & (semi_lagrange_hv_q > 0 .and. nu_q > 0) h2d = .true. +#ifdef HOMME_ENABLE_COMPOSE d2h = compose_d2h .or. h2d h2d = compose_h2d .or. h2d - +#else + d2h = h2d +#endif call TimeLevel_Qdp(tl, dt_tracer_factor, n0_qdp, np1_qdp) call calc_trajectory(elem, deriv, hvcoord, hybrid, dt, tl, & diff --git a/components/homme/src/theta-l/config.h.cmake.in b/components/homme/src/theta-l/config.h.cmake.in index ed1933721b18..0fcd5e1e024d 100644 --- a/components/homme/src/theta-l/config.h.cmake.in +++ b/components/homme/src/theta-l/config.h.cmake.in @@ -63,8 +63,8 @@ /* Whether to use OpenMP4 */ #cmakedefine OMP4 -/* Whether this is a build for CUDA (used mostly for f90-vs-cxx bfb math functions impl) */ -#cmakedefine CUDA_BUILD +/* Whether this is a build for CUDA/HIP (used mostly for f90-vs-cxx bfb math functions impl) */ +#cmakedefine HOMMEXX_ENABLE_GPU /* temporary flag to compare f90-cxx until we settle on either cxx or f90 implementation */ #cmakedefine XX_NONBFB_COMING diff --git a/components/homme/src/theta-l/share/element_ops.F90 b/components/homme/src/theta-l/share/element_ops.F90 index 03638d96a5d0..d32d5ac820fb 100644 --- a/components/homme/src/theta-l/share/element_ops.F90 +++ b/components/homme/src/theta-l/share/element_ops.F90 @@ -716,7 +716,7 @@ subroutine tests_finalize(elem,hvcoord,ie) ! Disable the following check in CUDA bfb builds, ! since the calls to pow are inexact -#if !(defined(HOMMEXX_BFB_TESTING) && defined(CUDA_BUILD)) +#if !(defined(HOMMEXX_BFB_TESTING) && defined(HOMMEXX_ENABLE_GPU)) ! verify discrete hydrostatic balance call pnh_and_exner_from_eos(hvcoord,elem%state%vtheta_dp(:,:,:,tl),& elem%state%dp3d(:,:,:,tl),elem%state%phinh_i(:,:,:,tl),pnh,exner,dpnh_dp_i) diff --git a/components/homme/src/theta-l/share/prim_state_mod.F90 b/components/homme/src/theta-l/share/prim_state_mod.F90 index 1bbe9c003277..b707f29c56fe 100644 --- a/components/homme/src/theta-l/share/prim_state_mod.F90 +++ b/components/homme/src/theta-l/share/prim_state_mod.F90 @@ -530,7 +530,7 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete) ! ! All of these transport terms are at time-tstep = (time1+time2)/2 ! Vertical transport terms -#ifdef ENERGY_DIAGNOSTICS +#if defined(ENERGY_DIAGNOSTICS) && !defined (HOMMEXX_ENABLE_GPU) do ie=nets,nete tmp(:,:,ie) = elem(ie)%accum%KEu_horiz1 enddo @@ -702,7 +702,7 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete) write(iulog,'(3a25)') "**DYNAMICS** J/m^2"," W/m^2","W/m^2 " if (ftype==4) & write(iulog,*) "NOTE:ftype=4 so d/dt and diss diagnostics include effects of forcing" -#ifdef ENERGY_DIAGNOSTICS +#if defined(ENERGY_DIAGNOSTICS) && !defined (HOMMEXX_ENABLE_GPU) ! terms computed during prim_advance, if ENERGY_DIAGNOSTICS is enabled if (theta_hydrostatic_mode) then write(iulog,'(a,2e22.14)')'KEu h-adv,sum=0:',KEH1,KEH2 diff --git a/components/homme/src/theta-l_kokkos/CMakeLists.txt b/components/homme/src/theta-l_kokkos/CMakeLists.txt index f4696b446669..f86498c5cb08 100644 --- a/components/homme/src/theta-l_kokkos/CMakeLists.txt +++ b/components/homme/src/theta-l_kokkos/CMakeLists.txt @@ -23,6 +23,9 @@ MACRO(THETAL_KOKKOS_SETUP) ${SRC_SHARE_DIR}/compose ${CMAKE_BINARY_DIR}/src/share/cxx ) + IF (NOT HOMME_ENABLE_COMPOSE) + LIST(REMOVE_ITEM EXEC_LIB_INCLUDE_DIRS ${SRC_SHARE_DIR}/compose) + ENDIF() SET (EXEC_INCLUDE_DIRS ${EXEC_LIB_INCLUDE_DIRS} @@ -105,6 +108,11 @@ MACRO(THETAL_KOKKOS_SETUP) ${SRC_SHARE_DIR}/planar_mesh_mod.F90 ) + IF (NOT HOMME_ENABLE_COMPOSE) + LIST(REMOVE_ITEM SRC_SHARE_F90 ${SRC_SHARE_DIR}/compose_mod.F90) + LIST(REMOVE_ITEM SRC_SHARE_F90 ${SRC_SHARE_DIR}/compose_test_mod.F90) + ENDIF() + SET(TEST_SRC_F90 ${TEST_SRC_DIR}/asp_tests.F90 ${TEST_SRC_DIR}/baroclinic_inst_mod.F90 diff --git a/components/homme/src/theta-l_kokkos/config.h.cmake.in b/components/homme/src/theta-l_kokkos/config.h.cmake.in index 141c60d0d32e..f5cacd509ff6 100644 --- a/components/homme/src/theta-l_kokkos/config.h.cmake.in +++ b/components/homme/src/theta-l_kokkos/config.h.cmake.in @@ -63,8 +63,8 @@ /* temporary flag to compare f90-cxx until we settle on either cxx or f90 implementation */ #cmakedefine XX_NONBFB_COMING -/* Identifies whether this is a cuda build */ -#cmakedefine CUDA_BUILD +/* Identifies whether this is a cuda/hip build */ +#cmakedefine HOMMEXX_ENABLE_GPU /* When doing BFB testing, we occasionally must use modified code. */ /* Use this flag to protect such code. */ diff --git a/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp b/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp index 9598a30d73bc..770431f60f67 100644 --- a/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/Diagnostics.hpp @@ -13,6 +13,10 @@ #include "utilities/SubviewUtils.hpp" #include "utilities/ViewUtils.hpp" +#if ! defined(NDEBUG) +#define RESOLVE_ISSUE_WITH_ASSERTS +#endif + namespace Homme { @@ -35,10 +39,38 @@ class Diagnostics ExecViewUnmanaged dpnh_dp_i; }; +//In debug regime with asserts default team sizes are too big. +#if defined RESOLVE_ISSUE_WITH_ASSERTS + template + typename std::enable_if::value == false, + Kokkos::TeamPolicy >::type + d_team_policy(const int num_exec) { + return Homme::get_default_team_policy(num_exec); + } + + template + typename std::enable_if::value == true, + Kokkos::TeamPolicy >::type + d_team_policy(const int num_exec) { + ThreadPreferences tp; + tp.max_threads_usable = 8; //16 + tp.max_vectors_usable = 32; //32 + tp.prefer_larger_team = true; + return Homme::get_default_team_policy(num_exec, tp); + } +#endif + + + public: + Diagnostics (const int num_elems, const bool theta_hydrostatic_mode) : +#if ! defined(RESOLVE_ISSUE_WITH_ASSERTS) m_policy(Homme::get_default_team_policy(num_elems)), +#else + m_policy(d_team_policy(num_elems)), +#endif m_tu(m_policy), m_num_elems(num_elems), m_theta_hydrostatic_mode(theta_hydrostatic_mode) diff --git a/components/homme/src/theta-l_kokkos/cxx/cxx_f90_interface_theta.cpp b/components/homme/src/theta-l_kokkos/cxx/cxx_f90_interface_theta.cpp index 6a050b9b74ab..9d8c0c5959ab 100644 --- a/components/homme/src/theta-l_kokkos/cxx/cxx_f90_interface_theta.cpp +++ b/components/homme/src/theta-l_kokkos/cxx/cxx_f90_interface_theta.cpp @@ -342,7 +342,9 @@ void init_functors_c (const bool& allocate_buffer) // use the create_if_not_there() function. auto& caar = c.create_if_not_there(elems,tracers,ref_FE,hvcoord,sph_op,params); if (params.transport_alg == 0) c.create_if_not_there(); +#ifdef HOMME_ENABLE_COMPOSE else c.create_if_not_there(); +#endif auto& hvf = c.create_if_not_there(); auto& ff = c.create_if_not_there(); auto& diag = c.create_if_not_there (elems.num_elems(),params.theta_hydrostatic_mode); @@ -360,8 +362,10 @@ void init_functors_c (const bool& allocate_buffer) auto& esf = c.get(); if (esf.setup_needed()) esf.setup(); } else { +#ifdef HOMME_ENABLE_COMPOSE auto& ct = c.get(); if (ct.setup_needed()) ct.setup(); +#endif } if (hvf.setup_needed()) { hvf.setup(geometry, state, derived); @@ -389,8 +393,10 @@ void init_functors_c (const bool& allocate_buffer) fbm.request_size(caar.requested_buffer_size()); if (params.transport_alg == 0) fbm.request_size(c.get().requested_buffer_size()); - else +#ifdef HOMME_ENABLE_COMPOSE + else fbm.request_size(c.get().requested_buffer_size()); +#endif fbm.request_size(hvf.requested_buffer_size()); fbm.request_size(diag.requested_buffer_size()); fbm.request_size(ff.requested_buffer_size()); @@ -407,8 +413,10 @@ void init_functors_c (const bool& allocate_buffer) caar.init_buffers(fbm); if (params.transport_alg == 0) Context::singleton().get().init_buffers(fbm); +#ifdef HOMME_ENABLE_COMPOSE else Context::singleton().get().init_buffers(fbm); +#endif hvf.init_buffers(fbm); diag.init_buffers(fbm); ff.init_buffers(fbm); @@ -562,9 +570,11 @@ void init_boundary_exchanges_c () esf.reset(params); esf.init_boundary_exchanges(); } else { +#ifdef HOMME_ENABLE_COMPOSE auto& ct = c.get(); ct.reset(params); ct.init_boundary_exchanges(); +#endif } // RK stages BE's diff --git a/components/homme/src/theta-l_kokkos/theta_f2c_mod.F90 b/components/homme/src/theta-l_kokkos/theta_f2c_mod.F90 index 4f0277a45114..17aa79690d6f 100644 --- a/components/homme/src/theta-l_kokkos/theta_f2c_mod.F90 +++ b/components/homme/src/theta-l_kokkos/theta_f2c_mod.F90 @@ -173,8 +173,9 @@ subroutine prim_run_subcycle_c(tstep,nstep,nm1,n0,np1,next_output_step,nsplit_it ! ! Inputs ! - integer(kind=c_int), intent(in) :: nstep, nm1, n0, np1, next_output_step, nsplit_iter - real (kind=c_double), intent(in) :: tstep + integer(kind=c_int), intent(inout) :: nstep, nm1, n0, np1 + integer(kind=c_int), intent(in) :: next_output_step, nsplit_iter + real (kind=c_double), intent(in) :: tstep end subroutine prim_run_subcycle_c ! Copy results from C++ views back to f90 arrays diff --git a/components/homme/test/unit_tests/CMakeLists.txt b/components/homme/test/unit_tests/CMakeLists.txt index 48b3065b6062..d23262560759 100644 --- a/components/homme/test/unit_tests/CMakeLists.txt +++ b/components/homme/test/unit_tests/CMakeLists.txt @@ -15,7 +15,8 @@ macro(cxx_unit_test target_name target_f90_srcs target_cxx_srcs include_dirs con #ENDIF() SET_TESTS_PROPERTIES(${target_name}_test PROPERTIES LABELS "unit") - TARGET_LINK_LIBRARIES(${target_name} timing ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} kokkos) + TARGET_LINK_LIBRARIES(${target_name} timing ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) + link_to_kokkos(${target_name}) IF (HOMME_USE_MKL) TARGET_COMPILE_OPTIONS (${target_name} PUBLIC -mkl) TARGET_LINK_LIBRARIES (${target_name} -mkl) @@ -38,6 +39,8 @@ macro(cxx_unit_test target_name target_f90_srcs target_cxx_srcs include_dirs con ELSE () TARGET_LINK_LIBRARIES(${target_name} pio) ENDIF () + ELSE () + ADD_DEFINITIONS(-DHOMME_WITHOUT_PIOLIBRARY) ENDIF () IF (NOT "${target_f90_srcs}" EQUAL "") diff --git a/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt b/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt index 8ace99f9c596..3e01da434388 100644 --- a/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt +++ b/components/homme/test_execs/preqx_kokkos_ut/CMakeLists.txt @@ -36,8 +36,8 @@ ADD_LIBRARY(preqx_kokkos_ut_lib TARGET_INCLUDE_DIRECTORIES(preqx_kokkos_ut_lib PUBLIC ${EXEC_INCLUDE_DIRS}) TARGET_INCLUDE_DIRECTORIES(preqx_kokkos_ut_lib PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) TARGET_COMPILE_DEFINITIONS(preqx_kokkos_ut_lib PUBLIC "HAVE_CONFIG_H") -TARGET_LINK_LIBRARIES(preqx_kokkos_ut_lib kokkos csm_share) -TARGET_LINK_LIBRARIES(preqx_kokkos_ut_lib timing ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) +link_to_kokkos(preqx_kokkos_ut_lib) +TARGET_LINK_LIBRARIES(preqx_kokkos_ut_lib timing csm_share ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) IF (HOMME_USE_MKL) TARGET_LINK_LIBRARIES (preqx_kokkos_ut_lib -mkl) ENDIF() diff --git a/components/homme/test_execs/share_kokkos_ut/CMakeLists.txt b/components/homme/test_execs/share_kokkos_ut/CMakeLists.txt index dcb8b0460832..35a25989f0ad 100644 --- a/components/homme/test_execs/share_kokkos_ut/CMakeLists.txt +++ b/components/homme/test_execs/share_kokkos_ut/CMakeLists.txt @@ -8,8 +8,8 @@ SET(UTILS_TIMING_DIR ${HOMME_SOURCE_DIR}/utils/cime/CIME/non_py/src/timing) # Note: need CUDA_BUILD and HOMMEXX_BFB_TESTING here, since the share # unit tests do not include a config.h file SET (COMMON_DEFINITIONS NP=4 NC=4) -IF (CUDA_BUILD) - SET(COMMON_DEFINITIONS ${COMMON_DEFINITIONS} CUDA_BUILD) +IF (CUDA_BUILD OR HIP_BUILD) + SET(COMMON_DEFINITIONS ${COMMON_DEFINITIONS} HOMMEXX_ENABLE_GPU) ENDIF() IF (HOMMEXX_BFB_TESTING) SET(COMMON_DEFINITIONS ${COMMON_DEFINITIONS} HOMMEXX_BFB_TESTING) diff --git a/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt b/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt index e3cb502e5a0b..065672100b73 100644 --- a/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt +++ b/components/homme/test_execs/thetal_kokkos_ut/CMakeLists.txt @@ -39,8 +39,8 @@ ADD_LIBRARY(thetal_kokkos_ut_lib TARGET_INCLUDE_DIRECTORIES(thetal_kokkos_ut_lib PUBLIC ${EXEC_INCLUDE_DIRS}) TARGET_INCLUDE_DIRECTORIES(thetal_kokkos_ut_lib PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) TARGET_COMPILE_DEFINITIONS(thetal_kokkos_ut_lib PUBLIC "HAVE_CONFIG_H") -TARGET_LINK_LIBRARIES(thetal_kokkos_ut_lib kokkos csm_share) -TARGET_LINK_LIBRARIES(thetal_kokkos_ut_lib timing ${COMPOSE_LIBRARY_CPP} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) +link_to_kokkos(thetal_kokkos_ut_lib) +TARGET_LINK_LIBRARIES(thetal_kokkos_ut_lib timing csm_share ${COMPOSE_LIBRARY_CPP} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) IF (HOMME_USE_MKL) TARGET_LINK_LIBRARIES (thetal_kokkos_ut_lib -mkl) ENDIF() diff --git a/components/homme/utils/csm_share/CMakeLists.txt b/components/homme/utils/csm_share/CMakeLists.txt index 41dade8bf8ff..b5f9bf5c41c1 100644 --- a/components/homme/utils/csm_share/CMakeLists.txt +++ b/components/homme/utils/csm_share/CMakeLists.txt @@ -28,7 +28,7 @@ if (NOT TARGET csm_share) add_custom_command ( OUTPUT ${SRC_FILE_OUT} COMMAND ${GENF90} ${SRC_FILE} > ${SRC_FILE_OUT} - DEPENDS ${SRC_FILE} genf90) + DEPENDS ${SRC_FILE}) list(APPEND CSM_SHARE_SOURCES ${SRC_FILE_OUT}) endforeach ()