Skip to content
15 changes: 9 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,14 +95,17 @@ option (EKAT_DISABLE_TPL_WARNINGS "Whether we should suppress warnings when comp
set (EKAT_TPL_LIBRARIES_INTERNAL)

# A good chunk of EKAT is a utility layer over kokkos
BuildKokkos()
list (APPEND EKAT_TPL_LIBRARIES_INTERNAL ${Kokkos_LIBRARIES})
# Note: for SYCL/HIP backend, bleeding edge of kokkos/develop might be needed to test some functionality
# Kokkos_ROOT can point to custom installation directory
if (Kokkos_ROOT)
find_package(Kokkos REQUIRED)
Comment thread
bartgol marked this conversation as resolved.
else()
BuildKokkos()
list (APPEND EKAT_TPL_LIBRARIES_INTERNAL ${Kokkos_LIBRARIES})
endif()

set (EKAT_ENABLE_GPU False)
if (Kokkos_ENABLE_CUDA)
set (EKAT_ENABLE_GPU True)
endif ()
if (Kokkos_ENABLE_HIP)
if (Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_HIP OR Kokkos_ENABLE_SYCL)
set (EKAT_ENABLE_GPU True)
endif ()

Expand Down
6 changes: 5 additions & 1 deletion cmake/EkatSetCompilerFlags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ macro (SetWarningFlags)
set (FFLAGS -Wall)
set (CXXFLAGS -Wall)
endif()

SetFlags(FFLAGS ${FFLAGS} CFLAGS -Wall CXXFLAGS ${CXXFLAGS})
endmacro()

Expand Down Expand Up @@ -217,6 +217,8 @@ macro (SetReleaseFlags)
string (APPEND CMAKE_CXX_FLAGS_RELEASE " -fopenmp-simd")
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
string (APPEND CMAKE_CXX_FLAGS_RELEASE " -qopenmp-simd")
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
string (APPEND CMAKE_CXX_FLAGS_RELEASE " -fsycl -fsycl-unnamed-lambda -sycl-std=2020 -qopenmp-simd -Wsycl-strict -fsycl-device-code-split=per_kernel")
endif()
endif ()
endif ()
Expand Down Expand Up @@ -275,6 +277,8 @@ macro (SetDebugFlags)
string (APPEND CMAKE_CXX_FLAGS_DEBUG " -fopenmp-simd")
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
string (APPEND CMAKE_CXX_FLAGS_DEBUG " -qopenmp-simd")
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
string (APPEND CMAKE_CXX_FLAGS_DEBUG " -qopenmp-simd -fsycl -fsycl-unnamed-lambda -sycl-std=2020 -Wsycl-strict -fno-sycl-dead-args-optimization")
endif()
endif ()
endif ()
Expand Down
3 changes: 3 additions & 0 deletions cmake/machine-files/jlse-gen9.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Load gen9 arch with SYCL backend for kokkos
set(CMAKE_CXX_STANDARD 17)
include (${CMAKE_CURRENT_LIST_DIR}/kokkos/intel-gen9.cmake)
2 changes: 2 additions & 0 deletions cmake/machine-files/jlse-xehp.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Load XEHP arch with SYCL backend for kokkos
include (${CMAKE_CURRENT_LIST_DIR}/kokkos/intel-xehp.cmake)
5 changes: 5 additions & 0 deletions cmake/machine-files/kokkos/intel-gen9.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
include (${CMAKE_CURRENT_LIST_DIR}/generic.cmake)

option(Kokkos_ARCH_INTEL_GEN9 "" ON)
set(Kokkos_ENABLE_SYCL TRUE CACHE BOOL "")
set(Kokkos_ENABLE_DEPRECATED_CODE_3 FALSE CACHE BOOL "")
5 changes: 5 additions & 0 deletions cmake/machine-files/kokkos/intel-xehp.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
include (${CMAKE_CURRENT_LIST_DIR}/generic.cmake)

option(Kokkos_ARCH_INTEL_XEHP "" ON)
set(Kokkos_ENABLE_SYCL TRUE CACHE BOOL "")
set(Kokkos_ENABLE_DEPRECATED_CODE_3 FALSE CACHE BOOL "")
7 changes: 6 additions & 1 deletion src/ekat/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,12 @@ set(EKAT_HEADERS
add_library(ekat ${EKAT_SOURCES})

# Link MPI
target_link_libraries (ekat PUBLIC MPI::MPI_C)
if (Kokkos_ROOT)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This if statement is not necessary if we do the mod to CMakeLists.txt suggested in the comment above.

# Link Kokkos in-addition if not using the submodule
target_link_libraries (ekat PUBLIC Kokkos::kokkos MPI::MPI_C)
else ()
target_link_libraries (ekat PUBLIC MPI::MPI_C)
endif ()

target_include_directories(ekat PUBLIC
$<BUILD_INTERFACE:${EKAT_SOURCE_DIR}/src>
Expand Down
4 changes: 3 additions & 1 deletion src/ekat/ekat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ static constexpr bool ekatBFB = false;
typedef Kokkos::Cuda EkatGpuSpace;
# elif defined KOKKOS_ENABLE_HIP
typedef Kokkos::Experimental::HIP EkatGpuSpace;
# elif defined KOKKOS_ENABLE_SYCL
typedef Kokkos::Experimental::SYCL EkatGpuSpace;
# else
error "EKAT does not recognize a GPU space other than Cuda and HIP."
error "EKAT does not recognize a GPU space other than Cuda, HIP and SYCL".
# endif
#endif

Expand Down
24 changes: 18 additions & 6 deletions src/ekat/ekat_assert.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,25 @@
} \
} while(0)

#define IMPL_KERNEL_THROW(condition, msg) \
do { \
if ( ! (condition) ) { \
printf("KERNEL CHECK FAILED:\n %s\n %s\n",#condition,msg); \
Kokkos::abort(""); \
} \
// SYCL cannot printf like the other backends quite yet
#ifdef __SYCL_DEVICE_ONLY__
#define IMPL_KERNEL_THROW(condition, msg) \
do { \
if ( ! (condition) ) { \
const __attribute__((opencl_constant)) char format[] = "KERNEL CHECK FAILED:\n %s\n"; \
Comment thread
abagusetty marked this conversation as resolved.
Outdated
sycl::ext::oneapi::experimental::printf(format,msg); \
Kokkos::abort(""); \
} \
} while (0)
#else
#define IMPL_KERNEL_THROW(condition, msg) \
do { \
if ( ! (condition) ) { \
printf("KERNEL CHECK FAILED:\n %s\n %s\n",#condition,msg); \
Kokkos::abort(""); \
} \
} while (0)
#endif

#ifndef NDEBUG
#define EKAT_ASSERT(condition) IMPL_THROW(condition, "", std::logic_error)
Expand Down
2 changes: 1 addition & 1 deletion src/ekat/ekat_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
// Decide whether ekat defaults to BFB behavior when possible/appropriate
#cmakedefine EKAT_FPE

// A GPU space has been enabled in Kokkos, e.g., CUDA or HIP.
// A GPU space has been enabled in Kokkos, e.g., CUDA or HIP OR SYCL.
#cmakedefine EKAT_ENABLE_GPU

#endif // EKAT_CONFIG_H
12 changes: 12 additions & 0 deletions src/ekat/ekat_session.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,18 @@ void initialize_kokkos () {
# elif defined KOKKOS_ENABLE_HIP
const auto ret = hipGetDeviceCount(&nd);
const bool ok = ret == hipSuccess;
# elif defined KOKKOS_ENABLE_SYCL
nd = 0;
auto gpu_devs = sycl::device::get_devices(sycl::info::device_type::gpu);
Comment thread
bartgol marked this conversation as resolved.
for (auto &dev : gpu_devs) {
if (dev.get_info<sycl::info::device::partition_max_sub_devices>() > 0) {
auto subDevs = dev.create_sub_devices<sycl::info::partition_property::partition_by_affinity_domain>(sycl::info::partition_affinity_domain::numa);
nd += subDevs.size();
} else {
nd++;
}
}
const bool ok = true;
# else
error "No valid GPU space, yet EKAT_ENABLE_GPU is defined."
# endif
Expand Down
8 changes: 6 additions & 2 deletions src/ekat/kokkos/ekat_kokkos_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,11 @@ namespace impl {
* detail, and, normally, should not be used by customer apps
*/
template <bool Serialize, typename TeamMember, typename Lambda, typename ValueType>
#ifdef KOKKOS_ENABLE_SYCL
KOKKOS_INLINE_FUNCTION SYCL_EXTERNAL

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do have lots of KOKKOS_INLINE_FUNCTION in EKAT. Why does this particular case require a special handling? And what does this macro do differently from KOKKOS_INLINE_FUNCTION?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a bit of limitation on how static device-side functions and linking can be setup with SYCL. There are majorly two changes suggested for this function alone and has nothing to do with KOKKOS_INLINE_FUNCTION.

Regarding the need of SYCL_EXTERNAL: In the case of linking C++ functions to a SYCL application, where the definitions are not available in the same translation unit of the compiler, then the macro SYCL_EXTERNAL has to be provided. More info here

I have made some changes to cleanup this. Thanks.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I misread the two macros as just a single one.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But still, why do we need external here? The parallel_reduce fcn is defined inline here, so it should be available in all translation units. Am I missing something?

@abagusetty abagusetty Jul 5, 2022

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is exactly right. The definition available to all other translation units during linking. Mostly a temporary one, this will be improved in the future revisions of the specifications and the compiler implementations.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But then we should not need SYCL_EXTERNAL at all, right?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Making the function static was being problematic to device-side kernel. SYCL_EXTERNAL attr and removing static was suggested. I have slightly modified the function signature to look as below, please let me know if that looks okay

template <bool Serialize, typename TeamMember, typename Lambda, typename ValueType>
#ifdef KOKKOS_ENABLE_SYCL
SYCL_EXTERNAL
#else
static
#endif
KOKKOS_INLINE_FUNCTION
void parallel_reduce (const TeamMember& team,
                            const int& begin, // pack index                                                                                                                                
                            const int& end, // pack index                                                                                                                                  
                            const Lambda& lambda,
                            ValueType& result)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you saying that SYCL complains about the function being static, and requires the external keyword to compile? Either I am not understanding SYCL, or this seems wrong. The "inline" and "extern" keywords are somewhat mutually exclusive, so I am surprised that SYCL needs that macro. The documentation suggests that it is needed if the definition is not available to other TU's, but since it's inlined, it should be available as soon as this header is included.

Anyhow, if you say that it does not compile without that macro, then it's fine. I'm just not really happy with this confusing detail.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the confusion. The need for SYCL_EXTERNAL is not required any more. There has been a mix-up with compiler versions at my end. Thanks for pointing this out.

#else
static KOKKOS_INLINE_FUNCTION
#endif
void parallel_reduce (const TeamMember& team,
const int& begin, // pack index
const int& end, // pack index
Expand Down Expand Up @@ -88,7 +92,7 @@ void parallel_reduce (const TeamMember& team,
#ifdef EKAT_ENABLE_GPU
// Broadcast result to all threads by doing sum of one thread's
// non-0 value and the rest of the 0s.
TeamMember::vector_reduce(Kokkos::Sum<ValueType>(local_tmp));
team.vector_reduce(Kokkos::Sum<ValueType>(local_tmp));
#endif

result = local_tmp;
Expand Down Expand Up @@ -525,7 +529,7 @@ class TeamUtils<ValueType, Kokkos::OpenMP> : public TeamUtilsCommonBase<ValueTyp
#endif

/*
* Specialization for Cuda execution space.
* Specialization for CUDA, HIP and SYCL execution space.
*/
#ifdef EKAT_ENABLE_GPU
template <typename ValueType>
Expand Down
4 changes: 4 additions & 0 deletions src/ekat/util/ekat_arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ std::string ekat_config_string () {
<< " compiler id: " <<
#if defined __INTEL_COMPILER
"Intel\n"
#elif defined __INTEL_LLVM_COMPILER
"IntelLLVM\n"
#elif defined __HIPCC__
"AMD Clang\n"
#elif defined __GNUG__
"GCC\n"
#else
Expand Down
16 changes: 16 additions & 0 deletions src/ekat/util/ekat_tridiag.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,10 @@ int get_thread_id_within_team_gpu (const TeamMember& team) {
// Can't use team.team_rank() here because vector direction also uses physical
// threads but TeamMember types don't expose that information.
return blockDim.x * threadIdx.y + threadIdx.x;
#elif defined(__SYCL_DEVICE_ONLY__)
auto item = team.item();
return static_cast<int>(item.get_local_range(1) * item.get_local_id(0)
+ item.get_local_id(1));
#else
assert(0);
return -1;
Expand All @@ -133,6 +137,9 @@ template <typename TeamMember> KOKKOS_FORCEINLINE_FUNCTION
int get_team_nthr_gpu (const TeamMember& team) {
#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__
return blockDim.x * blockDim.y;
#elif defined __SYCL_DEVICE_ONLY__
auto item = team.item();
return static_cast<int>(item.get_local_range(0) * item.get_local_range(1));
#else
assert(0);
return -1;
Expand All @@ -157,6 +164,15 @@ int get_team_nthr (const Kokkos::Impl::HIPTeamMember& team)
{ return get_team_nthr_gpu(team); }
#endif // KOKKOS_ENABLE_HIP

#ifdef KOKKOS_ENABLE_SYCL
KOKKOS_FORCEINLINE_FUNCTION
int get_thread_id_within_team (const Kokkos::Impl::SYCLTeamMember& team)
{ return get_thread_id_within_team_gpu(team); }
KOKKOS_FORCEINLINE_FUNCTION
int get_team_nthr (const Kokkos::Impl::SYCLTeamMember& team)
{ return get_team_nthr_gpu(team); }
#endif // KOKKOS_ENABLE_SYCL

// The caller must provide the team_barrier after this function returns before A
// is accessed.
template <typename TeamMember, typename TridiagDiag>
Expand Down
8 changes: 6 additions & 2 deletions tests/kokkos/kokkos_utils_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ void test_view_reduction(const Scalar a=Scalar(0.0), const int begin=0, const in
using Device = ekat::DefaultDevice;
using MemberType = typename ekat::KokkosTypes<Device>::MemberType;
using ExeSpace = typename ekat::KokkosTypes<Device>::ExeSpace;

using PackType = ekat::Pack<Scalar, VectorSize>;
using ViewType = Kokkos::View<PackType*,ExeSpace>;

Expand All @@ -267,7 +267,7 @@ void test_view_reduction(const Scalar a=Scalar(0.0), const int begin=0, const in
Scalar serial_result = Scalar(a);
ViewType data("data", view_length);
const auto data_h = Kokkos::create_mirror_view(data);
auto raw = data_h.data();
auto raw = data_h.data();
for (int k = 0; k < view_length; ++k) {
for (int p = 0; p < VectorSize; ++p) {
const int scalar_index = k*VectorSize+p;
Expand All @@ -292,7 +292,11 @@ void test_view_reduction(const Scalar a=Scalar(0.0), const int begin=0, const in
int team_size = ExeSpace::concurrency();
#ifdef EKAT_ENABLE_GPU
ExeSpace temp_space;
#ifdef KOKKOS_ENABLE_SYCL
auto num_sm = temp_space.impl_internal_space_instance()->m_queue->get_device().get_info<sycl::info::device::max_compute_units>();
#else
auto num_sm = temp_space.impl_internal_space_instance()->m_multiProcCount;
#endif
team_size /= (ekat::is_single_precision<Real>::value ? num_sm*64 : num_sm*32);
#endif

Expand Down