diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk index 41da22f8cb2f4..72dd0fe6c989e 100644 --- a/make/modules/java.base/Lib.gmk +++ b/make/modules/java.base/Lib.gmk @@ -187,6 +187,7 @@ ifeq ($(ENABLE_FALLBACK_LINKER), true) TARGETS += $(BUILD_LIBFALLBACKLINKER) endif +SIMDSORT_BASE_DIR := $(TOPDIR)/src/java.base/linux/native/libsimdsort ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)+$(filter $(TOOLCHAIN_TYPE), gcc), true+true+true+gcc) ############################################################################## ## Build libsimdsort @@ -196,6 +197,7 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2) NAME := simdsort, \ LINK_TYPE := C++, \ OPTIMIZATION := HIGH, \ + SRC := $(SIMDSORT_BASE_DIR)/x86, \ CXXFLAGS := -std=c++17, \ DISABLED_WARNINGS_gcc := unused-variable, \ LIBS_linux := $(LIBM), \ @@ -204,4 +206,21 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2) TARGETS += $(BUILD_LIBSIMD_SORT) endif +ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, aarch64)+$(INCLUDE_COMPILER2)+$(filter $(TOOLCHAIN_TYPE), gcc), true+true+true+gcc) + $(eval $(call SetupJdkLibrary, BUILD_LIBSIMD_SORT, \ + NAME := simdsort, \ + TOOLCHAIN := TOOLCHAIN_LINK_CXX, \ + OPTIMIZATION := HIGH, \ + SRC := $(SIMDSORT_BASE_DIR)/aarch64, \ + CFLAGS := $(CFLAGS_JDKLIB) -march=armv8.2-a+sve, \ + CXXFLAGS := $(CXXFLAGS_JDKLIB) -march=armv8.2-a+sve -std=c++17, \ + LDFLAGS := $(LDFLAGS_JDKLIB) \ + $(call SET_SHARED_LIBRARY_ORIGIN), \ + LIBS := $(LIBCXX), \ + DISABLED_WARNINGS_gcc := unused-variable, \ + LIBS_linux := -lc -lm -ldl, \ + )) + + TARGETS += $(BUILD_LIBSIMD_SORT) +endif ################################################################################ diff --git a/src/hotspot/cpu/aarch64/globals_aarch64.hpp b/src/hotspot/cpu/aarch64/globals_aarch64.hpp index 8e520314c8b6f..4f6451462aa3f 100644 --- a/src/hotspot/cpu/aarch64/globals_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/globals_aarch64.hpp @@ -127,7 +127,9 @@ define_pd_global(intx, InlineSmallCode, 1000); "Branch Protection to use: none, standard, pac-ret") \ product(bool, AlwaysMergeDMB, true, DIAGNOSTIC, \ "Always merge DMB instructions in code emission") \ - + product(bool, UseSVELibSimdSortForFP, false, EXPERIMENTAL, \ + "Use SVE-based LibSimdSort for float type on SVE supporting " \ + "machines") \ // end of ARCH_FLAGS #endif // CPU_AARCH64_GLOBALS_AARCH64_HPP diff --git a/src/hotspot/cpu/aarch64/matcher_aarch64.hpp b/src/hotspot/cpu/aarch64/matcher_aarch64.hpp index 0fbc2ef141e8b..a0d98f3304afe 100644 --- a/src/hotspot/cpu/aarch64/matcher_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/matcher_aarch64.hpp @@ -197,6 +197,13 @@ // Is SIMD sort supported for this CPU? static bool supports_simd_sort(BasicType bt) { + // SIMD sort is supported only on SVE machines + if (VM_Version::supports_sve()) { + // Currently, only T_INT and T_FLOAT types are supported. + // However, T_FLOAT is supported only if the experimental + // flag - UseSVELibSimdSortForFP is enabled. 
+ return (bt == T_INT || (bt == T_FLOAT && UseSVELibSimdSortForFP)); + } return false; } diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 7e2f333ba4086..8025aa0c9331e 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -11873,6 +11873,10 @@ class StubGenerator: public StubCodeGenerator { StubRoutines::_montgomerySquare = g.generate_multiply(); } + // Load sve_sort library on supported hardware to enable SIMD sort and partition intrinsics + if (VM_Version::supports_sve()) { + (void)StubRoutines::try_load_simdsort("sve_sort", "sve_partition"); + } #endif // COMPILER2 if (UseChaCha20Intrinsics) { diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index efb0411aa39df..0ba451166de6a 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -4303,22 +4303,10 @@ void StubGenerator::generate_compiler_stubs() { // Load x86_64_sort library on supported hardware to enable SIMD sort and partition intrinsics if (VM_Version::supports_avx512dq() || VM_Version::supports_avx2()) { - void *libsimdsort = nullptr; - char ebuf_[1024]; - char dll_name_simd_sort[JVM_MAXPATHLEN]; - if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort), Arguments::get_dll_dir(), "simdsort")) { - libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_); - } - // Get addresses for SIMD sort and partition routines - if (libsimdsort != nullptr) { - log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "simdsort" JNI_LIB_SUFFIX, p2i(libsimdsort)); - - os::snprintf_checked(ebuf_, sizeof(ebuf_), VM_Version::supports_avx512_simd_sort() ? "avx512_sort" : "avx2_sort"); - StubRoutines::_array_sort = (address)os::dll_lookup(libsimdsort, ebuf_); - - os::snprintf_checked(ebuf_, sizeof(ebuf_), VM_Version::supports_avx512_simd_sort() ? "avx512_partition" : "avx2_partition"); - StubRoutines::_array_partition = (address)os::dll_lookup(libsimdsort, ebuf_); - } + const bool use_avx512 = VM_Version::supports_avx512_simd_sort(); + const char* sort_sym = use_avx512 ? "avx512_sort" : "avx2_sort"; + const char* partition_sym = use_avx512 ? 
"avx512_partition" : "avx2_partition"; + (void)StubRoutines::try_load_simdsort(sort_sym, partition_sym); } #endif // COMPILER2 diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp index 5246613738e46..3131965bd63b1 100644 --- a/src/hotspot/share/runtime/stubRoutines.cpp +++ b/src/hotspot/share/runtime/stubRoutines.cpp @@ -469,6 +469,38 @@ StubRoutines::select_arraycopy_function(BasicType t, bool aligned, bool disjoint #undef RETURN_STUB_PARM } +bool StubRoutines::try_load_simdsort(const char* sort_sym, const char* partition_sym) { + void* libsimdsort = nullptr; + char ebuf_[1024]; + char dll_name_simd_sort[JVM_MAXPATHLEN]; + + if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort), + Arguments::get_dll_dir(), "simdsort")) { + libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_); + } + + if (libsimdsort == nullptr) { + return false; + } + + // Get addresses for SIMD sort and partition routines + log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, + JNI_LIB_PREFIX "simdsort" JNI_LIB_SUFFIX, p2i(libsimdsort)); + address sort_addr = (address)os::dll_lookup(libsimdsort, sort_sym); + address partition_addr = (address)os::dll_lookup(libsimdsort, partition_sym); + + if (sort_addr == nullptr || partition_addr == nullptr) { + log_warning(library)("libsimdsort missing symbols: %s=" INTPTR_FORMAT ", %s=" INTPTR_FORMAT, + sort_sym, p2i(sort_addr), partition_sym, p2i(partition_addr)); + // If either of the addresses are null, return false. + return false; + } + + StubRoutines::_array_sort = sort_addr; + StubRoutines::_array_partition = partition_addr; + return true; +} + UnsafeMemoryAccessMark::UnsafeMemoryAccessMark(StubCodeGenerator* cgen, bool add_entry, bool continue_at_scope_end, address error_exit_pc) { _cgen = cgen; _ucm_entry = nullptr; diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp index 97e3e46b87063..edcbad64a19a6 100644 --- a/src/hotspot/share/runtime/stubRoutines.hpp +++ b/src/hotspot/share/runtime/stubRoutines.hpp @@ -28,6 +28,7 @@ #include "code/codeBlob.hpp" #include "memory/allocation.hpp" #include "prims/vectorSupport.hpp" +#include "runtime/arguments.hpp" #include "runtime/frame.hpp" #include "runtime/mutexLocker.hpp" #include "runtime/stubCodeGenerator.hpp" @@ -362,6 +363,9 @@ class StubRoutines: AllStatic { static void arrayof_oop_copy (HeapWord* src, HeapWord* dest, size_t count); static void arrayof_oop_copy_uninit(HeapWord* src, HeapWord* dest, size_t count); + // SIMD sort support. This method resolves the symbols - sort_sym, partition_sym + // and on success sets the StubRoutines::_array_sort/_array_partition and returns true. + static bool try_load_simdsort(const char* sort_sym, const char* partition_sym); }; #endif // SHARE_RUNTIME_STUBROUTINES_HPP diff --git a/src/java.base/linux/native/libsimdsort/aarch64/pivot-selection.hpp b/src/java.base/linux/native/libsimdsort/aarch64/pivot-selection.hpp new file mode 100644 index 0000000000000..848f8a8562d7d --- /dev/null +++ b/src/java.base/linux/native/libsimdsort/aarch64/pivot-selection.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. + * Copyright (c) 2021 Serge Sans Paille. All rights reserved. + * Copyright 2025 Arm Limited and/or its affiliates. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef AARCH64_SVE_PIVOT_SELECTION_HPP
+#define AARCH64_SVE_PIVOT_SELECTION_HPP
+
+#include <algorithm>
+#include "sve-config.hpp"
+
+/* The current pivot selection method follows the median-of-three method.
+ * A possible improvement could be the use of a sorting network (compare-and-exchange
+ * sorting) for larger arrays.
+ */
+
+template <typename type_t>
+static inline type_t get_pivot_blocks(type_t* arr, const arrsize_t left, const arrsize_t right) {
+    const arrsize_t len = right - left;
+    if (len < 64) return arr[left];
+
+    const arrsize_t mid = left + (len / 2);
+    const type_t a = arr[left];
+    const type_t b = arr[mid];
+    const type_t c = arr[right - 1];
+
+    const type_t min_ab = std::min(a, b);
+    const type_t max_ab = std::max(a, b);
+
+    return std::min(max_ab, std::max(min_ab, c));
+}
+
+#endif // AARCH64_SVE_PIVOT_SELECTION_HPP
diff --git a/src/java.base/linux/native/libsimdsort/aarch64/simdsort-support.hpp b/src/java.base/linux/native/libsimdsort/aarch64/simdsort-support.hpp
new file mode 100644
index 0000000000000..4773332f31281
--- /dev/null
+++ b/src/java.base/linux/native/libsimdsort/aarch64/simdsort-support.hpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2023 Intel Corporation. All rights reserved.
+ * Copyright 2025 Arm Limited and/or its affiliates.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ * + */ + +#ifndef SIMDSORT_SUPPORT_HPP +#define SIMDSORT_SUPPORT_HPP +#include +#include + +#undef assert +#define assert(cond, msg) { if (!(cond)) { fprintf(stderr, "assert fails %s %d: %s\n", __FILE__, __LINE__, msg); abort(); }} + +// GCC >= 10.1 is required for a full support of ARM SVE ACLE intrinsics (which also includes the header file - arm_sve.h) +#if defined(__aarch64__) && defined(_LP64) && defined(__GNUC__) && \ + ((__GNUC__ > 10) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 1)) +#define __SIMDSORT_SUPPORTED_LINUX +#endif + +#endif //SIMDSORT_SUPPORT_HPP diff --git a/src/java.base/linux/native/libsimdsort/aarch64/sve-common-qsort.hpp b/src/java.base/linux/native/libsimdsort/aarch64/sve-common-qsort.hpp new file mode 100644 index 0000000000000..d4f799b9eb858 --- /dev/null +++ b/src/java.base/linux/native/libsimdsort/aarch64/sve-common-qsort.hpp @@ -0,0 +1,518 @@ +/* + * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. + * Copyright (c) 2021 Serge Sans Paille. All rights reserved. + * Copyright 2025 Arm Limited and/or its affiliates. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef AARCH64_SVE_COMMON_QSORT_HPP +#define AARCH64_SVE_COMMON_QSORT_HPP +#include +#include +#include +#include + +#include "sve-config.hpp" +#include "classfile_constants.h" +#include "simdsort-support.hpp" +#include "sve-qsort.hpp" +#include "pivot-selection.hpp" +#include "sve-oet-sort.hpp" + +template +bool sve_comparison_func_ge(const T &a, const T &b) { + return a < b; +} + +template +bool sve_comparison_func_gt(const T &a, const T &b) { + return a <= b; +} + +/* + * Partitions a single SIMD vector based on a pivot and returns the number + * of lanes greater than or equal to the pivot. + */ +template +SVE_SORT_INLINE arrsize_t partition_vec(type_t *l_store, type_t *r_store, + const reg_t curr_vec, + const reg_t pivot_vec, + reg_t &smallest_vec, + reg_t &biggest_vec, bool use_gt) { + typename vtype::opmask_t mask; + if (use_gt) { + mask = vtype::gt(curr_vec, pivot_vec); + } else { + mask = vtype::ge(curr_vec, pivot_vec); + } + + int amount_ge_pivot = vtype::double_compressstore(l_store, r_store, mask, curr_vec); + + smallest_vec = vtype::min(curr_vec, smallest_vec); + biggest_vec = vtype::max(curr_vec, biggest_vec); + + return amount_ge_pivot; +} + +/* + * Partition an array based on the pivot and returns the index of the + * first element that is greater than or equal to the pivot. 
+ */ +template +SVE_SORT_INLINE arrsize_t sve_vect_partition_(type_t *arr, arrsize_t left, + arrsize_t right, type_t pivot, + type_t *smallest, + type_t *biggest, + bool use_gt) { + auto comparison_func = use_gt ? sve_comparison_func_gt : sve_comparison_func_ge; + + // Store the number of lanes in a local variable + const arrsize_t num_lanes = vtype::numlanes(); + + /* make array length divisible by num_lanes, shortening the array */ + for (int32_t i = (right - left) % num_lanes; i > 0; --i) { + *smallest = std::min(*smallest, arr[left], comparison_func); + *biggest = std::max(*biggest, arr[left], comparison_func); + + if (!comparison_func(arr[left], pivot)) { + std::swap(arr[left], arr[--right]); + } else { + ++left; + } + } + + if (left == right) + return left; /* less than num_lanes elements in the array */ + + using reg_t = typename vtype::reg_t; + + reg_t pivot_vec = vtype::set1(pivot); + reg_t min_vec = vtype::set1(*smallest); + reg_t max_vec = vtype::set1(*biggest); + + // If there is only num_lanes worth of elements to be sorted + if (right - left == num_lanes) { + reg_t vec = vtype::loadu(arr + left); + arrsize_t l_store = left; + arrsize_t r_store = l_store; + + arrsize_t amount_ge_pivot = partition_vec(arr + l_store, + arr + r_store, + vec, pivot_vec, min_vec, max_vec, use_gt); + + l_store += (num_lanes - amount_ge_pivot); + *smallest = vtype::reducemin(min_vec); + *biggest = vtype::reducemax(max_vec); + + return l_store; + } + + // first and last num_lanes values are partitioned at the end + reg_t vec_left = vtype::loadu(arr + left); + reg_t vec_right = vtype::loadu(arr + (right - num_lanes)); + + // store points of the vectors + arrsize_t l_store = left; + arrsize_t r_store = right - num_lanes; + + // indices for loading the elements + left += num_lanes; + right -= num_lanes; + + while (right - left != 0) { + reg_t curr_vec; + /* + * if fewer elements are stored on the right side of the array, + * then next elements are loaded from the right side, + * otherwise from the left side + */ + if ((r_store + num_lanes) - right < left - l_store) { + right -= num_lanes; + curr_vec = vtype::loadu(arr + right); + } else { + curr_vec = vtype::loadu(arr + left); + left += num_lanes; + } + // partition the current vector and save it on both sides of the array + arrsize_t amount_ge_pivot = partition_vec(arr + l_store, + arr + r_store, + curr_vec, pivot_vec, min_vec, max_vec, use_gt); + l_store += (num_lanes - amount_ge_pivot); + r_store -= amount_ge_pivot; + } + + /* partition and save vec_left and vec_right */ + arrsize_t amount_ge_pivot = partition_vec(arr + l_store, + arr + r_store, + vec_left, pivot_vec, min_vec, max_vec, use_gt); + l_store += (num_lanes - amount_ge_pivot); + r_store -= amount_ge_pivot; + + + amount_ge_pivot = partition_vec(arr + l_store, + arr + r_store, + vec_right, pivot_vec, min_vec, max_vec, use_gt); + l_store += (num_lanes - amount_ge_pivot); + r_store -= amount_ge_pivot; + + *smallest = vtype::reducemin(min_vec); + *biggest = vtype::reducemax(max_vec); + + return l_store; +} + +// Process a single vector for partitioning +template +SVE_SORT_INLINE void sve_partition_single_vec(type_t* arr, + arrsize_t& l_store, + arrsize_t& r_store, + typename vtype::reg_t v, + typename vtype::reg_t pivot_vec, + typename vtype::reg_t& min_vec, + typename vtype::reg_t& max_vec, + bool use_gt, arrsize_t num_lanes) { + arrsize_t amount_ge_pivot = partition_vec(arr + l_store, + arr + r_store, + v, pivot_vec, min_vec, max_vec, use_gt); + + l_store += num_lanes - amount_ge_pivot; + 
r_store -= amount_ge_pivot; +} + +// Unrolled version of sve_vect_partition_() with an UNROLL_FACTOR of either 2 or 4 +// The UNROLL_FACTOR is 2 if the vector length <= 16B and it is 4 if the vector length > 16B +template +SVE_SORT_INLINE arrsize_t +sve_partition_unrolled(type_t* arr, arrsize_t left, arrsize_t right, + type_t pivot, type_t* smallest, type_t* biggest, bool use_gt) { + static_assert(UNROLL_FACTOR == 2 || UNROLL_FACTOR == 4, "unsupported unroll factor"); + + const arrsize_t num_lanes = vtype::numlanes(); + + if constexpr (UNROLL_FACTOR == 0) { + return sve_vect_partition_(arr, left, right, pivot, smallest, biggest, use_gt); + } + + // use regular partition routine for small arrays + if (right - left < 3 * UNROLL_FACTOR * num_lanes) { + return sve_vect_partition_(arr, left, right, pivot, smallest, biggest, use_gt); + } + + auto comparison_func = use_gt ? sve_comparison_func_gt + : sve_comparison_func_ge; + + // make array length divisible by num_lanes, shortening the array + for (int32_t i = (right - left) % num_lanes; i > 0; --i) { + *smallest = std::min(*smallest, arr[left], comparison_func); + *biggest = std::max(*biggest, arr[left], comparison_func); + if (!comparison_func(arr[left], pivot)) { + std::swap(arr[left], arr[--right]); + } else { + ++left; + } + } + + arrsize_t l_store = left; + arrsize_t r_store = right - num_lanes; + + using reg_t = typename vtype::reg_t; + reg_t pivot_vec = vtype::set1(pivot); + reg_t min_vec = vtype::set1(*smallest); + reg_t max_vec = vtype::set1(*biggest); + + /* Calculate and load more registers to make the rest of the array a + * multiple of num_unroll. These registers will be partitioned at the very + * end. */ + int vecsToPartition = ((right - left) / num_lanes) % UNROLL_FACTOR; + +#define SVE_UNROLL_APPLY(OP) \ + do { \ + if constexpr (UNROLL_FACTOR >= 1) { OP(0); } \ + if constexpr (UNROLL_FACTOR >= 2) { OP(1); } \ + if constexpr (UNROLL_FACTOR >= 3) { OP(2); } \ + if constexpr (UNROLL_FACTOR >= 4) { OP(3); } \ + } while (false) + +#define SVE_DECLARE_REG_SET(NAME, INIT) \ + [[maybe_unused]] reg_t NAME##0 = (INIT); \ + [[maybe_unused]] reg_t NAME##1 = NAME##0; \ + [[maybe_unused]] reg_t NAME##2 = NAME##0; \ + [[maybe_unused]] reg_t NAME##3 = NAME##0 + +#define SVE_DECLARE_REG_SET_UNINIT(NAME) \ + reg_t NAME##0; \ + reg_t NAME##1; \ + reg_t NAME##2; \ + reg_t NAME##3 + +#define SVE_REG(NAME, IDX) NAME##IDX + +#define SVE_PARTITION_ONE(REG) \ + sve_partition_single_vec(arr, l_store, r_store, \ + REG, pivot_vec, min_vec, max_vec, \ + use_gt, num_lanes) + +#define SVE_LOAD_BLOCK_FROM(BASE_PTR, NAME, I) \ + SVE_REG(NAME, I) = vtype::loadu((BASE_PTR) + (I) * num_lanes) + +#define SVE_LOAD_TAIL(I) \ + do { \ + if (vecsToPartition > (I)) { \ + SVE_LOAD_BLOCK_FROM(arr + left, align_vec, I); \ + } \ + } while(false) + +#define SVE_LOAD_LEFT(I) \ + SVE_LOAD_BLOCK_FROM(arr + left, left_vec, I) + +#define SVE_LOAD_RIGHT(I) \ + SVE_LOAD_BLOCK_FROM(arr + right_load_start, right_vec, I) + +#define SVE_LOAD_BATCH_FROM_RIGHT(I) \ + SVE_LOAD_BLOCK_FROM(arr + right, curr_vec, I) + +#define SVE_LOAD_BATCH_FROM_LEFT(I) \ + SVE_LOAD_BLOCK_FROM(arr + left, curr_vec, I) + +#define SVE_PARTITION_BATCH(I) \ + SVE_PARTITION_ONE(SVE_REG(curr_vec, I)) +#define SVE_PARTITION_LEFT(I) SVE_PARTITION_ONE(SVE_REG(left_vec, I)) +#define SVE_PARTITION_RIGHT(I) SVE_PARTITION_ONE(SVE_REG(right_vec, I)) +#define SVE_PARTITION_TAIL(I) \ + do { \ + if (vecsToPartition > (I)) { \ + SVE_PARTITION_ONE(SVE_REG(align_vec, I)); \ + } \ + } while(false) + + // Initialize the vectors 
to something arbitrary which will be overwritten when + // the appropriate array elements are loaded in them + SVE_DECLARE_REG_SET(align_vec, vtype::set1(pivot)); + + // Load the align_vec vectors depending on the vecsToPartition value + SVE_UNROLL_APPLY(SVE_LOAD_TAIL); + + // Initialize the vectors to something arbitrary which will be overwritten when + // the appropriate array elements are loaded in them + left += vecsToPartition * num_lanes; + + /* Load left and right vtype::numlanes*num_unroll values into + * registers to make space for in-place parition. The vec_left and + * vec_right registers are partitioned at the end. + * Similar to the align_vec vectors, the left and right vectors + * are also initialized to an arbitrary value which will eventually be + * overwritten by array loads. */ + + SVE_DECLARE_REG_SET(left_vec, vtype::set1(pivot)); + SVE_DECLARE_REG_SET(right_vec, vtype::set1(pivot)); + + const arrsize_t right_load_start = right - UNROLL_FACTOR * num_lanes; + + SVE_UNROLL_APPLY(SVE_LOAD_LEFT); + SVE_UNROLL_APPLY(SVE_LOAD_RIGHT); + + /* indices for loading the elements */ + left += UNROLL_FACTOR * num_lanes; + right -= UNROLL_FACTOR * num_lanes; + + while ((right - left) != 0) { + if ((r_store + num_lanes) - right < left - l_store) { + // Load from the right side if there are fewer elements on the right + // and partition the vectors + // TODO: Explore if prefetching the next set of vectors would be beneficial here + right -= (UNROLL_FACTOR * num_lanes); + SVE_DECLARE_REG_SET_UNINIT(curr_vec); + SVE_UNROLL_APPLY(SVE_LOAD_BATCH_FROM_RIGHT); + SVE_UNROLL_APPLY(SVE_PARTITION_BATCH); + } else { + // Load from the left side if there are fewer elements on the left + // and partition the vectors + SVE_DECLARE_REG_SET_UNINIT(curr_vec); + SVE_UNROLL_APPLY(SVE_LOAD_BATCH_FROM_LEFT); + left += UNROLL_FACTOR * num_lanes; + SVE_UNROLL_APPLY(SVE_PARTITION_BATCH); + } + } + + // Partition the left and right vectors + SVE_UNROLL_APPLY(SVE_PARTITION_LEFT); + SVE_UNROLL_APPLY(SVE_PARTITION_RIGHT); + + // Partition the align_vec vectors + SVE_UNROLL_APPLY(SVE_PARTITION_TAIL); + +#undef SVE_LOAD_TAIL +#undef SVE_LOAD_LEFT +#undef SVE_LOAD_RIGHT +#undef SVE_PARTITION_LEFT +#undef SVE_PARTITION_RIGHT +#undef SVE_PARTITION_TAIL +#undef SVE_PARTITION_BATCH +#undef SVE_LOAD_BATCH_FROM_LEFT +#undef SVE_LOAD_BATCH_FROM_RIGHT +#undef SVE_PARTITION_ONE +#undef SVE_REG +#undef SVE_DECLARE_REG_SET +#undef SVE_DECLARE_REG_SET_UNINIT +#undef SVE_UNROLL_APPLY + + *smallest = vtype::reducemin(min_vec); + *biggest = vtype::reducemax(max_vec); + return l_store; +} + +template +SVE_SORT_INLINE arrsize_t sve_partition_select(type_t *arr, arrsize_t left, arrsize_t right, type_t pivot, + type_t *smallest, type_t *biggest, bool use_gt) { + if (vtype::partition_unroll_factor() == 4) { + return sve_partition_unrolled(arr, left, right, pivot, smallest, biggest, use_gt); + } else { + return sve_partition_unrolled(arr, left, right, pivot, smallest, biggest, use_gt); + } +} + +template +SVE_SORT_INLINE void sve_qsort(type_t* arr, arrsize_t left, arrsize_t right, + arrsize_t max_iters) { + if ((right - left) <= OET_SORT_THRESHOLD) + return; + + if (max_iters <= 0) { + std::sort(arr + left, arr + right, sve_comparison_func_ge); + return; + } + + type_t pivot = get_pivot_blocks(arr, left, right); + + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + + arrsize_t pivot_index = sve_partition_select(arr, left, right, + pivot, &smallest, + &biggest, false); + + if (pivot != smallest) { + 
sve_qsort(arr, left, pivot_index, max_iters - 1); + } + if (pivot != biggest) { + sve_qsort(arr, pivot_index, right, max_iters - 1); + } +} + +template +SVE_SORT_INLINE int64_t sve_vect_partition(type_t* arr, int64_t from_index, int64_t to_index, type_t pivot, bool use_gt) { + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = sve_partition_select(arr, from_index, to_index, + pivot, &smallest, &biggest, use_gt); + return pivot_index; +} + +template +SVE_SORT_INLINE void sve_dual_pivot_partition(T* arr, int64_t from_index, int64_t to_index, + int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2){ + const T pivot1 = arr[index_pivot1]; + const T pivot2 = arr[index_pivot2]; + + const int64_t low = from_index; + const int64_t high = to_index; + const int64_t start = low + 1; + const int64_t end = high - 1; + + std::swap(arr[index_pivot1], arr[low]); + std::swap(arr[index_pivot2], arr[end]); + + const int64_t pivot_index2 = sve_vect_partition(arr, start, end, pivot2, true); // use_gt = true + std::swap(arr[end], arr[pivot_index2]); + int64_t upper = pivot_index2; + + // if all other elements are greater than pivot2 (and pivot1), no need to do further partitioning + if (upper == start) { + pivot_indices[0] = low; + pivot_indices[1] = upper; + return; + } + + const int64_t pivot_index1 = sve_vect_partition(arr, start, upper, pivot1, false); // use_ge (use_gt = false) + int64_t lower = pivot_index1 - 1; + std::swap(arr[low], arr[lower]); + + pivot_indices[0] = lower; + pivot_indices[1] = upper; +} + +template +SVE_SORT_INLINE void sve_single_pivot_partition(T* arr, int64_t from_index, int64_t to_index, + int32_t *pivot_indices, int64_t index_pivot) { + const T pivot = arr[index_pivot]; + + const int64_t low = from_index; + const int64_t high = to_index; + const int64_t end = high - 1; + + + const int64_t pivot_index1 = sve_vect_partition(arr, low, high, pivot, false); // use_gt = false (use_ge) + int64_t lower = pivot_index1; + + const int64_t pivot_index2 = sve_vect_partition(arr, pivot_index1, high, pivot, true); // use_gt = true + int64_t upper = pivot_index2; + + pivot_indices[0] = lower; + pivot_indices[1] = upper; +} + +template +SVE_SORT_INLINE void insertion_sort(T* arr, int32_t from_index, int32_t to_index) { + for (int i, k = from_index; ++k < to_index; ) { + T ai = arr[i = k]; + if (ai < arr[i - 1]) { + while (--i >= from_index && ai < arr[i]) { + arr[i + 1] = arr[i]; + } + arr[i + 1] = ai; + } + } +} + +template +SVE_SORT_INLINE void sve_fast_sort(T* arr, arrsize_t from_index, arrsize_t to_index, const arrsize_t INS_SORT_THRESHOLD) { + arrsize_t arrsize = to_index - from_index; + + if (arrsize <= INS_SORT_THRESHOLD) { + insertion_sort(arr, from_index, to_index); + } else { + sve_qsort, T>(arr, from_index, to_index, 2 * (arrsize_t) (63 - __builtin_clzll((unsigned long long) arrsize))); + sve_oet_sort, T>(arr, from_index, to_index); + } +} + +template +SVE_SORT_INLINE void sve_fast_partition(T* arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) { + if (index_pivot1 != index_pivot2) { + sve_dual_pivot_partition, T>(arr, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + } + else { + sve_single_pivot_partition, T>(arr, from_index, to_index, pivot_indices, index_pivot1); + } +} +#endif // AARCH64_SVE_COMMON_QSORT_HPP diff --git a/src/java.base/linux/native/libsimdsort/aarch64/sve-config.hpp b/src/java.base/linux/native/libsimdsort/aarch64/sve-config.hpp 
new file mode 100644
index 0000000000000..86a7217ca71f1
--- /dev/null
+++ b/src/java.base/linux/native/libsimdsort/aarch64/sve-config.hpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2025 Arm Limited and/or its affiliates.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef AARCH64_SVE_CONFIG_HPP
+#define AARCH64_SVE_CONFIG_HPP
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include "simdsort-support.hpp"
+
+#define SIMD_SORT_INFINITYF std::numeric_limits<float>::infinity()
+#define SIMD_SORT_MAX_INT32 std::numeric_limits<int32_t>::max()
+#define SIMD_SORT_MIN_INT32 std::numeric_limits<int32_t>::min()
+
+#if defined(__GNUC__)
+  #define SVE_SORT_INLINE static inline
+  #define SVE_SORT_FINLINE static inline __attribute__((always_inline))
+#else
+  #define SVE_SORT_INLINE static
+  #define SVE_SORT_FINLINE static
+#endif
+
+#ifndef DLL_PUBLIC
+  #define DLL_PUBLIC __attribute__((visibility("default")))
+#endif
+
+using arrsize_t = std::size_t;
+
+#ifndef OET_SORT_THRESHOLD
+  #define OET_SORT_THRESHOLD 8
+#endif
+
+#endif // AARCH64_SVE_CONFIG_HPP
diff --git a/src/java.base/linux/native/libsimdsort/aarch64/sve-linux-qsort.cpp b/src/java.base/linux/native/libsimdsort/aarch64/sve-linux-qsort.cpp
new file mode 100644
index 0000000000000..9b6d3d2f52e4e
--- /dev/null
+++ b/src/java.base/linux/native/libsimdsort/aarch64/sve-linux-qsort.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023 Intel Corporation. All rights reserved.
+ * Copyright 2025 Arm Limited and/or its affiliates.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ * + */ + +#include "sve-config.hpp" +#include "sve-common-qsort.hpp" +#include "classfile_constants.h" +#include "simdsort-support.hpp" +#include + +extern "C" { + + DLL_PUBLIC void sve_sort(void *array, int elem_type, int32_t from_index, int32_t to_index) { + switch(elem_type) { + case JVM_T_INT: + sve_fast_sort((int32_t*)array, from_index, to_index, 64); + break; + case JVM_T_FLOAT: + sve_fast_sort((float*)array, from_index, to_index, 64); + break; + case JVM_T_LONG: + case JVM_T_DOUBLE: + default: + assert(false, "Unexpected type"); + } + } + + DLL_PUBLIC void sve_partition(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { + switch(elem_type) { + case JVM_T_INT: + sve_fast_partition((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + break; + case JVM_T_FLOAT: + sve_fast_partition((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + break; + case JVM_T_LONG: + case JVM_T_DOUBLE: + default: + assert(false, "Unexpected type"); + } + } +} diff --git a/src/java.base/linux/native/libsimdsort/aarch64/sve-oet-sort.hpp b/src/java.base/linux/native/libsimdsort/aarch64/sve-oet-sort.hpp new file mode 100644 index 0000000000000..61f24bc01a532 --- /dev/null +++ b/src/java.base/linux/native/libsimdsort/aarch64/sve-oet-sort.hpp @@ -0,0 +1,52 @@ +/* + * Copyright 2025 Arm Limited and/or its affiliates. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef AARCH64_SVE_OET_SORT_HPP +#define AARCH64_SVE_OET_SORT_HPP + +#include "sve-config.hpp" +#include "sve-qsort.hpp" + +template +SVE_SORT_INLINE void sve_oet_sort(type_t* arr, arrsize_t from_index, arrsize_t to_index) { + arrsize_t arr_num = to_index - from_index; + const uint8_t numLanes = vtype::numlanes(); + + for (int32_t i = 0; i < OET_SORT_THRESHOLD; i++) { + // Odd-even pass: even i -> j starts at from_index + // odd i -> j starts at from_index + 1 + int32_t j = from_index + i % 2; + int32_t remaining = arr_num - (i % 2); + + while (remaining >= 2) { + const int32_t vals_per_iteration = (remaining < (2 * numLanes)) ? 
remaining : 2 * numLanes;
+            const int32_t num = vals_per_iteration / 2;
+            vtype::oet_sort(&arr[j], num);
+
+            j += vals_per_iteration;
+            remaining -= vals_per_iteration;
+        }
+    }
+}
+#endif // AARCH64_SVE_OET_SORT_HPP
diff --git a/src/java.base/linux/native/libsimdsort/aarch64/sve-qsort.hpp b/src/java.base/linux/native/libsimdsort/aarch64/sve-qsort.hpp
new file mode 100644
index 0000000000000..24b5f1aaf8033
--- /dev/null
+++ b/src/java.base/linux/native/libsimdsort/aarch64/sve-qsort.hpp
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
+ * Copyright (c) 2021 Serge Sans Paille. All rights reserved.
+ * Copyright 2025 Arm Limited and/or its affiliates.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SVE_QSORT_VECTOR
+#define SVE_QSORT_VECTOR
+
+#include <arm_sve.h>
+#include <cstdint>
+#include <limits>
+
+template <typename type>
+struct sve_vector;
+
+template <>
+struct sve_vector<int32_t> {
+    using type_t = int32_t;
+    using reg_t = svint32_t;   // SVE 32-bit integer vector
+    using opmask_t = svbool_t; // predicate register
+    /* TODO: Prefer avoiding a runtime svcntw() call when the vector length
+     * is known at compile time. One option is to add a template parameter to
+     * this struct for common cases - 128/256 bits with a fallback to svcntw()
+     * if the vector width is unknown at compile time.
+     */
+    static inline uint8_t numlanes() {
+        return static_cast<uint8_t>(svcntw());
+    }
+
+    static inline int partition_unroll_factor() {
+        return (svcntw() * sizeof(type_t)) > 16 ?
4 : 2; + } + + static type_t type_max() { return SIMD_SORT_MAX_INT32; } + static type_t type_min() { return SIMD_SORT_MIN_INT32; } + + static opmask_t knot_opmask(opmask_t x) { + return svnot_b_z(svptrue_b32(), x); + } + + static opmask_t ge(reg_t x, reg_t y) { + return svcmpge_s32(svptrue_b32(),x, y); + } + + static opmask_t gt(reg_t x, reg_t y) { + return svcmpgt_s32(svptrue_b32(),x, y); + } + + static reg_t loadu(void const *mem) { + return svld1_s32(svptrue_b32(), (const int32_t*)mem); + } + + static type_t reducemax(reg_t v) { + return svmaxv_s32(svptrue_b32(), v); + } + + static type_t reducemin(reg_t v) { + return svminv_s32(svptrue_b32(), v); + } + + static reg_t set1(type_t v) { + return svdup_n_s32(v); + } + + static void storeu(void *mem, reg_t x) { + return svst1_s32(svptrue_b32(), (int32_t*)mem, x); + } + + static reg_t min(reg_t x, reg_t y) { + return svmin_s32_z(svptrue_b32(), x, y); + } + + static reg_t max(reg_t x, reg_t y) { + return svmax_s32_z(svptrue_b32(), x, y); + } + + static int double_compressstore(type_t *left_addr, type_t *right_addr, + opmask_t k, reg_t reg) { + // fast path if all vector elements are less than pivot + svbool_t pg = svptrue_b32(); + if (!svptest_any(pg, k)) { + svst1_s32(pg, (int32_t*)left_addr, reg); + return 0; + } + + // fast path if all vector elements are greater than pivot + if (!svptest_any(pg, svnot_b_z(pg, k))) { + svst1_s32(pg, (int32_t*)right_addr, reg); + return numlanes(); + } + + uint64_t amount_ge_pivot = svcntp_b32(svptrue_b32(), k); + uint64_t amount_nge_pivot = numlanes() - amount_ge_pivot; + + svint32_t compressed_1 = svcompact_s32(knot_opmask(k), reg); + svint32_t compressed_2 = svcompact_s32(k, reg); + + svbool_t store_mask_1 = svwhilelt_b32_u64(0, amount_nge_pivot); + svbool_t store_mask_2 = svwhilelt_b32_u64(0, amount_ge_pivot); + + svst1_s32(store_mask_1, (int32_t*)left_addr, compressed_1); + svst1_s32(store_mask_2, (int32_t*)(right_addr + amount_nge_pivot), compressed_2); + + return amount_ge_pivot; + } + + static void oet_sort(type_t *arr, arrsize_t num) { + svbool_t p1 = svwhilelt_b32_u64(0, num); + const svint32x2_t z0_z1 = svld2_s32(p1, arr); + const svbool_t p2 = svcmplt_s32(p1, svget2_s32(z0_z1, 0), svget2_s32(z0_z1, 1)); + + const svint32_t z4 = svsel_s32(p2, svget2_s32(z0_z1, 0), svget2_s32(z0_z1, 1)); // z4 <- smaller values + const svint32_t z5 = svsel_s32(p2, svget2_s32(z0_z1, 1), svget2_s32(z0_z1, 0)); // z5 <- larger values + + svst2_s32(p1, arr, svcreate2_s32(z4, z5)); + } +}; + +template <> +struct sve_vector { + using type_t = float; + using reg_t = svfloat32_t; // SVE 32-bit float vector + using opmask_t = svbool_t; // predicate register + /* TODO: Prefer avoiding a runtime svcntw() call when the vector length + * is known at compile time. One option is to add a template parameter to + * this struct for common cases - 128/256 bits with a fallback to svcntw() + * if the vector width is unknown at compile time. + */ + static inline uint8_t numlanes() { + return static_cast(svcntw()); + } + + static inline int partition_unroll_factor() { + return (svcntw() * sizeof(type_t)) > 16 ? 
4 : 2; + } + + static type_t type_max() { return SIMD_SORT_INFINITYF; } + static type_t type_min() { return -SIMD_SORT_INFINITYF; } + + static opmask_t knot_opmask(opmask_t x) { + return svnot_b_z(svptrue_b32(), x); + } + + static opmask_t ge(reg_t x, reg_t y) { + return svcmpge_f32(svptrue_b32(),x, y); + } + + static opmask_t gt(reg_t x, reg_t y) { + return svcmpgt_f32(svptrue_b32(),x, y); + } + + static reg_t loadu(void const *mem) { + return svld1_f32(svptrue_b32(), (const float*)mem); + } + + static type_t reducemax(reg_t v) { + return svmaxv_f32(svptrue_b32(), v); + } + + static type_t reducemin(reg_t v) { + return svminv_f32(svptrue_b32(), v); + } + + static reg_t set1(type_t v) { + return svdup_n_f32(v); + } + + static void storeu(void *mem, reg_t x) { + return svst1_f32(svptrue_b32(), (float32_t*)mem, x); + } + + static reg_t min(reg_t x, reg_t y) { + return svmin_f32_z(svptrue_b32(), x, y); + } + + static reg_t max(reg_t x, reg_t y) { + return svmax_f32_z(svptrue_b32(), x, y); + } + + static int double_compressstore(type_t *left_addr, type_t *right_addr, + opmask_t k, reg_t reg) { + // fast path if all vector elements are less than pivot + svbool_t pg = svptrue_b32(); + if (!svptest_any(pg, k)) { + svst1_f32(pg, (float32_t*)left_addr, reg); + return 0; + } + + // fast path if all vector elements are greater than pivot + if (!svptest_any(pg, svnot_b_z(pg, k))) { + svst1_f32(pg, (float32_t*)right_addr, reg); + return numlanes(); + } + + uint64_t amount_ge_pivot = svcntp_b32(svptrue_b32(), k); + uint64_t amount_nge_pivot = numlanes() - amount_ge_pivot; + + svfloat32_t compressed_1 = svcompact_f32(knot_opmask(k), reg); + svfloat32_t compressed_2 = svcompact_f32(k, reg); + + svbool_t store_mask_1 = svwhilelt_b32_u64(0, amount_nge_pivot); + svbool_t store_mask_2 = svwhilelt_b32_u64(0, amount_ge_pivot); + + svst1_f32(store_mask_1, (float32_t*)left_addr, compressed_1); + svst1_f32(store_mask_2, (float32_t*)(right_addr + amount_nge_pivot), compressed_2); + + return amount_ge_pivot; + } + + static void oet_sort(type_t *arr, arrsize_t num) { + svbool_t p1 = svwhilelt_b32_u64(0, num); + const svfloat32x2_t z0_z1 = svld2_f32(p1, arr); + const svbool_t p2 = svcmplt_f32(p1, svget2_f32(z0_z1, 0), svget2_f32(z0_z1, 1)); + + const svfloat32_t z4 = svsel_f32(p2, svget2_f32(z0_z1, 0), svget2_f32(z0_z1, 1)); // z4 <- smaller values + const svfloat32_t z5 = svsel_f32(p2, svget2_f32(z0_z1, 1), svget2_f32(z0_z1, 0)); // z5 <- larger values + + svst2_f32(p1, arr, svcreate2_f32(z4, z5)); + } +}; +#endif // SVE_QSORT_VECTOR diff --git a/src/java.base/linux/native/libsimdsort/avx2-32bit-qsort.hpp b/src/java.base/linux/native/libsimdsort/x86/avx2-32bit-qsort.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/avx2-32bit-qsort.hpp rename to src/java.base/linux/native/libsimdsort/x86/avx2-32bit-qsort.hpp diff --git a/src/java.base/linux/native/libsimdsort/avx2-emu-funcs.hpp b/src/java.base/linux/native/libsimdsort/x86/avx2-emu-funcs.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/avx2-emu-funcs.hpp rename to src/java.base/linux/native/libsimdsort/x86/avx2-emu-funcs.hpp diff --git a/src/java.base/linux/native/libsimdsort/avx2-linux-qsort.cpp b/src/java.base/linux/native/libsimdsort/x86/avx2-linux-qsort.cpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/avx2-linux-qsort.cpp rename to src/java.base/linux/native/libsimdsort/x86/avx2-linux-qsort.cpp diff --git a/src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp 
b/src/java.base/linux/native/libsimdsort/x86/avx512-32bit-qsort.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp rename to src/java.base/linux/native/libsimdsort/x86/avx512-32bit-qsort.hpp diff --git a/src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libsimdsort/x86/avx512-64bit-qsort.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp rename to src/java.base/linux/native/libsimdsort/x86/avx512-64bit-qsort.hpp diff --git a/src/java.base/linux/native/libsimdsort/avx512-linux-qsort.cpp b/src/java.base/linux/native/libsimdsort/x86/avx512-linux-qsort.cpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/avx512-linux-qsort.cpp rename to src/java.base/linux/native/libsimdsort/x86/avx512-linux-qsort.cpp diff --git a/src/java.base/linux/native/libsimdsort/simdsort-support.hpp b/src/java.base/linux/native/libsimdsort/x86/simdsort-support.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/simdsort-support.hpp rename to src/java.base/linux/native/libsimdsort/x86/simdsort-support.hpp diff --git a/src/java.base/linux/native/libsimdsort/xss-common-includes.h b/src/java.base/linux/native/libsimdsort/x86/xss-common-includes.h similarity index 100% rename from src/java.base/linux/native/libsimdsort/xss-common-includes.h rename to src/java.base/linux/native/libsimdsort/x86/xss-common-includes.h diff --git a/src/java.base/linux/native/libsimdsort/xss-common-qsort.h b/src/java.base/linux/native/libsimdsort/x86/xss-common-qsort.h similarity index 100% rename from src/java.base/linux/native/libsimdsort/xss-common-qsort.h rename to src/java.base/linux/native/libsimdsort/x86/xss-common-qsort.h diff --git a/src/java.base/linux/native/libsimdsort/xss-network-qsort.hpp b/src/java.base/linux/native/libsimdsort/x86/xss-network-qsort.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/xss-network-qsort.hpp rename to src/java.base/linux/native/libsimdsort/x86/xss-network-qsort.hpp diff --git a/src/java.base/linux/native/libsimdsort/xss-optimal-networks.hpp b/src/java.base/linux/native/libsimdsort/x86/xss-optimal-networks.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/xss-optimal-networks.hpp rename to src/java.base/linux/native/libsimdsort/x86/xss-optimal-networks.hpp diff --git a/src/java.base/linux/native/libsimdsort/xss-pivot-selection.hpp b/src/java.base/linux/native/libsimdsort/x86/xss-pivot-selection.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/xss-pivot-selection.hpp rename to src/java.base/linux/native/libsimdsort/x86/xss-pivot-selection.hpp