Don't use [[gnu::target("avx2")]]

adamant-pwn · adamant-pwn · commit 671b2cb99b38 · 2025-12-08T13:06:39.000+01:00
diff --git a/.verify-helper/config.toml b/.verify-helper/config.toml
@@ -1,3 +1,3 @@
 [[languages.cpp.environments]]
 CXX = "g++"
-CXXFLAGS = ["-std=c++23", "-Wall", "-Wextra", "-Wconversion", "-Werror", "-pedantic", "-O2"]
+CXXFLAGS = ["-std=c++23", "-Wall", "-Wextra", "-Wconversion", "-Werror", "-Wno-psabi", "-pedantic", "-O2"]
diff --git a/cp-algo/math/cvector.hpp b/cp-algo/math/cvector.hpp
@@ -15,7 +15,7 @@ namespace cp_algo::math::fft {
     using point = complex<ftype>;
     using vpoint = complex<vftype>;
     static constexpr vftype vz = {};
-    [[gnu::target("avx2")]] vpoint vi(vpoint const& r) {
+    simd_target vpoint vi(vpoint const& r) {
         return {-imag(r), real(r)};
     }
 
@@ -30,7 +30,7 @@ namespace cp_algo::math::fft {
         vpoint& at(size_t k) {return r[k / flen];}
         vpoint at(size_t k) const {return r[k / flen];}
         template<class pt = point>
-        void set(size_t k, pt const& t) {
+        simd_inline void set(size_t k, pt const& t) {
             if constexpr(std::is_same_v<pt, point>) {
                 real(r[k / flen])[k % flen] = real(t);
                 imag(r[k / flen])[k % flen] = imag(t);
@@ -39,7 +39,7 @@ namespace cp_algo::math::fft {
             }
         }
         template<class pt = point>
-        [[gnu::target("avx2")]] pt get(size_t k) const {
+        simd_inline pt get(size_t k) const {
             if constexpr(std::is_same_v<pt, point>) {
                 return {real(r[k / flen])[k % flen], imag(r[k / flen])[k % flen]};
             } else {
@@ -79,18 +79,18 @@ namespace cp_algo::math::fft {
             return roots[std::bit_width(n)];
         }
         template<int step>
-        [[gnu::target("avx2")]] static void exec_on_eval(size_t n, size_t k, auto &&callback) {
+        simd_target static void exec_on_eval(size_t n, size_t k, auto &&callback) {
             callback(k, root(4 * step * n) * eval_point(step * k));
         }
         template<int step>
-        [[gnu::target("avx2")]] static void exec_on_evals(size_t n, auto &&callback) {
+        simd_target static void exec_on_evals(size_t n, auto &&callback) {
             point factor = root(4 * step * n);
             for(size_t i = 0; i < n; i++) {
                 callback(i, factor * eval_point(step * i));
             }
         }
 
-        [[gnu::target("avx2")]] static void do_dot_iter(point rt, vpoint& Bv, vpoint const& Av, vpoint& res) {
+        simd_target static void do_dot_iter(point rt, vpoint& Bv, vpoint const& Av, vpoint& res) {
             res += Av * Bv;
             real(Bv) = rotate_right(real(Bv));
             imag(Bv) = rotate_right(imag(Bv));
@@ -99,7 +99,7 @@ namespace cp_algo::math::fft {
             imag(Bv)[0] = x * imag(rt) + y * real(rt);
         }
 
-        [[gnu::target("avx2")]] void dot(cvector const& t) {
+        simd_target void dot(cvector const& t) {
             size_t n = this->size();
             exec_on_evals<1>(n / flen, [&](size_t k, point rt) {
                 k *= flen;
@@ -115,7 +115,7 @@ namespace cp_algo::math::fft {
             checkpoint("dot");
         }
         template<bool partial = true>
-        [[gnu::target("avx2")]] void ifft() {
+        simd_target void ifft() {
             size_t n = size();
             if constexpr (!partial) {
                 point pi(0, 1);
@@ -177,7 +177,7 @@ namespace cp_algo::math::fft {
             }
         }
         template<bool partial = true>
-        [[gnu::target("avx2")]] void fft() {
+        simd_target void fft() {
             size_t n = size();
             bool parity = std::countr_zero(n) % 2;
             for(size_t leaf = 0; leaf < n; leaf += 4 * flen) {
diff --git a/cp-algo/math/factorials.hpp b/cp-algo/math/factorials.hpp
@@ -9,7 +9,7 @@
 
 namespace cp_algo::math {
     template<bool use_bump_alloc = false, int maxn = -1>
-    [[gnu::target("avx2")]] auto facts(auto const& args) {
+    simd_target auto facts(auto const& args) {
         static_assert(!use_bump_alloc || maxn > 0, "maxn must be set if use_bump_alloc is true");
         constexpr int max_mod = 1'000'000'000;
         constexpr int accum = 4;
diff --git a/cp-algo/math/fft.hpp b/cp-algo/math/fft.hpp
@@ -29,7 +29,7 @@ namespace cp_algo::math::fft {
             }
         }
 
-        [[gnu::target("avx2")]] static std::pair<vftype, vftype> 
+        simd_target static std::pair<vftype, vftype> 
         do_split(auto const& a, size_t idx, u64x4 mul) {
             if(idx >= std::size(a)) {
                 return std::pair{vftype(), vftype()};
@@ -48,7 +48,7 @@ namespace cp_algo::math::fft {
         }
 
         dft(size_t n): A(n), B(n) {init();}
-        [[gnu::target("avx2")]] dft(auto const& a, size_t n, bool partial = true): A(n), B(n) {
+        simd_target dft(auto const& a, size_t n, bool partial = true): A(n), B(n) {
             init();
             base b2x32 = bpow(base(2), 32);
             u64x4 cur = {
@@ -77,7 +77,7 @@ namespace cp_algo::math::fft {
                 }
             }
         }
-        [[gnu::target("avx2")]] static void do_dot_iter(point rt, vpoint& Cv, vpoint& Dv, vpoint const& Av, vpoint const& Bv, vpoint& AC, vpoint& AD, vpoint& BC, vpoint& BD) {
+        simd_target static void do_dot_iter(point rt, vpoint& Cv, vpoint& Dv, vpoint const& Av, vpoint const& Bv, vpoint& AC, vpoint& AD, vpoint& BC, vpoint& BD) {
             AC += Av * Cv; AD += Av * Dv;
             BC += Bv * Cv; BD += Bv * Dv;
             real(Cv) = rotate_right(real(Cv));
@@ -93,7 +93,7 @@ namespace cp_algo::math::fft {
         }
 
         template<bool overwrite = true, bool partial = true>
-        [[gnu::target("avx2")]] void dot(auto const& C, auto const& D, auto &Aout, auto &Bout, auto &Cout) const {
+        simd_target void dot(auto const& C, auto const& D, auto &Aout, auto &Bout, auto &Cout) const {
             cvector::exec_on_evals<1>(A.size() / flen, [&](size_t k, point rt) {
                 k *= flen;
                 vpoint AC, AD, BC, BD;
@@ -129,7 +129,7 @@ namespace cp_algo::math::fft {
             dot(C, D, A, B, C);
         }
 
-        [[gnu::target("avx2")]] static void do_recover_iter(size_t idx, auto A, auto B, auto C, auto mul, uint64_t splitsplit, auto &res) {
+        simd_target static void do_recover_iter(size_t idx, auto A, auto B, auto C, auto mul, uint64_t splitsplit, auto &res) {
             auto A0 = lround(A), A1 = lround(C), A2 = lround(B);
             auto Ai = A0 + A1 * split() + A2 * splitsplit + uint64_t(base::modmod());
             auto Au = montgomery_reduce(u64x4(Ai), mod, imod);
@@ -140,7 +140,7 @@ namespace cp_algo::math::fft {
             }
         }
 
-        [[gnu::target("avx2")]] void recover_mod(auto &&C, auto &res, size_t k) {
+        simd_target void recover_mod(auto &&C, auto &res, size_t k) {
             size_t check = (k + flen - 1) / flen * flen;
             assert(res.size() >= check);
             size_t n = A.size();
@@ -168,7 +168,7 @@ namespace cp_algo::math::fft {
             checkpoint("recover mod");
         }
 
-        [[gnu::target("avx2")]] void mul(auto &&C, auto const& D, auto &res, size_t k) {
+        simd_target void mul(auto &&C, auto const& D, auto &res, size_t k) {
             assert(A.size() == C.size());
             size_t n = A.size();
             if(!n) {
@@ -181,10 +181,10 @@ namespace cp_algo::math::fft {
             C.ifft();
             recover_mod(C, res, k);
         }
-        [[gnu::target("avx2")]] void mul_inplace(auto &&B, auto& res, size_t k) {
+        simd_target void mul_inplace(auto &&B, auto& res, size_t k) {
             mul(B.A, B.B, res, k);
         }
-        [[gnu::target("avx2")]] void mul(auto const& B, auto& res, size_t k) {
+        simd_target void mul(auto const& B, auto& res, size_t k) {
             mul(cvector(B.A), B.B, res, k);
         }
         big_vector<base> operator *= (dft &B) {
@@ -247,7 +247,7 @@ namespace cp_algo::math::fft {
     }
 
     // store mod x^n-k in first half, x^n+k in second half
-    [[gnu::target("avx2")]] void mod_split(auto &&x, size_t n, auto k) {
+    simd_target void mod_split(auto &&x, size_t n, auto k) {
         using base = std::decay_t<decltype(k)>;
         dft<base>::init();
         assert(std::size(x) == 2 * n);
diff --git a/cp-algo/math/fft64.hpp b/cp-algo/math/fft64.hpp
@@ -46,7 +46,7 @@ namespace cp_algo::math::fft {
             }
         }
 
-        [[gnu::target("avx2")]] static void do_dot_iter(point rt, std::array<vpoint, 4>& B, std::array<vpoint, 4> const& A, std::array<vpoint, 4>& C) {
+        simd_target static void do_dot_iter(point rt, std::array<vpoint, 4>& B, std::array<vpoint, 4> const& A, std::array<vpoint, 4>& C) {
             for(size_t k = 0; k < 4; k++) {
                 for(size_t i = 0; i <= k; i++) {
                     C[k] += A[i] * B[k - i];
diff --git a/cp-algo/util/bit.hpp b/cp-algo/util/bit.hpp
@@ -38,14 +38,14 @@ namespace cp_algo {
         });
     }
 
-    [[gnu::target("avx2")]] inline uint32_t read_bits(char const* p) {
+    simd_inline uint32_t read_bits(char const* p) {
         return _mm256_movemask_epi8(__m256i(vector_cast<u8x32 const>(p[0]) + (127 - '0')));
     }
-    [[gnu::target("avx2")]] inline uint64_t read_bits64(char const* p) {
+    simd_inline uint64_t read_bits64(char const* p) {
         return read_bits(p) | (uint64_t(read_bits(p + 32)) << 32);
     }
 
-    [[gnu::target("avx2")]] inline void write_bits(char *p, uint32_t bits) {
+    simd_inline void write_bits(char *p, uint32_t bits) {
         static constexpr u8x32 shuffler = {
             0, 0, 0, 0, 0, 0, 0, 0,
             1, 1, 1, 1, 1, 1, 1, 1,
@@ -63,7 +63,7 @@ namespace cp_algo {
             p[z] = shuffled[z] & mask[z] ? '1' : '0';
         }
     }
-    [[gnu::target("avx2")]] inline void write_bits64(char *p, uint64_t bits) {
+    simd_inline void write_bits64(char *p, uint64_t bits) {
         write_bits(p, uint32_t(bits));
         write_bits(p + 32, uint32_t(bits >> 32));
     }
diff --git a/cp-algo/util/checkpoint.hpp b/cp-algo/util/checkpoint.hpp
@@ -5,12 +5,14 @@
 #include <string>
 #include <map>
 namespace cp_algo {
+#ifdef CP_ALGO_CHECKPOINT
+    std::map<std::string, double> checkpoints;
+    double last;
+#endif
     template<bool final = false>
-    void checkpoint([[maybe_unused]] auto const& _msg = "") {
+    void checkpoint([[maybe_unused]] auto const& _msg) {
 #ifdef CP_ALGO_CHECKPOINT
         std::string msg = _msg;
-        static std::map<std::string, double> checkpoints;
-        static double last = 0;
         double now = (double)clock() / CLOCKS_PER_SEC;
         double delta = now - last;
         last = now;
@@ -25,5 +27,9 @@ namespace cp_algo {
         }
 #endif
     }
+    template<bool final = false>
+    void checkpoint() {
+        checkpoint<final>("");
+    }
 }
 #endif // CP_ALGO_UTIL_CHECKPOINT_HPP
diff --git a/cp-algo/util/complex.hpp b/cp-algo/util/complex.hpp
@@ -9,45 +9,45 @@ namespace cp_algo {
     struct complex {
         using value_type = T;
         T x, y;
-        constexpr complex(): x(), y() {}
-        constexpr complex(T const& x): x(x), y() {}
-        constexpr complex(T const& x, T const& y): x(x), y(y) {}
-        [[gnu::target("avx2")]] complex& operator *= (T const& t) {x *= t; y *= t; return *this;}
-        [[gnu::target("avx2")]] complex& operator /= (T const& t) {x /= t; y /= t; return *this;}
-        [[gnu::target("avx2")]] complex operator * (T const& t) const {return complex(*this) *= t;}
-        [[gnu::target("avx2")]] complex operator / (T const& t) const {return complex(*this) /= t;}
-        [[gnu::target("avx2")]] complex& operator += (complex const& t) {x += t.x; y += t.y; return *this;}
-        [[gnu::target("avx2")]] complex& operator -= (complex const& t) {x -= t.x; y -= t.y; return *this;}
-        [[gnu::target("avx2")]] complex operator * (complex const& t) const {return {x * t.x - y * t.y, x * t.y + y * t.x};}
-        [[gnu::target("avx2")]] complex operator / (complex const& t) const {return *this * t.conj() / t.norm();}
-        [[gnu::target("avx2")]] complex operator + (complex const& t) const {return complex(*this) += t;}
-        [[gnu::target("avx2")]] complex operator - (complex const& t) const {return complex(*this) -= t;}
-        [[gnu::target("avx2")]] complex& operator *= (complex const& t) {return *this = *this * t;}
-        [[gnu::target("avx2")]] complex& operator /= (complex const& t) {return *this = *this / t;}
-        [[gnu::target("avx2")]] complex operator - () const {return {-x, -y};}
-        [[gnu::target("avx2")]] complex conj() const {return {x, -y};}
-        [[gnu::target("avx2")]] T norm() const {return x * x + y * y;}
-        [[gnu::target("avx2")]] T abs() const {return std::sqrt(norm());}
-        [[gnu::target("avx2")]] T const real() const {return x;}
-        [[gnu::target("avx2")]] T const imag() const {return y;}
-        [[gnu::target("avx2")]] T& real() {return x;}
-        [[gnu::target("avx2")]] T& imag() {return y;}
-        [[gnu::target("avx2")]] static constexpr complex polar(T r, T theta) {return {T(r * cos(theta)), T(r * sin(theta))};}
-        [[gnu::target("avx2")]] auto operator <=> (complex const& t) const = default;
+        inline constexpr complex(): x(), y() {}
+        inline constexpr complex(T const& x): x(x), y() {}
+        inline constexpr complex(T const& x, T const& y): x(x), y(y) {}
+        inline complex& operator *= (T const& t) {x *= t; y *= t; return *this;}
+        inline complex& operator /= (T const& t) {x /= t; y /= t; return *this;}
+        inline complex operator * (T const& t) const {return complex(*this) *= t;}
+        inline complex operator / (T const& t) const {return complex(*this) /= t;}
+        inline complex& operator += (complex const& t) {x += t.x; y += t.y; return *this;}
+        inline complex& operator -= (complex const& t) {x -= t.x; y -= t.y; return *this;}
+        inline complex operator * (complex const& t) const {return {x * t.x - y * t.y, x * t.y + y * t.x};}
+        inline complex operator / (complex const& t) const {return *this * t.conj() / t.norm();}
+        inline complex operator + (complex const& t) const {return complex(*this) += t;}
+        inline complex operator - (complex const& t) const {return complex(*this) -= t;}
+        inline complex& operator *= (complex const& t) {return *this = *this * t;}
+        inline complex& operator /= (complex const& t) {return *this = *this / t;}
+        inline complex operator - () const {return {-x, -y};}
+        inline complex conj() const {return {x, -y};}
+        inline T norm() const {return x * x + y * y;}
+        inline T abs() const {return std::sqrt(norm());}
+        inline T const real() const {return x;}
+        inline T const imag() const {return y;}
+        inline T& real() {return x;}
+        inline T& imag() {return y;}
+        inline static constexpr complex polar(T r, T theta) {return {T(r * cos(theta)), T(r * sin(theta))};}
+        inline auto operator <=> (complex const& t) const = default;
     };
-    template<typename T> [[gnu::target("avx2")]] complex<T> conj(complex<T> const& x) {return x.conj();}
-    template<typename T> [[gnu::target("avx2")]] T norm(complex<T> const& x) {return x.norm();}
-    template<typename T> [[gnu::target("avx2")]] T abs(complex<T> const& x) {return x.abs();}
-    template<typename T> [[gnu::target("avx2")]] T& real(complex<T> &x) {return x.real();}
-    template<typename T> [[gnu::target("avx2")]] T& imag(complex<T> &x) {return x.imag();}
-    template<typename T> [[gnu::target("avx2")]] T const real(complex<T> const& x) {return x.real();}
-    template<typename T> [[gnu::target("avx2")]] T const imag(complex<T> const& x) {return x.imag();}
+    template<typename T> inline complex<T> conj(complex<T> const& x) {return x.conj();}
+    template<typename T> inline T norm(complex<T> const& x) {return x.norm();}
+    template<typename T> inline T abs(complex<T> const& x) {return x.abs();}
+    template<typename T> inline T& real(complex<T> &x) {return x.real();}
+    template<typename T> inline T& imag(complex<T> &x) {return x.imag();}
+    template<typename T> inline T const real(complex<T> const& x) {return x.real();}
+    template<typename T> inline T const imag(complex<T> const& x) {return x.imag();}
     template<typename T>
-    [[gnu::target("avx2")]] constexpr complex<T> polar(T r, T theta) {
+    inline constexpr complex<T> polar(T r, T theta) {
         return complex<T>::polar(r, theta);
     }
     template<typename T>
-    std::ostream& operator << (std::ostream &out, complex<T> const& x) {
+    inline std::ostream& operator << (std::ostream &out, complex<T> const& x) {
         return out << x.real() << ' ' << x.imag();
     }
 }
diff --git a/cp-algo/util/simd.hpp b/cp-algo/util/simd.hpp
diff --git a/verify/poly/convolution_large.test.cpp b/verify/poly/convolution_large.test.cpp
diff --git a/verify/poly/wildcard.test.cpp b/verify/poly/wildcard.test.cpp

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@ namespace cp_algo::math::fft {`
`29`	`29`	`}`
`30`	`30`	`}`
`31`	`31`
`32`		`- [[gnu::target("avx2")]] static std::pair<vftype, vftype>`
	`32`	`+ simd_target static std::pair<vftype, vftype>`
`33`	`33`	`do_split(auto const& a, size_t idx, u64x4 mul) {`
`34`	`34`	`if(idx >= std::size(a)) {`
`35`	`35`	`return std::pair{vftype(), vftype()};`
`@@ -48,7 +48,7 @@ namespace cp_algo::math::fft {`
`48`	`48`	`}`
`49`	`49`
`50`	`50`	`dft(size_t n): A(n), B(n) {init();}`
`51`		`- [[gnu::target("avx2")]] dft(auto const& a, size_t n, bool partial = true): A(n), B(n) {`
	`51`	`+ simd_target dft(auto const& a, size_t n, bool partial = true): A(n), B(n) {`
`52`	`52`	`init();`
`53`	`53`	`base b2x32 = bpow(base(2), 32);`
`54`	`54`	`u64x4 cur = {`
`@@ -77,7 +77,7 @@ namespace cp_algo::math::fft {`
`77`	`77`	`}`
`78`	`78`	`}`
`79`	`79`	`}`
`80`		`- [[gnu::target("avx2")]] static void do_dot_iter(point rt, vpoint& Cv, vpoint& Dv, vpoint const& Av, vpoint const& Bv, vpoint& AC, vpoint& AD, vpoint& BC, vpoint& BD) {`
	`80`	`+ simd_target static void do_dot_iter(point rt, vpoint& Cv, vpoint& Dv, vpoint const& Av, vpoint const& Bv, vpoint& AC, vpoint& AD, vpoint& BC, vpoint& BD) {`
`81`	`81`	`AC += Av * Cv; AD += Av * Dv;`
`82`	`82`	`BC += Bv * Cv; BD += Bv * Dv;`
`83`	`83`	`real(Cv) = rotate_right(real(Cv));`
`@@ -93,7 +93,7 @@ namespace cp_algo::math::fft {`
`93`	`93`	`}`
`94`	`94`
`95`	`95`	`template<bool overwrite = true, bool partial = true>`
`96`		`- [[gnu::target("avx2")]] void dot(auto const& C, auto const& D, auto &Aout, auto &Bout, auto &Cout) const {`
	`96`	`+ simd_target void dot(auto const& C, auto const& D, auto &Aout, auto &Bout, auto &Cout) const {`
`97`	`97`	`cvector::exec_on_evals<1>(A.size() / flen, [&](size_t k, point rt) {`
`98`	`98`	`k *= flen;`
`99`	`99`	`vpoint AC, AD, BC, BD;`
`@@ -129,7 +129,7 @@ namespace cp_algo::math::fft {`
`129`	`129`	`dot(C, D, A, B, C);`
`130`	`130`	`}`
`131`	`131`
`132`		`- [[gnu::target("avx2")]] static void do_recover_iter(size_t idx, auto A, auto B, auto C, auto mul, uint64_t splitsplit, auto &res) {`
	`132`	`+ simd_target static void do_recover_iter(size_t idx, auto A, auto B, auto C, auto mul, uint64_t splitsplit, auto &res) {`
`133`	`133`	`auto A0 = lround(A), A1 = lround(C), A2 = lround(B);`
`134`	`134`	`auto Ai = A0 + A1 * split() + A2 * splitsplit + uint64_t(base::modmod());`
`135`	`135`	`auto Au = montgomery_reduce(u64x4(Ai), mod, imod);`
`@@ -140,7 +140,7 @@ namespace cp_algo::math::fft {`
`140`	`140`	`}`
`141`	`141`	`}`
`142`	`142`
`143`		`- [[gnu::target("avx2")]] void recover_mod(auto &&C, auto &res, size_t k) {`
	`143`	`+ simd_target void recover_mod(auto &&C, auto &res, size_t k) {`
`144`	`144`	`size_t check = (k + flen - 1) / flen * flen;`
`145`	`145`	`assert(res.size() >= check);`
`146`	`146`	`size_t n = A.size();`
`@@ -168,7 +168,7 @@ namespace cp_algo::math::fft {`
`168`	`168`	`checkpoint("recover mod");`
`169`	`169`	`}`
`170`	`170`
`171`		`- [[gnu::target("avx2")]] void mul(auto &&C, auto const& D, auto &res, size_t k) {`
	`171`	`+ simd_target void mul(auto &&C, auto const& D, auto &res, size_t k) {`
`172`	`172`	`assert(A.size() == C.size());`
`173`	`173`	`size_t n = A.size();`
`174`	`174`	`if(!n) {`
`@@ -181,10 +181,10 @@ namespace cp_algo::math::fft {`
`181`	`181`	`C.ifft();`
`182`	`182`	`recover_mod(C, res, k);`
`183`	`183`	`}`
`184`		`- [[gnu::target("avx2")]] void mul_inplace(auto &&B, auto& res, size_t k) {`
	`184`	`+ simd_target void mul_inplace(auto &&B, auto& res, size_t k) {`
`185`	`185`	`mul(B.A, B.B, res, k);`
`186`	`186`	`}`
`187`		`- [[gnu::target("avx2")]] void mul(auto const& B, auto& res, size_t k) {`
	`187`	`+ simd_target void mul(auto const& B, auto& res, size_t k) {`
`188`	`188`	`mul(cvector(B.A), B.B, res, k);`
`189`	`189`	`}`
`190`	`190`	`big_vector<base> operator *= (dft &B) {`
`@@ -247,7 +247,7 @@ namespace cp_algo::math::fft {`
`247`	`247`	`}`
`248`	`248`
`249`	`249`	`// store mod x^n-k in first half, x^n+k in second half`
`250`		`- [[gnu::target("avx2")]] void mod_split(auto &&x, size_t n, auto k) {`
	`250`	`+ simd_target void mod_split(auto &&x, size_t n, auto k) {`
`251`	`251`	`using base = std::decay_t<decltype(k)>;`
`252`	`252`	`dft<base>::init();`
`253`	`253`	`assert(std::size(x) == 2 * n);`
Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,7 @@ namespace cp_algo::math::fft {`
`46`	`46`	`}`
`47`	`47`	`}`
`48`	`48`
`49`		`- [[gnu::target("avx2")]] static void do_dot_iter(point rt, std::array<vpoint, 4>& B, std::array<vpoint, 4> const& A, std::array<vpoint, 4>& C) {`
	`49`	`+ simd_target static void do_dot_iter(point rt, std::array<vpoint, 4>& B, std::array<vpoint, 4> const& A, std::array<vpoint, 4>& C) {`
`50`	`50`	`for(size_t k = 0; k < 4; k++) {`
`51`	`51`	`for(size_t i = 0; i <= k; i++) {`
`52`	`52`	`C[k] += A[i] * B[k - i];`
Original file line number	Diff line number	Diff line change
`@@ -38,14 +38,14 @@ namespace cp_algo {`
`38`	`38`	`});`
`39`	`39`	`}`
`40`	`40`
`41`		`- [[gnu::target("avx2")]] inline uint32_t read_bits(char const* p) {`
	`41`	`+ simd_inline uint32_t read_bits(char const* p) {`
`42`	`42`	`return _mm256_movemask_epi8(__m256i(vector_cast<u8x32 const>(p[0]) + (127 - '0')));`
`43`	`43`	`}`
`44`		`- [[gnu::target("avx2")]] inline uint64_t read_bits64(char const* p) {`
	`44`	`+ simd_inline uint64_t read_bits64(char const* p) {`
`45`	`45`	`return read_bits(p) | (uint64_t(read_bits(p + 32)) << 32);`
`46`	`46`	`}`
`47`	`47`
`48`		`- [[gnu::target("avx2")]] inline void write_bits(char *p, uint32_t bits) {`
	`48`	`+ simd_inline void write_bits(char *p, uint32_t bits) {`
`49`	`49`	`static constexpr u8x32 shuffler = {`
`50`	`50`	`0, 0, 0, 0, 0, 0, 0, 0,`
`51`	`51`	`1, 1, 1, 1, 1, 1, 1, 1,`
`@@ -63,7 +63,7 @@ namespace cp_algo {`
`63`	`63`	`p[z] = shuffled[z] & mask[z] ? '1' : '0';`
`64`	`64`	`}`
`65`	`65`	`}`
`66`		`- [[gnu::target("avx2")]] inline void write_bits64(char *p, uint64_t bits) {`
	`66`	`+ simd_inline void write_bits64(char *p, uint64_t bits) {`
`67`	`67`	`write_bits(p, uint32_t(bits));`
`68`	`68`	`write_bits(p + 32, uint32_t(bits >> 32));`
`69`	`69`	`}`