+namespace cp_algo{template<typename T,size_t len>using simd[[gnu::vector_size(len*sizeof(T))]]=T;using i64x4=simd<int64_t,4>;using u64x4=simd<uint64_t,4>;using u32x8=simd<uint32_t,8>;using i32x4=simd<int32_t,4>;using u32x4=simd<uint32_t,4>;using i16x4=simd<int16_t,4>;using u8x32=simd<uint8_t,32>;using dx4=simd<double,4>;dx4 abs(dx4 a){return dx4{std::abs(a[0]),std::abs(a[1]),std::abs(a[2]),std::abs(a[3])};}static constexpr dx4 magic=dx4()+(3ULL<<51);inline i64x4 lround(dx4 x){return i64x4(x+magic)-i64x4(magic);}inline dx4 to_double(i64x4 x){return dx4(x+i64x4(magic))-magic;}inline dx4 round(dx4 a){return dx4{std::nearbyint(a[0]),std::nearbyint(a[1]),std::nearbyint(a[2]),std::nearbyint(a[3])};}inline u64x4 low32(u64x4 x){return x&uint32_t(-1);}inline auto swap_bytes(auto x){return decltype(x)(__builtin_shufflevector(u32x8(x),u32x8(x),1,0,3,2,5,4,7,6));}inline u64x4 montgomery_reduce(u64x4 x,uint32_t mod,uint32_t imod){
0 commit comments