diff --git a/Cargo.lock b/Cargo.lock index a718252..f01bcad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -184,9 +184,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.58" +version = "1.2.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1e928d4b69e3077709075a938a05ffbedfa53a84c8f766efbf8220bb1ff60e1" +checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20" dependencies = [ "find-msvc-tools", "shlex", @@ -436,9 +436,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.3.0" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" [[package]] name = "find-msvc-tools" @@ -526,9 +526,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.16.1" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" [[package]] name = "heck" @@ -667,12 +667,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.13.1" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.16.1", + "hashbrown 0.17.0", "serde", "serde_core", ] @@ -706,9 +706,9 @@ checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" [[package]] name = "js-sys" -version = "0.3.94" +version = "0.3.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9" +checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca" dependencies = [ "once_cell", "wasm-bindgen", @@ -722,9 +722,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] name = "libc" -version = "0.2.184" +version = "0.2.185" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" +checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f" [[package]] name = "libloading" @@ -1159,9 +1159,9 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" [[package]] name = "semver" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" [[package]] name = "seq-macro" @@ -1577,9 +1577,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.117" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0551fc1bb415591e3372d0bc4780db7e587d84e2a7e79da121051c5c4b89d0b0" +checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89" dependencies = [ "cfg-if", "once_cell", @@ -1590,9 +1590,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.117" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fbdf9a35adf44786aecd5ff89b4563a90325f9da0923236f6104e603c7e86be" +checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1600,9 +1600,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.117" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca9693ef2bab6d4e6707234500350d8dad079eb508dca05530c85dc3a529ff2" +checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904" dependencies = [ "bumpalo", "proc-macro2", @@ -1613,9 +1613,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.117" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39129a682a6d2d841b6c429d0c51e5cb0ed1a03829d8b3d1e69a011e62cb3d3b" +checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129" dependencies = [ "unicode-ident", ] @@ -1863,8 +1863,7 @@ dependencies = [ [[package]] name = "zerocopy" version = "0.8.48" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +source = "git+https://github.com/usamoi/zerocopy.git?branch=v0.8.48-simd#36ade3473c8334db766a6d9a21159b8cb31c7269" dependencies = [ "zerocopy-derive", ] @@ -1872,8 +1871,7 @@ dependencies = [ [[package]] name = "zerocopy-derive" version = "0.8.48" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +source = "git+https://github.com/usamoi/zerocopy.git?branch=v0.8.48-simd#36ade3473c8334db766a6d9a21159b8cb31c7269" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 5979d68..ef64580 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,7 +53,10 @@ rand = "0.10.1" serde = { version = "1.0.228", features = ["derive"] } tempfile = "3.27.0" validator = { version = "0.20.0", features = ["derive"] } -zerocopy = { version = "0.8.48", features = ["derive", "simd"] } +zerocopy = { version = "=0.8.48", features = ["derive", "simd"] } + +[patch.crates-io] +zerocopy = { git = "https://github.com/usamoi/zerocopy.git", branch = "v0.8.48-simd" } [workspace.lints] # complexity diff --git a/crates/simd/Cargo.toml b/crates/simd/Cargo.toml index efe6556..c3265ce 100644 --- a/crates/simd/Cargo.toml +++ b/crates/simd/Cargo.toml @@ -11,8 +11,13 @@ init = [] simd_macros = { path = "../simd_macros" } seq-macro = "0.3.6" + +[target.'cfg(not(any(target_arch = "s390x", target_arch = "powerpc64")))'.dependencies] zerocopy.workspace = true +[target.'cfg(any(target_arch = "s390x", target_arch = "powerpc64"))'.dependencies] +zerocopy = { workspace = true, features = ["simd-nightly"] } + [dev-dependencies] rand.workspace = true diff --git a/crates/simd/src/bitpacking_u16_ordered.rs b/crates/simd/src/bitpacking_u16_ordered.rs deleted file mode 100644 index d230b7a..0000000 --- a/crates/simd/src/bitpacking_u16_ordered.rs +++ /dev/null @@ -1,271 +0,0 @@ -// This software is licensed under a dual license model: -// -// GNU Affero General Public License v3 (AGPLv3): You may use, modify, and -// distribute this software under the terms of the AGPLv3. -// -// Elastic License v2 (ELv2): You may also use, modify, and distribute this -// software under the Elastic License v2, which has specific restrictions. -// -// We welcome any commercial collaboration or support. For inquiries -// regarding the licenses, please contact us at: -// vectorchord-inquiry@tensorchord.ai -// -// Copyright (c) 2025-2026 TensorChord Inc. - -mod bitwidth { - #[crate::multiversion("v4", "v3", "v2", "a2")] - pub fn bitwidth(min: u16, input: &[u16; 128]) -> u8 { - let mut last = min; - let mut reduce_or = 0_u16; - for x in input.iter().copied() { - reduce_or |= x - last; - last = x; - } - if reduce_or != 0 { - 1 + reduce_or.ilog2() as u8 - } else { - 0 - } - } -} - -pub fn bitwidth(min: u16, input: &[u16; 128]) -> u8 { - bitwidth::bitwidth(min, input) -} - -seq_macro::seq!(BITWIDTH in 1..=15 { - mod compress_~BITWIDTH { - #[inline] - #[cfg(target_arch = "x86_64")] - #[crate::target_cpu(enable = "v2")] - fn compress_v2(min: u16, input: &[u16; 128], output: &mut [u8]) { - type S = core::arch::x86_64::__m128i; - type T = core::arch::x86_64::__m128i; - #[inline] - #[crate::target_cpu(enable = "v2")] - fn delta(state: &mut S, value: T) -> T { - use core::arch::x86_64::*; - let l = _mm_slli_si128(value, 2); - let r = _mm_srli_si128(*state, 14); - let result = _mm_sub_epi16(value, _mm_or_si128(l, r)); - *state = value; - result - } - use core::arch::x86_64::_mm_or_si128 as bitor; - use core::arch::x86_64::_mm_slli_epi16 as shl; - use core::arch::x86_64::_mm_srli_epi16 as shr; - let state = core::arch::x86_64::_mm_set1_epi16(min.cast_signed()); - crate::bitpacking::compress!(BITWIDTH, 16, state, input, output) - } - - #[inline] - #[cfg(target_arch = "aarch64")] - #[crate::target_cpu(enable = "a2")] - fn compress_a2(min: u16, input: &[u16; 128], output: &mut [u8]) { - type S = core::arch::aarch64::uint16x8_t; - type T = core::arch::aarch64::uint16x8_t; - #[inline] - #[crate::target_cpu(enable = "a2")] - fn delta(state: &mut S, value: T) -> T { - use core::arch::aarch64::*; - let result = vsubq_u16(value, vextq_u16(*state, value, 7)); - *state = value; - result - } - use core::arch::aarch64::vorrq_u16 as bitor; - use crate::emulate::vshlq_n_u16 as shl; - use crate::emulate::vshrq_n_u16 as shr; - let state = core::arch::aarch64::vdupq_n_u16(min); - crate::bitpacking::compress!(BITWIDTH, 16, state, input, output) - } - - #[crate::multiversion(@"v2", @"a2")] - pub fn compress(min: u16, input: &[u16; 128], output: &mut [u8]) { - type S = u16; - type T = [u16; 8]; - fn delta(state: &mut S, value: T) -> T { - let result = [ - value[0] - *state, - value[1] - value[0], - value[2] - value[1], - value[3] - value[2], - value[4] - value[3], - value[5] - value[4], - value[6] - value[5], - value[7] - value[6], - ]; - *state = value[7]; - result - } - fn bitor(lhs: T, rhs: T) -> T { - core::array::from_fn(|i| lhs[i] | rhs[i]) - } - fn shl(value: T) -> T { - core::array::from_fn(|i| value[i] << N) - } - fn shr(value: T) -> T { - core::array::from_fn(|i| value[i] >> N) - } - crate::bitpacking::compress!(BITWIDTH, 16, min, input, output) - } - } -}); - -pub fn compress(min: u16, bitwidth: u8, input: &[u16; 128], output: &mut [u8]) { - assert!( - bitwidth <= 16 && bitwidth as usize * 128 / 8 == output.len(), - "unexpected output len" - ); - seq_macro::seq!(BITWIDTH in 1..=15 { - match bitwidth { - 0 => (), - #(BITWIDTH => compress_~BITWIDTH::compress(min, input, output),)* - 16 => { - output.copy_from_slice(zerocopy::IntoBytes::as_bytes(input)); - }, - _ => panic!("bitwidth out of bound"), - } - }); -} - -seq_macro::seq!(BITWIDTH in 1..=15 { - mod decompress_~BITWIDTH { - #[inline] - #[cfg(target_arch = "x86_64")] - #[crate::target_cpu(enable = "v2")] - fn decompress_v2(min: u16, input: &[u8], output: &mut [u16; 128]) { - type S = core::arch::x86_64::__m128i; - type T = core::arch::x86_64::__m128i; - #[inline] - #[crate::target_cpu(enable = "v2")] - fn delta(state: &mut S, value: T) -> T { - #![allow(non_snake_case)] - use core::arch::x86_64::*; - let base = _mm_set1_epi16(_mm_extract_epi16(*state, 7) as i16); - let x0 = value; - let x1 = _mm_slli_si128(x0, 8); - let x2 = _mm_add_epi16(x0, x1); - let x3 = _mm_slli_si128(x2, 4); - let x4 = _mm_add_epi16(x2, x3); - let x5 = _mm_slli_si128(x4, 2); - let x6 = _mm_add_epi16(x4, x5); - let result = _mm_add_epi16(base, x6); - *state = result; - result - } - use core::arch::x86_64::_mm_or_si128 as bitor; - use core::arch::x86_64::_mm_and_si128 as bitand; - use core::arch::x86_64::_mm_slli_epi16 as shl; - use core::arch::x86_64::_mm_srli_epi16 as shr; - let mask = core::arch::x86_64::_mm_set1_epi16(((1u16 << BITWIDTH) - 1).cast_signed()); - let state = core::arch::x86_64::_mm_set1_epi16(min.cast_signed()); - crate::bitpacking::decompress!(BITWIDTH, 16, mask, state, input, output) - } - - #[inline] - #[cfg(target_arch = "aarch64")] - #[crate::target_cpu(enable = "a2")] - fn decompress_a2(min: u16, input: &[u8], output: &mut [u16; 128]) { - type S = core::arch::aarch64::uint16x8_t; - type T = core::arch::aarch64::uint16x8_t; - #[inline] - #[crate::target_cpu(enable = "a2")] - fn delta(state: &mut S, value: T) -> T { - #![allow(non_snake_case)] - use core::arch::aarch64::*; - let zero = vdupq_n_u16(0); - let base = vdupq_n_u16(vgetq_lane_u16(*state, 7)); - let x0 = value; - let x1 = vextq_u16(zero, x0, 4); - let x2 = vaddq_u16(x0, x1); - let x3 = vextq_u16(zero, x2, 6); - let x4 = vaddq_u16(x2, x3); - let x5 = vextq_u16(zero, x4, 7); - let x6 = vaddq_u16(x4, x5); - let result = vaddq_u16(base, x6); - *state = result; - result - } - use core::arch::aarch64::vorrq_u16 as bitor; - use core::arch::aarch64::vandq_u16 as bitand; - use crate::emulate::vshlq_n_u16 as shl; - use crate::emulate::vshrq_n_u16 as shr; - let mask = core::arch::aarch64::vdupq_n_u16((1u16 << BITWIDTH) - 1); - let state = core::arch::aarch64::vdupq_n_u16(min); - crate::bitpacking::decompress!(BITWIDTH, 16, mask, state, input, output) - } - - #[crate::multiversion(@"v2", @"a2")] - pub fn decompress(min: u16, input: &[u8], output: &mut [u16; 128]) { - type S = u16; - type T = [u16; 8]; - fn delta(state: &mut S, value: T) -> T { - let result = [ - *state + value[0], - *state + value[0] + value[1], - *state + value[0] + value[1] + value[2], - *state + value[0] + value[1] + value[2] + value[3], - *state + value[0] + value[1] + value[2] + value[3] + value[4], - *state + value[0] + value[1] + value[2] + value[3] + value[4] + value[5], - *state + value[0] + value[1] + value[2] + value[3] + value[4] + value[5] + value[6], - *state + value[0] + value[1] + value[2] + value[3] + value[4] + value[5] + value[6] + value[7], - ]; - *state = result[7]; - result - } - fn bitor(lhs: T, rhs: T) -> T { - core::array::from_fn(|i| lhs[i] | rhs[i]) - } - fn bitand(lhs: T, rhs: T) -> T { - core::array::from_fn(|i| lhs[i] & rhs[i]) - } - fn shl(value: T) -> T { - core::array::from_fn(|i| value[i] << N) - } - fn shr(value: T) -> T { - core::array::from_fn(|i| value[i] >> N) - } - let mask = [(1u16 << BITWIDTH) - 1; _]; - crate::bitpacking::decompress!(BITWIDTH, 16, mask, min, input, output) - } - } -}); - -pub fn decompress(min: u16, bitwidth: u8, input: &[u8], output: &mut [u16; 128]) { - assert!( - bitwidth <= 16 && bitwidth as usize * 128 / 8 == input.len(), - "unexpected input len" - ); - seq_macro::seq!(BITWIDTH in 1..=15 { - match bitwidth { - 0 => (), - #(BITWIDTH => decompress_~BITWIDTH::decompress(min, input, output),)* - 16 => { - zerocopy::IntoBytes::as_mut_bytes(output).copy_from_slice(input); - }, - _ => panic!("bitwidth out of bound"), - } - }); -} - -#[test] -fn test() { - for i in 0..=16 { - let mut data: [u16; 128] = core::array::from_fn(|_| { - if i < 16 { - rand::random_range(0..1 << i) - } else { - rand::random() - } - }); - data.sort(); - let min = data[0]; - let bitwidth = bitwidth(min, &data); - assert!(bitwidth as usize <= i); - let mut compressed = vec![0_u8; bitwidth as usize * 128 / 8]; - compress(min, bitwidth, &data, &mut compressed); - let mut decompressed = [0_u16; 128]; - decompress(min, bitwidth, &compressed, &mut decompressed); - assert_eq!(data, decompressed); - } -} diff --git a/crates/simd/src/bitpacking_u16_unordered.rs b/crates/simd/src/bitpacking_u16_unordered.rs deleted file mode 100644 index 7f5c972..0000000 --- a/crates/simd/src/bitpacking_u16_unordered.rs +++ /dev/null @@ -1,208 +0,0 @@ -// This software is licensed under a dual license model: -// -// GNU Affero General Public License v3 (AGPLv3): You may use, modify, and -// distribute this software under the terms of the AGPLv3. -// -// Elastic License v2 (ELv2): You may also use, modify, and distribute this -// software under the Elastic License v2, which has specific restrictions. -// -// We welcome any commercial collaboration or support. For inquiries -// regarding the licenses, please contact us at: -// vectorchord-inquiry@tensorchord.ai -// -// Copyright (c) 2025-2026 TensorChord Inc. - -mod bitwidth { - #[crate::multiversion("v4", "v3", "v2", "a2")] - pub fn bitwidth(input: &[u16; 128]) -> u8 { - let mut reduce_or = 0_u16; - for x in input.iter().copied() { - reduce_or |= x; - } - if reduce_or != 0 { - 1 + reduce_or.ilog2() as u8 - } else { - 0 - } - } -} - -pub fn bitwidth(input: &[u16; 128]) -> u8 { - bitwidth::bitwidth(input) -} - -seq_macro::seq!(BITWIDTH in 1..=15 { - mod compress_~BITWIDTH { - #[inline] - #[cfg(target_arch = "x86_64")] - #[crate::target_cpu(enable = "v2")] - fn compress_v2(input: &[u16; 128], output: &mut [u8]) { - type S = (); - type T = core::arch::x86_64::__m128i; - #[inline] - #[crate::target_cpu(enable = "v2")] - fn delta(&mut (): &mut S, value: T) -> T { - value - } - use core::arch::x86_64::_mm_or_si128 as bitor; - use core::arch::x86_64::_mm_slli_epi16 as shl; - use core::arch::x86_64::_mm_srli_epi16 as shr; - crate::bitpacking::compress!(BITWIDTH, 16, (), input, output) - } - - #[inline] - #[cfg(target_arch = "aarch64")] - #[crate::target_cpu(enable = "a2")] - fn compress_a2(input: &[u16; 128], output: &mut [u8]) { - type S = (); - type T = core::arch::aarch64::uint16x8_t; - #[inline] - #[crate::target_cpu(enable = "a2")] - fn delta(&mut (): &mut S, value: T) -> T { - value - } - use core::arch::aarch64::vorrq_u16 as bitor; - use crate::emulate::vshlq_n_u16 as shl; - use crate::emulate::vshrq_n_u16 as shr; - crate::bitpacking::compress!(BITWIDTH, 16, (), input, output) - } - - #[crate::multiversion(@"v2", @"a2")] - pub fn compress(input: &[u16; 128], output: &mut [u8]) { - type S = (); - type T = [u16; 8]; - fn delta(&mut (): &mut S, value: T) -> T { - value - } - fn bitor(lhs: T, rhs: T) -> T { - core::array::from_fn(|i| lhs[i] | rhs[i]) - } - fn shl(value: T) -> T { - core::array::from_fn(|i| value[i] << N) - } - fn shr(value: T) -> T { - core::array::from_fn(|i| value[i] >> N) - } - crate::bitpacking::compress!(BITWIDTH, 16, (), input, output) - } - } -}); - -pub fn compress(bitwidth: u8, input: &[u16; 128], output: &mut [u8]) { - assert!( - bitwidth <= 16 && bitwidth as usize * 128 / 8 == output.len(), - "unexpected output len" - ); - seq_macro::seq!(BITWIDTH in 1..=15 { - match bitwidth { - 0 => (), - #(BITWIDTH => compress_~BITWIDTH::compress(input, output),)* - 16 => { - output.copy_from_slice(zerocopy::IntoBytes::as_bytes(input)); - }, - _ => panic!("bitwidth out of bound"), - } - }); -} - -seq_macro::seq!(BITWIDTH in 1..=15 { - mod decompress_~BITWIDTH { - #[inline] - #[cfg(target_arch = "x86_64")] - #[crate::target_cpu(enable = "v2")] - fn decompress_v2(input: &[u8], output: &mut [u16; 128]) { - type S = (); - type T = core::arch::x86_64::__m128i; - #[inline] - #[crate::target_cpu(enable = "v2")] - fn delta(&mut (): &mut S, value: T) -> T { - value - } - use core::arch::x86_64::_mm_or_si128 as bitor; - use core::arch::x86_64::_mm_and_si128 as bitand; - use core::arch::x86_64::_mm_slli_epi16 as shl; - use core::arch::x86_64::_mm_srli_epi16 as shr; - let mask = core::arch::x86_64::_mm_set1_epi16(((1u16 << BITWIDTH) - 1) as i16); - crate::bitpacking::decompress!(BITWIDTH, 16, mask, (), input, output) - } - - #[inline] - #[cfg(target_arch = "aarch64")] - #[crate::target_cpu(enable = "a2")] - fn decompress_a2(input: &[u8], output: &mut [u16; 128]) { - type S = (); - type T = core::arch::aarch64::uint16x8_t; - #[inline] - #[crate::target_cpu(enable = "a2")] - fn delta(&mut (): &mut S, value: T) -> T { - value - } - use core::arch::aarch64::vorrq_u16 as bitor; - use core::arch::aarch64::vandq_u16 as bitand; - use crate::emulate::vshlq_n_u16 as shl; - use crate::emulate::vshrq_n_u16 as shr; - let mask = core::arch::aarch64::vdupq_n_u16((1u16 << BITWIDTH) - 1); - crate::bitpacking::decompress!(BITWIDTH, 16, mask, (), input, output) - } - - #[crate::multiversion(@"v2", @"a2")] - pub fn decompress(input: &[u8], output: &mut [u16; 128]) { - type S = (); - type T = [u16; 8]; - fn delta(&mut (): &mut S, value: T) -> T { - value - } - fn bitor(lhs: T, rhs: T) -> T { - core::array::from_fn(|i| lhs[i] | rhs[i]) - } - fn bitand(lhs: T, rhs: T) -> T { - core::array::from_fn(|i| lhs[i] & rhs[i]) - } - fn shl(value: T) -> T { - core::array::from_fn(|i| value[i] << N) - } - fn shr(value: T) -> T { - core::array::from_fn(|i| value[i] >> N) - } - let mask = [(1u16 << BITWIDTH) - 1; _]; - crate::bitpacking::decompress!(BITWIDTH, 16, mask, (), input, output) - } - } -}); - -pub fn decompress(bitwidth: u8, input: &[u8], output: &mut [u16; 128]) { - assert!( - bitwidth <= 16 && bitwidth as usize * 128 / 8 == input.len(), - "unexpected input len" - ); - seq_macro::seq!(BITWIDTH in 1..=15 { - match bitwidth { - 0 => (), - #(BITWIDTH => decompress_~BITWIDTH::decompress(input, output),)* - 16 => { - zerocopy::IntoBytes::as_mut_bytes(output).copy_from_slice(input); - }, - _ => panic!("bitwidth out of bound"), - } - }); -} - -#[test] -fn test() { - for i in 0..=16 { - let data: [u16; 128] = core::array::from_fn(|_| { - if i < 16 { - rand::random_range(0..1 << i) - } else { - rand::random() - } - }); - let bitwidth = bitwidth(&data); - assert!(bitwidth as usize <= i); - let mut compressed = vec![0_u8; i * 128 / 8]; - compress(bitwidth, &data, &mut compressed); - let mut decompressed = [0_u16; 128]; - decompress(bitwidth, &compressed, &mut decompressed); - assert_eq!(data, decompressed); - } -} diff --git a/crates/simd/src/bitpacking_u32_ordered.rs b/crates/simd/src/bitpacking_u32_ordered.rs index 2655c7c..e88ddad 100644 --- a/crates/simd/src/bitpacking_u32_ordered.rs +++ b/crates/simd/src/bitpacking_u32_ordered.rs @@ -13,7 +13,9 @@ // Copyright (c) 2025-2026 TensorChord Inc. mod bitwidth { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn bitwidth(min: u32, input: &[u32; 128]) -> u8 { let mut last = min; let mut reduce_or = 0_u32; @@ -79,7 +81,80 @@ seq_macro::seq!(BITWIDTH in 1..=31 { crate::bitpacking::compress!(BITWIDTH, 32, state, input, output) } - #[crate::multiversion(@"v2", @"a2")] + #[inline] + #[cfg(target_arch = "s390x")] + #[crate::target_cpu(enable = "z13")] + fn compress_z13(min: u32, input: &[u32; 128], output: &mut [u8]) { + type S = core::arch::s390x::vector_unsigned_int; + type T = core::arch::s390x::vector_unsigned_int; + #[inline] + #[crate::target_cpu(enable = "z13")] + fn delta(state: &mut S, value: T) -> T { + unsafe { + use core::arch::s390x::*; + let result = vec_sub(value, vec_sld::<_, 12>(*state, value)); + *state = value; + result + } + } + #[inline] + #[crate::target_cpu(enable = "z13")] + fn bitor(lhs: T, rhs: T) -> T { + unsafe { core::arch::s390x::vec_or(lhs, rhs) } + } + #[inline] + #[crate::target_cpu(enable = "z13")] + fn shl(x: T) -> T { + unsafe { core::arch::s390x::vec_sl(x, core::arch::s390x::vec_splats(N)) } + } + #[inline] + #[crate::target_cpu(enable = "z13")] + fn shr(x: T) -> T { + unsafe { core::arch::s390x::vec_sr(x, core::arch::s390x::vec_splats(N)) } + } + let state = unsafe { core::arch::s390x::vec_splats(min) }; + crate::bitpacking::compress!(BITWIDTH, 32, state, input, output) + } + + #[inline] + #[cfg(target_arch = "powerpc64")] + #[crate::target_cpu(enable = "p7")] + fn compress_p7(min: u32, input: &[u32; 128], output: &mut [u8]) { + type S = core::arch::powerpc64::vector_unsigned_int; + type T = core::arch::powerpc64::vector_unsigned_int; + #[inline] + #[crate::target_cpu(enable = "p7")] + fn delta(state: &mut S, value: T) -> T { + unsafe { + use core::arch::powerpc64::*; + #[cfg(target_endian = "big")] + let result = vec_sub(value, vec_sld::<_, 12>(*state, value)); + #[cfg(target_endian = "little")] + let result = vec_sub(value, vec_sld::<_, 4>(value, *state)); + *state = value; + result + } + } + #[inline] + #[crate::target_cpu(enable = "p7")] + fn bitor(lhs: T, rhs: T) -> T { + unsafe { core::arch::powerpc64::vec_or(lhs, rhs) } + } + #[inline] + #[crate::target_cpu(enable = "p7")] + fn shl(x: T) -> T { + unsafe { core::arch::powerpc64::vec_sl(x, core::arch::powerpc64::vec_splats(N)) } + } + #[inline] + #[crate::target_cpu(enable = "p7")] + fn shr(x: T) -> T { + unsafe { core::arch::powerpc64::vec_sr(x, core::arch::powerpc64::vec_splats(N)) } + } + let state = unsafe { core::arch::powerpc64::vec_splats(min) }; + crate::bitpacking::compress!(BITWIDTH, 32, state, input, output) + } + + #[crate::multiversion(@"v2", @"a2", @"z13", @"p7")] pub fn compress(min: u32, input: &[u32; 128], output: &mut [u8]) { type S = u32; type T = [u32; 4]; @@ -187,7 +262,109 @@ seq_macro::seq!(BITWIDTH in 1..=31 { crate::bitpacking::decompress!(BITWIDTH, 32, mask, state, input, output) } - #[crate::multiversion(@"v2", @"a2")] + #[inline] + #[cfg(target_arch = "s390x")] + #[crate::target_cpu(enable = "z13")] + fn decompress_z13(min: u32, input: &[u8], output: &mut [u32; 128]) { + type S = core::arch::s390x::vector_unsigned_int; + type T = core::arch::s390x::vector_unsigned_int; + #[inline] + #[crate::target_cpu(enable = "z13")] + fn delta(state: &mut S, value: T) -> T { + unsafe { + use core::arch::s390x::*; + let zero = vec_splat_u32::<0>(); + let base = vec_splat::<_, 3>(*state); + let x0 = value; + let x1 = vec_sld::<_, 8>(zero, x0); + let x2 = vec_add(x0, x1); + let x3 = vec_sld::<_, 12>(zero, x2); + let x4 = vec_add(x2, x3); + let result = vec_add(base, x4); + *state = result; + result + } + } + #[inline] + #[crate::target_cpu(enable = "z13")] + fn bitor(lhs: T, rhs: T) -> T { + unsafe { core::arch::s390x::vec_or(lhs, rhs) } + } + #[inline] + #[crate::target_cpu(enable = "z13")] + fn bitand(lhs: T, rhs: T) -> T { + unsafe { core::arch::s390x::vec_and(lhs, rhs) } + } + #[inline] + #[crate::target_cpu(enable = "z13")] + fn shl(x: T) -> T { + unsafe { core::arch::s390x::vec_sl(x, core::arch::s390x::vec_splats(N)) } + } + #[inline] + #[crate::target_cpu(enable = "z13")] + fn shr(x: T) -> T { + unsafe { core::arch::s390x::vec_sr(x, core::arch::s390x::vec_splats(N)) } + } + let mask = unsafe { core::arch::s390x::vec_splats((1u32 << BITWIDTH) - 1) }; + let state = unsafe { core::arch::s390x::vec_splats(min) }; + crate::bitpacking::decompress!(BITWIDTH, 32, mask, state, input, output) + } + + #[inline] + #[cfg(target_arch = "powerpc64")] + #[crate::target_cpu(enable = "p7")] + fn decompress_p7(min: u32, input: &[u8], output: &mut [u32; 128]) { + type S = core::arch::powerpc64::vector_unsigned_int; + type T = core::arch::powerpc64::vector_unsigned_int; + #[inline] + #[crate::target_cpu(enable = "p7")] + fn delta(state: &mut S, value: T) -> T { + unsafe { + use core::arch::powerpc64::*; + let zero = vec_splat_u32::<0>(); + let base = vec_splat::<_, 3>(*state); + let x0 = value; + #[cfg(target_endian = "big")] + let x1 = vec_sld::<_, 8>(zero, x0); + #[cfg(target_endian = "little")] + let x1 = vec_sld::<_, 8>(x0, zero); + let x2 = vec_add(x0, x1); + #[cfg(target_endian = "big")] + let x3 = vec_sld::<_, 12>(zero, x2); + #[cfg(target_endian = "little")] + let x3 = vec_sld::<_, 4>(x2, zero); + let x4 = vec_add(x2, x3); + let result = vec_add(base, x4); + *state = result; + result + } + } + #[inline] + #[crate::target_cpu(enable = "p7")] + fn bitor(lhs: T, rhs: T) -> T { + unsafe { core::arch::powerpc64::vec_or(lhs, rhs) } + } + #[inline] + #[crate::target_cpu(enable = "p7")] + fn bitand(lhs: T, rhs: T) -> T { + unsafe { core::arch::powerpc64::vec_and(lhs, rhs) } + } + #[inline] + #[crate::target_cpu(enable = "p7")] + fn shl(x: T) -> T { + unsafe { core::arch::powerpc64::vec_sl(x, core::arch::powerpc64::vec_splats(N)) } + } + #[inline] + #[crate::target_cpu(enable = "p7")] + fn shr(x: T) -> T { + unsafe { core::arch::powerpc64::vec_sr(x, core::arch::powerpc64::vec_splats(N)) } + } + let mask = unsafe { core::arch::powerpc64::vec_splats((1u32 << BITWIDTH) - 1) }; + let state = unsafe { core::arch::powerpc64::vec_splats(min) }; + crate::bitpacking::decompress!(BITWIDTH, 32, mask, state, input, output) + } + + #[crate::multiversion(@"v2", @"a2", @"z13", @"p7")] pub fn decompress(min: u32, input: &[u8], output: &mut [u32; 128]) { type S = u32; type T = [u32; 4]; diff --git a/crates/simd/src/bitpacking_u32_unordered.rs b/crates/simd/src/bitpacking_u32_unordered.rs index ec1af80..ca65586 100644 --- a/crates/simd/src/bitpacking_u32_unordered.rs +++ b/crates/simd/src/bitpacking_u32_unordered.rs @@ -13,7 +13,9 @@ // Copyright (c) 2025-2026 TensorChord Inc. mod bitwidth { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn bitwidth(input: &[u32; 128]) -> u8 { let mut reduce_or = 0_u32; for x in input.iter().copied() { @@ -67,7 +69,65 @@ seq_macro::seq!(BITWIDTH in 1..=31 { crate::bitpacking::compress!(BITWIDTH, 32, (), input, output) } - #[crate::multiversion(@"v2", @"a2")] + #[inline] + #[cfg(target_arch = "s390x")] + #[crate::target_cpu(enable = "z13")] + fn compress_z13(input: &[u32; 128], output: &mut [u8]) { + type S = (); + type T = core::arch::s390x::vector_unsigned_int; + #[inline] + #[crate::target_cpu(enable = "z13")] + fn delta(&mut (): &mut S, value: T) -> T { + value + } + #[inline] + #[crate::target_cpu(enable = "z13")] + fn bitor(lhs: T, rhs: T) -> T { + unsafe { core::arch::s390x::vec_or(lhs, rhs) } + } + #[inline] + #[crate::target_cpu(enable = "z13")] + fn shl(x: T) -> T { + unsafe { core::arch::s390x::vec_sl(x, core::arch::s390x::vec_splats(N)) } + } + #[inline] + #[crate::target_cpu(enable = "z13")] + fn shr(x: T) -> T { + unsafe { core::arch::s390x::vec_sr(x, core::arch::s390x::vec_splats(N)) } + } + crate::bitpacking::compress!(BITWIDTH, 32, (), input, output) + } + + #[inline] + #[cfg(target_arch = "powerpc64")] + #[crate::target_cpu(enable = "p7")] + fn compress_p7(input: &[u32; 128], output: &mut [u8]) { + type S = (); + type T = core::arch::powerpc64::vector_unsigned_int; + #[inline] + #[crate::target_cpu(enable = "p7")] + fn delta(&mut (): &mut S, value: T) -> T { + value + } + #[inline] + #[crate::target_cpu(enable = "p7")] + fn bitor(lhs: T, rhs: T) -> T { + unsafe { core::arch::powerpc64::vec_or(lhs, rhs) } + } + #[inline] + #[crate::target_cpu(enable = "p7")] + fn shl(x: T) -> T { + unsafe { core::arch::powerpc64::vec_sl(x, core::arch::powerpc64::vec_splats(N)) } + } + #[inline] + #[crate::target_cpu(enable = "p7")] + fn shr(x: T) -> T { + unsafe { core::arch::powerpc64::vec_sr(x, core::arch::powerpc64::vec_splats(N)) } + } + crate::bitpacking::compress!(BITWIDTH, 32, (), input, output) + } + + #[crate::multiversion(@"v2", @"a2", @"z13", @"p7")] pub fn compress(input: &[u32; 128], output: &mut [u8]) { type S = (); type T = [u32; 4]; @@ -145,7 +205,77 @@ seq_macro::seq!(BITWIDTH in 1..=31 { crate::bitpacking::decompress!(BITWIDTH, 32, mask, (), input, output) } - #[crate::multiversion(@"v2", @"a2")] + #[inline] + #[cfg(target_arch = "s390x")] + #[crate::target_cpu(enable = "z13")] + fn decompress_z13(input: &[u8], output: &mut [u32; 128]) { + type S = (); + type T = core::arch::s390x::vector_unsigned_int; + #[inline] + #[crate::target_cpu(enable = "z13")] + fn delta(&mut (): &mut S, value: T) -> T { + value + } + #[inline] + #[crate::target_cpu(enable = "z13")] + fn bitor(lhs: T, rhs: T) -> T { + unsafe { core::arch::s390x::vec_or(lhs, rhs) } + } + #[inline] + #[crate::target_cpu(enable = "z13")] + fn bitand(lhs: T, rhs: T) -> T { + unsafe { core::arch::s390x::vec_and(lhs, rhs) } + } + #[inline] + #[crate::target_cpu(enable = "z13")] + fn shl(x: T) -> T { + unsafe { core::arch::s390x::vec_sl(x, core::arch::s390x::vec_splats(N)) } + } + #[inline] + #[crate::target_cpu(enable = "z13")] + fn shr(x: T) -> T { + unsafe { core::arch::s390x::vec_sr(x, core::arch::s390x::vec_splats(N)) } + } + let mask = unsafe { core::arch::s390x::vec_splats((1u32 << BITWIDTH) - 1) }; + crate::bitpacking::decompress!(BITWIDTH, 32, mask, (), input, output) + } + + #[inline] + #[cfg(target_arch = "powerpc64")] + #[crate::target_cpu(enable = "p7")] + fn decompress_p7(input: &[u8], output: &mut [u32; 128]) { + type S = (); + type T = core::arch::powerpc64::vector_unsigned_int; + #[inline] + #[crate::target_cpu(enable = "p7")] + fn delta(&mut (): &mut S, value: T) -> T { + value + } + #[inline] + #[crate::target_cpu(enable = "p7")] + fn bitor(lhs: T, rhs: T) -> T { + unsafe { core::arch::powerpc64::vec_or(lhs, rhs) } + } + #[inline] + #[crate::target_cpu(enable = "p7")] + fn bitand(lhs: T, rhs: T) -> T { + unsafe { core::arch::powerpc64::vec_and(lhs, rhs) } + } + #[inline] + #[crate::target_cpu(enable = "p7")] + fn shl(x: T) -> T { + unsafe { core::arch::powerpc64::vec_sl(x, core::arch::powerpc64::vec_splats(N)) } + } + #[inline] + #[crate::target_cpu(enable = "p7")] + fn shr(x: T) -> T { + unsafe { core::arch::powerpc64::vec_sr(x, core::arch::powerpc64::vec_splats(N)) } + } + let mask = unsafe { core::arch::powerpc64::vec_splats((1u32 << BITWIDTH) - 1) }; + crate::bitpacking::decompress!(BITWIDTH, 32, mask, (), input, output) + } + + #[crate::multiversion(@"v2", @"a2", @"z13", @"p7")] pub fn decompress(input: &[u8], output: &mut [u32; 128]) { type S = (); type T = [u32; 4]; diff --git a/crates/simd/src/bytepacking_u32_ordered.rs b/crates/simd/src/bytepacking_u32_ordered.rs index 5721b90..aa3d5f1 100644 --- a/crates/simd/src/bytepacking_u32_ordered.rs +++ b/crates/simd/src/bytepacking_u32_ordered.rs @@ -13,7 +13,9 @@ // Copyright (c) 2025-2026 TensorChord Inc. mod bytewidth { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn bytewidth(min: u32, input: &[u32]) -> u8 { let mut last = min; let mut reduce_or = 0_u32; @@ -35,7 +37,9 @@ pub fn bytewidth(min: u32, input: &[u32]) -> u8 { } mod compress_1 { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn compress(min: u32, input: &[u32], output: &mut [u8]) { assert!(input.len() <= 128); let (output, remainder) = output.as_chunks_mut::<1>(); @@ -59,7 +63,9 @@ mod compress_1 { } mod decompress_1 { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn decompress(min: u32, input: &[u8], output: &mut [u32]) { assert!(output.len() <= 128); let (input, remainder) = input.as_chunks::<1>(); @@ -84,7 +90,9 @@ mod decompress_1 { } mod compress_2 { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn compress(min: u32, input: &[u32], output: &mut [u8]) { assert!(input.len() <= 128); let (output, remainder) = output.as_chunks_mut::<2>(); @@ -108,7 +116,9 @@ mod compress_2 { } mod decompress_2 { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn decompress(min: u32, input: &[u8], output: &mut [u32]) { assert!(output.len() <= 128); let (input, remainder) = input.as_chunks::<2>(); @@ -133,7 +143,9 @@ mod decompress_2 { } mod compress_3 { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn compress(min: u32, input: &[u32], output: &mut [u8]) { assert!(input.len() <= 128); let (output, remainder) = output.as_chunks_mut::<3>(); @@ -157,7 +169,9 @@ mod compress_3 { } mod decompress_3 { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn decompress(min: u32, input: &[u8], output: &mut [u32]) { assert!(output.len() <= 128); let (input, remainder) = input.as_chunks::<3>(); diff --git a/crates/simd/src/bytepacking_u32_unordered.rs b/crates/simd/src/bytepacking_u32_unordered.rs index 2c874ab..1efae72 100644 --- a/crates/simd/src/bytepacking_u32_unordered.rs +++ b/crates/simd/src/bytepacking_u32_unordered.rs @@ -13,7 +13,9 @@ // Copyright (c) 2025-2026 TensorChord Inc. mod bytewidth { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn bytewidth(input: &[u32]) -> u8 { let mut reduce_or = 0_u32; for x in input.iter().copied() { @@ -33,7 +35,9 @@ pub fn bytewidth(input: &[u32]) -> u8 { } mod compress_1 { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn compress(input: &[u32], output: &mut [u8]) { assert!(input.len() <= 128); let (output, remainder) = output.as_chunks_mut::<1>(); @@ -55,7 +59,9 @@ mod compress_1 { } mod decompress_1 { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn decompress(input: &[u8], output: &mut [u32]) { assert!(output.len() <= 128); let (input, remainder) = input.as_chunks::<1>(); @@ -78,7 +84,9 @@ mod decompress_1 { } mod compress_2 { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn compress(input: &[u32], output: &mut [u8]) { assert!(input.len() <= 128); let (output, remainder) = output.as_chunks_mut::<2>(); @@ -100,7 +108,9 @@ mod compress_2 { } mod decompress_2 { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn decompress(input: &[u8], output: &mut [u32]) { assert!(output.len() <= 128); let (input, remainder) = input.as_chunks::<2>(); @@ -123,7 +133,9 @@ mod decompress_2 { } mod compress_3 { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn compress(input: &[u32], output: &mut [u8]) { assert!(input.len() <= 128); let (output, remainder) = output.as_chunks_mut::<3>(); @@ -145,7 +157,9 @@ mod compress_3 { } mod decompress_3 { - #[crate::multiversion("v4", "v3", "v2", "a2")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn decompress(input: &[u8], output: &mut [u32]) { assert!(output.len() <= 128); let (input, remainder) = input.as_chunks::<3>(); diff --git a/crates/simd/src/emulate.rs b/crates/simd/src/emulate.rs index d0a8d56..fa4a6cb 100644 --- a/crates/simd/src/emulate.rs +++ b/crates/simd/src/emulate.rs @@ -12,21 +12,6 @@ // // Copyright (c) 2025-2026 TensorChord Inc. -#[cfg(target_arch = "aarch64")] -#[inline] -#[target_feature(enable = "neon")] -pub fn vshlq_n_u16( - a: core::arch::aarch64::uint16x8_t, -) -> core::arch::aarch64::uint16x8_t { - seq_macro::seq!(I in 1..16 { - match N { - 0 => a, - #(I => core::arch::aarch64::vshlq_n_u16::(a),)* - _ => unreachable!(), - } - }) -} - #[cfg(target_arch = "aarch64")] #[inline] #[target_feature(enable = "neon")] @@ -42,21 +27,6 @@ pub fn vshlq_n_u32( }) } -#[cfg(target_arch = "aarch64")] -#[inline] -#[target_feature(enable = "neon")] -pub fn vshrq_n_u16( - a: core::arch::aarch64::uint16x8_t, -) -> core::arch::aarch64::uint16x8_t { - seq_macro::seq!(I in 1..16 { - match N { - 0 => a, - #(I => core::arch::aarch64::vshrq_n_u16::(a),)* - _ => unreachable!(), - } - }) -} - #[cfg(target_arch = "aarch64")] #[inline] #[target_feature(enable = "neon")] diff --git a/crates/simd/src/lib.rs b/crates/simd/src/lib.rs index 10978e1..623ad8f 100644 --- a/crates/simd/src/lib.rs +++ b/crates/simd/src/lib.rs @@ -24,8 +24,6 @@ mod bitpacking; mod emulate; -pub mod bitpacking_u16_ordered; -pub mod bitpacking_u16_unordered; pub mod bitpacking_u32_ordered; pub mod bitpacking_u32_unordered; pub mod bytepacking_u32_ordered;