diff --git a/Cargo.lock b/Cargo.lock
index a718252..f01bcad 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -184,9 +184,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.2.58"
+version = "1.2.60"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e1e928d4b69e3077709075a938a05ffbedfa53a84c8f766efbf8220bb1ff60e1"
+checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20"
 dependencies = [
  "find-msvc-tools",
  "shlex",
@@ -436,9 +436,9 @@ dependencies = [
 
 [[package]]
 name = "fastrand"
-version = "2.3.0"
+version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
+checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
 
 [[package]]
 name = "find-msvc-tools"
@@ -526,9 +526,9 @@ dependencies = [
 
 [[package]]
 name = "hashbrown"
-version = "0.16.1"
+version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
 
 [[package]]
 name = "heck"
@@ -667,12 +667,12 @@ dependencies = [
 
 [[package]]
 name = "indexmap"
-version = "2.13.1"
+version = "2.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff"
+checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
 dependencies = [
  "equivalent",
- "hashbrown 0.16.1",
+ "hashbrown 0.17.0",
  "serde",
  "serde_core",
 ]
@@ -706,9 +706,9 @@ checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
 
 [[package]]
 name = "js-sys"
-version = "0.3.94"
+version = "0.3.95"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9"
+checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca"
 dependencies = [
  "once_cell",
  "wasm-bindgen",
@@ -722,9 +722,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
 
 [[package]]
 name = "libc"
-version = "0.2.184"
+version = "0.2.185"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af"
+checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f"
 
 [[package]]
 name = "libloading"
@@ -1159,9 +1159,9 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
 
 [[package]]
 name = "semver"
-version = "1.0.27"
+version = "1.0.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
+checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
 
 [[package]]
 name = "seq-macro"
@@ -1577,9 +1577,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.117"
+version = "0.2.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0551fc1bb415591e3372d0bc4780db7e587d84e2a7e79da121051c5c4b89d0b0"
+checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89"
 dependencies = [
  "cfg-if",
  "once_cell",
@@ -1590,9 +1590,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.117"
+version = "0.2.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7fbdf9a35adf44786aecd5ff89b4563a90325f9da0923236f6104e603c7e86be"
+checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
@@ -1600,9 +1600,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.117"
+version = "0.2.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dca9693ef2bab6d4e6707234500350d8dad079eb508dca05530c85dc3a529ff2"
+checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904"
 dependencies = [
  "bumpalo",
  "proc-macro2",
@@ -1613,9 +1613,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.117"
+version = "0.2.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "39129a682a6d2d841b6c429d0c51e5cb0ed1a03829d8b3d1e69a011e62cb3d3b"
+checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129"
 dependencies = [
  "unicode-ident",
 ]
@@ -1863,8 +1863,7 @@ dependencies = [
 [[package]]
 name = "zerocopy"
 version = "0.8.48"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
+source = "git+https://github.com/usamoi/zerocopy.git?branch=v0.8.48-simd#36ade3473c8334db766a6d9a21159b8cb31c7269"
 dependencies = [
  "zerocopy-derive",
 ]
@@ -1872,8 +1871,7 @@ dependencies = [
 [[package]]
 name = "zerocopy-derive"
 version = "0.8.48"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
+source = "git+https://github.com/usamoi/zerocopy.git?branch=v0.8.48-simd#36ade3473c8334db766a6d9a21159b8cb31c7269"
 dependencies = [
  "proc-macro2",
  "quote",
diff --git a/Cargo.toml b/Cargo.toml
index 5979d68..ef64580 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -53,7 +53,10 @@ rand = "0.10.1"
 serde = { version = "1.0.228", features = ["derive"] }
 tempfile = "3.27.0"
 validator = { version = "0.20.0", features = ["derive"] }
-zerocopy = { version = "0.8.48", features = ["derive", "simd"] }
+zerocopy = { version = "=0.8.48", features = ["derive", "simd"] } # exact pin; Cargo.lock shows the patched fork also reporting 0.8.48
+
+[patch.crates-io] # NOTE(review): zerocopy now comes from a git fork branch instead of crates.io; Cargo.lock pins rev 36ade347
+zerocopy = { git = "https://github.com/usamoi/zerocopy.git", branch = "v0.8.48-simd" }
 
 [workspace.lints]
 # complexity
diff --git a/crates/simd/Cargo.toml b/crates/simd/Cargo.toml
index efe6556..c3265ce 100644
--- a/crates/simd/Cargo.toml
+++ b/crates/simd/Cargo.toml
@@ -11,8 +11,13 @@ init = []
 simd_macros = { path = "../simd_macros" }
 
 seq-macro = "0.3.6"
+
+[target.'cfg(not(any(target_arch = "s390x", target_arch = "powerpc64")))'.dependencies] # every other architecture: workspace zerocopy with no extra features
 zerocopy.workspace = true
 
+[target.'cfg(any(target_arch = "s390x", target_arch = "powerpc64"))'.dependencies] # s390x and powerpc64 only
+zerocopy = { workspace = true, features = ["simd-nightly"] } # NOTE(review): presumably needed for the core::arch vector types used by the new kernels — confirm
+
 [dev-dependencies]
 rand.workspace = true
 
diff --git a/crates/simd/src/bitpacking_u16_ordered.rs b/crates/simd/src/bitpacking_u16_ordered.rs
deleted file mode 100644
index d230b7a..0000000
--- a/crates/simd/src/bitpacking_u16_ordered.rs
+++ /dev/null
@@ -1,271 +0,0 @@
-// This software is licensed under a dual license model:
-//
-// GNU Affero General Public License v3 (AGPLv3): You may use, modify, and
-// distribute this software under the terms of the AGPLv3.
-//
-// Elastic License v2 (ELv2): You may also use, modify, and distribute this
-// software under the Elastic License v2, which has specific restrictions.
-//
-// We welcome any commercial collaboration or support. For inquiries
-// regarding the licenses, please contact us at:
-// vectorchord-inquiry@tensorchord.ai
-//
-// Copyright (c) 2025-2026 TensorChord Inc.
-
-mod bitwidth {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
-    pub fn bitwidth(min: u16, input: &[u16; 128]) -> u8 {
-        let mut last = min;
-        let mut reduce_or = 0_u16;
-        for x in input.iter().copied() {
-            reduce_or |= x - last;
-            last = x;
-        }
-        if reduce_or != 0 {
-            1 + reduce_or.ilog2() as u8
-        } else {
-            0
-        }
-    }
-}
-
-pub fn bitwidth(min: u16, input: &[u16; 128]) -> u8 {
-    bitwidth::bitwidth(min, input)
-}
-
-seq_macro::seq!(BITWIDTH in 1..=15 {
-    mod compress_~BITWIDTH {
-        #[inline]
-        #[cfg(target_arch = "x86_64")]
-        #[crate::target_cpu(enable = "v2")]
-        fn compress_v2(min: u16, input: &[u16; 128], output: &mut [u8]) {
-            type S = core::arch::x86_64::__m128i;
-            type T = core::arch::x86_64::__m128i;
-            #[inline]
-            #[crate::target_cpu(enable = "v2")]
-            fn delta(state: &mut S, value: T) -> T {
-                use core::arch::x86_64::*;
-                let l = _mm_slli_si128(value, 2);
-                let r = _mm_srli_si128(*state, 14);
-                let result = _mm_sub_epi16(value, _mm_or_si128(l, r));
-                *state = value;
-                result
-            }
-            use core::arch::x86_64::_mm_or_si128 as bitor;
-            use core::arch::x86_64::_mm_slli_epi16 as shl;
-            use core::arch::x86_64::_mm_srli_epi16 as shr;
-            let state = core::arch::x86_64::_mm_set1_epi16(min.cast_signed());
-            crate::bitpacking::compress!(BITWIDTH, 16, state, input, output)
-        }
-
-        #[inline]
-        #[cfg(target_arch = "aarch64")]
-        #[crate::target_cpu(enable = "a2")]
-        fn compress_a2(min: u16, input: &[u16; 128], output: &mut [u8]) {
-            type S = core::arch::aarch64::uint16x8_t;
-            type T = core::arch::aarch64::uint16x8_t;
-            #[inline]
-            #[crate::target_cpu(enable = "a2")]
-            fn delta(state: &mut S, value: T) -> T {
-                use core::arch::aarch64::*;
-                let result = vsubq_u16(value, vextq_u16(*state, value, 7));
-                *state = value;
-                result
-            }
-            use core::arch::aarch64::vorrq_u16 as bitor;
-            use crate::emulate::vshlq_n_u16 as shl;
-            use crate::emulate::vshrq_n_u16 as shr;
-            let state = core::arch::aarch64::vdupq_n_u16(min);
-            crate::bitpacking::compress!(BITWIDTH, 16, state, input, output)
-        }
-
-        #[crate::multiversion(@"v2", @"a2")]
-        pub fn compress(min: u16, input: &[u16; 128], output: &mut [u8]) {
-            type S = u16;
-            type T = [u16; 8];
-            fn delta(state: &mut S, value: T) -> T {
-                let result = [
-                    value[0] - *state,
-                    value[1] - value[0],
-                    value[2] - value[1],
-                    value[3] - value[2],
-                    value[4] - value[3],
-                    value[5] - value[4],
-                    value[6] - value[5],
-                    value[7] - value[6],
-                ];
-                *state = value[7];
-                result
-            }
-            fn bitor(lhs: T, rhs: T) -> T {
-                core::array::from_fn(|i| lhs[i] | rhs[i])
-            }
-            fn shl<const N: usize>(value: T) -> T {
-                core::array::from_fn(|i| value[i] << N)
-            }
-            fn shr<const N: usize>(value: T) -> T {
-                core::array::from_fn(|i| value[i] >> N)
-            }
-            crate::bitpacking::compress!(BITWIDTH, 16, min, input, output)
-        }
-    }
-});
-
-pub fn compress(min: u16, bitwidth: u8, input: &[u16; 128], output: &mut [u8]) {
-    assert!(
-        bitwidth <= 16 && bitwidth as usize * 128 / 8 == output.len(),
-        "unexpected output len"
-    );
-    seq_macro::seq!(BITWIDTH in 1..=15 {
-        match bitwidth {
-            0 => (),
-            #(BITWIDTH => compress_~BITWIDTH::compress(min, input, output),)*
-            16 => {
-                output.copy_from_slice(zerocopy::IntoBytes::as_bytes(input));
-            },
-            _ => panic!("bitwidth out of bound"),
-        }
-    });
-}
-
-seq_macro::seq!(BITWIDTH in 1..=15 {
-    mod decompress_~BITWIDTH {
-        #[inline]
-        #[cfg(target_arch = "x86_64")]
-        #[crate::target_cpu(enable = "v2")]
-        fn decompress_v2(min: u16, input: &[u8], output: &mut [u16; 128]) {
-            type S = core::arch::x86_64::__m128i;
-            type T = core::arch::x86_64::__m128i;
-            #[inline]
-            #[crate::target_cpu(enable = "v2")]
-            fn delta(state: &mut S, value: T) -> T {
-                #![allow(non_snake_case)]
-                use core::arch::x86_64::*;
-                let base = _mm_set1_epi16(_mm_extract_epi16(*state, 7) as i16);
-                let x0 = value;
-                let x1 = _mm_slli_si128(x0, 8);
-                let x2 = _mm_add_epi16(x0, x1);
-                let x3 = _mm_slli_si128(x2, 4);
-                let x4 = _mm_add_epi16(x2, x3);
-                let x5 = _mm_slli_si128(x4, 2);
-                let x6 = _mm_add_epi16(x4, x5);
-                let result = _mm_add_epi16(base, x6);
-                *state = result;
-                result
-            }
-            use core::arch::x86_64::_mm_or_si128 as bitor;
-            use core::arch::x86_64::_mm_and_si128 as bitand;
-            use core::arch::x86_64::_mm_slli_epi16 as shl;
-            use core::arch::x86_64::_mm_srli_epi16 as shr;
-            let mask = core::arch::x86_64::_mm_set1_epi16(((1u16 << BITWIDTH) - 1).cast_signed());
-            let state = core::arch::x86_64::_mm_set1_epi16(min.cast_signed());
-            crate::bitpacking::decompress!(BITWIDTH, 16, mask, state, input, output)
-        }
-
-        #[inline]
-        #[cfg(target_arch = "aarch64")]
-        #[crate::target_cpu(enable = "a2")]
-        fn decompress_a2(min: u16, input: &[u8], output: &mut [u16; 128]) {
-            type S = core::arch::aarch64::uint16x8_t;
-            type T = core::arch::aarch64::uint16x8_t;
-            #[inline]
-            #[crate::target_cpu(enable = "a2")]
-            fn delta(state: &mut S, value: T) -> T {
-                #![allow(non_snake_case)]
-                use core::arch::aarch64::*;
-                let zero = vdupq_n_u16(0);
-                let base = vdupq_n_u16(vgetq_lane_u16(*state, 7));
-                let x0 = value;
-                let x1 = vextq_u16(zero, x0, 4);
-                let x2 = vaddq_u16(x0, x1);
-                let x3 = vextq_u16(zero, x2, 6);
-                let x4 = vaddq_u16(x2, x3);
-                let x5 = vextq_u16(zero, x4, 7);
-                let x6 = vaddq_u16(x4, x5);
-                let result = vaddq_u16(base, x6);
-                *state = result;
-                result
-            }
-            use core::arch::aarch64::vorrq_u16 as bitor;
-            use core::arch::aarch64::vandq_u16 as bitand;
-            use crate::emulate::vshlq_n_u16 as shl;
-            use crate::emulate::vshrq_n_u16 as shr;
-            let mask = core::arch::aarch64::vdupq_n_u16((1u16 << BITWIDTH) - 1);
-            let state = core::arch::aarch64::vdupq_n_u16(min);
-            crate::bitpacking::decompress!(BITWIDTH, 16, mask, state, input, output)
-        }
-
-        #[crate::multiversion(@"v2", @"a2")]
-        pub fn decompress(min: u16, input: &[u8], output: &mut [u16; 128]) {
-            type S = u16;
-            type T = [u16; 8];
-            fn delta(state: &mut S, value: T) -> T {
-                let result = [
-                    *state + value[0],
-                    *state + value[0] + value[1],
-                    *state + value[0] + value[1] + value[2],
-                    *state + value[0] + value[1] + value[2] + value[3],
-                    *state + value[0] + value[1] + value[2] + value[3] + value[4],
-                    *state + value[0] + value[1] + value[2] + value[3] + value[4] + value[5],
-                    *state + value[0] + value[1] + value[2] + value[3] + value[4] + value[5] + value[6],
-                    *state + value[0] + value[1] + value[2] + value[3] + value[4] + value[5] + value[6] + value[7],
-                ];
-                *state = result[7];
-                result
-            }
-            fn bitor(lhs: T, rhs: T) -> T {
-                core::array::from_fn(|i| lhs[i] | rhs[i])
-            }
-            fn bitand(lhs: T, rhs: T) -> T {
-                core::array::from_fn(|i| lhs[i] & rhs[i])
-            }
-            fn shl<const N: usize>(value: T) -> T {
-                core::array::from_fn(|i| value[i] << N)
-            }
-            fn shr<const N: usize>(value: T) -> T {
-                core::array::from_fn(|i| value[i] >> N)
-            }
-            let mask = [(1u16 << BITWIDTH) - 1; _];
-            crate::bitpacking::decompress!(BITWIDTH, 16, mask, min, input, output)
-        }
-    }
-});
-
-pub fn decompress(min: u16, bitwidth: u8, input: &[u8], output: &mut [u16; 128]) {
-    assert!(
-        bitwidth <= 16 && bitwidth as usize * 128 / 8 == input.len(),
-        "unexpected input len"
-    );
-    seq_macro::seq!(BITWIDTH in 1..=15 {
-        match bitwidth {
-            0 => (),
-            #(BITWIDTH => decompress_~BITWIDTH::decompress(min, input, output),)*
-            16 => {
-                zerocopy::IntoBytes::as_mut_bytes(output).copy_from_slice(input);
-            },
-            _ => panic!("bitwidth out of bound"),
-        }
-    });
-}
-
-#[test]
-fn test() {
-    for i in 0..=16 {
-        let mut data: [u16; 128] = core::array::from_fn(|_| {
-            if i < 16 {
-                rand::random_range(0..1 << i)
-            } else {
-                rand::random()
-            }
-        });
-        data.sort();
-        let min = data[0];
-        let bitwidth = bitwidth(min, &data);
-        assert!(bitwidth as usize <= i);
-        let mut compressed = vec![0_u8; bitwidth as usize * 128 / 8];
-        compress(min, bitwidth, &data, &mut compressed);
-        let mut decompressed = [0_u16; 128];
-        decompress(min, bitwidth, &compressed, &mut decompressed);
-        assert_eq!(data, decompressed);
-    }
-}
diff --git a/crates/simd/src/bitpacking_u16_unordered.rs b/crates/simd/src/bitpacking_u16_unordered.rs
deleted file mode 100644
index 7f5c972..0000000
--- a/crates/simd/src/bitpacking_u16_unordered.rs
+++ /dev/null
@@ -1,208 +0,0 @@
-// This software is licensed under a dual license model:
-//
-// GNU Affero General Public License v3 (AGPLv3): You may use, modify, and
-// distribute this software under the terms of the AGPLv3.
-//
-// Elastic License v2 (ELv2): You may also use, modify, and distribute this
-// software under the Elastic License v2, which has specific restrictions.
-//
-// We welcome any commercial collaboration or support. For inquiries
-// regarding the licenses, please contact us at:
-// vectorchord-inquiry@tensorchord.ai
-//
-// Copyright (c) 2025-2026 TensorChord Inc.
-
-mod bitwidth {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
-    pub fn bitwidth(input: &[u16; 128]) -> u8 {
-        let mut reduce_or = 0_u16;
-        for x in input.iter().copied() {
-            reduce_or |= x;
-        }
-        if reduce_or != 0 {
-            1 + reduce_or.ilog2() as u8
-        } else {
-            0
-        }
-    }
-}
-
-pub fn bitwidth(input: &[u16; 128]) -> u8 {
-    bitwidth::bitwidth(input)
-}
-
-seq_macro::seq!(BITWIDTH in 1..=15 {
-    mod compress_~BITWIDTH {
-        #[inline]
-        #[cfg(target_arch = "x86_64")]
-        #[crate::target_cpu(enable = "v2")]
-        fn compress_v2(input: &[u16; 128], output: &mut [u8]) {
-            type S = ();
-            type T = core::arch::x86_64::__m128i;
-            #[inline]
-            #[crate::target_cpu(enable = "v2")]
-            fn delta(&mut (): &mut S, value: T) -> T {
-                value
-            }
-            use core::arch::x86_64::_mm_or_si128 as bitor;
-            use core::arch::x86_64::_mm_slli_epi16 as shl;
-            use core::arch::x86_64::_mm_srli_epi16 as shr;
-            crate::bitpacking::compress!(BITWIDTH, 16, (), input, output)
-        }
-
-        #[inline]
-        #[cfg(target_arch = "aarch64")]
-        #[crate::target_cpu(enable = "a2")]
-        fn compress_a2(input: &[u16; 128], output: &mut [u8]) {
-            type S = ();
-            type T = core::arch::aarch64::uint16x8_t;
-            #[inline]
-            #[crate::target_cpu(enable = "a2")]
-            fn delta(&mut (): &mut S, value: T) -> T {
-                value
-            }
-            use core::arch::aarch64::vorrq_u16 as bitor;
-            use crate::emulate::vshlq_n_u16 as shl;
-            use crate::emulate::vshrq_n_u16 as shr;
-            crate::bitpacking::compress!(BITWIDTH, 16, (), input, output)
-        }
-
-        #[crate::multiversion(@"v2", @"a2")]
-        pub fn compress(input: &[u16; 128], output: &mut [u8]) {
-            type S = ();
-            type T = [u16; 8];
-            fn delta(&mut (): &mut S, value: T) -> T {
-                value
-            }
-            fn bitor(lhs: T, rhs: T) -> T {
-                core::array::from_fn(|i| lhs[i] | rhs[i])
-            }
-            fn shl<const N: usize>(value: T) -> T {
-                core::array::from_fn(|i| value[i] << N)
-            }
-            fn shr<const N: usize>(value: T) -> T {
-                core::array::from_fn(|i| value[i] >> N)
-            }
-            crate::bitpacking::compress!(BITWIDTH, 16, (), input, output)
-        }
-    }
-});
-
-pub fn compress(bitwidth: u8, input: &[u16; 128], output: &mut [u8]) {
-    assert!(
-        bitwidth <= 16 && bitwidth as usize * 128 / 8 == output.len(),
-        "unexpected output len"
-    );
-    seq_macro::seq!(BITWIDTH in 1..=15 {
-        match bitwidth {
-            0 => (),
-            #(BITWIDTH => compress_~BITWIDTH::compress(input, output),)*
-            16 => {
-                output.copy_from_slice(zerocopy::IntoBytes::as_bytes(input));
-            },
-            _ => panic!("bitwidth out of bound"),
-        }
-    });
-}
-
-seq_macro::seq!(BITWIDTH in 1..=15 {
-    mod decompress_~BITWIDTH {
-        #[inline]
-        #[cfg(target_arch = "x86_64")]
-        #[crate::target_cpu(enable = "v2")]
-        fn decompress_v2(input: &[u8], output: &mut [u16; 128]) {
-            type S = ();
-            type T = core::arch::x86_64::__m128i;
-            #[inline]
-            #[crate::target_cpu(enable = "v2")]
-            fn delta(&mut (): &mut S, value: T) -> T {
-                value
-            }
-            use core::arch::x86_64::_mm_or_si128 as bitor;
-            use core::arch::x86_64::_mm_and_si128 as bitand;
-            use core::arch::x86_64::_mm_slli_epi16 as shl;
-            use core::arch::x86_64::_mm_srli_epi16 as shr;
-            let mask = core::arch::x86_64::_mm_set1_epi16(((1u16 << BITWIDTH) - 1) as i16);
-            crate::bitpacking::decompress!(BITWIDTH, 16, mask, (), input, output)
-        }
-
-        #[inline]
-        #[cfg(target_arch = "aarch64")]
-        #[crate::target_cpu(enable = "a2")]
-        fn decompress_a2(input: &[u8], output: &mut [u16; 128]) {
-            type S = ();
-            type T = core::arch::aarch64::uint16x8_t;
-            #[inline]
-            #[crate::target_cpu(enable = "a2")]
-            fn delta(&mut (): &mut S, value: T) -> T {
-                value
-            }
-            use core::arch::aarch64::vorrq_u16 as bitor;
-            use core::arch::aarch64::vandq_u16 as bitand;
-            use crate::emulate::vshlq_n_u16 as shl;
-            use crate::emulate::vshrq_n_u16 as shr;
-            let mask = core::arch::aarch64::vdupq_n_u16((1u16 << BITWIDTH) - 1);
-            crate::bitpacking::decompress!(BITWIDTH, 16, mask, (), input, output)
-        }
-
-        #[crate::multiversion(@"v2", @"a2")]
-        pub fn decompress(input: &[u8], output: &mut [u16; 128]) {
-            type S = ();
-            type T = [u16; 8];
-            fn delta(&mut (): &mut S, value: T) -> T {
-                value
-            }
-            fn bitor(lhs: T, rhs: T) -> T {
-                core::array::from_fn(|i| lhs[i] | rhs[i])
-            }
-            fn bitand(lhs: T, rhs: T) -> T {
-                core::array::from_fn(|i| lhs[i] & rhs[i])
-            }
-            fn shl<const N: usize>(value: T) -> T {
-                core::array::from_fn(|i| value[i] << N)
-            }
-            fn shr<const N: usize>(value: T) -> T {
-                core::array::from_fn(|i| value[i] >> N)
-            }
-            let mask = [(1u16 << BITWIDTH) - 1; _];
-            crate::bitpacking::decompress!(BITWIDTH, 16, mask, (), input, output)
-        }
-    }
-});
-
-pub fn decompress(bitwidth: u8, input: &[u8], output: &mut [u16; 128]) {
-    assert!(
-        bitwidth <= 16 && bitwidth as usize * 128 / 8 == input.len(),
-        "unexpected input len"
-    );
-    seq_macro::seq!(BITWIDTH in 1..=15 {
-        match bitwidth {
-            0 => (),
-            #(BITWIDTH => decompress_~BITWIDTH::decompress(input, output),)*
-            16 => {
-                zerocopy::IntoBytes::as_mut_bytes(output).copy_from_slice(input);
-            },
-            _ => panic!("bitwidth out of bound"),
-        }
-    });
-}
-
-#[test]
-fn test() {
-    for i in 0..=16 {
-        let data: [u16; 128] = core::array::from_fn(|_| {
-            if i < 16 {
-                rand::random_range(0..1 << i)
-            } else {
-                rand::random()
-            }
-        });
-        let bitwidth = bitwidth(&data);
-        assert!(bitwidth as usize <= i);
-        let mut compressed = vec![0_u8; i * 128 / 8];
-        compress(bitwidth, &data, &mut compressed);
-        let mut decompressed = [0_u16; 128];
-        decompress(bitwidth, &compressed, &mut decompressed);
-        assert_eq!(data, decompressed);
-    }
-}
diff --git a/crates/simd/src/bitpacking_u32_ordered.rs b/crates/simd/src/bitpacking_u32_ordered.rs
index 2655c7c..e88ddad 100644
--- a/crates/simd/src/bitpacking_u32_ordered.rs
+++ b/crates/simd/src/bitpacking_u32_ordered.rs
@@ -13,7 +13,9 @@
 // Copyright (c) 2025-2026 TensorChord Inc.
 
 mod bitwidth {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn bitwidth(min: u32, input: &[u32; 128]) -> u8 {
         let mut last = min;
         let mut reduce_or = 0_u32;
@@ -79,7 +81,80 @@ seq_macro::seq!(BITWIDTH in 1..=31 {
             crate::bitpacking::compress!(BITWIDTH, 32, state, input, output)
         }
 
-        #[crate::multiversion(@"v2", @"a2")]
+        #[inline] // s390x kernel; presumably selected via @"z13" on `compress` below — confirm
+        #[cfg(target_arch = "s390x")]
+        #[crate::target_cpu(enable = "z13")]
+        fn compress_z13(min: u32, input: &[u32; 128], output: &mut [u8]) {
+            type S = core::arch::s390x::vector_unsigned_int; // running state passed to delta()
+            type T = core::arch::s390x::vector_unsigned_int; // one vector of four u32 lanes
+            #[inline] // helpers below are named as compress! presumably expects (macro not shown)
+            #[crate::target_cpu(enable = "z13")]
+            fn delta(state: &mut S, value: T) -> T { // lane-wise difference from the preceding element
+                unsafe { // vec_sld::<_, 12>(a, b) yields [a[3], b[0], b[1], b[2]]
+                    use core::arch::s390x::*;
+                    let result = vec_sub(value, vec_sld::<_, 12>(*state, value));
+                    *state = value; // remember this vector for the next call
+                    result
+                }
+            }
+            #[inline]
+            #[crate::target_cpu(enable = "z13")]
+            fn bitor(lhs: T, rhs: T) -> T { // lane-wise OR
+                unsafe { core::arch::s390x::vec_or(lhs, rhs) }
+            }
+            #[inline]
+            #[crate::target_cpu(enable = "z13")]
+            fn shl<const N: u32>(x: T) -> T { // shift each lane left by N bits
+                unsafe { core::arch::s390x::vec_sl(x, core::arch::s390x::vec_splats(N)) }
+            }
+            #[inline]
+            #[crate::target_cpu(enable = "z13")]
+            fn shr<const N: u32>(x: T) -> T { // shift each lane right by N bits
+                unsafe { core::arch::s390x::vec_sr(x, core::arch::s390x::vec_splats(N)) }
+            }
+            let state = unsafe { core::arch::s390x::vec_splats(min) }; // every lane starts at min
+            crate::bitpacking::compress!(BITWIDTH, 32, state, input, output)
+        }
+
+        #[inline] // powerpc64 kernel; presumably selected via @"p7" on `compress` below — confirm
+        #[cfg(target_arch = "powerpc64")]
+        #[crate::target_cpu(enable = "p7")]
+        fn compress_p7(min: u32, input: &[u32; 128], output: &mut [u8]) {
+            type S = core::arch::powerpc64::vector_unsigned_int; // running state passed to delta()
+            type T = core::arch::powerpc64::vector_unsigned_int; // one vector of four u32 lanes
+            #[inline] // helpers below are named as compress! presumably expects (macro not shown)
+            #[crate::target_cpu(enable = "p7")]
+            fn delta(state: &mut S, value: T) -> T { // lane-wise difference from the preceding element
+                unsafe { // exactly one `let result` survives cfg, chosen by target endianness
+                    use core::arch::powerpc64::*;
+                    #[cfg(target_endian = "big")] // same operands and shift as the s390x kernel
+                    let result = vec_sub(value, vec_sld::<_, 12>(*state, value));
+                    #[cfg(target_endian = "little")] // operands swapped, shift 4 — presumably the same lanes; confirm
+                    let result = vec_sub(value, vec_sld::<_, 4>(value, *state));
+                    *state = value; // remember this vector for the next call
+                    result
+                }
+            }
+            #[inline]
+            #[crate::target_cpu(enable = "p7")]
+            fn bitor(lhs: T, rhs: T) -> T { // lane-wise OR
+                unsafe { core::arch::powerpc64::vec_or(lhs, rhs) }
+            }
+            #[inline]
+            #[crate::target_cpu(enable = "p7")]
+            fn shl<const N: u32>(x: T) -> T { // shift each lane left by N bits
+                unsafe { core::arch::powerpc64::vec_sl(x, core::arch::powerpc64::vec_splats(N)) }
+            }
+            #[inline]
+            #[crate::target_cpu(enable = "p7")]
+            fn shr<const N: u32>(x: T) -> T { // shift each lane right by N bits
+                unsafe { core::arch::powerpc64::vec_sr(x, core::arch::powerpc64::vec_splats(N)) }
+            }
+            let state = unsafe { core::arch::powerpc64::vec_splats(min) }; // every lane starts at min
+            crate::bitpacking::compress!(BITWIDTH, 32, state, input, output)
+        }
+
+        #[crate::multiversion(@"v2", @"a2", @"z13", @"p7")]
         pub fn compress(min: u32, input: &[u32; 128], output: &mut [u8]) {
             type S = u32;
             type T = [u32; 4];
@@ -187,7 +262,109 @@ seq_macro::seq!(BITWIDTH in 1..=31 {
             crate::bitpacking::decompress!(BITWIDTH, 32, mask, state, input, output)
         }
 
-        #[crate::multiversion(@"v2", @"a2")]
+        #[inline] // s390x vector variant of the delta-decoding decompress into 128 x u32
+        #[cfg(target_arch = "s390x")] // mask below = low BITWIDTH bits set in every lane
+        #[crate::target_cpu(enable = "z13")] // NOTE(review): presumably dispatched via @"z13"
+        fn decompress_z13(min: u32, input: &[u8], output: &mut [u32; 128]) {
+            type S = core::arch::s390x::vector_unsigned_int; // last vector delta returned
+            type T = core::arch::s390x::vector_unsigned_int; // one vector of u32 lanes
+            #[inline] // delta: undoes the differencing by summing lanes onto a carried base
+            #[crate::target_cpu(enable = "z13")]
+            fn delta(state: &mut S, value: T) -> T {
+                unsafe {
+                    use core::arch::s390x::*;
+                    let zero = vec_splat_u32::<0>(); // all-zero filler for the shifts
+                    let base = vec_splat::<_, 3>(*state); // element 3 of the previous result
+                    let x0 = value;
+                    let x1 = vec_sld::<_, 8>(zero, x0); // x0 moved by 8 bytes, zero-filled
+                    let x2 = vec_add(x0, x1);
+                    let x3 = vec_sld::<_, 12>(zero, x2); // x2 moved by 4 bytes, zero-filled
+                    let x4 = vec_add(x2, x3); // presumably the running sum of value's lanes
+                    let result = vec_add(base, x4); // offset by the carried-in base
+                    *state = result; // carried into the next call
+                    result
+                }
+            }
+            #[inline] // bitor: bitwise OR via vec_or
+            #[crate::target_cpu(enable = "z13")]
+            fn bitor(lhs: T, rhs: T) -> T {
+                unsafe { core::arch::s390x::vec_or(lhs, rhs) }
+            }
+            #[inline] // bitand: bitwise AND via vec_and
+            #[crate::target_cpu(enable = "z13")]
+            fn bitand(lhs: T, rhs: T) -> T {
+                unsafe { core::arch::s390x::vec_and(lhs, rhs) }
+            }
+            #[inline] // shl: vec_sl with the count N splatted to every lane
+            #[crate::target_cpu(enable = "z13")]
+            fn shl<const N: u32>(x: T) -> T {
+                unsafe { core::arch::s390x::vec_sl(x, core::arch::s390x::vec_splats(N)) }
+            }
+            #[inline] // shr: vec_sr with the count N splatted to every lane
+            #[crate::target_cpu(enable = "z13")]
+            fn shr<const N: u32>(x: T) -> T {
+                unsafe { core::arch::s390x::vec_sr(x, core::arch::s390x::vec_splats(N)) }
+            }
+            let mask = unsafe { core::arch::s390x::vec_splats((1u32 << BITWIDTH) - 1) };
+            let state = unsafe { core::arch::s390x::vec_splats(min) }; // all lanes = min
+            crate::bitpacking::decompress!(BITWIDTH, 32, mask, state, input, output)
+        }
+
+        #[inline] // powerpc64 vector variant of the delta-decoding decompress into 128 x u32
+        #[cfg(target_arch = "powerpc64")] // mask below = low BITWIDTH bits set in every lane
+        #[crate::target_cpu(enable = "p7")] // NOTE(review): presumably dispatched via @"p7"
+        fn decompress_p7(min: u32, input: &[u8], output: &mut [u32; 128]) {
+            type S = core::arch::powerpc64::vector_unsigned_int; // last vector delta returned
+            type T = core::arch::powerpc64::vector_unsigned_int; // one vector of u32 lanes
+            #[inline] // delta: undoes the differencing by summing lanes onto a carried base
+            #[crate::target_cpu(enable = "p7")]
+            fn delta(state: &mut S, value: T) -> T {
+                unsafe {
+                    use core::arch::powerpc64::*;
+                    let zero = vec_splat_u32::<0>(); // all-zero filler for the shifts
+                    let base = vec_splat::<_, 3>(*state); // element 3 of the previous result
+                    let x0 = value;
+                    #[cfg(target_endian = "big")] // x1: x0 moved by 8 bytes, zero-filled
+                    let x1 = vec_sld::<_, 8>(zero, x0);
+                    #[cfg(target_endian = "little")] // presumably the same move on LE; confirm
+                    let x1 = vec_sld::<_, 8>(x0, zero);
+                    let x2 = vec_add(x0, x1);
+                    #[cfg(target_endian = "big")] // x3: x2 moved by 4 bytes, zero-filled
+                    let x3 = vec_sld::<_, 12>(zero, x2);
+                    #[cfg(target_endian = "little")] // presumably the same move on LE; confirm
+                    let x3 = vec_sld::<_, 4>(x2, zero);
+                    let x4 = vec_add(x2, x3); // presumably the running sum of value's lanes
+                    let result = vec_add(base, x4); // offset by the carried-in base
+                    *state = result; // carried into the next call
+                    result
+                }
+            }
+            #[inline] // bitor: bitwise OR via vec_or
+            #[crate::target_cpu(enable = "p7")]
+            fn bitor(lhs: T, rhs: T) -> T {
+                unsafe { core::arch::powerpc64::vec_or(lhs, rhs) }
+            }
+            #[inline] // bitand: bitwise AND via vec_and
+            #[crate::target_cpu(enable = "p7")]
+            fn bitand(lhs: T, rhs: T) -> T {
+                unsafe { core::arch::powerpc64::vec_and(lhs, rhs) }
+            }
+            #[inline] // shl: vec_sl with the count N splatted to every lane
+            #[crate::target_cpu(enable = "p7")]
+            fn shl<const N: u32>(x: T) -> T {
+                unsafe { core::arch::powerpc64::vec_sl(x, core::arch::powerpc64::vec_splats(N)) }
+            }
+            #[inline] // shr: vec_sr with the count N splatted to every lane
+            #[crate::target_cpu(enable = "p7")]
+            fn shr<const N: u32>(x: T) -> T {
+                unsafe { core::arch::powerpc64::vec_sr(x, core::arch::powerpc64::vec_splats(N)) }
+            }
+            let mask = unsafe { core::arch::powerpc64::vec_splats((1u32 << BITWIDTH) - 1) };
+            let state = unsafe { core::arch::powerpc64::vec_splats(min) }; // all lanes = min
+            crate::bitpacking::decompress!(BITWIDTH, 32, mask, state, input, output)
+        }
+
+        #[crate::multiversion(@"v2", @"a2", @"z13", @"p7")]
         pub fn decompress(min: u32, input: &[u8], output: &mut [u32; 128]) {
             type S = u32;
             type T = [u32; 4];
diff --git a/crates/simd/src/bitpacking_u32_unordered.rs b/crates/simd/src/bitpacking_u32_unordered.rs
index ec1af80..ca65586 100644
--- a/crates/simd/src/bitpacking_u32_unordered.rs
+++ b/crates/simd/src/bitpacking_u32_unordered.rs
@@ -13,7 +13,9 @@
 // Copyright (c) 2025-2026 TensorChord Inc.
 
 mod bitwidth {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn bitwidth(input: &[u32; 128]) -> u8 {
         let mut reduce_or = 0_u32;
         for x in input.iter().copied() {
@@ -67,7 +69,65 @@ seq_macro::seq!(BITWIDTH in 1..=31 {
             crate::bitpacking::compress!(BITWIDTH, 32, (), input, output)
         }
 
-        #[crate::multiversion(@"v2", @"a2")]
+        #[inline] // s390x vector variant of the stateless 128 x u32 compress
+        #[cfg(target_arch = "s390x")]
+        #[crate::target_cpu(enable = "z13")] // NOTE(review): presumably dispatched via @"z13"
+        fn compress_z13(input: &[u32; 128], output: &mut [u8]) {
+            type S = (); // no state is carried between vectors
+            type T = core::arch::s390x::vector_unsigned_int; // one vector of u32 lanes
+            #[inline] // delta: identity here, values pass through unchanged
+            #[crate::target_cpu(enable = "z13")]
+            fn delta(&mut (): &mut S, value: T) -> T {
+                value
+            }
+            #[inline] // bitor: bitwise OR via vec_or
+            #[crate::target_cpu(enable = "z13")]
+            fn bitor(lhs: T, rhs: T) -> T {
+                unsafe { core::arch::s390x::vec_or(lhs, rhs) }
+            }
+            #[inline] // shl: vec_sl with the count N splatted to every lane
+            #[crate::target_cpu(enable = "z13")]
+            fn shl<const N: u32>(x: T) -> T {
+                unsafe { core::arch::s390x::vec_sl(x, core::arch::s390x::vec_splats(N)) }
+            }
+            #[inline] // shr: vec_sr with the count N splatted to every lane
+            #[crate::target_cpu(enable = "z13")]
+            fn shr<const N: u32>(x: T) -> T {
+                unsafe { core::arch::s390x::vec_sr(x, core::arch::s390x::vec_splats(N)) }
+            }
+            crate::bitpacking::compress!(BITWIDTH, 32, (), input, output)
+        }
+
+        #[inline] // powerpc64 vector variant of the stateless 128 x u32 compress
+        #[cfg(target_arch = "powerpc64")]
+        #[crate::target_cpu(enable = "p7")] // NOTE(review): presumably dispatched via @"p7"
+        fn compress_p7(input: &[u32; 128], output: &mut [u8]) {
+            type S = (); // no state is carried between vectors
+            type T = core::arch::powerpc64::vector_unsigned_int; // one vector of u32 lanes
+            #[inline] // delta: identity here, values pass through unchanged
+            #[crate::target_cpu(enable = "p7")]
+            fn delta(&mut (): &mut S, value: T) -> T {
+                value
+            }
+            #[inline] // bitor: bitwise OR via vec_or
+            #[crate::target_cpu(enable = "p7")]
+            fn bitor(lhs: T, rhs: T) -> T {
+                unsafe { core::arch::powerpc64::vec_or(lhs, rhs) }
+            }
+            #[inline] // shl: vec_sl with the count N splatted to every lane
+            #[crate::target_cpu(enable = "p7")]
+            fn shl<const N: u32>(x: T) -> T {
+                unsafe { core::arch::powerpc64::vec_sl(x, core::arch::powerpc64::vec_splats(N)) }
+            }
+            #[inline] // shr: vec_sr with the count N splatted to every lane
+            #[crate::target_cpu(enable = "p7")]
+            fn shr<const N: u32>(x: T) -> T {
+                unsafe { core::arch::powerpc64::vec_sr(x, core::arch::powerpc64::vec_splats(N)) }
+            }
+            crate::bitpacking::compress!(BITWIDTH, 32, (), input, output)
+        }
+
+        #[crate::multiversion(@"v2", @"a2", @"z13", @"p7")]
         pub fn compress(input: &[u32; 128], output: &mut [u8]) {
             type S = ();
             type T = [u32; 4];
@@ -145,7 +205,77 @@ seq_macro::seq!(BITWIDTH in 1..=31 {
             crate::bitpacking::decompress!(BITWIDTH, 32, mask, (), input, output)
         }
 
-        #[crate::multiversion(@"v2", @"a2")]
+        #[inline] // s390x vector variant of the stateless decompress into 128 x u32
+        #[cfg(target_arch = "s390x")] // mask below = low BITWIDTH bits set in every lane
+        #[crate::target_cpu(enable = "z13")] // NOTE(review): presumably dispatched via @"z13"
+        fn decompress_z13(input: &[u8], output: &mut [u32; 128]) {
+            type S = (); // no state is carried between vectors
+            type T = core::arch::s390x::vector_unsigned_int; // one vector of u32 lanes
+            #[inline] // delta: identity here, values pass through unchanged
+            #[crate::target_cpu(enable = "z13")]
+            fn delta(&mut (): &mut S, value: T) -> T {
+                value
+            }
+            #[inline] // bitor: bitwise OR via vec_or
+            #[crate::target_cpu(enable = "z13")]
+            fn bitor(lhs: T, rhs: T) -> T {
+                unsafe { core::arch::s390x::vec_or(lhs, rhs) }
+            }
+            #[inline] // bitand: bitwise AND via vec_and
+            #[crate::target_cpu(enable = "z13")]
+            fn bitand(lhs: T, rhs: T) -> T {
+                unsafe { core::arch::s390x::vec_and(lhs, rhs) }
+            }
+            #[inline] // shl: vec_sl with the count N splatted to every lane
+            #[crate::target_cpu(enable = "z13")]
+            fn shl<const N: u32>(x: T) -> T {
+                unsafe { core::arch::s390x::vec_sl(x, core::arch::s390x::vec_splats(N)) }
+            }
+            #[inline] // shr: vec_sr with the count N splatted to every lane
+            #[crate::target_cpu(enable = "z13")]
+            fn shr<const N: u32>(x: T) -> T {
+                unsafe { core::arch::s390x::vec_sr(x, core::arch::s390x::vec_splats(N)) }
+            }
+            let mask = unsafe { core::arch::s390x::vec_splats((1u32 << BITWIDTH) - 1) };
+            crate::bitpacking::decompress!(BITWIDTH, 32, mask, (), input, output)
+        }
+
+        #[inline] // powerpc64 vector variant of the stateless decompress into 128 x u32
+        #[cfg(target_arch = "powerpc64")] // mask below = low BITWIDTH bits set in every lane
+        #[crate::target_cpu(enable = "p7")] // NOTE(review): presumably dispatched via @"p7"
+        fn decompress_p7(input: &[u8], output: &mut [u32; 128]) {
+            type S = (); // no state is carried between vectors
+            type T = core::arch::powerpc64::vector_unsigned_int; // one vector of u32 lanes
+            #[inline] // delta: identity here, values pass through unchanged
+            #[crate::target_cpu(enable = "p7")]
+            fn delta(&mut (): &mut S, value: T) -> T {
+                value
+            }
+            #[inline] // bitor: bitwise OR via vec_or
+            #[crate::target_cpu(enable = "p7")]
+            fn bitor(lhs: T, rhs: T) -> T {
+                unsafe { core::arch::powerpc64::vec_or(lhs, rhs) }
+            }
+            #[inline] // bitand: bitwise AND via vec_and
+            #[crate::target_cpu(enable = "p7")]
+            fn bitand(lhs: T, rhs: T) -> T {
+                unsafe { core::arch::powerpc64::vec_and(lhs, rhs) }
+            }
+            #[inline] // shl: vec_sl with the count N splatted to every lane
+            #[crate::target_cpu(enable = "p7")]
+            fn shl<const N: u32>(x: T) -> T {
+                unsafe { core::arch::powerpc64::vec_sl(x, core::arch::powerpc64::vec_splats(N)) }
+            }
+            #[inline] // shr: vec_sr with the count N splatted to every lane
+            #[crate::target_cpu(enable = "p7")]
+            fn shr<const N: u32>(x: T) -> T {
+                unsafe { core::arch::powerpc64::vec_sr(x, core::arch::powerpc64::vec_splats(N)) }
+            }
+            let mask = unsafe { core::arch::powerpc64::vec_splats((1u32 << BITWIDTH) - 1) };
+            crate::bitpacking::decompress!(BITWIDTH, 32, mask, (), input, output)
+        }
+
+        #[crate::multiversion(@"v2", @"a2", @"z13", @"p7")]
         pub fn decompress(input: &[u8], output: &mut [u32; 128]) {
             type S = ();
             type T = [u32; 4];
diff --git a/crates/simd/src/bytepacking_u32_ordered.rs b/crates/simd/src/bytepacking_u32_ordered.rs
index 5721b90..aa3d5f1 100644
--- a/crates/simd/src/bytepacking_u32_ordered.rs
+++ b/crates/simd/src/bytepacking_u32_ordered.rs
@@ -13,7 +13,9 @@
 // Copyright (c) 2025-2026 TensorChord Inc.
 
 mod bytewidth {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn bytewidth(min: u32, input: &[u32]) -> u8 {
         let mut last = min;
         let mut reduce_or = 0_u32;
@@ -35,7 +37,9 @@ pub fn bytewidth(min: u32, input: &[u32]) -> u8 {
 }
 
 mod compress_1 {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn compress(min: u32, input: &[u32], output: &mut [u8]) {
         assert!(input.len() <= 128);
         let (output, remainder) = output.as_chunks_mut::<1>();
@@ -59,7 +63,9 @@ mod compress_1 {
 }
 
 mod decompress_1 {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn decompress(min: u32, input: &[u8], output: &mut [u32]) {
         assert!(output.len() <= 128);
         let (input, remainder) = input.as_chunks::<1>();
@@ -84,7 +90,9 @@ mod decompress_1 {
 }
 
 mod compress_2 {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn compress(min: u32, input: &[u32], output: &mut [u8]) {
         assert!(input.len() <= 128);
         let (output, remainder) = output.as_chunks_mut::<2>();
@@ -108,7 +116,9 @@ mod compress_2 {
 }
 
 mod decompress_2 {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn decompress(min: u32, input: &[u8], output: &mut [u32]) {
         assert!(output.len() <= 128);
         let (input, remainder) = input.as_chunks::<2>();
@@ -133,7 +143,9 @@ mod decompress_2 {
 }
 
 mod compress_3 {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn compress(min: u32, input: &[u32], output: &mut [u8]) {
         assert!(input.len() <= 128);
         let (output, remainder) = output.as_chunks_mut::<3>();
@@ -157,7 +169,9 @@ mod compress_3 {
 }
 
 mod decompress_3 {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn decompress(min: u32, input: &[u8], output: &mut [u32]) {
         assert!(output.len() <= 128);
         let (input, remainder) = input.as_chunks::<3>();
diff --git a/crates/simd/src/bytepacking_u32_unordered.rs b/crates/simd/src/bytepacking_u32_unordered.rs
index 2c874ab..1efae72 100644
--- a/crates/simd/src/bytepacking_u32_unordered.rs
+++ b/crates/simd/src/bytepacking_u32_unordered.rs
@@ -13,7 +13,9 @@
 // Copyright (c) 2025-2026 TensorChord Inc.
 
 mod bytewidth {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn bytewidth(input: &[u32]) -> u8 {
         let mut reduce_or = 0_u32;
         for x in input.iter().copied() {
@@ -33,7 +35,9 @@ pub fn bytewidth(input: &[u32]) -> u8 {
 }
 
 mod compress_1 {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn compress(input: &[u32], output: &mut [u8]) {
         assert!(input.len() <= 128);
         let (output, remainder) = output.as_chunks_mut::<1>();
@@ -55,7 +59,9 @@ mod compress_1 {
 }
 
 mod decompress_1 {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn decompress(input: &[u8], output: &mut [u32]) {
         assert!(output.len() <= 128);
         let (input, remainder) = input.as_chunks::<1>();
@@ -78,7 +84,9 @@ mod decompress_1 {
 }
 
 mod compress_2 {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn compress(input: &[u32], output: &mut [u8]) {
         assert!(input.len() <= 128);
         let (output, remainder) = output.as_chunks_mut::<2>();
@@ -100,7 +108,9 @@ mod compress_2 {
 }
 
 mod decompress_2 {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn decompress(input: &[u8], output: &mut [u32]) {
         assert!(output.len() <= 128);
         let (input, remainder) = input.as_chunks::<2>();
@@ -123,7 +133,9 @@ mod decompress_2 {
 }
 
 mod compress_3 {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn compress(input: &[u32], output: &mut [u8]) {
         assert!(input.len() <= 128);
         let (output, remainder) = output.as_chunks_mut::<3>();
@@ -145,7 +157,9 @@ mod compress_3 {
 }
 
 mod decompress_3 {
-    #[crate::multiversion("v4", "v3", "v2", "a2")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn decompress(input: &[u8], output: &mut [u32]) {
         assert!(output.len() <= 128);
         let (input, remainder) = input.as_chunks::<3>();
diff --git a/crates/simd/src/emulate.rs b/crates/simd/src/emulate.rs
index d0a8d56..fa4a6cb 100644
--- a/crates/simd/src/emulate.rs
+++ b/crates/simd/src/emulate.rs
@@ -12,21 +12,6 @@
 //
 // Copyright (c) 2025-2026 TensorChord Inc.
 
-#[cfg(target_arch = "aarch64")]
-#[inline]
-#[target_feature(enable = "neon")]
-pub fn vshlq_n_u16<const N: usize>(
-    a: core::arch::aarch64::uint16x8_t,
-) -> core::arch::aarch64::uint16x8_t {
-    seq_macro::seq!(I in 1..16 {
-        match N {
-            0 => a,
-            #(I => core::arch::aarch64::vshlq_n_u16::<I>(a),)*
-            _ => unreachable!(),
-        }
-    })
-}
-
 #[cfg(target_arch = "aarch64")]
 #[inline]
 #[target_feature(enable = "neon")]
@@ -42,21 +27,6 @@ pub fn vshlq_n_u32<const N: usize>(
     })
 }
 
-#[cfg(target_arch = "aarch64")]
-#[inline]
-#[target_feature(enable = "neon")]
-pub fn vshrq_n_u16<const N: usize>(
-    a: core::arch::aarch64::uint16x8_t,
-) -> core::arch::aarch64::uint16x8_t {
-    seq_macro::seq!(I in 1..16 {
-        match N {
-            0 => a,
-            #(I => core::arch::aarch64::vshrq_n_u16::<I>(a),)*
-            _ => unreachable!(),
-        }
-    })
-}
-
 #[cfg(target_arch = "aarch64")]
 #[inline]
 #[target_feature(enable = "neon")]
diff --git a/crates/simd/src/lib.rs b/crates/simd/src/lib.rs
index 10978e1..623ad8f 100644
--- a/crates/simd/src/lib.rs
+++ b/crates/simd/src/lib.rs
@@ -24,8 +24,6 @@
 mod bitpacking;
 mod emulate;
 
-pub mod bitpacking_u16_ordered;
-pub mod bitpacking_u16_unordered;
 pub mod bitpacking_u32_ordered;
 pub mod bitpacking_u32_unordered;
 pub mod bytepacking_u32_ordered;