Skip to content

Commit 76cdf28

Browse files
fix: Use correct byte representation for decimal hashing (#1998)
## Which issue does this PR close? - Closes #1981. ## What changes are included in this PR? The [spec](https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements) states that: >"Decimal values are hashed using the minimum number of bytes required to hold the unscaled value as a two's complement big-endian". Prior to this fix, we would incorrectly consume leading `0xFF` bytes and hash them. Now, we only consume the bytes starting with the one that is used to preserve the sign, and everything that follows it. ## Are these changes tested? Added unit tests for original scenario mentioned in the issue, as well as some additional cases
1 parent 700e62e commit 76cdf28

1 file changed

Lines changed: 39 additions & 4 deletions

File tree

crates/iceberg/src/transform/bucket.rs

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,12 +78,26 @@ impl Bucket {
7878
/// ref: https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements
7979
#[inline]
8080
fn hash_decimal(v: i128) -> i32 {
81+
if v == 0 {
82+
return Self::hash_bytes(&[0]);
83+
}
84+
8185
let bytes = v.to_be_bytes();
82-
if let Some(start) = bytes.iter().position(|&x| x != 0) {
83-
Self::hash_bytes(&bytes[start..])
86+
let start = if v > 0 {
87+
// Positive: skip 0x00 unless next byte would appear negative
88+
bytes
89+
.windows(2)
90+
.position(|w| w[0] != 0x00 || w[1] & 0x80 != 0)
91+
.unwrap_or(15)
8492
} else {
85-
Self::hash_bytes(&[0])
86-
}
93+
// Negative: skip 0xFF only if next byte stays negative
94+
bytes
95+
.windows(2)
96+
.position(|w| w[0] != 0xFF || w[1] & 0x80 == 0)
97+
.unwrap_or(15)
98+
};
99+
100+
Self::hash_bytes(&bytes[start..])
87101
}
88102

89103
/// def bucket_N(x) = (murmur3_x86_32_hash(x) & Integer.MAX_VALUE) % N
@@ -790,6 +804,27 @@ mod test {
790804
);
791805
}
792806

807+
#[test]
808+
fn test_hash_decimal_with_negative_value() {
809+
// Test cases from GitHub issue #1981
810+
assert_eq!(Bucket::hash_decimal(1), -463810133);
811+
assert_eq!(Bucket::hash_decimal(-1), -43192051);
812+
813+
// Additional test cases for edge case values
814+
assert_eq!(Bucket::hash_decimal(0), Bucket::hash_decimal(0));
815+
assert_eq!(Bucket::hash_decimal(127), Bucket::hash_decimal(127));
816+
assert_eq!(Bucket::hash_decimal(-128), Bucket::hash_decimal(-128));
817+
818+
// Test minimum representation is used
819+
// -1 should hash as [0xFF] not [0xFF, 0xFF, ..., 0xFF]
820+
// 128 should hash as [0x00, 0x80] not [0x00, 0x00, ..., 0x80]
821+
assert_eq!(Bucket::hash_decimal(128), Bucket::hash_bytes(&[0x00, 0x80]));
822+
assert_eq!(
823+
Bucket::hash_decimal(-129),
824+
Bucket::hash_bytes(&[0xFF, 0x7F])
825+
);
826+
}
827+
793828
#[test]
794829
fn test_int_literal() {
795830
let bucket = Bucket::new(10);

0 commit comments

Comments
 (0)