Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ coverage:
patch:
default:
target: 100
ignore:
- "numcodecs/tests/**"
comment:
layout: "diff, files"
behavior: default
Expand Down
27 changes: 22 additions & 5 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ jobs:
fail-fast: false
matrix:
python-version: ["3.11", "3.12", "3.13"]
# macos-13 is an intel runner, macos-14 is a arm64 runner
platform: [ubuntu-latest, windows-latest, macos-13, macos-14]
# macos-15-large is an intel runner, macos-14 is an arm64 runner
platform: [ubuntu-latest, windows-latest, macos-15-large, macos-14]

defaults:
run:
Expand All @@ -38,7 +38,7 @@ jobs:
run: conda install -y c-compiler cxx-compiler

- name: Install clang
if: matrix.platform == 'macos-13'
if: matrix.platform == 'macos-15-large'
run: conda install -y 'clang>=12.0.1,<17'

- name: Show conda environment info
Expand All @@ -52,12 +52,29 @@ jobs:
export DISABLE_NUMCODECS_AVX2=""
python -m pip install -v -e .[test,test_extras,msgpack,crc32c,pcodec,zfpy]

- name: Deduplicate LC_RPATH entries
# The conda-forge compilers inject -Wl,-rpath,$PREFIX/lib multiple
# times, producing duplicate LC_RPATH load commands. macOS 15's dyld
# rejects duplicate LC_RPATH at load time, so strip the extras.
if: runner.os == 'macOS'
run: |
rpaths() { otool -l "$1" | awk '/ LC_RPATH$/{getline; getline; print $2}'; }
for so in numcodecs/*.so; do
for rp in $(rpaths "$so" | sort | uniq -d); do
while [ "$(rpaths "$so" | grep -cx "$rp")" -gt 1 ]; do
echo "Removing duplicate LC_RPATH '$rp' from $so"
install_name_tool -delete_rpath "$rp" "$so"
done
done
done

- name: Install zarr-python
# Since zarr v3 requires numpy >= 1.25, on Python 3.11 leave it out
# so we can have some tests of our minimum version of numpy (1.24)
if: matrix.python-version != '3.11'
# TODO: remove --pre option when zarr v3 is out
run: python -m pip install --pre zarr>=3.0.0b2
# numcodecs.zarr3 in this branch targets the zarr 3.0.x API; zarr 3.1+
# moved these codecs into zarr itself and changed internal APIs.
run: python -m pip install "zarr>=3.0,<3.1"

- name: List installed packages
run: python -m pip list
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/wheel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ jobs:
strategy:
fail-fast: false
matrix:
# macos-13 is an intel runner, macos-14 is a arm64 runner
os: [ubuntu-latest, windows-latest, macos-13, macos-14]
# macos-15-large is an intel runner, macos-14 is an arm64 runner
os: [ubuntu-latest, windows-latest, macos-15-large, macos-14]
env:
CIBW_TEST_COMMAND: python -c "import numcodecs"
CIBW_BUILD: "cp311-* cp312-* cp313-*"
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@ repos:
hooks:
- id: mypy
args: [--config-file, pyproject.toml]
additional_dependencies: [numpy, pytest, crc32c, zfpy, 'zarr>=3.0.0rc1']
additional_dependencies: [numpy, pytest, crc32c, zfpy, 'zarr>=3.0.0rc1,<3.1']
4 changes: 2 additions & 2 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ submodules:
include: all

build:
os: ubuntu-20.04
os: ubuntu-24.04
tools:
python: "3.12"
jobs:
post_install:
- python -m pip install --pre 'zarr>=3.0.0b2'
- python -m pip install --pre 'zarr~=3.0.0'

sphinx:
configuration: docs/conf.py
Expand Down
6 changes: 6 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ Improvements
~~~~~~~~~~~~
* Raise a custom `UnknownCodecError` when trying to retrieve an unavailable codec.
By :user:`Cas Wognum <cwognum>`.
* Add streaming decompression for Zstandard when the frame content size is unknown.
By :user:`Mark Kittisopikul <mkitti>`, :issue:`707`.
* Fix Zstd decompression negative size issue on 32-bit platforms.
By :user:`Mark Kittisopikul <mkitti>`, :issue:`782`.
* Allow Zstandard to decompress multiple concatenated frames.
By :user:`Mark Kittisopikul <mkitti>`, :issue:`757`.

Fixes
~~~~~
Expand Down
4 changes: 2 additions & 2 deletions numcodecs/tests/test_checksum32.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,12 @@
)


@pytest.mark.parametrize(("codec", "arr"), itertools.product(codecs, arrays))
@pytest.mark.parametrize(("codec", "arr"), tuple(itertools.product(codecs, arrays)))
def test_encode_decode(codec, arr):
check_encode_decode(arr, codec)


@pytest.mark.parametrize(("codec", "arr"), itertools.product(codecs, arrays))
@pytest.mark.parametrize(("codec", "arr"), tuple(itertools.product(codecs, arrays)))
def test_errors(codec, arr):
enc = codec.encode(arr)
with pytest.raises(RuntimeError):
Expand Down
76 changes: 76 additions & 0 deletions numcodecs/tests/test_pyzstd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Check Zstd against pyzstd package

import numpy as np
import pytest
import pyzstd

from numcodecs.zstd import Zstd

# Byte payloads shared by the parametrized tests in this module.
# The random payload comes from a seeded generator so that every run (and
# every collection of the parametrized tests) sees exactly the same bytes,
# which keeps failures reproducible.
test_data = [
    b"Hello World!",
    np.arange(113).tobytes(),
    np.arange(10, 15).tobytes(),
    np.random.default_rng(0).integers(3, 50, size=(53,), dtype=np.uint16).tobytes(),
]


@pytest.mark.parametrize("input", test_data)
def test_pyzstd_simple(input):
    """
    Round-trip a single frame between numcodecs' Zstd codec and pyzstd.

    Data compressed by ``pyzstd.compress`` must be readable by ``Zstd.decode``
    and data produced by ``Zstd.encode`` must be readable by
    ``pyzstd.decompress``.
    """
    codec = Zstd()

    # pyzstd -> numcodecs
    from_pyzstd = pyzstd.compress(input)
    assert codec.decode(from_pyzstd) == input

    # numcodecs -> pyzstd
    from_codec = codec.encode(input)
    assert pyzstd.decompress(from_codec) == input


@pytest.mark.parametrize("input", test_data)
def test_pyzstd_simple_multiple_frames_decode(input):
    """
    Decode two concatenated frames that were produced by ``pyzstd.compress``.

    The decoder must keep going through every frame in the input buffer
    rather than stopping after the first one, so the result is the
    original payload repeated twice. pyzstd is checked first as the
    reference behaviour, then ``Zstd.decode``.
    """
    codec = Zstd()
    two_frames = pyzstd.compress(input) * 2
    doubled = input * 2

    assert pyzstd.decompress(two_frames) == doubled
    assert codec.decode(pyzstd.compress(input) * 2) == doubled


@pytest.mark.parametrize("input", test_data)
def test_pyzstd_simple_multiple_frames_encode(input):
    """
    Check that pyzstd decompresses two concatenated ``Zstd.encode`` frames.
    """
    codec = Zstd()
    two_frames = codec.encode(input) * 2
    assert pyzstd.decompress(two_frames) == input * 2


@pytest.mark.parametrize("input", test_data)
def test_pyzstd_streaming(input):
    """
    Test if Zstd can decode a single frame and concatenated frames in streaming
    mode where the decompressed size is not recorded in the frame header.

    The frame is produced with ``pyzstd.ZstdCompressor`` and the decoded
    result of repeated frames is compared against
    ``pyzstd.EndlessZstdDecompressor``.
    """
    pyzstd_c = pyzstd.ZstdCompressor()
    pyzstd_d = pyzstd.ZstdDecompressor()
    pyzstd_e = pyzstd.EndlessZstdDecompressor()
    z = Zstd()

    d_bytes = input
    # compress() may already hand back part of the frame; keep whatever it
    # returns ahead of the flush() output so the frame is never truncated.
    c_bytes = pyzstd_c.compress(d_bytes) + pyzstd_c.flush()
    assert z.decode(c_bytes) == d_bytes
    assert pyzstd_d.decompress(z.encode(d_bytes)) == d_bytes

    # Test multiple streaming frames
    for repeats in (2, 3, 4, 5, 7, 11, 13, 99):
        frames = c_bytes * repeats
        assert z.decode(frames) == pyzstd_e.decompress(frames), (
            f"mismatch for {repeats} concatenated frames"
        )
2 changes: 2 additions & 0 deletions numcodecs/tests/test_zarr3.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,14 @@ def test_generic_compressor(
(numcodecs.zarr3.FixedScaleOffset, {"offset": 0, "scale": 25.5}),
(numcodecs.zarr3.FixedScaleOffset, {"offset": 0, "scale": 51, "astype": "uint16"}),
(numcodecs.zarr3.AsType, {"encode_dtype": "float32", "decode_dtype": "float32"}),
(numcodecs.zarr3.AsType, {"encode_dtype": "float32"}),
],
ids=[
"delta",
"fixedscaleoffset",
"fixedscaleoffset2",
"astype",
"astype_no_decode_dtype",
],
)
def test_generic_filter(
Expand Down
102 changes: 102 additions & 0 deletions numcodecs/tests/test_zstd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import itertools
import subprocess

import numpy as np
import pytest
Expand Down Expand Up @@ -90,3 +91,104 @@ def test_native_functions():
assert Zstd.default_level() == 3
assert Zstd.min_level() == -131072
assert Zstd.max_level() == 22


def test_streaming_decompression():
    """
    Decode input frames whose frame content size is unknown.

    Covers one short frame, two consecutive frames, one long frame that
    expands to a large output, and garbage input that must raise. When
    the ``zstd`` command line tool is installed, the hard-coded frames are
    additionally cross-checked against it.
    """
    codec = Zstd()

    # Only consult the zstd command line interface when it is installed.
    have_cli = zstd_cli_available()
    if have_cli:
        view_zstd_streaming_bytes()

    # One short frame that was produced by streaming compression.
    short_frame = b'(\xb5/\xfd\x00Xa\x00\x00Hello World!'
    short_plain = b'Hello World!'
    assert codec.decode(short_frame) == short_plain
    if have_cli:
        assert short_frame == generate_zstd_streaming_bytes(short_plain)
        assert short_plain == generate_zstd_streaming_bytes(short_frame, decompress=True)

    # The same frame twice in a row.
    double_frame = short_frame * 2
    double_plain = b'Hello World!Hello World!'
    assert codec.decode(double_frame) == double_plain
    if have_cli:
        assert double_plain == generate_zstd_streaming_bytes(double_frame, decompress=True)

    # One long frame whose decompressed output is large.
    long_frame = b'(\xb5/\xfd\x00X$\x02\x00\xa4\x03ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz\x01\x00:\xfc\xdfs\x05\x05L\x00\x00\x08s\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08k\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08c\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08[\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08S\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08K\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08C\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08u\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08m\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08e\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08]\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08U\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08M\x01\x00\xfc\xff9\x10\x02M\x00\x00\x08E\x01\x00\xfc\x7f\x1d\x08\x01'
    long_plain = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz' * 1024 * 32
    assert codec.decode(long_frame) == long_plain
    if have_cli:
        assert long_frame == generate_zstd_streaming_bytes(long_plain)
        assert long_plain == generate_zstd_streaming_bytes(long_frame, decompress=True)

    # Eight zero bytes are not a zstd frame and must be rejected.
    garbage = b'\x00' * 8
    with pytest.raises(RuntimeError, match='Zstd decompression error: invalid input data'):
        codec.decode(garbage)


def test_multi_frame():
    """
    Decode buffers made of several concatenated frames.

    Exercises a repeated frame, two different frames produced by
    ``Zstd.encode``, and mixtures of those with a hard-coded frame of
    unknown size placed at the end, the beginning and the middle.
    """
    codec = Zstd()

    # The same encoded frame, once and twice.
    greeting = codec.encode(b"Hello world!")
    assert codec.decode(greeting) == b"Hello world!"
    assert codec.decode(greeting * 2) == b"Hello world!Hello world!"

    # Two different frames, individually and back to back.
    hola = codec.encode(b"Hola ")
    mundo = codec.encode(b"Mundo!")
    assert codec.decode(hola) == b"Hola "
    assert codec.decode(mundo) == b"Mundo!"
    assert codec.decode(hola + mundo) == b"Hola Mundo!"

    # Hard-coded frame of unknown size, cross-checked with the CLI if present.
    unsized_frame = b'(\xb5/\xfd\x00Xa\x00\x00Hello World!'
    unsized_plain = b'Hello World!'
    assert codec.decode(unsized_frame) == unsized_plain
    if zstd_cli_available():
        assert unsized_frame == generate_zstd_streaming_bytes(unsized_plain)
        assert unsized_plain == generate_zstd_streaming_bytes(unsized_frame, decompress=True)

    # Concatenate frames of known sizes and unknown sizes
    # unknown size frame at the end
    assert codec.decode(hola + mundo + unsized_frame) == b"Hola Mundo!Hello World!"
    # unknown size frame at the beginning
    assert codec.decode(unsized_frame + hola + mundo) == b"Hello World!Hola Mundo!"
    # unknown size frame in the middle
    assert codec.decode(hola + unsized_frame + mundo) == b"Hola Hello World!Mundo!"


def generate_zstd_streaming_bytes(input: bytes, *, decompress: bool = False) -> bytes:
    """
    Pipe ``input`` through the zstd command line interface in streaming mode.

    The tool is run as ``zstd --no-check`` with ``input`` on stdin; when
    ``decompress`` is true, ``-d`` is appended so the tool decompresses
    instead of compressing. Whatever the tool writes to stdout is returned.
    The exit status is not checked.
    """
    command = ["zstd", "--no-check"]
    if decompress:
        command.append("-d")

    completed = subprocess.run(command, input=input, capture_output=True)
    return completed.stdout


def view_zstd_streaming_bytes():
    """
    Print the frames the zstd command line interface produces for two
    sample payloads, labelled ``bytes_val`` and ``bytes3``.
    """
    samples = (
        ("bytes_val", b"Hello world!"),
        ("bytes3", b"ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz" * 1024 * 32),
    )
    # Compress and print one sample at a time, in the listed order.
    for label, payload in samples:
        frame = generate_zstd_streaming_bytes(payload)
        print(f" {label} = {frame}")


def zstd_cli_available() -> bool:
    """
    Report whether the ``zstd`` command line tool can be run.

    Runs ``zstd -V`` with its output discarded.

    Returns
    -------
    bool
        True if the command ran and exited with status 0; False if it
        exited with a non-zero status or could not be started at all
        (for example because no ``zstd`` executable is on ``PATH``).
    """
    try:
        probe = subprocess.run(
            ["zstd", "-V"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
        )
    except OSError:
        # subprocess.run raises FileNotFoundError / PermissionError (both
        # OSError subclasses) when the executable is missing or not runnable.
        # For a probe, that simply means "not available".
        return False
    return probe.returncode == 0
Comment on lines +191 to +194
4 changes: 2 additions & 2 deletions numcodecs/zarr3.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def __init__(self, **codec_config: dict[str, JSON]) -> None:

def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
if astype := self.codec_config.get("astype"):
return replace(chunk_spec, dtype=np.dtype(astype)) # type: ignore[arg-type]
return replace(chunk_spec, dtype=np.dtype(astype)) # type: ignore[call-overload]
return chunk_spec


Expand All @@ -304,7 +304,7 @@ def __init__(self, **codec_config: JSON) -> None:

def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
if astype := self.codec_config.get("astype"):
return replace(chunk_spec, dtype=np.dtype(astype)) # type: ignore[arg-type]
return replace(chunk_spec, dtype=np.dtype(astype)) # type: ignore[call-overload]
return chunk_spec

def evolve_from_array_spec(self, array_spec: ArraySpec) -> FixedScaleOffset:
Expand Down
Loading
Loading