diff --git a/docs/content/concepts/spec/fileformat.md b/docs/content/concepts/spec/fileformat.md index 27d4f63d8182..ec4a133f6835 100644 --- a/docs/content/concepts/spec/fileformat.md +++ b/docs/content/concepts/spec/fileformat.md @@ -791,3 +791,11 @@ Limitations: 3. Statistics collection is not supported for BLOB columns. For usage details, configuration options, and examples, see [Blob Type]({{< ref "append-table/blob" >}}). + +## MOSAIC + +Mosaic is a columnar-bucket hybrid format optimized for wide tables (10,000+ columns). Columns are hashed into buckets +by name, stored column-oriented within each bucket, and independently compressed. This enables efficient projection pushdown +at bucket granularity — reading 10 columns out of 10,000 only decompresses the buckets that contain those 10 columns. + +For the detailed file format specification, see [Mosaic File Format]({{< ref "concepts/spec/mosaic" >}}). diff --git a/docs/content/concepts/spec/mosaic.md b/docs/content/concepts/spec/mosaic.md new file mode 100644 index 000000000000..6c5169a3b0c3 --- /dev/null +++ b/docs/content/concepts/spec/mosaic.md @@ -0,0 +1,313 @@ +--- +title: "Mosaic" +weight: 9 +type: docs +aliases: +- /concepts/spec/mosaic.html +--- + + +# Mosaic File Format + +Mosaic is a columnar-bucket hybrid format optimized for wide tables (10,000+ columns). Columns are hashed into buckets +by name, stored column-oriented within each bucket, and independently compressed. This enables efficient projection +pushdown at bucket granularity — reading 10 columns out of 10,000 only decompresses the buckets that contain those +10 columns. + +## File Layout + +``` ++--------------------------------------------+ +| Row Group 0: Bucket Data | +| [Bucket 0 compressed block] | +| [Bucket 3 compressed block] | +| ... (only non-empty buckets) | ++--------------------------------------------+ +| Row Group 1: Bucket Data | +| ... 
| ++--------------------------------------------+ +| Schema Block | +| [4 bytes: uncompressed size (BE int)] | +| [schema data (possibly compressed)] | ++--------------------------------------------+ +| Row Group Index (varint encoded) | ++--------------------------------------------+ +| Footer (32 bytes, fixed) | ++--------------------------------------------+ +``` + +## Footer (32 bytes, big-endian) + +| Offset | Size | Field | Description | +|--------|------|-------------------|------------------------------------| +| 0 | 8 | indexOffset | Absolute offset of Row Group Index | +| 8 | 8 | schemaBlockOffset | Absolute offset of Schema Block | +| 16 | 4 | numBuckets | Total number of buckets | +| 20 | 4 | numRowGroups | Total number of row groups | +| 24 | 1 | compression | 0 = none, 1 = zstd | +| 25 | 1 | version | Format version (currently 1) | +| 26 | 2 | (reserved) | Padding, set to 0 | +| 28 | 4 | magic | `MOSA` (0x4D4F5341) | + +## Row Group Index + +Varint-encoded, only non-empty buckets are stored. For each row group: + +``` +varint numRows +varint nonEmptyCount +repeated nonEmptyCount times: + varint bucketId + 8 bytes bucketOffset (big-endian, absolute file offset) + varint compressedSize + varint uncompressedSize +``` + +## Schema Block + +Prefixed with a 4-byte big-endian int (uncompressed size), followed by the schema data (compressed with the file's +compression method). + +Column names are stored using **front coding** (incremental encoding): each name shares a prefix with the previous name, +and only the suffix is stored. This is the same technique used by Lucene, LevelDB, and RocksDB for their block index +entries. 
+ +``` +varint numColumns +varint numBuckets +repeated numColumns times: + varint fieldId + varint bucketId + varint indexInBucket + varint sharedPrefixLen (bytes shared with previous column name) + varint suffixLen (bytes of new suffix) + bytes suffix (UTF-8) (suffixLen bytes) + TypeDescriptor +``` + +The first column has `sharedPrefixLen = 0`. To reconstruct a column name, take the first `sharedPrefixLen` bytes from +the previous name and append the suffix. + +### TypeDescriptor + +``` +1 byte typeId +1 byte nullable (0 = not null, 1 = nullable) +[type-specific params] +``` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| typeId | Type          | Params                             |
|--------|---------------|------------------------------------|
| 0      | BOOLEAN       | (none)                             |
| 1      | TINYINT       | (none)                             |
| 2      | SMALLINT      | (none)                             |
| 3      | INTEGER       | (none)                             |
| 4      | BIGINT        | (none)                             |
| 5      | FLOAT         | (none)                             |
| 6      | DOUBLE        | (none)                             |
| 7      | DATE          | (none)                             |
| 8      | CHAR          | varint length                      |
| 9      | VARCHAR       | varint length                      |
| 10     | STRING        | (none) — VARCHAR with MAX_LENGTH   |
| 11     | BINARY        | varint length                      |
| 12     | VARBINARY     | varint length                      |
| 13     | BYTES         | (none) — VARBINARY with MAX_LENGTH |
| 14     | DECIMAL       | varint precision, varint scale     |
| 15     | TIME          | varint precision                   |
| 16     | TIMESTAMP     | varint precision                   |
| 17     | TIMESTAMP_LTZ | varint precision                   |
+ +Complex types (ARRAY, MAP, ROW, etc.), VARIANT, and BLOB are not supported. + +## Bucket Data + +Each bucket is stored as a **column-oriented** block. Within a bucket, each column is independently encoded using one +of four encodings (PLAIN, CONST, DICT, or ALL_NULL), chosen automatically based on the column's value distribution. + +### Bucket Block Layout (before compression) + +``` ++--------------------------------------------+ +| Encoding Flags | +| 2 bits per column, packed into bytes | ++--------------------------------------------+ +| Has-Nulls Flags | +| 1 bit per column, packed into bytes | ++--------------------------------------------+ +| Const Metadata (CONST columns only) | +| serialized value for each CONST column | ++--------------------------------------------+ +| Dict Metadata (DICT columns only) | +| for each DICT column: | +| varint numEntries | +| repeated: serialized value per entry | ++--------------------------------------------+ +| Null Bitmaps | +| ceil(numRows/8) bytes per column | +| (only for columns with nulls, | +| excluding ALL_NULL columns) | ++--------------------------------------------+ +| Column Data | +| PLAIN: raw serialized values | +| DICT: 1-byte index per non-null cell | +| CONST/ALL_NULL: (nothing) | ++--------------------------------------------+ +``` + +**Encoding Flags**: 2 bits per column, packed left-to-right. Encoding values: + +| Value | Encoding | Description | +|-------|----------|-------------| +| 0 | PLAIN | Raw serialized values for each non-null cell | +| 1 | CONST | All non-null values are identical; the single value is stored in metadata | +| 2 | DICT | 2-255 distinct values; each non-null cell stores a 1-byte dictionary index | +| 3 | ALL_NULL | Every cell in this column is null; no data or null bitmap stored | + +**Has-Nulls Flags**: 1 bit per column. If set, a null bitmap exists for that column. ALL_NULL columns always have +this flag cleared (no bitmap is stored for them). 
+ +**Null Bitmap**: `ceil(numRows / 8)` bytes per column. Bit `i` = 1 means row `i` is null. Only present for columns +where has-nulls flag is set. + +### Column Encoding Selection + +The encoding for each column is chosen automatically during writing based on value distribution and cost: + +- **ALL_NULL**: 0 non-null values +- **CONST**: exactly 1 distinct non-null value (any number of nulls allowed) +- **DICT**: 2-255 distinct non-null values, **and** the dictionary-encoded size is smaller than plain — the writer + compares `varint(numEntries) + sum(entryBytes) + nonNullCount` against the raw value buffer size +- **PLAIN**: 256+ distinct values, dict tracking was abandoned, or dict encoding would be larger than plain + +CONST detection is independent of dictionary tracking — it uses a lightweight byte comparison against the first non-null +value, so it works for all types and value sizes (including long strings). + +Dictionary encoding works for all data types including variable-width types (VARCHAR, VARBINARY, DECIMAL). The writer +uses primitive long keys for fixed-width types (≤8 bytes) and byte-array keys for variable-width types. Variable-width +dictionary tracking is bounded by a cumulative byte budget and abandoned when cardinality exceeds 255 or total dictionary +entry bytes exceed the budget. + +Dictionary indices are limited to 1 byte (max 255 entries). This is a deliberate simplicity trade-off for the first +version — columns with 256+ distinct values fall back to PLAIN encoding. + +## Value Serialization + +Values are serialized in the same format for PLAIN data, CONST metadata, and DICT entries: + + + + + + + + + + + + + + + + + + + + + + + +
| Type                               | Encoding                                           |
|------------------------------------|----------------------------------------------------|
| BOOLEAN                            | 1 byte (0 or 1)                                    |
| TINYINT                            | 1 byte                                             |
| SMALLINT                           | 2 bytes big-endian                                 |
| INTEGER / DATE / TIME              | 4 bytes big-endian                                 |
| BIGINT                             | 8 bytes big-endian                                 |
| FLOAT                              | 4 bytes IEEE 754 (big-endian)                      |
| DOUBLE                             | 8 bytes IEEE 754 (big-endian)                      |
| DECIMAL (compact, precision ≤ 18)  | 8 bytes big-endian (unscaled long)                 |
| DECIMAL (large, precision > 18)    | varint length + unscaled BigInteger bytes          |
| TIMESTAMP (precision ≤ 3)          | 8 bytes (epoch millis, big-endian)                 |
| TIMESTAMP (precision > 3)          | 8 bytes (epoch millis) + 4 bytes (nanos of millis) |
| CHAR / VARCHAR / STRING            | varint length + UTF-8 bytes                        |
| BINARY / VARBINARY / BYTES         | varint length + raw bytes                          |
+ +## ALL_NULL Column Pruning + +For single-row-group files (the common case with small files), columns where every value is null are pruned from both +the schema and bucket data. This reduces schema size for wide sparse tables where many columns are entirely null. + +- The writer detects ALL_NULL columns after buffering all rows +- ALL_NULL columns are removed from the encoding/null flags in bucket data +- ALL_NULL columns are removed from the schema block +- The reader treats any projected column not found in the schema as all-null (returns null for every row) + +This optimization only applies to single-row-group files. Multi-row-group files retain all columns because a column may +be ALL_NULL in one row group but have values in another. + +## Column-to-Bucket Assignment + +Columns are assigned to buckets by hashing the column name: + +``` +bucketId = Math.floorMod(fieldName.hashCode(), numBuckets) +``` + +Default number of buckets: `min(100, numColumns)`. + +## Compression + +Compression is applied independently to each bucket data block and to the schema block. Supported methods: + +- `0` — No compression +- `1` — Zstd (configurable level) + +## Benchmark + +Test setup: 10,000 columns (90% STRING, 10% INT), column names ~80 bytes each, Zstd compression (level 9). 
+ +**File Size (10 rows):** + +| Format | Size | vs Mosaic | +|---------|------------|-----------| +| Parquet | 9,696 KB | 14.8x | +| ORC | 6,377 KB | 9.7x | +| Mosaic | 654 KB | 1x | + +**Projection Read (500 rows):** + +| Projected Columns | Parquet | ORC | Mosaic | +|-------------------|------------|------------|-----------| +| 10 / 10,000 | 53,170 us | 72,729 us | 25,081 us | +| 1 / 10,000 | 50,919 us | 70,712 us | 2,374 us | + +File size — Parquet: 57.4 MB, ORC: 95.4 MB, Mosaic: 11.5 MB + +**Projection Read (4,500 rows, ~458 MB Parquet):** + +| Projected Columns | Parquet | ORC | Mosaic | +|-------------------|-------------|------------|------------| +| 10 / 10,000 | 369,627 us | 89,344 us | 67,314 us | +| 1 / 10,000 | 360,458 us | 81,934 us | 26,924 us | + +File size — Parquet: 458.4 MB, ORC: 827.9 MB, Mosaic: 100.2 MB + +When projecting a small subset of columns, Mosaic only decompresses the buckets containing the requested columns, +avoiding I/O on the remaining data. + +## Limitations + +1. Complex types (ARRAY, MAP, MULTISET, ROW) are not supported. +2. Mosaic format is designed for wide tables and may not be efficient for narrow tables with few columns. diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicBucketReader.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicBucketReader.java new file mode 100644 index 000000000000..0a614f5552b4 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicBucketReader.java @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.Decimal; +import org.apache.paimon.data.Timestamp; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.DecimalType; + +import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_ALL_NULL; +import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_CONST; +import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_DICT; +import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_PLAIN; + +/** + * Columnar bucket reader for the Mosaic v2 format. Reads column-oriented data with + * CONST/DICT/PLAIN/ALL_NULL encoding. 
+ */ +public class MosaicBucketReader { + + private final DataType[] allColumnTypes; + private final int[] localToOutputMapping; + private final int numColumnsInBucket; + + // Per-column state set during init() + private byte[] encodings; + private boolean[] hasNulls; + private byte[][] nullBitmaps; + private Object[] constValues; + private Object[][] dictValues; + private int[] dataCursors; + private byte[] data; + private int numRows; + private int currentRow; + + public MosaicBucketReader(DataType[] allColumnTypes, int[] localToOutputMapping) { + this.allColumnTypes = allColumnTypes; + this.localToOutputMapping = localToOutputMapping; + this.numColumnsInBucket = allColumnTypes.length; + } + + public void init(byte[] data, int numRows) { + this.data = data; + this.numRows = numRows; + this.currentRow = 0; + + this.encodings = new byte[numColumnsInBucket]; + this.hasNulls = new boolean[numColumnsInBucket]; + this.nullBitmaps = new byte[numColumnsInBucket][]; + this.constValues = new Object[numColumnsInBucket]; + this.dictValues = new Object[numColumnsInBucket][]; + this.dataCursors = new int[numColumnsInBucket]; + + int pos = 0; + + // 1. Read encoding flags (2 bits per column) + int encodingFlagsBytes = (numColumnsInBucket * 2 + 7) / 8; + for (int i = 0; i < numColumnsInBucket; i++) { + int byteIdx = (i * 2) / 8; + int bitIdx = (i * 2) % 8; + encodings[i] = (byte) ((data[pos + byteIdx] >>> bitIdx) & 0x03); + } + pos += encodingFlagsBytes; + + // 2. Read has-nulls flags (1 bit per column) + int hasNullsFlagsBytes = (numColumnsInBucket + 7) / 8; + for (int i = 0; i < numColumnsInBucket; i++) { + hasNulls[i] = (data[pos + i / 8] & (1 << (i % 8))) != 0; + } + pos += hasNullsFlagsBytes; + + // 3. 
Read const metadata + for (int i = 0; i < numColumnsInBucket; i++) { + if (encodings[i] == ENCODING_CONST) { + int w = MosaicBucketWriter.getFixedWidth(allColumnTypes[i]); + if (w > 0) { + constValues[i] = readTypedValue(allColumnTypes[i], data, pos, w); + pos += w; + } else { + constValues[i] = readVariableValue(allColumnTypes[i], data, pos); + int len = readVarint(data, pos); + pos += varintSize(len) + len; + } + } + } + + // 4. Read dict metadata + for (int i = 0; i < numColumnsInBucket; i++) { + if (encodings[i] == ENCODING_DICT) { + int numEntries = readVarint(data, pos); + pos += varintSize(numEntries); + int w = MosaicBucketWriter.getFixedWidth(allColumnTypes[i]); + Object[] entries = new Object[numEntries]; + for (int j = 0; j < numEntries; j++) { + if (w > 0) { + entries[j] = readTypedValue(allColumnTypes[i], data, pos, w); + pos += w; + } else { + entries[j] = readVariableValue(allColumnTypes[i], data, pos); + int len = readVarint(data, pos); + pos += varintSize(len) + len; + } + } + dictValues[i] = entries; + } + } + + // 5. Read null bitmaps + int nullBitmapSize = (numRows + 7) / 8; + for (int i = 0; i < numColumnsInBucket; i++) { + if (hasNulls[i] && encodings[i] != ENCODING_ALL_NULL) { + nullBitmaps[i] = new byte[nullBitmapSize]; + System.arraycopy(data, pos, nullBitmaps[i], 0, nullBitmapSize); + pos += nullBitmapSize; + } + } + + // 6. 
Record column data start offsets + for (int i = 0; i < numColumnsInBucket; i++) { + dataCursors[i] = pos; + if (encodings[i] == ENCODING_PLAIN) { + // Skip past all plain data for this column to find next column's offset + int w = MosaicBucketWriter.getFixedWidth(allColumnTypes[i]); + if (w > 0) { + int nonNullCount = countNonNull(i); + pos += nonNullCount * w; + } else { + // Variable-width: scan through + int nonNullCount = countNonNull(i); + for (int j = 0; j < nonNullCount; j++) { + int len = readVarint(data, pos); + pos += varintSize(len) + len; + } + } + } else if (encodings[i] == ENCODING_DICT) { + int nonNullCount = countNonNull(i); + pos += nonNullCount; // 1 byte per non-null cell + } + // CONST and ALL_NULL: no data to skip + } + } + + public void readRow(Object[] outputFields) { + for (int i = 0; i < numColumnsInBucket; i++) { + int outputPos = localToOutputMapping[i]; + + if (encodings[i] == ENCODING_ALL_NULL) { + if (outputPos >= 0) { + outputFields[outputPos] = null; + } + continue; + } + + boolean isNull = + hasNulls[i] && (nullBitmaps[i][currentRow / 8] & (1 << (currentRow % 8))) != 0; + + if (isNull) { + if (outputPos >= 0) { + outputFields[outputPos] = null; + } + continue; + } + + // Non-null value + switch (encodings[i]) { + case ENCODING_CONST: + if (outputPos >= 0) { + outputFields[outputPos] = constValues[i]; + } + break; + case ENCODING_DICT: + { + int idx = data[dataCursors[i]++] & 0xFF; + if (outputPos >= 0) { + outputFields[outputPos] = dictValues[i][idx]; + } + break; + } + case ENCODING_PLAIN: + { + int w = MosaicBucketWriter.getFixedWidth(allColumnTypes[i]); + if (outputPos >= 0) { + if (w > 0) { + outputFields[outputPos] = + readTypedValue(allColumnTypes[i], data, dataCursors[i], w); + } else { + outputFields[outputPos] = + readVariableValue(allColumnTypes[i], data, dataCursors[i]); + } + } + // Advance cursor + if (w > 0) { + dataCursors[i] += w; + } else { + int len = readVarint(data, dataCursors[i]); + dataCursors[i] += 
varintSize(len) + len; + } + break; + } + default: + break; + } + } + currentRow++; + } + + // ======================== Value reading ======================== + + private static Object readTypedValue(DataType type, byte[] buf, int pos, int width) { + switch (type.getTypeRoot()) { + case BOOLEAN: + return buf[pos] != 0; + case TINYINT: + return buf[pos]; + case SMALLINT: + return (short) ((buf[pos] << 8) | (buf[pos + 1] & 0xFF)); + case INTEGER: + case DATE: + case TIME_WITHOUT_TIME_ZONE: + return readInt(buf, pos); + case BIGINT: + return readLong(buf, pos); + case FLOAT: + return Float.intBitsToFloat(readInt(buf, pos)); + case DOUBLE: + return Double.longBitsToDouble(readLong(buf, pos)); + case DECIMAL: + { + DecimalType dt = (DecimalType) type; + return Decimal.fromUnscaledLong( + readLong(buf, pos), dt.getPrecision(), dt.getScale()); + } + case TIMESTAMP_WITHOUT_TIME_ZONE: + { + long millis = readLong(buf, pos); + if (width == 12) { + int nanos = readInt(buf, pos + 8); + return Timestamp.fromEpochMillis(millis, nanos); + } + return Timestamp.fromEpochMillis(millis); + } + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + { + long millis = readLong(buf, pos); + if (width == 12) { + int nanos = readInt(buf, pos + 8); + return Timestamp.fromEpochMillis(millis, nanos); + } + return Timestamp.fromEpochMillis(millis); + } + default: + throw new UnsupportedOperationException("Unsupported fixed type: " + type); + } + } + + private static Object readVariableValue(DataType type, byte[] buf, int pos) { + int len = readVarint(buf, pos); + int dataStart = pos + varintSize(len); + switch (type.getTypeRoot()) { + case CHAR: + case VARCHAR: + return BinaryString.fromBytes(buf, dataStart, len); + case BINARY: + case VARBINARY: + { + byte[] bytes = new byte[len]; + System.arraycopy(buf, dataStart, bytes, 0, len); + return bytes; + } + case DECIMAL: + { + DecimalType dt = (DecimalType) type; + byte[] bytes = new byte[len]; + System.arraycopy(buf, dataStart, bytes, 0, len); + return 
Decimal.fromUnscaledBytes(bytes, dt.getPrecision(), dt.getScale()); + } + default: + throw new UnsupportedOperationException("Unsupported variable type: " + type); + } + } + + // ======================== Helpers ======================== + + private int countNonNull(int colIdx) { + if (!hasNulls[colIdx]) { + return numRows; + } + if (encodings[colIdx] == ENCODING_ALL_NULL) { + return 0; + } + int count = 0; + int fullBytes = numRows / 8; + for (int b = 0; b < fullBytes; b++) { + count += Integer.bitCount(nullBitmaps[colIdx][b] & 0xFF); + } + int remaining = numRows % 8; + if (remaining > 0) { + int mask = (1 << remaining) - 1; + count += Integer.bitCount(nullBitmaps[colIdx][fullBytes] & mask); + } + return numRows - count; + } + + private static int readInt(byte[] buf, int pos) { + return ((buf[pos] & 0xFF) << 24) + | ((buf[pos + 1] & 0xFF) << 16) + | ((buf[pos + 2] & 0xFF) << 8) + | (buf[pos + 3] & 0xFF); + } + + private static long readLong(byte[] buf, int pos) { + return ((long) (buf[pos] & 0xFF) << 56) + | ((long) (buf[pos + 1] & 0xFF) << 48) + | ((long) (buf[pos + 2] & 0xFF) << 40) + | ((long) (buf[pos + 3] & 0xFF) << 32) + | ((long) (buf[pos + 4] & 0xFF) << 24) + | ((long) (buf[pos + 5] & 0xFF) << 16) + | ((long) (buf[pos + 6] & 0xFF) << 8) + | (buf[pos + 7] & 0xFF); + } + + private static int readVarint(byte[] buf, int pos) { + int value = 0; + int shift = 0; + int b; + do { + b = buf[pos++] & 0xFF; + value |= (b & 0x7F) << shift; + shift += 7; + } while ((b & 0x80) != 0); + return value; + } + + private static int varintSize(int value) { + int size = 1; + while ((value & ~0x7F) != 0) { + size++; + value >>>= 7; + } + return size; + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicBucketWriter.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicBucketWriter.java new file mode 100644 index 000000000000..fce7ebdc6edf --- /dev/null +++ 
b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicBucketWriter.java @@ -0,0 +1,713 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.Decimal; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.Timestamp; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.DecimalType; +import org.apache.paimon.types.LocalZonedTimestampType; +import org.apache.paimon.types.RowType; +import org.apache.paimon.types.TimestampType; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_ALL_NULL; +import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_CONST; +import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_DICT; +import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_PLAIN; +import static org.apache.paimon.format.mosaic.MosaicUtils.writeVarint; + +/** + * Columnar bucket writer for the Mosaic format. Buffers values per-column and produces a + * column-oriented byte array with CONST/DICT/PLAIN/ALL_NULL encoding per column. 
+ * + *

CONST detection uses a lightweight byte-comparison tracker that works for all types and value + * sizes, independent of dictionary tracking. Dictionary tracking uses primitive long keys for + * fixed-width types (≤8 bytes) and byte-array keys for variable-width types. Variable-width dict + * tracking is bounded by a cumulative byte budget ({@link #MAX_DICT_TOTAL_BYTES}). DICT encoding is + * chosen only when it produces fewer bytes than PLAIN (cost-based selection). + */ +public class MosaicBucketWriter { + + private static final int MAX_DICT_TOTAL_BYTES = 16384; + + private final InternalRow.FieldGetter[] fieldGetters; + private final int numColumns; + private final int[] fixedWidths; + private final boolean[] isVariableWidth; + + // Per-column buffers + private byte[][] nullBitmaps; + private byte[][] valueBuffers; + private int[] valueBufPos; + private int[] nonNullCounts; + + // CONST tracking: byte comparison against first non-null value (works for any size) + private boolean[] constTracking; + private int[] firstValueLen; + + // Fixed-width ≤8 bytes: primitive long-based dict tracking + private Map[] longDictMaps; + // Variable-width and width>8: byte-array-based dict tracking with cumulative budget + private Map[] byteDictMaps; + private int[] dictTotalBytes; + + private int numRows; + + public MosaicBucketWriter(RowType fullRowType, int[] globalColumnIndices) { + this.numColumns = globalColumnIndices.length; + this.fieldGetters = new InternalRow.FieldGetter[numColumns]; + this.fixedWidths = new int[numColumns]; + this.isVariableWidth = new boolean[numColumns]; + + for (int i = 0; i < numColumns; i++) { + int globalIdx = globalColumnIndices[i]; + DataType type = fullRowType.getTypeAt(globalIdx); + fieldGetters[i] = InternalRow.createFieldGetter(type, globalIdx); + fixedWidths[i] = getFixedWidth(type); + isVariableWidth[i] = fixedWidths[i] < 0; + } + + initBuffers(); + } + + @SuppressWarnings("unchecked") + private void initBuffers() { + this.nullBitmaps = 
new byte[numColumns][]; + this.valueBuffers = new byte[numColumns][]; + this.valueBufPos = new int[numColumns]; + this.nonNullCounts = new int[numColumns]; + this.constTracking = new boolean[numColumns]; + this.firstValueLen = new int[numColumns]; + this.longDictMaps = new Map[numColumns]; + this.byteDictMaps = new Map[numColumns]; + this.dictTotalBytes = new int[numColumns]; + + for (int i = 0; i < numColumns; i++) { + nullBitmaps[i] = new byte[128]; + valueBuffers[i] = new byte[1024]; + constTracking[i] = true; + if (usesLongDict(i)) { + longDictMaps[i] = new HashMap<>(); + } else { + byteDictMaps[i] = new HashMap<>(); + } + } + this.numRows = 0; + } + + private boolean usesLongDict(int colIdx) { + return fixedWidths[colIdx] > 0 && fixedWidths[colIdx] <= 8; + } + + public boolean isEmpty() { + return numRows == 0; + } + + public int writeRow(InternalRow row) { + int bitmapIdx = numRows / 8; + + int totalSize = 0; + for (int i = 0; i < numColumns; i++) { + // Ensure null bitmap capacity + if (bitmapIdx >= nullBitmaps[i].length) { + byte[] newBm = new byte[nullBitmaps[i].length * 2]; + System.arraycopy(nullBitmaps[i], 0, newBm, 0, nullBitmaps[i].length); + nullBitmaps[i] = newBm; + } + + Object value = fieldGetters[i].getFieldOrNull(row); + if (value == null) { + nullBitmaps[i][bitmapIdx] |= (byte) (1 << (numRows % 8)); + } else { + nonNullCounts[i]++; + int before = valueBufPos[i]; + writeValue(i, value); + int written = valueBufPos[i] - before; + totalSize += written; + + // CONST tracking: compare against first non-null value + if (constTracking[i]) { + if (nonNullCounts[i] == 1) { + firstValueLen[i] = written; + } else if (written != firstValueLen[i] + || !regionEquals(valueBuffers[i], 0, before, written)) { + constTracking[i] = false; + } + } + + // Dict tracking (separate from CONST) + if (longDictMaps[i] != null) { + long key = extractFixedKey(valueBuffers[i], before, fixedWidths[i]); + longDictMaps[i].putIfAbsent(key, longDictMaps[i].size()); + if 
(longDictMaps[i].size() > 255) { + longDictMaps[i] = null; + } + } else if (byteDictMaps[i] != null) { + ByteKey key = new ByteKey(valueBuffers[i], before, written); + int sizeBefore = byteDictMaps[i].size(); + byteDictMaps[i].putIfAbsent(key, sizeBefore); + if (byteDictMaps[i].size() > sizeBefore) { + dictTotalBytes[i] += written; + } + if (byteDictMaps[i].size() > 255 || dictTotalBytes[i] > MAX_DICT_TOTAL_BYTES) { + byteDictMaps[i] = null; + } + } + } + } + numRows++; + // Include null bitmap overhead (~1 bit per column per row) + totalSize += (numColumns + 7) / 8; + return totalSize; + } + + public byte[] finish() { + return finish(false); + } + + public byte[] finish(boolean pruneAllNull) { + if (numRows == 0) { + return new byte[0]; + } + + // 1. Determine encoding per column + byte[] encodings = new byte[numColumns]; + boolean[] hasNulls = new boolean[numColumns]; + + for (int i = 0; i < numColumns; i++) { + if (nonNullCounts[i] == 0) { + encodings[i] = ENCODING_ALL_NULL; + hasNulls[i] = false; + } else if (constTracking[i]) { + encodings[i] = ENCODING_CONST; + hasNulls[i] = nonNullCounts[i] < numRows; + } else { + int dictSize = getDictSize(i); + if (dictSize >= 2 && dictSize <= 255 && dictEncodedSize(i) < valueBufPos[i]) { + encodings[i] = ENCODING_DICT; + } else { + encodings[i] = ENCODING_PLAIN; + } + hasNulls[i] = nonNullCounts[i] < numRows; + } + } + + // Count output columns (skip ALL_NULL when pruning) + int numOutputCols = numColumns; + if (pruneAllNull) { + numOutputCols = 0; + for (int i = 0; i < numColumns; i++) { + if (encodings[i] != ENCODING_ALL_NULL) { + numOutputCols++; + } + } + } + + // 2. Compute exact output size + byte[] out = computeOutBuffer(numOutputCols, encodings, hasNulls); + int pos = 0; + + // 2a. 
Encoding flags: 2 bits per output column + int encodingFlagsBytes = (numOutputCols * 2 + 7) / 8; + int outputIdx = 0; + for (int i = 0; i < numColumns; i++) { + if (pruneAllNull && encodings[i] == ENCODING_ALL_NULL) { + continue; + } + int byteIdx = (outputIdx * 2) / 8; + int bitIdx = (outputIdx * 2) % 8; + out[pos + byteIdx] |= (byte) (encodings[i] << bitIdx); + outputIdx++; + } + pos += encodingFlagsBytes; + + // 2b. Has-nulls flags: 1 bit per output column + int hasNullsFlagsBytes = (numOutputCols + 7) / 8; + outputIdx = 0; + for (int i = 0; i < numColumns; i++) { + if (pruneAllNull && encodings[i] == ENCODING_ALL_NULL) { + continue; + } + if (hasNulls[i]) { + out[pos + outputIdx / 8] |= (byte) (1 << (outputIdx % 8)); + } + outputIdx++; + } + pos += hasNullsFlagsBytes; + + // 2c. Const metadata — first non-null value from value buffer + for (int i = 0; i < numColumns; i++) { + if (encodings[i] == ENCODING_CONST) { + System.arraycopy(valueBuffers[i], 0, out, pos, firstValueLen[i]); + pos += firstValueLen[i]; + } + } + + // 2d. Dict metadata + for (int i = 0; i < numColumns; i++) { + if (encodings[i] == ENCODING_DICT) { + if (longDictMaps[i] != null) { + int numEntries = longDictMaps[i].size(); + pos = writeVarint(out, pos, numEntries); + int w = fixedWidths[i]; + long[] keys = new long[numEntries]; + for (Map.Entry e : longDictMaps[i].entrySet()) { + keys[e.getValue()] = e.getKey(); + } + for (int j = 0; j < numEntries; j++) { + pos = writeFixedKey(out, pos, keys[j], w); + } + } else { + int numEntries = byteDictMaps[i].size(); + pos = writeVarint(out, pos, numEntries); + ByteKey[] keys = new ByteKey[numEntries]; + for (Map.Entry e : byteDictMaps[i].entrySet()) { + keys[e.getValue()] = e.getKey(); + } + for (int j = 0; j < numEntries; j++) { + System.arraycopy(keys[j].data, 0, out, pos, keys[j].data.length); + pos += keys[j].data.length; + } + } + } + } + + // 2e. 
Null bitmaps (only for cols with nulls and not ALL_NULL) + int nullBitmapBytes = (numRows + 7) / 8; + for (int i = 0; i < numColumns; i++) { + if (hasNulls[i] && encodings[i] != ENCODING_ALL_NULL) { + System.arraycopy(nullBitmaps[i], 0, out, pos, nullBitmapBytes); + pos += nullBitmapBytes; + } + } + + // 2f. Column data + for (int i = 0; i < numColumns; i++) { + if (encodings[i] == ENCODING_PLAIN) { + System.arraycopy(valueBuffers[i], 0, out, pos, valueBufPos[i]); + pos += valueBufPos[i]; + } else if (encodings[i] == ENCODING_DICT) { + int w = fixedWidths[i]; + int valPos = 0; + for (int r = 0; r < numRows; r++) { + boolean isNull = (nullBitmaps[i][r / 8] & (1 << (r % 8))) != 0; + if (!isNull) { + if (longDictMaps[i] != null) { + long key = extractFixedKey(valueBuffers[i], valPos, w); + valPos += w; + out[pos++] = (byte) (int) longDictMaps[i].get(key); + } else { + int valueLen; + if (w > 0) { + valueLen = w; + } else { + int varLen = readVarint(valueBuffers[i], valPos); + valueLen = varintSize(varLen) + varLen; + } + ByteKey key = new ByteKey(valueBuffers[i], valPos, valueLen); + valPos += valueLen; + out[pos++] = (byte) (int) byteDictMaps[i].get(key); + } + } + } + } + // CONST and ALL_NULL: no column data + } + + return out; + } + + private byte[] computeOutBuffer(int numOutputCols, byte[] encodings, boolean[] hasNulls) { + int nullBitmapBytesPerCol = (numRows + 7) / 8; + int exactSize = (numOutputCols * 2 + 7) / 8 + (numOutputCols + 7) / 8; + for (int i = 0; i < numColumns; i++) { + if (encodings[i] == ENCODING_ALL_NULL) { + continue; + } + if (hasNulls[i]) { + exactSize += nullBitmapBytesPerCol; + } + if (encodings[i] == ENCODING_CONST) { + exactSize += firstValueLen[i]; + } else if (encodings[i] == ENCODING_DICT) { + if (longDictMaps[i] != null) { + int numEntries = longDictMaps[i].size(); + exactSize += + varintSize(numEntries) + numEntries * fixedWidths[i] + nonNullCounts[i]; + } else { + int numEntries = byteDictMaps[i].size(); + exactSize += 
varintSize(numEntries);
                    // Variable-width dict entries are stored back-to-back, so the
                    // metadata cost is the sum of the raw key bytes.
                    for (ByteKey key : byteDictMaps[i].keySet()) {
                        exactSize += key.data.length;
                    }
                    // One 1-byte dict code per non-null row.
                    exactSize += nonNullCounts[i];
                }
            } else if (encodings[i] == ENCODING_PLAIN) {
                exactSize += valueBufPos[i];
            }
        }
        return new byte[exactSize];
    }

    /**
     * Returns the number of distinct non-null values tracked for the column, or -1 if no
     * dictionary (neither fixed-width nor variable-width) is being tracked for it.
     */
    private int getDictSize(int colIdx) {
        if (longDictMaps[colIdx] != null) {
            return longDictMaps[colIdx].size();
        }
        if (byteDictMaps[colIdx] != null) {
            return byteDictMaps[colIdx].size();
        }
        return -1;
    }

    /** Compare dict encoded size vs plain size (pre-compression). */
    private int dictEncodedSize(int colIdx) {
        int numEntries;
        int entryBytes;
        if (longDictMaps[colIdx] != null) {
            numEntries = longDictMaps[colIdx].size();
            entryBytes = numEntries * fixedWidths[colIdx];
        } else if (byteDictMaps[colIdx] != null) {
            numEntries = byteDictMaps[colIdx].size();
            entryBytes = 0;
            for (ByteKey key : byteDictMaps[colIdx].keySet()) {
                entryBytes += key.data.length;
            }
        } else {
            // No dictionary tracked: make dict encoding always lose the comparison.
            return Integer.MAX_VALUE;
        }
        // varint(count) + raw dictionary entries + one 1-byte code per non-null row.
        return varintSize(numEntries) + entryBytes + nonNullCounts[colIdx];
    }

    /** Returns, per column, whether every value seen so far was null. */
    public boolean[] getAllNullFlags() {
        boolean[] flags = new boolean[numColumns];
        for (int i = 0; i < numColumns; i++) {
            flags[i] = nonNullCounts[i] == 0;
        }
        return flags;
    }

    /**
     * Clears all per-row-group state (null bitmaps, value buffers, const/dict tracking) so the
     * writer can accumulate the next row group. Dictionary maps are cleared, not reallocated,
     * to avoid churn.
     */
    public void reset() {
        for (int i = 0; i < numColumns; i++) {
            Arrays.fill(nullBitmaps[i], (byte) 0);
            valueBufPos[i] = 0;
            nonNullCounts[i] = 0;
            constTracking[i] = true;
            firstValueLen[i] = 0;
            dictTotalBytes[i] = 0;
            if (usesLongDict(i)) {
                if (longDictMaps[i] != null) {
                    longDictMaps[i].clear();
                } else {
                    longDictMaps[i] = new HashMap<>();
                }
            } else {
                if (byteDictMaps[i] != null) {
                    byteDictMaps[i].clear();
                } else {
                    byteDictMaps[i] = new HashMap<>();
                }
            }
        }
        numRows = 0;
    }

    // ======================== Value writing ========================

    /**
     * Appends one non-null value to the column's value buffer: fixed-width columns use the
     * big-endian fixed encoding, others a varint-length-prefixed byte encoding.
     */
    private void writeValue(int colIdx, Object value) {
        int w = fixedWidths[colIdx];
        if (w > 0) {
            ensureValueCapacity(colIdx, w);
writeFixedValue(valueBuffers[colIdx], valueBufPos[colIdx], value, w);
            valueBufPos[colIdx] += w;
        } else {
            writeVariableValue(colIdx, value);
        }
    }

    /**
     * Writes one value in big-endian fixed-width form. Width selects the encoding:
     * 1 = boolean/tinyint, 2 = smallint, 4 = int/date/time/float (raw bits),
     * 8 = bigint/double (raw bits)/compact decimal/compact timestamp (millis),
     * 12 = non-compact timestamp (8-byte millis + 4-byte nano-of-milli).
     * Unknown widths are silently ignored (callers only pass widths from getFixedWidth).
     */
    private static void writeFixedValue(byte[] buf, int pos, Object value, int width) {
        switch (width) {
            case 1:
                if (value instanceof Boolean) {
                    buf[pos] = (byte) ((Boolean) value ? 1 : 0);
                } else {
                    buf[pos] = (Byte) value;
                }
                break;
            case 2:
                {
                    short v = (Short) value;
                    buf[pos] = (byte) (v >>> 8);
                    buf[pos + 1] = (byte) v;
                    break;
                }
            case 4:
                {
                    int v;
                    if (value instanceof Float) {
                        // Raw IEEE-754 bits so the value round-trips exactly.
                        v = Float.floatToRawIntBits((Float) value);
                    } else {
                        v = (Integer) value;
                    }
                    buf[pos] = (byte) (v >>> 24);
                    buf[pos + 1] = (byte) (v >>> 16);
                    buf[pos + 2] = (byte) (v >>> 8);
                    buf[pos + 3] = (byte) v;
                    break;
                }
            case 8:
                {
                    long v;
                    if (value instanceof Long) {
                        v = (Long) value;
                    } else if (value instanceof Double) {
                        v = Double.doubleToRawLongBits((Double) value);
                    } else if (value instanceof Decimal) {
                        v = ((Decimal) value).toUnscaledLong();
                    } else if (value instanceof Timestamp) {
                        v = ((Timestamp) value).getMillisecond();
                    } else {
                        throw new IllegalArgumentException("Unsupported type: " + value.getClass());
                    }
                    writeLong(buf, pos, v);
                    break;
                }
            case 12:
                {
                    Timestamp ts = (Timestamp) value;
                    long millis = ts.getMillisecond();
                    int nanos = ts.getNanoOfMillisecond();
                    writeLong(buf, pos, millis);
                    buf[pos + 8] = (byte) (nanos >>> 24);
                    buf[pos + 9] = (byte) (nanos >>> 16);
                    buf[pos + 10] = (byte) (nanos >>> 8);
                    buf[pos + 11] = (byte) nanos;
                    break;
                }
            default:
                break;
        }
    }

    /** Writes an 8-byte big-endian long at {@code pos}. */
    private static void writeLong(byte[] buf, int pos, long v) {
        buf[pos] = (byte) (v >>> 56);
        buf[pos + 1] = (byte) (v >>> 48);
        buf[pos + 2] = (byte) (v >>> 40);
        buf[pos + 3] = (byte) (v >>> 32);
        buf[pos + 4] = (byte) (v >>> 24);
        buf[pos + 5] = (byte) (v >>> 16);
        buf[pos + 6] = (byte) (v >>> 8);
        buf[pos + 7] = (byte) v;
    }

    /**
     * Appends a variable-width value as varint(length) + raw bytes. Supports strings,
     * raw byte arrays, and non-compact decimals (unscaled big-endian bytes).
     */
    private void
writeVariableValue(int colIdx, Object value) {
        byte[] bytes;
        if (value instanceof BinaryString) {
            bytes = ((BinaryString) value).toBytes();
        } else if (value instanceof byte[]) {
            bytes = (byte[]) value;
        } else if (value instanceof Decimal) {
            bytes = ((Decimal) value).toUnscaledBytes();
        } else {
            throw new UnsupportedOperationException("Unsupported variable-width type: " + value);
        }
        // 5 = maximum encoded size of a 32-bit varint length prefix.
        ensureValueCapacity(colIdx, 5 + bytes.length);
        valueBufPos[colIdx] = writeVarint(valueBuffers[colIdx], valueBufPos[colIdx], bytes.length);
        System.arraycopy(bytes, 0, valueBuffers[colIdx], valueBufPos[colIdx], bytes.length);
        valueBufPos[colIdx] += bytes.length;
    }

    // ======================== Fixed-width key helpers ========================

    /**
     * Reads a 1/2/4/8-byte big-endian value as an unsigned long dictionary key.
     * Returns 0 for unsupported widths (callers only pass supported widths).
     */
    private static long extractFixedKey(byte[] buf, int pos, int width) {
        switch (width) {
            case 1:
                return buf[pos] & 0xFFL;
            case 2:
                return ((buf[pos] & 0xFFL) << 8) | (buf[pos + 1] & 0xFFL);
            case 4:
                return ((buf[pos] & 0xFFL) << 24)
                        | ((buf[pos + 1] & 0xFFL) << 16)
                        | ((buf[pos + 2] & 0xFFL) << 8)
                        | (buf[pos + 3] & 0xFFL);
            case 8:
                return ((buf[pos] & 0xFFL) << 56)
                        | ((buf[pos + 1] & 0xFFL) << 48)
                        | ((buf[pos + 2] & 0xFFL) << 40)
                        | ((buf[pos + 3] & 0xFFL) << 32)
                        | ((buf[pos + 4] & 0xFFL) << 24)
                        | ((buf[pos + 5] & 0xFFL) << 16)
                        | ((buf[pos + 6] & 0xFFL) << 8)
                        | (buf[pos + 7] & 0xFFL);
            default:
                return 0;
        }
    }

    /** Inverse of {@link #extractFixedKey}: writes the key big-endian, returns the new pos. */
    private static int writeFixedKey(byte[] buf, int pos, long key, int width) {
        switch (width) {
            case 1:
                buf[pos++] = (byte) key;
                break;
            case 2:
                buf[pos++] = (byte) (key >>> 8);
                buf[pos++] = (byte) key;
                break;
            case 4:
                buf[pos++] = (byte) (key >>> 24);
                buf[pos++] = (byte) (key >>> 16);
                buf[pos++] = (byte) (key >>> 8);
                buf[pos++] = (byte) key;
                break;
            case 8:
                buf[pos++] = (byte) (key >>> 56);
                buf[pos++] = (byte) (key >>> 48);
                buf[pos++] = (byte) (key >>> 40);
                buf[pos++] = (byte) (key >>> 32);
                buf[pos++] = (byte) (key >>> 24);
                buf[pos++] =
(byte) (key >>> 16);
                buf[pos++] = (byte) (key >>> 8);
                buf[pos++] = (byte) key;
                break;
            default:
                break;
        }
        return pos;
    }

    // ======================== Buffer helpers ========================

    /**
     * Grows the column's value buffer (doubling, at least to the required size) so that
     * {@code additional} more bytes fit.
     */
    private void ensureValueCapacity(int colIdx, int additional) {
        int required = valueBufPos[colIdx] + additional;
        if (required > valueBuffers[colIdx].length) {
            int newLen = Math.max(valueBuffers[colIdx].length * 2, required);
            byte[] newBuf = new byte[newLen];
            System.arraycopy(valueBuffers[colIdx], 0, newBuf, 0, valueBufPos[colIdx]);
            valueBuffers[colIdx] = newBuf;
        }
    }

    /** Byte-wise equality of two regions of the same buffer. */
    private static boolean regionEquals(byte[] buf, int off1, int off2, int len) {
        for (int i = 0; i < len; i++) {
            if (buf[off1 + i] != buf[off2 + i]) {
                return false;
            }
        }
        return true;
    }

    // ======================== Type width ========================

    /**
     * Returns the fixed byte width used to encode the type, or -1 when the type is stored
     * variable-width (varint length prefix + raw bytes), e.g. strings, binary, non-compact
     * decimals.
     */
    static int getFixedWidth(DataType type) {
        switch (type.getTypeRoot()) {
            case BOOLEAN:
            case TINYINT:
                return 1;
            case SMALLINT:
                return 2;
            case INTEGER:
            case DATE:
            case TIME_WITHOUT_TIME_ZONE:
            case FLOAT:
                return 4;
            case BIGINT:
            case DOUBLE:
                return 8;
            case DECIMAL:
                // Compact decimals fit an unscaled long; others are variable-width.
                if (Decimal.isCompact(((DecimalType) type).getPrecision())) {
                    return 8;
                }
                return -1;
            case TIMESTAMP_WITHOUT_TIME_ZONE:
                // Compact timestamps are millis-only; others carry nano-of-milli too.
                if (Timestamp.isCompact(((TimestampType) type).getPrecision())) {
                    return 8;
                }
                return 12;
            case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
                if (Timestamp.isCompact(((LocalZonedTimestampType) type).getPrecision())) {
                    return 8;
                }
                return 12;
            default:
                return -1;
        }
    }

    // ======================== Varint helpers ========================

    /** Decodes an LEB128-style varint (7 data bits per byte, high bit = continuation). */
    private static int readVarint(byte[] buf, int pos) {
        int value = 0;
        int shift = 0;
        int b;
        do {
            b = buf[pos++] & 0xFF;
            value |= (b & 0x7F) << shift;
            shift += 7;
        } while ((b & 0x80) != 0);
        return value;
    }

    /** Encoded size in bytes of {@code value} as a varint (1..5 for a non-negative int). */
    private static int varintSize(int value) {
        int size = 1;
        while ((value & ~0x7F) != 0) {
            size++;
            value >>>= 7;
        }
        return
size;
    }

    // ======================== ByteKey ========================

    /** Immutable byte array wrapper with value-based hash and equals for dict tracking. */
    static final class ByteKey {
        // Defensive copy of the key bytes; never mutated after construction.
        final byte[] data;
        // Hash is precomputed once (same 31-based scheme as Arrays.hashCode would use,
        // modulo the seed) since keys are hashed on every dictionary lookup.
        private final int hash;

        ByteKey(byte[] source, int offset, int length) {
            this.data = new byte[length];
            System.arraycopy(source, offset, this.data, 0, length);
            int h = 1;
            for (int i = 0; i < length; i++) {
                h = 31 * h + this.data[i];
            }
            this.hash = h;
        }

        @Override
        public int hashCode() {
            return hash;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof ByteKey)) {
                return false;
            }
            return Arrays.equals(data, ((ByteKey) obj).data);
        }
    }
}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileAnalyzer.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileAnalyzer.java
new file mode 100644
index 000000000000..f7f1bac2d329
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileAnalyzer.java
@@ -0,0 +1,135 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.fs.FileIO; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.SeekableInputStream; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import static org.apache.paimon.format.mosaic.MosaicUtils.readLong; +import static org.apache.paimon.format.mosaic.MosaicUtils.readVarint; +import static org.apache.paimon.utils.IOUtils.readFully; + +/** Utility to analyze the storage breakdown of a Mosaic file. */ +public class MosaicFileAnalyzer { + + public static String analyze(FileIO fileIO, Path path) throws IOException { + long fileSize = fileIO.getFileSize(path); + try (SeekableInputStream in = fileIO.newInputStream(path)) { + return analyze(in, fileSize); + } + } + + public static String analyze(SeekableInputStream in, long fileSize) throws IOException { + in.seek(fileSize - MosaicSpec.FOOTER_SIZE); + byte[] footerBytes = new byte[MosaicSpec.FOOTER_SIZE]; + readFully(in, footerBytes); + ByteBuffer footer = ByteBuffer.wrap(footerBytes).order(ByteOrder.BIG_ENDIAN); + long indexOffset = footer.getLong(); + long schemaBlockOffset = footer.getLong(); + int numBuckets = footer.getInt(); + int numRowGroups = footer.getInt(); + byte compression = footer.get(); + byte version = footer.get(); + + long schemaBlockSize = indexOffset - schemaBlockOffset; + long indexSize = fileSize - MosaicSpec.FOOTER_SIZE - indexOffset; + + // Schema uncompressed size + in.seek(schemaBlockOffset); + byte[] lenBuf = new byte[4]; + readFully(in, lenBuf); + int schemaUncompressed = ByteBuffer.wrap(lenBuf).order(ByteOrder.BIG_ENDIAN).getInt(); + long schemaCompressed = schemaBlockSize - 4; + + // Per-bucket stats from row group index (varint encoded, non-empty only) + in.seek(indexOffset); + byte[] indexBytes = new byte[(int) indexSize]; + readFully(in, indexBytes); + int[] idxPos = {0}; + + long totalCompressed = 0; + long totalUncompressed = 0; + int nonEmptyBuckets = 0; + int 
totalRows = 0; + + for (int rg = 0; rg < numRowGroups; rg++) { + totalRows += readVarint(indexBytes, idxPos); + int nonEmpty = readVarint(indexBytes, idxPos); + nonEmptyBuckets += nonEmpty; + for (int i = 0; i < nonEmpty; i++) { + readVarint(indexBytes, idxPos); // bucketId + readLong(indexBytes, idxPos); // offset + int cs = readVarint(indexBytes, idxPos); + int us = readVarint(indexBytes, idxPos); + totalCompressed += cs; + totalUncompressed += us; + } + } + + return String.format( + "=== Mosaic File Analysis ===%n" + + "File size: %,d bytes (%.1f KB)%n" + + "Version: %d%n" + + "Compression: %d%n" + + "Buckets: %d (%d non-empty)%n" + + "Row groups: %d%n" + + "Total rows: %,d%n%n", + fileSize, + fileSize / 1024.0, + version, + compression, + numBuckets, + nonEmptyBuckets, + numRowGroups, + totalRows) + + String.format( + "--- Section Sizes ---%n" + + "Bucket data: %,9d bytes (%5.1f KB, %5.1f%%)%n" + + "Schema block: %,9d bytes (%5.1f KB, %5.1f%%)%n" + + "Row group index: %,9d bytes (%5.1f KB, %5.1f%%)%n" + + "Footer: %,9d bytes (%5.1f KB, %5.1f%%)%n%n", + schemaBlockOffset, + schemaBlockOffset / 1024.0, + 100.0 * schemaBlockOffset / fileSize, + schemaBlockSize, + schemaBlockSize / 1024.0, + 100.0 * schemaBlockSize / fileSize, + indexSize, + indexSize / 1024.0, + 100.0 * indexSize / fileSize, + (long) MosaicSpec.FOOTER_SIZE, + MosaicSpec.FOOTER_SIZE / 1024.0, + 100.0 * MosaicSpec.FOOTER_SIZE / fileSize) + + String.format( + "--- Compression ---%n" + + "Schema: %,9d -> %,9d bytes (%.1fx)%n" + + "Bucket data: %,9d -> %,9d bytes (%.1fx)%n", + schemaUncompressed, + schemaCompressed, + schemaCompressed > 0 ? (double) schemaUncompressed / schemaCompressed : 0, + totalUncompressed, + totalCompressed, + totalCompressed > 0 ? 
(double) totalUncompressed / totalCompressed : 0); + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java new file mode 100644 index 000000000000..ebb969344677 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.format.FileFormat; +import org.apache.paimon.format.FileFormatFactory.FormatContext; +import org.apache.paimon.format.FormatReaderFactory; +import org.apache.paimon.format.FormatWriterFactory; +import org.apache.paimon.predicate.Predicate; +import org.apache.paimon.types.DataTypeRoot; +import org.apache.paimon.types.RowType; + +import javax.annotation.Nullable; + +import java.util.List; + +/** + * Mosaic file format: a column-bucket hybrid format optimized for wide tables (1,000-100,000+ + * columns). Columns are hashed into buckets, row-stored within each bucket, and independently + * compressed. Projection pushdown works at bucket granularity. 
+ */ +public class MosaicFileFormat extends FileFormat { + + private final int numBuckets; + private final int zstdLevel; + private final long rowGroupMaxSize; + + public MosaicFileFormat(FormatContext formatContext) { + super(MosaicFileFormatFactory.IDENTIFIER); + this.numBuckets = + formatContext + .options() + .getOptional(MosaicOptions.NUM_COLUMN_BUCKETS) + .orElse(MosaicSpec.DEFAULT_NUM_BUCKETS); + this.zstdLevel = formatContext.zstdLevel(); + this.rowGroupMaxSize = formatContext.writeBatchMemory().getBytes(); + } + + @Override + public FormatReaderFactory createReaderFactory( + RowType dataSchemaRowType, + RowType projectedRowType, + @Nullable List filters) { + return new MosaicReaderFactory(projectedRowType); + } + + @Override + public FormatWriterFactory createWriterFactory(RowType type) { + return new MosaicWriterFactory(type, numBuckets, zstdLevel, rowGroupMaxSize); + } + + @Override + public void validateDataFields(RowType rowType) { + rowType.getFields().forEach(f -> validateFieldType(f.type().getTypeRoot(), f.name())); + } + + private static void validateFieldType(DataTypeRoot root, String fieldName) { + switch (root) { + case ARRAY: + case VECTOR: + case MAP: + case MULTISET: + case ROW: + case VARIANT: + case BLOB: + throw new UnsupportedOperationException( + "Unsupported type: " + root + " for field: " + fieldName); + default: + } + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java new file mode 100644 index 000000000000..d94aff596ed7 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.paimon.format.mosaic;

import org.apache.paimon.format.FileFormat;
import org.apache.paimon.format.FileFormatFactory;

/** Factory for creating Mosaic file format instances via SPI. */
public class MosaicFileFormatFactory implements FileFormatFactory {

    // Format identifier used in table options (e.g. file.format = "mosaic").
    public static final String IDENTIFIER = "mosaic";

    @Override
    public String identifier() {
        return IDENTIFIER;
    }

    @Override
    public FileFormat create(FormatContext formatContext) {
        return new MosaicFileFormat(formatContext);
    }
}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicOptions.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicOptions.java
new file mode 100644
index 000000000000..51f0aa9557f5
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicOptions.java
@@ -0,0 +1,35 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.options.ConfigOption; +import org.apache.paimon.options.ConfigOptions; + +/** Configuration options for the Mosaic file format. */ +public class MosaicOptions { + + public static final ConfigOption NUM_COLUMN_BUCKETS = + ConfigOptions.key("mosaic.num-column-buckets") + .intType() + .defaultValue(MosaicSpec.DEFAULT_NUM_BUCKETS) + .withDescription( + "Number of column buckets in the Mosaic format. " + + "Columns are hashed into this many buckets. " + + "Default is 100."); +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReader.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReader.java new file mode 100644 index 000000000000..b53befc243ff --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReader.java @@ -0,0 +1,292 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.fs.FileIO; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.reader.FileRecordIterator; +import org.apache.paimon.reader.FileRecordReader; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.IteratorResultIterator; +import org.apache.paimon.utils.IteratorWithException; + +import com.github.luben.zstd.Zstd; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; +import java.util.Set; + +import static org.apache.paimon.format.mosaic.MosaicSpec.COMPRESSION_NONE; +import static org.apache.paimon.format.mosaic.MosaicSpec.COMPRESSION_ZSTD; +import static org.apache.paimon.format.mosaic.MosaicUtils.readLong; +import static org.apache.paimon.format.mosaic.MosaicUtils.readVarint; + +/** Reader for the Mosaic file format with row group support. 
*/ +public class MosaicReader implements FileRecordReader { + + private final Path filePath; + private final SeekableInputStream inputStream; + private final RowType projectedRowType; + + private byte compression; + private int[] sortedRequiredBuckets; + private MosaicSpec.RowGroupMeta[] rowGroupMetas; + private MosaicBucketReader[] bucketReaders; + private int currentRowGroup; + private byte[] compressedBuf; + + public MosaicReader(FileIO fileIO, Path filePath, long fileSize, RowType projectedRowType) + throws IOException { + this.filePath = filePath; + this.inputStream = fileIO.newInputStream(filePath); + this.projectedRowType = projectedRowType; + this.currentRowGroup = 0; + + readFooterAndInit(fileSize); + } + + private void readFooterAndInit(long fileSize) throws IOException { + // Read footer (last 32 bytes) + inputStream.seek(fileSize - MosaicSpec.FOOTER_SIZE); + byte[] footerBytes = new byte[MosaicSpec.FOOTER_SIZE]; + readFully(footerBytes); + + ByteBuffer footer = ByteBuffer.wrap(footerBytes).order(ByteOrder.BIG_ENDIAN); + long indexOffset = footer.getLong(); + long schemaBlockOffset = footer.getLong(); + int numBuckets = footer.getInt(); + int numRowGroups = footer.getInt(); + this.compression = footer.get(); + byte version = footer.get(); + footer.getShort(); // padding + byte[] magic = new byte[4]; + footer.get(magic); + + if (magic[0] != 'M' || magic[1] != 'O' || magic[2] != 'S' || magic[3] != 'A') { + throw new IOException("Invalid Mosaic file: bad magic bytes"); + } + + if (version != MosaicSpec.VERSION) { + throw new IOException( + "Unsupported Mosaic file version: " + + version + + ", expected: " + + MosaicSpec.VERSION); + } + + // Read schema block + inputStream.seek(schemaBlockOffset); + int schemaUncompressedSize = readInt(); + int schemaCompressedSize = (int) (indexOffset - schemaBlockOffset - 4); + byte[] schemaCompressed = new byte[schemaCompressedSize]; + readFully(schemaCompressed); + + byte[] schemaRaw; + switch (compression) { + case 
COMPRESSION_NONE: + schemaRaw = schemaCompressed; + break; + case COMPRESSION_ZSTD: + schemaRaw = new byte[schemaUncompressedSize]; + Zstd.decompress(schemaRaw, schemaCompressed); + break; + default: + throw new UnsupportedEncodingException("Unsupported compression: " + compression); + } + MosaicSchema schema = MosaicSchema.deserialize(schemaRaw); + + // Determine which buckets we need + Set requiredBuckets = schema.getRequiredBuckets(projectedRowType); + + // Read row group index (varint encoded, only non-empty buckets) + inputStream.seek(indexOffset); + int indexSize = (int) (fileSize - MosaicSpec.FOOTER_SIZE - indexOffset); + byte[] indexBytes = new byte[indexSize]; + readFully(indexBytes); + int[] idxPos = {0}; + + this.rowGroupMetas = new MosaicSpec.RowGroupMeta[numRowGroups]; + for (int rg = 0; rg < numRowGroups; rg++) { + int numRows = readVarint(indexBytes, idxPos); + int nonEmpty = readVarint(indexBytes, idxPos); + + long[] bucketOffsets = new long[numBuckets]; + int[] compressedSizes = new int[numBuckets]; + int[] uncompressedSizes = new int[numBuckets]; + + for (int i = 0; i < nonEmpty; i++) { + int bucketId = readVarint(indexBytes, idxPos); + bucketOffsets[bucketId] = readLong(indexBytes, idxPos); + compressedSizes[bucketId] = readVarint(indexBytes, idxPos); + uncompressedSizes[bucketId] = readVarint(indexBytes, idxPos); + } + + rowGroupMetas[rg] = + new MosaicSpec.RowGroupMeta( + numRows, bucketOffsets, compressedSizes, uncompressedSizes); + } + + this.bucketReaders = new MosaicBucketReader[numBuckets]; + int count = 0; + for (int b : requiredBuckets) { + DataType[] bucketTypes = schema.getBucketColumnTypes(b); + int[] projMapping = schema.getProjectionMapping(b, projectedRowType); + if (projMapping != null) { + bucketReaders[b] = new MosaicBucketReader(bucketTypes, projMapping); + count++; + } + } + this.sortedRequiredBuckets = new int[count]; + int idx = 0; + for (int b : requiredBuckets) { + if (bucketReaders[b] != null) { + 
sortedRequiredBuckets[idx++] = b; + } + } + this.compressedBuf = new byte[0]; + } + + @Nullable + @Override + public FileRecordIterator readBatch() throws IOException { + if (currentRowGroup >= rowGroupMetas.length) { + return null; + } + + MosaicSpec.RowGroupMeta meta = rowGroupMetas[currentRowGroup++]; + if (meta.numRows == 0) { + return readBatch(); + } + + final MosaicBucketReader[] readers = this.bucketReaders; + + // Sort required buckets by file offset for sequential I/O + int[] ordered = Arrays.copyOf(sortedRequiredBuckets, sortedRequiredBuckets.length); + final long[] offsets = meta.bucketOffsets; + // insertion sort — array is small (number of projected buckets) + for (int i = 1; i < ordered.length; i++) { + int key = ordered[i]; + long keyOff = offsets[key]; + int j = i - 1; + while (j >= 0 && offsets[ordered[j]] > keyOff) { + ordered[j + 1] = ordered[j]; + j--; + } + ordered[j + 1] = key; + } + + int activeCount = 0; + int[] activeBuckets = new int[ordered.length]; + + for (int b : ordered) { + if (meta.compressedSizes[b] == 0) { + continue; + } + + int compSize = meta.compressedSizes[b]; + inputStream.seek(meta.bucketOffsets[b]); + + byte[] bucketData; + switch (compression) { + case COMPRESSION_NONE: + bucketData = new byte[compSize]; + readFully(bucketData); + break; + case COMPRESSION_ZSTD: + if (compressedBuf.length < compSize) { + compressedBuf = new byte[compSize]; + } + readFully(compressedBuf, compSize); + int uncompSize = meta.uncompressedSizes[b]; + bucketData = new byte[uncompSize]; + Zstd.decompressByteArray(bucketData, 0, uncompSize, compressedBuf, 0, compSize); + break; + default: + throw new UnsupportedEncodingException( + "Unsupported compression: " + compression); + } + + readers[b].init(bucketData, meta.numRows); + activeBuckets[activeCount++] = b; + } + + final int[] active = Arrays.copyOf(activeBuckets, activeCount); + return new IteratorResultIterator( + toIterator(meta.numRows, active, readers), null, filePath, 0); + } + + private 
IteratorWithException toIterator( + int totalRows, int[] active, MosaicBucketReader[] readers) { + final int projectedFieldCount = projectedRowType.getFieldCount(); + return new IteratorWithException() { + int currentRow = 0; + final Object[] fields = new Object[projectedFieldCount]; + + @Override + public boolean hasNext() { + return currentRow < totalRows; + } + + @Override + public InternalRow next() { + Arrays.fill(fields, null); + for (int j : active) { + readers[j].readRow(fields); + } + currentRow++; + return GenericRow.of(fields); + } + }; + } + + @Override + public void close() throws IOException { + inputStream.close(); + } + + private void readFully(byte[] buf) throws IOException { + readFully(buf, buf.length); + } + + private void readFully(byte[] buf, int len) throws IOException { + int offset = 0; + while (offset < len) { + int read = inputStream.read(buf, offset, len - offset); + if (read < 0) { + throw new IOException("Unexpected EOF"); + } + offset += read; + } + } + + private int readInt() throws IOException { + byte[] buf = new byte[4]; + readFully(buf); + return ByteBuffer.wrap(buf).order(ByteOrder.BIG_ENDIAN).getInt(); + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java new file mode 100644 index 000000000000..3a5704eb1b0a --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.format.FormatReaderFactory; +import org.apache.paimon.reader.FileRecordReader; +import org.apache.paimon.types.RowType; + +import java.io.IOException; + +/** Factory for creating {@link MosaicReader} instances. */ +public class MosaicReaderFactory implements FormatReaderFactory { + + private final RowType projectedRowType; + + public MosaicReaderFactory(RowType projectedRowType) { + this.projectedRowType = projectedRowType; + } + + @Override + public FileRecordReader createReader(Context context) throws IOException { + return new MosaicReader( + context.fileIO(), context.filePath(), context.fileSize(), projectedRowType); + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSchema.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSchema.java new file mode 100644 index 000000000000..03d9f16edb57 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSchema.java @@ -0,0 +1,287 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.RowType; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.apache.paimon.format.mosaic.MosaicUtils.readVarint; +import static org.apache.paimon.format.mosaic.MosaicUtils.writeVarint; + +/** Schema block for the Mosaic file format. Stores column metadata and bucket assignments. 
 */
public class MosaicSchema {

    // Total bucket count the columns were hashed into; fixed at write time.
    private final int numBuckets;

    // All columns in schema order; each knows its bucket and local position in it.
    private final List<ColumnMeta> columns;

    // For each bucket: the global column indices it contains, in bucket-local order.
    private final int[][] bucketToGlobalIndices;

    private MosaicSchema(int numBuckets, List<ColumnMeta> columns, int[][] bucketToGlobalIndices) {
        this.numBuckets = numBuckets;
        this.columns = columns;
        this.bucketToGlobalIndices = bucketToGlobalIndices;
    }

    /**
     * Builds a schema by hashing every column of {@code rowType} into {@code numBuckets} buckets
     * (see {@link MosaicSpec#groupColumnsByBucket}).
     */
    public static MosaicSchema create(RowType rowType, int numBuckets) {
        int[][] bucketMapping = MosaicSpec.groupColumnsByBucket(rowType, numBuckets);
        List<DataField> fields = rowType.getFields();
        List<ColumnMeta> columns = new ArrayList<>(fields.size());

        // Invert the bucket mapping: for each global column index, record its bucket
        // and its local position inside that bucket.
        int[] columnToBucket = new int[fields.size()];
        int[] columnToIndexInBucket = new int[fields.size()];
        for (int b = 0; b < numBuckets; b++) {
            for (int localIdx = 0; localIdx < bucketMapping[b].length; localIdx++) {
                int globalIdx = bucketMapping[b][localIdx];
                columnToBucket[globalIdx] = b;
                columnToIndexInBucket[globalIdx] = localIdx;
            }
        }

        for (int i = 0; i < fields.size(); i++) {
            DataField field = fields.get(i);
            columns.add(
                    new ColumnMeta(
                            field.id(),
                            field.name(),
                            field.type(),
                            columnToBucket[i],
                            columnToIndexInBucket[i]));
        }

        return new MosaicSchema(numBuckets, columns, bucketMapping);
    }

    public int numBuckets() {
        return numBuckets;
    }

    public int[][] bucketToGlobalIndices() {
        return bucketToGlobalIndices;
    }

    /** Returns the data types of the columns in {@code bucketId}, in bucket-local order. */
    public DataType[] getBucketColumnTypes(int bucketId) {
        int[] globalIndices = bucketToGlobalIndices[bucketId];
        DataType[] types = new DataType[globalIndices.length];
        for (int i = 0; i < globalIndices.length; i++) {
            types[i] = columns.get(globalIndices[i]).type;
        }
        return types;
    }

    /** Returns the set of bucket IDs that contain at least one projected column. */
    public Set<Integer> getRequiredBuckets(RowType projectedRowType) {
        Set<String> projectedNames = new HashSet<>(projectedRowType.getFieldNames());
        Set<Integer> requiredBuckets = new HashSet<>();
        for (ColumnMeta col : columns) {
            if (projectedNames.contains(col.name)) {
                requiredBuckets.add(col.bucketId);
            }
        }
        return requiredBuckets;
    }

    /**
     * For a given bucket, returns the mapping from local column indices within the bucket to output
     * positions in the projected row. The array index is the local column index, and the value is
     * the output position (-1 means skip). Returns null if no columns in this bucket are projected.
     */
    public int[] getProjectionMapping(int bucketId, RowType projectedRowType) {
        // Projection is matched by column NAME, mirroring getRequiredBuckets.
        Map<String, Integer> projectedNameToPos = new HashMap<>();
        List<String> projectedNames = projectedRowType.getFieldNames();
        for (int i = 0; i < projectedNames.size(); i++) {
            projectedNameToPos.put(projectedNames.get(i), i);
        }

        int[] globalIndices = bucketToGlobalIndices[bucketId];
        int[] localToOutput = new int[globalIndices.length];
        Arrays.fill(localToOutput, -1);
        boolean hasProjection = false;
        for (int localIdx = 0; localIdx < globalIndices.length; localIdx++) {
            ColumnMeta col = columns.get(globalIndices[localIdx]);
            Integer outputPos = projectedNameToPos.get(col.name);
            if (outputPos != null) {
                localToOutput[localIdx] = outputPos;
                hasProjection = true;
            }
        }
        return hasProjection ? localToOutput : null;
    }

    /**
     * Serializes the schema: varint column count and bucket count, then per column the field id,
     * bucket id, index-in-bucket, the front-coded name (shared-prefix length + suffix) and the
     * binary-encoded type. Field order is significant — {@link #deserialize} reads it back in
     * exactly this order.
     */
    public byte[] serialize() throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(baos);

        writeVarint(out, columns.size());
        writeVarint(out, numBuckets);

        // Front coding: each column name stored as (sharedPrefixLen, suffix)
        byte[] prevNameBytes = new byte[0];
        for (ColumnMeta col : columns) {
            writeVarint(out, col.fieldId);
            writeVarint(out, col.bucketId);
            writeVarint(out, col.indexInBucket);

            byte[] nameBytes = col.name.getBytes(StandardCharsets.UTF_8);
            int shared = commonPrefixLength(prevNameBytes, nameBytes);
            writeVarint(out, shared);
            writeVarint(out, nameBytes.length - shared);
            out.write(nameBytes, shared, nameBytes.length - shared);
            prevNameBytes = nameBytes;

            MosaicTypes.writeType(out, col.type);
        }

        out.flush();
        return baos.toByteArray();
    }

    /** Inverse of {@link #serialize}; also rebuilds the bucket -&gt; global-index mapping. */
    public static MosaicSchema deserialize(byte[] data) throws IOException {
        DataInputStream in = new DataInputStream(new ByteArrayInputStream(data));

        int numColumns = readVarint(in);
        int numBuckets = readVarint(in);

        List<ColumnMeta> columns = new ArrayList<>(numColumns);
        List<List<Integer>> bucketLists = new ArrayList<>(numBuckets);
        for (int i = 0; i < numBuckets; i++) {
            bucketLists.add(new ArrayList<>());
        }

        byte[] prevNameBytes = new byte[0];
        for (int i = 0; i < numColumns; i++) {
            int fieldId = readVarint(in);
            int bucketId = readVarint(in);
            int indexInBucket = readVarint(in);

            // Undo front coding: shared prefix comes from the previous name.
            int shared = readVarint(in);
            int suffixLen = readVarint(in);
            byte[] nameBytes = new byte[shared + suffixLen];
            System.arraycopy(prevNameBytes, 0, nameBytes, 0, shared);
            in.readFully(nameBytes, shared, suffixLen);
            prevNameBytes = nameBytes;

            String name = new String(nameBytes, StandardCharsets.UTF_8);
            DataType type = MosaicTypes.readType(in);
            columns.add(new ColumnMeta(fieldId, name, type, bucketId, indexInBucket));
            bucketLists.get(bucketId).add(i);
        }

        int[][] bucketToGlobal = new int[numBuckets][];
        for (int b = 0; b < numBuckets; b++) {
            List<Integer> list = bucketLists.get(b);
            bucketToGlobal[b] = new int[list.size()];
            for (int j = 0; j < list.size(); j++) {
                bucketToGlobal[b][j] = list.get(j);
            }
        }

        return new MosaicSchema(numBuckets, columns, bucketToGlobal);
    }

    /**
     * Returns a schema with the columns flagged all-null removed.
     *
     * <p>{@code allNullByBucket[b][local]} marks bucket {@code b}'s local column {@code local} as
     * all-null; a null inner array means "nothing to prune in this bucket". Surviving columns keep
     * their bucket but are re-numbered within it. Returns {@code this} unchanged when nothing is
     * pruned.
     */
    public MosaicSchema pruneAllNullColumns(boolean[][] allNullByBucket) {
        Set<Integer> prunedGlobalIndices = new HashSet<>();
        for (int b = 0; b < numBuckets; b++) {
            if (allNullByBucket[b] == null) {
                continue;
            }
            int[] globalIndices = bucketToGlobalIndices[b];
            for (int local = 0; local < globalIndices.length; local++) {
                if (allNullByBucket[b][local]) {
                    prunedGlobalIndices.add(globalIndices[local]);
                }
            }
        }

        if (prunedGlobalIndices.isEmpty()) {
            return this;
        }

        // Keep surviving columns in original order; map old global index -> new index.
        List<ColumnMeta> newColumns = new ArrayList<>();
        Map<Integer, Integer> oldToNew = new HashMap<>();
        for (int i = 0; i < columns.size(); i++) {
            if (!prunedGlobalIndices.contains(i)) {
                oldToNew.put(i, newColumns.size());
                newColumns.add(columns.get(i));
            }
        }

        int[][] newBucketToGlobal = new int[numBuckets][];
        for (int b = 0; b < numBuckets; b++) {
            List<Integer> kept = new ArrayList<>();
            for (int globalIdx : bucketToGlobalIndices[b]) {
                Integer newIdx = oldToNew.get(globalIdx);
                if (newIdx != null) {
                    kept.add(newIdx);
                }
            }
            newBucketToGlobal[b] = new int[kept.size()];
            for (int j = 0; j < kept.size(); j++) {
                newBucketToGlobal[b][j] = kept.get(j);
                // Re-number indexInBucket to the column's new local slot j.
                ColumnMeta old = newColumns.get(kept.get(j));
                newColumns.set(
                        kept.get(j),
                        new ColumnMeta(old.fieldId, old.name, old.type, old.bucketId, j));
            }
        }

        return new MosaicSchema(numBuckets, newColumns, newBucketToGlobal);
    }

    /** Length of the longest common byte prefix of {@code a} and {@code b}. */
    private static int commonPrefixLength(byte[] a, byte[] b) {
        int len = Math.min(a.length, b.length);
        for (int i = 0; i < len; i++) {
            if (a[i] != b[i]) {
                return i;
            }
        }
        return len;
    }

    /** Metadata for a single column. */
    public static class ColumnMeta {
        public final int fieldId;
        public final String name;
        public final DataType type;
        public final int bucketId;
        public final int indexInBucket;

        public ColumnMeta(
                int fieldId, String name, DataType type, int bucketId, int indexInBucket) {
            this.fieldId = fieldId;
            this.name = name;
            this.type = type;
            this.bucketId = bucketId;
            this.indexInBucket = indexInBucket;
        }
    }
}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSpec.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSpec.java
new file mode 100644
index 000000000000..36bb50d76e23
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSpec.java
@@ -0,0 +1,100 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.paimon.format.mosaic;

import org.apache.paimon.types.DataField;
import org.apache.paimon.types.RowType;

import java.util.ArrayList;
import java.util.List;

/** Constants and utilities for the Mosaic file format.
 */
public class MosaicSpec {

    // Footer magic "MOSA" (0x4D4F5341), last 4 bytes of the file.
    public static final byte[] MAGIC = new byte[] {'M', 'O', 'S', 'A'};
    public static final byte VERSION = 1;

    // Fixed footer size in bytes; see the format spec table.
    public static final int FOOTER_SIZE = 32;

    public static final byte COMPRESSION_NONE = 0;
    public static final byte COMPRESSION_ZSTD = 1;

    public static final int DEFAULT_NUM_BUCKETS = 100;

    // Column encoding types (2 bits each in encoding flags)
    public static final byte ENCODING_PLAIN = 0;
    public static final byte ENCODING_CONST = 1;
    public static final byte ENCODING_DICT = 2;
    public static final byte ENCODING_ALL_NULL = 3;

    /**
     * Maps a column name to a bucket. Uses {@link String#hashCode()} — stable across JVMs, so
     * bucket assignment is reproducible between writer and reader. floorMod keeps the result
     * non-negative for negative hash codes.
     */
    public static int assignBucket(String fieldName, int numBuckets) {
        return Math.floorMod(fieldName.hashCode(), numBuckets);
    }

    /**
     * Groups columns by bucket. Returns an array where each element is the list of global column
     * indices assigned to that bucket.
     */
    public static int[][] groupColumnsByBucket(RowType rowType, int numBuckets) {
        List<DataField> fields = rowType.getFields();
        List<List<Integer>> buckets = new ArrayList<>(numBuckets);
        for (int i = 0; i < numBuckets; i++) {
            buckets.add(new ArrayList<>());
        }
        for (int i = 0; i < fields.size(); i++) {
            int bucketId = assignBucket(fields.get(i).name(), numBuckets);
            buckets.get(bucketId).add(i);
        }
        int[][] result = new int[numBuckets][];
        for (int i = 0; i < numBuckets; i++) {
            List<Integer> list = buckets.get(i);
            result[i] = new int[list.size()];
            for (int j = 0; j < list.size(); j++) {
                result[i][j] = list.get(j);
            }
        }
        return result;
    }

    /**
     * Maps a compression name to its footer byte. Null/empty/"none" mean uncompressed.
     *
     * @throws IllegalArgumentException for any other compression name
     */
    public static byte compressionToByte(String compression) {
        if (compression == null || compression.isEmpty() || "none".equalsIgnoreCase(compression)) {
            return COMPRESSION_NONE;
        }
        if ("zstd".equalsIgnoreCase(compression)) {
            return COMPRESSION_ZSTD;
        }
        throw new IllegalArgumentException("Unsupported Mosaic compression: " + compression);
    }

    /** Metadata for a single row group. */
    public static class RowGroupMeta {
        public final int numRows;
        // All three arrays are indexed by bucket id; empty buckets have size 0.
        public final long[] bucketOffsets;
        public final int[] compressedSizes;
        public final int[] uncompressedSizes;

        public RowGroupMeta(
                int numRows, long[] bucketOffsets, int[] compressedSizes, int[] uncompressedSizes) {
            this.numRows = numRows;
            this.bucketOffsets = bucketOffsets;
            this.compressedSizes = compressedSizes;
            this.uncompressedSizes = uncompressedSizes;
        }
    }
}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicTypes.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicTypes.java
new file mode 100644
index 000000000000..bc0892c7d807
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicTypes.java
@@ -0,0 +1,208 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.types.BigIntType; +import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BooleanType; +import org.apache.paimon.types.CharType; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.DataTypeRoot; +import org.apache.paimon.types.DateType; +import org.apache.paimon.types.DecimalType; +import org.apache.paimon.types.DoubleType; +import org.apache.paimon.types.FloatType; +import org.apache.paimon.types.IntType; +import org.apache.paimon.types.LocalZonedTimestampType; +import org.apache.paimon.types.SmallIntType; +import org.apache.paimon.types.TimeType; +import org.apache.paimon.types.TimestampType; +import org.apache.paimon.types.TinyIntType; +import org.apache.paimon.types.VarBinaryType; +import org.apache.paimon.types.VarCharType; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; + +import static org.apache.paimon.format.mosaic.MosaicUtils.readVarint; +import static org.apache.paimon.format.mosaic.MosaicUtils.writeVarint; + +/** Recursive binary serialization/deserialization for {@link DataType}. 
 */
public class MosaicTypes {

    // On-disk type tags. These values are part of the file format — never renumber.
    private static final byte TYPE_BOOLEAN = 0;
    private static final byte TYPE_TINYINT = 1;
    private static final byte TYPE_SMALLINT = 2;
    private static final byte TYPE_INTEGER = 3;
    private static final byte TYPE_BIGINT = 4;
    private static final byte TYPE_FLOAT = 5;
    private static final byte TYPE_DOUBLE = 6;
    private static final byte TYPE_DATE = 7;
    private static final byte TYPE_CHAR = 8;
    private static final byte TYPE_VARCHAR = 9;
    private static final byte TYPE_STRING = 10;
    private static final byte TYPE_BINARY = 11;
    private static final byte TYPE_VARBINARY = 12;
    private static final byte TYPE_BYTES = 13;
    private static final byte TYPE_DECIMAL = 14;
    private static final byte TYPE_TIME = 15;
    private static final byte TYPE_TIMESTAMP = 16;
    private static final byte TYPE_TIMESTAMP_LTZ = 17;

    /** Serializer for one type root: writes tag byte, nullability, and any type parameters. */
    @FunctionalInterface
    interface TypeWriter {
        void write(DataOutputStream out, DataType type) throws IOException;
    }

    /** Deserializer for one tag: rebuilds the type; nullability is already consumed. */
    @FunctionalInterface
    interface TypeReader {
        DataType read(DataInputStream in, boolean nullable) throws IOException;
    }

    // Dispatch tables: WRITERS indexed by DataTypeRoot ordinal, READERS by on-disk tag.
    private static final TypeWriter[] WRITERS = new TypeWriter[DataTypeRoot.values().length];
    private static final TypeReader[] READERS = new TypeReader[18];

    static {
        // simple types (tag + nullable flag, no parameters)
        reg(DataTypeRoot.BOOLEAN, TYPE_BOOLEAN, (in, n) -> new BooleanType(n));
        reg(DataTypeRoot.TINYINT, TYPE_TINYINT, (in, n) -> new TinyIntType(n));
        reg(DataTypeRoot.SMALLINT, TYPE_SMALLINT, (in, n) -> new SmallIntType(n));
        reg(DataTypeRoot.INTEGER, TYPE_INTEGER, (in, n) -> new IntType(n));
        reg(DataTypeRoot.BIGINT, TYPE_BIGINT, (in, n) -> new BigIntType(n));
        reg(DataTypeRoot.FLOAT, TYPE_FLOAT, (in, n) -> new FloatType(n));
        reg(DataTypeRoot.DOUBLE, TYPE_DOUBLE, (in, n) -> new DoubleType(n));
        reg(DataTypeRoot.DATE, TYPE_DATE, (in, n) -> new DateType(n));

        // CHAR: fixed length stored as a varint parameter
        WRITERS[DataTypeRoot.CHAR.ordinal()] =
                (out, type) -> {
                    out.writeByte(TYPE_CHAR);
                    out.writeBoolean(type.isNullable());
                    writeVarint(out, ((CharType) type).getLength());
                };
        READERS[TYPE_CHAR] = (in, n) -> new CharType(n, readVarint(in));

        // VARCHAR / STRING: unbounded VARCHAR gets its own tag so no length is stored
        WRITERS[DataTypeRoot.VARCHAR.ordinal()] =
                (out, type) -> {
                    int len = ((VarCharType) type).getLength();
                    if (len == VarCharType.MAX_LENGTH) {
                        out.writeByte(TYPE_STRING);
                        out.writeBoolean(type.isNullable());
                    } else {
                        out.writeByte(TYPE_VARCHAR);
                        out.writeBoolean(type.isNullable());
                        writeVarint(out, len);
                    }
                };
        READERS[TYPE_VARCHAR] = (in, n) -> new VarCharType(n, readVarint(in));
        READERS[TYPE_STRING] = (in, n) -> new VarCharType(n, VarCharType.MAX_LENGTH);

        // BINARY
        WRITERS[DataTypeRoot.BINARY.ordinal()] =
                (out, type) -> {
                    out.writeByte(TYPE_BINARY);
                    out.writeBoolean(type.isNullable());
                    writeVarint(out, ((BinaryType) type).getLength());
                };
        READERS[TYPE_BINARY] = (in, n) -> new BinaryType(n, readVarint(in));

        // VARBINARY / BYTES: same length-elision trick as VARCHAR / STRING
        WRITERS[DataTypeRoot.VARBINARY.ordinal()] =
                (out, type) -> {
                    int len = ((VarBinaryType) type).getLength();
                    if (len == VarBinaryType.MAX_LENGTH) {
                        out.writeByte(TYPE_BYTES);
                        out.writeBoolean(type.isNullable());
                    } else {
                        out.writeByte(TYPE_VARBINARY);
                        out.writeBoolean(type.isNullable());
                        writeVarint(out, len);
                    }
                };
        READERS[TYPE_VARBINARY] = (in, n) -> new VarBinaryType(n, readVarint(in));
        READERS[TYPE_BYTES] = (in, n) -> new VarBinaryType(n, VarBinaryType.MAX_LENGTH);

        // DECIMAL: precision then scale, both varint
        WRITERS[DataTypeRoot.DECIMAL.ordinal()] =
                (out, type) -> {
                    out.writeByte(TYPE_DECIMAL);
                    out.writeBoolean(type.isNullable());
                    DecimalType dt = (DecimalType) type;
                    writeVarint(out, dt.getPrecision());
                    writeVarint(out, dt.getScale());
                };
        // Java evaluates arguments left-to-right, so precision is read before scale.
        READERS[TYPE_DECIMAL] = (in, n) -> new DecimalType(n, readVarint(in), readVarint(in));

        // TIME
        WRITERS[DataTypeRoot.TIME_WITHOUT_TIME_ZONE.ordinal()] =
                (out, type) -> {
                    out.writeByte(TYPE_TIME);
                    out.writeBoolean(type.isNullable());
                    writeVarint(out, ((TimeType) type).getPrecision());
                };
        READERS[TYPE_TIME] = (in, n) -> new TimeType(n, readVarint(in));

        // TIMESTAMP
        WRITERS[DataTypeRoot.TIMESTAMP_WITHOUT_TIME_ZONE.ordinal()] =
                (out, type) -> {
                    out.writeByte(TYPE_TIMESTAMP);
                    out.writeBoolean(type.isNullable());
                    writeVarint(out, ((TimestampType) type).getPrecision());
                };
        READERS[TYPE_TIMESTAMP] = (in, n) -> new TimestampType(n, readVarint(in));

        // TIMESTAMP WITH LOCAL TIME ZONE
        WRITERS[DataTypeRoot.TIMESTAMP_WITH_LOCAL_TIME_ZONE.ordinal()] =
                (out, type) -> {
                    out.writeByte(TYPE_TIMESTAMP_LTZ);
                    out.writeBoolean(type.isNullable());
                    writeVarint(out, ((LocalZonedTimestampType) type).getPrecision());
                };
        READERS[TYPE_TIMESTAMP_LTZ] = (in, n) -> new LocalZonedTimestampType(n, readVarint(in));
    }

    /** Registers a parameterless type: the writer emits only the tag and the nullable flag. */
    private static void reg(DataTypeRoot root, byte typeId, TypeReader reader) {
        WRITERS[root.ordinal()] =
                (out, type) -> {
                    out.writeByte(typeId);
                    out.writeBoolean(type.isNullable());
                };
        READERS[typeId] = reader;
    }

    /**
     * Writes one type to {@code out}.
     *
     * @throws IOException if the type root has no registered writer (unsupported in Mosaic)
     */
    public static void writeType(DataOutputStream out, DataType type) throws IOException {
        TypeWriter writer = WRITERS[type.getTypeRoot().ordinal()];
        if (writer == null) {
            throw new IOException("Unsupported Mosaic type: " + type.getTypeRoot());
        }
        writer.write(out, type);
    }

    /**
     * Reads one type from {@code in}: tag byte, nullable flag, then tag-specific parameters.
     *
     * @throws IOException if the tag is unknown to this reader version
     */
    public static DataType readType(DataInputStream in) throws IOException {
        int typeId = in.readByte() & 0xFF;
        boolean nullable = in.readBoolean();
        TypeReader reader = typeId < READERS.length ? READERS[typeId] : null;
        if (reader == null) {
            throw new IOException("Unsupported Mosaic type ID: " + typeId);
        }
        return reader.read(in, nullable);
    }
}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicUtils.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicUtils.java
new file mode 100644
index 000000000000..a7f4cdcbe48b
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicUtils.java
@@ -0,0 +1,92 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.paimon.format.mosaic;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

/** Shared varint and long encoding/decoding utilities for the Mosaic file format.
*/ +public class MosaicUtils { + + // ==================== byte[] based ==================== + + public static int readVarint(byte[] buf, int[] pos) { + int value = 0; + int shift = 0; + int b; + do { + b = buf[pos[0]++] & 0xFF; + value |= (b & 0x7F) << shift; + shift += 7; + } while ((b & 0x80) != 0); + return value; + } + + public static long readLong(byte[] buf, int[] pos) { + long v = 0; + for (int i = 0; i < 8; i++) { + v = (v << 8) | (buf[pos[0]++] & 0xFF); + } + return v; + } + + public static int writeVarint(byte[] buf, int pos, int value) { + while ((value & ~0x7F) != 0) { + buf[pos++] = (byte) ((value & 0x7F) | 0x80); + value >>>= 7; + } + buf[pos++] = (byte) value; + return pos; + } + + public static int writeLong(byte[] buf, int pos, long value) { + buf[pos++] = (byte) (value >>> 56); + buf[pos++] = (byte) (value >>> 48); + buf[pos++] = (byte) (value >>> 40); + buf[pos++] = (byte) (value >>> 32); + buf[pos++] = (byte) (value >>> 24); + buf[pos++] = (byte) (value >>> 16); + buf[pos++] = (byte) (value >>> 8); + buf[pos++] = (byte) value; + return pos; + } + + // ==================== stream based ==================== + + public static void writeVarint(DataOutputStream out, int value) throws IOException { + while ((value & ~0x7F) != 0) { + out.writeByte((value & 0x7F) | 0x80); + value >>>= 7; + } + out.writeByte(value); + } + + public static int readVarint(DataInputStream in) throws IOException { + int value = 0; + int shift = 0; + int b; + do { + b = in.readByte() & 0xFF; + value |= (b & 0x7F) << shift; + shift += 7; + } while ((b & 0x80) != 0); + return value; + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriter.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriter.java new file mode 100644 index 000000000000..78083c283d8d --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriter.java @@ -0,0 +1,288 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.format.mosaic.MosaicSpec.RowGroupMeta; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.types.RowType; + +import com.github.luben.zstd.Zstd; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.paimon.format.mosaic.MosaicSpec.COMPRESSION_NONE; +import static org.apache.paimon.format.mosaic.MosaicSpec.COMPRESSION_ZSTD; +import static org.apache.paimon.format.mosaic.MosaicUtils.writeLong; +import static org.apache.paimon.format.mosaic.MosaicUtils.writeVarint; + +/** Writer for the Mosaic file format with row group support. 
*/ +public class MosaicWriter implements FormatWriter { + + private final PositionOutputStream out; + private final MosaicSchema schema; + private MosaicSchema prunedSchema; + private final MosaicBucketWriter[] bucketWriters; + private final int numBuckets; + private final int zstdLevel; + private final byte compressionByte; + private final long rowGroupMaxSize; + + private final List rowGroupMetas; + private byte[] compressBuffer; + private int currentRowGroupRows; + private long currentBufferedSize; + private double compressionRatio; + private boolean closed; + + public MosaicWriter( + PositionOutputStream out, + RowType rowType, + int numBuckets, + int zstdLevel, + String compression, + long rowGroupMaxSize) { + this.out = out; + this.numBuckets = Math.min(numBuckets, rowType.getFieldCount()); + this.zstdLevel = zstdLevel; + this.compressionByte = MosaicSpec.compressionToByte(compression); + this.rowGroupMaxSize = rowGroupMaxSize; + this.schema = MosaicSchema.create(rowType, this.numBuckets); + this.bucketWriters = new MosaicBucketWriter[this.numBuckets]; + + int[][] bucketMapping = schema.bucketToGlobalIndices(); + for (int b = 0; b < this.numBuckets; b++) { + if (bucketMapping[b].length > 0) { + bucketWriters[b] = new MosaicBucketWriter(rowType, bucketMapping[b]); + } + } + + this.rowGroupMetas = new ArrayList<>(); + this.compressBuffer = new byte[0]; + this.currentRowGroupRows = 0; + this.currentBufferedSize = 0; + this.compressionRatio = this.compressionByte == COMPRESSION_NONE ? 
1.0 : 0.3; + this.closed = false; + } + + @Override + public void addElement(InternalRow element) throws IOException { + long size = 0; + for (int i = 0; i < numBuckets; i++) { + if (bucketWriters[i] != null) { + size += bucketWriters[i].writeRow(element); + } + } + currentRowGroupRows++; + currentBufferedSize += size; + + if (currentBufferedSize >= rowGroupMaxSize) { + flushRowGroup(); + } + } + + @Override + public boolean reachTargetSize(boolean suggestedCheck, long targetSize) throws IOException { + long estimatedSize = out.getPos() + (long) (currentBufferedSize * compressionRatio); + return estimatedSize >= targetSize; + } + + private void flushRowGroup() throws IOException { + if (currentRowGroupRows == 0) { + return; + } + + long[] bucketOffsets = new long[numBuckets]; + int[] compressedSizes = new int[numBuckets]; + int[] uncompressedSizes = new int[numBuckets]; + + for (int b = 0; b < numBuckets; b++) { + MosaicBucketWriter bucketWriter = bucketWriters[b]; + if (bucketWriter == null || bucketWriter.isEmpty()) { + continue; + } + byte[] raw = bucketWriter.finish(); + compressedSizes[b] = writeCompressed(raw); + uncompressedSizes[b] = raw.length; + bucketOffsets[b] = out.getPos() - compressedSizes[b]; + bucketWriter.reset(); + } + + rowGroupMetas.add( + new RowGroupMeta( + currentRowGroupRows, bucketOffsets, compressedSizes, uncompressedSizes)); + + long totalCompressed = 0; + long totalUncompressed = 0; + for (int b = 0; b < numBuckets; b++) { + totalCompressed += compressedSizes[b]; + totalUncompressed += uncompressedSizes[b]; + } + if (totalUncompressed > 0) { + compressionRatio = (double) totalCompressed / totalUncompressed; + } + + currentRowGroupRows = 0; + currentBufferedSize = 0; + } + + private void flushRowGroupPruned() throws IOException { + if (currentRowGroupRows == 0) { + return; + } + + boolean[][] allNullByBucket = new boolean[numBuckets][]; + long[] bucketOffsets = new long[numBuckets]; + int[] compressedSizes = new int[numBuckets]; + int[] 
uncompressedSizes = new int[numBuckets]; + + for (int b = 0; b < numBuckets; b++) { + MosaicBucketWriter bucketWriter = bucketWriters[b]; + if (bucketWriter == null || bucketWriter.isEmpty()) { + continue; + } + allNullByBucket[b] = bucketWriter.getAllNullFlags(); + byte[] raw = bucketWriter.finish(true); + compressedSizes[b] = writeCompressed(raw); + uncompressedSizes[b] = raw.length; + bucketOffsets[b] = out.getPos() - compressedSizes[b]; + bucketWriter.reset(); + } + + rowGroupMetas.add( + new RowGroupMeta( + currentRowGroupRows, bucketOffsets, compressedSizes, uncompressedSizes)); + + prunedSchema = schema.pruneAllNullColumns(allNullByBucket); + + currentRowGroupRows = 0; + currentBufferedSize = 0; + } + + private int writeCompressed(byte[] raw) throws IOException { + switch (compressionByte) { + case COMPRESSION_NONE: + out.write(raw); + return raw.length; + case COMPRESSION_ZSTD: + int bound = (int) Zstd.compressBound(raw.length); + if (compressBuffer.length < bound) { + compressBuffer = new byte[bound]; + } + int compLen = (int) Zstd.compress(compressBuffer, raw, zstdLevel); + out.write(compressBuffer, 0, compLen); + return compLen; + default: + throw new UnsupportedEncodingException( + "Unsupported compression: " + compressionByte); + } + } + + @Override + public void close() throws IOException { + if (closed) { + return; + } + closed = true; + + // Flush remaining rows as the last row group + boolean singleRowGroup = rowGroupMetas.isEmpty() && currentRowGroupRows > 0; + if (singleRowGroup) { + flushRowGroupPruned(); + } else { + flushRowGroup(); + } + + // Write schema block (use pruned schema if available) + MosaicSchema schemaToWrite = prunedSchema != null ? 
prunedSchema : schema; + byte[] schemaRaw = schemaToWrite.serialize(); + long schemaBlockOffset = out.getPos(); + switch (compressionByte) { + case COMPRESSION_NONE: + { + ByteBuffer lenBuf = ByteBuffer.allocate(4).order(ByteOrder.BIG_ENDIAN); + lenBuf.putInt(schemaRaw.length); + out.write(lenBuf.array()); + out.write(schemaRaw); + break; + } + case COMPRESSION_ZSTD: + { + int schemaBound = (int) Zstd.compressBound(schemaRaw.length); + if (compressBuffer.length < schemaBound) { + compressBuffer = new byte[schemaBound]; + } + long compLen = Zstd.compress(compressBuffer, schemaRaw, zstdLevel); + ByteBuffer lenBuf = ByteBuffer.allocate(4).order(ByteOrder.BIG_ENDIAN); + lenBuf.putInt(schemaRaw.length); + out.write(lenBuf.array()); + out.write(compressBuffer, 0, (int) compLen); + break; + } + default: + throw new UnsupportedEncodingException( + "Unsupported compression: " + compressionByte); + } + + // Write row group index (varint encoded, only non-empty buckets) + long indexOffset = out.getPos(); + int numRowGroups = rowGroupMetas.size(); + byte[] indexBuf = new byte[numRowGroups * (5 + numBuckets * 25)]; + int idxPos = 0; + for (RowGroupMeta meta : rowGroupMetas) { + idxPos = writeVarint(indexBuf, idxPos, meta.numRows); + int nonEmpty = 0; + for (int b = 0; b < numBuckets; b++) { + if (meta.compressedSizes[b] > 0) { + nonEmpty++; + } + } + idxPos = writeVarint(indexBuf, idxPos, nonEmpty); + for (int b = 0; b < numBuckets; b++) { + if (meta.compressedSizes[b] > 0) { + idxPos = writeVarint(indexBuf, idxPos, b); + idxPos = writeLong(indexBuf, idxPos, meta.bucketOffsets[b]); + idxPos = writeVarint(indexBuf, idxPos, meta.compressedSizes[b]); + idxPos = writeVarint(indexBuf, idxPos, meta.uncompressedSizes[b]); + } + } + } + out.write(indexBuf, 0, idxPos); + + // Write footer + ByteBuffer footer = ByteBuffer.allocate(MosaicSpec.FOOTER_SIZE).order(ByteOrder.BIG_ENDIAN); + footer.putLong(indexOffset); + footer.putLong(schemaBlockOffset); + footer.putInt(numBuckets); + 
footer.putInt(numRowGroups); + footer.put(compressionByte); + footer.put(MosaicSpec.VERSION); + footer.putShort((short) 0); + footer.put(MosaicSpec.MAGIC); + out.write(footer.array()); + + out.flush(); + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java new file mode 100644 index 000000000000..5393ebcc0396 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.format.FormatWriterFactory; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.types.RowType; + +import java.io.IOException; + +/** Factory for creating {@link MosaicWriter} instances. 
*/ +public class MosaicWriterFactory implements FormatWriterFactory { + + private final RowType rowType; + private final int numBuckets; + private final int zstdLevel; + private final long rowGroupMaxSize; + + public MosaicWriterFactory( + RowType rowType, int numBuckets, int zstdLevel, long rowGroupMaxSize) { + this.rowType = rowType; + this.numBuckets = numBuckets; + this.zstdLevel = zstdLevel; + this.rowGroupMaxSize = rowGroupMaxSize; + } + + @Override + public FormatWriter create(PositionOutputStream out, String compression) throws IOException { + return new MosaicWriter(out, rowType, numBuckets, zstdLevel, compression, rowGroupMaxSize); + } +} diff --git a/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory b/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory index 80cfe4b946b8..777fcb65f545 100644 --- a/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory +++ b/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory @@ -20,3 +20,4 @@ org.apache.paimon.format.csv.CsvFileFormatFactory org.apache.paimon.format.text.TextFileFormatFactory org.apache.paimon.format.json.JsonFileFormatFactory org.apache.paimon.format.blob.BlobFileFormatFactory +org.apache.paimon.format.mosaic.MosaicFileFormatFactory diff --git a/paimon-format/src/test/java/org/apache/paimon/format/WideTableFormatBenchmark.java b/paimon-format/src/test/java/org/apache/paimon/format/WideTableFormatBenchmark.java new file mode 100644 index 000000000000..4d915960f0f6 --- /dev/null +++ b/paimon-format/src/test/java/org/apache/paimon/format/WideTableFormatBenchmark.java @@ -0,0 +1,468 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.format.FileFormatFactory.FormatContext; +import org.apache.paimon.format.mosaic.MosaicFileFormat; +import org.apache.paimon.format.orc.OrcFileFormat; +import org.apache.paimon.format.parquet.ParquetFileFormat; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.options.MemorySize; +import org.apache.paimon.options.Options; +import org.apache.paimon.reader.RecordReader; +import org.apache.paimon.types.DataTypeRoot; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.RowType; + +import java.io.IOException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.List; + +/** + * Benchmark to compare file sizes and projection read performance between Parquet, ORC and Mosaic + * for wide tables (10,000+ columns). + * + *

Run manually: {@code mvn exec:java -pl paimon-format
+ * -Dexec.mainClass="org.apache.paimon.format.WideTableFormatBenchmark"
+ * -Dexec.classpathScope="test"}
+ */
+public class WideTableFormatBenchmark {
+
+    private static final int COLUMN_COUNT = 10000;
+    private static final int ROW_COUNT = 10;
+    private static final String COMPRESSION = "zstd";
+
+    public static void main(String[] args) throws Exception {
+        run(WideTableFormatBenchmark::fileSizeComparison);
+        run(tempDir -> projectionReadPerformance(tempDir, 500));
+        run(tempDir -> projectionReadPerformance(tempDir, 4500));
+    }
+
+    private static void run(Runner runner) throws IOException {
+        java.nio.file.Path tempDir = Files.createTempDirectory("mosaic-benchmark");
+        try {
+            runner.run(tempDir);
+        } finally {
+            deleteRecursively(tempDir);
+        }
+    }
+
+    private static void fileSizeComparison(java.nio.file.Path tempDir) throws IOException {
+        RowType rowType = buildWideRowType();
+        int fieldCount = rowType.getFieldCount();
+        LocalFileIO fileIO = new LocalFileIO();
+
+        long parquetSize =
+                writeParquet(
+                        rowType,
+                        ROW_COUNT,
+                        new Path(tempDir.toString(), "wide_table.parquet"),
+                        fileIO);
+
+        long orcSize =
+                writeOrc(
+                        rowType, ROW_COUNT, new Path(tempDir.toString(), "wide_table.orc"), fileIO);
+
+        Path mosaicPath = new Path(tempDir.toString(), "wide_table.mosaic");
+        long mosaicSize = writeMosaic(rowType, ROW_COUNT, mosaicPath, fileIO);
+
+        System.out.println("=== Wide Table File Size Comparison ===");
+        System.out.println("Columns: " + COLUMN_COUNT + ", Rows: " + ROW_COUNT);
+        System.out.println("Column name avg length: ~80 bytes");
+        System.out.println("Compression: " + COMPRESSION + " (level 9)");
+        System.out.println("---------------------------------------");
+        System.out.printf("Parquet: %,d bytes (%.1f KB)%n", parquetSize, parquetSize / 1024.0);
+        System.out.printf("ORC: %,d bytes (%.1f KB)%n", orcSize, orcSize / 1024.0);
+        System.out.printf("Mosaic: %,d bytes (%.1f KB)%n", mosaicSize, mosaicSize /
1024.0); + System.out.println("---------------------------------------"); + + // verify Mosaic correctness + List mosaicResult = readMosaic(rowType, rowType, mosaicPath, fileIO); + check(mosaicResult.size() == ROW_COUNT, "Row count mismatch"); + for (int r = 0; r < ROW_COUNT; r++) { + GenericRow expected = generateRow(r, fieldCount); + for (int c = 0; c < COLUMN_COUNT; c++) { + assertCellEqual(mosaicResult.get(r), expected, c); + } + } + System.out.println("Correctness check: PASSED"); + } + + private static void projectionReadPerformance(java.nio.file.Path tempDir, int rows) + throws IOException { + RowType rowType = buildWideRowType(); + LocalFileIO fileIO = new LocalFileIO(); + + Path parquetPath = new Path(tempDir.toString(), "proj_test.parquet"); + long parquetFileSize = writeParquet(rowType, rows, parquetPath, fileIO); + + Path orcPath = new Path(tempDir.toString(), "proj_test.orc"); + long orcFileSize = writeOrc(rowType, rows, orcPath, fileIO); + + Path mosaicPath = new Path(tempDir.toString(), "proj_test.mosaic"); + long mosaicFileSize = writeMosaic(rowType, rows, mosaicPath, fileIO); + + int[] projected10Cols = {0, 100, 500, 1000, 2000, 5000, 7000, 8000, 9000, 9999}; + int[] projected1Col = {1000}; + + System.out.printf("\n=== Projection Read Performance (%d rows) ===%n", rows); + System.out.printf( + "File size - Parquet: %.1f MB, ORC: %.1f MB, Mosaic: %.1f MB%n", + parquetFileSize / 1024.0 / 1024.0, + orcFileSize / 1024.0 / 1024.0, + mosaicFileSize / 1024.0 / 1024.0); + System.out.println("---------------------------------------"); + + benchmarkProjection( + rowType, projected10Cols, rows, parquetPath, orcPath, mosaicPath, fileIO); + benchmarkProjection(rowType, projected1Col, rows, parquetPath, orcPath, mosaicPath, fileIO); + } + + private static void benchmarkProjection( + RowType rowType, + int[] projectedColumns, + int rows, + Path parquetPath, + Path orcPath, + Path mosaicPath, + LocalFileIO fileIO) + throws IOException { + RowType projectedType = 
rowType.project(projectedColumns); + + int warmup = 3; + int iterations = 10; + + for (int i = 0; i < warmup; i++) { + readParquetProjected(rowType, projectedType, parquetPath, fileIO); + } + long parquetStart = System.nanoTime(); + for (int i = 0; i < iterations; i++) { + readParquetProjected(rowType, projectedType, parquetPath, fileIO); + } + long parquetTimeNs = (System.nanoTime() - parquetStart) / iterations; + + for (int i = 0; i < warmup; i++) { + readOrcProjected(rowType, projectedType, orcPath, fileIO); + } + long orcStart = System.nanoTime(); + for (int i = 0; i < iterations; i++) { + readOrcProjected(rowType, projectedType, orcPath, fileIO); + } + long orcTimeNs = (System.nanoTime() - orcStart) / iterations; + + for (int i = 0; i < warmup; i++) { + readMosaic(rowType, projectedType, mosaicPath, fileIO); + } + long mosaicStart = System.nanoTime(); + for (int i = 0; i < iterations; i++) { + readMosaic(rowType, projectedType, mosaicPath, fileIO); + } + long mosaicTimeNs = (System.nanoTime() - mosaicStart) / iterations; + + System.out.printf( + "Project %2d / %d cols: Parquet %,d us, ORC %,d us, Mosaic %,d us%n", + projectedColumns.length, + COLUMN_COUNT, + parquetTimeNs / 1000, + orcTimeNs / 1000, + mosaicTimeNs / 1000); + + // verify projection results + List parquetResult = + readParquetProjected(rowType, projectedType, parquetPath, fileIO); + List mosaicResult = readMosaic(rowType, projectedType, mosaicPath, fileIO); + check( + mosaicResult.size() == parquetResult.size(), + "Projection row count mismatch: parquet=" + + parquetResult.size() + + " mosaic=" + + mosaicResult.size()); + for (int r = 0; r < parquetResult.size(); r++) { + for (int c = 0; c < projectedColumns.length; c++) { + int origCol = projectedColumns[c]; + if (isIntColumn(origCol)) { + check( + mosaicResult.get(r).getInt(c) == parquetResult.get(r).getInt(c), + "INT mismatch at row=" + r + " col=" + c); + } else { + check( + mosaicResult + .get(r) + .getString(c) + .toString() + 
.equals(parquetResult.get(r).getString(c).toString()), + "STRING mismatch at row=" + r + " col=" + c); + } + } + } + } + + // ==================== Parquet helpers ==================== + + private static long writeParquet(RowType rowType, int rowCount, Path path, LocalFileIO fileIO) + throws IOException { + ParquetFileFormat parquet = new ParquetFileFormat(createFormatContext()); + FormatWriterFactory writerFactory = parquet.createWriterFactory(rowType); + PositionOutputStream out = fileIO.newOutputStream(path, false); + FormatWriter writer = writerFactory.create(out, COMPRESSION); + int fieldCount = rowType.getFieldCount(); + for (int r = 0; r < rowCount; r++) { + writer.addElement(generateRow(r, fieldCount)); + } + writer.close(); + out.close(); + return fileIO.getFileSize(path); + } + + private static List readParquetProjected( + RowType fullType, RowType projectedType, Path path, LocalFileIO fileIO) + throws IOException { + ParquetFileFormat parquet = new ParquetFileFormat(createFormatContext()); + RecordReader reader = + parquet.createReaderFactory(fullType, projectedType, null) + .createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + List result = new ArrayList<>(); + reader.forEachRemaining( + row -> { + Object[] fields = new Object[projectedType.getFieldCount()]; + for (int i = 0; i < fields.length; i++) { + if (row.isNullAt(i)) { + fields[i] = null; + } else if (projectedType.getTypeAt(i).getTypeRoot() + == DataTypeRoot.INTEGER) { + fields[i] = row.getInt(i); + } else { + fields[i] = BinaryString.fromString(row.getString(i).toString()); + } + } + result.add(GenericRow.of(fields)); + }); + reader.close(); + return result; + } + + // ==================== ORC helpers ==================== + + private static long writeOrc(RowType rowType, int rowCount, Path path, LocalFileIO fileIO) + throws IOException { + OrcFileFormat orc = new OrcFileFormat(createFormatContext()); + FormatWriterFactory writerFactory = 
orc.createWriterFactory(rowType); + PositionOutputStream out = fileIO.newOutputStream(path, false); + FormatWriter writer = writerFactory.create(out, COMPRESSION); + int fieldCount = rowType.getFieldCount(); + for (int r = 0; r < rowCount; r++) { + writer.addElement(generateRow(r, fieldCount)); + } + writer.close(); + out.close(); + return fileIO.getFileSize(path); + } + + private static List readOrcProjected( + RowType fullType, RowType projectedType, Path path, LocalFileIO fileIO) + throws IOException { + OrcFileFormat orc = new OrcFileFormat(createFormatContext()); + RecordReader reader = + orc.createReaderFactory(fullType, projectedType, new ArrayList<>()) + .createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + List result = new ArrayList<>(); + reader.forEachRemaining( + row -> { + Object[] fields = new Object[projectedType.getFieldCount()]; + for (int i = 0; i < fields.length; i++) { + if (row.isNullAt(i)) { + fields[i] = null; + } else if (projectedType.getTypeAt(i).getTypeRoot() + == DataTypeRoot.INTEGER) { + fields[i] = row.getInt(i); + } else { + fields[i] = row.getString(i); + } + } + result.add(GenericRow.of(fields)); + }); + reader.close(); + return result; + } + + // ==================== Mosaic helpers ==================== + + private static long writeMosaic(RowType rowType, int rowCount, Path path, LocalFileIO fileIO) + throws IOException { + MosaicFileFormat mosaic = new MosaicFileFormat(createFormatContext()); + FormatWriterFactory writerFactory = mosaic.createWriterFactory(rowType); + PositionOutputStream out = fileIO.newOutputStream(path, false); + FormatWriter writer = writerFactory.create(out, COMPRESSION); + int fieldCount = rowType.getFieldCount(); + for (int r = 0; r < rowCount; r++) { + writer.addElement(generateRow(r, fieldCount)); + } + writer.close(); + out.close(); + return fileIO.getFileSize(path); + } + + private static List readMosaic( + RowType fullType, RowType projectedType, Path path, LocalFileIO 
fileIO) + throws IOException { + MosaicFileFormat mosaic = new MosaicFileFormat(createFormatContext()); + RecordReader reader = + mosaic.createReaderFactory(fullType, projectedType, null) + .createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + List result = new ArrayList<>(); + reader.forEachRemaining( + row -> { + Object[] fields = new Object[projectedType.getFieldCount()]; + for (int i = 0; i < fields.length; i++) { + if (row.isNullAt(i)) { + fields[i] = null; + } else if (projectedType.getTypeAt(i).getTypeRoot() + == DataTypeRoot.INTEGER) { + fields[i] = row.getInt(i); + } else { + fields[i] = row.getString(i); + } + } + result.add(GenericRow.of(fields)); + }); + reader.close(); + return result; + } + + // ==================== Helpers ==================== + + private static final int INT_COLUMN_INTERVAL = 10; + private static final String[] STRING_SAMPLES = { + "uuid: 550e8400-e29b-41d4-a716-446655440000", + "{\"user_id\": 12345, \"action\": \"click\", \"page\": \"home\"}", + "https://example.com/api/v1/resource/abc123?query=active&sort=desc", + "customer_service@company-name.example.com", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "2024-01-15T09:23:47.123Z", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "error: connection timeout after 30000ms, retrying...", + "session_token_a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p6q7r8s9t0", + "active,verified,premium,notifications_enabled,marketing_opt_in", + "New York, NY 10001, United States", + "REF-ORD-2024-8847293-XJ", + "0x7f8a9b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9", + "Approved by manager at 2024-01-15T10:00:00Z", + "file:///data/storage/partition_2024_01/batch_17.parquet", + "[ERROR] NullPointerException at com.example.Service.processLine(42)", + "User preferences: theme=dark, lang=zh-CN, timezone=Asia/Shanghai", + "192.168.1.105", + "Batch job completed successfully. 
Processed 1,234,567 records in 45.3s.", + "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0", + "Shipping via FedEx Ground, tracking #: 784930123456, est. 3 business days", + "comment: This product exceeded my expectations! Would recommend to everyone.", + "department=engineering|team=platform|role=senior|level=L6", + "version=3.2.1-SNAPSHOT, build=20240115.1423, commit=abc123def", + "payment_method=visa_ending_4242|billing_cycle=monthly|amount=99.99USD" + }; + + private static RowType buildWideRowType() { + RowType.Builder builder = RowType.builder(); + for (int i = 0; i < COLUMN_COUNT; i++) { + String name = + String.format( + "this_is_a_very_long_column_name_for_testing_compression_ratio_column_index_%05d", + i); + if (i % INT_COLUMN_INTERVAL == 0) { + builder.field(name, DataTypes.INT()); + } else { + builder.field(name, DataTypes.STRING()); + } + } + return builder.build(); + } + + private static GenericRow generateRow(int rowIndex, int fieldCount) { + Object[] fields = new Object[fieldCount]; + for (int c = 0; c < fieldCount; c++) { + if (c % INT_COLUMN_INTERVAL == 0) { + fields[c] = rowIndex * fieldCount + c; + } else { + int sampleIdx = (rowIndex + c) % STRING_SAMPLES.length; + fields[c] = + BinaryString.fromString( + STRING_SAMPLES[sampleIdx] + + " [row=" + + rowIndex + + ",col=" + + c + + "]"); + } + } + return GenericRow.of(fields); + } + + private static boolean isIntColumn(int index) { + return index % INT_COLUMN_INTERVAL == 0; + } + + private static void assertCellEqual(InternalRow actual, InternalRow expected, int col) { + if (isIntColumn(col)) { + check(actual.getInt(col) == expected.getInt(col), "INT mismatch at col=" + col); + } else { + check( + actual.getString(col).toString().equals(expected.getString(col).toString()), + "STRING mismatch at col=" + col); + } + } + + private static void check(boolean condition, String message) { + if (!condition) { + throw new AssertionError(message); + } + } + + private static FormatContext 
createFormatContext() { + return new FormatContext(new Options(), 1024, 1024, MemorySize.ofMebiBytes(128), 9, null); + } + + private static void deleteRecursively(java.nio.file.Path dir) { + try { + Files.walk(dir) + .sorted(java.util.Comparator.reverseOrder()) + .forEach( + p -> { + try { + Files.deleteIfExists(p); + } catch (IOException e) { + // ignore + } + }); + } catch (IOException e) { + // ignore + } + } + + private interface Runner { + void run(java.nio.file.Path tempDir) throws IOException; + } +} diff --git a/paimon-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java b/paimon-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java new file mode 100644 index 000000000000..17e4d3a5a392 --- /dev/null +++ b/paimon-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java @@ -0,0 +1,1244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.Decimal; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.Timestamp; +import org.apache.paimon.format.FileFormatFactory.FormatContext; +import org.apache.paimon.format.FormatReaderContext; +import org.apache.paimon.format.FormatReaderFactory; +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.format.FormatWriterFactory; +import org.apache.paimon.format.orc.OrcFileFormat; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.options.MemorySize; +import org.apache.paimon.options.Options; +import org.apache.paimon.reader.FileRecordReader; +import org.apache.paimon.reader.RecordReader; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.RowType; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Tests for the Mosaic file format. 
*/ +public class MosaicFileFormatTest { + + @TempDir java.nio.file.Path tempDir; + + @Test + public void testBasicRoundTrip() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("name", DataTypes.STRING()) + .field("value", DataTypes.DOUBLE()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + data.add(GenericRow.of(i, BinaryString.fromString("name_" + i), i * 1.5)); + } + + Path path = new Path(tempDir.toString(), "basic.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(100); + for (int i = 0; i < 100; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).getString(1).toString()).isEqualTo("name_" + i); + assertThat(result.get(i).getDouble(2)).isEqualTo(i * 1.5); + } + } + + @Test + public void testProjectionPushdown() throws IOException { + RowType rowType = + RowType.builder() + .field("a", DataTypes.INT()) + .field("b", DataTypes.STRING()) + .field("c", DataTypes.BIGINT()) + .field("d", DataTypes.DOUBLE()) + .field("e", DataTypes.FLOAT()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 50; i++) { + data.add( + GenericRow.of( + i, + BinaryString.fromString("val_" + i), + (long) i * 100, + i * 2.5, + (float) i * 0.1f)); + } + + Path path = new Path(tempDir.toString(), "proj.mosaic"); + write(rowType, data, path); + + // Project only columns a and c + RowType projectedType = + RowType.builder() + .field("a", DataTypes.INT()) + .field("c", DataTypes.BIGINT()) + .build(); + + List result = read(rowType, projectedType, path); + + assertThat(result).hasSize(50); + for (int i = 0; i < 50; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).getLong(1)).isEqualTo((long) i * 100); + } + } + + @Test + public void testProjectionSkipsVariableLengthColumns() throws IOException { + RowType rowType = + RowType.builder() + .field("f_int", 
DataTypes.INT()) + .field("f_str1", DataTypes.STRING()) + .field("f_bytes", DataTypes.BYTES()) + .field("f_str2", DataTypes.STRING()) + .field("f_decimal_large", DataTypes.DECIMAL(30, 5)) + .field("f_target", DataTypes.BIGINT()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + data.add( + GenericRow.of( + i, + BinaryString.fromString("variable_length_string_" + i), + ("binary_data_" + i).getBytes(), + BinaryString.fromString("another_string_value_" + i), + Decimal.fromBigDecimal( + new BigDecimal("123456789012345678901234.12345"), 30, 5), + (long) i * 1000)); + } + + Path path = new Path(tempDir.toString(), "skip_varlen.mosaic"); + write(rowType, data, path); + + // Project only f_int and f_target, forcing reader to skip variable-length columns in + // between + RowType projectedType = + RowType.builder() + .field("f_int", DataTypes.INT()) + .field("f_target", DataTypes.BIGINT()) + .build(); + + List result = read(rowType, projectedType, path); + + assertThat(result).hasSize(100); + for (int i = 0; i < 100; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).getLong(1)).isEqualTo((long) i * 1000); + } + } + + @Test + public void testNullValues() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("name", DataTypes.STRING().nullable()) + .field("value", DataTypes.DOUBLE().nullable()) + .build(); + + List data = new ArrayList<>(); + data.add(GenericRow.of(1, BinaryString.fromString("hello"), 1.0)); + data.add(GenericRow.of(2, null, 2.0)); + data.add(GenericRow.of(3, BinaryString.fromString("world"), null)); + data.add(GenericRow.of(4, null, null)); + + Path path = new Path(tempDir.toString(), "nulls.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(4); + + assertThat(result.get(0).getInt(0)).isEqualTo(1); + assertThat(result.get(0).getString(1).toString()).isEqualTo("hello"); + 
assertThat(result.get(0).getDouble(2)).isEqualTo(1.0); + + assertThat(result.get(1).getInt(0)).isEqualTo(2); + assertThat(result.get(1).isNullAt(1)).isTrue(); + assertThat(result.get(1).getDouble(2)).isEqualTo(2.0); + + assertThat(result.get(2).getInt(0)).isEqualTo(3); + assertThat(result.get(2).getString(1).toString()).isEqualTo("world"); + assertThat(result.get(2).isNullAt(2)).isTrue(); + + assertThat(result.get(3).getInt(0)).isEqualTo(4); + assertThat(result.get(3).isNullAt(1)).isTrue(); + assertThat(result.get(3).isNullAt(2)).isTrue(); + } + + @Test + public void testAllPrimitiveTypes() throws IOException { + RowType rowType = + RowType.builder() + .field("f_boolean", DataTypes.BOOLEAN()) + .field("f_tinyint", DataTypes.TINYINT()) + .field("f_smallint", DataTypes.SMALLINT()) + .field("f_int", DataTypes.INT()) + .field("f_bigint", DataTypes.BIGINT()) + .field("f_float", DataTypes.FLOAT()) + .field("f_double", DataTypes.DOUBLE()) + .field("f_string", DataTypes.STRING()) + .field("f_bytes", DataTypes.BYTES()) + .field("f_decimal_compact", DataTypes.DECIMAL(10, 2)) + .field("f_decimal_large", DataTypes.DECIMAL(30, 5)) + .field("f_date", DataTypes.DATE()) + .field("f_timestamp", DataTypes.TIMESTAMP(3)) + .field("f_timestamp_high", DataTypes.TIMESTAMP(9)) + .build(); + + List data = new ArrayList<>(); + data.add( + GenericRow.of( + true, + (byte) 42, + (short) 1234, + 999999, + 123456789012345L, + 3.14f, + 2.718281828, + BinaryString.fromString("hello world"), + new byte[] {1, 2, 3, 4, 5}, + Decimal.fromBigDecimal(new BigDecimal("12345.67"), 10, 2), + Decimal.fromBigDecimal( + new BigDecimal("123456789012345678901234.12345"), 30, 5), + 19000, // days since epoch + Timestamp.fromEpochMillis(1700000000000L), + Timestamp.fromEpochMillis(1700000000000L, 123456))); + + Path path = new Path(tempDir.toString(), "all_types.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(1); + InternalRow row = 
result.get(0); + assertThat(row.getBoolean(0)).isTrue(); + assertThat(row.getByte(1)).isEqualTo((byte) 42); + assertThat(row.getShort(2)).isEqualTo((short) 1234); + assertThat(row.getInt(3)).isEqualTo(999999); + assertThat(row.getLong(4)).isEqualTo(123456789012345L); + assertThat(row.getFloat(5)).isEqualTo(3.14f); + assertThat(row.getDouble(6)).isEqualTo(2.718281828); + assertThat(row.getString(7).toString()).isEqualTo("hello world"); + assertThat(row.getBinary(8)).isEqualTo(new byte[] {1, 2, 3, 4, 5}); + assertThat(row.getDecimal(9, 10, 2).toBigDecimal()) + .isEqualByComparingTo(new BigDecimal("12345.67")); + assertThat(row.getDecimal(10, 30, 5).toBigDecimal()) + .isEqualByComparingTo(new BigDecimal("123456789012345678901234.12345")); + assertThat(row.getInt(11)).isEqualTo(19000); + assertThat(row.getTimestamp(12, 3).getMillisecond()).isEqualTo(1700000000000L); + assertThat(row.getTimestamp(13, 9).getMillisecond()).isEqualTo(1700000000000L); + assertThat(row.getTimestamp(13, 9).getNanoOfMillisecond()).isEqualTo(123456); + } + + @Test + public void testWideTable() throws IOException { + int columnCount = 10000; + int rowCount = 10; + + RowType rowType = buildWideRowType(columnCount); + List data = new ArrayList<>(); + for (int r = 0; r < rowCount; r++) { + Object[] fields = new Object[columnCount]; + for (int c = 0; c < columnCount; c++) { + fields[c] = r * columnCount + c; + } + data.add(GenericRow.of(fields)); + } + + Path path = new Path(tempDir.toString(), "wide.mosaic"); + LocalFileIO fileIO = new LocalFileIO(); + write(rowType, data, path); + long mosaicSize = fileIO.getFileSize(path); + + // Compare with ORC + Path orcPath = new Path(tempDir.toString(), "wide.orc"); + OrcFileFormat orc = + new OrcFileFormat( + new FormatContext( + new Options(), 1024, 1024, MemorySize.ofMebiBytes(128), 9, null)); + FormatWriterFactory orcWriterFactory = orc.createWriterFactory(rowType); + PositionOutputStream orcOut = fileIO.newOutputStream(orcPath, false); + FormatWriter 
orcWriter = orcWriterFactory.create(orcOut, "zstd"); + for (InternalRow row : data) { + orcWriter.addElement(row); + } + orcWriter.close(); + orcOut.close(); + long orcSize = fileIO.getFileSize(orcPath); + + System.out.println("=== Wide Table: Mosaic vs ORC ==="); + System.out.printf("Mosaic: %,d bytes (%.1f KB)%n", mosaicSize, mosaicSize / 1024.0); + System.out.printf("ORC: %,d bytes (%.1f KB)%n", orcSize, orcSize / 1024.0); + System.out.printf("Ratio: ORC is %.1fx larger%n", (double) orcSize / mosaicSize); + + assertThat(mosaicSize).isLessThan(orcSize); + + // Verify correctness + List result = read(rowType, rowType, path); + assertThat(result).hasSize(rowCount); + for (int r = 0; r < rowCount; r++) { + for (int c = 0; c < columnCount; c++) { + assertThat(result.get(r).getInt(c)).isEqualTo(r * columnCount + c); + } + } + } + + @Test + public void testWideTableProjection() throws IOException { + int columnCount = 10000; + int rowCount = 100; + + RowType rowType = buildWideRowType(columnCount); + List data = new ArrayList<>(); + for (int r = 0; r < rowCount; r++) { + Object[] fields = new Object[columnCount]; + for (int c = 0; c < columnCount; c++) { + fields[c] = r * columnCount + c; + } + data.add(GenericRow.of(fields)); + } + + Path path = new Path(tempDir.toString(), "wide_proj.mosaic"); + write(rowType, data, path); + + // Project 10 columns + int[] projectedIndices = {0, 100, 500, 1000, 2000, 5000, 7000, 8000, 9000, 9999}; + RowType projectedType = rowType.project(projectedIndices); + + List result = read(rowType, projectedType, path); + + assertThat(result).hasSize(rowCount); + for (int r = 0; r < rowCount; r++) { + for (int i = 0; i < projectedIndices.length; i++) { + int c = projectedIndices[i]; + assertThat(result.get(r).getInt(i)).isEqualTo(r * columnCount + c); + } + } + } + + @Test + public void testEmptyTable() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("name", DataTypes.STRING()) + .build(); 
+ + Path path = new Path(tempDir.toString(), "empty.mosaic"); + write(rowType, new ArrayList<>(), path); + List result = read(rowType, rowType, path); + assertThat(result).isEmpty(); + } + + @Test + public void testSingleColumn() throws IOException { + RowType rowType = RowType.builder().field("id", DataTypes.INT()).build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 10; i++) { + data.add(GenericRow.of(i)); + } + + Path path = new Path(tempDir.toString(), "single.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(10); + for (int i = 0; i < 10; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + } + } + + @Test + public void testMultiRowGroupStringStability() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("name", DataTypes.STRING()) + .build(); + + // Use tiny writeBatchMemory to force multiple row groups + MosaicFileFormat format = + new MosaicFileFormat( + new FormatContext( + new Options(), 1024, 1024, MemorySize.ofBytes(1), 3, null)); + + List data = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + data.add(GenericRow.of(i, BinaryString.fromString("string_value_" + i))); + } + + Path path = new Path(tempDir.toString(), "multi_rg_string.mosaic"); + LocalFileIO fileIO = new LocalFileIO(); + FormatWriterFactory writerFactory = format.createWriterFactory(rowType); + PositionOutputStream out = fileIO.newOutputStream(path, false); + FormatWriter writer = writerFactory.create(out, "zstd"); + for (InternalRow row : data) { + writer.addElement(row); + } + writer.close(); + out.close(); + + // Project only the string column + RowType projectedType = RowType.builder().field("name", DataTypes.STRING()).build(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, projectedType, null); + FileRecordReader reader = + (FileRecordReader) + readerFactory.createReader( + new FormatReaderContext(fileIO, path, 
fileIO.getFileSize(path))); + + // Read batches one by one; retain string values from earlier batches + List allStrings = new ArrayList<>(); + RecordReader.RecordIterator batch; + while ((batch = reader.readBatch()) != null) { + InternalRow row; + while ((row = batch.next()) != null) { + allStrings.add(row.getString(0)); + } + batch.releaseBatch(); + } + reader.close(); + + // Verify all retained strings are still correct + assertThat(allStrings).hasSize(100); + for (int i = 0; i < 100; i++) { + assertThat(allStrings.get(i).toString()).isEqualTo("string_value_" + i); + } + } + + // ==================== Columnar Encoding Tests ==================== + + @Test + public void testConstEncoding() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("const_int", DataTypes.INT()) + .field("const_long", DataTypes.BIGINT()) + .field("const_double", DataTypes.DOUBLE()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 200; i++) { + data.add(GenericRow.of(i, 42, 999L, 3.14)); + } + + Path path = new Path(tempDir.toString(), "const_enc.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(200); + for (int i = 0; i < 200; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).getInt(1)).isEqualTo(42); + assertThat(result.get(i).getLong(2)).isEqualTo(999L); + assertThat(result.get(i).getDouble(3)).isEqualTo(3.14); + } + } + + @Test + public void testConstEncodingWithNulls() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("const_nullable", DataTypes.INT().nullable()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + data.add(GenericRow.of(i, i % 3 == 0 ? 
null : 42)); + } + + Path path = new Path(tempDir.toString(), "const_null.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(100); + for (int i = 0; i < 100; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + if (i % 3 == 0) { + assertThat(result.get(i).isNullAt(1)).isTrue(); + } else { + assertThat(result.get(i).getInt(1)).isEqualTo(42); + } + } + } + + @Test + public void testBooleanConstEncoding() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("flag_true", DataTypes.BOOLEAN()) + .field("flag_false", DataTypes.BOOLEAN()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + data.add(GenericRow.of(i, true, false)); + } + + Path path = new Path(tempDir.toString(), "bool_const.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(100); + for (int i = 0; i < 100; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).getBoolean(1)).isTrue(); + assertThat(result.get(i).getBoolean(2)).isFalse(); + } + } + + @Test + public void testBooleanDictEncoding() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("flag", DataTypes.BOOLEAN()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + data.add(GenericRow.of(i, i % 2 == 0)); + } + + Path path = new Path(tempDir.toString(), "bool_dict.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(100); + for (int i = 0; i < 100; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).getBoolean(1)).isEqualTo(i % 2 == 0); + } + } + + @Test + public void testDictEncoding() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("status", DataTypes.INT()) + 
.field("category", DataTypes.BIGINT()) + .field("level", DataTypes.SMALLINT()) + .build(); + + int[] statuses = {1, 2, 3, 4, 5}; + long[] categories = {100L, 200L, 300L}; + short[] levels = {10, 20}; + + List data = new ArrayList<>(); + for (int i = 0; i < 200; i++) { + data.add(GenericRow.of(i, statuses[i % 5], categories[i % 3], levels[i % 2])); + } + + Path path = new Path(tempDir.toString(), "dict_enc.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(200); + for (int i = 0; i < 200; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).getInt(1)).isEqualTo(statuses[i % 5]); + assertThat(result.get(i).getLong(2)).isEqualTo(categories[i % 3]); + assertThat(result.get(i).getShort(3)).isEqualTo(levels[i % 2]); + } + } + + @Test + public void testDictEncodingWithNulls() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("flag", DataTypes.TINYINT().nullable()) + .build(); + + byte[] flags = {1, 2, 3}; + List data = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + data.add(GenericRow.of(i, i % 4 == 0 ? 
null : flags[i % 3])); + } + + Path path = new Path(tempDir.toString(), "dict_null.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(100); + for (int i = 0; i < 100; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + if (i % 4 == 0) { + assertThat(result.get(i).isNullAt(1)).isTrue(); + } else { + assertThat(result.get(i).getByte(1)).isEqualTo(flags[i % 3]); + } + } + } + + @Test + public void testDictEncodingBoundary() throws IOException { + RowType rowType = + RowType.builder() + .field("dict_255", DataTypes.INT()) + .field("plain_256", DataTypes.INT()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 512; i++) { + data.add(GenericRow.of(i % 255, i % 256)); + } + + Path path = new Path(tempDir.toString(), "dict_boundary.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(512); + for (int i = 0; i < 512; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i % 255); + assertThat(result.get(i).getInt(1)).isEqualTo(i % 256); + } + } + + @Test + public void testFloatDictEncoding() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("f_float", DataTypes.FLOAT()) + .field("f_double", DataTypes.DOUBLE()) + .build(); + + float[] floats = {1.5f, 2.5f, 3.5f}; + double[] doubles = {10.1, 20.2}; + + List data = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + data.add(GenericRow.of(i, floats[i % 3], doubles[i % 2])); + } + + Path path = new Path(tempDir.toString(), "float_dict.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(100); + for (int i = 0; i < 100; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).getFloat(1)).isEqualTo(floats[i % 3]); + assertThat(result.get(i).getDouble(2)).isEqualTo(doubles[i % 2]); + } + } + + @Test + public void 
testAllNullEncoding() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("always_null_str", DataTypes.STRING().nullable()) + .field("always_null_dbl", DataTypes.DOUBLE().nullable()) + .field("always_null_int", DataTypes.INT().nullable()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 50; i++) { + data.add(GenericRow.of(i, null, null, null)); + } + + Path path = new Path(tempDir.toString(), "all_null_enc.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(50); + for (int i = 0; i < 50; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).isNullAt(1)).isTrue(); + assertThat(result.get(i).isNullAt(2)).isTrue(); + assertThat(result.get(i).isNullAt(3)).isTrue(); + } + } + + @Test + public void testMixedEncodings() throws IOException { + RowType rowType = + RowType.builder() + .field("plain_col", DataTypes.INT()) + .field("const_col", DataTypes.BIGINT()) + .field("dict_col", DataTypes.SMALLINT()) + .field("all_null_col", DataTypes.DOUBLE().nullable()) + .field("plain_str", DataTypes.STRING()) + .build(); + + short[] dictValues = {10, 20, 30, 40, 50}; + List data = new ArrayList<>(); + for (int i = 0; i < 1000; i++) { + data.add( + GenericRow.of( + i, 999L, dictValues[i % 5], null, BinaryString.fromString("str_" + i))); + } + + Path path = new Path(tempDir.toString(), "mixed_enc.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(1000); + for (int i = 0; i < 1000; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).getLong(1)).isEqualTo(999L); + assertThat(result.get(i).getShort(2)).isEqualTo(dictValues[i % 5]); + assertThat(result.get(i).isNullAt(3)).isTrue(); + assertThat(result.get(i).getString(4).toString()).isEqualTo("str_" + i); + } + } + + @Test + public void testMixedEncodingsWithProjection() throws 
IOException { + RowType rowType = + RowType.builder() + .field("plain_col", DataTypes.INT()) + .field("const_col", DataTypes.BIGINT()) + .field("dict_col", DataTypes.SMALLINT()) + .field("all_null_col", DataTypes.DOUBLE().nullable()) + .field("plain_str", DataTypes.STRING()) + .build(); + + short[] dictValues = {10, 20, 30}; + List data = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + data.add( + GenericRow.of( + i, 42L, dictValues[i % 3], null, BinaryString.fromString("s" + i))); + } + + Path path = new Path(tempDir.toString(), "mixed_proj.mosaic"); + write(rowType, data, path); + + RowType projectedType = + RowType.builder() + .field("dict_col", DataTypes.SMALLINT()) + .field("const_col", DataTypes.BIGINT()) + .build(); + + List result = read(rowType, projectedType, path); + assertThat(result).hasSize(100); + for (int i = 0; i < 100; i++) { + assertThat(result.get(i).getShort(0)).isEqualTo(dictValues[i % 3]); + assertThat(result.get(i).getLong(1)).isEqualTo(42L); + } + } + + // ==================== Schema Prefix Compression Tests ==================== + + @Test + public void testSchemaPrefixCompression() throws IOException { + int numCols = 100; + RowType.Builder builder = RowType.builder(); + for (int i = 0; i < numCols; i++) { + builder.field( + "com.example.sensors.signal_" + String.format("%03d", i), + DataTypes.DOUBLE().nullable()); + } + RowType rowType = builder.build(); + + List data = new ArrayList<>(); + for (int r = 0; r < 50; r++) { + Object[] fields = new Object[numCols]; + for (int c = 0; c < numCols; c++) { + fields[c] = (double) (r * numCols + c); + } + data.add(GenericRow.of(fields)); + } + + Path path = new Path(tempDir.toString(), "prefix.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(50); + for (int r = 0; r < 50; r++) { + for (int c = 0; c < numCols; c++) { + assertThat(result.get(r).getDouble(c)).isEqualTo((double) (r * numCols + c)); + } + } + + RowType 
projectedType = + RowType.builder() + .field("com.example.sensors.signal_050", DataTypes.DOUBLE().nullable()) + .build(); + List projected = read(rowType, projectedType, path); + assertThat(projected).hasSize(50); + for (int r = 0; r < 50; r++) { + assertThat(projected.get(r).getDouble(0)).isEqualTo((double) (r * numCols + 50)); + } + } + + @Test + public void testSchemaMixedPrefixAndNonPrefix() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("group.a.signal_1", DataTypes.DOUBLE()) + .field("group.a.signal_2", DataTypes.DOUBLE()) + .field("name", DataTypes.STRING()) + .field("group.b.signal_1", DataTypes.FLOAT()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 20; i++) { + data.add( + GenericRow.of( + i, + (double) i, + (double) (i * 2), + BinaryString.fromString("n" + i), + (float) i)); + } + + Path path = new Path(tempDir.toString(), "mixed_prefix.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(20); + for (int i = 0; i < 20; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).getDouble(1)).isEqualTo((double) i); + assertThat(result.get(i).getDouble(2)).isEqualTo((double) (i * 2)); + assertThat(result.get(i).getString(3).toString()).isEqualTo("n" + i); + assertThat(result.get(i).getFloat(4)).isEqualTo((float) i); + } + } + + @Test + public void testSchemaSerializationRoundTrip() throws IOException { + RowType rowType = + RowType.builder() + .field("simple", DataTypes.INT()) + .field("a.b.col1", DataTypes.DOUBLE()) + .field("a.b.col2", DataTypes.STRING()) + .field("x.y.z.col3", DataTypes.BIGINT()) + .build(); + + MosaicSchema original = MosaicSchema.create(rowType, 10); + byte[] serialized = original.serialize(); + MosaicSchema restored = MosaicSchema.deserialize(serialized); + + assertThat(restored.numBuckets()).isEqualTo(10); + + RowType projAll = rowType; + for (int b = 0; b < 10; b++) { 
+ int[] origMapping = original.getProjectionMapping(b, projAll); + int[] restoredMapping = restored.getProjectionMapping(b, projAll); + if (origMapping == null) { + assertThat(restoredMapping).isNull(); + } else { + assertThat(restoredMapping).isEqualTo(origMapping); + } + } + } + + // ==================== ALL_NULL Column Pruning Tests ==================== + + @Test + public void testAllNullColumnPruningRoundTrip() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("null_col_1", DataTypes.DOUBLE().nullable()) + .field("value", DataTypes.BIGINT()) + .field("null_col_2", DataTypes.STRING().nullable()) + .field("null_col_3", DataTypes.INT().nullable()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + data.add(GenericRow.of(i, null, (long) i * 10, null, null)); + } + + Path path = new Path(tempDir.toString(), "prune.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(100); + for (int i = 0; i < 100; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).isNullAt(1)).isTrue(); + assertThat(result.get(i).getLong(2)).isEqualTo((long) i * 10); + assertThat(result.get(i).isNullAt(3)).isTrue(); + assertThat(result.get(i).isNullAt(4)).isTrue(); + } + } + + @Test + public void testProjectPrunedAllNullColumn() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("always_null", DataTypes.DOUBLE().nullable()) + .field("value", DataTypes.INT()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 50; i++) { + data.add(GenericRow.of(i, null, i * 2)); + } + + Path path = new Path(tempDir.toString(), "proj_pruned.mosaic"); + write(rowType, data, path); + + RowType projNull = + RowType.builder().field("always_null", DataTypes.DOUBLE().nullable()).build(); + List result = read(rowType, projNull, path); + assertThat(result).hasSize(50); + for 
(int i = 0; i < 50; i++) { + assertThat(result.get(i).isNullAt(0)).isTrue(); + } + + RowType projMixed = + RowType.builder() + .field("always_null", DataTypes.DOUBLE().nullable()) + .field("value", DataTypes.INT()) + .build(); + List result2 = read(rowType, projMixed, path); + assertThat(result2).hasSize(50); + for (int i = 0; i < 50; i++) { + assertThat(result2.get(i).isNullAt(0)).isTrue(); + assertThat(result2.get(i).getInt(1)).isEqualTo(i * 2); + } + } + + @Test + public void testAllNullPruningWideTable() throws IOException { + int totalCols = 500; + int nonNullCols = 50; + + RowType.Builder builder = RowType.builder(); + for (int i = 0; i < totalCols; i++) { + builder.field("col_" + String.format("%04d", i), DataTypes.INT().nullable()); + } + RowType rowType = builder.build(); + + List data = new ArrayList<>(); + for (int r = 0; r < 100; r++) { + Object[] fields = new Object[totalCols]; + for (int c = 0; c < nonNullCols; c++) { + fields[c] = r * totalCols + c; + } + data.add(GenericRow.of(fields)); + } + + Path path = new Path(tempDir.toString(), "wide_prune.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(100); + for (int r = 0; r < 100; r++) { + for (int c = 0; c < nonNullCols; c++) { + assertThat(result.get(r).getInt(c)).isEqualTo(r * totalCols + c); + } + for (int c = nonNullCols; c < totalCols; c++) { + assertThat(result.get(r).isNullAt(c)).isTrue(); + } + } + + // Verify pruning reduced schema size (compared to no pruning) + LocalFileIO fileIO = new LocalFileIO(); + long prunedFileSize = fileIO.getFileSize(path); + + // Write same data without pruning (multi-row-group forces no pruning) + Path noPrunePath = new Path(tempDir.toString(), "wide_no_prune.mosaic"); + MosaicFileFormat tinyFormat = + new MosaicFileFormat( + new FormatContext( + new Options(), 1024, 1024, MemorySize.ofBytes(1), 3, null)); + FormatWriterFactory noPruneFactory = tinyFormat.createWriterFactory(rowType); + 
PositionOutputStream noPruneOut = fileIO.newOutputStream(noPrunePath, false); + FormatWriter noPruneWriter = noPruneFactory.create(noPruneOut, "zstd"); + for (InternalRow row : data) { + noPruneWriter.addElement(row); + } + noPruneWriter.close(); + noPruneOut.close(); + long noPruneSize = fileIO.getFileSize(noPrunePath); + + System.out.printf( + "Pruning test: pruned=%,d bytes, unpruned=%,d bytes, saved=%.0f%%%n", + prunedFileSize, noPruneSize, (1.0 - (double) prunedFileSize / noPruneSize) * 100); + assertThat(prunedFileSize).isLessThan(noPruneSize); + } + + @Test + public void testMultiRowGroupNoPruning() throws IOException { + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("nullable", DataTypes.INT().nullable()) + .build(); + + MosaicFileFormat format = + new MosaicFileFormat( + new FormatContext( + new Options(), 1024, 1024, MemorySize.ofBytes(1), 3, null)); + + List data = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + data.add(GenericRow.of(i, i == 0 ? 
42 : null)); + } + + Path path = new Path(tempDir.toString(), "multi_rg_no_prune.mosaic"); + LocalFileIO fileIO = new LocalFileIO(); + FormatWriterFactory writerFactory = format.createWriterFactory(rowType); + PositionOutputStream out = fileIO.newOutputStream(path, false); + FormatWriter writer = writerFactory.create(out, "zstd"); + for (InternalRow row : data) { + writer.addElement(row); + } + writer.close(); + out.close(); + + FormatReaderFactory readerFactory = format.createReaderFactory(rowType, rowType, null); + List result = new ArrayList<>(); + try (RecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path)))) { + reader.forEachRemaining( + row -> { + Object[] fields = new Object[rowType.getFieldCount()]; + for (int i = 0; i < fields.length; i++) { + if (!row.isNullAt(i)) { + fields[i] = + InternalRow.createFieldGetter(rowType.getTypeAt(i), i) + .getFieldOrNull(row); + } + } + result.add(GenericRow.of(fields)); + }); + } + + assertThat(result).hasSize(100); + assertThat(result.get(0).getInt(0)).isEqualTo(0); + assertThat(result.get(0).getInt(1)).isEqualTo(42); + for (int i = 1; i < 100; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).isNullAt(1)).isTrue(); + } + } + + @Test + public void testAllColumnsAllNull() throws IOException { + RowType rowType = + RowType.builder() + .field("a", DataTypes.INT().nullable()) + .field("b", DataTypes.STRING().nullable()) + .field("c", DataTypes.DOUBLE().nullable()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 30; i++) { + data.add(GenericRow.of(null, null, null)); + } + + Path path = new Path(tempDir.toString(), "all_cols_null.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(30); + for (int i = 0; i < 30; i++) { + assertThat(result.get(i).isNullAt(0)).isTrue(); + assertThat(result.get(i).isNullAt(1)).isTrue(); + 
assertThat(result.get(i).isNullAt(2)).isTrue(); + } + } + + // ==================== Helpers ==================== + + private void write(RowType rowType, List data, Path path) throws IOException { + LocalFileIO fileIO = new LocalFileIO(); + MosaicFileFormat format = createFormat(); + FormatWriterFactory writerFactory = format.createWriterFactory(rowType); + PositionOutputStream out = fileIO.newOutputStream(path, false); + FormatWriter writer = writerFactory.create(out, "zstd"); + for (InternalRow row : data) { + writer.addElement(row); + } + writer.close(); + out.close(); + } + + private List read(RowType dataType, RowType projectedType, Path path) + throws IOException { + LocalFileIO fileIO = new LocalFileIO(); + MosaicFileFormat format = createFormat(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataType, projectedType, null); + RecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List result = new ArrayList<>(); + reader.forEachRemaining( + row -> { + int fieldCount = projectedType.getFieldCount(); + Object[] fields = new Object[fieldCount]; + for (int i = 0; i < fieldCount; i++) { + if (row.isNullAt(i)) { + fields[i] = null; + } else { + fields[i] = + InternalRow.createFieldGetter(projectedType.getTypeAt(i), i) + .getFieldOrNull(row); + } + } + result.add(GenericRow.of(fields)); + }); + reader.close(); + return result; + } + + @Test + public void testLongConstantString() throws IOException { + // 1KB constant string — CONST should work regardless of value length + String longStr = repeatChar('x', 1024); + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("long_const", DataTypes.STRING()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 200; i++) { + data.add(GenericRow.of(i, BinaryString.fromString(longStr))); + } + + Path path = new Path(tempDir.toString(), "long_const.mosaic"); + write(rowType, data, path); + + // 
Verify CONST is smaller than PLAIN (200 * 1KB = 200KB plain, CONST = 1KB) + long fileSize = tempDir.toFile().toPath().resolve("long_const.mosaic").toFile().length(); + + List result = read(rowType, rowType, path); + assertThat(result).hasSize(200); + for (int i = 0; i < 200; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).getString(1).toString()).isEqualTo(longStr); + } + } + + @Test + public void testLongConstantStringWithNulls() throws IOException { + String longStr = repeatChar('y', 2048); + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("long_const_nullable", DataTypes.STRING().nullable()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + data.add(GenericRow.of(i, i % 3 == 0 ? null : BinaryString.fromString(longStr))); + } + + Path path = new Path(tempDir.toString(), "long_const_null.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(100); + for (int i = 0; i < 100; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + if (i % 3 == 0) { + assertThat(result.get(i).isNullAt(1)).isTrue(); + } else { + assertThat(result.get(i).getString(1).toString()).isEqualTo(longStr); + } + } + } + + @Test + public void testRepeatedLongStringsDict() throws IOException { + // 5 distinct 500-byte strings — should use DICT encoding + String[] values = new String[5]; + for (int i = 0; i < 5; i++) { + values[i] = repeatChar((char) ('A' + i), 500); + } + + RowType rowType = + RowType.builder() + .field("id", DataTypes.INT()) + .field("long_dict", DataTypes.STRING()) + .build(); + + List data = new ArrayList<>(); + for (int i = 0; i < 500; i++) { + data.add(GenericRow.of(i, BinaryString.fromString(values[i % 5]))); + } + + Path path = new Path(tempDir.toString(), "long_dict.mosaic"); + write(rowType, data, path); + List result = read(rowType, rowType, path); + + assertThat(result).hasSize(500); + for (int i = 0; 
i < 500; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).getString(1).toString()).isEqualTo(values[i % 5]); + } + } + + private MosaicFileFormat createFormat() { + return new MosaicFileFormat( + new FormatContext(new Options(), 1024, 1024, MemorySize.ofMebiBytes(128), 3, null)); + } + + private static String repeatChar(char c, int count) { + char[] chars = new char[count]; + Arrays.fill(chars, c); + return new String(chars); + } + + private RowType buildWideRowType(int columnCount) { + RowType.Builder builder = RowType.builder(); + for (int i = 0; i < columnCount; i++) { + builder.field( + String.format( + "this_is_a_very_long_column_name_for_testing_compression_ratio_column_index_%05d", + i), + DataTypes.INT()); + } + return builder.build(); + } +}