diff --git a/docs/content/concepts/spec/fileformat.md b/docs/content/concepts/spec/fileformat.md
index 27d4f63d8182..ec4a133f6835 100644
--- a/docs/content/concepts/spec/fileformat.md
+++ b/docs/content/concepts/spec/fileformat.md
@@ -791,3 +791,11 @@ Limitations:
3. Statistics collection is not supported for BLOB columns.
For usage details, configuration options, and examples, see [Blob Type]({{< ref "append-table/blob" >}}).
+
+## MOSAIC
+
+Mosaic is a columnar-bucket hybrid format optimized for wide tables (10,000+ columns). Columns are hashed into buckets
+by name, stored row-oriented within each bucket, and independently compressed. This enables efficient projection pushdown
+at bucket granularity — reading 10 columns out of 10,000 only decompresses the buckets that contain those 10 columns.
+
+For the detailed file format specification, see [Mosaic File Format]({{< ref "concepts/spec/mosaic" >}}).
diff --git a/docs/content/concepts/spec/mosaic.md b/docs/content/concepts/spec/mosaic.md
new file mode 100644
index 000000000000..6c5169a3b0c3
--- /dev/null
+++ b/docs/content/concepts/spec/mosaic.md
@@ -0,0 +1,313 @@
+---
+title: "Mosaic"
+weight: 9
+type: docs
+aliases:
+- /concepts/spec/mosaic.html
+---
+
+
+# Mosaic File Format
+
+Mosaic is a columnar-bucket hybrid format optimized for wide tables (10,000+ columns). Columns are hashed into buckets
+by name, stored column-oriented within each bucket, and independently compressed. This enables efficient projection
+pushdown at bucket granularity — reading 10 columns out of 10,000 only decompresses the buckets that contain those
+10 columns.
+
+## File Layout
+
+```
++--------------------------------------------+
+| Row Group 0: Bucket Data |
+| [Bucket 0 compressed block] |
+| [Bucket 3 compressed block] |
+| ... (only non-empty buckets) |
++--------------------------------------------+
+| Row Group 1: Bucket Data |
+| ... |
++--------------------------------------------+
+| Schema Block |
+| [4 bytes: uncompressed size (BE int)] |
+| [schema data (possibly compressed)] |
++--------------------------------------------+
+| Row Group Index (varint encoded) |
++--------------------------------------------+
+| Footer (32 bytes, fixed) |
++--------------------------------------------+
+```
+
+## Footer (32 bytes, big-endian)
+
+| Offset | Size | Field | Description |
+|--------|------|-------------------|------------------------------------|
+| 0 | 8 | indexOffset | Absolute offset of Row Group Index |
+| 8 | 8 | schemaBlockOffset | Absolute offset of Schema Block |
+| 16 | 4 | numBuckets | Total number of buckets |
+| 20 | 4 | numRowGroups | Total number of row groups |
+| 24 | 1 | compression | 0 = none, 1 = zstd |
+| 25 | 1 | version | Format version (currently 1) |
+| 26 | 2 | (reserved) | Padding, set to 0 |
+| 28 | 4 | magic | `MOSA` (0x4D4F5341) |
+
+## Row Group Index
+
+Varint-encoded, only non-empty buckets are stored. For each row group:
+
+```
+varint numRows
+varint nonEmptyCount
+repeated nonEmptyCount times:
+ varint bucketId
+ 8 bytes bucketOffset (big-endian, absolute file offset)
+ varint compressedSize
+ varint uncompressedSize
+```
+
+## Schema Block
+
+Prefixed with a 4-byte big-endian int (uncompressed size), followed by the schema data (compressed with the file's
+compression method).
+
+Column names are stored using **front coding** (incremental encoding): each name shares a prefix with the previous name,
+and only the suffix is stored. This is the same technique used by Lucene, LevelDB, and RocksDB for their block index
+entries.
+
+```
+varint numColumns
+varint numBuckets
+repeated numColumns times:
+ varint fieldId
+ varint bucketId
+ varint indexInBucket
+ varint sharedPrefixLen (bytes shared with previous column name)
+ varint suffixLen (bytes of new suffix)
+ bytes suffix (UTF-8) (suffixLen bytes)
+ TypeDescriptor
+```
+
+The first column has `sharedPrefixLen = 0`. To reconstruct a column name, take the first `sharedPrefixLen` bytes from
+the previous name and append the suffix.
+
+### TypeDescriptor
+
+```
+1 byte typeId
+1 byte nullable (0 = not null, 1 = nullable)
+[type-specific params]
+```
+
+
+
+
+ | typeId |
+ Type |
+ Params |
+
+
+
+ | 0 | BOOLEAN | (none) |
+ | 1 | TINYINT | (none) |
+ | 2 | SMALLINT | (none) |
+ | 3 | INTEGER | (none) |
+ | 4 | BIGINT | (none) |
+ | 5 | FLOAT | (none) |
+ | 6 | DOUBLE | (none) |
+ | 7 | DATE | (none) |
+ | 8 | CHAR | varint length |
+ | 9 | VARCHAR | varint length |
+ | 10 | STRING | (none) — VARCHAR with MAX_LENGTH |
+ | 11 | BINARY | varint length |
+ | 12 | VARBINARY | varint length |
+ | 13 | BYTES | (none) — VARBINARY with MAX_LENGTH |
+ | 14 | DECIMAL | varint precision, varint scale |
+ | 15 | TIME | varint precision |
+ | 16 | TIMESTAMP | varint precision |
+ | 17 | TIMESTAMP_LTZ | varint precision |
+
+
+
+Complex types (ARRAY, MAP, ROW, etc.), VARIANT, and BLOB are not supported.
+
+## Bucket Data
+
+Each bucket is stored as a **column-oriented** block. Within a bucket, each column is independently encoded using one
+of four encodings (PLAIN, CONST, DICT, or ALL_NULL), chosen automatically based on the column's value distribution.
+
+### Bucket Block Layout (before compression)
+
+```
++--------------------------------------------+
+| Encoding Flags |
+| 2 bits per column, packed into bytes |
++--------------------------------------------+
+| Has-Nulls Flags |
+| 1 bit per column, packed into bytes |
++--------------------------------------------+
+| Const Metadata (CONST columns only) |
+| serialized value for each CONST column |
++--------------------------------------------+
+| Dict Metadata (DICT columns only) |
+| for each DICT column: |
+| varint numEntries |
+| repeated: serialized value per entry |
++--------------------------------------------+
+| Null Bitmaps |
+| ceil(numRows/8) bytes per column |
+| (only for columns with nulls, |
+| excluding ALL_NULL columns) |
++--------------------------------------------+
+| Column Data |
+| PLAIN: raw serialized values |
+| DICT: 1-byte index per non-null cell |
+| CONST/ALL_NULL: (nothing) |
++--------------------------------------------+
+```
+
+**Encoding Flags**: 2 bits per column, packed left-to-right. Encoding values:
+
+| Value | Encoding | Description |
+|-------|----------|-------------|
+| 0 | PLAIN | Raw serialized values for each non-null cell |
+| 1 | CONST | All non-null values are identical; the single value is stored in metadata |
+| 2 | DICT | 2-255 distinct values; each non-null cell stores a 1-byte dictionary index |
+| 3 | ALL_NULL | Every cell in this column is null; no data or null bitmap stored |
+
+**Has-Nulls Flags**: 1 bit per column. If set, a null bitmap exists for that column. ALL_NULL columns always have
+this flag cleared (no bitmap is stored for them).
+
+**Null Bitmap**: `ceil(numRows / 8)` bytes per column. Bit `i` = 1 means row `i` is null. Only present for columns
+where has-nulls flag is set.
+
+### Column Encoding Selection
+
+The encoding for each column is chosen automatically during writing based on value distribution and cost:
+
+- **ALL_NULL**: 0 non-null values
+- **CONST**: exactly 1 distinct non-null value (any number of nulls allowed)
+- **DICT**: 2-255 distinct non-null values, **and** the dictionary-encoded size is smaller than plain — the writer
+ compares `varint(numEntries) + sum(entryBytes) + nonNullCount` against the raw value buffer size
+- **PLAIN**: 256+ distinct values, dict tracking was abandoned, or dict encoding would be larger than plain
+
+CONST detection is independent of dictionary tracking — it uses a lightweight byte comparison against the first non-null
+value, so it works for all types and value sizes (including long strings).
+
+Dictionary encoding works for all data types including variable-width types (VARCHAR, VARBINARY, DECIMAL). The writer
+uses primitive long keys for fixed-width types (≤8 bytes) and byte-array keys for variable-width types. Variable-width
+dictionary tracking is bounded by a cumulative byte budget and abandoned when cardinality exceeds 255 or total dictionary
+entry bytes exceed the budget.
+
+Dictionary indices are limited to 1 byte (max 255 entries). This is a deliberate simplicity trade-off for the first
+version — columns with 256+ distinct values fall back to PLAIN encoding.
+
+## Value Serialization
+
+Values are serialized in the same format for PLAIN data, CONST metadata, and DICT entries:
+
+
+
+
+ | Type |
+ Encoding |
+
+
+
+ | BOOLEAN | 1 byte (0 or 1) |
+ | TINYINT | 1 byte |
+ | SMALLINT | 2 bytes big-endian |
+ | INTEGER / DATE / TIME | 4 bytes big-endian |
+ | BIGINT | 8 bytes big-endian |
+ | FLOAT | 4 bytes IEEE 754 (big-endian) |
+ | DOUBLE | 8 bytes IEEE 754 (big-endian) |
+ | DECIMAL (compact, precision ≤ 18) | 8 bytes big-endian (unscaled long) |
+ | DECIMAL (large, precision > 18) | varint length + unscaled BigInteger bytes |
+ | TIMESTAMP (precision ≤ 3) | 8 bytes (epoch millis, big-endian) |
+ | TIMESTAMP (precision > 3) | 8 bytes (epoch millis) + 4 bytes (nanos of millis) |
+ | CHAR / VARCHAR / STRING | varint length + UTF-8 bytes |
+ | BINARY / VARBINARY / BYTES | varint length + raw bytes |
+
+
+
+## ALL_NULL Column Pruning
+
+For single-row-group files (the common case with small files), columns where every value is null are pruned from both
+the schema and bucket data. This reduces schema size for wide sparse tables where many columns are entirely null.
+
+- The writer detects ALL_NULL columns after buffering all rows
+- ALL_NULL columns are removed from the encoding/null flags in bucket data
+- ALL_NULL columns are removed from the schema block
+- The reader treats any projected column not found in the schema as all-null (returns null for every row)
+
+This optimization only applies to single-row-group files. Multi-row-group files retain all columns because a column may
+be ALL_NULL in one row group but have values in another.
+
+## Column-to-Bucket Assignment
+
+Columns are assigned to buckets by hashing the column name:
+
+```
+bucketId = Math.floorMod(fieldName.hashCode(), numBuckets)
+```
+
+Default number of buckets: `min(100, numColumns)`.
+
+## Compression
+
+Compression is applied independently to each bucket data block and to the schema block. Supported methods:
+
+- `0` — No compression
+- `1` — Zstd (configurable level)
+
+## Benchmark
+
+Test setup: 10,000 columns (90% STRING, 10% INT), column names ~80 bytes each, Zstd compression (level 9).
+
+**File Size (10 rows):**
+
+| Format | Size | vs Mosaic |
+|---------|------------|-----------|
+| Parquet | 9,696 KB | 14.8x |
+| ORC | 6,377 KB | 9.7x |
+| Mosaic | 654 KB | 1x |
+
+**Projection Read (500 rows):**
+
+| Projected Columns | Parquet | ORC | Mosaic |
+|-------------------|------------|------------|-----------|
+| 10 / 10,000 | 53,170 us | 72,729 us | 25,081 us |
+| 1 / 10,000 | 50,919 us | 70,712 us | 2,374 us |
+
+File size — Parquet: 57.4 MB, ORC: 95.4 MB, Mosaic: 11.5 MB
+
+**Projection Read (4,500 rows, ~458 MB Parquet):**
+
+| Projected Columns | Parquet | ORC | Mosaic |
+|-------------------|-------------|------------|------------|
+| 10 / 10,000 | 369,627 us | 89,344 us | 67,314 us |
+| 1 / 10,000 | 360,458 us | 81,934 us | 26,924 us |
+
+File size — Parquet: 458.4 MB, ORC: 827.9 MB, Mosaic: 100.2 MB
+
+When projecting a small subset of columns, Mosaic only decompresses the buckets containing the requested columns,
+avoiding I/O on the remaining data.
+
+## Limitations
+
+1. Complex types (ARRAY, MAP, MULTISET, ROW) are not supported.
+2. Mosaic format is designed for wide tables and may not be efficient for narrow tables with few columns.
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicBucketReader.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicBucketReader.java
new file mode 100644
index 000000000000..0a614f5552b4
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicBucketReader.java
@@ -0,0 +1,359 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.data.Decimal;
+import org.apache.paimon.data.Timestamp;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.DecimalType;
+
+import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_ALL_NULL;
+import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_CONST;
+import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_DICT;
+import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_PLAIN;
+
+/**
+ * Columnar bucket reader for the Mosaic v2 format. Reads column-oriented data with
+ * CONST/DICT/PLAIN/ALL_NULL encoding.
+ */
+public class MosaicBucketReader {
+
+ private final DataType[] allColumnTypes;
+ private final int[] localToOutputMapping;
+ private final int numColumnsInBucket;
+
+ // Per-column state set during init()
+ private byte[] encodings;
+ private boolean[] hasNulls;
+ private byte[][] nullBitmaps;
+ private Object[] constValues;
+ private Object[][] dictValues;
+ private int[] dataCursors;
+ private byte[] data;
+ private int numRows;
+ private int currentRow;
+
+ public MosaicBucketReader(DataType[] allColumnTypes, int[] localToOutputMapping) {
+ this.allColumnTypes = allColumnTypes;
+ this.localToOutputMapping = localToOutputMapping;
+ this.numColumnsInBucket = allColumnTypes.length;
+ }
+
+ public void init(byte[] data, int numRows) {
+ this.data = data;
+ this.numRows = numRows;
+ this.currentRow = 0;
+
+ this.encodings = new byte[numColumnsInBucket];
+ this.hasNulls = new boolean[numColumnsInBucket];
+ this.nullBitmaps = new byte[numColumnsInBucket][];
+ this.constValues = new Object[numColumnsInBucket];
+ this.dictValues = new Object[numColumnsInBucket][];
+ this.dataCursors = new int[numColumnsInBucket];
+
+ int pos = 0;
+
+ // 1. Read encoding flags (2 bits per column)
+ int encodingFlagsBytes = (numColumnsInBucket * 2 + 7) / 8;
+ for (int i = 0; i < numColumnsInBucket; i++) {
+ int byteIdx = (i * 2) / 8;
+ int bitIdx = (i * 2) % 8;
+ encodings[i] = (byte) ((data[pos + byteIdx] >>> bitIdx) & 0x03);
+ }
+ pos += encodingFlagsBytes;
+
+ // 2. Read has-nulls flags (1 bit per column)
+ int hasNullsFlagsBytes = (numColumnsInBucket + 7) / 8;
+ for (int i = 0; i < numColumnsInBucket; i++) {
+ hasNulls[i] = (data[pos + i / 8] & (1 << (i % 8))) != 0;
+ }
+ pos += hasNullsFlagsBytes;
+
+ // 3. Read const metadata
+ for (int i = 0; i < numColumnsInBucket; i++) {
+ if (encodings[i] == ENCODING_CONST) {
+ int w = MosaicBucketWriter.getFixedWidth(allColumnTypes[i]);
+ if (w > 0) {
+ constValues[i] = readTypedValue(allColumnTypes[i], data, pos, w);
+ pos += w;
+ } else {
+ constValues[i] = readVariableValue(allColumnTypes[i], data, pos);
+ int len = readVarint(data, pos);
+ pos += varintSize(len) + len;
+ }
+ }
+ }
+
+ // 4. Read dict metadata
+ for (int i = 0; i < numColumnsInBucket; i++) {
+ if (encodings[i] == ENCODING_DICT) {
+ int numEntries = readVarint(data, pos);
+ pos += varintSize(numEntries);
+ int w = MosaicBucketWriter.getFixedWidth(allColumnTypes[i]);
+ Object[] entries = new Object[numEntries];
+ for (int j = 0; j < numEntries; j++) {
+ if (w > 0) {
+ entries[j] = readTypedValue(allColumnTypes[i], data, pos, w);
+ pos += w;
+ } else {
+ entries[j] = readVariableValue(allColumnTypes[i], data, pos);
+ int len = readVarint(data, pos);
+ pos += varintSize(len) + len;
+ }
+ }
+ dictValues[i] = entries;
+ }
+ }
+
+ // 5. Read null bitmaps
+ int nullBitmapSize = (numRows + 7) / 8;
+ for (int i = 0; i < numColumnsInBucket; i++) {
+ if (hasNulls[i] && encodings[i] != ENCODING_ALL_NULL) {
+ nullBitmaps[i] = new byte[nullBitmapSize];
+ System.arraycopy(data, pos, nullBitmaps[i], 0, nullBitmapSize);
+ pos += nullBitmapSize;
+ }
+ }
+
+ // 6. Record column data start offsets
+ for (int i = 0; i < numColumnsInBucket; i++) {
+ dataCursors[i] = pos;
+ if (encodings[i] == ENCODING_PLAIN) {
+ // Skip past all plain data for this column to find next column's offset
+ int w = MosaicBucketWriter.getFixedWidth(allColumnTypes[i]);
+ if (w > 0) {
+ int nonNullCount = countNonNull(i);
+ pos += nonNullCount * w;
+ } else {
+ // Variable-width: scan through
+ int nonNullCount = countNonNull(i);
+ for (int j = 0; j < nonNullCount; j++) {
+ int len = readVarint(data, pos);
+ pos += varintSize(len) + len;
+ }
+ }
+ } else if (encodings[i] == ENCODING_DICT) {
+ int nonNullCount = countNonNull(i);
+ pos += nonNullCount; // 1 byte per non-null cell
+ }
+ // CONST and ALL_NULL: no data to skip
+ }
+ }
+
+ public void readRow(Object[] outputFields) {
+ for (int i = 0; i < numColumnsInBucket; i++) {
+ int outputPos = localToOutputMapping[i];
+
+ if (encodings[i] == ENCODING_ALL_NULL) {
+ if (outputPos >= 0) {
+ outputFields[outputPos] = null;
+ }
+ continue;
+ }
+
+ boolean isNull =
+ hasNulls[i] && (nullBitmaps[i][currentRow / 8] & (1 << (currentRow % 8))) != 0;
+
+ if (isNull) {
+ if (outputPos >= 0) {
+ outputFields[outputPos] = null;
+ }
+ continue;
+ }
+
+ // Non-null value
+ switch (encodings[i]) {
+ case ENCODING_CONST:
+ if (outputPos >= 0) {
+ outputFields[outputPos] = constValues[i];
+ }
+ break;
+ case ENCODING_DICT:
+ {
+ int idx = data[dataCursors[i]++] & 0xFF;
+ if (outputPos >= 0) {
+ outputFields[outputPos] = dictValues[i][idx];
+ }
+ break;
+ }
+ case ENCODING_PLAIN:
+ {
+ int w = MosaicBucketWriter.getFixedWidth(allColumnTypes[i]);
+ if (outputPos >= 0) {
+ if (w > 0) {
+ outputFields[outputPos] =
+ readTypedValue(allColumnTypes[i], data, dataCursors[i], w);
+ } else {
+ outputFields[outputPos] =
+ readVariableValue(allColumnTypes[i], data, dataCursors[i]);
+ }
+ }
+ // Advance cursor
+ if (w > 0) {
+ dataCursors[i] += w;
+ } else {
+ int len = readVarint(data, dataCursors[i]);
+ dataCursors[i] += varintSize(len) + len;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ currentRow++;
+ }
+
+ // ======================== Value reading ========================
+
+ private static Object readTypedValue(DataType type, byte[] buf, int pos, int width) {
+ switch (type.getTypeRoot()) {
+ case BOOLEAN:
+ return buf[pos] != 0;
+ case TINYINT:
+ return buf[pos];
+ case SMALLINT:
+ return (short) ((buf[pos] << 8) | (buf[pos + 1] & 0xFF));
+ case INTEGER:
+ case DATE:
+ case TIME_WITHOUT_TIME_ZONE:
+ return readInt(buf, pos);
+ case BIGINT:
+ return readLong(buf, pos);
+ case FLOAT:
+ return Float.intBitsToFloat(readInt(buf, pos));
+ case DOUBLE:
+ return Double.longBitsToDouble(readLong(buf, pos));
+ case DECIMAL:
+ {
+ DecimalType dt = (DecimalType) type;
+ return Decimal.fromUnscaledLong(
+ readLong(buf, pos), dt.getPrecision(), dt.getScale());
+ }
+ case TIMESTAMP_WITHOUT_TIME_ZONE:
+ {
+ long millis = readLong(buf, pos);
+ if (width == 12) {
+ int nanos = readInt(buf, pos + 8);
+ return Timestamp.fromEpochMillis(millis, nanos);
+ }
+ return Timestamp.fromEpochMillis(millis);
+ }
+ case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+ {
+ long millis = readLong(buf, pos);
+ if (width == 12) {
+ int nanos = readInt(buf, pos + 8);
+ return Timestamp.fromEpochMillis(millis, nanos);
+ }
+ return Timestamp.fromEpochMillis(millis);
+ }
+ default:
+ throw new UnsupportedOperationException("Unsupported fixed type: " + type);
+ }
+ }
+
+ private static Object readVariableValue(DataType type, byte[] buf, int pos) {
+ int len = readVarint(buf, pos);
+ int dataStart = pos + varintSize(len);
+ switch (type.getTypeRoot()) {
+ case CHAR:
+ case VARCHAR:
+ return BinaryString.fromBytes(buf, dataStart, len);
+ case BINARY:
+ case VARBINARY:
+ {
+ byte[] bytes = new byte[len];
+ System.arraycopy(buf, dataStart, bytes, 0, len);
+ return bytes;
+ }
+ case DECIMAL:
+ {
+ DecimalType dt = (DecimalType) type;
+ byte[] bytes = new byte[len];
+ System.arraycopy(buf, dataStart, bytes, 0, len);
+ return Decimal.fromUnscaledBytes(bytes, dt.getPrecision(), dt.getScale());
+ }
+ default:
+ throw new UnsupportedOperationException("Unsupported variable type: " + type);
+ }
+ }
+
+ // ======================== Helpers ========================
+
+ private int countNonNull(int colIdx) {
+ if (!hasNulls[colIdx]) {
+ return numRows;
+ }
+ if (encodings[colIdx] == ENCODING_ALL_NULL) {
+ return 0;
+ }
+ int count = 0;
+ int fullBytes = numRows / 8;
+ for (int b = 0; b < fullBytes; b++) {
+ count += Integer.bitCount(nullBitmaps[colIdx][b] & 0xFF);
+ }
+ int remaining = numRows % 8;
+ if (remaining > 0) {
+ int mask = (1 << remaining) - 1;
+ count += Integer.bitCount(nullBitmaps[colIdx][fullBytes] & mask);
+ }
+ return numRows - count;
+ }
+
+ private static int readInt(byte[] buf, int pos) {
+ return ((buf[pos] & 0xFF) << 24)
+ | ((buf[pos + 1] & 0xFF) << 16)
+ | ((buf[pos + 2] & 0xFF) << 8)
+ | (buf[pos + 3] & 0xFF);
+ }
+
+ private static long readLong(byte[] buf, int pos) {
+ return ((long) (buf[pos] & 0xFF) << 56)
+ | ((long) (buf[pos + 1] & 0xFF) << 48)
+ | ((long) (buf[pos + 2] & 0xFF) << 40)
+ | ((long) (buf[pos + 3] & 0xFF) << 32)
+ | ((long) (buf[pos + 4] & 0xFF) << 24)
+ | ((long) (buf[pos + 5] & 0xFF) << 16)
+ | ((long) (buf[pos + 6] & 0xFF) << 8)
+ | (buf[pos + 7] & 0xFF);
+ }
+
+ private static int readVarint(byte[] buf, int pos) {
+ int value = 0;
+ int shift = 0;
+ int b;
+ do {
+ b = buf[pos++] & 0xFF;
+ value |= (b & 0x7F) << shift;
+ shift += 7;
+ } while ((b & 0x80) != 0);
+ return value;
+ }
+
+ private static int varintSize(int value) {
+ int size = 1;
+ while ((value & ~0x7F) != 0) {
+ size++;
+ value >>>= 7;
+ }
+ return size;
+ }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicBucketWriter.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicBucketWriter.java
new file mode 100644
index 000000000000..fce7ebdc6edf
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicBucketWriter.java
@@ -0,0 +1,713 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.data.Decimal;
+import org.apache.paimon.data.InternalRow;
+import org.apache.paimon.data.Timestamp;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.DecimalType;
+import org.apache.paimon.types.LocalZonedTimestampType;
+import org.apache.paimon.types.RowType;
+import org.apache.paimon.types.TimestampType;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_ALL_NULL;
+import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_CONST;
+import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_DICT;
+import static org.apache.paimon.format.mosaic.MosaicSpec.ENCODING_PLAIN;
+import static org.apache.paimon.format.mosaic.MosaicUtils.writeVarint;
+
+/**
+ * Columnar bucket writer for the Mosaic format. Buffers values per-column and produces a
+ * column-oriented byte array with CONST/DICT/PLAIN/ALL_NULL encoding per column.
+ *
+ * CONST detection uses a lightweight byte-comparison tracker that works for all types and value
+ * sizes, independent of dictionary tracking. Dictionary tracking uses primitive long keys for
+ * fixed-width types (≤8 bytes) and byte-array keys for variable-width types. Variable-width dict
+ * tracking is bounded by a cumulative byte budget ({@link #MAX_DICT_TOTAL_BYTES}). DICT encoding is
+ * chosen only when it produces fewer bytes than PLAIN (cost-based selection).
+ */
+public class MosaicBucketWriter {
+
+ private static final int MAX_DICT_TOTAL_BYTES = 16384;
+
+ private final InternalRow.FieldGetter[] fieldGetters;
+ private final int numColumns;
+ private final int[] fixedWidths;
+ private final boolean[] isVariableWidth;
+
+ // Per-column buffers
+ private byte[][] nullBitmaps;
+ private byte[][] valueBuffers;
+ private int[] valueBufPos;
+ private int[] nonNullCounts;
+
+ // CONST tracking: byte comparison against first non-null value (works for any size)
+ private boolean[] constTracking;
+ private int[] firstValueLen;
+
+ // Fixed-width ≤8 bytes: primitive long-based dict tracking
+ private Map[] longDictMaps;
+ // Variable-width and width>8: byte-array-based dict tracking with cumulative budget
+ private Map[] byteDictMaps;
+ private int[] dictTotalBytes;
+
+ private int numRows;
+
+ public MosaicBucketWriter(RowType fullRowType, int[] globalColumnIndices) {
+ this.numColumns = globalColumnIndices.length;
+ this.fieldGetters = new InternalRow.FieldGetter[numColumns];
+ this.fixedWidths = new int[numColumns];
+ this.isVariableWidth = new boolean[numColumns];
+
+ for (int i = 0; i < numColumns; i++) {
+ int globalIdx = globalColumnIndices[i];
+ DataType type = fullRowType.getTypeAt(globalIdx);
+ fieldGetters[i] = InternalRow.createFieldGetter(type, globalIdx);
+ fixedWidths[i] = getFixedWidth(type);
+ isVariableWidth[i] = fixedWidths[i] < 0;
+ }
+
+ initBuffers();
+ }
+
+ @SuppressWarnings("unchecked")
+ private void initBuffers() {
+ this.nullBitmaps = new byte[numColumns][];
+ this.valueBuffers = new byte[numColumns][];
+ this.valueBufPos = new int[numColumns];
+ this.nonNullCounts = new int[numColumns];
+ this.constTracking = new boolean[numColumns];
+ this.firstValueLen = new int[numColumns];
+ this.longDictMaps = new Map[numColumns];
+ this.byteDictMaps = new Map[numColumns];
+ this.dictTotalBytes = new int[numColumns];
+
+ for (int i = 0; i < numColumns; i++) {
+ nullBitmaps[i] = new byte[128];
+ valueBuffers[i] = new byte[1024];
+ constTracking[i] = true;
+ if (usesLongDict(i)) {
+ longDictMaps[i] = new HashMap<>();
+ } else {
+ byteDictMaps[i] = new HashMap<>();
+ }
+ }
+ this.numRows = 0;
+ }
+
+ private boolean usesLongDict(int colIdx) {
+ return fixedWidths[colIdx] > 0 && fixedWidths[colIdx] <= 8;
+ }
+
+ public boolean isEmpty() {
+ return numRows == 0;
+ }
+
+ public int writeRow(InternalRow row) {
+ int bitmapIdx = numRows / 8;
+
+ int totalSize = 0;
+ for (int i = 0; i < numColumns; i++) {
+ // Ensure null bitmap capacity
+ if (bitmapIdx >= nullBitmaps[i].length) {
+ byte[] newBm = new byte[nullBitmaps[i].length * 2];
+ System.arraycopy(nullBitmaps[i], 0, newBm, 0, nullBitmaps[i].length);
+ nullBitmaps[i] = newBm;
+ }
+
+ Object value = fieldGetters[i].getFieldOrNull(row);
+ if (value == null) {
+ nullBitmaps[i][bitmapIdx] |= (byte) (1 << (numRows % 8));
+ } else {
+ nonNullCounts[i]++;
+ int before = valueBufPos[i];
+ writeValue(i, value);
+ int written = valueBufPos[i] - before;
+ totalSize += written;
+
+ // CONST tracking: compare against first non-null value
+ if (constTracking[i]) {
+ if (nonNullCounts[i] == 1) {
+ firstValueLen[i] = written;
+ } else if (written != firstValueLen[i]
+ || !regionEquals(valueBuffers[i], 0, before, written)) {
+ constTracking[i] = false;
+ }
+ }
+
+ // Dict tracking (separate from CONST)
+ if (longDictMaps[i] != null) {
+ long key = extractFixedKey(valueBuffers[i], before, fixedWidths[i]);
+ longDictMaps[i].putIfAbsent(key, longDictMaps[i].size());
+ if (longDictMaps[i].size() > 255) {
+ longDictMaps[i] = null;
+ }
+ } else if (byteDictMaps[i] != null) {
+ ByteKey key = new ByteKey(valueBuffers[i], before, written);
+ int sizeBefore = byteDictMaps[i].size();
+ byteDictMaps[i].putIfAbsent(key, sizeBefore);
+ if (byteDictMaps[i].size() > sizeBefore) {
+ dictTotalBytes[i] += written;
+ }
+ if (byteDictMaps[i].size() > 255 || dictTotalBytes[i] > MAX_DICT_TOTAL_BYTES) {
+ byteDictMaps[i] = null;
+ }
+ }
+ }
+ }
+ numRows++;
+ // Include null bitmap overhead (~1 bit per column per row)
+ totalSize += (numColumns + 7) / 8;
+ return totalSize;
+ }
+
+ public byte[] finish() {
+ return finish(false);
+ }
+
+ public byte[] finish(boolean pruneAllNull) {
+ if (numRows == 0) {
+ return new byte[0];
+ }
+
+ // 1. Determine encoding per column
+ byte[] encodings = new byte[numColumns];
+ boolean[] hasNulls = new boolean[numColumns];
+
+ for (int i = 0; i < numColumns; i++) {
+ if (nonNullCounts[i] == 0) {
+ encodings[i] = ENCODING_ALL_NULL;
+ hasNulls[i] = false;
+ } else if (constTracking[i]) {
+ encodings[i] = ENCODING_CONST;
+ hasNulls[i] = nonNullCounts[i] < numRows;
+ } else {
+ int dictSize = getDictSize(i);
+ if (dictSize >= 2 && dictSize <= 255 && dictEncodedSize(i) < valueBufPos[i]) {
+ encodings[i] = ENCODING_DICT;
+ } else {
+ encodings[i] = ENCODING_PLAIN;
+ }
+ hasNulls[i] = nonNullCounts[i] < numRows;
+ }
+ }
+
+ // Count output columns (skip ALL_NULL when pruning)
+ int numOutputCols = numColumns;
+ if (pruneAllNull) {
+ numOutputCols = 0;
+ for (int i = 0; i < numColumns; i++) {
+ if (encodings[i] != ENCODING_ALL_NULL) {
+ numOutputCols++;
+ }
+ }
+ }
+
+ // 2. Compute exact output size
+ byte[] out = computeOutBuffer(numOutputCols, encodings, hasNulls);
+ int pos = 0;
+
+ // 2a. Encoding flags: 2 bits per output column
+ int encodingFlagsBytes = (numOutputCols * 2 + 7) / 8;
+ int outputIdx = 0;
+ for (int i = 0; i < numColumns; i++) {
+ if (pruneAllNull && encodings[i] == ENCODING_ALL_NULL) {
+ continue;
+ }
+ int byteIdx = (outputIdx * 2) / 8;
+ int bitIdx = (outputIdx * 2) % 8;
+ out[pos + byteIdx] |= (byte) (encodings[i] << bitIdx);
+ outputIdx++;
+ }
+ pos += encodingFlagsBytes;
+
+ // 2b. Has-nulls flags: 1 bit per output column
+ int hasNullsFlagsBytes = (numOutputCols + 7) / 8;
+ outputIdx = 0;
+ for (int i = 0; i < numColumns; i++) {
+ if (pruneAllNull && encodings[i] == ENCODING_ALL_NULL) {
+ continue;
+ }
+ if (hasNulls[i]) {
+ out[pos + outputIdx / 8] |= (byte) (1 << (outputIdx % 8));
+ }
+ outputIdx++;
+ }
+ pos += hasNullsFlagsBytes;
+
+ // 2c. Const metadata — first non-null value from value buffer
+ for (int i = 0; i < numColumns; i++) {
+ if (encodings[i] == ENCODING_CONST) {
+ System.arraycopy(valueBuffers[i], 0, out, pos, firstValueLen[i]);
+ pos += firstValueLen[i];
+ }
+ }
+
+ // 2d. Dict metadata
+ for (int i = 0; i < numColumns; i++) {
+ if (encodings[i] == ENCODING_DICT) {
+ if (longDictMaps[i] != null) {
+ int numEntries = longDictMaps[i].size();
+ pos = writeVarint(out, pos, numEntries);
+ int w = fixedWidths[i];
+ long[] keys = new long[numEntries];
+ for (Map.Entry e : longDictMaps[i].entrySet()) {
+ keys[e.getValue()] = e.getKey();
+ }
+ for (int j = 0; j < numEntries; j++) {
+ pos = writeFixedKey(out, pos, keys[j], w);
+ }
+ } else {
+ int numEntries = byteDictMaps[i].size();
+ pos = writeVarint(out, pos, numEntries);
+ ByteKey[] keys = new ByteKey[numEntries];
+ for (Map.Entry e : byteDictMaps[i].entrySet()) {
+ keys[e.getValue()] = e.getKey();
+ }
+ for (int j = 0; j < numEntries; j++) {
+ System.arraycopy(keys[j].data, 0, out, pos, keys[j].data.length);
+ pos += keys[j].data.length;
+ }
+ }
+ }
+ }
+
+ // 2e. Null bitmaps (only for cols with nulls and not ALL_NULL)
+ int nullBitmapBytes = (numRows + 7) / 8;
+ for (int i = 0; i < numColumns; i++) {
+ if (hasNulls[i] && encodings[i] != ENCODING_ALL_NULL) {
+ System.arraycopy(nullBitmaps[i], 0, out, pos, nullBitmapBytes);
+ pos += nullBitmapBytes;
+ }
+ }
+
+ // 2f. Column data
+ for (int i = 0; i < numColumns; i++) {
+ if (encodings[i] == ENCODING_PLAIN) {
+ System.arraycopy(valueBuffers[i], 0, out, pos, valueBufPos[i]);
+ pos += valueBufPos[i];
+ } else if (encodings[i] == ENCODING_DICT) {
+ int w = fixedWidths[i];
+ int valPos = 0;
+ for (int r = 0; r < numRows; r++) {
+ boolean isNull = (nullBitmaps[i][r / 8] & (1 << (r % 8))) != 0;
+ if (!isNull) {
+ if (longDictMaps[i] != null) {
+ long key = extractFixedKey(valueBuffers[i], valPos, w);
+ valPos += w;
+ out[pos++] = (byte) (int) longDictMaps[i].get(key);
+ } else {
+ int valueLen;
+ if (w > 0) {
+ valueLen = w;
+ } else {
+ int varLen = readVarint(valueBuffers[i], valPos);
+ valueLen = varintSize(varLen) + varLen;
+ }
+ ByteKey key = new ByteKey(valueBuffers[i], valPos, valueLen);
+ valPos += valueLen;
+ out[pos++] = (byte) (int) byteDictMaps[i].get(key);
+ }
+ }
+ }
+ }
+ // CONST and ALL_NULL: no column data
+ }
+
+ return out;
+ }
+
+ private byte[] computeOutBuffer(int numOutputCols, byte[] encodings, boolean[] hasNulls) {
+ int nullBitmapBytesPerCol = (numRows + 7) / 8;
+ int exactSize = (numOutputCols * 2 + 7) / 8 + (numOutputCols + 7) / 8;
+ for (int i = 0; i < numColumns; i++) {
+ if (encodings[i] == ENCODING_ALL_NULL) {
+ continue;
+ }
+ if (hasNulls[i]) {
+ exactSize += nullBitmapBytesPerCol;
+ }
+ if (encodings[i] == ENCODING_CONST) {
+ exactSize += firstValueLen[i];
+ } else if (encodings[i] == ENCODING_DICT) {
+ if (longDictMaps[i] != null) {
+ int numEntries = longDictMaps[i].size();
+ exactSize +=
+ varintSize(numEntries) + numEntries * fixedWidths[i] + nonNullCounts[i];
+ } else {
+ int numEntries = byteDictMaps[i].size();
+ exactSize += varintSize(numEntries);
+ for (ByteKey key : byteDictMaps[i].keySet()) {
+ exactSize += key.data.length;
+ }
+ exactSize += nonNullCounts[i];
+ }
+ } else if (encodings[i] == ENCODING_PLAIN) {
+ exactSize += valueBufPos[i];
+ }
+ }
+ return new byte[exactSize];
+ }
+
+ private int getDictSize(int colIdx) {
+ if (longDictMaps[colIdx] != null) {
+ return longDictMaps[colIdx].size();
+ }
+ if (byteDictMaps[colIdx] != null) {
+ return byteDictMaps[colIdx].size();
+ }
+ return -1;
+ }
+
+ /** Compare dict encoded size vs plain size (pre-compression). */
+ private int dictEncodedSize(int colIdx) {
+ int numEntries;
+ int entryBytes;
+ if (longDictMaps[colIdx] != null) {
+ numEntries = longDictMaps[colIdx].size();
+ entryBytes = numEntries * fixedWidths[colIdx];
+ } else if (byteDictMaps[colIdx] != null) {
+ numEntries = byteDictMaps[colIdx].size();
+ entryBytes = 0;
+ for (ByteKey key : byteDictMaps[colIdx].keySet()) {
+ entryBytes += key.data.length;
+ }
+ } else {
+ return Integer.MAX_VALUE;
+ }
+ return varintSize(numEntries) + entryBytes + nonNullCounts[colIdx];
+ }
+
+ public boolean[] getAllNullFlags() {
+ boolean[] flags = new boolean[numColumns];
+ for (int i = 0; i < numColumns; i++) {
+ flags[i] = nonNullCounts[i] == 0;
+ }
+ return flags;
+ }
+
+ public void reset() {
+ for (int i = 0; i < numColumns; i++) {
+ Arrays.fill(nullBitmaps[i], (byte) 0);
+ valueBufPos[i] = 0;
+ nonNullCounts[i] = 0;
+ constTracking[i] = true;
+ firstValueLen[i] = 0;
+ dictTotalBytes[i] = 0;
+ if (usesLongDict(i)) {
+ if (longDictMaps[i] != null) {
+ longDictMaps[i].clear();
+ } else {
+ longDictMaps[i] = new HashMap<>();
+ }
+ } else {
+ if (byteDictMaps[i] != null) {
+ byteDictMaps[i].clear();
+ } else {
+ byteDictMaps[i] = new HashMap<>();
+ }
+ }
+ }
+ numRows = 0;
+ }
+
+ // ======================== Value writing ========================
+
+ private void writeValue(int colIdx, Object value) {
+ int w = fixedWidths[colIdx];
+ if (w > 0) {
+ ensureValueCapacity(colIdx, w);
+ writeFixedValue(valueBuffers[colIdx], valueBufPos[colIdx], value, w);
+ valueBufPos[colIdx] += w;
+ } else {
+ writeVariableValue(colIdx, value);
+ }
+ }
+
+ private static void writeFixedValue(byte[] buf, int pos, Object value, int width) {
+ switch (width) {
+ case 1:
+ if (value instanceof Boolean) {
+ buf[pos] = (byte) ((Boolean) value ? 1 : 0);
+ } else {
+ buf[pos] = (Byte) value;
+ }
+ break;
+ case 2:
+ {
+ short v = (Short) value;
+ buf[pos] = (byte) (v >>> 8);
+ buf[pos + 1] = (byte) v;
+ break;
+ }
+ case 4:
+ {
+ int v;
+ if (value instanceof Float) {
+ v = Float.floatToRawIntBits((Float) value);
+ } else {
+ v = (Integer) value;
+ }
+ buf[pos] = (byte) (v >>> 24);
+ buf[pos + 1] = (byte) (v >>> 16);
+ buf[pos + 2] = (byte) (v >>> 8);
+ buf[pos + 3] = (byte) v;
+ break;
+ }
+ case 8:
+ {
+ long v;
+ if (value instanceof Long) {
+ v = (Long) value;
+ } else if (value instanceof Double) {
+ v = Double.doubleToRawLongBits((Double) value);
+ } else if (value instanceof Decimal) {
+ v = ((Decimal) value).toUnscaledLong();
+ } else if (value instanceof Timestamp) {
+ v = ((Timestamp) value).getMillisecond();
+ } else {
+ throw new IllegalArgumentException("Unsupported type: " + value.getClass());
+ }
+ writeLong(buf, pos, v);
+ break;
+ }
+ case 12:
+ {
+ Timestamp ts = (Timestamp) value;
+ long millis = ts.getMillisecond();
+ int nanos = ts.getNanoOfMillisecond();
+ writeLong(buf, pos, millis);
+ buf[pos + 8] = (byte) (nanos >>> 24);
+ buf[pos + 9] = (byte) (nanos >>> 16);
+ buf[pos + 10] = (byte) (nanos >>> 8);
+ buf[pos + 11] = (byte) nanos;
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ private static void writeLong(byte[] buf, int pos, long v) {
+ buf[pos] = (byte) (v >>> 56);
+ buf[pos + 1] = (byte) (v >>> 48);
+ buf[pos + 2] = (byte) (v >>> 40);
+ buf[pos + 3] = (byte) (v >>> 32);
+ buf[pos + 4] = (byte) (v >>> 24);
+ buf[pos + 5] = (byte) (v >>> 16);
+ buf[pos + 6] = (byte) (v >>> 8);
+ buf[pos + 7] = (byte) v;
+ }
+
+ private void writeVariableValue(int colIdx, Object value) {
+ byte[] bytes;
+ if (value instanceof BinaryString) {
+ bytes = ((BinaryString) value).toBytes();
+ } else if (value instanceof byte[]) {
+ bytes = (byte[]) value;
+ } else if (value instanceof Decimal) {
+ bytes = ((Decimal) value).toUnscaledBytes();
+ } else {
+ throw new UnsupportedOperationException("Unsupported variable-width type: " + value);
+ }
+ ensureValueCapacity(colIdx, 5 + bytes.length);
+ valueBufPos[colIdx] = writeVarint(valueBuffers[colIdx], valueBufPos[colIdx], bytes.length);
+ System.arraycopy(bytes, 0, valueBuffers[colIdx], valueBufPos[colIdx], bytes.length);
+ valueBufPos[colIdx] += bytes.length;
+ }
+
+ // ======================== Fixed-width key helpers ========================
+
+ private static long extractFixedKey(byte[] buf, int pos, int width) {
+ switch (width) {
+ case 1:
+ return buf[pos] & 0xFFL;
+ case 2:
+ return ((buf[pos] & 0xFFL) << 8) | (buf[pos + 1] & 0xFFL);
+ case 4:
+ return ((buf[pos] & 0xFFL) << 24)
+ | ((buf[pos + 1] & 0xFFL) << 16)
+ | ((buf[pos + 2] & 0xFFL) << 8)
+ | (buf[pos + 3] & 0xFFL);
+ case 8:
+ return ((buf[pos] & 0xFFL) << 56)
+ | ((buf[pos + 1] & 0xFFL) << 48)
+ | ((buf[pos + 2] & 0xFFL) << 40)
+ | ((buf[pos + 3] & 0xFFL) << 32)
+ | ((buf[pos + 4] & 0xFFL) << 24)
+ | ((buf[pos + 5] & 0xFFL) << 16)
+ | ((buf[pos + 6] & 0xFFL) << 8)
+ | (buf[pos + 7] & 0xFFL);
+ default:
+ return 0;
+ }
+ }
+
+ private static int writeFixedKey(byte[] buf, int pos, long key, int width) {
+ switch (width) {
+ case 1:
+ buf[pos++] = (byte) key;
+ break;
+ case 2:
+ buf[pos++] = (byte) (key >>> 8);
+ buf[pos++] = (byte) key;
+ break;
+ case 4:
+ buf[pos++] = (byte) (key >>> 24);
+ buf[pos++] = (byte) (key >>> 16);
+ buf[pos++] = (byte) (key >>> 8);
+ buf[pos++] = (byte) key;
+ break;
+ case 8:
+ buf[pos++] = (byte) (key >>> 56);
+ buf[pos++] = (byte) (key >>> 48);
+ buf[pos++] = (byte) (key >>> 40);
+ buf[pos++] = (byte) (key >>> 32);
+ buf[pos++] = (byte) (key >>> 24);
+ buf[pos++] = (byte) (key >>> 16);
+ buf[pos++] = (byte) (key >>> 8);
+ buf[pos++] = (byte) key;
+ break;
+ default:
+ break;
+ }
+ return pos;
+ }
+
+ // ======================== Buffer helpers ========================
+
+ private void ensureValueCapacity(int colIdx, int additional) {
+ int required = valueBufPos[colIdx] + additional;
+ if (required > valueBuffers[colIdx].length) {
+ int newLen = Math.max(valueBuffers[colIdx].length * 2, required);
+ byte[] newBuf = new byte[newLen];
+ System.arraycopy(valueBuffers[colIdx], 0, newBuf, 0, valueBufPos[colIdx]);
+ valueBuffers[colIdx] = newBuf;
+ }
+ }
+
+ private static boolean regionEquals(byte[] buf, int off1, int off2, int len) {
+ for (int i = 0; i < len; i++) {
+ if (buf[off1 + i] != buf[off2 + i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // ======================== Type width ========================
+
+ static int getFixedWidth(DataType type) {
+ switch (type.getTypeRoot()) {
+ case BOOLEAN:
+ case TINYINT:
+ return 1;
+ case SMALLINT:
+ return 2;
+ case INTEGER:
+ case DATE:
+ case TIME_WITHOUT_TIME_ZONE:
+ case FLOAT:
+ return 4;
+ case BIGINT:
+ case DOUBLE:
+ return 8;
+ case DECIMAL:
+ if (Decimal.isCompact(((DecimalType) type).getPrecision())) {
+ return 8;
+ }
+ return -1;
+ case TIMESTAMP_WITHOUT_TIME_ZONE:
+ if (Timestamp.isCompact(((TimestampType) type).getPrecision())) {
+ return 8;
+ }
+ return 12;
+ case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+ if (Timestamp.isCompact(((LocalZonedTimestampType) type).getPrecision())) {
+ return 8;
+ }
+ return 12;
+ default:
+ return -1;
+ }
+ }
+
+ // ======================== Varint helpers ========================
+
+ private static int readVarint(byte[] buf, int pos) {
+ int value = 0;
+ int shift = 0;
+ int b;
+ do {
+ b = buf[pos++] & 0xFF;
+ value |= (b & 0x7F) << shift;
+ shift += 7;
+ } while ((b & 0x80) != 0);
+ return value;
+ }
+
+ private static int varintSize(int value) {
+ int size = 1;
+ while ((value & ~0x7F) != 0) {
+ size++;
+ value >>>= 7;
+ }
+ return size;
+ }
+
+ // ======================== ByteKey ========================
+
+ /** Immutable byte array wrapper with value-based hash and equals for dict tracking. */
+ static final class ByteKey {
+ final byte[] data;
+ private final int hash;
+
+ ByteKey(byte[] source, int offset, int length) {
+ this.data = new byte[length];
+ System.arraycopy(source, offset, this.data, 0, length);
+ int h = 1;
+ for (int i = 0; i < length; i++) {
+ h = 31 * h + this.data[i];
+ }
+ this.hash = h;
+ }
+
+ @Override
+ public int hashCode() {
+ return hash;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (!(obj instanceof ByteKey)) {
+ return false;
+ }
+ return Arrays.equals(data, ((ByteKey) obj).data);
+ }
+ }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileAnalyzer.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileAnalyzer.java
new file mode 100644
index 000000000000..f7f1bac2d329
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileAnalyzer.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.fs.FileIO;
+import org.apache.paimon.fs.Path;
+import org.apache.paimon.fs.SeekableInputStream;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+import static org.apache.paimon.format.mosaic.MosaicUtils.readLong;
+import static org.apache.paimon.format.mosaic.MosaicUtils.readVarint;
+import static org.apache.paimon.utils.IOUtils.readFully;
+
+/** Utility to analyze the storage breakdown of a Mosaic file. */
+public class MosaicFileAnalyzer {
+
+ public static String analyze(FileIO fileIO, Path path) throws IOException {
+ long fileSize = fileIO.getFileSize(path);
+ try (SeekableInputStream in = fileIO.newInputStream(path)) {
+ return analyze(in, fileSize);
+ }
+ }
+
+ public static String analyze(SeekableInputStream in, long fileSize) throws IOException {
+ in.seek(fileSize - MosaicSpec.FOOTER_SIZE);
+ byte[] footerBytes = new byte[MosaicSpec.FOOTER_SIZE];
+ readFully(in, footerBytes);
+ ByteBuffer footer = ByteBuffer.wrap(footerBytes).order(ByteOrder.BIG_ENDIAN);
+ long indexOffset = footer.getLong();
+ long schemaBlockOffset = footer.getLong();
+ int numBuckets = footer.getInt();
+ int numRowGroups = footer.getInt();
+ byte compression = footer.get();
+ byte version = footer.get();
+
+ long schemaBlockSize = indexOffset - schemaBlockOffset;
+ long indexSize = fileSize - MosaicSpec.FOOTER_SIZE - indexOffset;
+
+ // Schema uncompressed size
+ in.seek(schemaBlockOffset);
+ byte[] lenBuf = new byte[4];
+ readFully(in, lenBuf);
+ int schemaUncompressed = ByteBuffer.wrap(lenBuf).order(ByteOrder.BIG_ENDIAN).getInt();
+ long schemaCompressed = schemaBlockSize - 4;
+
+ // Per-bucket stats from row group index (varint encoded, non-empty only)
+ in.seek(indexOffset);
+ byte[] indexBytes = new byte[(int) indexSize];
+ readFully(in, indexBytes);
+ int[] idxPos = {0};
+
+ long totalCompressed = 0;
+ long totalUncompressed = 0;
+ int nonEmptyBuckets = 0;
+ int totalRows = 0;
+
+ for (int rg = 0; rg < numRowGroups; rg++) {
+ totalRows += readVarint(indexBytes, idxPos);
+ int nonEmpty = readVarint(indexBytes, idxPos);
+ nonEmptyBuckets += nonEmpty;
+ for (int i = 0; i < nonEmpty; i++) {
+ readVarint(indexBytes, idxPos); // bucketId
+ readLong(indexBytes, idxPos); // offset
+ int cs = readVarint(indexBytes, idxPos);
+ int us = readVarint(indexBytes, idxPos);
+ totalCompressed += cs;
+ totalUncompressed += us;
+ }
+ }
+
+ return String.format(
+ "=== Mosaic File Analysis ===%n"
+ + "File size: %,d bytes (%.1f KB)%n"
+ + "Version: %d%n"
+ + "Compression: %d%n"
+ + "Buckets: %d (%d non-empty)%n"
+ + "Row groups: %d%n"
+ + "Total rows: %,d%n%n",
+ fileSize,
+ fileSize / 1024.0,
+ version,
+ compression,
+ numBuckets,
+ nonEmptyBuckets,
+ numRowGroups,
+ totalRows)
+ + String.format(
+ "--- Section Sizes ---%n"
+ + "Bucket data: %,9d bytes (%5.1f KB, %5.1f%%)%n"
+ + "Schema block: %,9d bytes (%5.1f KB, %5.1f%%)%n"
+ + "Row group index: %,9d bytes (%5.1f KB, %5.1f%%)%n"
+ + "Footer: %,9d bytes (%5.1f KB, %5.1f%%)%n%n",
+ schemaBlockOffset,
+ schemaBlockOffset / 1024.0,
+ 100.0 * schemaBlockOffset / fileSize,
+ schemaBlockSize,
+ schemaBlockSize / 1024.0,
+ 100.0 * schemaBlockSize / fileSize,
+ indexSize,
+ indexSize / 1024.0,
+ 100.0 * indexSize / fileSize,
+ (long) MosaicSpec.FOOTER_SIZE,
+ MosaicSpec.FOOTER_SIZE / 1024.0,
+ 100.0 * MosaicSpec.FOOTER_SIZE / fileSize)
+ + String.format(
+ "--- Compression ---%n"
+ + "Schema: %,9d -> %,9d bytes (%.1fx)%n"
+ + "Bucket data: %,9d -> %,9d bytes (%.1fx)%n",
+ schemaUncompressed,
+ schemaCompressed,
+ schemaCompressed > 0 ? (double) schemaUncompressed / schemaCompressed : 0,
+ totalUncompressed,
+ totalCompressed,
+ totalCompressed > 0 ? (double) totalUncompressed / totalCompressed : 0);
+ }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java
new file mode 100644
index 000000000000..ebb969344677
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.format.FileFormat;
+import org.apache.paimon.format.FileFormatFactory.FormatContext;
+import org.apache.paimon.format.FormatReaderFactory;
+import org.apache.paimon.format.FormatWriterFactory;
+import org.apache.paimon.predicate.Predicate;
+import org.apache.paimon.types.DataTypeRoot;
+import org.apache.paimon.types.RowType;
+
+import javax.annotation.Nullable;
+
+import java.util.List;
+
+/**
+ * Mosaic file format: a column-bucket hybrid format optimized for wide tables (1,000-100,000+
+ * columns). Columns are hashed into buckets, row-stored within each bucket, and independently
+ * compressed. Projection pushdown works at bucket granularity.
+ */
+public class MosaicFileFormat extends FileFormat {
+
+ private final int numBuckets;
+ private final int zstdLevel;
+ private final long rowGroupMaxSize;
+
+ public MosaicFileFormat(FormatContext formatContext) {
+ super(MosaicFileFormatFactory.IDENTIFIER);
+ this.numBuckets =
+ formatContext
+ .options()
+ .getOptional(MosaicOptions.NUM_COLUMN_BUCKETS)
+ .orElse(MosaicSpec.DEFAULT_NUM_BUCKETS);
+ this.zstdLevel = formatContext.zstdLevel();
+ this.rowGroupMaxSize = formatContext.writeBatchMemory().getBytes();
+ }
+
+ @Override
+ public FormatReaderFactory createReaderFactory(
+ RowType dataSchemaRowType,
+ RowType projectedRowType,
+ @Nullable List filters) {
+ return new MosaicReaderFactory(projectedRowType);
+ }
+
+ @Override
+ public FormatWriterFactory createWriterFactory(RowType type) {
+ return new MosaicWriterFactory(type, numBuckets, zstdLevel, rowGroupMaxSize);
+ }
+
+ @Override
+ public void validateDataFields(RowType rowType) {
+ rowType.getFields().forEach(f -> validateFieldType(f.type().getTypeRoot(), f.name()));
+ }
+
+ private static void validateFieldType(DataTypeRoot root, String fieldName) {
+ switch (root) {
+ case ARRAY:
+ case VECTOR:
+ case MAP:
+ case MULTISET:
+ case ROW:
+ case VARIANT:
+ case BLOB:
+ throw new UnsupportedOperationException(
+ "Unsupported type: " + root + " for field: " + fieldName);
+ default:
+ }
+ }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java
new file mode 100644
index 000000000000..d94aff596ed7
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.format.FileFormat;
+import org.apache.paimon.format.FileFormatFactory;
+
+/** Factory for creating Mosaic file format instances via SPI. */
+public class MosaicFileFormatFactory implements FileFormatFactory {
+
+ public static final String IDENTIFIER = "mosaic";
+
+ @Override
+ public String identifier() {
+ return IDENTIFIER;
+ }
+
+ @Override
+ public FileFormat create(FormatContext formatContext) {
+ return new MosaicFileFormat(formatContext);
+ }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicOptions.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicOptions.java
new file mode 100644
index 000000000000..51f0aa9557f5
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicOptions.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.options.ConfigOption;
+import org.apache.paimon.options.ConfigOptions;
+
+/** Configuration options for the Mosaic file format. */
+public class MosaicOptions {
+
+ public static final ConfigOption NUM_COLUMN_BUCKETS =
+ ConfigOptions.key("mosaic.num-column-buckets")
+ .intType()
+ .defaultValue(MosaicSpec.DEFAULT_NUM_BUCKETS)
+ .withDescription(
+ "Number of column buckets in the Mosaic format. "
+ + "Columns are hashed into this many buckets. "
+ + "Default is 100.");
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReader.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReader.java
new file mode 100644
index 000000000000..b53befc243ff
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReader.java
@@ -0,0 +1,292 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.data.GenericRow;
+import org.apache.paimon.data.InternalRow;
+import org.apache.paimon.fs.FileIO;
+import org.apache.paimon.fs.Path;
+import org.apache.paimon.fs.SeekableInputStream;
+import org.apache.paimon.reader.FileRecordIterator;
+import org.apache.paimon.reader.FileRecordReader;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.RowType;
+import org.apache.paimon.utils.IteratorResultIterator;
+import org.apache.paimon.utils.IteratorWithException;
+
+import com.github.luben.zstd.Zstd;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+import java.util.Set;
+
+import static org.apache.paimon.format.mosaic.MosaicSpec.COMPRESSION_NONE;
+import static org.apache.paimon.format.mosaic.MosaicSpec.COMPRESSION_ZSTD;
+import static org.apache.paimon.format.mosaic.MosaicUtils.readLong;
+import static org.apache.paimon.format.mosaic.MosaicUtils.readVarint;
+
+/** Reader for the Mosaic file format with row group support. */
+public class MosaicReader implements FileRecordReader {
+
+ private final Path filePath;
+ private final SeekableInputStream inputStream;
+ private final RowType projectedRowType;
+
+ private byte compression;
+ private int[] sortedRequiredBuckets;
+ private MosaicSpec.RowGroupMeta[] rowGroupMetas;
+ private MosaicBucketReader[] bucketReaders;
+ private int currentRowGroup;
+ private byte[] compressedBuf;
+
+ public MosaicReader(FileIO fileIO, Path filePath, long fileSize, RowType projectedRowType)
+ throws IOException {
+ this.filePath = filePath;
+ this.inputStream = fileIO.newInputStream(filePath);
+ this.projectedRowType = projectedRowType;
+ this.currentRowGroup = 0;
+
+ readFooterAndInit(fileSize);
+ }
+
+ private void readFooterAndInit(long fileSize) throws IOException {
+ // Read footer (last 32 bytes)
+ inputStream.seek(fileSize - MosaicSpec.FOOTER_SIZE);
+ byte[] footerBytes = new byte[MosaicSpec.FOOTER_SIZE];
+ readFully(footerBytes);
+
+ ByteBuffer footer = ByteBuffer.wrap(footerBytes).order(ByteOrder.BIG_ENDIAN);
+ long indexOffset = footer.getLong();
+ long schemaBlockOffset = footer.getLong();
+ int numBuckets = footer.getInt();
+ int numRowGroups = footer.getInt();
+ this.compression = footer.get();
+ byte version = footer.get();
+ footer.getShort(); // padding
+ byte[] magic = new byte[4];
+ footer.get(magic);
+
+ if (magic[0] != 'M' || magic[1] != 'O' || magic[2] != 'S' || magic[3] != 'A') {
+ throw new IOException("Invalid Mosaic file: bad magic bytes");
+ }
+
+ if (version != MosaicSpec.VERSION) {
+ throw new IOException(
+ "Unsupported Mosaic file version: "
+ + version
+ + ", expected: "
+ + MosaicSpec.VERSION);
+ }
+
+ // Read schema block
+ inputStream.seek(schemaBlockOffset);
+ int schemaUncompressedSize = readInt();
+ int schemaCompressedSize = (int) (indexOffset - schemaBlockOffset - 4);
+ byte[] schemaCompressed = new byte[schemaCompressedSize];
+ readFully(schemaCompressed);
+
+ byte[] schemaRaw;
+ switch (compression) {
+ case COMPRESSION_NONE:
+ schemaRaw = schemaCompressed;
+ break;
+ case COMPRESSION_ZSTD:
+ schemaRaw = new byte[schemaUncompressedSize];
+ Zstd.decompress(schemaRaw, schemaCompressed);
+ break;
+ default:
+ throw new UnsupportedEncodingException("Unsupported compression: " + compression);
+ }
+ MosaicSchema schema = MosaicSchema.deserialize(schemaRaw);
+
+ // Determine which buckets we need
+ Set requiredBuckets = schema.getRequiredBuckets(projectedRowType);
+
+ // Read row group index (varint encoded, only non-empty buckets)
+ inputStream.seek(indexOffset);
+ int indexSize = (int) (fileSize - MosaicSpec.FOOTER_SIZE - indexOffset);
+ byte[] indexBytes = new byte[indexSize];
+ readFully(indexBytes);
+ int[] idxPos = {0};
+
+ this.rowGroupMetas = new MosaicSpec.RowGroupMeta[numRowGroups];
+ for (int rg = 0; rg < numRowGroups; rg++) {
+ int numRows = readVarint(indexBytes, idxPos);
+ int nonEmpty = readVarint(indexBytes, idxPos);
+
+ long[] bucketOffsets = new long[numBuckets];
+ int[] compressedSizes = new int[numBuckets];
+ int[] uncompressedSizes = new int[numBuckets];
+
+ for (int i = 0; i < nonEmpty; i++) {
+ int bucketId = readVarint(indexBytes, idxPos);
+ bucketOffsets[bucketId] = readLong(indexBytes, idxPos);
+ compressedSizes[bucketId] = readVarint(indexBytes, idxPos);
+ uncompressedSizes[bucketId] = readVarint(indexBytes, idxPos);
+ }
+
+ rowGroupMetas[rg] =
+ new MosaicSpec.RowGroupMeta(
+ numRows, bucketOffsets, compressedSizes, uncompressedSizes);
+ }
+
+ this.bucketReaders = new MosaicBucketReader[numBuckets];
+ int count = 0;
+ for (int b : requiredBuckets) {
+ DataType[] bucketTypes = schema.getBucketColumnTypes(b);
+ int[] projMapping = schema.getProjectionMapping(b, projectedRowType);
+ if (projMapping != null) {
+ bucketReaders[b] = new MosaicBucketReader(bucketTypes, projMapping);
+ count++;
+ }
+ }
+ this.sortedRequiredBuckets = new int[count];
+ int idx = 0;
+ for (int b : requiredBuckets) {
+ if (bucketReaders[b] != null) {
+ sortedRequiredBuckets[idx++] = b;
+ }
+ }
+ this.compressedBuf = new byte[0];
+ }
+
+ @Nullable
+ @Override
+ public FileRecordIterator readBatch() throws IOException {
+ if (currentRowGroup >= rowGroupMetas.length) {
+ return null;
+ }
+
+ MosaicSpec.RowGroupMeta meta = rowGroupMetas[currentRowGroup++];
+ if (meta.numRows == 0) {
+ return readBatch();
+ }
+
+ final MosaicBucketReader[] readers = this.bucketReaders;
+
+ // Sort required buckets by file offset for sequential I/O
+ int[] ordered = Arrays.copyOf(sortedRequiredBuckets, sortedRequiredBuckets.length);
+ final long[] offsets = meta.bucketOffsets;
+ // insertion sort — array is small (number of projected buckets)
+ for (int i = 1; i < ordered.length; i++) {
+ int key = ordered[i];
+ long keyOff = offsets[key];
+ int j = i - 1;
+ while (j >= 0 && offsets[ordered[j]] > keyOff) {
+ ordered[j + 1] = ordered[j];
+ j--;
+ }
+ ordered[j + 1] = key;
+ }
+
+ int activeCount = 0;
+ int[] activeBuckets = new int[ordered.length];
+
+ for (int b : ordered) {
+ if (meta.compressedSizes[b] == 0) {
+ continue;
+ }
+
+ int compSize = meta.compressedSizes[b];
+ inputStream.seek(meta.bucketOffsets[b]);
+
+ byte[] bucketData;
+ switch (compression) {
+ case COMPRESSION_NONE:
+ bucketData = new byte[compSize];
+ readFully(bucketData);
+ break;
+ case COMPRESSION_ZSTD:
+ if (compressedBuf.length < compSize) {
+ compressedBuf = new byte[compSize];
+ }
+ readFully(compressedBuf, compSize);
+ int uncompSize = meta.uncompressedSizes[b];
+ bucketData = new byte[uncompSize];
+ Zstd.decompressByteArray(bucketData, 0, uncompSize, compressedBuf, 0, compSize);
+ break;
+ default:
+ throw new UnsupportedEncodingException(
+ "Unsupported compression: " + compression);
+ }
+
+ readers[b].init(bucketData, meta.numRows);
+ activeBuckets[activeCount++] = b;
+ }
+
+ final int[] active = Arrays.copyOf(activeBuckets, activeCount);
+ return new IteratorResultIterator(
+ toIterator(meta.numRows, active, readers), null, filePath, 0);
+ }
+
+ private IteratorWithException toIterator(
+ int totalRows, int[] active, MosaicBucketReader[] readers) {
+ final int projectedFieldCount = projectedRowType.getFieldCount();
+ return new IteratorWithException() {
+ int currentRow = 0;
+ final Object[] fields = new Object[projectedFieldCount];
+
+ @Override
+ public boolean hasNext() {
+ return currentRow < totalRows;
+ }
+
+ @Override
+ public InternalRow next() {
+ Arrays.fill(fields, null);
+ for (int j : active) {
+ readers[j].readRow(fields);
+ }
+ currentRow++;
+ return GenericRow.of(fields);
+ }
+ };
+ }
+
+ @Override
+ public void close() throws IOException {
+ inputStream.close();
+ }
+
+ private void readFully(byte[] buf) throws IOException {
+ readFully(buf, buf.length);
+ }
+
+ private void readFully(byte[] buf, int len) throws IOException {
+ int offset = 0;
+ while (offset < len) {
+ int read = inputStream.read(buf, offset, len - offset);
+ if (read < 0) {
+ throw new IOException("Unexpected EOF");
+ }
+ offset += read;
+ }
+ }
+
+ private int readInt() throws IOException {
+ byte[] buf = new byte[4];
+ readFully(buf);
+ return ByteBuffer.wrap(buf).order(ByteOrder.BIG_ENDIAN).getInt();
+ }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java
new file mode 100644
index 000000000000..3a5704eb1b0a
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.data.InternalRow;
+import org.apache.paimon.format.FormatReaderFactory;
+import org.apache.paimon.reader.FileRecordReader;
+import org.apache.paimon.types.RowType;
+
+import java.io.IOException;
+
+/** Factory for creating {@link MosaicReader} instances. */
+public class MosaicReaderFactory implements FormatReaderFactory {
+
+ private final RowType projectedRowType;
+
+ public MosaicReaderFactory(RowType projectedRowType) {
+ this.projectedRowType = projectedRowType;
+ }
+
+ @Override
+ public FileRecordReader createReader(Context context) throws IOException {
+ return new MosaicReader(
+ context.fileIO(), context.filePath(), context.fileSize(), projectedRowType);
+ }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSchema.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSchema.java
new file mode 100644
index 000000000000..03d9f16edb57
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSchema.java
@@ -0,0 +1,287 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.types.DataField;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.RowType;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import static org.apache.paimon.format.mosaic.MosaicUtils.readVarint;
+import static org.apache.paimon.format.mosaic.MosaicUtils.writeVarint;
+
+/** Schema block for the Mosaic file format. Stores column metadata and bucket assignments. */
+public class MosaicSchema {
+
+ private final int numBuckets;
+ private final List columns;
+ private final int[][] bucketToGlobalIndices;
+
+ private MosaicSchema(int numBuckets, List columns, int[][] bucketToGlobalIndices) {
+ this.numBuckets = numBuckets;
+ this.columns = columns;
+ this.bucketToGlobalIndices = bucketToGlobalIndices;
+ }
+
+ public static MosaicSchema create(RowType rowType, int numBuckets) {
+ int[][] bucketMapping = MosaicSpec.groupColumnsByBucket(rowType, numBuckets);
+ List fields = rowType.getFields();
+ List columns = new ArrayList<>(fields.size());
+
+ int[] columnToBucket = new int[fields.size()];
+ int[] columnToIndexInBucket = new int[fields.size()];
+ for (int b = 0; b < numBuckets; b++) {
+ for (int localIdx = 0; localIdx < bucketMapping[b].length; localIdx++) {
+ int globalIdx = bucketMapping[b][localIdx];
+ columnToBucket[globalIdx] = b;
+ columnToIndexInBucket[globalIdx] = localIdx;
+ }
+ }
+
+ for (int i = 0; i < fields.size(); i++) {
+ DataField field = fields.get(i);
+ columns.add(
+ new ColumnMeta(
+ field.id(),
+ field.name(),
+ field.type(),
+ columnToBucket[i],
+ columnToIndexInBucket[i]));
+ }
+
+ return new MosaicSchema(numBuckets, columns, bucketMapping);
+ }
+
+ public int numBuckets() {
+ return numBuckets;
+ }
+
+ public int[][] bucketToGlobalIndices() {
+ return bucketToGlobalIndices;
+ }
+
+ public DataType[] getBucketColumnTypes(int bucketId) {
+ int[] globalIndices = bucketToGlobalIndices[bucketId];
+ DataType[] types = new DataType[globalIndices.length];
+ for (int i = 0; i < globalIndices.length; i++) {
+ types[i] = columns.get(globalIndices[i]).type;
+ }
+ return types;
+ }
+
+ /** Returns the set of bucket IDs that contain at least one projected column. */
+ public Set getRequiredBuckets(RowType projectedRowType) {
+ Set projectedNames = new HashSet<>(projectedRowType.getFieldNames());
+ Set requiredBuckets = new HashSet<>();
+ for (ColumnMeta col : columns) {
+ if (projectedNames.contains(col.name)) {
+ requiredBuckets.add(col.bucketId);
+ }
+ }
+ return requiredBuckets;
+ }
+
+ /**
+ * For a given bucket, returns the mapping from local column indices within the bucket to output
+ * positions in the projected row. The array index is the local column index, and the value is
+ * the output position (-1 means skip). Returns null if no columns in this bucket are projected.
+ */
+ public int[] getProjectionMapping(int bucketId, RowType projectedRowType) {
+ Map projectedNameToPos = new HashMap<>();
+ List projectedNames = projectedRowType.getFieldNames();
+ for (int i = 0; i < projectedNames.size(); i++) {
+ projectedNameToPos.put(projectedNames.get(i), i);
+ }
+
+ int[] globalIndices = bucketToGlobalIndices[bucketId];
+ int[] localToOutput = new int[globalIndices.length];
+ Arrays.fill(localToOutput, -1);
+ boolean hasProjection = false;
+ for (int localIdx = 0; localIdx < globalIndices.length; localIdx++) {
+ ColumnMeta col = columns.get(globalIndices[localIdx]);
+ Integer outputPos = projectedNameToPos.get(col.name);
+ if (outputPos != null) {
+ localToOutput[localIdx] = outputPos;
+ hasProjection = true;
+ }
+ }
+ return hasProjection ? localToOutput : null;
+ }
+
+ public byte[] serialize() throws IOException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutputStream out = new DataOutputStream(baos);
+
+ writeVarint(out, columns.size());
+ writeVarint(out, numBuckets);
+
+ // Front coding: each column name stored as (sharedPrefixLen, suffix)
+ byte[] prevNameBytes = new byte[0];
+ for (ColumnMeta col : columns) {
+ writeVarint(out, col.fieldId);
+ writeVarint(out, col.bucketId);
+ writeVarint(out, col.indexInBucket);
+
+ byte[] nameBytes = col.name.getBytes(StandardCharsets.UTF_8);
+ int shared = commonPrefixLength(prevNameBytes, nameBytes);
+ writeVarint(out, shared);
+ writeVarint(out, nameBytes.length - shared);
+ out.write(nameBytes, shared, nameBytes.length - shared);
+ prevNameBytes = nameBytes;
+
+ MosaicTypes.writeType(out, col.type);
+ }
+
+ out.flush();
+ return baos.toByteArray();
+ }
+
+ public static MosaicSchema deserialize(byte[] data) throws IOException {
+ DataInputStream in = new DataInputStream(new ByteArrayInputStream(data));
+
+ int numColumns = readVarint(in);
+ int numBuckets = readVarint(in);
+
+ List columns = new ArrayList<>(numColumns);
+ List> bucketLists = new ArrayList<>(numBuckets);
+ for (int i = 0; i < numBuckets; i++) {
+ bucketLists.add(new ArrayList<>());
+ }
+
+ byte[] prevNameBytes = new byte[0];
+ for (int i = 0; i < numColumns; i++) {
+ int fieldId = readVarint(in);
+ int bucketId = readVarint(in);
+ int indexInBucket = readVarint(in);
+
+ int shared = readVarint(in);
+ int suffixLen = readVarint(in);
+ byte[] nameBytes = new byte[shared + suffixLen];
+ System.arraycopy(prevNameBytes, 0, nameBytes, 0, shared);
+ in.readFully(nameBytes, shared, suffixLen);
+ prevNameBytes = nameBytes;
+
+ String name = new String(nameBytes, StandardCharsets.UTF_8);
+ DataType type = MosaicTypes.readType(in);
+ columns.add(new ColumnMeta(fieldId, name, type, bucketId, indexInBucket));
+ bucketLists.get(bucketId).add(i);
+ }
+
+ int[][] bucketToGlobal = new int[numBuckets][];
+ for (int b = 0; b < numBuckets; b++) {
+ List list = bucketLists.get(b);
+ bucketToGlobal[b] = new int[list.size()];
+ for (int j = 0; j < list.size(); j++) {
+ bucketToGlobal[b][j] = list.get(j);
+ }
+ }
+
+ return new MosaicSchema(numBuckets, columns, bucketToGlobal);
+ }
+
+ public MosaicSchema pruneAllNullColumns(boolean[][] allNullByBucket) {
+ Set prunedGlobalIndices = new HashSet<>();
+ for (int b = 0; b < numBuckets; b++) {
+ if (allNullByBucket[b] == null) {
+ continue;
+ }
+ int[] globalIndices = bucketToGlobalIndices[b];
+ for (int local = 0; local < globalIndices.length; local++) {
+ if (allNullByBucket[b][local]) {
+ prunedGlobalIndices.add(globalIndices[local]);
+ }
+ }
+ }
+
+ if (prunedGlobalIndices.isEmpty()) {
+ return this;
+ }
+
+ List newColumns = new ArrayList<>();
+ Map oldToNew = new HashMap<>();
+ for (int i = 0; i < columns.size(); i++) {
+ if (!prunedGlobalIndices.contains(i)) {
+ oldToNew.put(i, newColumns.size());
+ newColumns.add(columns.get(i));
+ }
+ }
+
+ int[][] newBucketToGlobal = new int[numBuckets][];
+ for (int b = 0; b < numBuckets; b++) {
+ List kept = new ArrayList<>();
+ for (int globalIdx : bucketToGlobalIndices[b]) {
+ Integer newIdx = oldToNew.get(globalIdx);
+ if (newIdx != null) {
+ kept.add(newIdx);
+ }
+ }
+ newBucketToGlobal[b] = new int[kept.size()];
+ for (int j = 0; j < kept.size(); j++) {
+ newBucketToGlobal[b][j] = kept.get(j);
+ ColumnMeta old = newColumns.get(kept.get(j));
+ newColumns.set(
+ kept.get(j),
+ new ColumnMeta(old.fieldId, old.name, old.type, old.bucketId, j));
+ }
+ }
+
+ return new MosaicSchema(numBuckets, newColumns, newBucketToGlobal);
+ }
+
+ private static int commonPrefixLength(byte[] a, byte[] b) {
+ int len = Math.min(a.length, b.length);
+ for (int i = 0; i < len; i++) {
+ if (a[i] != b[i]) {
+ return i;
+ }
+ }
+ return len;
+ }
+
+ /** Metadata for a single column. */
+ public static class ColumnMeta {
+ public final int fieldId;
+ public final String name;
+ public final DataType type;
+ public final int bucketId;
+ public final int indexInBucket;
+
+ public ColumnMeta(
+ int fieldId, String name, DataType type, int bucketId, int indexInBucket) {
+ this.fieldId = fieldId;
+ this.name = name;
+ this.type = type;
+ this.bucketId = bucketId;
+ this.indexInBucket = indexInBucket;
+ }
+ }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSpec.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSpec.java
new file mode 100644
index 000000000000..36bb50d76e23
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSpec.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.types.DataField;
+import org.apache.paimon.types.RowType;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/** Constants and utilities for the Mosaic file format. */
+public class MosaicSpec {
+
+ public static final byte[] MAGIC = new byte[] {'M', 'O', 'S', 'A'};
+ public static final byte VERSION = 1;
+
+ public static final int FOOTER_SIZE = 32;
+
+ public static final byte COMPRESSION_NONE = 0;
+ public static final byte COMPRESSION_ZSTD = 1;
+
+ public static final int DEFAULT_NUM_BUCKETS = 100;
+
+ // Column encoding types (2 bits each in encoding flags)
+ public static final byte ENCODING_PLAIN = 0;
+ public static final byte ENCODING_CONST = 1;
+ public static final byte ENCODING_DICT = 2;
+ public static final byte ENCODING_ALL_NULL = 3;
+
+ public static int assignBucket(String fieldName, int numBuckets) {
+ return Math.floorMod(fieldName.hashCode(), numBuckets);
+ }
+
+ /**
+ * Groups columns by bucket. Returns an array where each element is the list of global column
+ * indices assigned to that bucket.
+ */
+ public static int[][] groupColumnsByBucket(RowType rowType, int numBuckets) {
+ List fields = rowType.getFields();
+ List> buckets = new ArrayList<>(numBuckets);
+ for (int i = 0; i < numBuckets; i++) {
+ buckets.add(new ArrayList<>());
+ }
+ for (int i = 0; i < fields.size(); i++) {
+ int bucketId = assignBucket(fields.get(i).name(), numBuckets);
+ buckets.get(bucketId).add(i);
+ }
+ int[][] result = new int[numBuckets][];
+ for (int i = 0; i < numBuckets; i++) {
+ List list = buckets.get(i);
+ result[i] = new int[list.size()];
+ for (int j = 0; j < list.size(); j++) {
+ result[i][j] = list.get(j);
+ }
+ }
+ return result;
+ }
+
+ public static byte compressionToByte(String compression) {
+ if (compression == null || compression.isEmpty() || "none".equalsIgnoreCase(compression)) {
+ return COMPRESSION_NONE;
+ }
+ if ("zstd".equalsIgnoreCase(compression)) {
+ return COMPRESSION_ZSTD;
+ }
+ throw new IllegalArgumentException("Unsupported Mosaic compression: " + compression);
+ }
+
+ /** Metadata for a single row group. */
+ public static class RowGroupMeta {
+ public final int numRows;
+ public final long[] bucketOffsets;
+ public final int[] compressedSizes;
+ public final int[] uncompressedSizes;
+
+ public RowGroupMeta(
+ int numRows, long[] bucketOffsets, int[] compressedSizes, int[] uncompressedSizes) {
+ this.numRows = numRows;
+ this.bucketOffsets = bucketOffsets;
+ this.compressedSizes = compressedSizes;
+ this.uncompressedSizes = uncompressedSizes;
+ }
+ }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicTypes.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicTypes.java
new file mode 100644
index 000000000000..bc0892c7d807
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicTypes.java
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.types.BigIntType;
+import org.apache.paimon.types.BinaryType;
+import org.apache.paimon.types.BooleanType;
+import org.apache.paimon.types.CharType;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.DataTypeRoot;
+import org.apache.paimon.types.DateType;
+import org.apache.paimon.types.DecimalType;
+import org.apache.paimon.types.DoubleType;
+import org.apache.paimon.types.FloatType;
+import org.apache.paimon.types.IntType;
+import org.apache.paimon.types.LocalZonedTimestampType;
+import org.apache.paimon.types.SmallIntType;
+import org.apache.paimon.types.TimeType;
+import org.apache.paimon.types.TimestampType;
+import org.apache.paimon.types.TinyIntType;
+import org.apache.paimon.types.VarBinaryType;
+import org.apache.paimon.types.VarCharType;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import static org.apache.paimon.format.mosaic.MosaicUtils.readVarint;
+import static org.apache.paimon.format.mosaic.MosaicUtils.writeVarint;
+
+/** Recursive binary serialization/deserialization for {@link DataType}. */
+public class MosaicTypes {
+
+ private static final byte TYPE_BOOLEAN = 0;
+ private static final byte TYPE_TINYINT = 1;
+ private static final byte TYPE_SMALLINT = 2;
+ private static final byte TYPE_INTEGER = 3;
+ private static final byte TYPE_BIGINT = 4;
+ private static final byte TYPE_FLOAT = 5;
+ private static final byte TYPE_DOUBLE = 6;
+ private static final byte TYPE_DATE = 7;
+ private static final byte TYPE_CHAR = 8;
+ private static final byte TYPE_VARCHAR = 9;
+ private static final byte TYPE_STRING = 10;
+ private static final byte TYPE_BINARY = 11;
+ private static final byte TYPE_VARBINARY = 12;
+ private static final byte TYPE_BYTES = 13;
+ private static final byte TYPE_DECIMAL = 14;
+ private static final byte TYPE_TIME = 15;
+ private static final byte TYPE_TIMESTAMP = 16;
+ private static final byte TYPE_TIMESTAMP_LTZ = 17;
+
+ @FunctionalInterface
+ interface TypeWriter {
+ void write(DataOutputStream out, DataType type) throws IOException;
+ }
+
+ @FunctionalInterface
+ interface TypeReader {
+ DataType read(DataInputStream in, boolean nullable) throws IOException;
+ }
+
+ private static final TypeWriter[] WRITERS = new TypeWriter[DataTypeRoot.values().length];
+ private static final TypeReader[] READERS = new TypeReader[18];
+
+ static {
+ // simple types
+ reg(DataTypeRoot.BOOLEAN, TYPE_BOOLEAN, (in, n) -> new BooleanType(n));
+ reg(DataTypeRoot.TINYINT, TYPE_TINYINT, (in, n) -> new TinyIntType(n));
+ reg(DataTypeRoot.SMALLINT, TYPE_SMALLINT, (in, n) -> new SmallIntType(n));
+ reg(DataTypeRoot.INTEGER, TYPE_INTEGER, (in, n) -> new IntType(n));
+ reg(DataTypeRoot.BIGINT, TYPE_BIGINT, (in, n) -> new BigIntType(n));
+ reg(DataTypeRoot.FLOAT, TYPE_FLOAT, (in, n) -> new FloatType(n));
+ reg(DataTypeRoot.DOUBLE, TYPE_DOUBLE, (in, n) -> new DoubleType(n));
+ reg(DataTypeRoot.DATE, TYPE_DATE, (in, n) -> new DateType(n));
+
+ // CHAR
+ WRITERS[DataTypeRoot.CHAR.ordinal()] =
+ (out, type) -> {
+ out.writeByte(TYPE_CHAR);
+ out.writeBoolean(type.isNullable());
+ writeVarint(out, ((CharType) type).getLength());
+ };
+ READERS[TYPE_CHAR] = (in, n) -> new CharType(n, readVarint(in));
+
+ // VARCHAR / STRING
+ WRITERS[DataTypeRoot.VARCHAR.ordinal()] =
+ (out, type) -> {
+ int len = ((VarCharType) type).getLength();
+ if (len == VarCharType.MAX_LENGTH) {
+ out.writeByte(TYPE_STRING);
+ out.writeBoolean(type.isNullable());
+ } else {
+ out.writeByte(TYPE_VARCHAR);
+ out.writeBoolean(type.isNullable());
+ writeVarint(out, len);
+ }
+ };
+ READERS[TYPE_VARCHAR] = (in, n) -> new VarCharType(n, readVarint(in));
+ READERS[TYPE_STRING] = (in, n) -> new VarCharType(n, VarCharType.MAX_LENGTH);
+
+ // BINARY
+ WRITERS[DataTypeRoot.BINARY.ordinal()] =
+ (out, type) -> {
+ out.writeByte(TYPE_BINARY);
+ out.writeBoolean(type.isNullable());
+ writeVarint(out, ((BinaryType) type).getLength());
+ };
+ READERS[TYPE_BINARY] = (in, n) -> new BinaryType(n, readVarint(in));
+
+ // VARBINARY / BYTES
+ WRITERS[DataTypeRoot.VARBINARY.ordinal()] =
+ (out, type) -> {
+ int len = ((VarBinaryType) type).getLength();
+ if (len == VarBinaryType.MAX_LENGTH) {
+ out.writeByte(TYPE_BYTES);
+ out.writeBoolean(type.isNullable());
+ } else {
+ out.writeByte(TYPE_VARBINARY);
+ out.writeBoolean(type.isNullable());
+ writeVarint(out, len);
+ }
+ };
+ READERS[TYPE_VARBINARY] = (in, n) -> new VarBinaryType(n, readVarint(in));
+ READERS[TYPE_BYTES] = (in, n) -> new VarBinaryType(n, VarBinaryType.MAX_LENGTH);
+
+ // DECIMAL
+ WRITERS[DataTypeRoot.DECIMAL.ordinal()] =
+ (out, type) -> {
+ out.writeByte(TYPE_DECIMAL);
+ out.writeBoolean(type.isNullable());
+ DecimalType dt = (DecimalType) type;
+ writeVarint(out, dt.getPrecision());
+ writeVarint(out, dt.getScale());
+ };
+ READERS[TYPE_DECIMAL] = (in, n) -> new DecimalType(n, readVarint(in), readVarint(in));
+
+ // TIME
+ WRITERS[DataTypeRoot.TIME_WITHOUT_TIME_ZONE.ordinal()] =
+ (out, type) -> {
+ out.writeByte(TYPE_TIME);
+ out.writeBoolean(type.isNullable());
+ writeVarint(out, ((TimeType) type).getPrecision());
+ };
+ READERS[TYPE_TIME] = (in, n) -> new TimeType(n, readVarint(in));
+
+ // TIMESTAMP
+ WRITERS[DataTypeRoot.TIMESTAMP_WITHOUT_TIME_ZONE.ordinal()] =
+ (out, type) -> {
+ out.writeByte(TYPE_TIMESTAMP);
+ out.writeBoolean(type.isNullable());
+ writeVarint(out, ((TimestampType) type).getPrecision());
+ };
+ READERS[TYPE_TIMESTAMP] = (in, n) -> new TimestampType(n, readVarint(in));
+
+ // TIMESTAMP WITH LOCAL TIME ZONE
+ WRITERS[DataTypeRoot.TIMESTAMP_WITH_LOCAL_TIME_ZONE.ordinal()] =
+ (out, type) -> {
+ out.writeByte(TYPE_TIMESTAMP_LTZ);
+ out.writeBoolean(type.isNullable());
+ writeVarint(out, ((LocalZonedTimestampType) type).getPrecision());
+ };
+ READERS[TYPE_TIMESTAMP_LTZ] = (in, n) -> new LocalZonedTimestampType(n, readVarint(in));
+ }
+
+ private static void reg(DataTypeRoot root, byte typeId, TypeReader reader) {
+ WRITERS[root.ordinal()] =
+ (out, type) -> {
+ out.writeByte(typeId);
+ out.writeBoolean(type.isNullable());
+ };
+ READERS[typeId] = reader;
+ }
+
+ public static void writeType(DataOutputStream out, DataType type) throws IOException {
+ TypeWriter writer = WRITERS[type.getTypeRoot().ordinal()];
+ if (writer == null) {
+ throw new IOException("Unsupported Mosaic type: " + type.getTypeRoot());
+ }
+ writer.write(out, type);
+ }
+
+ public static DataType readType(DataInputStream in) throws IOException {
+ int typeId = in.readByte() & 0xFF;
+ boolean nullable = in.readBoolean();
+ TypeReader reader = typeId < READERS.length ? READERS[typeId] : null;
+ if (reader == null) {
+ throw new IOException("Unsupported Mosaic type ID: " + typeId);
+ }
+ return reader.read(in, nullable);
+ }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicUtils.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicUtils.java
new file mode 100644
index 000000000000..a7f4cdcbe48b
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicUtils.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+/** Shared varint and long encoding/decoding utilities for the Mosaic file format. */
+public class MosaicUtils {
+
+ // ==================== byte[] based ====================
+
+ public static int readVarint(byte[] buf, int[] pos) {
+ int value = 0;
+ int shift = 0;
+ int b;
+ do {
+ b = buf[pos[0]++] & 0xFF;
+ value |= (b & 0x7F) << shift;
+ shift += 7;
+ } while ((b & 0x80) != 0);
+ return value;
+ }
+
+ public static long readLong(byte[] buf, int[] pos) {
+ long v = 0;
+ for (int i = 0; i < 8; i++) {
+ v = (v << 8) | (buf[pos[0]++] & 0xFF);
+ }
+ return v;
+ }
+
+ public static int writeVarint(byte[] buf, int pos, int value) {
+ while ((value & ~0x7F) != 0) {
+ buf[pos++] = (byte) ((value & 0x7F) | 0x80);
+ value >>>= 7;
+ }
+ buf[pos++] = (byte) value;
+ return pos;
+ }
+
+ public static int writeLong(byte[] buf, int pos, long value) {
+ buf[pos++] = (byte) (value >>> 56);
+ buf[pos++] = (byte) (value >>> 48);
+ buf[pos++] = (byte) (value >>> 40);
+ buf[pos++] = (byte) (value >>> 32);
+ buf[pos++] = (byte) (value >>> 24);
+ buf[pos++] = (byte) (value >>> 16);
+ buf[pos++] = (byte) (value >>> 8);
+ buf[pos++] = (byte) value;
+ return pos;
+ }
+
+ // ==================== stream based ====================
+
+ public static void writeVarint(DataOutputStream out, int value) throws IOException {
+ while ((value & ~0x7F) != 0) {
+ out.writeByte((value & 0x7F) | 0x80);
+ value >>>= 7;
+ }
+ out.writeByte(value);
+ }
+
+ public static int readVarint(DataInputStream in) throws IOException {
+ int value = 0;
+ int shift = 0;
+ int b;
+ do {
+ b = in.readByte() & 0xFF;
+ value |= (b & 0x7F) << shift;
+ shift += 7;
+ } while ((b & 0x80) != 0);
+ return value;
+ }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriter.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriter.java
new file mode 100644
index 000000000000..78083c283d8d
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriter.java
@@ -0,0 +1,288 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.data.InternalRow;
+import org.apache.paimon.format.FormatWriter;
+import org.apache.paimon.format.mosaic.MosaicSpec.RowGroupMeta;
+import org.apache.paimon.fs.PositionOutputStream;
+import org.apache.paimon.types.RowType;
+
+import com.github.luben.zstd.Zstd;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.apache.paimon.format.mosaic.MosaicSpec.COMPRESSION_NONE;
+import static org.apache.paimon.format.mosaic.MosaicSpec.COMPRESSION_ZSTD;
+import static org.apache.paimon.format.mosaic.MosaicUtils.writeLong;
+import static org.apache.paimon.format.mosaic.MosaicUtils.writeVarint;
+
+/** Writer for the Mosaic file format with row group support. */
+public class MosaicWriter implements FormatWriter {
+
+ private final PositionOutputStream out;
+ private final MosaicSchema schema;
+ private MosaicSchema prunedSchema;
+ private final MosaicBucketWriter[] bucketWriters;
+ private final int numBuckets;
+ private final int zstdLevel;
+ private final byte compressionByte;
+ private final long rowGroupMaxSize;
+
+ private final List rowGroupMetas;
+ private byte[] compressBuffer;
+ private int currentRowGroupRows;
+ private long currentBufferedSize;
+ private double compressionRatio;
+ private boolean closed;
+
+ public MosaicWriter(
+ PositionOutputStream out,
+ RowType rowType,
+ int numBuckets,
+ int zstdLevel,
+ String compression,
+ long rowGroupMaxSize) {
+ this.out = out;
+ this.numBuckets = Math.min(numBuckets, rowType.getFieldCount());
+ this.zstdLevel = zstdLevel;
+ this.compressionByte = MosaicSpec.compressionToByte(compression);
+ this.rowGroupMaxSize = rowGroupMaxSize;
+ this.schema = MosaicSchema.create(rowType, this.numBuckets);
+ this.bucketWriters = new MosaicBucketWriter[this.numBuckets];
+
+ int[][] bucketMapping = schema.bucketToGlobalIndices();
+ for (int b = 0; b < this.numBuckets; b++) {
+ if (bucketMapping[b].length > 0) {
+ bucketWriters[b] = new MosaicBucketWriter(rowType, bucketMapping[b]);
+ }
+ }
+
+ this.rowGroupMetas = new ArrayList<>();
+ this.compressBuffer = new byte[0];
+ this.currentRowGroupRows = 0;
+ this.currentBufferedSize = 0;
+ this.compressionRatio = this.compressionByte == COMPRESSION_NONE ? 1.0 : 0.3;
+ this.closed = false;
+ }
+
+ @Override
+ public void addElement(InternalRow element) throws IOException {
+ long size = 0;
+ for (int i = 0; i < numBuckets; i++) {
+ if (bucketWriters[i] != null) {
+ size += bucketWriters[i].writeRow(element);
+ }
+ }
+ currentRowGroupRows++;
+ currentBufferedSize += size;
+
+ if (currentBufferedSize >= rowGroupMaxSize) {
+ flushRowGroup();
+ }
+ }
+
+ @Override
+ public boolean reachTargetSize(boolean suggestedCheck, long targetSize) throws IOException {
+ long estimatedSize = out.getPos() + (long) (currentBufferedSize * compressionRatio);
+ return estimatedSize >= targetSize;
+ }
+
+ private void flushRowGroup() throws IOException {
+ if (currentRowGroupRows == 0) {
+ return;
+ }
+
+ long[] bucketOffsets = new long[numBuckets];
+ int[] compressedSizes = new int[numBuckets];
+ int[] uncompressedSizes = new int[numBuckets];
+
+ for (int b = 0; b < numBuckets; b++) {
+ MosaicBucketWriter bucketWriter = bucketWriters[b];
+ if (bucketWriter == null || bucketWriter.isEmpty()) {
+ continue;
+ }
+ byte[] raw = bucketWriter.finish();
+ compressedSizes[b] = writeCompressed(raw);
+ uncompressedSizes[b] = raw.length;
+ bucketOffsets[b] = out.getPos() - compressedSizes[b];
+ bucketWriter.reset();
+ }
+
+ rowGroupMetas.add(
+ new RowGroupMeta(
+ currentRowGroupRows, bucketOffsets, compressedSizes, uncompressedSizes));
+
+ long totalCompressed = 0;
+ long totalUncompressed = 0;
+ for (int b = 0; b < numBuckets; b++) {
+ totalCompressed += compressedSizes[b];
+ totalUncompressed += uncompressedSizes[b];
+ }
+ if (totalUncompressed > 0) {
+ compressionRatio = (double) totalCompressed / totalUncompressed;
+ }
+
+ currentRowGroupRows = 0;
+ currentBufferedSize = 0;
+ }
+
+ private void flushRowGroupPruned() throws IOException {
+ if (currentRowGroupRows == 0) {
+ return;
+ }
+
+ boolean[][] allNullByBucket = new boolean[numBuckets][];
+ long[] bucketOffsets = new long[numBuckets];
+ int[] compressedSizes = new int[numBuckets];
+ int[] uncompressedSizes = new int[numBuckets];
+
+ for (int b = 0; b < numBuckets; b++) {
+ MosaicBucketWriter bucketWriter = bucketWriters[b];
+ if (bucketWriter == null || bucketWriter.isEmpty()) {
+ continue;
+ }
+ allNullByBucket[b] = bucketWriter.getAllNullFlags();
+ byte[] raw = bucketWriter.finish(true);
+ compressedSizes[b] = writeCompressed(raw);
+ uncompressedSizes[b] = raw.length;
+ bucketOffsets[b] = out.getPos() - compressedSizes[b];
+ bucketWriter.reset();
+ }
+
+ rowGroupMetas.add(
+ new RowGroupMeta(
+ currentRowGroupRows, bucketOffsets, compressedSizes, uncompressedSizes));
+
+ prunedSchema = schema.pruneAllNullColumns(allNullByBucket);
+
+ currentRowGroupRows = 0;
+ currentBufferedSize = 0;
+ }
+
+ private int writeCompressed(byte[] raw) throws IOException {
+ switch (compressionByte) {
+ case COMPRESSION_NONE:
+ out.write(raw);
+ return raw.length;
+ case COMPRESSION_ZSTD:
+ int bound = (int) Zstd.compressBound(raw.length);
+ if (compressBuffer.length < bound) {
+ compressBuffer = new byte[bound];
+ }
+ int compLen = (int) Zstd.compress(compressBuffer, raw, zstdLevel);
+ out.write(compressBuffer, 0, compLen);
+ return compLen;
+ default:
+ throw new UnsupportedEncodingException(
+ "Unsupported compression: " + compressionByte);
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (closed) {
+ return;
+ }
+ closed = true;
+
+ // Flush remaining rows as the last row group
+ boolean singleRowGroup = rowGroupMetas.isEmpty() && currentRowGroupRows > 0;
+ if (singleRowGroup) {
+ flushRowGroupPruned();
+ } else {
+ flushRowGroup();
+ }
+
+ // Write schema block (use pruned schema if available)
+ MosaicSchema schemaToWrite = prunedSchema != null ? prunedSchema : schema;
+ byte[] schemaRaw = schemaToWrite.serialize();
+ long schemaBlockOffset = out.getPos();
+ switch (compressionByte) {
+ case COMPRESSION_NONE:
+ {
+ ByteBuffer lenBuf = ByteBuffer.allocate(4).order(ByteOrder.BIG_ENDIAN);
+ lenBuf.putInt(schemaRaw.length);
+ out.write(lenBuf.array());
+ out.write(schemaRaw);
+ break;
+ }
+ case COMPRESSION_ZSTD:
+ {
+ int schemaBound = (int) Zstd.compressBound(schemaRaw.length);
+ if (compressBuffer.length < schemaBound) {
+ compressBuffer = new byte[schemaBound];
+ }
+ long compLen = Zstd.compress(compressBuffer, schemaRaw, zstdLevel);
+ ByteBuffer lenBuf = ByteBuffer.allocate(4).order(ByteOrder.BIG_ENDIAN);
+ lenBuf.putInt(schemaRaw.length);
+ out.write(lenBuf.array());
+ out.write(compressBuffer, 0, (int) compLen);
+ break;
+ }
+ default:
+ throw new UnsupportedEncodingException(
+ "Unsupported compression: " + compressionByte);
+ }
+
+ // Write row group index (varint encoded, only non-empty buckets)
+ long indexOffset = out.getPos();
+ int numRowGroups = rowGroupMetas.size();
+ byte[] indexBuf = new byte[numRowGroups * (5 + numBuckets * 25)];
+ int idxPos = 0;
+ for (RowGroupMeta meta : rowGroupMetas) {
+ idxPos = writeVarint(indexBuf, idxPos, meta.numRows);
+ int nonEmpty = 0;
+ for (int b = 0; b < numBuckets; b++) {
+ if (meta.compressedSizes[b] > 0) {
+ nonEmpty++;
+ }
+ }
+ idxPos = writeVarint(indexBuf, idxPos, nonEmpty);
+ for (int b = 0; b < numBuckets; b++) {
+ if (meta.compressedSizes[b] > 0) {
+ idxPos = writeVarint(indexBuf, idxPos, b);
+ idxPos = writeLong(indexBuf, idxPos, meta.bucketOffsets[b]);
+ idxPos = writeVarint(indexBuf, idxPos, meta.compressedSizes[b]);
+ idxPos = writeVarint(indexBuf, idxPos, meta.uncompressedSizes[b]);
+ }
+ }
+ }
+ out.write(indexBuf, 0, idxPos);
+
+ // Write footer
+ ByteBuffer footer = ByteBuffer.allocate(MosaicSpec.FOOTER_SIZE).order(ByteOrder.BIG_ENDIAN);
+ footer.putLong(indexOffset);
+ footer.putLong(schemaBlockOffset);
+ footer.putInt(numBuckets);
+ footer.putInt(numRowGroups);
+ footer.put(compressionByte);
+ footer.put(MosaicSpec.VERSION);
+ footer.putShort((short) 0);
+ footer.put(MosaicSpec.MAGIC);
+ out.write(footer.array());
+
+ out.flush();
+ }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java
new file mode 100644
index 000000000000..5393ebcc0396
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.format.FormatWriter;
+import org.apache.paimon.format.FormatWriterFactory;
+import org.apache.paimon.fs.PositionOutputStream;
+import org.apache.paimon.types.RowType;
+
+import java.io.IOException;
+
+/** Factory for creating {@link MosaicWriter} instances. */
+public class MosaicWriterFactory implements FormatWriterFactory {
+
+ private final RowType rowType;
+ private final int numBuckets;
+ private final int zstdLevel;
+ private final long rowGroupMaxSize;
+
+ public MosaicWriterFactory(
+ RowType rowType, int numBuckets, int zstdLevel, long rowGroupMaxSize) {
+ this.rowType = rowType;
+ this.numBuckets = numBuckets;
+ this.zstdLevel = zstdLevel;
+ this.rowGroupMaxSize = rowGroupMaxSize;
+ }
+
+ @Override
+ public FormatWriter create(PositionOutputStream out, String compression) throws IOException {
+ return new MosaicWriter(out, rowType, numBuckets, zstdLevel, compression, rowGroupMaxSize);
+ }
+}
diff --git a/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory b/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory
index 80cfe4b946b8..777fcb65f545 100644
--- a/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory
+++ b/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory
@@ -20,3 +20,4 @@ org.apache.paimon.format.csv.CsvFileFormatFactory
org.apache.paimon.format.text.TextFileFormatFactory
org.apache.paimon.format.json.JsonFileFormatFactory
org.apache.paimon.format.blob.BlobFileFormatFactory
+org.apache.paimon.format.mosaic.MosaicFileFormatFactory
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/WideTableFormatBenchmark.java b/paimon-format/src/test/java/org/apache/paimon/format/WideTableFormatBenchmark.java
new file mode 100644
index 000000000000..4d915960f0f6
--- /dev/null
+++ b/paimon-format/src/test/java/org/apache/paimon/format/WideTableFormatBenchmark.java
@@ -0,0 +1,468 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.data.GenericRow;
+import org.apache.paimon.data.InternalRow;
+import org.apache.paimon.format.FileFormatFactory.FormatContext;
+import org.apache.paimon.format.mosaic.MosaicFileFormat;
+import org.apache.paimon.format.orc.OrcFileFormat;
+import org.apache.paimon.format.parquet.ParquetFileFormat;
+import org.apache.paimon.fs.Path;
+import org.apache.paimon.fs.PositionOutputStream;
+import org.apache.paimon.fs.local.LocalFileIO;
+import org.apache.paimon.options.MemorySize;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.reader.RecordReader;
+import org.apache.paimon.types.DataTypeRoot;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.RowType;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Benchmark to compare file sizes and projection read performance between Parquet, ORC and Mosaic
+ * for wide tables (10,000+ columns).
+ *
+ * Run manually: {@code mvn exec:java -pl paimon-format
+ * -Dexec.mainClass="org.apache.paimon.format.WideTableFileFormatSizeTest"
+ * -Dexec.classpathScope="test"}
+ */
+public class WideTableFormatBenchmark {
+
+ private static final int COLUMN_COUNT = 10000;
+ private static final int ROW_COUNT = 10;
+ private static final String COMPRESSION = "zstd";
+
+ public static void main(String[] args) throws Exception {
+ run(WideTableFormatBenchmark::fileSizeComparison);
+ run(tempDir -> projectionReadPerformance(tempDir, 500));
+ run(tempDir -> projectionReadPerformance(tempDir, 4500));
+ }
+
+ private static void run(Runner runner) throws IOException {
+ java.nio.file.Path tempDir = Files.createTempDirectory("mosaic-benchmark");
+ try {
+ runner.run(tempDir);
+ } finally {
+ deleteRecursively(tempDir);
+ }
+ }
+
+ private static void fileSizeComparison(java.nio.file.Path tempDir) throws IOException {
+ RowType rowType = buildWideRowType();
+ int fieldCount = rowType.getFieldCount();
+ LocalFileIO fileIO = new LocalFileIO();
+
+ long parquetSize =
+ writeParquet(
+ rowType,
+ ROW_COUNT,
+ new Path(tempDir.toString(), "wide_table.parquet"),
+ fileIO);
+
+ long orcSize =
+ writeOrc(
+ rowType, ROW_COUNT, new Path(tempDir.toString(), "wide_table.orc"), fileIO);
+
+ Path mosaicPath = new Path(tempDir.toString(), "wide_table.mosaic");
+ long mosaicSize = writeMosaic(rowType, ROW_COUNT, mosaicPath, fileIO);
+
+ System.out.println("=== Wide Table File Size Comparison ===");
+ System.out.println("Columns: " + COLUMN_COUNT + ", Rows: " + ROW_COUNT);
+ System.out.println("Column name avg length: ~80 bytes");
+ System.out.println("Compression: " + COMPRESSION + " (level 9)");
+ System.out.println("---------------------------------------");
+ System.out.printf("Parquet: %,d bytes (%.1f KB)%n", parquetSize, parquetSize / 1024.0);
+ System.out.printf("ORC: %,d bytes (%.1f KB)%n", orcSize, orcSize / 1024.0);
+ System.out.printf("Mosaic: %,d bytes (%.1f KB)%n", mosaicSize, mosaicSize / 1024.0);
+ System.out.println("---------------------------------------");
+
+ // verify Mosaic correctness
+ List mosaicResult = readMosaic(rowType, rowType, mosaicPath, fileIO);
+ check(mosaicResult.size() == ROW_COUNT, "Row count mismatch");
+ for (int r = 0; r < ROW_COUNT; r++) {
+ GenericRow expected = generateRow(r, fieldCount);
+ for (int c = 0; c < COLUMN_COUNT; c++) {
+ assertCellEqual(mosaicResult.get(r), expected, c);
+ }
+ }
+ System.out.println("Correctness check: PASSED");
+ }
+
+ private static void projectionReadPerformance(java.nio.file.Path tempDir, int rows)
+ throws IOException {
+ RowType rowType = buildWideRowType();
+ LocalFileIO fileIO = new LocalFileIO();
+
+ Path parquetPath = new Path(tempDir.toString(), "proj_test.parquet");
+ long parquetFileSize = writeParquet(rowType, rows, parquetPath, fileIO);
+
+ Path orcPath = new Path(tempDir.toString(), "proj_test.orc");
+ long orcFileSize = writeOrc(rowType, rows, orcPath, fileIO);
+
+ Path mosaicPath = new Path(tempDir.toString(), "proj_test.mosaic");
+ long mosaicFileSize = writeMosaic(rowType, rows, mosaicPath, fileIO);
+
+ int[] projected10Cols = {0, 100, 500, 1000, 2000, 5000, 7000, 8000, 9000, 9999};
+ int[] projected1Col = {1000};
+
+ System.out.printf("\n=== Projection Read Performance (%d rows) ===%n", rows);
+ System.out.printf(
+ "File size - Parquet: %.1f MB, ORC: %.1f MB, Mosaic: %.1f MB%n",
+ parquetFileSize / 1024.0 / 1024.0,
+ orcFileSize / 1024.0 / 1024.0,
+ mosaicFileSize / 1024.0 / 1024.0);
+ System.out.println("---------------------------------------");
+
+ benchmarkProjection(
+ rowType, projected10Cols, rows, parquetPath, orcPath, mosaicPath, fileIO);
+ benchmarkProjection(rowType, projected1Col, rows, parquetPath, orcPath, mosaicPath, fileIO);
+ }
+
+ private static void benchmarkProjection(
+ RowType rowType,
+ int[] projectedColumns,
+ int rows,
+ Path parquetPath,
+ Path orcPath,
+ Path mosaicPath,
+ LocalFileIO fileIO)
+ throws IOException {
+ RowType projectedType = rowType.project(projectedColumns);
+
+ int warmup = 3;
+ int iterations = 10;
+
+ for (int i = 0; i < warmup; i++) {
+ readParquetProjected(rowType, projectedType, parquetPath, fileIO);
+ }
+ long parquetStart = System.nanoTime();
+ for (int i = 0; i < iterations; i++) {
+ readParquetProjected(rowType, projectedType, parquetPath, fileIO);
+ }
+ long parquetTimeNs = (System.nanoTime() - parquetStart) / iterations;
+
+ for (int i = 0; i < warmup; i++) {
+ readOrcProjected(rowType, projectedType, orcPath, fileIO);
+ }
+ long orcStart = System.nanoTime();
+ for (int i = 0; i < iterations; i++) {
+ readOrcProjected(rowType, projectedType, orcPath, fileIO);
+ }
+ long orcTimeNs = (System.nanoTime() - orcStart) / iterations;
+
+ for (int i = 0; i < warmup; i++) {
+ readMosaic(rowType, projectedType, mosaicPath, fileIO);
+ }
+ long mosaicStart = System.nanoTime();
+ for (int i = 0; i < iterations; i++) {
+ readMosaic(rowType, projectedType, mosaicPath, fileIO);
+ }
+ long mosaicTimeNs = (System.nanoTime() - mosaicStart) / iterations;
+
+ System.out.printf(
+ "Project %2d / %d cols: Parquet %,d us, ORC %,d us, Mosaic %,d us%n",
+ projectedColumns.length,
+ COLUMN_COUNT,
+ parquetTimeNs / 1000,
+ orcTimeNs / 1000,
+ mosaicTimeNs / 1000);
+
+ // verify projection results
+ List parquetResult =
+ readParquetProjected(rowType, projectedType, parquetPath, fileIO);
+ List mosaicResult = readMosaic(rowType, projectedType, mosaicPath, fileIO);
+ check(
+ mosaicResult.size() == parquetResult.size(),
+ "Projection row count mismatch: parquet="
+ + parquetResult.size()
+ + " mosaic="
+ + mosaicResult.size());
+ for (int r = 0; r < parquetResult.size(); r++) {
+ for (int c = 0; c < projectedColumns.length; c++) {
+ int origCol = projectedColumns[c];
+ if (isIntColumn(origCol)) {
+ check(
+ mosaicResult.get(r).getInt(c) == parquetResult.get(r).getInt(c),
+ "INT mismatch at row=" + r + " col=" + c);
+ } else {
+ check(
+ mosaicResult
+ .get(r)
+ .getString(c)
+ .toString()
+ .equals(parquetResult.get(r).getString(c).toString()),
+ "STRING mismatch at row=" + r + " col=" + c);
+ }
+ }
+ }
+ }
+
+ // ==================== Parquet helpers ====================
+
+ private static long writeParquet(RowType rowType, int rowCount, Path path, LocalFileIO fileIO)
+ throws IOException {
+ ParquetFileFormat parquet = new ParquetFileFormat(createFormatContext());
+ FormatWriterFactory writerFactory = parquet.createWriterFactory(rowType);
+ PositionOutputStream out = fileIO.newOutputStream(path, false);
+ FormatWriter writer = writerFactory.create(out, COMPRESSION);
+ int fieldCount = rowType.getFieldCount();
+ for (int r = 0; r < rowCount; r++) {
+ writer.addElement(generateRow(r, fieldCount));
+ }
+ writer.close();
+ out.close();
+ return fileIO.getFileSize(path);
+ }
+
+ private static List readParquetProjected(
+ RowType fullType, RowType projectedType, Path path, LocalFileIO fileIO)
+ throws IOException {
+ ParquetFileFormat parquet = new ParquetFileFormat(createFormatContext());
+ RecordReader reader =
+ parquet.createReaderFactory(fullType, projectedType, null)
+ .createReader(
+ new FormatReaderContext(fileIO, path, fileIO.getFileSize(path)));
+ List result = new ArrayList<>();
+ reader.forEachRemaining(
+ row -> {
+ Object[] fields = new Object[projectedType.getFieldCount()];
+ for (int i = 0; i < fields.length; i++) {
+ if (row.isNullAt(i)) {
+ fields[i] = null;
+ } else if (projectedType.getTypeAt(i).getTypeRoot()
+ == DataTypeRoot.INTEGER) {
+ fields[i] = row.getInt(i);
+ } else {
+ fields[i] = BinaryString.fromString(row.getString(i).toString());
+ }
+ }
+ result.add(GenericRow.of(fields));
+ });
+ reader.close();
+ return result;
+ }
+
+ // ==================== ORC helpers ====================
+
+ private static long writeOrc(RowType rowType, int rowCount, Path path, LocalFileIO fileIO)
+ throws IOException {
+ OrcFileFormat orc = new OrcFileFormat(createFormatContext());
+ FormatWriterFactory writerFactory = orc.createWriterFactory(rowType);
+ PositionOutputStream out = fileIO.newOutputStream(path, false);
+ FormatWriter writer = writerFactory.create(out, COMPRESSION);
+ int fieldCount = rowType.getFieldCount();
+ for (int r = 0; r < rowCount; r++) {
+ writer.addElement(generateRow(r, fieldCount));
+ }
+ writer.close();
+ out.close();
+ return fileIO.getFileSize(path);
+ }
+
+ private static List readOrcProjected(
+ RowType fullType, RowType projectedType, Path path, LocalFileIO fileIO)
+ throws IOException {
+ OrcFileFormat orc = new OrcFileFormat(createFormatContext());
+ RecordReader reader =
+ orc.createReaderFactory(fullType, projectedType, new ArrayList<>())
+ .createReader(
+ new FormatReaderContext(fileIO, path, fileIO.getFileSize(path)));
+ List result = new ArrayList<>();
+ reader.forEachRemaining(
+ row -> {
+ Object[] fields = new Object[projectedType.getFieldCount()];
+ for (int i = 0; i < fields.length; i++) {
+ if (row.isNullAt(i)) {
+ fields[i] = null;
+ } else if (projectedType.getTypeAt(i).getTypeRoot()
+ == DataTypeRoot.INTEGER) {
+ fields[i] = row.getInt(i);
+ } else {
+ fields[i] = row.getString(i);
+ }
+ }
+ result.add(GenericRow.of(fields));
+ });
+ reader.close();
+ return result;
+ }
+
+ // ==================== Mosaic helpers ====================
+
+ private static long writeMosaic(RowType rowType, int rowCount, Path path, LocalFileIO fileIO)
+ throws IOException {
+ MosaicFileFormat mosaic = new MosaicFileFormat(createFormatContext());
+ FormatWriterFactory writerFactory = mosaic.createWriterFactory(rowType);
+ PositionOutputStream out = fileIO.newOutputStream(path, false);
+ FormatWriter writer = writerFactory.create(out, COMPRESSION);
+ int fieldCount = rowType.getFieldCount();
+ for (int r = 0; r < rowCount; r++) {
+ writer.addElement(generateRow(r, fieldCount));
+ }
+ writer.close();
+ out.close();
+ return fileIO.getFileSize(path);
+ }
+
+ private static List readMosaic(
+ RowType fullType, RowType projectedType, Path path, LocalFileIO fileIO)
+ throws IOException {
+ MosaicFileFormat mosaic = new MosaicFileFormat(createFormatContext());
+ RecordReader reader =
+ mosaic.createReaderFactory(fullType, projectedType, null)
+ .createReader(
+ new FormatReaderContext(fileIO, path, fileIO.getFileSize(path)));
+ List result = new ArrayList<>();
+ reader.forEachRemaining(
+ row -> {
+ Object[] fields = new Object[projectedType.getFieldCount()];
+ for (int i = 0; i < fields.length; i++) {
+ if (row.isNullAt(i)) {
+ fields[i] = null;
+ } else if (projectedType.getTypeAt(i).getTypeRoot()
+ == DataTypeRoot.INTEGER) {
+ fields[i] = row.getInt(i);
+ } else {
+ fields[i] = row.getString(i);
+ }
+ }
+ result.add(GenericRow.of(fields));
+ });
+ reader.close();
+ return result;
+ }
+
+ // ==================== Helpers ====================
+
+ private static final int INT_COLUMN_INTERVAL = 10;
+ private static final String[] STRING_SAMPLES = {
+ "uuid: 550e8400-e29b-41d4-a716-446655440000",
+ "{\"user_id\": 12345, \"action\": \"click\", \"page\": \"home\"}",
+ "https://example.com/api/v1/resource/abc123?query=active&sort=desc",
+ "customer_service@company-name.example.com",
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+ "2024-01-15T09:23:47.123Z",
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+ "error: connection timeout after 30000ms, retrying...",
+ "session_token_a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p6q7r8s9t0",
+ "active,verified,premium,notifications_enabled,marketing_opt_in",
+ "New York, NY 10001, United States",
+ "REF-ORD-2024-8847293-XJ",
+ "0x7f8a9b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9",
+ "Approved by manager at 2024-01-15T10:00:00Z",
+ "file:///data/storage/partition_2024_01/batch_17.parquet",
+ "[ERROR] NullPointerException at com.example.Service.processLine(42)",
+ "User preferences: theme=dark, lang=zh-CN, timezone=Asia/Shanghai",
+ "192.168.1.105",
+ "Batch job completed successfully. Processed 1,234,567 records in 45.3s.",
+ "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0",
+ "Shipping via FedEx Ground, tracking #: 784930123456, est. 3 business days",
+ "comment: This product exceeded my expectations! Would recommend to everyone.",
+ "department=engineering|team=platform|role=senior|level=L6",
+ "version=3.2.1-SNAPSHOT, build=20240115.1423, commit=abc123def",
+ "payment_method=visa_ending_4242|billing_cycle=monthly|amount=99.99USD"
+ };
+
+ private static RowType buildWideRowType() {
+ RowType.Builder builder = RowType.builder();
+ for (int i = 0; i < COLUMN_COUNT; i++) {
+ String name =
+ String.format(
+ "this_is_a_very_long_column_name_for_testing_compression_ratio_column_index_%05d",
+ i);
+ if (i % INT_COLUMN_INTERVAL == 0) {
+ builder.field(name, DataTypes.INT());
+ } else {
+ builder.field(name, DataTypes.STRING());
+ }
+ }
+ return builder.build();
+ }
+
+ private static GenericRow generateRow(int rowIndex, int fieldCount) {
+ Object[] fields = new Object[fieldCount];
+ for (int c = 0; c < fieldCount; c++) {
+ if (c % INT_COLUMN_INTERVAL == 0) {
+ fields[c] = rowIndex * fieldCount + c;
+ } else {
+ int sampleIdx = (rowIndex + c) % STRING_SAMPLES.length;
+ fields[c] =
+ BinaryString.fromString(
+ STRING_SAMPLES[sampleIdx]
+ + " [row="
+ + rowIndex
+ + ",col="
+ + c
+ + "]");
+ }
+ }
+ return GenericRow.of(fields);
+ }
+
+ private static boolean isIntColumn(int index) {
+ return index % INT_COLUMN_INTERVAL == 0;
+ }
+
+ private static void assertCellEqual(InternalRow actual, InternalRow expected, int col) {
+ if (isIntColumn(col)) {
+ check(actual.getInt(col) == expected.getInt(col), "INT mismatch at col=" + col);
+ } else {
+ check(
+ actual.getString(col).toString().equals(expected.getString(col).toString()),
+ "STRING mismatch at col=" + col);
+ }
+ }
+
+ private static void check(boolean condition, String message) {
+ if (!condition) {
+ throw new AssertionError(message);
+ }
+ }
+
+ private static FormatContext createFormatContext() {
+ return new FormatContext(new Options(), 1024, 1024, MemorySize.ofMebiBytes(128), 9, null);
+ }
+
+ private static void deleteRecursively(java.nio.file.Path dir) {
+ try {
+ Files.walk(dir)
+ .sorted(java.util.Comparator.reverseOrder())
+ .forEach(
+ p -> {
+ try {
+ Files.deleteIfExists(p);
+ } catch (IOException e) {
+ // ignore
+ }
+ });
+ } catch (IOException e) {
+ // ignore
+ }
+ }
+
+ private interface Runner {
+ void run(java.nio.file.Path tempDir) throws IOException;
+ }
+}
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java b/paimon-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java
new file mode 100644
index 000000000000..17e4d3a5a392
--- /dev/null
+++ b/paimon-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java
@@ -0,0 +1,1244 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.data.Decimal;
+import org.apache.paimon.data.GenericRow;
+import org.apache.paimon.data.InternalRow;
+import org.apache.paimon.data.Timestamp;
+import org.apache.paimon.format.FileFormatFactory.FormatContext;
+import org.apache.paimon.format.FormatReaderContext;
+import org.apache.paimon.format.FormatReaderFactory;
+import org.apache.paimon.format.FormatWriter;
+import org.apache.paimon.format.FormatWriterFactory;
+import org.apache.paimon.format.orc.OrcFileFormat;
+import org.apache.paimon.fs.Path;
+import org.apache.paimon.fs.PositionOutputStream;
+import org.apache.paimon.fs.local.LocalFileIO;
+import org.apache.paimon.options.MemorySize;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.reader.FileRecordReader;
+import org.apache.paimon.reader.RecordReader;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.RowType;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Tests for the Mosaic file format. */
+public class MosaicFileFormatTest {
+
+ @TempDir java.nio.file.Path tempDir;
+
+ @Test
+ public void testBasicRoundTrip() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("name", DataTypes.STRING())
+ .field("value", DataTypes.DOUBLE())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 100; i++) {
+ data.add(GenericRow.of(i, BinaryString.fromString("name_" + i), i * 1.5));
+ }
+
+ Path path = new Path(tempDir.toString(), "basic.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(100);
+ for (int i = 0; i < 100; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).getString(1).toString()).isEqualTo("name_" + i);
+ assertThat(result.get(i).getDouble(2)).isEqualTo(i * 1.5);
+ }
+ }
+
+ @Test
+ public void testProjectionPushdown() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("a", DataTypes.INT())
+ .field("b", DataTypes.STRING())
+ .field("c", DataTypes.BIGINT())
+ .field("d", DataTypes.DOUBLE())
+ .field("e", DataTypes.FLOAT())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 50; i++) {
+ data.add(
+ GenericRow.of(
+ i,
+ BinaryString.fromString("val_" + i),
+ (long) i * 100,
+ i * 2.5,
+ (float) i * 0.1f));
+ }
+
+ Path path = new Path(tempDir.toString(), "proj.mosaic");
+ write(rowType, data, path);
+
+ // Project only columns a and c
+ RowType projectedType =
+ RowType.builder()
+ .field("a", DataTypes.INT())
+ .field("c", DataTypes.BIGINT())
+ .build();
+
+ List result = read(rowType, projectedType, path);
+
+ assertThat(result).hasSize(50);
+ for (int i = 0; i < 50; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).getLong(1)).isEqualTo((long) i * 100);
+ }
+ }
+
+ @Test
+ public void testProjectionSkipsVariableLengthColumns() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("f_int", DataTypes.INT())
+ .field("f_str1", DataTypes.STRING())
+ .field("f_bytes", DataTypes.BYTES())
+ .field("f_str2", DataTypes.STRING())
+ .field("f_decimal_large", DataTypes.DECIMAL(30, 5))
+ .field("f_target", DataTypes.BIGINT())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 100; i++) {
+ data.add(
+ GenericRow.of(
+ i,
+ BinaryString.fromString("variable_length_string_" + i),
+ ("binary_data_" + i).getBytes(),
+ BinaryString.fromString("another_string_value_" + i),
+ Decimal.fromBigDecimal(
+ new BigDecimal("123456789012345678901234.12345"), 30, 5),
+ (long) i * 1000));
+ }
+
+ Path path = new Path(tempDir.toString(), "skip_varlen.mosaic");
+ write(rowType, data, path);
+
+ // Project only f_int and f_target, forcing reader to skip variable-length columns in
+ // between
+ RowType projectedType =
+ RowType.builder()
+ .field("f_int", DataTypes.INT())
+ .field("f_target", DataTypes.BIGINT())
+ .build();
+
+ List result = read(rowType, projectedType, path);
+
+ assertThat(result).hasSize(100);
+ for (int i = 0; i < 100; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).getLong(1)).isEqualTo((long) i * 1000);
+ }
+ }
+
+ @Test
+ public void testNullValues() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("name", DataTypes.STRING().nullable())
+ .field("value", DataTypes.DOUBLE().nullable())
+ .build();
+
+ List data = new ArrayList<>();
+ data.add(GenericRow.of(1, BinaryString.fromString("hello"), 1.0));
+ data.add(GenericRow.of(2, null, 2.0));
+ data.add(GenericRow.of(3, BinaryString.fromString("world"), null));
+ data.add(GenericRow.of(4, null, null));
+
+ Path path = new Path(tempDir.toString(), "nulls.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(4);
+
+ assertThat(result.get(0).getInt(0)).isEqualTo(1);
+ assertThat(result.get(0).getString(1).toString()).isEqualTo("hello");
+ assertThat(result.get(0).getDouble(2)).isEqualTo(1.0);
+
+ assertThat(result.get(1).getInt(0)).isEqualTo(2);
+ assertThat(result.get(1).isNullAt(1)).isTrue();
+ assertThat(result.get(1).getDouble(2)).isEqualTo(2.0);
+
+ assertThat(result.get(2).getInt(0)).isEqualTo(3);
+ assertThat(result.get(2).getString(1).toString()).isEqualTo("world");
+ assertThat(result.get(2).isNullAt(2)).isTrue();
+
+ assertThat(result.get(3).getInt(0)).isEqualTo(4);
+ assertThat(result.get(3).isNullAt(1)).isTrue();
+ assertThat(result.get(3).isNullAt(2)).isTrue();
+ }
+
+ @Test
+ public void testAllPrimitiveTypes() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("f_boolean", DataTypes.BOOLEAN())
+ .field("f_tinyint", DataTypes.TINYINT())
+ .field("f_smallint", DataTypes.SMALLINT())
+ .field("f_int", DataTypes.INT())
+ .field("f_bigint", DataTypes.BIGINT())
+ .field("f_float", DataTypes.FLOAT())
+ .field("f_double", DataTypes.DOUBLE())
+ .field("f_string", DataTypes.STRING())
+ .field("f_bytes", DataTypes.BYTES())
+ .field("f_decimal_compact", DataTypes.DECIMAL(10, 2))
+ .field("f_decimal_large", DataTypes.DECIMAL(30, 5))
+ .field("f_date", DataTypes.DATE())
+ .field("f_timestamp", DataTypes.TIMESTAMP(3))
+ .field("f_timestamp_high", DataTypes.TIMESTAMP(9))
+ .build();
+
+ List data = new ArrayList<>();
+ data.add(
+ GenericRow.of(
+ true,
+ (byte) 42,
+ (short) 1234,
+ 999999,
+ 123456789012345L,
+ 3.14f,
+ 2.718281828,
+ BinaryString.fromString("hello world"),
+ new byte[] {1, 2, 3, 4, 5},
+ Decimal.fromBigDecimal(new BigDecimal("12345.67"), 10, 2),
+ Decimal.fromBigDecimal(
+ new BigDecimal("123456789012345678901234.12345"), 30, 5),
+ 19000, // days since epoch
+ Timestamp.fromEpochMillis(1700000000000L),
+ Timestamp.fromEpochMillis(1700000000000L, 123456)));
+
+ Path path = new Path(tempDir.toString(), "all_types.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(1);
+ InternalRow row = result.get(0);
+ assertThat(row.getBoolean(0)).isTrue();
+ assertThat(row.getByte(1)).isEqualTo((byte) 42);
+ assertThat(row.getShort(2)).isEqualTo((short) 1234);
+ assertThat(row.getInt(3)).isEqualTo(999999);
+ assertThat(row.getLong(4)).isEqualTo(123456789012345L);
+ assertThat(row.getFloat(5)).isEqualTo(3.14f);
+ assertThat(row.getDouble(6)).isEqualTo(2.718281828);
+ assertThat(row.getString(7).toString()).isEqualTo("hello world");
+ assertThat(row.getBinary(8)).isEqualTo(new byte[] {1, 2, 3, 4, 5});
+ assertThat(row.getDecimal(9, 10, 2).toBigDecimal())
+ .isEqualByComparingTo(new BigDecimal("12345.67"));
+ assertThat(row.getDecimal(10, 30, 5).toBigDecimal())
+ .isEqualByComparingTo(new BigDecimal("123456789012345678901234.12345"));
+ assertThat(row.getInt(11)).isEqualTo(19000);
+ assertThat(row.getTimestamp(12, 3).getMillisecond()).isEqualTo(1700000000000L);
+ assertThat(row.getTimestamp(13, 9).getMillisecond()).isEqualTo(1700000000000L);
+ assertThat(row.getTimestamp(13, 9).getNanoOfMillisecond()).isEqualTo(123456);
+ }
+
+ @Test
+ public void testWideTable() throws IOException {
+ int columnCount = 10000;
+ int rowCount = 10;
+
+ RowType rowType = buildWideRowType(columnCount);
+ List data = new ArrayList<>();
+ for (int r = 0; r < rowCount; r++) {
+ Object[] fields = new Object[columnCount];
+ for (int c = 0; c < columnCount; c++) {
+ fields[c] = r * columnCount + c;
+ }
+ data.add(GenericRow.of(fields));
+ }
+
+ Path path = new Path(tempDir.toString(), "wide.mosaic");
+ LocalFileIO fileIO = new LocalFileIO();
+ write(rowType, data, path);
+ long mosaicSize = fileIO.getFileSize(path);
+
+ // Compare with ORC
+ Path orcPath = new Path(tempDir.toString(), "wide.orc");
+ OrcFileFormat orc =
+ new OrcFileFormat(
+ new FormatContext(
+ new Options(), 1024, 1024, MemorySize.ofMebiBytes(128), 9, null));
+ FormatWriterFactory orcWriterFactory = orc.createWriterFactory(rowType);
+ PositionOutputStream orcOut = fileIO.newOutputStream(orcPath, false);
+ FormatWriter orcWriter = orcWriterFactory.create(orcOut, "zstd");
+ for (InternalRow row : data) {
+ orcWriter.addElement(row);
+ }
+ orcWriter.close();
+ orcOut.close();
+ long orcSize = fileIO.getFileSize(orcPath);
+
+ System.out.println("=== Wide Table: Mosaic vs ORC ===");
+ System.out.printf("Mosaic: %,d bytes (%.1f KB)%n", mosaicSize, mosaicSize / 1024.0);
+ System.out.printf("ORC: %,d bytes (%.1f KB)%n", orcSize, orcSize / 1024.0);
+ System.out.printf("Ratio: ORC is %.1fx larger%n", (double) orcSize / mosaicSize);
+
+ assertThat(mosaicSize).isLessThan(orcSize);
+
+ // Verify correctness
+ List result = read(rowType, rowType, path);
+ assertThat(result).hasSize(rowCount);
+ for (int r = 0; r < rowCount; r++) {
+ for (int c = 0; c < columnCount; c++) {
+ assertThat(result.get(r).getInt(c)).isEqualTo(r * columnCount + c);
+ }
+ }
+ }
+
+ @Test
+ public void testWideTableProjection() throws IOException {
+ int columnCount = 10000;
+ int rowCount = 100;
+
+ RowType rowType = buildWideRowType(columnCount);
+ List data = new ArrayList<>();
+ for (int r = 0; r < rowCount; r++) {
+ Object[] fields = new Object[columnCount];
+ for (int c = 0; c < columnCount; c++) {
+ fields[c] = r * columnCount + c;
+ }
+ data.add(GenericRow.of(fields));
+ }
+
+ Path path = new Path(tempDir.toString(), "wide_proj.mosaic");
+ write(rowType, data, path);
+
+ // Project 10 columns
+ int[] projectedIndices = {0, 100, 500, 1000, 2000, 5000, 7000, 8000, 9000, 9999};
+ RowType projectedType = rowType.project(projectedIndices);
+
+ List result = read(rowType, projectedType, path);
+
+ assertThat(result).hasSize(rowCount);
+ for (int r = 0; r < rowCount; r++) {
+ for (int i = 0; i < projectedIndices.length; i++) {
+ int c = projectedIndices[i];
+ assertThat(result.get(r).getInt(i)).isEqualTo(r * columnCount + c);
+ }
+ }
+ }
+
+ @Test
+ public void testEmptyTable() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("name", DataTypes.STRING())
+ .build();
+
+ Path path = new Path(tempDir.toString(), "empty.mosaic");
+ write(rowType, new ArrayList<>(), path);
+ List result = read(rowType, rowType, path);
+ assertThat(result).isEmpty();
+ }
+
+ @Test
+ public void testSingleColumn() throws IOException {
+ RowType rowType = RowType.builder().field("id", DataTypes.INT()).build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 10; i++) {
+ data.add(GenericRow.of(i));
+ }
+
+ Path path = new Path(tempDir.toString(), "single.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(10);
+ for (int i = 0; i < 10; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ }
+ }
+
+ @Test
+ public void testMultiRowGroupStringStability() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("name", DataTypes.STRING())
+ .build();
+
+ // Use tiny writeBatchMemory to force multiple row groups
+ MosaicFileFormat format =
+ new MosaicFileFormat(
+ new FormatContext(
+ new Options(), 1024, 1024, MemorySize.ofBytes(1), 3, null));
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 100; i++) {
+ data.add(GenericRow.of(i, BinaryString.fromString("string_value_" + i)));
+ }
+
+ Path path = new Path(tempDir.toString(), "multi_rg_string.mosaic");
+ LocalFileIO fileIO = new LocalFileIO();
+ FormatWriterFactory writerFactory = format.createWriterFactory(rowType);
+ PositionOutputStream out = fileIO.newOutputStream(path, false);
+ FormatWriter writer = writerFactory.create(out, "zstd");
+ for (InternalRow row : data) {
+ writer.addElement(row);
+ }
+ writer.close();
+ out.close();
+
+ // Project only the string column
+ RowType projectedType = RowType.builder().field("name", DataTypes.STRING()).build();
+ FormatReaderFactory readerFactory =
+ format.createReaderFactory(rowType, projectedType, null);
+ FileRecordReader reader =
+ (FileRecordReader)
+ readerFactory.createReader(
+ new FormatReaderContext(fileIO, path, fileIO.getFileSize(path)));
+
+ // Read batches one by one; retain string values from earlier batches
+ List allStrings = new ArrayList<>();
+ RecordReader.RecordIterator batch;
+ while ((batch = reader.readBatch()) != null) {
+ InternalRow row;
+ while ((row = batch.next()) != null) {
+ allStrings.add(row.getString(0));
+ }
+ batch.releaseBatch();
+ }
+ reader.close();
+
+ // Verify all retained strings are still correct
+ assertThat(allStrings).hasSize(100);
+ for (int i = 0; i < 100; i++) {
+ assertThat(allStrings.get(i).toString()).isEqualTo("string_value_" + i);
+ }
+ }
+
+ // ==================== Columnar Encoding Tests ====================
+
+ @Test
+ public void testConstEncoding() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("const_int", DataTypes.INT())
+ .field("const_long", DataTypes.BIGINT())
+ .field("const_double", DataTypes.DOUBLE())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 200; i++) {
+ data.add(GenericRow.of(i, 42, 999L, 3.14));
+ }
+
+ Path path = new Path(tempDir.toString(), "const_enc.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(200);
+ for (int i = 0; i < 200; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).getInt(1)).isEqualTo(42);
+ assertThat(result.get(i).getLong(2)).isEqualTo(999L);
+ assertThat(result.get(i).getDouble(3)).isEqualTo(3.14);
+ }
+ }
+
+ @Test
+ public void testConstEncodingWithNulls() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("const_nullable", DataTypes.INT().nullable())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 100; i++) {
+ data.add(GenericRow.of(i, i % 3 == 0 ? null : 42));
+ }
+
+ Path path = new Path(tempDir.toString(), "const_null.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(100);
+ for (int i = 0; i < 100; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ if (i % 3 == 0) {
+ assertThat(result.get(i).isNullAt(1)).isTrue();
+ } else {
+ assertThat(result.get(i).getInt(1)).isEqualTo(42);
+ }
+ }
+ }
+
+ @Test
+ public void testBooleanConstEncoding() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("flag_true", DataTypes.BOOLEAN())
+ .field("flag_false", DataTypes.BOOLEAN())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 100; i++) {
+ data.add(GenericRow.of(i, true, false));
+ }
+
+ Path path = new Path(tempDir.toString(), "bool_const.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(100);
+ for (int i = 0; i < 100; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).getBoolean(1)).isTrue();
+ assertThat(result.get(i).getBoolean(2)).isFalse();
+ }
+ }
+
+ @Test
+ public void testBooleanDictEncoding() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("flag", DataTypes.BOOLEAN())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 100; i++) {
+ data.add(GenericRow.of(i, i % 2 == 0));
+ }
+
+ Path path = new Path(tempDir.toString(), "bool_dict.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(100);
+ for (int i = 0; i < 100; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).getBoolean(1)).isEqualTo(i % 2 == 0);
+ }
+ }
+
+ @Test
+ public void testDictEncoding() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("status", DataTypes.INT())
+ .field("category", DataTypes.BIGINT())
+ .field("level", DataTypes.SMALLINT())
+ .build();
+
+ int[] statuses = {1, 2, 3, 4, 5};
+ long[] categories = {100L, 200L, 300L};
+ short[] levels = {10, 20};
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 200; i++) {
+ data.add(GenericRow.of(i, statuses[i % 5], categories[i % 3], levels[i % 2]));
+ }
+
+ Path path = new Path(tempDir.toString(), "dict_enc.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(200);
+ for (int i = 0; i < 200; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).getInt(1)).isEqualTo(statuses[i % 5]);
+ assertThat(result.get(i).getLong(2)).isEqualTo(categories[i % 3]);
+ assertThat(result.get(i).getShort(3)).isEqualTo(levels[i % 2]);
+ }
+ }
+
+ @Test
+ public void testDictEncodingWithNulls() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("flag", DataTypes.TINYINT().nullable())
+ .build();
+
+ byte[] flags = {1, 2, 3};
+ List data = new ArrayList<>();
+ for (int i = 0; i < 100; i++) {
+ data.add(GenericRow.of(i, i % 4 == 0 ? null : flags[i % 3]));
+ }
+
+ Path path = new Path(tempDir.toString(), "dict_null.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(100);
+ for (int i = 0; i < 100; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ if (i % 4 == 0) {
+ assertThat(result.get(i).isNullAt(1)).isTrue();
+ } else {
+ assertThat(result.get(i).getByte(1)).isEqualTo(flags[i % 3]);
+ }
+ }
+ }
+
+ @Test
+ public void testDictEncodingBoundary() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("dict_255", DataTypes.INT())
+ .field("plain_256", DataTypes.INT())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 512; i++) {
+ data.add(GenericRow.of(i % 255, i % 256));
+ }
+
+ Path path = new Path(tempDir.toString(), "dict_boundary.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(512);
+ for (int i = 0; i < 512; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i % 255);
+ assertThat(result.get(i).getInt(1)).isEqualTo(i % 256);
+ }
+ }
+
+ @Test
+ public void testFloatDictEncoding() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("f_float", DataTypes.FLOAT())
+ .field("f_double", DataTypes.DOUBLE())
+ .build();
+
+ float[] floats = {1.5f, 2.5f, 3.5f};
+ double[] doubles = {10.1, 20.2};
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 100; i++) {
+ data.add(GenericRow.of(i, floats[i % 3], doubles[i % 2]));
+ }
+
+ Path path = new Path(tempDir.toString(), "float_dict.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(100);
+ for (int i = 0; i < 100; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).getFloat(1)).isEqualTo(floats[i % 3]);
+ assertThat(result.get(i).getDouble(2)).isEqualTo(doubles[i % 2]);
+ }
+ }
+
+ @Test
+ public void testAllNullEncoding() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("always_null_str", DataTypes.STRING().nullable())
+ .field("always_null_dbl", DataTypes.DOUBLE().nullable())
+ .field("always_null_int", DataTypes.INT().nullable())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 50; i++) {
+ data.add(GenericRow.of(i, null, null, null));
+ }
+
+ Path path = new Path(tempDir.toString(), "all_null_enc.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(50);
+ for (int i = 0; i < 50; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).isNullAt(1)).isTrue();
+ assertThat(result.get(i).isNullAt(2)).isTrue();
+ assertThat(result.get(i).isNullAt(3)).isTrue();
+ }
+ }
+
+ @Test
+ public void testMixedEncodings() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("plain_col", DataTypes.INT())
+ .field("const_col", DataTypes.BIGINT())
+ .field("dict_col", DataTypes.SMALLINT())
+ .field("all_null_col", DataTypes.DOUBLE().nullable())
+ .field("plain_str", DataTypes.STRING())
+ .build();
+
+ short[] dictValues = {10, 20, 30, 40, 50};
+ List data = new ArrayList<>();
+ for (int i = 0; i < 1000; i++) {
+ data.add(
+ GenericRow.of(
+ i, 999L, dictValues[i % 5], null, BinaryString.fromString("str_" + i)));
+ }
+
+ Path path = new Path(tempDir.toString(), "mixed_enc.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(1000);
+ for (int i = 0; i < 1000; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).getLong(1)).isEqualTo(999L);
+ assertThat(result.get(i).getShort(2)).isEqualTo(dictValues[i % 5]);
+ assertThat(result.get(i).isNullAt(3)).isTrue();
+ assertThat(result.get(i).getString(4).toString()).isEqualTo("str_" + i);
+ }
+ }
+
+ @Test
+ public void testMixedEncodingsWithProjection() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("plain_col", DataTypes.INT())
+ .field("const_col", DataTypes.BIGINT())
+ .field("dict_col", DataTypes.SMALLINT())
+ .field("all_null_col", DataTypes.DOUBLE().nullable())
+ .field("plain_str", DataTypes.STRING())
+ .build();
+
+ short[] dictValues = {10, 20, 30};
+ List data = new ArrayList<>();
+ for (int i = 0; i < 100; i++) {
+ data.add(
+ GenericRow.of(
+ i, 42L, dictValues[i % 3], null, BinaryString.fromString("s" + i)));
+ }
+
+ Path path = new Path(tempDir.toString(), "mixed_proj.mosaic");
+ write(rowType, data, path);
+
+ RowType projectedType =
+ RowType.builder()
+ .field("dict_col", DataTypes.SMALLINT())
+ .field("const_col", DataTypes.BIGINT())
+ .build();
+
+ List result = read(rowType, projectedType, path);
+ assertThat(result).hasSize(100);
+ for (int i = 0; i < 100; i++) {
+ assertThat(result.get(i).getShort(0)).isEqualTo(dictValues[i % 3]);
+ assertThat(result.get(i).getLong(1)).isEqualTo(42L);
+ }
+ }
+
+ // ==================== Schema Prefix Compression Tests ====================
+
+ @Test
+ public void testSchemaPrefixCompression() throws IOException {
+ int numCols = 100;
+ RowType.Builder builder = RowType.builder();
+ for (int i = 0; i < numCols; i++) {
+ builder.field(
+ "com.example.sensors.signal_" + String.format("%03d", i),
+ DataTypes.DOUBLE().nullable());
+ }
+ RowType rowType = builder.build();
+
+ List data = new ArrayList<>();
+ for (int r = 0; r < 50; r++) {
+ Object[] fields = new Object[numCols];
+ for (int c = 0; c < numCols; c++) {
+ fields[c] = (double) (r * numCols + c);
+ }
+ data.add(GenericRow.of(fields));
+ }
+
+ Path path = new Path(tempDir.toString(), "prefix.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(50);
+ for (int r = 0; r < 50; r++) {
+ for (int c = 0; c < numCols; c++) {
+ assertThat(result.get(r).getDouble(c)).isEqualTo((double) (r * numCols + c));
+ }
+ }
+
+ RowType projectedType =
+ RowType.builder()
+ .field("com.example.sensors.signal_050", DataTypes.DOUBLE().nullable())
+ .build();
+ List projected = read(rowType, projectedType, path);
+ assertThat(projected).hasSize(50);
+ for (int r = 0; r < 50; r++) {
+ assertThat(projected.get(r).getDouble(0)).isEqualTo((double) (r * numCols + 50));
+ }
+ }
+
+ @Test
+ public void testSchemaMixedPrefixAndNonPrefix() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("group.a.signal_1", DataTypes.DOUBLE())
+ .field("group.a.signal_2", DataTypes.DOUBLE())
+ .field("name", DataTypes.STRING())
+ .field("group.b.signal_1", DataTypes.FLOAT())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 20; i++) {
+ data.add(
+ GenericRow.of(
+ i,
+ (double) i,
+ (double) (i * 2),
+ BinaryString.fromString("n" + i),
+ (float) i));
+ }
+
+ Path path = new Path(tempDir.toString(), "mixed_prefix.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(20);
+ for (int i = 0; i < 20; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).getDouble(1)).isEqualTo((double) i);
+ assertThat(result.get(i).getDouble(2)).isEqualTo((double) (i * 2));
+ assertThat(result.get(i).getString(3).toString()).isEqualTo("n" + i);
+ assertThat(result.get(i).getFloat(4)).isEqualTo((float) i);
+ }
+ }
+
+ @Test
+ public void testSchemaSerializationRoundTrip() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("simple", DataTypes.INT())
+ .field("a.b.col1", DataTypes.DOUBLE())
+ .field("a.b.col2", DataTypes.STRING())
+ .field("x.y.z.col3", DataTypes.BIGINT())
+ .build();
+
+ MosaicSchema original = MosaicSchema.create(rowType, 10);
+ byte[] serialized = original.serialize();
+ MosaicSchema restored = MosaicSchema.deserialize(serialized);
+
+ assertThat(restored.numBuckets()).isEqualTo(10);
+
+ RowType projAll = rowType;
+ for (int b = 0; b < 10; b++) {
+ int[] origMapping = original.getProjectionMapping(b, projAll);
+ int[] restoredMapping = restored.getProjectionMapping(b, projAll);
+ if (origMapping == null) {
+ assertThat(restoredMapping).isNull();
+ } else {
+ assertThat(restoredMapping).isEqualTo(origMapping);
+ }
+ }
+ }
+
+ // ==================== ALL_NULL Column Pruning Tests ====================
+
+ @Test
+ public void testAllNullColumnPruningRoundTrip() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("null_col_1", DataTypes.DOUBLE().nullable())
+ .field("value", DataTypes.BIGINT())
+ .field("null_col_2", DataTypes.STRING().nullable())
+ .field("null_col_3", DataTypes.INT().nullable())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 100; i++) {
+ data.add(GenericRow.of(i, null, (long) i * 10, null, null));
+ }
+
+ Path path = new Path(tempDir.toString(), "prune.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(100);
+ for (int i = 0; i < 100; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).isNullAt(1)).isTrue();
+ assertThat(result.get(i).getLong(2)).isEqualTo((long) i * 10);
+ assertThat(result.get(i).isNullAt(3)).isTrue();
+ assertThat(result.get(i).isNullAt(4)).isTrue();
+ }
+ }
+
+ @Test
+ public void testProjectPrunedAllNullColumn() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("always_null", DataTypes.DOUBLE().nullable())
+ .field("value", DataTypes.INT())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 50; i++) {
+ data.add(GenericRow.of(i, null, i * 2));
+ }
+
+ Path path = new Path(tempDir.toString(), "proj_pruned.mosaic");
+ write(rowType, data, path);
+
+ RowType projNull =
+ RowType.builder().field("always_null", DataTypes.DOUBLE().nullable()).build();
+ List result = read(rowType, projNull, path);
+ assertThat(result).hasSize(50);
+ for (int i = 0; i < 50; i++) {
+ assertThat(result.get(i).isNullAt(0)).isTrue();
+ }
+
+ RowType projMixed =
+ RowType.builder()
+ .field("always_null", DataTypes.DOUBLE().nullable())
+ .field("value", DataTypes.INT())
+ .build();
+ List result2 = read(rowType, projMixed, path);
+ assertThat(result2).hasSize(50);
+ for (int i = 0; i < 50; i++) {
+ assertThat(result2.get(i).isNullAt(0)).isTrue();
+ assertThat(result2.get(i).getInt(1)).isEqualTo(i * 2);
+ }
+ }
+
+ @Test
+ public void testAllNullPruningWideTable() throws IOException {
+ int totalCols = 500;
+ int nonNullCols = 50;
+
+ RowType.Builder builder = RowType.builder();
+ for (int i = 0; i < totalCols; i++) {
+ builder.field("col_" + String.format("%04d", i), DataTypes.INT().nullable());
+ }
+ RowType rowType = builder.build();
+
+ List data = new ArrayList<>();
+ for (int r = 0; r < 100; r++) {
+ Object[] fields = new Object[totalCols];
+ for (int c = 0; c < nonNullCols; c++) {
+ fields[c] = r * totalCols + c;
+ }
+ data.add(GenericRow.of(fields));
+ }
+
+ Path path = new Path(tempDir.toString(), "wide_prune.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(100);
+ for (int r = 0; r < 100; r++) {
+ for (int c = 0; c < nonNullCols; c++) {
+ assertThat(result.get(r).getInt(c)).isEqualTo(r * totalCols + c);
+ }
+ for (int c = nonNullCols; c < totalCols; c++) {
+ assertThat(result.get(r).isNullAt(c)).isTrue();
+ }
+ }
+
+ // Verify pruning reduced schema size (compared to no pruning)
+ LocalFileIO fileIO = new LocalFileIO();
+ long prunedFileSize = fileIO.getFileSize(path);
+
+ // Write same data without pruning (multi-row-group forces no pruning)
+ Path noPrunePath = new Path(tempDir.toString(), "wide_no_prune.mosaic");
+ MosaicFileFormat tinyFormat =
+ new MosaicFileFormat(
+ new FormatContext(
+ new Options(), 1024, 1024, MemorySize.ofBytes(1), 3, null));
+ FormatWriterFactory noPruneFactory = tinyFormat.createWriterFactory(rowType);
+ PositionOutputStream noPruneOut = fileIO.newOutputStream(noPrunePath, false);
+ FormatWriter noPruneWriter = noPruneFactory.create(noPruneOut, "zstd");
+ for (InternalRow row : data) {
+ noPruneWriter.addElement(row);
+ }
+ noPruneWriter.close();
+ noPruneOut.close();
+ long noPruneSize = fileIO.getFileSize(noPrunePath);
+
+ System.out.printf(
+ "Pruning test: pruned=%,d bytes, unpruned=%,d bytes, saved=%.0f%%%n",
+ prunedFileSize, noPruneSize, (1.0 - (double) prunedFileSize / noPruneSize) * 100);
+ assertThat(prunedFileSize).isLessThan(noPruneSize);
+ }
+
+ @Test
+ public void testMultiRowGroupNoPruning() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("nullable", DataTypes.INT().nullable())
+ .build();
+
+ MosaicFileFormat format =
+ new MosaicFileFormat(
+ new FormatContext(
+ new Options(), 1024, 1024, MemorySize.ofBytes(1), 3, null));
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 100; i++) {
+ data.add(GenericRow.of(i, i == 0 ? 42 : null));
+ }
+
+ Path path = new Path(tempDir.toString(), "multi_rg_no_prune.mosaic");
+ LocalFileIO fileIO = new LocalFileIO();
+ FormatWriterFactory writerFactory = format.createWriterFactory(rowType);
+ PositionOutputStream out = fileIO.newOutputStream(path, false);
+ FormatWriter writer = writerFactory.create(out, "zstd");
+ for (InternalRow row : data) {
+ writer.addElement(row);
+ }
+ writer.close();
+ out.close();
+
+ FormatReaderFactory readerFactory = format.createReaderFactory(rowType, rowType, null);
+ List result = new ArrayList<>();
+ try (RecordReader reader =
+ readerFactory.createReader(
+ new FormatReaderContext(fileIO, path, fileIO.getFileSize(path)))) {
+ reader.forEachRemaining(
+ row -> {
+ Object[] fields = new Object[rowType.getFieldCount()];
+ for (int i = 0; i < fields.length; i++) {
+ if (!row.isNullAt(i)) {
+ fields[i] =
+ InternalRow.createFieldGetter(rowType.getTypeAt(i), i)
+ .getFieldOrNull(row);
+ }
+ }
+ result.add(GenericRow.of(fields));
+ });
+ }
+
+ assertThat(result).hasSize(100);
+ assertThat(result.get(0).getInt(0)).isEqualTo(0);
+ assertThat(result.get(0).getInt(1)).isEqualTo(42);
+ for (int i = 1; i < 100; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).isNullAt(1)).isTrue();
+ }
+ }
+
+ @Test
+ public void testAllColumnsAllNull() throws IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("a", DataTypes.INT().nullable())
+ .field("b", DataTypes.STRING().nullable())
+ .field("c", DataTypes.DOUBLE().nullable())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 30; i++) {
+ data.add(GenericRow.of(null, null, null));
+ }
+
+ Path path = new Path(tempDir.toString(), "all_cols_null.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(30);
+ for (int i = 0; i < 30; i++) {
+ assertThat(result.get(i).isNullAt(0)).isTrue();
+ assertThat(result.get(i).isNullAt(1)).isTrue();
+ assertThat(result.get(i).isNullAt(2)).isTrue();
+ }
+ }
+
+ // ==================== Helpers ====================
+
+ private void write(RowType rowType, List data, Path path) throws IOException {
+ LocalFileIO fileIO = new LocalFileIO();
+ MosaicFileFormat format = createFormat();
+ FormatWriterFactory writerFactory = format.createWriterFactory(rowType);
+ PositionOutputStream out = fileIO.newOutputStream(path, false);
+ FormatWriter writer = writerFactory.create(out, "zstd");
+ for (InternalRow row : data) {
+ writer.addElement(row);
+ }
+ writer.close();
+ out.close();
+ }
+
+ private List read(RowType dataType, RowType projectedType, Path path)
+ throws IOException {
+ LocalFileIO fileIO = new LocalFileIO();
+ MosaicFileFormat format = createFormat();
+ FormatReaderFactory readerFactory =
+ format.createReaderFactory(dataType, projectedType, null);
+ RecordReader reader =
+ readerFactory.createReader(
+ new FormatReaderContext(fileIO, path, fileIO.getFileSize(path)));
+
+ List result = new ArrayList<>();
+ reader.forEachRemaining(
+ row -> {
+ int fieldCount = projectedType.getFieldCount();
+ Object[] fields = new Object[fieldCount];
+ for (int i = 0; i < fieldCount; i++) {
+ if (row.isNullAt(i)) {
+ fields[i] = null;
+ } else {
+ fields[i] =
+ InternalRow.createFieldGetter(projectedType.getTypeAt(i), i)
+ .getFieldOrNull(row);
+ }
+ }
+ result.add(GenericRow.of(fields));
+ });
+ reader.close();
+ return result;
+ }
+
+ @Test
+ public void testLongConstantString() throws IOException {
+ // 1KB constant string — CONST should work regardless of value length
+ String longStr = repeatChar('x', 1024);
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("long_const", DataTypes.STRING())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 200; i++) {
+ data.add(GenericRow.of(i, BinaryString.fromString(longStr)));
+ }
+
+ Path path = new Path(tempDir.toString(), "long_const.mosaic");
+ write(rowType, data, path);
+
+ // Verify CONST is smaller than PLAIN (200 * 1KB = 200KB plain, CONST = 1KB)
+ long fileSize = tempDir.toFile().toPath().resolve("long_const.mosaic").toFile().length();
+
+ List result = read(rowType, rowType, path);
+ assertThat(result).hasSize(200);
+ for (int i = 0; i < 200; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).getString(1).toString()).isEqualTo(longStr);
+ }
+ }
+
+ @Test
+ public void testLongConstantStringWithNulls() throws IOException {
+ String longStr = repeatChar('y', 2048);
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("long_const_nullable", DataTypes.STRING().nullable())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 100; i++) {
+ data.add(GenericRow.of(i, i % 3 == 0 ? null : BinaryString.fromString(longStr)));
+ }
+
+ Path path = new Path(tempDir.toString(), "long_const_null.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(100);
+ for (int i = 0; i < 100; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ if (i % 3 == 0) {
+ assertThat(result.get(i).isNullAt(1)).isTrue();
+ } else {
+ assertThat(result.get(i).getString(1).toString()).isEqualTo(longStr);
+ }
+ }
+ }
+
+ @Test
+ public void testRepeatedLongStringsDict() throws IOException {
+ // 5 distinct 500-byte strings — should use DICT encoding
+ String[] values = new String[5];
+ for (int i = 0; i < 5; i++) {
+ values[i] = repeatChar((char) ('A' + i), 500);
+ }
+
+ RowType rowType =
+ RowType.builder()
+ .field("id", DataTypes.INT())
+ .field("long_dict", DataTypes.STRING())
+ .build();
+
+ List data = new ArrayList<>();
+ for (int i = 0; i < 500; i++) {
+ data.add(GenericRow.of(i, BinaryString.fromString(values[i % 5])));
+ }
+
+ Path path = new Path(tempDir.toString(), "long_dict.mosaic");
+ write(rowType, data, path);
+ List result = read(rowType, rowType, path);
+
+ assertThat(result).hasSize(500);
+ for (int i = 0; i < 500; i++) {
+ assertThat(result.get(i).getInt(0)).isEqualTo(i);
+ assertThat(result.get(i).getString(1).toString()).isEqualTo(values[i % 5]);
+ }
+ }
+
+ private MosaicFileFormat createFormat() {
+ return new MosaicFileFormat(
+ new FormatContext(new Options(), 1024, 1024, MemorySize.ofMebiBytes(128), 3, null));
+ }
+
+ private static String repeatChar(char c, int count) {
+ char[] chars = new char[count];
+ Arrays.fill(chars, c);
+ return new String(chars);
+ }
+
+ private RowType buildWideRowType(int columnCount) {
+ RowType.Builder builder = RowType.builder();
+ for (int i = 0; i < columnCount; i++) {
+ builder.field(
+ String.format(
+ "this_is_a_very_long_column_name_for_testing_compression_ratio_column_index_%05d",
+ i),
+ DataTypes.INT());
+ }
+ return builder.build();
+ }
+}