apache · JingsongLi · May 15, 2026 · May 11, 2026 · May 11, 2026 · May 12, 2026
diff --git a/docs/content/pypaimon/ray-data.md b/docs/content/pypaimon/ray-data.md
@@ -207,11 +207,57 @@ write_paimon(
 )
 ```
 
+**Cluster rows by (partition, bucket) before writing:**
+
+For HASH_FIXED tables, Ray's default round-robin block distribution can
+scatter rows that share the same `(partition, bucket)` across many Ray
+tasks. Each task opens its own writer and emits its own data file, so
+the write can produce up to `partitions × buckets × ray_tasks` files
+instead of the `partitions × buckets` the writer would naturally produce.
+
+`shuffle=True` clusters rows by `(partition_keys..., bucket)` so each
+group lands in one Ray task — one writer, one file group:
+
+```python
+write_paimon(
+    ray_dataset,
+    "database_name.table_name",
+    catalog_options={"warehouse": "/path/to/warehouse"},
+    shuffle=True,
+)
+```
+
+Bucket assignment uses the same hash routine the writer uses, so the
+bucket computed at shuffle time is identical to the writer's. For
+non-HASH_FIXED tables, `shuffle=True` logs a warning and is skipped; the
+write still succeeds.
+
+`override_num_blocks` is an optional Ray output block count
+(mirrors the same-named option on `read_paimon`). With `shuffle=True`
+it is a parallelism hint for the groupby shuffle; with `shuffle=False`
+it triggers a plain block rebalance to that count:
+
+```python
+write_paimon(
+    ray_dataset,
+    "database_name.table_name",
+    catalog_options={"warehouse": "/path/to/warehouse"},
+    override_num_blocks=4,
+)
+```
+
 **Parameters:**
 - `dataset`: the Ray Dataset to write.
 - `table_identifier`: full table name, e.g. `"db_name.table_name"`.
 - `catalog_options`: kwargs forwarded to `CatalogFactory.create()`.
 - `overwrite`: if `True`, overwrite existing data in the table.
+- `shuffle`: if `True` and the target is HASH_FIXED, cluster rows by
+  `(partition_keys..., bucket)` so each (partition, bucket) lands in
+  one Ray task. Non-HASH_FIXED tables log a warning and fall back to
+  no-shuffle. Defaults to `False`.
+- `override_num_blocks`: optional Ray output block count. Must be `>= 1`.
+  With `shuffle=True`, a parallelism hint for the groupby shuffle;
+  with `shuffle=False`, a plain Ray block rebalance to that count.
 - `concurrency`: optional max number of Ray write tasks to run concurrently.
 - `ray_remote_args`: optional kwargs passed to `ray.remote()` in write tasks
   (e.g. `{"num_cpus": 2}`).

diff --git a/paimon-python/pypaimon/ray/ray_paimon.py b/paimon-python/pypaimon/ray/ray_paimon.py
@@ -107,6 +107,8 @@ def write_paimon(
     catalog_options: Dict[str, str],
     *,
     overwrite: bool = False,
+    shuffle: bool = False,
+    override_num_blocks: Optional[int] = None,
     concurrency: Optional[int] = None,
     ray_remote_args: Optional[Dict[str, Any]] = None,
 ) -> None:
@@ -117,15 +119,39 @@ def write_paimon(
         table_identifier: Full table name, e.g. ``"db_name.table_name"``.
         catalog_options: Options passed to ``CatalogFactory.create()``.
         overwrite: If ``True``, overwrite existing data in the table.
+        shuffle: When ``True`` and the target is a HASH_FIXED table, cluster
+            rows by ``(partition_keys..., bucket)`` so each (partition,
+            bucket) lands in one Ray task — reduces the small-file count
+            for distributed writes. Non-HASH_FIXED tables log a warning
+            and fall back to no-shuffle. Defaults to ``False`` (Ray's
+            default round-robin distribution).
+        override_num_blocks: Optional Ray output block count. Must be
+            ``>= 1`` when set. With ``shuffle=True``, used as a
+            parallelism hint for the groupby shuffle; with
+            ``shuffle=False``, triggers a plain block rebalance to that
+            count. ``None`` (default) skips the rebalance. Mirrors the
+            ``override_num_blocks`` parameter on :func:`read_paimon`.
         concurrency: Optional max number of Ray write tasks to run concurrently.
         ray_remote_args: Optional kwargs passed to ``ray.remote`` in write tasks.
     """
     from pypaimon.catalog.catalog_factory import CatalogFactory
+    from pypaimon.ray.shuffle import maybe_apply_repartition
     from pypaimon.write.ray_datasink import PaimonDatasink
 
+    if override_num_blocks is not None and override_num_blocks < 1:
+        raise ValueError(
+            "override_num_blocks must be at least 1, got {}".format(
+                override_num_blocks)
+        )
+
     catalog = CatalogFactory.create(catalog_options)
     table = catalog.get_table(table_identifier)
 
+    dataset, _ = maybe_apply_repartition(
+        dataset, table,
+        shuffle=shuffle, override_num_blocks=override_num_blocks,
+    )
+
     datasink = PaimonDatasink(table, overwrite=overwrite)
 
     write_kwargs = {}

diff --git a/paimon-python/pypaimon/ray/shuffle.py b/paimon-python/pypaimon/ray/shuffle.py
@@ -0,0 +1,157 @@
+################################################################################
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+"""Pre-repartition a Ray Dataset by (partition, bucket) before writing
+to a Paimon table.
+
+Without this, Ray's default round-robin block distribution scatters rows
+that share the same (partition, bucket) across many Ray tasks. Each
+task then opens its own writer and emits its own data file, producing
+up to ``partitions × buckets × ray_tasks`` files instead of the
+``partitions × buckets`` the writer would naturally produce.
+
+When ``shuffle=True`` and the table is HASH_FIXED, we group rows by
+``(partition_keys..., bucket)`` so every distinct group lands in a
+single Ray task. ``bucket`` is computed using the same
+``FixedBucketRowKeyExtractor`` the writer uses, so the shuffle-time
+bucket assignment is identical to the writer's.
+
+For any other bucket mode the shuffle is skipped with a warning rather
+than an error. ``shuffle=False`` is the default; unless
+``override_num_blocks`` is set, the dataset is passed through unchanged.
+"""
+
+import logging
+from typing import TYPE_CHECKING, List, Optional, Tuple
+
+import pyarrow as pa
+
+from pypaimon.table.bucket_mode import BucketMode
+
+if TYPE_CHECKING:
+    import ray.data
+
+    from pypaimon.table.table import Table
+
+logger = logging.getLogger(__name__)
+
+# Transient column the helper appends before the groupby and drops
+# afterwards, so it never reaches the sink.
+BUCKET_KEY_COL = "__paimon_bucket__"
+
+
+def maybe_apply_repartition(
+        dataset: "ray.data.Dataset",
+        table: "Table",
+        *,
+        shuffle: bool,
+        override_num_blocks: Optional[int],
+) -> Tuple["ray.data.Dataset", bool]:
+    """Optionally rewrite ``dataset`` so rows are clustered for the writer.
+
+    Args:
+        dataset: The input Ray Dataset.
+        table: The Paimon target table (used for bucket mode + schema).
+        shuffle: When True, group rows by ``(partition_keys..., bucket)``.
+            On non-HASH_FIXED tables, logs a warning and behaves as False.
+        override_num_blocks: Optional. When ``shuffle=True``, used as the
+            ``num_partitions`` hint for the groupby shuffle. When
+            ``shuffle=False``, triggers a plain block rebalance to that
+            count. ``None`` + ``shuffle=False`` means no-op.
+
+    Returns:
+        ``(dataset, was_shuffle_applied)`` — the dataset to hand to the
+        sink, and whether the (partition, bucket) groupby was applied.
+    """
+    if not shuffle and override_num_blocks is None:
+        return dataset, False
+
+    if shuffle:
+        bucket_mode = table.bucket_mode()
+        if bucket_mode != BucketMode.HASH_FIXED:
+            logger.warning(
+                "shuffle=True requires a HASH_FIXED table; got %s. "
+                "Falling back to no-shuffle write.",
+                bucket_mode.name,
+            )
+            shuffle = False
+
+    if shuffle:
+        partition_keys = list(table.table_schema.partition_keys or [])
+        extractor = table.create_row_key_extractor()
+        bucket_udf = _make_bucket_udf(extractor)
+
+        ds_with_bucket = dataset.map_batches(
+            bucket_udf, batch_format="pyarrow", zero_copy_batch=True,
+        )
+        group_keys: List[str] = partition_keys + [BUCKET_KEY_COL]
+        grouped = ds_with_bucket.groupby(
+            group_keys, num_partitions=override_num_blocks,
+        )
+        regrouped = grouped.map_groups(_identity_batch, batch_format="pyarrow")
+        return regrouped.drop_columns([BUCKET_KEY_COL]), True
+
+    # After a soft fallback, override_num_blocks may still be None —
+    # keep the contract that None means "no Ray-side repartition".
+    if override_num_blocks is None:
+        return dataset, False
+    return dataset.repartition(override_num_blocks, shuffle=False), False
+
+
+def _identity_batch(batch: pa.Table) -> pa.Table:
+    # Some Ray versions promote ``string`` to ``large_string`` (and
+    # ``binary`` to ``large_binary``) while materialising blocks for
+    # ``groupby().map_groups``. Paimon's writer compares schemas with a
+    # strict ``!=`` and rejects the large variants, so coerce them back
+    # to the regular types here. Other Arrow types pass through.
+    return _coerce_large_string_types(batch)
+
+
+def _coerce_large_string_types(batch: pa.Table) -> pa.Table:
+    needs_cast = False
+    fields = []
+    for field in batch.schema:
+        if pa.types.is_large_string(field.type):
+            fields.append(field.with_type(pa.string()))
+            needs_cast = True
+        elif pa.types.is_large_binary(field.type):
+            fields.append(field.with_type(pa.binary()))
+            needs_cast = True
+        else:
+            fields.append(field)
+    return batch.cast(pa.schema(fields)) if needs_cast else batch
+
+
+def _make_bucket_udf(extractor):
+    """Build a map_batches UDF that appends BUCKET_KEY_COL.
+
+    The bucket value comes from ``extract_partition_bucket_batch`` so it
+    matches the writer's bucket assignment for the same row exactly.
+    """
+    def _udf(batch: pa.Table) -> pa.Table:
+        if batch.num_rows == 0:
+            return batch.append_column(
+                BUCKET_KEY_COL, pa.array([], type=pa.int32())
+            )
+        record_batch = batch.combine_chunks().to_batches()[0]
+        _, buckets = extractor.extract_partition_bucket_batch(record_batch)
+        return batch.append_column(
+            BUCKET_KEY_COL, pa.array(buckets, type=pa.int32())
+        )
+
+    return _udf