Skip to content

Commit fe013bc

Browse files
authored
feat(C++): remove hardcoded row group size, keep 64M default (#872)
1 parent 31b78b0 commit fe013bc

3 files changed

Lines changed: 13 additions & 3 deletions

File tree

cpp/src/graphar/filesystem.cc

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -266,8 +266,9 @@ Status FileSystem::WriteTableToFile(
266266
}
267267
case FileType::PARQUET: {
268268
auto schema = table->schema();
269+
auto row_group_size = options->getParquetMaxRowGroupLength();
269270
RETURN_NOT_ARROW_OK(parquet::arrow::WriteTable(
270-
*table, arrow::default_memory_pool(), output_stream, 64 * 1024 * 1024,
271+
*table, arrow::default_memory_pool(), output_stream, row_group_size,
271272
options->getParquetWriterProperties(),
272273
options->getArrowWriterProperties()));
273274
break;
@@ -300,8 +301,9 @@ Status FileSystem::WriteLabelTableToFile(
300301
parquet::WriterProperties::Builder builder;
301302
builder.compression(arrow::Compression::type::ZSTD); // enable compression
302303
builder.encoding(parquet::Encoding::RLE);
304+
auto row_group_size = builder.build()->max_row_group_length();
303305
RETURN_NOT_ARROW_OK(parquet::arrow::WriteTable(
304-
*table, arrow::default_memory_pool(), output_stream, 64 * 1024 * 1024,
306+
*table, arrow::default_memory_pool(), output_stream, row_group_size,
305307
builder.build(), parquet::default_arrow_writer_properties()));
306308
return Status::OK();
307309
}

cpp/src/graphar/writer_util.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,13 @@ WriterOptions::getParquetWriterProperties() const {
8686
return builder.build();
8787
}
8888

89+
int64_t WriterOptions::getParquetMaxRowGroupLength() const {
90+
if (parquetOption_) {
91+
return parquetOption_->max_row_group_length;
92+
}
93+
return parquet::WriterProperties::Builder().build()->max_row_group_length();
94+
}
95+
8996
std::shared_ptr<parquet::ArrowWriterProperties>
9097
WriterOptions::getArrowWriterProperties() const {
9198
parquet::ArrowWriterProperties::Builder builder;

cpp/src/graphar/writer_util.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ class WriterOptions {
9494
std::vector<::parquet::SortingColumn> sorting_columns;
9595
int64_t dictionary_pagesize_limit = 1024 * 1024;
9696
int64_t write_batch_size = 1024;
97-
int64_t max_row_group_length = 1024 * 1024;
97+
int64_t max_row_group_length = 64 * 1024 * 1024;
9898
int64_t data_pagesize = 1024 * 1024;
9999
size_t max_statistics_size = 4096;
100100
int compression_level = std::numeric_limits<int>::min();
@@ -429,6 +429,7 @@ class WriterOptions {
429429
std::shared_ptr<parquet::WriterProperties> getParquetWriterProperties() const;
430430
std::shared_ptr<parquet::ArrowWriterProperties> getArrowWriterProperties()
431431
const;
432+
int64_t getParquetMaxRowGroupLength() const;
432433
#ifdef ARROW_ORC
433434
arrow::adapters::orc::WriteOptions getOrcOption() const;
434435
#endif

0 commit comments

Comments
 (0)