diff --git a/src/app/repl.c b/src/app/repl.c index 4dd36fba..ab589ca6 100644 --- a/src/app/repl.c +++ b/src/app/repl.c @@ -754,6 +754,7 @@ static void eval_and_print(ray_term_t* term, const char* input, } if (profiling) profile_print(use_color); + ray_heap_gc(); } /* `type_label` and `cmd_match` were inlined into the previous bespoke @@ -1263,5 +1264,6 @@ int ray_repl_run_file(const char* path) { /* profile tree goes to stdout via profile_print; honour stdout's tty. */ if (profiling) profile_print(color_out); + ray_heap_gc(); return rc; } diff --git a/src/core/platform.c b/src/core/platform.c index a386b32d..e630cd9a 100644 --- a/src/core/platform.c +++ b/src/core/platform.c @@ -87,10 +87,6 @@ void ray_vm_advise_seq(void* ptr, size_t size) { if (ptr) madvise(ptr, size, MADV_SEQUENTIAL); } -void ray_vm_advise_willneed(void* ptr, size_t size) { - if (ptr) madvise(ptr, size, MADV_WILLNEED); -} - void ray_vm_release(void* ptr, size_t size) { if (!ptr) return; #if defined(RAY_OS_MACOS) @@ -423,7 +419,6 @@ void ray_vm_unmap_file(void* ptr, size_t size) { /* madvise hints are advisory and have no analog on WASM — no-ops. 
*/ void ray_vm_advise_seq(void* ptr, size_t size) { (void)ptr; (void)size; } -void ray_vm_advise_willneed(void* ptr, size_t size) { (void)ptr; (void)size; } void ray_vm_release(void* ptr, size_t size) { (void)ptr; (void)size; } void* ray_vm_alloc_aligned(size_t size, size_t alignment) { diff --git a/src/core/platform.h b/src/core/platform.h index cad406a1..4a4828cb 100644 --- a/src/core/platform.h +++ b/src/core/platform.h @@ -139,7 +139,6 @@ void ray_vm_free(void* ptr, size_t size); void* ray_vm_map_file(const char* path, size_t* out_size); void ray_vm_unmap_file(void* ptr, size_t size); void ray_vm_advise_seq(void* ptr, size_t size); -void ray_vm_advise_willneed(void* ptr, size_t size); void ray_vm_release(void* ptr, size_t size); void* ray_vm_alloc_aligned(size_t size, size_t alignment); diff --git a/src/io/csv.c b/src/io/csv.c index 59abf903..e5f810bd 100644 --- a/src/io/csv.c +++ b/src/io/csv.c @@ -25,7 +25,7 @@ * csv.c — Fast parallel CSV reader * * Design: - * 1. mmap + MAP_POPULATE for zero-copy file access + * 1. mmap for zero-copy file access * 2. memchr-based newline scan for row offset discovery * 3. Single-pass: sample-based type inference, then parallel value parsing * 4. 
Inline integer/float parsers (bypass strtoll/strtod overhead) @@ -44,7 +44,9 @@ #include "core/pool.h" #include "lang/format.h" #include "ops/hash.h" +#include "store/col.h" #include "store/fileio.h" +#include "store/splay.h" #include "table/sym.h" #include "vec/str.h" @@ -68,16 +70,13 @@ #define CSV_MAX_COLS 256 #define CSV_SAMPLE_ROWS 100 +#define CSV_PART_ROWS_DEFAULT 1000000 /* -------------------------------------------------------------------------- * mmap flags * -------------------------------------------------------------------------- */ -#ifdef __linux__ - #define MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE) -#else - #define MMAP_FLAGS MAP_PRIVATE -#endif +#define MMAP_FLAGS MAP_PRIVATE /* -------------------------------------------------------------------------- * Scratch memory helpers (same pattern as exec.c). @@ -243,6 +242,47 @@ static csv_type_t promote_csv_type(csv_type_t cur, csv_type_t obs) { return CSV_TYPE_STR; } +static void csv_cardinality_note(uint32_t hashes[CSV_MAX_COLS][CSV_SAMPLE_ROWS], + uint16_t lens[CSV_MAX_COLS][CSV_SAMPLE_ROWS], + uint16_t* distinct, uint16_t* non_null, + int col, const char* fld, size_t flen) { + if (flen == 0) return; + uint32_t h = (uint32_t)ray_hash_bytes(fld, flen); + uint16_t l = flen > UINT16_MAX ? 
UINT16_MAX : (uint16_t)flen; + for (uint16_t i = 0; i < distinct[col]; i++) { + if (hashes[col][i] == h && lens[col][i] == l) { + non_null[col]++; + return; + } + } + if (distinct[col] < CSV_SAMPLE_ROWS) { + hashes[col][distinct[col]] = h; + lens[col][distinct[col]] = l; + distinct[col]++; + } + non_null[col]++; +} + +static int8_t csv_resolve_inferred_type(csv_type_t t, + uint16_t distinct, + uint16_t non_null) { + switch (t) { + case CSV_TYPE_BOOL: return RAY_BOOL; + case CSV_TYPE_I64: return RAY_I64; + case CSV_TYPE_F64: return RAY_F64; + case CSV_TYPE_DATE: return RAY_DATE; + case CSV_TYPE_TIME: return RAY_TIME; + case CSV_TYPE_TIMESTAMP: return RAY_TIMESTAMP; + case CSV_TYPE_GUID: return RAY_GUID; + case CSV_TYPE_STR: + return (non_null >= 64 && + (uint32_t)distinct * 100u >= (uint32_t)non_null * 80u) + ? RAY_STR : RAY_SYM; + default: + return RAY_SYM; + } +} + /* -------------------------------------------------------------------------- * Zero-copy field scanner * @@ -588,6 +628,93 @@ static int64_t build_row_offsets(const char* buf, size_t buf_size, return n; } +static int64_t build_row_offsets_limited(const char* buf, size_t buf_size, + size_t data_offset, int64_t max_rows, + bool data_has_quotes, + int64_t** offsets_out, ray_t** hdr_out, + size_t* next_offset_out) { + const char* p = buf + data_offset; + const char* end = buf + buf_size; + + *offsets_out = NULL; + *hdr_out = NULL; + if (next_offset_out) *next_offset_out = data_offset; + if (max_rows <= 0 || p >= end) return 0; + + size_t remaining = (size_t)(end - p); + int64_t est = (int64_t)(remaining / 40) + 16; + if (est < 1) est = 1; + if (est > max_rows) est = max_rows; + + ray_t* hdr = NULL; + int64_t* offs = (int64_t*)scratch_alloc(&hdr, (size_t)est * sizeof(int64_t)); + if (!offs) return 0; + + int64_t n = 0; + offs[n++] = (int64_t)(p - buf); + + if (RAY_LIKELY(!data_has_quotes)) { + for (;;) { + const char* nl = (const char*)memchr(p, '\n', (size_t)(end - p)); + if (!nl) { + p = end; + break; + } 
+ p = nl + 1; + if (p < end && *p == '\r') p++; + if (p >= end) break; + if (n >= max_rows) break; + if (n >= est) { + int64_t new_est = est * 2; + if (new_est > max_rows) new_est = max_rows; + offs = (int64_t*)scratch_realloc(&hdr, + (size_t)n * sizeof(int64_t), + (size_t)new_est * sizeof(int64_t)); + if (!offs) { + scratch_free(hdr); + return 0; + } + est = new_est; + } + offs[n++] = (int64_t)(p - buf); + } + } else { + bool in_quote = false; + while (p < end) { + char c = *p; + if (c == '"') { + in_quote = !in_quote; + p++; + } else if (!in_quote && (c == '\n' || c == '\r')) { + if (c == '\r' && p + 1 < end && *(p + 1) == '\n') p++; + p++; + if (p >= end) break; + if (n >= max_rows) break; + if (n >= est) { + int64_t new_est = est * 2; + if (new_est > max_rows) new_est = max_rows; + offs = (int64_t*)scratch_realloc(&hdr, + (size_t)n * sizeof(int64_t), + (size_t)new_est * sizeof(int64_t)); + if (!offs) { + scratch_free(hdr); + return 0; + } + est = new_est; + } + offs[n++] = (int64_t)(p - buf); + } else { + p++; + } + } + } + + *offsets_out = offs; + *hdr_out = hdr; + if (next_offset_out) *next_offset_out = (size_t)(p - buf); + return n; +} + /* -------------------------------------------------------------------------- * Batch-intern string columns after parse. * Single-threaded — walks each string column, interns into global sym table, @@ -622,6 +749,7 @@ static bool csv_intern_strings(csv_strref_t** str_refs, int n_cols, /* RAY_STR columns are materialized directly; skip sym interning. */ if (resolved_types[c] == RAY_STR) continue; csv_strref_t* refs = str_refs[c]; + if (!refs) continue; uint32_t* ids = (uint32_t*)col_data[c]; uint8_t* nm = col_nullmaps ? 
col_nullmaps[c] : NULL; int64_t max_id = empty_sym_id; @@ -642,8 +770,7 @@ static bool csv_intern_strings(csv_strref_t** str_refs, int n_cols, nm[r >> 3] &= (uint8_t)~(1u << (r & 7)); continue; } - uint32_t hash = (uint32_t)ray_hash_bytes(refs[r].ptr, refs[r].len); - int64_t id = ray_sym_intern_prehashed(hash, refs[r].ptr, refs[r].len); + int64_t id = ray_sym_intern_no_split_unlocked(refs[r].ptr, refs[r].len); if (id < 0) { ok = false; id = 0; } ids[r] = (uint32_t)id; if (id > max_id) max_id = id; @@ -775,6 +902,7 @@ typedef struct { int n_cols; char delim; const csv_type_t* col_types; + const int8_t* resolved_types; void** col_data; /* non-const: workers write parsed values into columns */ csv_strref_t** str_refs; /* [n_cols] — strref arrays for string columns, NULL for others */ uint8_t** col_nullmaps; @@ -973,7 +1101,9 @@ static void csv_parse_fn(void* arg, uint32_t worker_id, static void csv_parse_serial(const char* buf, size_t buf_size, const int64_t* row_offsets, int64_t n_rows, int n_cols, char delim, - const csv_type_t* col_types, void** col_data, + const csv_type_t* col_types, + const int8_t* resolved_types, + void** col_data, csv_strref_t** str_refs, uint8_t** col_nullmaps, bool* col_had_null) { char esc_buf[8192]; @@ -1157,193 +1287,32 @@ static void csv_parse_serial(const char* buf, size_t buf_size, } } -/* -------------------------------------------------------------------------- - * ray_read_csv_opts — main CSV parser - * -------------------------------------------------------------------------- */ - -ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header, - const int8_t* col_types_in, int32_t n_types) { - /* ---- 1. Open file and get size ---- */ - int fd = open(path, O_RDONLY); - if (fd < 0) return ray_error("io", NULL); - - struct stat st; - if (fstat(fd, &st) != 0 || st.st_size <= 0) { - close(fd); - return ray_error("io", NULL); - } - size_t file_size = (size_t)st.st_size; - - /* ---- 2. 
mmap the file ---- */ - char* buf = (char*)mmap(NULL, file_size, PROT_READ, MMAP_FLAGS, fd, 0); - close(fd); - if (buf == MAP_FAILED) return ray_error("io", NULL); - -#ifdef __APPLE__ - madvise(buf, file_size, MADV_SEQUENTIAL); -#endif - - const char* buf_end = buf + file_size; - ray_t* result = NULL; - - /* ---- 3. Detect delimiter ---- */ - /* Delimiter auto-detected from header row only. Files where the header - * has a different delimiter distribution than data rows may be misdetected; - * pass an explicit delimiter for such files. Scanning additional data rows - * was considered but adds complexity for a rare edge case. */ - if (delimiter == 0) { - int commas = 0, tabs = 0; - for (const char* p = buf; p < buf_end && *p != '\n'; p++) { - if (*p == ',') commas++; - if (*p == '\t') tabs++; - } - delimiter = (tabs > commas) ? '\t' : ','; - } - - /* ---- 4. Count columns from first line ---- */ - int ncols = 1; - { - const char* p = buf; - bool in_quote = false; - while (p < buf_end && (in_quote || (*p != '\n' && *p != '\r'))) { - if (*p == '"') in_quote = !in_quote; - else if (!in_quote && *p == delimiter) ncols++; - p++; - } - } - if (ncols > CSV_MAX_COLS) { - munmap(buf, file_size); - /* fd already closed after mmap (line 1044) — do not close again */ - return ray_error("range", NULL); /* too many columns */ - } - - /* ---- 5. Parse header row ---- */ - const char* p = buf; - char esc_buf[8192]; - int64_t col_name_ids[CSV_MAX_COLS]; - - if (header) { - for (int c = 0; c < ncols; c++) { - const char* fld; - size_t flen; - char* dyn_esc = NULL; - p = scan_field(p, buf_end, delimiter, &fld, &flen, esc_buf, &dyn_esc); - col_name_ids[c] = ray_sym_intern(fld, flen); - if (dyn_esc) ray_sys_free(dyn_esc); - } - /* Consume exactly one line terminator (\r, \n, or \r\n) after the - * header row — NOT a run of newlines, because subsequent empty - * lines are null data rows. 
*/ - if (p < buf_end && *p == '\r') p++; - if (p < buf_end && *p == '\n') p++; - } else { - for (int c = 0; c < ncols; c++) { - char name[32]; - snprintf(name, sizeof(name), "V%d", c + 1); - col_name_ids[c] = ray_sym_intern(name, strlen(name)); - } - } - - size_t data_offset = (size_t)(p - buf); - - /* ---- 6. Build row offsets (memchr-accelerated) ---- */ - ray_t* row_offsets_hdr = NULL; - int64_t* row_offsets = NULL; - int64_t n_rows = build_row_offsets(buf, file_size, data_offset, - &row_offsets, &row_offsets_hdr); - - if (n_rows == 0) { - /* Empty file → empty table */ - ray_t* tbl = ray_table_new(ncols); - if (!tbl || RAY_IS_ERR(tbl)) goto fail_unmap; - for (int c = 0; c < ncols; c++) { - ray_t* empty_vec = ray_vec_new(RAY_F64, 0); - if (empty_vec && !RAY_IS_ERR(empty_vec)) { - tbl = ray_table_add_col(tbl, col_name_ids[c], empty_vec); - ray_release(empty_vec); - } - } - munmap(buf, file_size); - return tbl; - } - - /* ---- 7. Resolve column types ---- */ - int8_t resolved_types[CSV_MAX_COLS]; - if (col_types_in && n_types >= ncols) { - /* Explicit types provided by caller — validate against known types */ - for (int c = 0; c < ncols; c++) { - int8_t t = col_types_in[c]; - if (t < RAY_BOOL || t >= RAY_TYPE_COUNT || t == RAY_TABLE) { - /* Invalid type constant — fall through to error */ - goto fail_offsets; - } - resolved_types[c] = t; - } - } else if (!col_types_in) { - /* Auto-infer from sample rows */ - csv_type_t col_types[CSV_MAX_COLS]; - memset(col_types, 0, (size_t)ncols * sizeof(csv_type_t)); - /* Type inference from first 100 rows. Heterogeneous CSVs with type - * changes after row 100 will be mistyped. Use explicit schema - * (col_types_in) for such files. */ - int64_t sample_n = (n_rows < CSV_SAMPLE_ROWS) ? 
n_rows : CSV_SAMPLE_ROWS; - for (int64_t r = 0; r < sample_n; r++) { - const char* rp = buf + row_offsets[r]; - for (int c = 0; c < ncols; c++) { - const char* fld; - size_t flen; - char* dyn_esc = NULL; - rp = scan_field(rp, buf_end, delimiter, &fld, &flen, esc_buf, &dyn_esc); - csv_type_t t = detect_type(fld, flen); - if (dyn_esc) ray_sys_free(dyn_esc); - col_types[c] = promote_csv_type(col_types[c], t); - } - } - for (int c = 0; c < ncols; c++) { - switch (col_types[c]) { - case CSV_TYPE_BOOL: resolved_types[c] = RAY_BOOL; break; - case CSV_TYPE_I64: resolved_types[c] = RAY_I64; break; - case CSV_TYPE_F64: resolved_types[c] = RAY_F64; break; - case CSV_TYPE_DATE: resolved_types[c] = RAY_DATE; break; - case CSV_TYPE_TIME: resolved_types[c] = RAY_TIME; break; - case CSV_TYPE_TIMESTAMP: resolved_types[c] = RAY_TIMESTAMP; break; - default: resolved_types[c] = RAY_SYM; break; - } - } - } else { - /* col_types_in provided but too short — error */ - goto fail_offsets; - } - - /* ---- 8. Allocate column vectors ---- */ +static ray_t* csv_materialize_rows(const char* buf, size_t file_size, + const int64_t* row_offsets, int64_t n_rows, + int ncols, char delimiter, + const int64_t* col_name_ids, + const int8_t* resolved_types) { ray_t* col_vecs[CSV_MAX_COLS]; void* col_data[CSV_MAX_COLS]; for (int c = 0; c < ncols; c++) { int8_t type = resolved_types[c]; - /* String columns: allocate RAY_SYM at W32 (4B/elem) for sym IDs. - * After intern, narrow to W8/W16 if max sym ID permits. */ col_vecs[c] = (type == RAY_SYM) ? ray_sym_vec_new(RAY_SYM_W32, n_rows) : ray_vec_new(type, n_rows); if (!col_vecs[c] || RAY_IS_ERR(col_vecs[c])) { for (int j = 0; j < c; j++) ray_release(col_vecs[j]); - goto fail_offsets; + return NULL; } - /* len set early so parallel workers can write to full extent; - * parse errors return before table is used. */ col_vecs[c]->len = n_rows; col_data[c] = ray_data(col_vecs[c]); } - /* ---- 8b. 
Pre-allocate nullmaps for all columns ---- */ uint8_t* col_nullmaps[CSV_MAX_COLS]; bool col_had_null[CSV_MAX_COLS]; if (ncols > 0) memset(col_had_null, 0, (size_t)ncols * sizeof(bool)); for (int c = 0; c < ncols; c++) { ray_t* vec = col_vecs[c]; - /* RAY_STR aliases bytes 8-15 of the header with str_pool — inline - * nullmap would corrupt the pool pointer, so force external. */ bool force_ext = (resolved_types[c] == RAY_STR); if (n_rows <= 128 && !force_ext) { vec->attrs |= RAY_ATTR_HAS_NULLS; @@ -1353,8 +1322,8 @@ ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header, size_t bmp_bytes = ((size_t)n_rows + 7) / 8; ray_t* ext = ray_vec_new(RAY_U8, (int64_t)bmp_bytes); if (!ext || RAY_IS_ERR(ext)) { - for (int j = 0; j <= c; j++) ray_release(col_vecs[j]); - goto fail_offsets; + for (int j = 0; j < ncols; j++) ray_release(col_vecs[j]); + return NULL; } ext->len = (int64_t)bmp_bytes; memset(ray_data(ext), 0, bmp_bytes); @@ -1364,7 +1333,6 @@ ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header, } } - /* Build csv_type_t array for parse functions (maps td types → csv types) */ csv_type_t parse_types[CSV_MAX_COLS]; for (int c = 0; c < ncols; c++) { switch (resolved_types[c]) { @@ -1378,21 +1346,21 @@ ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header, case RAY_TIME: parse_types[c] = CSV_TYPE_TIME; break; case RAY_TIMESTAMP: parse_types[c] = CSV_TYPE_TIMESTAMP; break; case RAY_GUID: parse_types[c] = CSV_TYPE_GUID; break; - default: parse_types[c] = CSV_TYPE_STR; break; + default: parse_types[c] = CSV_TYPE_STR; break; } } - /* ---- 9. 
Parse data ---- */ int64_t sym_max_ids[CSV_MAX_COLS]; memset(sym_max_ids, 0, (size_t)ncols * sizeof(int64_t)); - /* Check if any string columns exist */ - int has_str_cols = 0; + int has_text_cols = 0; for (int c = 0; c < ncols; c++) { - if (parse_types[c] == CSV_TYPE_STR) { has_str_cols = 1; break; } + if (parse_types[c] == CSV_TYPE_STR) { + has_text_cols = 1; + break; + } } - /* Allocate strref arrays for string columns (temporary, freed after intern) */ csv_strref_t* str_ref_bufs[CSV_MAX_COLS]; ray_t* str_ref_hdrs[CSV_MAX_COLS]; memset(str_ref_bufs, 0, sizeof(str_ref_bufs)); @@ -1404,7 +1372,7 @@ ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header, if (!str_ref_bufs[c]) { for (int j = 0; j < ncols; j++) ray_release(col_vecs[j]); for (int j = 0; j < c; j++) scratch_free(str_ref_hdrs[j]); - goto fail_offsets; + return NULL; } } } @@ -1430,6 +1398,7 @@ ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header, .n_cols = ncols, .delim = delimiter, .col_types = parse_types, + .resolved_types = resolved_types, .col_data = col_data, .str_refs = str_ref_bufs, .col_nullmaps = col_nullmaps, @@ -1438,7 +1407,6 @@ ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header, ray_pool_dispatch(pool, csv_parse_fn, &ctx, n_rows); - /* OR worker null flags into col_had_null */ for (uint32_t w = 0; w < n_workers; w++) { for (int c = 0; c < ncols; c++) { if (worker_had_null_buf[(size_t)w * (size_t)ncols + (size_t)c]) @@ -1451,17 +1419,12 @@ ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header, if (!use_parallel) { csv_parse_serial(buf, file_size, row_offsets, n_rows, - ncols, delimiter, parse_types, col_data, + ncols, delimiter, parse_types, resolved_types, col_data, str_ref_bufs, col_nullmaps, col_had_null); } } - /* ---- 9b. 
Materialize RAY_STR columns AND batch-intern sym columns ---- - * These two phases touch disjoint columns and (after the GUID fix) - * intern_strings is the only one that mutates the global sym table. - * Dispatch them as two thread-pool tasks so they overlap in wall time - * — typically saves the smaller of the two phases. */ - if (has_str_cols) { + if (has_text_cols) { csv_finalize_ctx_t fctx = { .str_refs = str_ref_bufs, .n_cols = ncols, @@ -1486,25 +1449,22 @@ ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header, csv_free_escaped_strrefs(str_ref_bufs, ncols, parse_types, n_rows, buf, file_size); for (int c = 0; c < ncols; c++) scratch_free(str_ref_hdrs[c]); for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]); - goto fail_offsets; + return NULL; } } - /* Free heap-allocated escaped string copies, then strref buffers */ + for (int c = 0; c < ncols; c++) { + if (resolved_types[c] != RAY_SYM) continue; + uint32_t* ids = (uint32_t*)col_data[c]; + int64_t max_id = 0; + for (int64_t r = 0; r < n_rows; r++) + if ((int64_t)ids[r] > max_id) max_id = ids[r]; + sym_max_ids[c] = max_id; + } + csv_free_escaped_strrefs(str_ref_bufs, ncols, parse_types, n_rows, buf, file_size); for (int c = 0; c < ncols; c++) scratch_free(str_ref_hdrs[c]); - /* ---- 9c. Strip nullmaps from all-valid columns ---- - * - * A column qualifies as "no nulls" if either: - * - the parser never saw a null (col_had_null[c] == false), or - * - it's a SYM column. SYM is no-null by design — empty fields - * were already remapped to sym 0 in step 9b, and SYM columns - * never carry HAS_NULLS regardless of what the parse-time - * nullmap looked like. - * - * For non-SYM columns where col_had_null is true, the nullmap - * stays. 
*/ for (int c = 0; c < ncols; c++) { ray_t* vec = col_vecs[c]; int strip = !col_had_null[c] || vec->type == RAY_SYM; @@ -1514,15 +1474,13 @@ ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header, vec->ext_nullmap = NULL; } vec->attrs &= (uint8_t)~(RAY_ATTR_HAS_NULLS | RAY_ATTR_NULLMAP_EXT); - /* RAY_STR stores str_pool in bytes 8-15 of the header — don't wipe. */ if (vec->type != RAY_STR) memset(vec->nullmap, 0, 16); } - /* ---- 10. Narrow sym columns to optimal width ---- */ for (int c = 0; c < ncols; c++) { if (resolved_types[c] != RAY_SYM) continue; uint8_t new_w = ray_sym_dict_width(sym_max_ids[c]); - if (new_w >= RAY_SYM_W32) continue; /* already at W32, no savings */ + if (new_w >= RAY_SYM_W32) continue; ray_t* narrow = ray_sym_vec_new(new_w, n_rows); if (!narrow || RAY_IS_ERR(narrow)) continue; narrow->len = n_rows; @@ -1531,11 +1489,10 @@ ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header, if (new_w == RAY_SYM_W8) { uint8_t* d = (uint8_t*)dst; for (int64_t r = 0; r < n_rows; r++) d[r] = (uint8_t)src[r]; - } else { /* RAY_SYM_W16 */ + } else { uint16_t* d = (uint16_t*)dst; for (int64_t r = 0; r < n_rows; r++) d[r] = (uint16_t)src[r]; } - /* Transfer nullmap to narrowed vector */ if (col_vecs[c]->attrs & RAY_ATTR_HAS_NULLS) { narrow->attrs |= (col_vecs[c]->attrs & (RAY_ATTR_HAS_NULLS | RAY_ATTR_NULLMAP_EXT)); if (col_vecs[c]->attrs & RAY_ATTR_NULLMAP_EXT) { @@ -1550,33 +1507,1131 @@ ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header, col_data[c] = dst; } - /* ---- 11. 
Build table ---- */ + ray_t* tbl = ray_table_new(ncols); + if (!tbl || RAY_IS_ERR(tbl)) { + for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]); + return NULL; + } + + for (int c = 0; c < ncols; c++) { + tbl = ray_table_add_col(tbl, col_name_ids[c], col_vecs[c]); + ray_release(col_vecs[c]); + } + + return tbl; +} + +/* -------------------------------------------------------------------------- + * ray_read_csv_opts — main CSV parser + * -------------------------------------------------------------------------- */ + +ray_t* ray_read_csv_named_opts(const char* path, char delimiter, bool header, + const int8_t* col_types_in, int32_t n_types, + const int64_t* col_names_in, int32_t n_names) { + /* ---- 1. Open file and get size ---- */ + int fd = open(path, O_RDONLY); + if (fd < 0) return ray_error("io", NULL); + + struct stat st; + if (fstat(fd, &st) != 0 || st.st_size <= 0) { + close(fd); + return ray_error("io", NULL); + } + size_t file_size = (size_t)st.st_size; + + /* ---- 2. mmap the file ---- */ + char* buf = (char*)mmap(NULL, file_size, PROT_READ, MMAP_FLAGS, fd, 0); + close(fd); + if (buf == MAP_FAILED) return ray_error("io", NULL); + +#ifdef __APPLE__ + madvise(buf, file_size, MADV_SEQUENTIAL); +#endif + + const char* buf_end = buf + file_size; + ray_t* result = NULL; + + /* ---- 3. Detect delimiter ---- */ + /* Delimiter auto-detected from header row only. Files where the header + * has a different delimiter distribution than data rows may be misdetected; + * pass an explicit delimiter for such files. Scanning additional data rows + * was considered but adds complexity for a rare edge case. */ + if (delimiter == 0) { + int commas = 0, tabs = 0; + for (const char* p = buf; p < buf_end && *p != '\n'; p++) { + if (*p == ',') commas++; + if (*p == '\t') tabs++; + } + delimiter = (tabs > commas) ? '\t' : ','; + } + + /* ---- 4. 
Count columns from first line ---- */ + int ncols = 1; { + const char* p = buf; + bool in_quote = false; + while (p < buf_end && (in_quote || (*p != '\n' && *p != '\r'))) { + if (*p == '"') in_quote = !in_quote; + else if (!in_quote && *p == delimiter) ncols++; + p++; + } + } + if (ncols > CSV_MAX_COLS) { + munmap(buf, file_size); + /* fd already closed after mmap (line 1044) — do not close again */ + return ray_error("range", NULL); /* too many columns */ + } + + /* ---- 5. Parse header row ---- */ + const char* p = buf; + char esc_buf[8192]; + int64_t col_name_ids[CSV_MAX_COLS]; + + if (header) { + for (int c = 0; c < ncols; c++) { + const char* fld; + size_t flen; + char* dyn_esc = NULL; + p = scan_field(p, buf_end, delimiter, &fld, &flen, esc_buf, &dyn_esc); + col_name_ids[c] = ray_sym_intern(fld, flen); + if (dyn_esc) ray_sys_free(dyn_esc); + } + /* Consume exactly one line terminator (\r, \n, or \r\n) after the + * header row — NOT a run of newlines, because subsequent empty + * lines are null data rows. */ + if (p < buf_end && *p == '\r') p++; + if (p < buf_end && *p == '\n') p++; + } else if (col_names_in && n_names >= ncols) { + for (int c = 0; c < ncols; c++) + col_name_ids[c] = col_names_in[c]; + } else { + for (int c = 0; c < ncols; c++) { + char name[32]; + snprintf(name, sizeof(name), "V%d", c + 1); + col_name_ids[c] = ray_sym_intern(name, strlen(name)); + } + } + + size_t data_offset = (size_t)(p - buf); + + /* ---- 6. 
Build row offsets (memchr-accelerated) ---- */ + ray_t* row_offsets_hdr = NULL; + int64_t* row_offsets = NULL; + int64_t n_rows = build_row_offsets(buf, file_size, data_offset, + &row_offsets, &row_offsets_hdr); + + if (n_rows == 0) { + /* Empty file → empty table */ ray_t* tbl = ray_table_new(ncols); - if (!tbl || RAY_IS_ERR(tbl)) { - for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]); - goto fail_offsets; + if (!tbl || RAY_IS_ERR(tbl)) goto fail_unmap; + for (int c = 0; c < ncols; c++) { + ray_t* empty_vec = ray_vec_new(RAY_F64, 0); + if (empty_vec && !RAY_IS_ERR(empty_vec)) { + tbl = ray_table_add_col(tbl, col_name_ids[c], empty_vec); + ray_release(empty_vec); + } } + munmap(buf, file_size); + return tbl; + } + /* ---- 7. Resolve column types ---- */ + int8_t resolved_types[CSV_MAX_COLS]; + if (col_types_in && n_types >= ncols) { + /* Explicit types provided by caller — validate against known types */ for (int c = 0; c < ncols; c++) { - tbl = ray_table_add_col(tbl, col_name_ids[c], col_vecs[c]); - ray_release(col_vecs[c]); + int8_t t = col_types_in[c]; + if (t < RAY_BOOL || t >= RAY_TYPE_COUNT || t == RAY_TABLE) { + /* Invalid type constant — fall through to error */ + goto fail_offsets; + } + resolved_types[c] = t; + } + } else if (!col_types_in) { + /* Auto-infer from sample rows */ + csv_type_t col_types[CSV_MAX_COLS]; + memset(col_types, 0, (size_t)ncols * sizeof(csv_type_t)); + uint32_t text_hashes[CSV_MAX_COLS][CSV_SAMPLE_ROWS] = {{0}}; + uint16_t text_lens[CSV_MAX_COLS][CSV_SAMPLE_ROWS] = {{0}}; + uint16_t text_distinct[CSV_MAX_COLS] = {0}; + uint16_t text_non_null[CSV_MAX_COLS] = {0}; + /* Type inference from first 100 rows. Heterogeneous CSVs with type + * changes after row 100 will be mistyped. Use explicit schema + * (col_types_in) for such files. */ + int64_t sample_n = (n_rows < CSV_SAMPLE_ROWS) ? 
n_rows : CSV_SAMPLE_ROWS; + for (int64_t r = 0; r < sample_n; r++) { + const char* rp = buf + row_offsets[r]; + for (int c = 0; c < ncols; c++) { + const char* fld; + size_t flen; + char* dyn_esc = NULL; + rp = scan_field(rp, buf_end, delimiter, &fld, &flen, esc_buf, &dyn_esc); + csv_type_t t = detect_type(fld, flen); + if (t == CSV_TYPE_STR) + csv_cardinality_note(text_hashes, text_lens, + text_distinct, text_non_null, + c, fld, flen); + if (dyn_esc) ray_sys_free(dyn_esc); + col_types[c] = promote_csv_type(col_types[c], t); + } + } + for (int c = 0; c < ncols; c++) { + resolved_types[c] = csv_resolve_inferred_type( + col_types[c], text_distinct[c], text_non_null[c]); } + } else { + /* col_types_in provided but too short — error */ + goto fail_offsets; + } - result = tbl; + /* ---- 8. Allocate column vectors ---- */ + ray_t* col_vecs[CSV_MAX_COLS]; + void* col_data[CSV_MAX_COLS]; + + for (int c = 0; c < ncols; c++) { + int8_t type = resolved_types[c]; + /* String columns: allocate RAY_SYM at W32 (4B/elem) for sym IDs. + * After intern, narrow to W8/W16 if max sym ID permits. */ + col_vecs[c] = (type == RAY_SYM) ? ray_sym_vec_new(RAY_SYM_W32, n_rows) + : ray_vec_new(type, n_rows); + if (!col_vecs[c] || RAY_IS_ERR(col_vecs[c])) { + for (int j = 0; j < c; j++) ray_release(col_vecs[j]); + goto fail_offsets; + } + /* len set early so parallel workers can write to full extent; + * parse errors return before table is used. */ + col_vecs[c]->len = n_rows; + col_data[c] = ray_data(col_vecs[c]); } - /* ---- 12. Cleanup ---- */ - scratch_free(row_offsets_hdr); - munmap(buf, file_size); - return result; + /* ---- 8b. 
Pre-allocate nullmaps for all columns ---- */ + uint8_t* col_nullmaps[CSV_MAX_COLS]; + bool col_had_null[CSV_MAX_COLS]; + if (ncols > 0) memset(col_had_null, 0, (size_t)ncols * sizeof(bool)); - /* Error paths */ -fail_offsets: - scratch_free(row_offsets_hdr); -fail_unmap: - munmap(buf, file_size); - return ray_error("oom", NULL); + for (int c = 0; c < ncols; c++) { + ray_t* vec = col_vecs[c]; + /* RAY_STR aliases bytes 8-15 of the header with str_pool — inline + * nullmap would corrupt the pool pointer, so force external. */ + bool force_ext = (resolved_types[c] == RAY_STR); + if (n_rows <= 128 && !force_ext) { + vec->attrs |= RAY_ATTR_HAS_NULLS; + memset(vec->nullmap, 0, 16); + col_nullmaps[c] = vec->nullmap; + } else { + size_t bmp_bytes = ((size_t)n_rows + 7) / 8; + ray_t* ext = ray_vec_new(RAY_U8, (int64_t)bmp_bytes); + if (!ext || RAY_IS_ERR(ext)) { + for (int j = 0; j <= c; j++) ray_release(col_vecs[j]); + goto fail_offsets; + } + ext->len = (int64_t)bmp_bytes; + memset(ray_data(ext), 0, bmp_bytes); + vec->ext_nullmap = ext; + vec->attrs |= RAY_ATTR_HAS_NULLS | RAY_ATTR_NULLMAP_EXT; + col_nullmaps[c] = (uint8_t*)ray_data(ext); + } + } + + /* Build csv_type_t array for parse functions (maps td types → csv types) */ + csv_type_t parse_types[CSV_MAX_COLS]; + for (int c = 0; c < ncols; c++) { + switch (resolved_types[c]) { + case RAY_BOOL: parse_types[c] = CSV_TYPE_BOOL; break; + case RAY_U8: parse_types[c] = CSV_TYPE_U8; break; + case RAY_I16: parse_types[c] = CSV_TYPE_I16; break; + case RAY_I32: parse_types[c] = CSV_TYPE_I32; break; + case RAY_I64: parse_types[c] = CSV_TYPE_I64; break; + case RAY_F64: parse_types[c] = CSV_TYPE_F64; break; + case RAY_DATE: parse_types[c] = CSV_TYPE_DATE; break; + case RAY_TIME: parse_types[c] = CSV_TYPE_TIME; break; + case RAY_TIMESTAMP: parse_types[c] = CSV_TYPE_TIMESTAMP; break; + case RAY_GUID: parse_types[c] = CSV_TYPE_GUID; break; + default: parse_types[c] = CSV_TYPE_STR; break; + } + } + + /* ---- 9. 
Parse data ---- */ + int64_t sym_max_ids[CSV_MAX_COLS]; + memset(sym_max_ids, 0, (size_t)ncols * sizeof(int64_t)); + + /* Check if any materialized string columns exist */ + int has_text_cols = 0; + for (int c = 0; c < ncols; c++) { + if (parse_types[c] == CSV_TYPE_STR) { + has_text_cols = 1; + break; + } + } + + csv_strref_t* str_ref_bufs[CSV_MAX_COLS]; + ray_t* str_ref_hdrs[CSV_MAX_COLS]; + memset(str_ref_bufs, 0, sizeof(str_ref_bufs)); + memset(str_ref_hdrs, 0, sizeof(str_ref_hdrs)); + for (int c = 0; c < ncols; c++) { + if (parse_types[c] == CSV_TYPE_STR) { + size_t sz = (size_t)n_rows * sizeof(csv_strref_t); + str_ref_bufs[c] = (csv_strref_t*)scratch_alloc(&str_ref_hdrs[c], sz); + if (!str_ref_bufs[c]) { + for (int j = 0; j < ncols; j++) ray_release(col_vecs[j]); + for (int j = 0; j < c; j++) scratch_free(str_ref_hdrs[j]); + goto fail_offsets; + } + } + } + + { + ray_pool_t* pool = ray_pool_get(); + bool use_parallel = pool && n_rows > 8192; + + if (use_parallel) { + uint32_t n_workers = ray_pool_total_workers(pool); + size_t whn_sz = (size_t)n_workers * (size_t)ncols * sizeof(bool); + bool* worker_had_null_buf = (bool*)ray_sys_alloc(whn_sz); + if (!worker_had_null_buf) { + use_parallel = false; + } else { + memset(worker_had_null_buf, 0, whn_sz); + + csv_par_ctx_t ctx = { + .buf = buf, + .buf_size = file_size, + .row_offsets = row_offsets, + .n_rows = n_rows, + .n_cols = ncols, + .delim = delimiter, + .col_types = parse_types, + .resolved_types = resolved_types, + .col_data = col_data, + .str_refs = str_ref_bufs, + .col_nullmaps = col_nullmaps, + .worker_had_null = worker_had_null_buf, + }; + + ray_pool_dispatch(pool, csv_parse_fn, &ctx, n_rows); + + /* OR worker null flags into col_had_null */ + for (uint32_t w = 0; w < n_workers; w++) { + for (int c = 0; c < ncols; c++) { + if (worker_had_null_buf[(size_t)w * (size_t)ncols + (size_t)c]) + col_had_null[c] = true; + } + } + ray_sys_free(worker_had_null_buf); + } + } + + if (!use_parallel) { + 
csv_parse_serial(buf, file_size, row_offsets, n_rows, + ncols, delimiter, parse_types, resolved_types, col_data, + str_ref_bufs, col_nullmaps, col_had_null); + } + } + + /* ---- 9b. Materialize RAY_STR columns AND batch-intern sym columns ---- + * These two phases touch disjoint columns and (after the GUID fix) + * intern_strings is the only one that mutates the global sym table. + * Dispatch them as two thread-pool tasks so they overlap in wall time + * — typically saves the smaller of the two phases. */ + if (has_text_cols) { + csv_finalize_ctx_t fctx = { + .str_refs = str_ref_bufs, + .n_cols = ncols, + .parse_types = parse_types, + .resolved_types = resolved_types, + .col_data = col_data, + .col_vecs = col_vecs, + .n_rows = n_rows, + .sym_max_ids = sym_max_ids, + .col_nullmaps = col_nullmaps, + .fill_ok = true, + .intern_ok = true, + }; + ray_pool_t* fpool = ray_pool_get(); + if (fpool && ray_pool_total_workers(fpool) >= 2) { + ray_pool_dispatch_n(fpool, csv_finalize_task, &fctx, 2); + } else { + csv_finalize_task(&fctx, 0, 0, 1); + csv_finalize_task(&fctx, 0, 1, 2); + } + if (!fctx.fill_ok || !fctx.intern_ok) { + csv_free_escaped_strrefs(str_ref_bufs, ncols, parse_types, n_rows, buf, file_size); + for (int c = 0; c < ncols; c++) scratch_free(str_ref_hdrs[c]); + for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]); + goto fail_offsets; + } + } + + for (int c = 0; c < ncols; c++) { + if (resolved_types[c] != RAY_SYM) continue; + uint32_t* ids = (uint32_t*)col_data[c]; + int64_t max_id = 0; + for (int64_t r = 0; r < n_rows; r++) + if ((int64_t)ids[r] > max_id) max_id = ids[r]; + sym_max_ids[c] = max_id; + } + + /* Free heap-allocated escaped string copies, then strref buffers */ + csv_free_escaped_strrefs(str_ref_bufs, ncols, parse_types, n_rows, buf, file_size); + for (int c = 0; c < ncols; c++) scratch_free(str_ref_hdrs[c]); + + /* ---- 9c. 
Strip nullmaps from all-valid columns ---- + * + * A column qualifies as "no nulls" if either: + * - the parser never saw a null (col_had_null[c] == false), or + * - it's a SYM column. SYM is no-null by design — empty fields + * were already remapped to sym 0 in step 9b, and SYM columns + * never carry HAS_NULLS regardless of what the parse-time + * nullmap looked like. + * + * For non-SYM columns where col_had_null is true, the nullmap + * stays. */ + for (int c = 0; c < ncols; c++) { + ray_t* vec = col_vecs[c]; + int strip = !col_had_null[c] || vec->type == RAY_SYM; + if (!strip) continue; + if (vec->attrs & RAY_ATTR_NULLMAP_EXT) { + ray_release(vec->ext_nullmap); + vec->ext_nullmap = NULL; + } + vec->attrs &= (uint8_t)~(RAY_ATTR_HAS_NULLS | RAY_ATTR_NULLMAP_EXT); + /* RAY_STR stores str_pool in bytes 8-15 of the header — don't wipe. */ + if (vec->type != RAY_STR) memset(vec->nullmap, 0, 16); + } + + /* ---- 10. Narrow sym columns to optimal width ---- */ + for (int c = 0; c < ncols; c++) { + if (resolved_types[c] != RAY_SYM) continue; + uint8_t new_w = ray_sym_dict_width(sym_max_ids[c]); + if (new_w >= RAY_SYM_W32) continue; /* already at W32, no savings */ + ray_t* narrow = ray_sym_vec_new(new_w, n_rows); + if (!narrow || RAY_IS_ERR(narrow)) continue; + narrow->len = n_rows; + const uint32_t* src = (const uint32_t*)col_data[c]; + void* dst = ray_data(narrow); + if (new_w == RAY_SYM_W8) { + uint8_t* d = (uint8_t*)dst; + for (int64_t r = 0; r < n_rows; r++) d[r] = (uint8_t)src[r]; + } else { /* RAY_SYM_W16 */ + uint16_t* d = (uint16_t*)dst; + for (int64_t r = 0; r < n_rows; r++) d[r] = (uint16_t)src[r]; + } + /* Transfer nullmap to narrowed vector */ + if (col_vecs[c]->attrs & RAY_ATTR_HAS_NULLS) { + narrow->attrs |= (col_vecs[c]->attrs & (RAY_ATTR_HAS_NULLS | RAY_ATTR_NULLMAP_EXT)); + if (col_vecs[c]->attrs & RAY_ATTR_NULLMAP_EXT) { + narrow->ext_nullmap = col_vecs[c]->ext_nullmap; + ray_retain(narrow->ext_nullmap); + } else { + memcpy(narrow->nullmap, 
col_vecs[c]->nullmap, 16); + } + } + ray_release(col_vecs[c]); + col_vecs[c] = narrow; + col_data[c] = dst; + } + + /* ---- 11. Build table ---- */ + { + ray_t* tbl = ray_table_new(ncols); + if (!tbl || RAY_IS_ERR(tbl)) { + for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]); + goto fail_offsets; + } + + for (int c = 0; c < ncols; c++) { + tbl = ray_table_add_col(tbl, col_name_ids[c], col_vecs[c]); + ray_release(col_vecs[c]); + } + + result = tbl; + } + + /* ---- 12. Cleanup ---- */ + scratch_free(row_offsets_hdr); + munmap(buf, file_size); + return result; + + /* Error paths */ +fail_offsets: + scratch_free(row_offsets_hdr); +fail_unmap: + munmap(buf, file_size); + return ray_error("oom", NULL); +} + +ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header, + const int8_t* col_types_in, int32_t n_types) { + return ray_read_csv_named_opts(path, delimiter, header, + col_types_in, n_types, NULL, 0); +} + +typedef struct { + FILE* fp; + FILE* null_fp; + char path[1024]; + char tmp_path[1024]; + char null_tmp_path[1024]; + int8_t type; + uint8_t attrs; + int64_t rows; + bool had_nulls; + uint8_t null_acc; + uint8_t null_bits; +} csv_splayed_col_writer_t; + +static ray_err_t csv_splayed_writer_open(csv_splayed_col_writer_t* w, + const char* dir, int64_t name_id, + int8_t type) { + memset(w, 0, sizeof(*w)); + w->type = type; + w->attrs = (type == RAY_SYM) ? RAY_SYM_W32 : 0; + + ray_t* name_atom = ray_sym_str(name_id); + if (!name_atom) return RAY_ERR_CORRUPT; + const char* name = ray_str_ptr(name_atom); + size_t name_len = ray_str_len(name_atom); + if (name_len == 0 || name[0] == '.' 
||
+        memchr(name, '/', name_len) || memchr(name, '\\', name_len) ||
+        memchr(name, '\0', name_len))
+        return RAY_ERR_DOMAIN;
+
+    int n = snprintf(w->path, sizeof(w->path), "%s/%.*s",
+                     dir, (int)name_len, name);
+    if (n < 0 || (size_t)n >= sizeof(w->path)) return RAY_ERR_RANGE;
+    n = snprintf(w->tmp_path, sizeof(w->tmp_path), "%s.tmp", w->path);
+    if (n < 0 || (size_t)n >= sizeof(w->tmp_path)) return RAY_ERR_RANGE;
+    n = snprintf(w->null_tmp_path, sizeof(w->null_tmp_path), "%s.nulltmp", w->path);
+    if (n < 0 || (size_t)n >= sizeof(w->null_tmp_path)) return RAY_ERR_RANGE;
+
+    w->fp = fopen(w->tmp_path, "wb+");
+    if (!w->fp) return RAY_ERR_IO;
+    /* Reserve 32 zero bytes at offset 0; csv_splayed_writer_close seeks back
+     * and overwrites them with the final header. */
+    ray_t zero = {0};
+    if (fwrite(&zero, 1, 32, w->fp) != 32) {
+        /* On an open failure the caller aborts only the writers before this
+         * one, so release the stream and the temp file here instead of
+         * leaking them. */
+        fclose(w->fp);
+        w->fp = NULL;
+        remove(w->tmp_path);
+        return RAY_ERR_IO;
+    }
+    return RAY_OK;
+}
+
+/* Append one bit to the column's null bitmap (set when is_null is true).
+ * Bits are accumulated LSB-first in w->null_acc and written to the null
+ * temp file one full byte at a time; the temp file is opened on first use.
+ * Returns RAY_ERR_IO if the file cannot be opened or written. */
+static ray_err_t csv_splayed_writer_null_bit(csv_splayed_col_writer_t* w,
+                                             bool is_null) {
+    if (!w->null_fp) {
+        w->null_fp = fopen(w->null_tmp_path, "wb");
+        if (!w->null_fp) return RAY_ERR_IO;
+    }
+    if (is_null) w->null_acc |= (uint8_t)(1u << w->null_bits);
+    w->null_bits++;
+    if (w->null_bits == 8) {
+        if (fwrite(&w->null_acc, 1, 1, w->null_fp) != 1) return RAY_ERR_IO;
+        w->null_acc = 0;
+        w->null_bits = 0;
+    }
+    return RAY_OK;
+}
+
+/* Append `count` cleared (non-null) bits to the column's null bitmap.
+ *
+ * First pads the partially filled byte bit by bit, then writes whole zero
+ * bytes in bulk, and finally records the leftover (count % 8) bits as a new
+ * pending partial byte. A non-positive count is a no-op.
+ * Returns RAY_ERR_IO if the null temp file cannot be opened or written. */
+static ray_err_t csv_splayed_writer_zero_nulls(csv_splayed_col_writer_t* w,
+                                               int64_t count) {
+    if (count <= 0) return RAY_OK;
+    if (!w->null_fp) {
+        w->null_fp = fopen(w->null_tmp_path, "wb");
+        if (!w->null_fp) return RAY_ERR_IO;
+    }
+
+    /* Pad the pending partial byte; zero bits leave null_acc unchanged. */
+    while (count > 0 && w->null_bits != 0) {
+        w->null_bits++;
+        if (w->null_bits == 8) {
+            if (fwrite(&w->null_acc, 1, 1, w->null_fp) != 1) return RAY_ERR_IO;
+            w->null_acc = 0;
+            w->null_bits = 0;
+        }
+        count--;
+    }
+
+    /* All requested bits fit into the pending byte. Keep null_acc and
+     * null_bits as they are: resetting them below would drop null bits that
+     * were accumulated but not yet flushed. */
+    if (count == 0) return RAY_OK;
+
+    /* From here on the bitmap is byte-aligned (null_bits == 0). */
+    uint8_t zeros[8192] = {0};
+    int64_t bytes = count / 8;
+    while (bytes > 0) {
+        size_t chunk = (bytes > (int64_t)sizeof(zeros)) ? sizeof(zeros) : (size_t)bytes;
+        if (fwrite(zeros, 1, chunk, w->null_fp) != chunk) return RAY_ERR_IO;
+        bytes -= (int64_t)chunk;
+    }
+
+    w->null_bits = (uint8_t)(count & 7);
+    w->null_acc = 0;
+    return RAY_OK;
+}
+
+/* Append the contents of one chunk column to the writer's temp file.
+ *
+ * RAY_SYM writers re-encode every id as uint32 (via ray_read_sym) in batches
+ * of 8192 and write no null bits. Other types write the raw element bytes
+ * and maintain the null bitmap: the first chunk carrying HAS_NULLS backfills
+ * zero bits for all rows written so far, and later chunks without HAS_NULLS
+ * contribute `n` zero bits.
+ *
+ * Returns RAY_ERR_TYPE when the writer is not open or col is NULL/an error
+ * object, RAY_ERR_CORRUPT for a negative length, RAY_ERR_IO on write
+ * failure, RAY_OK otherwise. */
+static ray_err_t csv_splayed_writer_append(csv_splayed_col_writer_t* w,
+                                           ray_t* col) {
+    if (!w->fp || !col || RAY_IS_ERR(col)) return RAY_ERR_TYPE;
+    int64_t n = col->len;
+    if (n < 0) return RAY_ERR_CORRUPT;
+
+    if (w->type == RAY_SYM) {
+        uint32_t buf[8192];
+        void* data = ray_data(col);
+        for (int64_t off = 0; off < n; ) {
+            int64_t cnt = n - off;
+            if (cnt > (int64_t)(sizeof(buf) / sizeof(buf[0])))
+                cnt = (int64_t)(sizeof(buf) / sizeof(buf[0]));
+            for (int64_t i = 0; i < cnt; i++)
+                buf[i] = (uint32_t)ray_read_sym(data, off + i, col->type, col->attrs);
+            if (fwrite(buf, sizeof(uint32_t), (size_t)cnt, w->fp) != (size_t)cnt)
+                return RAY_ERR_IO;
+            off += cnt;
+        }
+    } else {
+        uint8_t esz = ray_sym_elem_size(w->type, 0);
+        size_t bytes = (size_t)n * (size_t)esz;
+        if (bytes && fwrite(ray_data(col), 1, bytes, w->fp) != bytes)
+            return RAY_ERR_IO;
+        if (col->attrs & RAY_ATTR_HAS_NULLS) {
+            if (!w->had_nulls) {
+                /* First chunk with nulls: earlier rows were all valid. */
+                ray_err_t err = csv_splayed_writer_zero_nulls(w, w->rows);
+                if (err != RAY_OK) return err;
+            }
+            w->had_nulls = true;
+            for (int64_t i = 0; i < n; i++) {
+                ray_err_t err = csv_splayed_writer_null_bit(w, ray_vec_is_null(col, i));
+                if (err != RAY_OK) return err;
+            }
+        } else if (w->had_nulls) {
+            ray_err_t err = csv_splayed_writer_zero_nulls(w, n);
+            if (err != RAY_OK) return err;
+        }
+    }
+    w->rows += n;
+    return RAY_OK;
+}
+
+static ray_err_t csv_splayed_writer_close(csv_splayed_col_writer_t* w) {
+    if (!w->fp) return RAY_OK;
+    ray_err_t err = RAY_OK;
+    /* Flush the pending partial null-bitmap byte, then close the null temp. */
+    if (w->null_fp && w->null_bits) {
+        if (fwrite(&w->null_acc, 1, 1, w->null_fp) != 1) err = RAY_ERR_IO;
+        w->null_acc = 0;
+        w->null_bits = 0;
+    }
+    if (w->null_fp && fclose(w->null_fp) != 0 && err == RAY_OK) err = RAY_ERR_IO;
+    w->null_fp = NULL;
+
+    if (err == RAY_OK && w->had_nulls) {
+
FILE* nf = fopen(w->null_tmp_path, "rb"); + if (!nf) err = RAY_ERR_IO; + else { + char buf[65536]; + size_t nr; + while ((nr = fread(buf, 1, sizeof(buf), nf)) > 0) { + if (fwrite(buf, 1, nr, w->fp) != nr) { err = RAY_ERR_IO; break; } + } + if (ferror(nf) && err == RAY_OK) err = RAY_ERR_IO; + fclose(nf); + } + } + + if (err == RAY_OK) { + ray_t hdr = {0}; + hdr.type = w->type; + hdr.attrs = w->attrs; + hdr.len = w->rows; + hdr.rc = (w->type == RAY_SYM) ? ray_sym_count() : 0; + if (w->had_nulls) + hdr.attrs |= RAY_ATTR_HAS_NULLS | RAY_ATTR_NULLMAP_EXT; + if (fseek(w->fp, 0, SEEK_SET) != 0 || + fwrite(&hdr, 1, 32, w->fp) != 32) + err = RAY_ERR_IO; + } + + if (fclose(w->fp) != 0 && err == RAY_OK) err = RAY_ERR_IO; + w->fp = NULL; + remove(w->null_tmp_path); + if (err == RAY_OK) err = ray_file_rename(w->tmp_path, w->path); + if (err != RAY_OK) remove(w->tmp_path); + return err; +} + +static void csv_splayed_writer_abort(csv_splayed_col_writer_t* w) { + if (w->fp) fclose(w->fp); + if (w->null_fp) fclose(w->null_fp); + w->fp = NULL; + w->null_fp = NULL; + remove(w->tmp_path); + remove(w->null_tmp_path); +} + +ray_err_t ray_csv_save_splayed_named_opts(const char* path, char delimiter, bool header, + const int8_t* col_types_in, int32_t n_types, + const int64_t* col_names_in, int32_t n_names, + const char* dir, int64_t rows_per_chunk) { + if (!path || !dir) return RAY_ERR_DOMAIN; + if (rows_per_chunk <= 0) rows_per_chunk = CSV_PART_ROWS_DEFAULT; + + int fd = open(path, O_RDONLY); + if (fd < 0) return RAY_ERR_IO; + + struct stat st; + if (fstat(fd, &st) != 0 || st.st_size <= 0) { + close(fd); + return RAY_ERR_IO; + } + size_t file_size = (size_t)st.st_size; + + char* buf = (char*)mmap(NULL, file_size, PROT_READ, MMAP_FLAGS, fd, 0); + close(fd); + if (buf == MAP_FAILED) return RAY_ERR_IO; + +#ifdef __APPLE__ + madvise(buf, file_size, MADV_SEQUENTIAL); +#endif + + const char* buf_end = buf + file_size; + ray_err_t err = RAY_OK; + + if (delimiter == 0) { + int commas = 0, tabs = 
0; + for (const char* q = buf; q < buf_end && *q != '\n'; q++) { + if (*q == ',') commas++; + if (*q == '\t') tabs++; + } + delimiter = (tabs > commas) ? '\t' : ','; + } + + int ncols = 1; + { + const char* q = buf; + bool in_quote = false; + while (q < buf_end && (in_quote || (*q != '\n' && *q != '\r'))) { + if (*q == '"') in_quote = !in_quote; + else if (!in_quote && *q == delimiter) ncols++; + q++; + } + } + if (ncols > CSV_MAX_COLS) { + munmap(buf, file_size); + return RAY_ERR_RANGE; + } + + const char* p = buf; + char esc_buf[8192]; + int64_t col_name_ids[CSV_MAX_COLS]; + + if (header) { + for (int c = 0; c < ncols; c++) { + const char* fld; + size_t flen; + char* dyn_esc = NULL; + p = scan_field(p, buf_end, delimiter, &fld, &flen, esc_buf, &dyn_esc); + col_name_ids[c] = ray_sym_intern(fld, flen); + if (dyn_esc) ray_sys_free(dyn_esc); + } + if (p < buf_end && *p == '\r') p++; + if (p < buf_end && *p == '\n') p++; + } else if (col_names_in && n_names >= ncols) { + for (int c = 0; c < ncols; c++) + col_name_ids[c] = col_names_in[c]; + } else { + for (int c = 0; c < ncols; c++) { + char name[32]; + snprintf(name, sizeof(name), "V%d", c + 1); + col_name_ids[c] = ray_sym_intern(name, strlen(name)); + } + } + + size_t data_offset = (size_t)(p - buf); + bool data_has_quotes = memchr(buf + data_offset, '"', file_size - data_offset) != NULL; + int8_t resolved_types[CSV_MAX_COLS]; + if (col_types_in && n_types >= ncols) { + for (int c = 0; c < ncols; c++) { + int8_t t = col_types_in[c]; + if (t < RAY_BOOL || t >= RAY_TYPE_COUNT || t == RAY_TABLE) { + munmap(buf, file_size); + return RAY_ERR_TYPE; + } + resolved_types[c] = t; + } + } else if (!col_types_in) { + ray_t* sample_offsets_hdr = NULL; + int64_t* sample_offsets = NULL; + int64_t sample_n = build_row_offsets_limited(buf, file_size, data_offset, + CSV_SAMPLE_ROWS, + data_has_quotes, + &sample_offsets, + &sample_offsets_hdr, + NULL); + csv_type_t col_types[CSV_MAX_COLS]; + memset(col_types, 0, (size_t)ncols * 
sizeof(csv_type_t)); + uint32_t text_hashes[CSV_MAX_COLS][CSV_SAMPLE_ROWS] = {{0}}; + uint16_t text_lens[CSV_MAX_COLS][CSV_SAMPLE_ROWS] = {{0}}; + uint16_t text_distinct[CSV_MAX_COLS] = {0}; + uint16_t text_non_null[CSV_MAX_COLS] = {0}; + for (int64_t r = 0; r < sample_n; r++) { + const char* rp = buf + sample_offsets[r]; + for (int c = 0; c < ncols; c++) { + const char* fld; + size_t flen; + char* dyn_esc = NULL; + rp = scan_field(rp, buf_end, delimiter, &fld, &flen, esc_buf, &dyn_esc); + csv_type_t t = detect_type(fld, flen); + if (t == CSV_TYPE_STR) + csv_cardinality_note(text_hashes, text_lens, + text_distinct, text_non_null, + c, fld, flen); + if (dyn_esc) ray_sys_free(dyn_esc); + col_types[c] = promote_csv_type(col_types[c], t); + } + } + scratch_free(sample_offsets_hdr); + for (int c = 0; c < ncols; c++) { + resolved_types[c] = csv_resolve_inferred_type( + col_types[c], text_distinct[c], text_non_null[c]); + } + } else { + munmap(buf, file_size); + return RAY_ERR_TYPE; + } + + for (int c = 0; c < ncols; c++) { + if (resolved_types[c] == RAY_STR) { + ray_t* tbl = ray_read_csv_named_opts(path, delimiter, header, + col_types_in, n_types, + col_names_in, n_names); + if (!tbl || RAY_IS_ERR(tbl)) { + munmap(buf, file_size); + return tbl ? ray_err_from_obj(tbl) : RAY_ERR_IO; + } + err = ray_splay_save_bulk(tbl, dir, NULL); + ray_release(tbl); + if (err == RAY_OK) { + char sym_path[1024]; + int n = snprintf(sym_path, sizeof(sym_path), "%s/sym", dir); + if (n < 0 || (size_t)n >= sizeof(sym_path)) err = RAY_ERR_RANGE; + else err = ray_sym_save_bulk(sym_path); + } + munmap(buf, file_size); + return err; + } + } + + err = ray_mkdir_p(dir); + if (err != RAY_OK) { + munmap(buf, file_size); + return err; + } + + ray_t* schema = ray_vec_new(RAY_I64, ncols); + if (!schema || RAY_IS_ERR(schema)) { + munmap(buf, file_size); + return schema ? 
ray_err_from_obj(schema) : RAY_ERR_OOM; + } + schema->len = ncols; + memcpy(ray_data(schema), col_name_ids, (size_t)ncols * sizeof(int64_t)); + char schema_path[1024]; + int sn = snprintf(schema_path, sizeof(schema_path), "%s/.d", dir); + if (sn < 0 || (size_t)sn >= sizeof(schema_path)) + err = RAY_ERR_RANGE; + else + err = ray_col_save_bulk(schema, schema_path); + ray_release(schema); + if (err != RAY_OK) { + munmap(buf, file_size); + return err; + } + + csv_splayed_col_writer_t writers[CSV_MAX_COLS]; + memset(writers, 0, sizeof(writers)); + for (int c = 0; c < ncols; c++) { + err = csv_splayed_writer_open(&writers[c], dir, col_name_ids[c], + resolved_types[c]); + if (err != RAY_OK) { + for (int j = 0; j < c; j++) csv_splayed_writer_abort(&writers[j]); + munmap(buf, file_size); + return err; + } + } + + size_t chunk_offset = data_offset; + bool wrote_any = false; + while (chunk_offset < file_size || !wrote_any) { + ray_t* row_offsets_hdr = NULL; + int64_t* row_offsets = NULL; + size_t next_offset = chunk_offset; + int64_t cnt = 0; + if (chunk_offset < file_size) { + cnt = build_row_offsets_limited(buf, file_size, chunk_offset, + rows_per_chunk, data_has_quotes, + &row_offsets, + &row_offsets_hdr, &next_offset); + if (cnt <= 0) { + scratch_free(row_offsets_hdr); + err = RAY_ERR_IO; + break; + } + } + + ray_t* tbl = csv_materialize_rows(buf, file_size, row_offsets, + cnt, ncols, delimiter, col_name_ids, + resolved_types); + scratch_free(row_offsets_hdr); + if (!tbl || RAY_IS_ERR(tbl)) { + if (tbl) ray_release(tbl); + err = RAY_ERR_OOM; + break; + } + + for (int c = 0; c < ncols; c++) { + ray_t* col = ray_table_get_col_idx(tbl, c); + err = csv_splayed_writer_append(&writers[c], col); + if (err != RAY_OK) break; + } + ray_release(tbl); + if (err != RAY_OK) break; + wrote_any = true; + if (cnt == 0) break; + chunk_offset = next_offset; + } + + for (int c = 0; c < ncols; c++) { + ray_err_t cerr = (err == RAY_OK) ? 
csv_splayed_writer_close(&writers[c]) + : RAY_ERR_IO; + if (err == RAY_OK && cerr != RAY_OK) err = cerr; + if (err != RAY_OK) csv_splayed_writer_abort(&writers[c]); + } + + if (err == RAY_OK) { + char sym_path[1024]; + int n = snprintf(sym_path, sizeof(sym_path), "%s/sym", dir); + if (n < 0 || (size_t)n >= sizeof(sym_path)) err = RAY_ERR_RANGE; + else err = ray_sym_save_bulk(sym_path); + } + + munmap(buf, file_size); + return err; +} + +ray_err_t ray_csv_save_parted_named_opts(const char* path, char delimiter, bool header, + const int8_t* col_types_in, int32_t n_types, + const int64_t* col_names_in, int32_t n_names, + const char* root, const char* table_name, + int64_t rows_per_part) { + if (!path || !root || !table_name) return RAY_ERR_DOMAIN; + if (rows_per_part <= 0) rows_per_part = CSV_PART_ROWS_DEFAULT; + bool trace = getenv("RAY_CSV_TRACE") != NULL; + + int fd = open(path, O_RDONLY); + if (fd < 0) return RAY_ERR_IO; + + struct stat st; + if (fstat(fd, &st) != 0 || st.st_size <= 0) { + close(fd); + return RAY_ERR_IO; + } + size_t file_size = (size_t)st.st_size; + + char* buf = (char*)mmap(NULL, file_size, PROT_READ, MMAP_FLAGS, fd, 0); + close(fd); + if (buf == MAP_FAILED) return RAY_ERR_IO; + +#ifdef __APPLE__ + madvise(buf, file_size, MADV_SEQUENTIAL); +#endif + + const char* buf_end = buf + file_size; + ray_err_t err = RAY_OK; + + if (delimiter == 0) { + int commas = 0, tabs = 0; + for (const char* q = buf; q < buf_end && *q != '\n'; q++) { + if (*q == ',') commas++; + if (*q == '\t') tabs++; + } + delimiter = (tabs > commas) ? 
'\t' : ','; + } + + int ncols = 1; + { + const char* q = buf; + bool in_quote = false; + while (q < buf_end && (in_quote || (*q != '\n' && *q != '\r'))) { + if (*q == '"') in_quote = !in_quote; + else if (!in_quote && *q == delimiter) ncols++; + q++; + } + } + if (ncols > CSV_MAX_COLS) { + munmap(buf, file_size); + return RAY_ERR_RANGE; + } + + const char* p = buf; + char esc_buf[8192]; + int64_t col_name_ids[CSV_MAX_COLS]; + + if (header) { + for (int c = 0; c < ncols; c++) { + const char* fld; + size_t flen; + char* dyn_esc = NULL; + p = scan_field(p, buf_end, delimiter, &fld, &flen, esc_buf, &dyn_esc); + col_name_ids[c] = ray_sym_intern(fld, flen); + if (dyn_esc) ray_sys_free(dyn_esc); + } + if (p < buf_end && *p == '\r') p++; + if (p < buf_end && *p == '\n') p++; + } else if (col_names_in && n_names >= ncols) { + for (int c = 0; c < ncols; c++) + col_name_ids[c] = col_names_in[c]; + } else { + for (int c = 0; c < ncols; c++) { + char name[32]; + snprintf(name, sizeof(name), "V%d", c + 1); + col_name_ids[c] = ray_sym_intern(name, strlen(name)); + } + } + + size_t data_offset = (size_t)(p - buf); + bool data_has_quotes = memchr(buf + data_offset, '"', file_size - data_offset) != NULL; + if (trace) { + fprintf(stderr, + "csv.parted: file=%s size=%zu ncols=%d data_offset=%zu rows_per_part=%" PRId64 " root=%s table=%s\n", + path, file_size, ncols, data_offset, rows_per_part, root, table_name); + } + + int8_t resolved_types[CSV_MAX_COLS]; + if (col_types_in && n_types >= ncols) { + for (int c = 0; c < ncols; c++) { + int8_t t = col_types_in[c]; + if (t < RAY_BOOL || t >= RAY_TYPE_COUNT || t == RAY_TABLE) { + munmap(buf, file_size); + return RAY_ERR_TYPE; + } + resolved_types[c] = t; + } + } else if (!col_types_in) { + ray_t* sample_offsets_hdr = NULL; + int64_t* sample_offsets = NULL; + int64_t sample_n = build_row_offsets_limited(buf, file_size, data_offset, + CSV_SAMPLE_ROWS, + data_has_quotes, + &sample_offsets, + &sample_offsets_hdr, + NULL); + csv_type_t 
col_types[CSV_MAX_COLS]; + memset(col_types, 0, (size_t)ncols * sizeof(csv_type_t)); + uint32_t text_hashes[CSV_MAX_COLS][CSV_SAMPLE_ROWS] = {{0}}; + uint16_t text_lens[CSV_MAX_COLS][CSV_SAMPLE_ROWS] = {{0}}; + uint16_t text_distinct[CSV_MAX_COLS] = {0}; + uint16_t text_non_null[CSV_MAX_COLS] = {0}; + for (int64_t r = 0; r < sample_n; r++) { + const char* rp = buf + sample_offsets[r]; + for (int c = 0; c < ncols; c++) { + const char* fld; + size_t flen; + char* dyn_esc = NULL; + rp = scan_field(rp, buf_end, delimiter, &fld, &flen, esc_buf, &dyn_esc); + csv_type_t t = detect_type(fld, flen); + if (t == CSV_TYPE_STR) + csv_cardinality_note(text_hashes, text_lens, + text_distinct, text_non_null, + c, fld, flen); + if (dyn_esc) ray_sys_free(dyn_esc); + col_types[c] = promote_csv_type(col_types[c], t); + } + } + scratch_free(sample_offsets_hdr); + for (int c = 0; c < ncols; c++) { + resolved_types[c] = csv_resolve_inferred_type( + col_types[c], text_distinct[c], text_non_null[c]); + } + } else { + munmap(buf, file_size); + return RAY_ERR_TYPE; + } + + err = ray_mkdir_p(root); + if (err != RAY_OK) { + munmap(buf, file_size); + return err; + } + + int64_t part = 0; + size_t chunk_offset = data_offset; + bool wrote_any = false; + while (chunk_offset < file_size || !wrote_any) { + ray_t* row_offsets_hdr = NULL; + int64_t* row_offsets = NULL; + size_t next_offset = chunk_offset; + int64_t cnt = 0; + if (chunk_offset < file_size) { + cnt = build_row_offsets_limited(buf, file_size, chunk_offset, + rows_per_part, data_has_quotes, + &row_offsets, + &row_offsets_hdr, &next_offset); + if (cnt <= 0) { + if (trace) + fprintf(stderr, "csv.parted: row-offset failure part=%" PRId64 " offset=%zu\n", + part, chunk_offset); + scratch_free(row_offsets_hdr); + err = RAY_ERR_IO; + break; + } + } + + ray_t* tbl = csv_materialize_rows(buf, file_size, row_offsets, + cnt, ncols, delimiter, col_name_ids, resolved_types); + if (!tbl || RAY_IS_ERR(tbl)) { + if (tbl) ray_release(tbl); + 
scratch_free(row_offsets_hdr); + err = RAY_ERR_OOM; + if (trace) + fprintf(stderr, "csv.parted: materialize failure part=%" PRId64 " rows=%" PRId64 "\n", + part, cnt); + break; + } + + char leaf[1024]; + int n = snprintf(leaf, sizeof(leaf), "%s/%" PRId64 "/%s", root, part, table_name); + if (n < 0 || (size_t)n >= sizeof(leaf)) { + ray_release(tbl); + scratch_free(row_offsets_hdr); + err = RAY_ERR_RANGE; + break; + } + + if (trace) + fprintf(stderr, "csv.parted: save part=%" PRId64 " rows=%" PRId64 " leaf=%s\n", + part, cnt, leaf); + err = ray_splay_save_bulk(tbl, leaf, NULL); + ray_release(tbl); + scratch_free(row_offsets_hdr); + if (err != RAY_OK) { + if (trace) + fprintf(stderr, "csv.parted: save failure part=%" PRId64 " err=%s\n", + part, ray_err_code_str(err)); + break; + } + wrote_any = true; + if (cnt == 0) break; + chunk_offset = next_offset; + part++; + } + + if (err == RAY_OK) { + char sym_path[1024]; + int n = snprintf(sym_path, sizeof(sym_path), "%s/sym", root); + if (n < 0 || (size_t)n >= sizeof(sym_path)) + err = RAY_ERR_RANGE; + else { + if (trace) + fprintf(stderr, "csv.parted: save sym=%s parts=%" PRId64 "\n", + sym_path, part); + err = ray_sym_save_bulk(sym_path); + if (trace && err != RAY_OK) + fprintf(stderr, "csv.parted: sym save failure err=%s\n", + ray_err_code_str(err)); + } + } + + munmap(buf, file_size); + if (trace) + fprintf(stderr, "csv.parted: done err=%s\n", ray_err_code_str(err)); + return err; } /* -------------------------------------------------------------------------- diff --git a/src/io/csv.h b/src/io/csv.h index 2240ae4b..1597d180 100644 --- a/src/io/csv.h +++ b/src/io/csv.h @@ -29,6 +29,18 @@ ray_t* ray_read_csv(const char* path); ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header, const int8_t* col_types, int32_t n_types); +ray_t* ray_read_csv_named_opts(const char* path, char delimiter, bool header, + const int8_t* col_types, int32_t n_types, + const int64_t* col_names, int32_t n_names); +ray_err_t 
ray_csv_save_parted_named_opts(const char* path, char delimiter, bool header, + const int8_t* col_types, int32_t n_types, + const int64_t* col_names, int32_t n_names, + const char* root, const char* table_name, + int64_t rows_per_part); +ray_err_t ray_csv_save_splayed_named_opts(const char* path, char delimiter, bool header, + const int8_t* col_types, int32_t n_types, + const int64_t* col_names, int32_t n_names, + const char* dir, int64_t rows_per_chunk); ray_err_t ray_write_csv(ray_t* table, const char* path); #endif /* RAY_CSV_H */ diff --git a/src/lang/eval.c b/src/lang/eval.c index 0ebe7105..93d82420 100644 --- a/src/lang/eval.c +++ b/src/lang/eval.c @@ -54,6 +54,25 @@ #define RAY_EVAL_MAX_DEPTH 512 _Thread_local static int eval_depth = 0; +typedef struct { + const ray_t* vec; + const void* data; + int64_t len; + int8_t type; + uint8_t attrs; + int is_f64; + int64_t sum_i; + double sum_f; +} affine_sum_cache_entry_t; + +#define AFFINE_SUM_CACHE_N 16 +static _Thread_local affine_sum_cache_entry_t affine_sum_cache[AFFINE_SUM_CACHE_N]; +static _Thread_local uint8_t affine_sum_cache_n = 0; + +static void affine_sum_cache_clear(void) { + affine_sum_cache_n = 0; +} + /* Thread-local nfo for eval context — tracks source locations during evaluation */ static _Thread_local ray_t* g_eval_nfo = NULL; @@ -111,6 +130,128 @@ static ray_t* materialize_owned_args(ray_t** args, int64_t n) { return NULL; } +static int64_t numeric_atom_i64(ray_t* x) { + switch (x->type) { + case -RAY_I64: + case -RAY_TIMESTAMP: + case -RAY_SYM: + return x->i64; + case -RAY_I32: + case -RAY_DATE: + case -RAY_TIME: + return x->i32; + case -RAY_I16: + return x->i16; + case -RAY_U8: + case -RAY_BOOL: + return x->u8; + default: + return (int64_t)as_f64(x); + } +} + +static ray_t* materialize_owned_value(ray_t* x) { + if (x && ray_is_lazy(x)) + x = ray_lazy_materialize(x); + return x; +} + +static int affine_sum_cache_lookup(ray_t* vec, affine_sum_cache_entry_t* out) { + const void* data = 
ray_data(vec); + for (uint8_t i = 0; i < affine_sum_cache_n; i++) { + affine_sum_cache_entry_t* e = &affine_sum_cache[i]; + if (e->vec == vec && e->data == data && e->len == vec->len && + e->type == vec->type && e->attrs == vec->attrs) { + *out = *e; + return 1; + } + } + return 0; +} + +static void affine_sum_cache_store(ray_t* vec, const affine_sum_cache_entry_t* in) { + if (affine_sum_cache_n >= AFFINE_SUM_CACHE_N) return; + affine_sum_cache[affine_sum_cache_n++] = *in; +} + +static ray_t* try_sum_affine_expr(ray_t* expr, int* handled) { + *handled = 0; + if (!expr || expr->type != RAY_LIST || ray_len(expr) != 3) + return NULL; + ray_t** e = (ray_t**)ray_data(expr); + if (!e[0] || e[0]->type != -RAY_SYM) + return NULL; + ray_t* name = ray_sym_str(e[0]->i64); + if (!name || ray_str_len(name) != 1 || ray_str_ptr(name)[0] != '+') + return NULL; + + ray_t* vec_expr = NULL; + ray_t* c_expr = NULL; + if (ray_is_atom(e[1]) && is_numeric(e[1])) { + c_expr = e[1]; + vec_expr = e[2]; + } else if (ray_is_atom(e[2]) && is_numeric(e[2])) { + vec_expr = e[1]; + c_expr = e[2]; + } else { + return NULL; + } + + ray_t* vec = ray_eval(vec_expr); + if (!vec || RAY_IS_ERR(vec)) + return vec ? vec : ray_error("type", NULL); + vec = materialize_owned_value(vec); + if (!vec || RAY_IS_ERR(vec)) + return vec ? vec : ray_error("type", NULL); + if (!ray_is_vec(vec)) { + ray_release(vec); + *handled = 0; + return NULL; + } + if (vec->attrs & RAY_ATTR_HAS_NULLS) { + ray_release(vec); + *handled = 0; + return NULL; + } + + *handled = 1; + affine_sum_cache_entry_t ce; + if (!affine_sum_cache_lookup(vec, &ce)) { + ray_t* sum = ray_sum_fn(vec); + if (!sum || RAY_IS_ERR(sum)) { + ray_release(vec); + return sum ? sum : ray_error("type", NULL); + } + sum = materialize_owned_value(sum); + if (!sum || RAY_IS_ERR(sum)) { + ray_release(vec); + return sum ? 
sum : ray_error("type", NULL); + } + + memset(&ce, 0, sizeof(ce)); + ce.vec = vec; + ce.data = ray_data(vec); + ce.len = vec->len; + ce.type = vec->type; + ce.attrs = vec->attrs; + ce.is_f64 = (sum->type == -RAY_F64); + if (ce.is_f64) ce.sum_f = as_f64(sum); + else ce.sum_i = numeric_atom_i64(sum); + ray_release(sum); + affine_sum_cache_store(vec, &ce); + } + + int64_t n = vec->len; + ray_release(vec); + if (ce.is_f64 || c_expr->type == -RAY_F64) { + double base = ce.is_f64 ? ce.sum_f : (double)ce.sum_i; + double out = base + as_f64(c_expr) * (double)n; + return make_f64(out); + } + int64_t out = ce.sum_i + numeric_atom_i64(c_expr) * n; + return make_i64(out); +} + /* ══════════════════════════════════════════ * Error handling: try / raise * ══════════════════════════════════════════ */ @@ -240,7 +381,59 @@ static ray_t* zero_atom_for_elem_type(ray_t* coll) { /* Map a binary function element-wise over collections. * Both args can be collections (zip-map) or one scalar (broadcast). * Produces typed vectors when output is numeric/bool, boxed lists otherwise. */ +static ray_t* atomic_map_binary_parted(ray_binary_fn fn, uint16_t dag_opcode, + ray_t* left, ray_t* right) { + int left_parted = left && !RAY_IS_ERR(left) && RAY_IS_PARTED(left->type); + int right_parted = right && !RAY_IS_ERR(right) && RAY_IS_PARTED(right->type); + int64_t nseg = left_parted ? left->len : right->len; + if (left_parted && right_parted && right->len < nseg) nseg = right->len; + + ray_t* out = ray_alloc((size_t)nseg * sizeof(ray_t*)); + if (!out) return ray_error("oom", NULL); + out->type = RAY_LIST; + out->len = 0; + out->attrs = 0; + memset(out->nullmap, 0, sizeof(out->nullmap)); + ray_t** dst = (ray_t**)ray_data(out); + ray_t** lsegs = left_parted ? (ray_t**)ray_data(left) : NULL; + ray_t** rsegs = right_parted ? (ray_t**)ray_data(right) : NULL; + int8_t out_base = 0; + + for (int64_t s = 0; s < nseg; s++) { + ray_t* l = left_parted ? lsegs[s] : left; + ray_t* r = right_parted ? 
rsegs[s] : right; + if (!l || !r) { + ray_release(out); + return ray_error("type", NULL); + } + ray_t* seg = atomic_map_binary_op(fn, dag_opcode, l, r); + if (!seg || RAY_IS_ERR(seg)) { + ray_release(out); + return seg ? seg : ray_error("domain", NULL); + } + if (!ray_is_vec(seg)) { + ray_release(seg); + ray_release(out); + return ray_error("type", NULL); + } + if (s == 0) out_base = seg->type; + if (seg->type != out_base) { + ray_release(seg); + ray_release(out); + return ray_error("type", NULL); + } + dst[s] = seg; + out->len = s + 1; + } + out->type = RAY_PARTED_BASE + out_base; + return out; +} + ray_t* atomic_map_binary_op(ray_binary_fn fn, uint16_t dag_opcode, ray_t* left, ray_t* right) { + if ((left && !RAY_IS_ERR(left) && RAY_IS_PARTED(left->type)) || + (right && !RAY_IS_ERR(right) && RAY_IS_PARTED(right->type))) + return atomic_map_binary_parted(fn, dag_opcode, left, right); + int left_coll = is_collection(left); int right_coll = is_collection(right); @@ -618,8 +811,7 @@ ray_t* atomic_map_binary_op(ray_binary_fn fn, uint16_t dag_opcode, ray_t* left, for (int64_t i = 0; i < n; i++) obuf[i] = fill; } else if (!atom_null && !vec_has_nulls) { /* Hot path: tight per-width loop, no per-element - * null checks. This is what ClickBench Q22..Q38 - * with R6-cleaned columns actually hit. */ + * null checks. */ uint8_t w = (uint8_t)(va & RAY_SYM_W_MASK); if (w == RAY_SYM_W8) { const uint8_t* d = (const uint8_t*)src; @@ -959,6 +1151,15 @@ ray_t* gather_by_idx(ray_t* vec, int64_t* idx, int64_t n) { if (ray_vec_is_null(vec, idx[i])) ray_vec_set_null(result, i, true); } + const ray_t* dict_owner = (vec->attrs & RAY_ATTR_SLICE) ? 
vec->slice_parent : vec; + if (dict_owner && + !(dict_owner->attrs & RAY_ATTR_SLICE) && + (!(dict_owner->attrs & RAY_ATTR_HAS_NULLS) || + (dict_owner->attrs & RAY_ATTR_NULLMAP_EXT)) && + dict_owner->sym_dict) { + ray_retain(dict_owner->sym_dict); + result->sym_dict = dict_owner->sym_dict; + } return result; } @@ -1294,7 +1495,9 @@ ray_t* ray_do_fn(ray_t** args, int64_t n) { if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL); ray_t* result = NULL; for (int64_t i = 0; i < n; i++) { - if (result) ray_release(result); + if (result) { + ray_release(result); + } result = ray_eval(args[i]); if (RAY_IS_ERR(result)) { ray_env_pop_scope(); @@ -2376,8 +2579,10 @@ static void ray_register_builtins(void) { register_vary("format", RAY_FN_NONE, ray_format_fn); register_vary("read-csv", RAY_FN_RESTRICTED, ray_read_csv_fn); register_vary("write-csv", RAY_FN_RESTRICTED, ray_write_csv_fn); - register_vary(".csv.read", RAY_FN_RESTRICTED, ray_read_csv_fn); - register_vary(".csv.write", RAY_FN_RESTRICTED, ray_write_csv_fn); + register_vary(".csv.read", RAY_FN_RESTRICTED, ray_read_csv_fn); + register_vary(".csv.splayed", RAY_FN_RESTRICTED, ray_read_csv_splayed_fn); + register_vary(".csv.parted", RAY_FN_RESTRICTED, ray_read_csv_parted_fn); + register_vary(".csv.write", RAY_FN_RESTRICTED, ray_write_csv_fn); register_binary("as", RAY_FN_NONE, ray_cast_fn); register_unary("type", RAY_FN_NONE, ray_type_fn); register_unary("read", RAY_FN_RESTRICTED, ray_read_file_fn); @@ -2623,6 +2828,7 @@ static void ray_register_builtins(void) { register_vary (".graph.shortest-path", RAY_FN_NONE, ray_graph_shortest_path_fn); register_vary (".graph.expand", RAY_FN_NONE, ray_graph_expand_fn); register_vary (".graph.var-expand", RAY_FN_NONE, ray_graph_var_expand_fn); + register_unary("strlen", RAY_FN_NONE | RAY_FN_LAZY_AWARE, ray_strlen_fn); } /* ══════════════════════════════════════════ @@ -2651,6 +2857,9 @@ void ray_lang_destroy(void) { ray_t* ray_eval(ray_t* obj) { if (!obj || 
RAY_IS_ERR(obj)) return obj; + if (eval_depth == 0) + affine_sum_cache_clear(); + /* Check for external interrupt (e.g. Ctrl-C from REPL) */ if (g_eval_interrupted) return ray_error("limit", "interrupted"); @@ -2721,6 +2930,24 @@ ray_t* ray_eval(ray_t* obj) { if (fn_is_restricted(head)) { ray_release(head); ret = ray_error("access", "restricted"); goto out; } ray_unary_fn fn = (ray_unary_fn)(uintptr_t)head->i64; uint8_t fn_attrs = head->attrs; + if (fn == (ray_unary_fn)ray_sum_fn) { + int handled = 0; + ray_t* fast = try_sum_affine_expr(elems[1], &handled); + if (handled) { + ray_release(head); + ret = fast ? fast : ray_error("type", NULL); + goto out; + } + } + if (fn == (ray_unary_fn)ray_count_fn) { + int handled = 0; + ray_t* fast = ray_try_count_select_expr(elems[1], &handled); + if (handled) { + ray_release(head); + ret = fast ? fast : ray_error("type", NULL); + goto out; + } + } ray_t* arg = ray_eval(elems[1]); ray_release(head); if (arg && RAY_IS_ERR(arg)) { ret = arg; goto out; } diff --git a/src/lang/internal.h b/src/lang/internal.h index cbb82ed6..482d42fa 100644 --- a/src/lang/internal.h +++ b/src/lang/internal.h @@ -358,6 +358,7 @@ ray_t* ray_enlist_fn(ray_t** args, int64_t n); /* String builtins (formerly static in eval.c, now in str_builtin.c) */ ray_t* ray_split_fn(ray_t* str, ray_t* delim); +ray_t* ray_strlen_fn(ray_t* x); ray_t* ray_like_fn(ray_t* x, ray_t* pattern); ray_t* ray_sym_name_fn(ray_t* x); @@ -479,6 +480,8 @@ ray_t* ray_resolve_fn(ray_t** args, int64_t n); ray_t* ray_timeit_fn(ray_t** args, int64_t n); ray_t* ray_exit_fn(ray_t* arg); ray_t* ray_read_csv_fn(ray_t** args, int64_t n); +ray_t* ray_read_csv_splayed_fn(ray_t** args, int64_t n); +ray_t* ray_read_csv_parted_fn(ray_t** args, int64_t n); ray_t* ray_write_csv_fn(ray_t** args, int64_t n); ray_t* ray_cast_fn(ray_t* type_sym, ray_t* val); ray_t* ray_type_fn(ray_t* val); @@ -495,6 +498,7 @@ ray_t* ray_within_fn(ray_t* vals, ray_t* range); /* Query bridge builtins (formerly in eval.c, 
now in ops/query.c) */ ray_t* ray_select_fn(ray_t** args, int64_t n); +ray_t* ray_try_count_select_expr(ray_t* expr, int* handled); ray_t* ray_update_fn(ray_t** args, int64_t n); ray_t* ray_insert_fn(ray_t** args, int64_t n); ray_t* ray_upsert_fn(ray_t** args, int64_t n); diff --git a/src/mem/heap.c b/src/mem/heap.c index 75b7ca10..8af6d506 100644 --- a/src/mem/heap.c +++ b/src/mem/heap.c @@ -936,6 +936,12 @@ void ray_free(ray_t* v) { if (v->type > 0 && v->type < RAY_TYPE_COUNT) { uint8_t esz = ray_sym_elem_size(v->type, v->attrs); size_t data_size = 32 + (size_t)v->len * esz; + if (v->type == RAY_STR) { + size_t pool_len = 0; + if (v->str_pool && !RAY_IS_ERR(v->str_pool) && v->str_pool->len > 0) + pool_len = (size_t)v->str_pool->len; + data_size += 32 + pool_len; + } if (v->attrs & RAY_ATTR_NULLMAP_EXT) data_size += ((size_t)v->len + 7) / 8; size_t mapped_size = (data_size + 4095) & ~(size_t)4095; @@ -1427,6 +1433,25 @@ void ray_heap_gc(void) { /* Don't increment p — check swapped entry */ } } + + /* Phase 5: Release physical pages from free blocks in every + * idle heap. Phase 2 may have returned blocks to worker-owned + * freelists; releasing only the caller heap leaves those worker + * pages resident across large query repetitions. 
*/ + for (int hid = 0; hid < RAY_HEAP_REGISTRY_SIZE; hid++) { + ray_heap_t* gh = ray_heap_registry[hid]; + if (!gh) continue; + for (int i = 13; i < RAY_HEAP_FL_SIZE; i++) { + ray_fl_head_t* head = &gh->freelist[i]; + ray_t* blk = head->fl_next; + while (blk != (ray_t*)head) { + size_t bsize = BSIZEOF(i); + if (bsize > 4096) + ray_vm_release((char*)blk + 4096, bsize - 4096); + blk = blk->fl_next; + } + } + } } } diff --git a/src/ops/agg.c b/src/ops/agg.c index 39052e60..543689ef 100644 --- a/src/ops/agg.c +++ b/src/ops/agg.c @@ -89,8 +89,135 @@ static void nth_element_dbl(double* a, int64_t lo, int64_t hi, int64_t k) { return ray_lazy_wrap(g, op); \ } while(0) +static int agg_parted_numeric_base(int8_t t) { + return t == RAY_BOOL || t == RAY_U8 || t == RAY_I16 || + t == RAY_I32 || t == RAY_I64 || t == RAY_F64 || + t == RAY_DATE || t == RAY_TIME || t == RAY_TIMESTAMP; +} + +static int64_t agg_read_i64(ray_t* v, int64_t i) { + void* d = ray_data(v); + switch (v->type) { + case RAY_BOOL: + case RAY_U8: return ((uint8_t*)d)[i]; + case RAY_I16: return ((int16_t*)d)[i]; + case RAY_I32: + case RAY_DATE: + case RAY_TIME: return ((int32_t*)d)[i]; + case RAY_I64: + case RAY_TIMESTAMP: return ((int64_t*)d)[i]; + default: return 0; + } +} + +static ray_t* agg_atom_i64_for_type(int8_t t, int64_t v) { + switch (t) { + case RAY_BOOL: return ray_bool(v != 0); + case RAY_U8: return ray_u8((uint8_t)v); + case RAY_I16: return ray_i16((int16_t)v); + case RAY_I32: return ray_i32((int32_t)v); + case RAY_DATE: return ray_date(v); + case RAY_TIME: return ray_time(v); + case RAY_TIMESTAMP: return ray_timestamp(v); + default: return ray_i64(v); + } +} + +static ray_t* agg_parted_sum(ray_t* x) { + int8_t base = (int8_t)RAY_PARTED_BASETYPE(x->type); + if (!agg_parted_numeric_base(base) || base == RAY_DATE) + return ray_error("type", NULL); + ray_t** segs = (ray_t**)ray_data(x); + if (base == RAY_F64) { + double sum = 0.0; + for (int64_t s = 0; s < x->len; s++) { + ray_t* seg = segs[s]; + if 
(!seg) continue; + double* d = (double*)ray_data(seg); + int has_nulls = (seg->attrs & RAY_ATTR_HAS_NULLS) != 0; + for (int64_t i = 0; i < seg->len; i++) + if (!has_nulls || !ray_vec_is_null(seg, i)) sum += d[i]; + } + return make_f64(sum); + } + int64_t sum = 0; + for (int64_t s = 0; s < x->len; s++) { + ray_t* seg = segs[s]; + if (!seg) continue; + int has_nulls = (seg->attrs & RAY_ATTR_HAS_NULLS) != 0; + for (int64_t i = 0; i < seg->len; i++) + if (!has_nulls || !ray_vec_is_null(seg, i)) sum += agg_read_i64(seg, i); + } + if (base == RAY_TIME) return ray_time(sum); + if (base == RAY_TIMESTAMP) return ray_timestamp(sum); + return make_i64(sum); +} + +static ray_t* agg_parted_avg(ray_t* x) { + int8_t base = (int8_t)RAY_PARTED_BASETYPE(x->type); + if (!agg_parted_numeric_base(base)) return ray_error("type", NULL); + ray_t** segs = (ray_t**)ray_data(x); + double sum = 0.0; + int64_t cnt = 0; + for (int64_t s = 0; s < x->len; s++) { + ray_t* seg = segs[s]; + if (!seg) continue; + int has_nulls = (seg->attrs & RAY_ATTR_HAS_NULLS) != 0; + if (base == RAY_F64) { + double* d = (double*)ray_data(seg); + for (int64_t i = 0; i < seg->len; i++) { + if (has_nulls && ray_vec_is_null(seg, i)) continue; + sum += d[i]; cnt++; + } + } else { + for (int64_t i = 0; i < seg->len; i++) { + if (has_nulls && ray_vec_is_null(seg, i)) continue; + sum += (double)agg_read_i64(seg, i); cnt++; + } + } + } + if (cnt == 0) return ray_typed_null(-RAY_F64); + return make_f64(sum / (double)cnt); +} + +static ray_t* agg_parted_minmax(ray_t* x, int want_max) { + int8_t base = (int8_t)RAY_PARTED_BASETYPE(x->type); + if (!agg_parted_numeric_base(base)) return ray_error("type", NULL); + ray_t** segs = (ray_t**)ray_data(x); + int found = 0; + double best_f = 0.0; + int64_t best_i = 0; + for (int64_t s = 0; s < x->len; s++) { + ray_t* seg = segs[s]; + if (!seg) continue; + int has_nulls = (seg->attrs & RAY_ATTR_HAS_NULLS) != 0; + if (base == RAY_F64) { + double* d = (double*)ray_data(seg); + for (int64_t 
i = 0; i < seg->len; i++) { + if (has_nulls && ray_vec_is_null(seg, i)) continue; + double v = d[i]; + if (!found || (want_max ? v > best_f : v < best_f)) { + best_f = v; found = 1; + } + } + } else { + for (int64_t i = 0; i < seg->len; i++) { + if (has_nulls && ray_vec_is_null(seg, i)) continue; + int64_t v = agg_read_i64(seg, i); + if (!found || (want_max ? v > best_i : v < best_i)) { + best_i = v; found = 1; + } + } + } + } + if (!found) return ray_typed_null(-base); + if (base == RAY_F64) return make_f64(best_f); + return agg_atom_i64_for_type(base, best_i); +} + ray_t* ray_sum_fn(ray_t* x) { if (ray_is_lazy(x)) return ray_lazy_append(x, OP_SUM); + if (RAY_IS_PARTED(x->type)) return agg_parted_sum(x); if (ray_is_atom(x)) { /* u8/i16 scalar sum promotes to i64 */ if (x->type == -RAY_U8) return make_i64((int64_t)x->u8); @@ -175,6 +302,7 @@ ray_t* ray_count_fn(ray_t* x) { ray_t* ray_avg_fn(ray_t* x) { if (ray_is_lazy(x)) return ray_lazy_append(x, OP_AVG); + if (RAY_IS_PARTED(x->type)) return agg_parted_avg(x); if (ray_is_atom(x)) { if (RAY_ATOM_IS_NULL(x)) return ray_typed_null(-RAY_F64); if (is_numeric(x)) return make_f64(as_f64(x)); @@ -198,6 +326,7 @@ ray_t* ray_avg_fn(ray_t* x) { ray_t* ray_min_fn(ray_t* x) { if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MIN); + if (RAY_IS_PARTED(x->type)) return agg_parted_minmax(x, 0); if (ray_is_atom(x)) { ray_retain(x); return x; } if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_min_op); if (!is_list(x)) return ray_error("type", NULL); @@ -219,6 +348,7 @@ ray_t* ray_min_fn(ray_t* x) { ray_t* ray_max_fn(ray_t* x) { if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MAX); + if (RAY_IS_PARTED(x->type)) return agg_parted_minmax(x, 1); if (ray_is_atom(x)) { ray_retain(x); return x; } if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_max_op); if (!is_list(x)) return ray_error("type", NULL); diff --git a/src/ops/arith.c b/src/ops/arith.c index 29521ad1..129c48e8 100644 --- a/src/ops/arith.c +++ b/src/ops/arith.c @@ -22,11 +22,14 @@ */ 
#include "lang/internal.h" +#include "ops/ops.h" /* Arithmetic builtins (atom-only). * Vector dispatch goes through the DAG executor. */ ray_t* ray_add_fn(ray_t* a, ray_t* b) { + if ((a && RAY_IS_PARTED(a->type)) || (b && RAY_IS_PARTED(b->type))) + return atomic_map_binary_op(ray_add_fn, OP_ADD, a, b); /* Vector fast path — only when at least one operand is a typed vector */ /* Temporal + integer arithmetic (only int types, not float) */ @@ -91,6 +94,8 @@ ray_t* ray_add_fn(ray_t* a, ray_t* b) { } ray_t* ray_sub_fn(ray_t* a, ray_t* b) { + if ((a && RAY_IS_PARTED(a->type)) || (b && RAY_IS_PARTED(b->type))) + return atomic_map_binary_op(ray_sub_fn, OP_SUB, a, b); /* Temporal - int null propagation (both operands) */ if (is_temporal(a) && is_numeric(b)) { @@ -161,6 +166,8 @@ ray_t* ray_sub_fn(ray_t* a, ray_t* b) { } ray_t* ray_mul_fn(ray_t* a, ray_t* b) { + if ((a && RAY_IS_PARTED(a->type)) || (b && RAY_IS_PARTED(b->type))) + return atomic_map_binary_op(ray_mul_fn, OP_MUL, a, b); /* int * TIME → TIME, TIME * int → TIME */ if (is_numeric(a) && b->type == -RAY_TIME) { @@ -186,6 +193,8 @@ ray_t* ray_mul_fn(ray_t* a, ray_t* b) { } ray_t* ray_div_fn(ray_t* a, ray_t* b) { + if ((a && RAY_IS_PARTED(a->type)) || (b && RAY_IS_PARTED(b->type))) + return atomic_map_binary_op(ray_div_fn, OP_DIV, a, b); if (!is_numeric(a) || !is_numeric(b)) return ray_error("type", "cannot divide %s by %s", ray_type_name(a->type), ray_type_name(b->type)); @@ -213,6 +222,8 @@ ray_t* ray_idiv_fn(ray_t* a, ray_t* b) { } ray_t* ray_mod_fn(ray_t* a, ray_t* b) { + if ((a && RAY_IS_PARTED(a->type)) || (b && RAY_IS_PARTED(b->type))) + return atomic_map_binary_op(ray_mod_fn, OP_MOD, a, b); /* Temporal % numeric → temporal (same type as left operand) */ if (is_temporal(a) && is_numeric(b)) { if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) diff --git a/src/ops/builtins.c b/src/ops/builtins.c index 0dd220e8..076961c5 100644 --- a/src/ops/builtins.c +++ b/src/ops/builtins.c @@ -34,6 +34,8 @@ #include 
"core/types.h" #include "io/csv.h" #include "ops/ops.h" +#include "store/part.h" +#include "store/splay.h" #include "table/sym.h" #include "core/profile.h" #include "mem/sys.h" @@ -470,12 +472,18 @@ static int8_t resolve_type_name(int64_t sym_id) { } ray_t* ray_read_csv_fn(ray_t** args, int64_t n) { - if (n < 1) return ray_error("domain", NULL); + if (n < 1 || n > 3) return ray_error("domain", NULL); - /* (read-csv [types] "path") or (read-csv "path") */ + /* (read-csv [types] "path"), (read-csv [names] [types] "path"), or (read-csv "path") */ ray_t* path_obj = NULL; ray_t* schema = NULL; - if (n >= 2 && ray_is_vec(args[0]) && args[0]->type == RAY_SYM) { + ray_t* names = NULL; + if (n >= 3 && ray_is_vec(args[0]) && args[0]->type == RAY_SYM && + ray_is_vec(args[1]) && args[1]->type == RAY_SYM) { + names = args[0]; + schema = args[1]; + path_obj = args[2]; + } else if (n >= 2 && ray_is_vec(args[0]) && args[0]->type == RAY_SYM) { schema = args[0]; path_obj = args[1]; } else { @@ -493,21 +501,217 @@ ray_t* ray_read_csv_fn(ray_t** args, int64_t n) { int64_t ncols = schema->len; int8_t col_types[256]; if (ncols > 256) return ray_error("limit", NULL); - int64_t* sym_ids = (int64_t*)ray_data(schema); + void* sym_data = ray_data(schema); for (int64_t i = 0; i < ncols; i++) { - col_types[i] = resolve_type_name(sym_ids[i]); + int64_t sid = ray_read_sym(sym_data, i, schema->type, schema->attrs); + col_types[i] = resolve_type_name(sid); if (col_types[i] < 0) return ray_error("type", NULL); } - ray_t* tbl = ray_read_csv_opts(path, 0, true, col_types, (int32_t)ncols); - if (!tbl || RAY_IS_ERR(tbl)) return ray_error("io", NULL); + int64_t col_names[256]; + if (names) { + if (names->len != ncols) return ray_error("length", NULL); + void* name_data = ray_data(names); + for (int64_t i = 0; i < ncols; i++) + col_names[i] = ray_read_sym(name_data, i, names->type, names->attrs); + } + ray_t* tbl = names + ? 
ray_read_csv_named_opts(path, 0, false, col_types, (int32_t)ncols, + col_names, (int32_t)ncols) + : ray_read_csv_opts(path, 0, true, col_types, (int32_t)ncols); + if (!tbl) return ray_error("io", NULL); + if (RAY_IS_ERR(tbl)) return tbl; return tbl; } ray_t* tbl = ray_read_csv(path); - if (!tbl || RAY_IS_ERR(tbl)) return ray_error("io", NULL); + if (!tbl) return ray_error("io", NULL); + if (RAY_IS_ERR(tbl)) return tbl; return tbl; } +static const char* csv_str_arg(ray_t* s, char* buf, size_t bufsz) { + if (!s || s->type != -RAY_STR) return NULL; + const char* p = ray_str_ptr(s); + size_t len = ray_str_len(s); + if (!p || len == 0 || len >= bufsz) return NULL; + memcpy(buf, p, len); + buf[len] = '\0'; + return buf; +} + +static const char* csv_sym_arg(ray_t* s, char* buf, size_t bufsz) { + if (!s || s->type != -RAY_SYM) return NULL; + ray_t* text = ray_sym_str(s->i64); + if (!text) return NULL; + const char* p = ray_str_ptr(text); + size_t len = ray_str_len(text); + if (!p || len == 0 || len >= bufsz) { + ray_release(text); + return NULL; + } + memcpy(buf, p, len); + buf[len] = '\0'; + ray_release(text); + return buf; +} + +static int csv_valid_table_name(const char* name) { + return name && name[0] != '\0' && name[0] != '.' 
&& + !strchr(name, '/') && !strchr(name, '\\') && !strstr(name, ".."); +} + +static const char* csv_default_sym_path(const char* dir, char* buf, size_t bufsz) { + int n = snprintf(buf, bufsz, "%s/sym", dir); + if (n < 0 || (size_t)n >= bufsz) return NULL; + return buf; +} + +ray_t* ray_read_csv_splayed_fn(ray_t** args, int64_t n) { + if (n < 2 || n > 4) return ray_error("domain", NULL); + + char dir[1024]; + if (!csv_str_arg(args[n - 1], dir, sizeof(dir))) return ray_error("type", NULL); + + int64_t data_n = n - 1; + ray_t* path_obj = NULL; + ray_t* schema = NULL; + ray_t* names = NULL; + if (data_n >= 3 && ray_is_vec(args[0]) && args[0]->type == RAY_SYM && + ray_is_vec(args[1]) && args[1]->type == RAY_SYM) { + names = args[0]; + schema = args[1]; + path_obj = args[2]; + } else if (data_n >= 2 && ray_is_vec(args[0]) && args[0]->type == RAY_SYM) { + schema = args[0]; + path_obj = args[1]; + } else if (data_n == 1) { + path_obj = args[0]; + } else { + return ray_error("domain", NULL); + } + if (!path_obj || path_obj->type != -RAY_STR) return ray_error("type", NULL); + const char* path = ray_str_ptr(path_obj); + if (!path) return ray_error("domain", NULL); + + int8_t col_types[256]; + int64_t col_names[256]; + int32_t ncols = 0; + const int8_t* types_arg = NULL; + const int64_t* names_arg = NULL; + bool header = true; + + if (schema) { + if (schema->len > 256) return ray_error("limit", NULL); + ncols = (int32_t)schema->len; + void* sym_data = ray_data(schema); + for (int64_t i = 0; i < schema->len; i++) { + int64_t sid = ray_read_sym(sym_data, i, schema->type, schema->attrs); + col_types[i] = resolve_type_name(sid); + if (col_types[i] < 0) return ray_error("type", NULL); + } + types_arg = col_types; + if (names) { + if (names->len != schema->len) return ray_error("length", NULL); + void* name_data = ray_data(names); + for (int64_t i = 0; i < names->len; i++) + col_names[i] = ray_read_sym(name_data, i, names->type, names->attrs); + names_arg = col_names; + header = 
false; + } + } + + ray_err_t err = ray_csv_save_splayed_named_opts(path, 0, header, + types_arg, ncols, + names_arg, ncols, + dir, 0); + if (err != RAY_OK) return ray_error(ray_err_code_str(err), NULL); + + char sym_path[1024]; + const char* sym = csv_default_sym_path(dir, sym_path, sizeof(sym_path)); + if (!sym) return ray_error("io", NULL); + return ray_read_splayed(dir, sym); +} + +ray_t* ray_read_csv_parted_fn(ray_t** args, int64_t n) { + if (n < 3 || n > 6) return ray_error("domain", NULL); + + char root[1024]; + if (!csv_str_arg(args[n - 2], root, sizeof(root))) return ray_error("type", NULL); + + char table_name[256]; + if (!csv_sym_arg(args[n - 1], table_name, sizeof(table_name))) + return ray_error("type", NULL); + if (!csv_valid_table_name(table_name)) return ray_error("domain", NULL); + + int64_t rows_per_part = 0; + int64_t data_n = n - 2; + if (data_n >= 2 && is_numeric(args[n - 3])) { + rows_per_part = as_i64(args[n - 3]); + if (rows_per_part <= 0) return ray_error("domain", NULL); + data_n--; + } + ray_t* path_obj = NULL; + ray_t* schema = NULL; + ray_t* names = NULL; + if (data_n >= 3 && ray_is_vec(args[0]) && args[0]->type == RAY_SYM && + ray_is_vec(args[1]) && args[1]->type == RAY_SYM) { + names = args[0]; + schema = args[1]; + path_obj = args[2]; + } else if (data_n >= 2 && ray_is_vec(args[0]) && args[0]->type == RAY_SYM) { + schema = args[0]; + path_obj = args[1]; + } else if (data_n == 1) { + path_obj = args[0]; + } else { + return ray_error("domain", NULL); + } + + if (!path_obj || path_obj->type != -RAY_STR) return ray_error("type", NULL); + const char* path = ray_str_ptr(path_obj); + if (!path) return ray_error("domain", NULL); + + int8_t col_types[256]; + int64_t col_names[256]; + int32_t ncols = 0; + const int8_t* types_arg = NULL; + const int64_t* names_arg = NULL; + bool header = true; + + if (schema) { + if (schema->len > 256) return ray_error("limit", NULL); + ncols = (int32_t)schema->len; + void* sym_data = ray_data(schema); + for 
(int64_t i = 0; i < schema->len; i++) { + int64_t sid = ray_read_sym(sym_data, i, schema->type, schema->attrs); + col_types[i] = resolve_type_name(sid); + if (col_types[i] < 0) return ray_error("type", NULL); + } + types_arg = col_types; + if (names) { + if (names->len != schema->len) return ray_error("length", NULL); + void* name_data = ray_data(names); + for (int64_t i = 0; i < names->len; i++) + col_names[i] = ray_read_sym(name_data, i, names->type, names->attrs); + names_arg = col_names; + header = false; + } + } + + ray_err_t err = ray_csv_save_parted_named_opts(path, 0, header, + types_arg, ncols, + names_arg, ncols, + root, table_name, rows_per_part); + if (err != RAY_OK) return ray_error(ray_err_code_str(err), NULL); + + ray_t* out = ray_read_parted(root, table_name); + if (getenv("RAY_CSV_TRACE") && out && RAY_IS_ERR(out)) { + fprintf(stderr, "csv.parted: read_parted failure root=%s table=%s err=%s\n", + root, table_name, ray_err_code(out)); + } + return out; +} + /* (write-csv table path) — write table to CSV file */ ray_t* ray_write_csv_fn(ray_t** args, int64_t n) { if (n < 2) return ray_error("domain", NULL); diff --git a/src/ops/collection.c b/src/ops/collection.c index 1a5079ad..f8c51469 100644 --- a/src/ops/collection.c +++ b/src/ops/collection.c @@ -29,6 +29,7 @@ #include "mem/sys.h" #include "ops/hash.h" #include "ops/internal.h" /* col_propagate_str_pool */ +#include "ops/ops.h" #include #include @@ -693,6 +694,19 @@ int atom_eq(ray_t* a, ray_t* b) { /* Forward declaration */ ray_t* list_to_typed_vec(ray_t* list, int8_t orig_vec_type); +static void propagate_sym_dict(ray_t* dst, const ray_t* src) { + if (!dst || !src || dst->type != RAY_SYM || src->type != RAY_SYM) return; + const ray_t* owner = (src->attrs & RAY_ATTR_SLICE) ? 
src->slice_parent : src; + if (owner && + !(owner->attrs & RAY_ATTR_SLICE) && + (!(owner->attrs & RAY_ATTR_HAS_NULLS) || + (owner->attrs & RAY_ATTR_NULLMAP_EXT)) && + owner->sym_dict) { + ray_retain(owner->sym_dict); + dst->sym_dict = owner->sym_dict; + } +} + /* Eager vector dedup — called by the DAG executor's OP_DISTINCT case. * Factored out so the executor doesn't go through ray_distinct_fn, which * is now a lazy producer for vectors and would re-wrap into a chain. */ @@ -726,6 +740,45 @@ ray_t* distinct_vec_eager(ray_t* x) { return result; } +static ray_t* parted_to_flat_vec(ray_t* x) { + if (!x || !RAY_IS_PARTED(x->type)) return ray_error("type", NULL); + int8_t base = (int8_t)RAY_PARTED_BASETYPE(x->type); + ray_t** segs = (ray_t**)ray_data(x); + int64_t total = 0; + for (int64_t s = 0; s < x->len; s++) + if (segs[s]) total += segs[s]->len; + + ray_t* out = ray_vec_new(base, base == RAY_STR ? 0 : total); + if (!out || RAY_IS_ERR(out)) return out ? out : ray_error("oom", NULL); + if (base != RAY_STR) out->len = total; + + int64_t pos = 0; + for (int64_t s = 0; s < x->len; s++) { + ray_t* seg = segs[s]; + if (!seg) continue; + for (int64_t i = 0; i < seg->len; i++) { + if (base == RAY_STR) { + size_t slen = 0; + const char* sp = ray_str_vec_get(seg, i, &slen); + out = ray_str_vec_append(out, sp ? sp : "", sp ? slen : 0); + if (!out || RAY_IS_ERR(out)) return out ? out : ray_error("oom", NULL); + if (ray_vec_is_null(seg, i)) ray_vec_set_null(out, pos, true); + } else { + int allocated = 0; + ray_t* elem = collection_elem(seg, i, &allocated); + if (!elem || RAY_IS_ERR(elem) || store_typed_elem(out, pos, elem) != 0) { + if (allocated && elem && !RAY_IS_ERR(elem)) ray_release(elem); + ray_release(out); + return elem && RAY_IS_ERR(elem) ? elem : ray_error("type", NULL); + } + if (allocated) ray_release(elem); + } + pos++; + } + } + return out; +} + /* (distinct x) — remove duplicates. 
Dispatches on type: * table → deduplicate rows (via DAG GROUP with zero aggs) * vector → remove duplicate elements, preserving first occurrence @@ -739,6 +792,14 @@ ray_t* ray_distinct_fn(ray_t* x) { if (x->type == RAY_TABLE) return ray_table_distinct_fn(x); + if (RAY_IS_PARTED(x->type)) { + ray_t* flat = parted_to_flat_vec(x); + if (!flat || RAY_IS_ERR(flat)) return flat ? flat : ray_error("oom", NULL); + ray_t* out = distinct_vec_eager(flat); + ray_release(flat); + return out; + } + /* String distinct: unique chars, sorted */ if (ray_is_atom(x) && (-x->type) == RAY_STR) { const char* sp = ray_str_ptr(x); @@ -1224,7 +1285,8 @@ ray_t* ray_take_fn(ray_t* vec, ray_t* n_obj) { ray_t* taken = ray_take_fn(col, n_obj); if (RAY_IS_ERR(taken)) { ray_release(result); return taken; } result = ray_table_add_col(result, name_id, taken); - if (RAY_IS_ERR(result)) { ray_release(taken); return result; } + ray_release(taken); + if (RAY_IS_ERR(result)) return result; } return result; } @@ -1258,8 +1320,10 @@ ray_t* ray_take_fn(ray_t* vec, ray_t* n_obj) { if (end > len) end = len; int64_t count = end - start; int8_t vtype = vec->type; - int esz = ray_elem_size(vtype); - ray_t* result = ray_vec_new(vtype, count); + int esz = ray_sym_elem_size(vtype, vec->attrs); + ray_t* result = (vtype == RAY_SYM) + ? ray_sym_vec_new(vec->attrs & RAY_SYM_W_MASK, count) + : ray_vec_new(vtype, count); if (RAY_IS_ERR(result)) return result; result->len = count; memcpy(ray_data(result), (char*)ray_data(vec) + start * esz, (size_t)(count * esz)); @@ -1267,6 +1331,7 @@ ray_t* ray_take_fn(ray_t* vec, ray_t* n_obj) { * source's str_pool by pool_off — propagate the pool ray_t * (with retain) so the result owns a valid backing store. 
*/ if (vtype == RAY_STR) col_propagate_str_pool(result, vec); + if (vtype == RAY_SYM) propagate_sym_dict(result, vec); /* Propagate null bitmap — check parent's flag for slices */ bool has_nulls = (vec->attrs & RAY_ATTR_HAS_NULLS) || ((vec->attrs & RAY_ATTR_SLICE) && vec->slice_parent && @@ -1396,7 +1461,8 @@ ray_t* ray_take_fn(ray_t* vec, ray_t* n_obj) { ray_t* taken = ray_take_fn(col, n_obj); if (RAY_IS_ERR(taken)) { ray_release(result); return taken; } result = ray_table_add_col(result, name_id, taken); - if (RAY_IS_ERR(result)) { ray_release(taken); return result; } + ray_release(taken); + if (RAY_IS_ERR(result)) return result; } return result; } @@ -1418,8 +1484,10 @@ ray_t* ray_take_fn(ray_t* vec, ray_t* n_obj) { int64_t n = as_i64(n_obj); int64_t abs_n = n < 0 ? -n : n; int8_t vtype = vec->type; - int esz = ray_elem_size(vtype); - ray_t* result = ray_vec_new(vtype, abs_n); + int esz = ray_sym_elem_size(vtype, vec->attrs); + ray_t* result = (vtype == RAY_SYM) + ? ray_sym_vec_new(vec->attrs & RAY_SYM_W_MASK, abs_n) + : ray_vec_new(vtype, abs_n); if (RAY_IS_ERR(result)) return result; result->len = abs_n; char* src = (char*)ray_data(vec); @@ -1459,6 +1527,7 @@ ray_t* ray_take_fn(ray_t* vec, ray_t* n_obj) { * past the SSO threshold, tripping the assertion in * ray_str_t_ptr / strsort_repack_window / strkey_cmp. 
*/ if (vtype == RAY_STR) col_propagate_str_pool(result, vec); + if (vtype == RAY_SYM) propagate_sym_dict(result, vec); /* Propagate null bitmap — check parent's flag for slices */ bool has_nulls = len > 0 && ((vec->attrs & RAY_ATTR_HAS_NULLS) || diff --git a/src/ops/exec.c b/src/ops/exec.c index 0aace3b1..6ad817d7 100644 --- a/src/ops/exec.c +++ b/src/ops/exec.c @@ -24,6 +24,7 @@ #include "ops/internal.h" #include "ops/rowsel.h" #include "ops/fused_group.h" +#include "mem/heap.h" #include "mem/sys.h" /* Global profiler instance (zero-initialized = inactive) */ @@ -1344,6 +1345,7 @@ static ray_t* exec_node_inner(ray_graph_t* g, ray_op_t* op) { } ray_t* result = exec_sort(g, child_op, tbl, n); if (sort_input != g->table) ray_release(sort_input); + if (result && !RAY_IS_ERR(result)) ray_heap_gc(); return result; } @@ -1412,6 +1414,7 @@ static ray_t* exec_node_inner(ray_graph_t* g, ray_op_t* op) { ray_release(pred); if (filter_input != saved_table) ray_release(filter_input); + if (result && !RAY_IS_ERR(result)) ray_heap_gc(); return result; } else { input = exec_node(g, op->inputs[0]); diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c index 2c7cffe9..c8fc9100 100644 --- a/src/ops/fused_group.c +++ b/src/ops/fused_group.c @@ -154,6 +154,22 @@ static int fp_col_supported(const ray_t* col) { return 1; } +static int fp_expr_const_str(ray_t* expr) { + if (!expr) return 0; + if (expr->type == -RAY_STR && !(expr->attrs & RAY_ATTR_NAME)) return 1; + if (expr->type != RAY_LIST || ray_len(expr) < 2) return 0; + ray_t** elems = (ray_t**)ray_data(expr); + if (!elems[0] || elems[0]->type != -RAY_SYM) return 0; + ray_t* head = ray_sym_str(elems[0]->i64); + if (!head) return 0; + int is_concat = (ray_str_len(head) == 6 + && memcmp(ray_str_ptr(head), "concat", 6) == 0); + if (!is_concat) return 0; + for (int64_t i = 1; i < ray_len(expr); i++) + if (!fp_expr_const_str(elems[i])) return 0; + return 1; +} + /* Is `expr` a phase-3 simple comparison form (op col const)? 
Validates * that the column exists in `tbl` and that ordering ops only target * non-SYM columns. Returns the FP_* code on success, or -1 on miss. */ @@ -201,6 +217,59 @@ static int fp_check_simple_cmp(ray_t* expr, ray_t* tbl) { return code; } +static int fp_check_like(ray_t* expr, ray_t* tbl) { + if (!expr || expr->type != RAY_LIST) return 0; + if (ray_len(expr) != 3) return 0; + ray_t** elems = (ray_t**)ray_data(expr); + if (!elems[0] || elems[0]->type != -RAY_SYM) return 0; + ray_t* op_sym = ray_sym_str(elems[0]->i64); + if (!op_sym || ray_str_len(op_sym) != 4 + || memcmp(ray_str_ptr(op_sym), "like", 4) != 0) + return 0; + ray_t* lhs = elems[1]; + if (!lhs || lhs->type != -RAY_SYM || !(lhs->attrs & RAY_ATTR_NAME)) + return 0; + if (!fp_expr_const_str(elems[2])) return 0; + if (tbl) { + ray_t* col = ray_table_get_col(tbl, lhs->i64); + if (!col || !fp_col_supported(col)) return 0; + if (col->type != RAY_STR && col->type != RAY_SYM) return 0; + } + return 1; +} + +static int fp_int_family(int8_t t) { + return t == RAY_BOOL || t == RAY_U8 || t == RAY_I16 || t == RAY_I32 || + t == RAY_I64 || t == RAY_DATE || t == RAY_TIME || + t == RAY_TIMESTAMP; +} + +static int fp_check_in(ray_t* expr, ray_t* tbl) { + if (!expr || expr->type != RAY_LIST) return 0; + if (ray_len(expr) != 3) return 0; + ray_t** elems = (ray_t**)ray_data(expr); + if (!elems[0] || elems[0]->type != -RAY_SYM) return 0; + ray_t* op_sym = ray_sym_str(elems[0]->i64); + if (!op_sym || ray_str_len(op_sym) != 2 + || memcmp(ray_str_ptr(op_sym), "in", 2) != 0) + return 0; + ray_t* lhs = elems[1]; + ray_t* rhs = elems[2]; + if (!lhs || lhs->type != -RAY_SYM || !(lhs->attrs & RAY_ATTR_NAME)) + return 0; + if (!rhs || !ray_is_vec(rhs) || (rhs->attrs & RAY_ATTR_NAME)) + return 0; + if (ray_len(rhs) > 16) return 0; + if (!fp_int_family(rhs->type)) return 0; + if (tbl) { + ray_t* col = ray_table_get_col(tbl, lhs->i64); + if (!col || !fp_col_supported(col)) return 0; + if (RAY_IS_PARTED(col->type) || col->type == 
RAY_MAPCOMMON) return 0; + if (!fp_int_family(col->type)) return 0; + } + return 1; +} + /* Phase-3 supported shapes: * * pred = simple_cmp | (and simple_cmp simple_cmp …) @@ -224,13 +293,16 @@ int ray_fused_group_supported(ray_t* expr, ray_t* tbl) { int64_t k = n - 1; if (k < 1 || k > FP_PRED_MAX_CHILDREN) return 0; for (int64_t i = 0; i < k; i++) { - if (fp_check_simple_cmp(elems[i + 1], tbl) < 0) return 0; + if (fp_check_simple_cmp(elems[i + 1], tbl) < 0 + && !fp_check_like(elems[i + 1], tbl) + && !fp_check_in(elems[i + 1], tbl)) return 0; } return 1; } } /* Fall through: single simple cmp. */ - return fp_check_simple_cmp(expr, tbl) >= 0 ? 1 : 0; + return (fp_check_simple_cmp(expr, tbl) >= 0 || + fp_check_like(expr, tbl) || fp_check_in(expr, tbl)) ? 1 : 0; } /* ───────────────────────────────────────────────────────────────────────── @@ -279,6 +351,86 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end, return; } + if (op == FP_LIKE) { + if (ct == RAY_SYM) { + uint32_t lut_n = p->like_lut_count; + uint8_t* lut = p->like_lut; + const void* base = p->col_base; + uint8_t esz_l = p->col_esz; + ray_t** sym_strings = p->like_sym_strings; + int use_simple = p->pat_compiled.shape != RAY_GLOB_SHAPE_NONE; + for (int64_t r = 0; r < n; r++) { + uint64_t sid = (uint64_t)read_by_esz(base, start + r, esz_l); + if (sid >= lut_n || !lut || !sym_strings) { + bits[r] = 0; + continue; + } + uint8_t state = lut[sid]; + if (!state) { + ray_t* s = sym_strings[sid]; + uint8_t match = 0; + if (s) { + const char* sp = ray_str_ptr(s); + size_t sl = ray_str_len(s); + match = use_simple + ? (uint8_t)ray_glob_match_compiled(&p->pat_compiled, sp, sl) + : (uint8_t)ray_glob_match(sp, sl, p->pat_str, p->pat_len); + } + state = (uint8_t)(match ? 
2 : 1); + lut[sid] = state; + } + bits[r] = (uint8_t)(state == 2); + } + return; + } + if (ct != RAY_STR) { + memset(bits, 0, (size_t)n); + return; + } + int use_simple = p->pat_compiled.shape != RAY_GLOB_SHAPE_NONE; + for (int64_t r = 0; r < n; r++) { + size_t sl = 0; + const char* sp = ray_str_vec_get(p->col_obj, start + r, &sl); + if (!sp) sp = ""; + bits[r] = use_simple + ? (uint8_t)ray_glob_match_compiled(&p->pat_compiled, sp, sl) + : (uint8_t)ray_glob_match(sp, sl, p->pat_str, p->pat_len); + } + return; + } + + if (op == FP_IN) { + if (p->n_cvals == 0) { + memset(bits, 0, (size_t)n); + return; + } +#define FP_RUN_IN(T) do { \ + const T* d = (const T*)p->col_base + start; \ + for (int64_t r = 0; r < n; r++) { \ + int64_t v = (int64_t)d[r]; \ + uint8_t hit = 0; \ + for (uint8_t j = 0; j < p->n_cvals; j++) \ + hit |= (uint8_t)(v == p->cvals[j]); \ + bits[r] = hit; \ + } \ + } while (0) + if (esz == 1) { + FP_RUN_IN(uint8_t); + return; + } + if (esz == 2) { + FP_RUN_IN(int16_t); + return; + } + if (esz == 4) { + FP_RUN_IN(int32_t); + return; + } + FP_RUN_IN(int64_t); + return; +#undef FP_RUN_IN + } + if (esz == 1) { switch (op) { case FP_EQ: FP_RUN(uint8_t, ==); break; @@ -287,6 +439,7 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end, case FP_LE: FP_RUN(uint8_t, <=); break; case FP_GT: FP_RUN(uint8_t, > ); break; case FP_GE: FP_RUN(uint8_t, >=); break; + case FP_LIKE: case FP_IN: memset(bits, 0, (size_t)n); break; } return; } @@ -295,7 +448,8 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end, switch (op) { case FP_EQ: FP_RUN(uint16_t, ==); break; case FP_NE: FP_RUN(uint16_t, !=); break; - default: memset(bits, 0, (size_t)n); break; /* unreachable */ + case FP_LT: case FP_LE: case FP_GT: case FP_GE: + case FP_LIKE: case FP_IN: memset(bits, 0, (size_t)n); break; /* unreachable */ } } else { switch (op) { @@ -305,6 +459,7 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end, case FP_LE: FP_RUN(int16_t, <=); break; case 
FP_GT: FP_RUN(int16_t, > ); break; case FP_GE: FP_RUN(int16_t, >=); break; + case FP_LIKE: case FP_IN: memset(bits, 0, (size_t)n); break; } } return; @@ -314,7 +469,8 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end, switch (op) { case FP_EQ: FP_RUN(uint32_t, ==); break; case FP_NE: FP_RUN(uint32_t, !=); break; - default: memset(bits, 0, (size_t)n); break; /* unreachable */ + case FP_LT: case FP_LE: case FP_GT: case FP_GE: + case FP_LIKE: case FP_IN: memset(bits, 0, (size_t)n); break; /* unreachable */ } } else { switch (op) { @@ -324,6 +480,7 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end, case FP_LE: FP_RUN(int32_t, <=); break; case FP_GT: FP_RUN(int32_t, > ); break; case FP_GE: FP_RUN(int32_t, >=); break; + case FP_LIKE: case FP_IN: memset(bits, 0, (size_t)n); break; } } return; @@ -336,6 +493,7 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end, case FP_LE: FP_RUN(int64_t, <=); break; case FP_GT: FP_RUN(int64_t, > ); break; case FP_GE: FP_RUN(int64_t, >=); break; + case FP_LIKE: case FP_IN: memset(bits, 0, (size_t)n); break; } } #undef FP_RUN @@ -375,6 +533,8 @@ static int fp_compile_cmp(ray_graph_t* g, ray_op_t* pred_op, ray_t* tbl, case OP_LE: out->op = FP_LE; break; case OP_GT: out->op = FP_GT; break; case OP_GE: out->op = FP_GE; break; + case OP_LIKE: out->op = FP_LIKE; break; + case OP_IN: out->op = FP_IN; break; default: return -1; } @@ -390,6 +550,68 @@ static int fp_compile_cmp(ray_graph_t* g, ray_op_t* pred_op, ray_t* tbl, ray_t* col = ray_table_get_col(tbl, lext->sym); if (!col) return -1; if (RAY_IS_PARTED(col->type) || col->type == RAY_MAPCOMMON) return -1; + if (out->op == FP_IN) { /* IN: int-family column against a literal int-family vector; the non-null literals are widened to int64 into out->cvals. */ + if (!fp_col_supported(col) || !fp_int_family(col->type)) return -1; + ray_t* sv = rext->literal; + if (!sv || !ray_is_vec(sv) || !fp_int_family(sv->type)) return -1; + if (ray_len(sv) > (int64_t)(sizeof(out->cvals) / sizeof(out->cvals[0]))) return -1; /* bound by the real cvals capacity so the fill loop below cannot overrun it if the array size ever changes */ + out->col_type = col->type; + out->col_attrs = col->attrs; + out->col_esz = ray_sym_elem_size(col->type, col->attrs); 
+ out->col_base = ray_data(col); + out->col_obj = col; + out->col_len = col->len; + int8_t st = sv->type; + int64_t nsv = ray_len(sv); + int64_t out_n = 0; + for (int64_t i = 0; i < nsv; i++) { + if ((sv->attrs & RAY_ATTR_HAS_NULLS) && ray_vec_is_null(sv, i)) + continue; + switch (st) { + case RAY_BOOL: + case RAY_U8: out->cvals[out_n++] = ((uint8_t*)ray_data(sv))[i]; break; + case RAY_I16: out->cvals[out_n++] = ((int16_t*)ray_data(sv))[i]; break; + case RAY_I32: + case RAY_DATE: + case RAY_TIME: out->cvals[out_n++] = ((int32_t*)ray_data(sv))[i]; break; + case RAY_I64: + case RAY_TIMESTAMP: out->cvals[out_n++] = ((int64_t*)ray_data(sv))[i]; break; + default: return -1; + } + } + out->n_cvals = (uint8_t)out_n; + out->cval_in_dict = 1; + return 0; + } + if (out->op == FP_LIKE) { + if (col->type != RAY_STR && col->type != RAY_SYM) return -1; + if (!fp_col_supported(col)) return -1; + ray_t* cv_like = rext->literal; + if (!cv_like || cv_like->type != -RAY_STR) return -1; + out->col_type = col->type; + out->col_attrs = col->attrs; + out->col_esz = ray_sym_elem_size(col->type, col->attrs); + out->col_base = ray_data(col); + out->col_obj = col; + out->col_len = col->len; + out->pat_str = ray_str_ptr(cv_like); + out->pat_len = ray_str_len(cv_like); + out->pat_compiled = ray_glob_compile(out->pat_str, out->pat_len); + if (col->type == RAY_SYM) { + ray_t** sym_strings = NULL; + uint32_t sym_count = 0; + ray_sym_strings_borrow(&sym_strings, &sym_count); + if (!sym_strings || sym_count == 0) return -1; + uint8_t* lut = (uint8_t*)scratch_calloc(&out->aux_hdr, sym_count); + if (!lut) return -1; + out->like_lut = lut; + out->like_lut_count = sym_count; + out->like_sym_strings = sym_strings; + } + out->cval_in_dict = 1; + return 0; + } + /* Ordering ops on SYM are meaningless (dict ID order != string order). 
*/ if (col->type == RAY_SYM && (out->op == FP_LT || out->op == FP_LE || out->op == FP_GT || out->op == FP_GE)) @@ -409,6 +631,7 @@ static int fp_compile_cmp(ray_graph_t* g, ray_op_t* pred_op, ray_t* tbl, out->col_attrs = col->attrs; out->col_esz = ray_sym_elem_size(col->type, col->attrs); out->col_base = ray_data(col); + out->col_obj = col; out->col_len = col->len; if (out->col_type == RAY_SYM) { @@ -477,6 +700,7 @@ static int fp_compile_cmp(ray_graph_t* g, ray_op_t* pred_op, ray_t* tbl, case FP_LE: out->fold = below ? FP_FOLD_FALSE : FP_FOLD_TRUE; break; case FP_GT: out->fold = below ? FP_FOLD_TRUE : FP_FOLD_FALSE; break; case FP_GE: out->fold = below ? FP_FOLD_TRUE : FP_FOLD_FALSE; break; + case FP_LIKE: case FP_IN: break; } } return 0; @@ -497,12 +721,20 @@ static int fp_compile_pred_dag(ray_graph_t* g, ray_op_t* node, ray_t* tbl, return 0; } if (out->n_children >= FP_PRED_MAX_CHILDREN) return -1; - return fp_compile_cmp(g, node, tbl, &out->children[out->n_children++]); + fp_cmp_t tmp; + memset(&tmp, 0, sizeof(tmp)); + if (fp_compile_cmp(g, node, tbl, &tmp) != 0) { + if (tmp.aux_hdr) scratch_free(tmp.aux_hdr); + return -1; + } + out->children[out->n_children++] = tmp; + return 0; } int fp_compile_pred(ray_graph_t* g, ray_op_t* pred_op, ray_t* tbl, fp_pred_t* out) { + memset(out, 0, sizeof(*out)); out->n_children = 0; /* No predicate → const-true. fp_eval_pred memsets bits to 1 * when n_children == 0, so the worker treats every row as a hit. 
*/ @@ -510,6 +742,19 @@ int fp_compile_pred(ray_graph_t* g, ray_op_t* pred_op, ray_t* tbl, return fp_compile_pred_dag(g, pred_op, tbl, out); } +void fp_pred_cleanup(fp_pred_t* p) { /* Releases the scratch block (aux_hdr) of every child that has one and clears the LIKE lookup fields that pointed into it. NULL p is a no-op; handles are nulled after release, so calling it again on the same predicate does nothing. */ + if (!p) return; + for (uint8_t i = 0; i < p->n_children; i++) { + if (p->children[i].aux_hdr) { + scratch_free(p->children[i].aux_hdr); + p->children[i].aux_hdr = NULL; + p->children[i].like_lut = NULL; + p->children[i].like_lut_count = 0; + p->children[i].like_sym_strings = NULL; + } + } +} + /* ───────────────────────────────────────────────────────────────────────── * Phase-2 parallel fused exec. * @@ -560,6 +805,12 @@ typedef struct { _Atomic(uint32_t) oom; /* set by any worker on OOM; main bails */ } fp_par_ctx_t; +static int64_t fp_count_emit_keep_min(int64_t total_groups, + ray_group_emit_filter_t filter, + const int64_t* used_key_slots, + const int64_t* counts, + uint64_t n_slots); + static int fp_shard_init(fp_shard_t* sh, uint64_t cap) { sh->slots = (int64_t*)scratch_calloc(&sh->slots_hdr, (size_t)cap * 2 * sizeof(int64_t)); @@ -675,6 +926,152 @@ static void fp_par_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end) } } +typedef struct { /* Shared, read-mostly context for fp_direct_count_fn; only the per-worker rows of `counts` are written. */ + const fp_pred_t* pred; + const void* kbase; + int8_t kt; + uint8_t kesz; + uint32_t n_slots; + int32_t bias; + int64_t* counts; /* [n_workers * n_slots] */ +} fp_direct_count_ctx_t; + +static void fp_direct_count_fn(void* raw, uint32_t worker_id, + int64_t start, int64_t end) { /* Worker body: walks rows [start, end) one morsel (at most RAY_MORSEL_ELEMS rows) at a time, evaluates the predicate into a stack byte mask, and bumps this worker's own counter row for the key of every passing row. I16 keys are shifted by c->bias to index from 0; every other key type is read as one unsigned byte. */ + fp_direct_count_ctx_t* c = (fp_direct_count_ctx_t*)raw; + int64_t* counts = c->counts + (size_t)worker_id * c->n_slots; + int64_t row = start; + while (row < end) { + int64_t mend = row + RAY_MORSEL_ELEMS; + if (mend > end) mend = end; + int64_t mlen = mend - row; + uint8_t bits[RAY_MORSEL_ELEMS]; + fp_eval_pred(c->pred, row, mend, bits); + if (c->kt == RAY_I16) { + const int16_t* k = (const int16_t*)c->kbase + row; + for (int64_t r = 0; r < mlen; r++) + if (bits[r]) counts[(uint32_t)((int32_t)k[r] + c->bias)]++; + } else { + const uint8_t* k = (const 
uint8_t*)c->kbase + row; + for (int64_t r = 0; r < mlen; r++) + if (bits[r]) counts[(uint32_t)k[r]]++; + } + row = mend; + } +} + +static ray_t* fp_try_direct_count1(const fp_par_ctx_t* ctx, int64_t nrows, + int64_t key_sym, uint32_t nw) { /* Direct-indexed filtered count for BOOL / U8 / I16 keys: each of the nw workers counts into its own n_slots-wide row (I16 biased by 32768), and the rows are summed per slot after dispatch. Returns NULL when the key type is not handled so the caller can use the hashed path, a two-column (key, "count") table on success, or ray_error("oom") when an allocation fails. */ + uint32_t n_slots; + int32_t bias = 0; + if (ctx->kt == RAY_BOOL) { + n_slots = 2; + } else if (ctx->kt == RAY_U8) { + n_slots = 256; + } else if (ctx->kt == RAY_I16) { + n_slots = 65536; + bias = 32768; + } else { + return NULL; + } + + ray_t* counts_hdr = NULL; + int64_t* counts = (int64_t*)scratch_calloc(&counts_hdr, + (size_t)nw * (size_t)n_slots * sizeof(int64_t)); + if (!counts) return ray_error("oom", NULL); + + fp_direct_count_ctx_t dctx = { + .pred = &ctx->pred, + .kbase = ctx->kbase, + .kt = ctx->kt, + .kesz = ctx->kesz, + .n_slots = n_slots, + .bias = bias, + .counts = counts, + }; + + ray_pool_t* pool = ray_pool_get(); + if (pool) ray_pool_dispatch(pool, fp_direct_count_fn, &dctx, nrows); + else fp_direct_count_fn(&dctx, 0, 0, nrows); + + int64_t out_n = 0; + ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get(); + bool use_emit_filter = emit_filter.enabled && emit_filter.agg_index == 0; + int64_t keep_min = use_emit_filter ? emit_filter.min_count_exclusive + 1 : 1; /* without an applicable filter every non-empty slot is kept; a filter aimed at another aggregate must not drop rows after out_n was sized for them */ + ray_t* totals_hdr = NULL; + int64_t* totals = NULL; + if (use_emit_filter && emit_filter.top_count_take > 0) { + totals = (int64_t*)scratch_calloc(&totals_hdr, + (size_t)n_slots * sizeof(int64_t)); + if (!totals) { + scratch_free(counts_hdr); + return ray_error("oom", NULL); + } + } + for (uint32_t s = 0; s < n_slots; s++) { + int64_t total = 0; + for (uint32_t w = 0; w < nw; w++) + total += counts[(size_t)w * n_slots + s]; + if (totals) totals[s] = total; + if (total) out_n++; + } + if (use_emit_filter) { + if (totals) { + keep_min = fp_count_emit_keep_min(out_n, emit_filter, NULL, + totals, n_slots); + } + if (keep_min < 1) keep_min = 1; /* a slot that matched no row is not a group: never emit zero-count keys, whatever threshold the filter carries */ out_n = 0; + for (uint32_t s = 0; s < n_slots; s++) { + int64_t total = totals ? 
totals[s] : 0; + if (!totals) { + for (uint32_t w = 0; w < nw; w++) + total += counts[(size_t)w * n_slots + s]; + } + if (total >= keep_min) out_n++; + } + } + + ray_t* k_out = ray_vec_new(ctx->kt, out_n); + ray_t* c_out = ray_vec_new(RAY_I64, out_n); + if (!k_out || !c_out || RAY_IS_ERR(k_out) || RAY_IS_ERR(c_out)) { + if (k_out && !RAY_IS_ERR(k_out)) ray_release(k_out); + if (c_out && !RAY_IS_ERR(c_out)) ray_release(c_out); + if (totals_hdr) scratch_free(totals_hdr); + scratch_free(counts_hdr); + return ray_error("oom", NULL); + } + k_out->len = out_n; + c_out->len = out_n; + void* k_dst = ray_data(k_out); + int64_t* c_dst = (int64_t*)ray_data(c_out); + int64_t oi = 0; + for (uint32_t s = 0; s < n_slots; s++) { + int64_t total = 0; + for (uint32_t w = 0; w < nw; w++) + total += counts[(size_t)w * n_slots + s]; + if (totals) total = totals[s]; + if (total < keep_min) continue; + int64_t key = (ctx->kt == RAY_I16) ? ((int64_t)s - bias) : (int64_t)s; + write_col_i64(k_dst, oi, key, ctx->kt, ctx->katt); + c_dst[oi++] = total; + } + if (totals_hdr) scratch_free(totals_hdr); + scratch_free(counts_hdr); + + ray_t* result = ray_table_new(2); + if (!result || RAY_IS_ERR(result)) { + ray_release(k_out); + ray_release(c_out); + return ray_error("oom", NULL); + } + int64_t cnt_sym = ray_sym_intern("count", 5); + result = ray_table_add_col(result, key_sym, k_out); + result = ray_table_add_col(result, cnt_sym, c_out); + ray_release(k_out); + ray_release(c_out); + return result; +} + /* Parallel combine: 3-pass radix scatter. * * Pass A (per shard, parallel): histogram slot counts per partition. 
@@ -707,6 +1104,72 @@ typedef struct { _Atomic(uint32_t) oom; } fp_combine_par_ctx_t; +static void fp_count_heap_down(int64_t* heap, int64_t n, int64_t i) { /* Sift-down in a min-heap of n counts: swaps heap[i] with its smaller child until neither child is smaller. */ + for (;;) { + int64_t l = i * 2 + 1; + int64_t r = l + 1; + int64_t m = i; + if (l < n && heap[l] < heap[m]) m = l; + if (r < n && heap[r] < heap[m]) m = r; + if (m == i) break; + int64_t tmp = heap[i]; + heap[i] = heap[m]; + heap[m] = tmp; + i = m; + } +} + +static void fp_count_heap_up(int64_t* heap, int64_t i) { /* Sift-up in the same min-heap: swaps heap[i] with its parent while the parent is larger. */ + while (i > 0) { + int64_t p = (i - 1) / 2; + if (heap[p] <= heap[i]) break; + int64_t tmp = heap[p]; + heap[p] = heap[i]; + heap[i] = tmp; + i = p; + } +} + +static void fp_count_heap_consider(int64_t* heap, int64_t* hn, + int64_t cap, int64_t count) { /* Offers one count to a bounded min-heap that retains the cap largest positive counts seen: appends while fewer than cap are stored, afterwards replaces the root (the current minimum) only when count is strictly larger. Non-positive counts and cap <= 0 are ignored. */ + if (cap <= 0 || count <= 0) return; + if (*hn < cap) { + heap[(*hn)++] = count; + fp_count_heap_up(heap, *hn - 1); + } else if (count > heap[0]) { + heap[0] = count; + fp_count_heap_down(heap, *hn, 0); + } +} + +static int64_t fp_count_emit_keep_min(int64_t total_groups, + ray_group_emit_filter_t filter, + const int64_t* used_key_slots, + const int64_t* counts, + uint64_t n_slots) { /* Returns the smallest count a group needs in order to be emitted. The base is min_count_exclusive + 1; when the filter is enabled, top_count_take is positive and there are more groups than that, the threshold is raised to the top_count_take-th largest count among groups already meeting the base. used_key_slots, when non-NULL, is read at stride 2 and a zero entry marks slot s as unused. If the heap cannot be allocated the base threshold is returned unchanged. */ + int64_t keep_min = filter.min_count_exclusive + 1; + int64_t k_take = filter.top_count_take; + if (!filter.enabled || k_take <= 0 || total_groups <= k_take) + return keep_min; + + ray_t* heap_hdr = NULL; + int64_t* heap = (int64_t*)scratch_alloc(&heap_hdr, + (size_t)k_take * sizeof(int64_t)); + if (!heap) return keep_min; + + int64_t hn = 0; + for (uint64_t s = 0; s < n_slots; s++) { + if (used_key_slots && !used_key_slots[s * 2]) continue; + int64_t cnt = counts[s]; + if (cnt >= keep_min) + fp_count_heap_consider(heap, &hn, k_take, cnt); + } + if (hn == k_take && heap[0] > keep_min) /* heap root is the k_take-th largest count only once the heap is full */ + keep_min = heap[0]; + scratch_free(heap_hdr); + return keep_min; +} + static void fp_combine_hist_fn(void* vctx, uint32_t worker_id, int64_t start, int64_t end) { (void)worker_id; (void)end; @@ -869,11 +1332,15 @@ static ray_t* fp_combine_and_materialize(fp_shard_t* shards, uint32_t 
nw, return result; } + ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get(); + bool use_emit_filter = emit_filter.enabled && emit_filter.agg_index == 0 && + (emit_filter.min_count_exclusive > 0 || emit_filter.top_count_take > 0); + /* Parallel combine for high-cardinality results: 3-pass radix scatter. * Crossover at 50 K entries — below that, the serial walk has lower * overhead than the dispatch + scratch alloc cost. */ ray_pool_t* cpool = ray_pool_get(); - if (cpool && total_local >= FP_COMBINE_PAR_MIN && + if (!use_emit_filter && cpool && total_local >= FP_COMBINE_PAR_MIN && ray_pool_total_workers(cpool) >= 2 && nw <= 256) { uint32_t cnw = ray_pool_total_workers(cpool); @@ -1073,24 +1540,37 @@ static ray_t* fp_combine_and_materialize(fp_shard_t* shards, uint32_t nw, } } + int64_t keep_min = use_emit_filter + ? fp_count_emit_keep_min(global_n, emit_filter, gs, gc, gcap) + : 1; + int64_t out_n = global_n; + if (use_emit_filter) { + out_n = 0; + for (uint64_t s = 0; s < gcap; s++) { + if (!gs[s * 2]) continue; + if (gc[s] >= keep_min) out_n++; + } + } + /* Materialize. */ ray_t* k_out = (kt == RAY_SYM) - ? ray_sym_vec_new(katt & RAY_SYM_W_MASK, global_n) - : ray_vec_new(kt, global_n); - ray_t* c_out = ray_vec_new(RAY_I64, global_n); + ? 
ray_sym_vec_new(katt & RAY_SYM_W_MASK, out_n) + : ray_vec_new(kt, out_n); + ray_t* c_out = ray_vec_new(RAY_I64, out_n); if (!k_out || !c_out || RAY_IS_ERR(k_out) || RAY_IS_ERR(c_out)) { if (k_out && !RAY_IS_ERR(k_out)) ray_release(k_out); if (c_out && !RAY_IS_ERR(c_out)) ray_release(c_out); scratch_free(gs_hdr); scratch_free(gc_hdr); return ray_error("oom", NULL); } - k_out->len = global_n; - c_out->len = global_n; + k_out->len = out_n; + c_out->len = out_n; void* k_dst = ray_data(k_out); int64_t* c_dst = (int64_t*)ray_data(c_out); int64_t gi = 0; for (uint64_t s = 0; s < gcap; s++) { if (!gs[s * 2]) continue; + if (gc[s] < keep_min) continue; int64_t kv = gs[s * 2 + 1]; write_col_i64(k_dst, gi, kv, kt, katt); c_dst[gi] = gc[s]; @@ -1152,6 +1632,9 @@ static ray_t* exec_filtered_group_count1(ray_graph_t* g, ray_op_ext_t* ext, ray_pool_t* pool = ray_pool_get(); uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; + ray_t* direct = fp_try_direct_count1(&ctx, nrows, kext->sym, nw); + if (direct) return direct; + ray_t* shards_hdr = NULL; ctx.shards = (fp_shard_t*)scratch_calloc(&shards_hdr, (size_t)nw * sizeof(fp_shard_t)); @@ -1700,6 +2183,73 @@ static ray_t* mk_materialize_agg(const mk_agg_t* a, const int64_t* gstate, return col; } +static void mk_apply_count_emit_filter(const mk_par_ctx_t* c, + int64_t* gs, int64_t* gst, + int64_t gcap, int64_t* global_n) +{ + ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get(); + if (!emit_filter.enabled || emit_filter.agg_index >= c->n_aggs) + return; + + const mk_agg_t* count_agg = &c->aggs[emit_filter.agg_index]; + if (count_agg->kind != MK_AGG_COUNT) + return; + + int64_t keep_min = emit_filter.min_count_exclusive + 1; + int64_t k_take = emit_filter.top_count_take; + if (k_take > 0 && k_take < *global_n) { + ray_t* heap_hdr = NULL; + int64_t* heap = (int64_t*)scratch_alloc(&heap_hdr, + (size_t)k_take * sizeof(int64_t)); + if (heap) { + int64_t heap_n = 0; + for (int64_t s = 0; s < gcap; s++) { + if (!gs[s * 2]) 
continue; + int64_t cnt = gst[(size_t)s * c->total_state + count_agg->state_off]; + if (heap_n < k_take) { + int64_t j = heap_n++; + heap[j] = cnt; + while (j > 0) { + int64_t p = (j - 1) >> 1; + if (heap[p] <= heap[j]) break; + int64_t tmp = heap[p]; heap[p] = heap[j]; heap[j] = tmp; + j = p; + } + } else if (cnt > heap[0]) { + heap[0] = cnt; + int64_t j = 0; + for (;;) { + int64_t l = j * 2 + 1, r = l + 1, m = j; + if (l < heap_n && heap[l] < heap[m]) m = l; + if (r < heap_n && heap[r] < heap[m]) m = r; + if (m == j) break; + int64_t tmp = heap[m]; heap[m] = heap[j]; heap[j] = tmp; + j = m; + } + } + } + if (heap_n == k_take && heap[0] > keep_min) + keep_min = heap[0]; + scratch_free(heap_hdr); + } + } + + if (keep_min <= 1) + return; + + int64_t kept = 0; + for (int64_t s = 0; s < gcap; s++) { + if (!gs[s * 2]) continue; + int64_t cnt = gst[(size_t)s * c->total_state + count_agg->state_off]; + if (cnt < keep_min) { + gs[s * 2] = 0; + } else { + kept++; + } + } + *global_n = kept; +} + /* Parallel combine for the multi-agg/multi-key path. Same 3-pass radix * scatter as count1: histogram per (shard, partition), scatter packed * (kv + state[]) to a flat buffer using per-(shard, partition) cursors, @@ -2284,6 +2834,8 @@ static ray_t* mk_combine_and_materialize(mk_par_ctx_t* c, uint32_t nw, * with -Werror,-Wc23-extensions. Empty statement is the * portable form. */ + mk_apply_count_emit_filter(c, gs, gst, gcap, &global_n); + /* Build n_keys key columns by decomposing the composite. 
*/ ray_t* key_cols[FP_MAX_KEYS]; for (uint8_t k = 0; k < n_keys; k++) key_cols[k] = NULL; diff --git a/src/ops/fused_pred.h b/src/ops/fused_pred.h index f9c6f263..8658d7bc 100644 --- a/src/ops/fused_pred.h +++ b/src/ops/fused_pred.h @@ -26,6 +26,7 @@ #include "rayforce.h" #include "ops/internal.h" +#include "ops/glob.h" #define FP_PRED_MAX_CHILDREN 8 @@ -36,6 +37,8 @@ typedef enum { FP_LE = 3, FP_GT = 4, FP_GE = 5, + FP_LIKE = 6, + FP_IN = 7, } fp_op_t; /* fold values: when the predicate constant is provably outside the @@ -56,9 +59,19 @@ typedef struct { uint8_t col_esz; uint8_t fold; /* fp_fold_t — set when cval is out-of-range */ const void* col_base; + ray_t* col_obj; int64_t col_len; int64_t cval; int cval_in_dict; + int64_t cvals[16]; + uint8_t n_cvals; + const char* pat_str; + size_t pat_len; + ray_glob_compiled_t pat_compiled; + ray_t* aux_hdr; + uint8_t* like_lut; + uint32_t like_lut_count; + ray_t** like_sym_strings; } fp_cmp_t; typedef struct { @@ -79,4 +92,6 @@ void fp_eval_pred(const fp_pred_t* p, int64_t start, int64_t end, uint8_t* bits) int fp_compile_pred(ray_graph_t* g, ray_op_t* pred_op, ray_t* tbl, fp_pred_t* out); +void fp_pred_cleanup(fp_pred_t* p); + #endif /* RAY_OPS_FUSED_PRED_H */ diff --git a/src/ops/fused_topk.c b/src/ops/fused_topk.c index 975a6034..b4dfc52d 100644 --- a/src/ops/fused_topk.c +++ b/src/ops/fused_topk.c @@ -45,6 +45,7 @@ #include "ops/fused_pred.h" #include "ops/fused_group.h" /* ray_fused_group_supported */ #include "ops/internal.h" +#include "lang/internal.h" #include "core/pool.h" #include @@ -269,33 +270,13 @@ ray_t* ray_fused_topk_select(ray_t* tbl, int64_t nrows = ray_table_nrows(tbl); if (nrows <= 0 || k >= nrows) return NULL; - /* Output column type gate. The materialise loop reads via - * read_by_esz (which assumes a fixed-width scalar payload) and - * writes via write_col_i64 (which only handles BOOL/U8/I16/I32/ - * I64/DATE/TIME/TIMESTAMP/SYM). 
Variable-width types like - * RAY_STR or compound types like LIST/MAP/GUID would corrupt - * the output silently — gate the fused path off so the unfused - * FILTER + SORT + TAKE handles them. */ for (uint8_t c = 0; c < n_out; c++) { ray_t* col = ray_table_get_col(tbl, out_col_syms[c]); if (!col) return NULL; int8_t ot = col->type; if (RAY_IS_PARTED(ot) || ot == RAY_MAPCOMMON) return NULL; - if (ot != RAY_SYM && ot != RAY_BOOL && ot != RAY_U8 - && ot != RAY_I16 && ot != RAY_I32 && ot != RAY_I64 - && ot != RAY_DATE && ot != RAY_TIME && ot != RAY_TIMESTAMP) + if (!ray_is_vec(col)) return NULL; - /* SYM columns with a per-vector sym_dict store narrow-width - * indices into a LOCAL dictionary, not the global one. The - * fused materialiser builds a fresh ray_sym_vec_new and copies - * raw IDs without propagating sym_dict (cf. sort.c:3642-3660 / - * rerank.c:174-188 which DO propagate it). Falling back keeps - * the unfused gather, which propagates correctly. */ - if (ot == RAY_SYM) { - const ray_t* dict_owner = (col->attrs & RAY_ATTR_SLICE) - ? col->slice_parent : col; - if (dict_owner && dict_owner->sym_dict) return NULL; - } } /* Resolve sort-key columns + decide if any need the SYM dict snapshot. 
*/ @@ -341,6 +322,7 @@ ray_t* ray_fused_topk_select(ray_t* tbl, ray_op_t* pred_dag = compile_expr_dag(g, where_expr); if (!pred_dag) { ray_graph_free(g); return NULL; } if (fp_compile_pred(g, pred_dag, tbl, &ctx.pred) != 0) { + fp_pred_cleanup(&ctx.pred); ray_graph_free(g); return NULL; } @@ -363,6 +345,7 @@ ray_t* ray_fused_topk_select(ray_t* tbl, if (!ctx.heap_idx || !ctx.heap_n) { if (idx_hdr) scratch_free(idx_hdr); if (hn_hdr) scratch_free(hn_hdr); + fp_pred_cleanup(&ctx.pred); ray_graph_free(g); return NULL; } @@ -372,6 +355,7 @@ ray_t* ray_fused_topk_select(ray_t* tbl, if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) { scratch_free(idx_hdr); scratch_free(hn_hdr); + fp_pred_cleanup(&ctx.pred); ray_graph_free(g); return NULL; } @@ -405,6 +389,7 @@ ray_t* ray_fused_topk_select(ray_t* tbl, /* Materialize n_out output columns by gathering rows[global_idx]. */ ray_t* result = ray_table_new(n_out); if (!result || RAY_IS_ERR(result)) { + fp_pred_cleanup(&ctx.pred); ray_graph_free(g); return result ? result : ray_error("oom", NULL); } @@ -414,31 +399,13 @@ ray_t* ray_fused_topk_select(ray_t* tbl, int64_t alias = out_alias_syms ? out_alias_syms[c] : cs; ray_t* src = ray_table_get_col(tbl, cs); if (!src) { build_ok = 0; break; } - ray_t* col = (src->type == RAY_SYM) - ? ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, global_n) - : ray_vec_new(src->type, global_n); + ray_t* col = gather_by_idx(src, global_idx, global_n); if (!col || RAY_IS_ERR(col)) { build_ok = 0; break; } - col->len = global_n; - void* dst = ray_data(col); - uint8_t esz = ray_sym_elem_size(src->type, src->attrs); - for (int32_t i = 0; i < global_n; i++) { - int64_t v = read_by_esz(ray_data(src), global_idx[i], esz); - write_col_i64(dst, i, v, src->type, src->attrs); - } - /* Propagate the source nullmap so a nullable select column - * survives the top-K gather. 
ray_vec_set_null lazily allocates - * dst's nullmap on the first set, so we only pay the alloc cost - * when there are actual nulls to copy. */ - if (src->attrs & RAY_ATTR_HAS_NULLS) { - for (int32_t i = 0; i < global_n; i++) { - if (ray_vec_is_null(src, global_idx[i])) - ray_vec_set_null(col, i, true); - } - } result = ray_table_add_col(result, alias, col); ray_release(col); } ray_graph_free(g); + fp_pred_cleanup(&ctx.pred); if (!build_ok) { ray_release(result); return ray_error("schema", NULL); diff --git a/src/ops/group.c b/src/ops/group.c index 063c961d..6d18c008 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -1790,12 +1790,91 @@ static inline uint32_t group_probe_entry(group_ht_t* ht, } } +static bool group_ht_insert_empty_group(group_ht_t* ht, const int64_t* keys, + const int8_t* key_types) { + const ght_layout_t* ly = &ht->layout; + if (ht->grp_count >= ht->grp_cap) { + if (!group_ht_grow(ht)) { ht->oom = 1; return false; } + } + uint64_t h = hash_keys_inline(keys, key_types, ly->n_keys, + ly->wide_key_mask, ly->wide_key_esz, + ht->key_data); + uint32_t mask = ht->ht_cap - 1; + uint32_t slot = (uint32_t)(h & mask); + uint8_t salt = HT_SALT(h); + while (ht->slots[slot] != HT_EMPTY) + slot = (slot + 1) & mask; + + uint32_t gid = ht->grp_count++; + char* row = ht->rows + (size_t)gid * ly->row_stride; + memset(row, 0, ly->row_stride); + memcpy(row + 8, keys, (size_t)(ly->n_keys + 1) * 8); + ht->slots[slot] = HT_PACK(salt, gid); + if (ht->grp_count * 2 > ht->ht_cap) { + group_ht_rehash(ht, key_types); + if (ht->oom) return false; + } + return true; +} + +static inline void group_probe_existing_entry(group_ht_t* ht, + const char* entry, const int8_t* key_types) { + const ght_layout_t* ly = &ht->layout; + uint64_t hash = *(const uint64_t*)entry; + const char* ekeys = entry + 8; + uint8_t salt = HT_SALT(hash); + uint32_t mask = ht->ht_cap - 1; + uint32_t slot = (uint32_t)(hash & mask); + + for (;;) { + uint32_t sv = ht->slots[slot]; + if (sv == HT_EMPTY) 
return; + if (HT_SALT_V(sv) == salt) { + uint32_t gid = HT_GID(sv); + char* row = ht->rows + (size_t)gid * ly->row_stride; + if (group_keys_equal((const int64_t*)(row + 8), + (const int64_t*)ekeys, ly, ht->key_data)) { + (*(int64_t*)row)++; /* first 8 bytes of a group row hold its row count */ + accum_from_entry(row, entry, ly); + return; + } + } + slot = (slot + 1) & mask; /* linear probing */ + } +} + /* Process rows [start, end) from original columns into a local hash table. * Converts each row to a fat entry on the stack, then probes. */ #define GROUP_PREFETCH_BATCH 16 +static inline int64_t group_strlen_at(const ray_t* col, int64_t row); + +static inline bool group_rowsel_pass(ray_t* sel, int64_t row) { /* True when `row` is selected by `sel`. A NULL sel selects every row; rows outside [0, nrows) are rejected. The per-morsel flag answers directly for RAY_SEL_ALL / RAY_SEL_NONE; otherwise the morsel-local index is binary-searched in idx[offsets[seg] .. offsets[seg + 1]). NOTE(review): the search presumes each segment's idx slice is sorted ascending, and the uint16_t cast presumes RAY_MORSEL_ELEMS <= 65536 - confirm against the rowsel builder. */ + if (!sel) return true; + ray_rowsel_t* m = ray_rowsel_meta(sel); + if (row < 0 || row >= m->nrows) return false; + uint32_t seg = (uint32_t)(row / RAY_MORSEL_ELEMS); + uint8_t f = ray_rowsel_flags(sel)[seg]; + if (f == RAY_SEL_ALL) return true; + if (f == RAY_SEL_NONE) return false; + uint16_t local = (uint16_t)(row - (int64_t)seg * RAY_MORSEL_ELEMS); + uint32_t lo = ray_rowsel_offsets(sel)[seg]; + uint32_t hi = ray_rowsel_offsets(sel)[seg + 1]; + const uint16_t* idx = ray_rowsel_idx(sel); + while (lo < hi) { + uint32_t mid = lo + ((hi - lo) >> 1); + uint16_t v = idx[mid]; + if (v == local) return true; + if (v < local) lo = mid + 1; + else hi = mid; + } + return false; +} + void group_rows_range(group_ht_t* ht, void** key_data, int8_t* key_types, uint8_t* key_attrs, ray_t** key_vecs, ray_t** agg_vecs, + uint8_t* agg_strlen, + ray_t* rowsel, int64_t start, int64_t end, const int64_t* match_idx) { const ght_layout_t* ly = &ht->layout; @@ -1830,6 +1909,7 @@ void group_rows_range(group_ht_t* ht, void** key_data, int8_t* key_types, * sub-100ms response time on Ctrl-C. */ if (((i - start) & 65535) == 0 && ray_interrupted()) break; int64_t row = match_idx ? 
match_idx[i] : i; + if (!match_idx && rowsel && !group_rowsel_pass(rowsel, row)) continue; uint64_t h = 0; int64_t* ek = (int64_t*)(ebuf + 8); int64_t null_mask = 0; @@ -1869,7 +1949,9 @@ void group_rows_range(group_ht_t* ht, void** key_data, int8_t* key_types, for (uint8_t a = 0; a < na; a++) { ray_t* ac = agg_vecs[a]; if (!ac) continue; - if (ac->type == RAY_F64) + if (agg_strlen && agg_strlen[a]) + ev[vi] = group_strlen_at(ac, row); + else if (ac->type == RAY_F64) memcpy(&ev[vi], &((double*)ray_data(ac))[row], 8); else ev[vi] = read_col_i64(ray_data(ac), row, ac->type, ac->attrs); @@ -1885,6 +1967,87 @@ void group_rows_range(group_ht_t* ht, void** key_data, int8_t* key_types, } } +static void group_rows_range_existing(group_ht_t* ht, void** key_data, + int8_t* key_types, uint8_t* key_attrs, + ray_t** key_vecs, ray_t** agg_vecs, + uint8_t* agg_strlen, ray_t* rowsel, + int64_t start, int64_t end, + const int64_t* match_idx) { + const ght_layout_t* ly = &ht->layout; + uint8_t nk = ly->n_keys; + uint8_t na = ly->n_aggs; + uint8_t wide = ly->wide_key_mask; + bool has_fl = (ly->agg_is_first | ly->agg_is_last) != 0; + char ebuf[8 + 9 * 8 + 8 * 8 + 8]; + + uint8_t nullable_mask = 0; + for (uint8_t k = 0; k < nk; k++) { + if (!key_vecs || !key_vecs[k]) continue; + ray_t* kv = key_vecs[k]; + ray_t* src = (kv->attrs & RAY_ATTR_SLICE) ? kv->slice_parent : kv; + if (src && (src->attrs & RAY_ATTR_HAS_NULLS)) + nullable_mask |= (uint8_t)(1u << k); + } + + if (wide) group_ht_set_key_data(ht, key_data); + + for (int64_t i = start; i < end; i++) { + if (((i - start) & 65535) == 0 && ray_interrupted()) break; + int64_t row = match_idx ? 
match_idx[i] : i; + if (!match_idx && rowsel && !group_rowsel_pass(rowsel, row)) continue; + uint64_t h = 0; + int64_t* ek = (int64_t*)(ebuf + 8); + int64_t null_mask = 0; + for (uint8_t k = 0; k < nk; k++) { + int8_t t = key_types[k]; + uint64_t kh; + bool is_null = (nullable_mask & (1u << k)) + && ray_vec_is_null(key_vecs[k], row); + if (is_null) { + null_mask |= (int64_t)(1u << k); + ek[k] = 0; + kh = ray_hash_i64(0); + } else if (wide & (1u << k)) { + uint8_t esz = ly->wide_key_esz[k]; + const void* src = (const char*)key_data[k] + (size_t)row * esz; + ek[k] = row; + kh = ray_hash_bytes(src, esz); + } else if (t == RAY_F64) { + int64_t kv; + memcpy(&kv, &((double*)key_data[k])[row], 8); + ek[k] = kv; + kh = ray_hash_f64(((double*)key_data[k])[row]); + } else { + int64_t kv = read_col_i64(key_data[k], row, t, key_attrs[k]); + ek[k] = kv; + kh = ray_hash_i64(kv); + } + h = (k == 0) ? kh : ray_hash_combine(h, kh); + } + ek[nk] = null_mask; + if (null_mask) h = ray_hash_combine(h, ray_hash_i64(null_mask)); + *(uint64_t*)ebuf = h; + + int64_t* ev = (int64_t*)(ebuf + 8 + ((size_t)nk + 1) * 8); + uint8_t vi = 0; + for (uint8_t a = 0; a < na; a++) { + ray_t* ac = agg_vecs[a]; + if (!ac) continue; + if (agg_strlen && agg_strlen[a]) + ev[vi] = group_strlen_at(ac, row); + else if (ac->type == RAY_F64) + memcpy(&ev[vi], &((double*)ray_data(ac))[row], 8); + else + ev[vi] = read_col_i64(ray_data(ac), row, ac->type, ac->attrs); + vi++; + } + if (has_fl) + memcpy(ebuf + ly->entry_stride - 8, &row, 8); + + group_probe_existing_entry(ht, ebuf, key_types); + } +} + /* ============================================================================ * Radix-partitioned parallel group-by * @@ -1945,9 +2108,11 @@ typedef struct { ray_t** key_vecs; uint8_t nullable_mask; /* bit k = key k column may contain nulls */ ray_t** agg_vecs; + uint8_t* agg_strlen; uint32_t n_workers; radix_buf_t* bufs; /* [n_workers * RADIX_P] */ ght_layout_t layout; + ray_t* rowsel; /* When non-NULL, workers 
iterate match_idx[start..end) and * read row=match_idx[i]. When NULL, row=i. */ const int64_t* match_idx; @@ -1975,6 +2140,7 @@ static void radix_phase1_fn(void* ctx, uint32_t worker_id, int64_t start, int64_ * sub-100ms response time on Ctrl-C. */ if (((i - start) & 65535) == 0 && ray_interrupted()) break; int64_t row = match_idx ? match_idx[i] : i; + if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, row)) continue; uint64_t h = 0; int64_t null_mask = 0; for (uint8_t k = 0; k < nk; k++) { @@ -2009,7 +2175,9 @@ static void radix_phase1_fn(void* ctx, uint32_t worker_id, int64_t start, int64_ for (uint8_t a = 0; a < na; a++) { ray_t* ac = c->agg_vecs[a]; if (!ac) continue; - if (ac->type == RAY_F64) + if (c->agg_strlen && c->agg_strlen[a]) + agg_vals[vi] = group_strlen_at(ac, row); + else if (ac->type == RAY_F64) memcpy(&agg_vals[vi], &((double*)ray_data(ac))[row], 8); else agg_vals[vi] = read_col_i64(ray_data(ac), row, ac->type, ac->attrs); @@ -2268,6 +2436,7 @@ typedef struct { int64_t* per_worker_max; /* [n_workers] */ uint32_t n_workers; const int64_t* match_idx; /* NULL = no selection */ + ray_t* rowsel; } minmax_ctx_t; static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) { @@ -2282,6 +2451,7 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t const TYPE* kd = (const TYPE*)c->key_data; \ for (int64_t i = start; i < end; i++) { \ int64_t r = match_idx ? 
match_idx[i] : i; \ + if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, r)) continue; \ int64_t v = (int64_t)CAST kd[r]; \ if (v < kmin) kmin = v; \ if (v > kmax) kmax = v; \ @@ -2513,6 +2683,8 @@ typedef struct { uint8_t n_keys; void** agg_ptrs; int8_t* agg_types; + ray_t** agg_cols; + uint8_t* agg_strlen; uint16_t* agg_ops; /* per-agg operation code */ uint8_t n_aggs; uint8_t need_flags; /* DA_NEED_* bitmask */ @@ -2520,8 +2692,231 @@ typedef struct { bool all_sum; /* true when all ops are SUM/AVG/COUNT (no MIN/MAX/FIRST/LAST) */ uint32_t n_slots; const int64_t* match_idx; /* NULL = no selection */ + ray_t* rowsel; } da_ctx_t; +typedef struct { + uint8_t* used; + int64_t* keys; + int64_t* counts; + da_val_t* sums; + uint32_t cap; + uint32_t size; + ray_t* _h_used; + ray_t* _h_keys; + ray_t* _h_counts; + ray_t* _h_sums; +} sparse_i64_ht_t; + +static inline uint64_t sparse_i64_mix(uint64_t x) { + x ^= x >> 30; + x *= UINT64_C(0xbf58476d1ce4e5b9); + x ^= x >> 27; + x *= UINT64_C(0x94d049bb133111eb); + x ^= x >> 31; + return x; +} + +static inline uint32_t sparse_i64_pow2(uint32_t x) { + if (x <= 1) return 1; + x--; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return x + 1; +} + +static void sparse_i64_free(sparse_i64_ht_t* ht) { + if (!ht) return; + scratch_free(ht->_h_used); + scratch_free(ht->_h_keys); + scratch_free(ht->_h_counts); + scratch_free(ht->_h_sums); + memset(ht, 0, sizeof(*ht)); +} + +static _Thread_local ray_group_emit_filter_t tl_group_emit_filter; + +ray_group_emit_filter_t ray_group_emit_filter_get(void) { + return tl_group_emit_filter; +} + +void ray_group_emit_filter_set(ray_group_emit_filter_t filter) { + tl_group_emit_filter = filter; +} + +static int64_t da_count_emit_keep_min(const int64_t* counts, uint32_t n_slots, + uint32_t group_count, + ray_group_emit_filter_t filter) +{ + int64_t keep_min = filter.min_count_exclusive + 1; + int64_t k_take = filter.top_count_take; + if (k_take <= 0 || k_take >= 
(int64_t)group_count) + return keep_min; + + ray_t* heap_hdr = NULL; + int64_t* heap = (int64_t*)scratch_alloc(&heap_hdr, + (size_t)k_take * sizeof(int64_t)); + if (!heap) + return keep_min; + + int64_t heap_n = 0; + for (uint32_t s = 0; s < n_slots; s++) { + int64_t cnt = counts[s]; + if (cnt <= 0) + continue; + if (heap_n < k_take) { + int64_t j = heap_n++; + heap[j] = cnt; + while (j > 0) { + int64_t p = (j - 1) >> 1; + if (heap[p] <= heap[j]) break; + int64_t tmp = heap[p]; heap[p] = heap[j]; heap[j] = tmp; + j = p; + } + } else if (cnt > heap[0]) { + heap[0] = cnt; + int64_t j = 0; + for (;;) { + int64_t l = j * 2 + 1, r = l + 1, m = j; + if (l < heap_n && heap[l] < heap[m]) m = l; + if (r < heap_n && heap[r] < heap[m]) m = r; + if (m == j) break; + int64_t tmp = heap[m]; heap[m] = heap[j]; heap[j] = tmp; + j = m; + } + } + } + + if (heap_n == k_take && heap[0] > keep_min) + keep_min = heap[0]; + scratch_free(heap_hdr); + return keep_min; +} + +static int64_t da_count_emit_keep_min_u32(const uint32_t* counts, + uint64_t n_slots, + uint32_t group_count, + ray_group_emit_filter_t filter) +{ + int64_t keep_min = filter.min_count_exclusive + 1; + int64_t k_take = filter.top_count_take; + if (k_take <= 0 || k_take >= (int64_t)group_count) + return keep_min; + + ray_t* heap_hdr = NULL; + int64_t* heap = (int64_t*)scratch_alloc(&heap_hdr, + (size_t)k_take * sizeof(int64_t)); + if (!heap) + return keep_min; + + int64_t heap_n = 0; + for (uint64_t s = 0; s < n_slots; s++) { + int64_t cnt = (int64_t)counts[s]; + if (cnt <= 0) + continue; + if (heap_n < k_take) { + int64_t j = heap_n++; + heap[j] = cnt; + while (j > 0) { + int64_t p = (j - 1) >> 1; + if (heap[p] <= heap[j]) break; + int64_t tmp = heap[p]; heap[p] = heap[j]; heap[j] = tmp; + j = p; + } + } else if (cnt > heap[0]) { + heap[0] = cnt; + int64_t j = 0; + for (;;) { + int64_t l = j * 2 + 1, r = l + 1, m = j; + if (l < heap_n && heap[l] < heap[m]) m = l; + if (r < heap_n && heap[r] < heap[m]) m = r; + if (m == 
j) break; + int64_t tmp = heap[m]; heap[m] = heap[j]; heap[j] = tmp; + j = m; + } + } + } + + if (heap_n == k_take && heap[0] > keep_min) + keep_min = heap[0]; + scratch_free(heap_hdr); + return keep_min; +} + +static bool sparse_i64_init(sparse_i64_ht_t* ht, uint32_t cap, uint8_t n_aggs, + bool need_sum) { + memset(ht, 0, sizeof(*ht)); + if (cap < 1024) cap = 1024; + cap = sparse_i64_pow2(cap); + ht->used = (uint8_t*)scratch_calloc(&ht->_h_used, cap); + ht->keys = (int64_t*)scratch_alloc(&ht->_h_keys, (size_t)cap * sizeof(int64_t)); + ht->counts = (int64_t*)scratch_calloc(&ht->_h_counts, + (size_t)cap * sizeof(int64_t)); + if (need_sum) { + ht->sums = (da_val_t*)scratch_calloc(&ht->_h_sums, + (size_t)cap * n_aggs * sizeof(da_val_t)); + } + if (!ht->used || !ht->keys || !ht->counts || (need_sum && !ht->sums)) { + sparse_i64_free(ht); + return false; + } + ht->cap = cap; + return true; +} + +static int32_t sparse_i64_find_slot(const sparse_i64_ht_t* ht, int64_t key) { + uint32_t mask = ht->cap - 1; + uint32_t pos = (uint32_t)sparse_i64_mix((uint64_t)key) & mask; + while (ht->used[pos] && ht->keys[pos] != key) + pos = (pos + 1) & mask; + return (int32_t)pos; +} + +static bool sparse_i64_rehash(sparse_i64_ht_t* ht, uint8_t n_aggs, + bool need_sum) { + sparse_i64_ht_t old = *ht; + sparse_i64_ht_t nw; + if (!sparse_i64_init(&nw, old.cap * 2u, n_aggs, need_sum)) + return false; + for (uint32_t i = 0; i < old.cap; i++) { + if (!old.used[i]) continue; + int32_t s = sparse_i64_find_slot(&nw, old.keys[i]); + nw.used[s] = 1; + nw.keys[s] = old.keys[i]; + nw.counts[s] = old.counts[i]; + if (need_sum) + memcpy(&nw.sums[(size_t)s * n_aggs], &old.sums[(size_t)i * n_aggs], + (size_t)n_aggs * sizeof(da_val_t)); + nw.size++; + } + sparse_i64_free(&old); + *ht = nw; + return true; +} + +static bool sparse_i64_touch(sparse_i64_ht_t* ht, int64_t key, uint8_t n_aggs, + bool need_sum, int32_t* out_slot) { + if ((uint64_t)(ht->size + 1) * 10u >= (uint64_t)ht->cap * 7u) { + if 
(!sparse_i64_rehash(ht, n_aggs, need_sum)) + return false; + } + int32_t s = sparse_i64_find_slot(ht, key); + if (!ht->used[s]) { + ht->used[s] = 1; + ht->keys[s] = key; + ht->counts[s] = 0; + if (need_sum) + memset(&ht->sums[(size_t)s * n_aggs], 0, + (size_t)n_aggs * sizeof(da_val_t)); + ht->size++; + } + *out_slot = s; + return true; +} + /* Composite GID from multi-key. Arithmetic overflow is prevented in practice * by the DA budget check (DA_PER_WORKER_MAX) which limits total_slots to 262K. */ static inline int32_t da_composite_gid(da_ctx_t* c, int64_t r) { @@ -2560,6 +2955,57 @@ static inline void da_read_val(const void* ptr, int8_t type, uint8_t attrs, } } +static inline int64_t group_strlen_at(const ray_t* col, int64_t row) { + if (!col || ray_vec_is_null((ray_t*)col, row)) return 0; + if (col->type == RAY_STR) { + const ray_str_t* elems; + const char* pool; + (void)pool; + str_resolve(col, &elems, &pool); + return (int64_t)elems[row].len; + } + const char* sp; + size_t sl; + (void)sp; + sym_elem(col, row, &sp, &sl); + return (int64_t)sl; +} + +static inline int64_t group_strlen_at_cached(const ray_t* col, int64_t row, + ray_t** sym_strings, + uint32_t sym_count) { + if (!col || ray_vec_is_null((ray_t*)col, row)) return 0; + if (col->type == RAY_STR) { + const ray_str_t* elems; + const char* pool; + (void)pool; + str_resolve(col, &elems, &pool); + return (int64_t)elems[row].len; + } + if (col->type == RAY_SYM && sym_strings) { + int64_t sym_id = ray_read_sym(ray_data((ray_t*)col), row, + col->type, col->attrs); + if (sym_id < 0 || (uint64_t)sym_id >= sym_count) return 0; + ray_t* atom = sym_strings[sym_id]; + return atom ? 
(int64_t)ray_str_len(atom) : 0; + } + return group_strlen_at(col, row); +} + +static bool try_strlen_sumavg_input(ray_graph_t* g, ray_t* tbl, + ray_op_t* input_op, ray_t** out_vec) { + if (!g || !tbl || !input_op || !out_vec) return false; + if (input_op->opcode != OP_STRLEN || input_op->arity != 1 || !input_op->inputs[0]) + return false; + ray_op_t* child = input_op->inputs[0]; + ray_op_ext_t* child_ext = find_ext(g, child->id); + if (!child_ext || child_ext->base.opcode != OP_SCAN) return false; + ray_t* col = ray_table_get_col(tbl, child_ext->sym); + if (!col || (col->type != RAY_STR && col->type != RAY_SYM)) return false; + *out_vec = col; + return true; +} + /* Materialize a scalar (atom or len-1 vector) into a full-length vector so * group-aggregation loops can read row-wise without out-of-bounds access. */ static ray_t* materialize_broadcast_input(ray_t* src, int64_t nrows) { @@ -2627,11 +3073,14 @@ static ray_t* materialize_broadcast_input(ray_t* src, int64_t nrows) { typedef struct { void** agg_ptrs; int8_t* agg_types; + ray_t** agg_cols; + uint8_t* agg_strlen; uint16_t* agg_ops; agg_linear_t* agg_linear; uint8_t n_aggs; uint8_t need_flags; const int64_t* match_idx; /* NULL = no selection */ + ray_t* rowsel; /* per-worker accumulators (1 slot each) */ da_accum_t* accums; uint32_t n_accums; @@ -2706,7 +3155,12 @@ static inline void scalar_accum_row(scalar_ctx_t* c, da_accum_t* acc, int64_t r) fv = (double)iv; } else { if (!c->agg_ptrs[a]) continue; - da_read_val(c->agg_ptrs[a], c->agg_types[a], 0, r, &fv, &iv); + if (c->agg_strlen && c->agg_strlen[a]) { + iv = group_strlen_at(c->agg_cols[a], r); + fv = (double)iv; + } else { + da_read_val(c->agg_ptrs[a], c->agg_types[a], 0, r, &fv, &iv); + } } uint16_t op = c->agg_ops[a]; bool is_f = (c->agg_types[a] == RAY_F64); @@ -2745,6 +3199,7 @@ static void scalar_accum_fn(void* ctx, uint32_t worker_id, int64_t start, int64_ for (int64_t i = start; i < end; i++) { int64_t r = match_idx ? 
match_idx[i] : i; + if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, r)) continue; scalar_accum_row(c, acc, r); } } @@ -2766,7 +3221,9 @@ static inline void da_accum_row(da_ctx_t* c, da_accum_t* acc, int32_t gid, int64 for (uint8_t a = 0; a < n_aggs; a++) { if (!c->agg_ptrs[a]) continue; size_t idx = base + a; - if (f64m & (1u << a)) + if (c->agg_strlen && c->agg_strlen[a]) + acc->sum[idx].i += group_strlen_at(c->agg_cols[a], r); + else if (f64m & (1u << a)) acc->sum[idx].f += ((const double*)c->agg_ptrs[a])[r]; else acc->sum[idx].i += read_col_i64(c->agg_ptrs[a], r, @@ -2787,7 +3244,12 @@ static inline void da_accum_row(da_ctx_t* c, da_accum_t* acc, int32_t gid, int64 if (!c->agg_ptrs[a]) continue; size_t idx = base + a; double fv; int64_t iv; - da_read_val(c->agg_ptrs[a], c->agg_types[a], 0, r, &fv, &iv); + if (c->agg_strlen && c->agg_strlen[a]) { + iv = group_strlen_at(c->agg_cols[a], r); + fv = (double)iv; + } else { + da_read_val(c->agg_ptrs[a], c->agg_types[a], 0, r, &fv, &iv); + } uint16_t op = c->agg_ops[a]; if (op == OP_SUM || op == OP_AVG || op == OP_STDDEV || op == OP_STDDEV_POP || op == OP_VAR || op == OP_VAR_POP) { if (c->agg_types[a] == RAY_F64) acc->sum[idx].f += fv; @@ -2849,6 +3311,7 @@ static void da_accum_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t en bool da_pf = c->n_slots >= 4096; \ for (int64_t i = start; i < end; i++) { \ int64_t r = match_idx ? match_idx[i] : i; \ + if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, r)) continue; \ if (da_pf && RAY_LIKELY(i + DA_PF_DIST < end)) { \ int64_t pf_r = match_idx ? match_idx[i + DA_PF_DIST] : (i + DA_PF_DIST); \ int64_t pfk = (int64_t)KCAST kp[pf_r]; \ @@ -2879,6 +3342,7 @@ static void da_accum_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t en bool _da_pf = c->n_slots >= 4096; \ for (int64_t i = start; i < end; i++) { \ int64_t r = match_idx ? 
match_idx[i] : i; \ + if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, r)) continue; \ if (_da_pf && RAY_LIKELY(i + DA_PF_DIST < end)) { \ int64_t pf_r = match_idx ? match_idx[i + DA_PF_DIST] : (i + DA_PF_DIST); \ int32_t pf_gid = GID_FN(pf_r); \ @@ -3051,7 +3515,7 @@ static ray_t* exec_group_parted(ray_graph_t* g, ray_op_t* op, ray_t* parted_tbl, * - All keys and agg inputs must be simple SCANs * - Supported agg ops: SUM, COUNT, MIN, MAX, AVG, FIRST, LAST, * STDDEV, STDDEV_POP, VAR, VAR_POP */ - int can_partition = 1; + int can_partition = g->selection ? 0 : 1; int has_avg = 0; int has_stddev = 0; int64_t key_syms[8]; @@ -3284,12 +3748,6 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, for (int64_t c = 0; c < nc; c++) { ray_t* col = ray_table_get_col_idx(tbl, c); if (col && (RAY_IS_PARTED(col->type) || col->type == RAY_MAPCOMMON)) { - /* exec_group_parted has no rowsel plumbing — a - * selection in flight would be silently ignored. - * Reject rather than produce unfiltered results. */ - if (g->selection) - return ray_error("nyi", - "GROUP BY with selection on parted table"); return exec_group_parted(g, op, tbl, group_limit); } } @@ -3428,23 +3886,15 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, if (n_keys > 8 || n_aggs > 8) return ray_error("nyi", NULL); - /* Extract selection (rowsel) for pushdown. Workers iterate over - * [0, n_scan) and read row=match_idx[i]. When no selection is - * present, match_idx is NULL and n_scan equals nrows. The - * match_idx_block must be released on every exec_group exit - * path — see the various `goto cleanup` and early returns below. - * - * The top-of-function guard already rejected nrows mismatches, - * so if we reach here with a selection it's guaranteed valid - * for `tbl`. */ + /* Extract selection (rowsel) for pushdown. Prefer streaming the + * morsel-local rowsel directly; flattening to int64 indices is kept + * only as a fallback for callers that still pass match_idx. 
*/ ray_t* match_idx_block = NULL; const int64_t* match_idx = NULL; + ray_t* rowsel = NULL; int64_t n_scan = nrows; if (g->selection) { - match_idx_block = ray_rowsel_to_indices(g->selection); - if (!match_idx_block) return ray_error("oom", NULL); - match_idx = (const int64_t*)ray_data(match_idx_block); - n_scan = ray_rowsel_meta(g->selection)->total_pass; + rowsel = g->selection; } /* Resolve key columns (VLA — n_keys ≤ 8; use ≥1 to avoid zero-size VLA UB) */ @@ -3481,10 +3931,12 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, uint8_t vla_aggs = n_aggs > 0 ? n_aggs : 1; ray_t* agg_vecs[vla_aggs]; uint8_t agg_owned[vla_aggs]; /* 1 = we allocated via exec_node, must free */ + uint8_t agg_strlen[vla_aggs]; agg_affine_t agg_affine[vla_aggs]; agg_linear_t agg_linear[vla_aggs]; memset(agg_vecs, 0, vla_aggs * sizeof(ray_t*)); memset(agg_owned, 0, vla_aggs * sizeof(uint8_t)); + memset(agg_strlen, 0, vla_aggs * sizeof(uint8_t)); memset(agg_affine, 0, vla_aggs * sizeof(agg_affine_t)); memset(agg_linear, 0, vla_aggs * sizeof(agg_linear_t)); @@ -3499,6 +3951,12 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, continue; } + if ((agg_kind == OP_SUM || agg_kind == OP_AVG) && + try_strlen_sumavg_input(g, tbl, agg_input_op, &agg_vecs[a])) { + agg_strlen[a] = 1; + continue; + } + /* SUM/AVG(integer-linear expr): scalar path can aggregate directly * without materializing the expression vector. 
*/ if (n_keys == 0 && nrows > 0 && @@ -3540,6 +3998,7 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, for (uint8_t a = 0; a < n_aggs; a++) { if (!agg_vecs[a] || RAY_IS_ERR(agg_vecs[a])) continue; if (ext->agg_ops[a] == OP_COUNT) continue; /* value is ignored for COUNT */ + if (agg_strlen[a]) continue; bool needs_broadcast = ray_is_atom(agg_vecs[a]) || (agg_vecs[a]->type > 0 && agg_vecs[a]->len == 1 && nrows > 1); @@ -3576,6 +4035,10 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, key_attrs[k] = 0; } } + ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get(); + bool use_emit_filter = emit_filter.enabled && + emit_filter.agg_index < n_aggs && + ext->agg_ops[emit_filter.agg_index] == OP_COUNT; /* ---- Scalar aggregate fast path (n_keys == 0): flat vector scan ---- */ if (n_keys == 0 && nrows > 0) { @@ -3665,11 +4128,14 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, scalar_ctx_t sc_ctx = { .agg_ptrs = agg_ptrs, .agg_types = agg_types, + .agg_cols = agg_vecs, + .agg_strlen = agg_strlen, .agg_ops = ext->agg_ops, .agg_linear = agg_linear, .n_aggs = n_aggs, .need_flags = need_flags, .match_idx = match_idx, + .rowsel = rowsel, .accums = sc_acc, .n_accums = sc_n, }; @@ -3680,7 +4146,7 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, * safe when no selection is in flight. 
*/ typedef void (*scalar_fn_t)(void*, uint32_t, int64_t, int64_t); scalar_fn_t sc_fn = scalar_accum_fn; - if (n_aggs == 1 && !match_idx && agg_ptrs[0] != NULL) { + if (n_aggs == 1 && !match_idx && !rowsel && agg_ptrs[0] != NULL) { uint16_t op0 = ext->agg_ops[0]; int8_t t0 = agg_types[0]; if ((op0 == OP_SUM || op0 == OP_AVG) && @@ -3688,7 +4154,7 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, sc_fn = scalar_sum_i64_fn; else if ((op0 == OP_SUM || op0 == OP_AVG) && t0 == RAY_F64) sc_fn = scalar_sum_f64_fn; - } else if (n_aggs == 1 && !match_idx && agg_linear[0].enabled) { + } else if (n_aggs == 1 && !match_idx && !rowsel && agg_linear[0].enabled) { uint16_t op0 = ext->agg_ops[0]; if (op0 == OP_SUM || op0 == OP_AVG) sc_fn = scalar_sum_linear_i64_fn; @@ -3837,6 +4303,7 @@ da_path:; .per_worker_max = mm_maxs, .n_workers = mm_n, .match_idx = match_idx, + .rowsel = rowsel, }; if (mm_n > 1) { ray_pool_dispatch(mm_pool, minmax_scan_fn, &mm_ctx, n_scan); @@ -4032,6 +4499,8 @@ da_path:; .n_keys = n_keys, .agg_ptrs = agg_ptrs, .agg_types = agg_types, + .agg_cols = agg_vecs, + .agg_strlen = agg_strlen, .agg_ops = ext->agg_ops, .n_aggs = n_aggs, .need_flags = need_flags, @@ -4039,6 +4508,7 @@ da_path:; .all_sum = all_sum, .n_slots = n_slots, .match_idx = match_idx, + .rowsel = rowsel, }; if (da_n_workers > 1) @@ -4214,9 +4684,17 @@ da_path:; double* da_sumsq = merged->sumsq_f64; /* may be NULL if !DA_NEED_SUMSQ */ int64_t* da_count = merged->count; + uint32_t all_grp_count = 0; + for (uint32_t s = 0; s < n_slots; s++) + if (da_count[s] > 0) all_grp_count++; + + int64_t da_keep_min = use_emit_filter + ? 
da_count_emit_keep_min(da_count, n_slots, all_grp_count, emit_filter) + : 1; + uint32_t grp_count = 0; for (uint32_t s = 0; s < n_slots; s++) - if (da_count[s] > 0) grp_count++; + if (da_count[s] >= da_keep_min) grp_count++; int64_t total_cols = n_keys + n_aggs; ray_t* result = ray_table_new(total_cols); @@ -4239,7 +4717,7 @@ da_path:; key_col->len = (int64_t)grp_count; uint32_t gi = 0; for (uint32_t s = 0; s < n_slots; s++) { - if (da_count[s] == 0) continue; + if (da_count[s] < da_keep_min) continue; int64_t offset = ((int64_t)s / da_key_stride[k]) % da_key_range[k]; int64_t key_val = da_key_min[k] + offset; write_col_i64(ray_data(key_col), gi, key_val, src_col->type, key_col->attrs); @@ -4259,11 +4737,13 @@ da_path:; da_val_t* dense_min_val = da_min_val ? (da_val_t*)scratch_alloc(&_h_dmin, dense_total * sizeof(da_val_t)) : NULL; da_val_t* dense_max_val = da_max_val ? (da_val_t*)scratch_alloc(&_h_dmax, dense_total * sizeof(da_val_t)) : NULL; double* dense_sumsq = da_sumsq ? (double*)scratch_alloc(&_h_dsq, dense_total * sizeof(double)) : NULL; - int64_t* dense_counts = (int64_t*)scratch_alloc(&_h_dcnt, grp_count * sizeof(int64_t)); + int64_t* dense_counts = grp_count + ? (int64_t*)scratch_alloc(&_h_dcnt, grp_count * sizeof(int64_t)) + : NULL; uint32_t gi = 0; for (uint32_t s = 0; s < n_slots; s++) { - if (da_count[s] == 0) continue; + if (da_count[s] < da_keep_min) continue; dense_counts[gi] = da_count[s]; for (uint8_t a = 0; a < n_aggs; a++) { size_t si = (size_t)s * n_aggs + a; @@ -4296,6 +4776,724 @@ da_path:; } } + { + bool sp_eligible = (nrows > 0 && n_keys == 1 && key_data[0] != NULL); + int8_t kt = sp_eligible ? key_types[0] : 0; + if (sp_eligible && kt != RAY_I64 && kt != RAY_I32 && kt != RAY_I16 && + kt != RAY_U8 && kt != RAY_BOOL && kt != RAY_DATE && + kt != RAY_TIME && kt != RAY_TIMESTAMP && kt != RAY_SYM) + sp_eligible = false; + if (sp_eligible && key_vecs[0]) { + ray_t* src = (key_vecs[0]->attrs & RAY_ATTR_SLICE) + ? 
key_vecs[0]->slice_parent : key_vecs[0]; + if (src && (src->attrs & RAY_ATTR_HAS_NULLS)) + sp_eligible = false; + } + bool sp_need_sum = false; + for (uint8_t a = 0; a < n_aggs && sp_eligible; a++) { + uint16_t op = ext->agg_ops[a]; + if (op == OP_COUNT) continue; + if (op != OP_SUM && op != OP_AVG) + sp_eligible = false; + else + sp_need_sum = true; + } + + if (sp_eligible) { + void* agg_ptrs[vla_aggs]; + int8_t agg_types[vla_aggs]; + uint32_t agg_f64_mask = 0; + for (uint8_t a = 0; a < n_aggs; a++) { + if (agg_vecs[a]) { + agg_ptrs[a] = ray_data(agg_vecs[a]); + agg_types[a] = agg_vecs[a]->type; + if (agg_vecs[a]->type == RAY_F64) + agg_f64_mask |= (1u << a); + } else { + agg_ptrs[a] = NULL; + agg_types[a] = 0; + } + } + ray_t** strlen_sym_strings = NULL; + uint32_t strlen_sym_count = 0; + for (uint8_t a = 0; a < n_aggs; a++) { + if (agg_strlen[a] && agg_vecs[a] && + agg_vecs[a]->type == RAY_SYM) { + ray_sym_strings_borrow(&strlen_sym_strings, + &strlen_sym_count); + break; + } + } + + uint8_t key_esz = ray_sym_elem_size(key_types[0], key_attrs[0]); + + if (use_emit_filter && + (emit_filter.min_count_exclusive > 0 || + emit_filter.top_count_take > 0) && + n_scan <= UINT32_MAX) { + uint64_t cap = 1u << 20; + const uint64_t max_dense_cap = 1u << 24; + bool count_only_first = (key_types[0] == RAY_SYM); + ray_t *cnt_hdr = NULL, *range_sum_hdr = NULL; + uint32_t* range_count = (uint32_t*)scratch_calloc( + &cnt_hdr, (size_t)cap * sizeof(uint32_t)); + da_val_t* range_sum = NULL; + bool dyn_ok = range_count != NULL; + if (dyn_ok && sp_need_sum && !count_only_first) { + range_sum = (da_val_t*)scratch_calloc( + &range_sum_hdr, + (size_t)cap * n_aggs * sizeof(da_val_t)); + dyn_ok = range_sum != NULL; + } + + uint64_t max_seen = 0; + bool have_dyn_key = false; +#define DYN_DENSE_ACCUM_ROW(row_expr) \ + do { \ + int64_t dyn_row = (row_expr); \ + int64_t key = read_by_esz(key_data[0], dyn_row, key_esz); \ + if (key < 0 || (uint64_t)key >= max_dense_cap) { \ + dyn_ok = false; \ 
+ goto dyn_dense_done; \ + } \ + uint64_t off = (uint64_t)key; \ + if (off >= cap) { \ + uint64_t old_cap = cap; \ + while (off >= cap) cap <<= 1; \ + uint32_t* new_count = (uint32_t*)scratch_realloc( \ + &cnt_hdr, (size_t)old_cap * sizeof(uint32_t), \ + (size_t)cap * sizeof(uint32_t)); \ + if (!new_count) { \ + dyn_ok = false; \ + goto dyn_dense_done; \ + } \ + range_count = new_count; \ + memset(range_count + old_cap, 0, \ + (size_t)(cap - old_cap) * sizeof(uint32_t)); \ + if (sp_need_sum && !count_only_first) { \ + da_val_t* new_sum = (da_val_t*)scratch_realloc( \ + &range_sum_hdr, \ + (size_t)old_cap * n_aggs * sizeof(da_val_t), \ + (size_t)cap * n_aggs * sizeof(da_val_t)); \ + if (!new_sum) { \ + dyn_ok = false; \ + goto dyn_dense_done; \ + } \ + range_sum = new_sum; \ + memset(range_sum + (size_t)old_cap * n_aggs, 0, \ + (size_t)(cap - old_cap) * n_aggs * sizeof(da_val_t)); \ + } \ + } \ + have_dyn_key = true; \ + if (off > max_seen) max_seen = off; \ + if (range_count[off] != UINT32_MAX) range_count[off]++; \ + if (range_sum) { \ + da_val_t* sums = &range_sum[(size_t)off * n_aggs]; \ + for (uint8_t a = 0; a < n_aggs; a++) { \ + if (ext->agg_ops[a] == OP_COUNT || !agg_ptrs[a]) continue; \ + if (agg_strlen[a]) \ + sums[a].i += group_strlen_at_cached( \ + agg_vecs[a], dyn_row, strlen_sym_strings, strlen_sym_count); \ + else if (agg_f64_mask & (1u << a)) \ + sums[a].f += ((const double*)agg_ptrs[a])[dyn_row]; \ + else \ + sums[a].i += read_col_i64(agg_ptrs[a], dyn_row, agg_types[a], 0); \ + } \ + } \ + } while (0) + + if (dyn_ok && match_idx) { + for (int64_t i = 0; i < n_scan; i++) + DYN_DENSE_ACCUM_ROW(match_idx[i]); + } else if (dyn_ok && rowsel) { + ray_rowsel_t* m = ray_rowsel_meta(rowsel); + const uint8_t* flags = ray_rowsel_flags(rowsel); + const uint32_t* offs = ray_rowsel_offsets(rowsel); + const uint16_t* idx = ray_rowsel_idx(rowsel); + uint32_t nseg = (uint32_t)((m->nrows + RAY_MORSEL_ELEMS - 1) / + RAY_MORSEL_ELEMS); + for (uint32_t seg = 0; seg < 
nseg; seg++) { + int64_t base = (int64_t)seg * RAY_MORSEL_ELEMS; + if (flags[seg] == RAY_SEL_NONE) continue; + if (flags[seg] == RAY_SEL_ALL) { + int64_t end = base + RAY_MORSEL_ELEMS; + if (end > m->nrows) end = m->nrows; + for (int64_t r = base; r < end; r++) + DYN_DENSE_ACCUM_ROW(r); + } else { + for (uint32_t p = offs[seg]; p < offs[seg + 1]; p++) + DYN_DENSE_ACCUM_ROW(base + idx[p]); + } + } + } else if (dyn_ok) { + for (int64_t r = 0; r < n_scan; r++) + DYN_DENSE_ACCUM_ROW(r); + } +dyn_dense_done: +#undef DYN_DENSE_ACCUM_ROW + + if (dyn_ok && have_dyn_key) { + uint32_t total_groups = 0; + for (uint64_t off = 0; off <= max_seen; off++) + if (range_count[off] > 0) + total_groups++; + int64_t keep_min = da_count_emit_keep_min_u32( + range_count, max_seen + 1, total_groups, emit_filter); + uint32_t grp_count = 0; + for (uint64_t off = 0; off <= max_seen; off++) + if ((int64_t)range_count[off] >= keep_min) + grp_count++; + + ray_t* result = ray_table_new((int64_t)n_keys + n_aggs); + if (!result || RAY_IS_ERR(result)) { + scratch_free(range_sum_hdr); scratch_free(cnt_hdr); + for (uint8_t a = 0; a < n_aggs; a++) + if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]); + for (uint8_t k = 0; k < n_keys; k++) + if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); + if (match_idx_block) ray_release(match_idx_block); + return result ? result : ray_error("oom", NULL); + } + + ray_t* key_col = col_vec_new(key_vecs[0], (int64_t)grp_count); + if (!key_col || RAY_IS_ERR(key_col)) { + scratch_free(range_sum_hdr); scratch_free(cnt_hdr); + ray_release(result); + for (uint8_t a = 0; a < n_aggs; a++) + if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]); + for (uint8_t k = 0; k < n_keys; k++) + if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); + if (match_idx_block) ray_release(match_idx_block); + return key_col ? 
key_col : ray_error("oom", NULL); + } + key_col->len = (int64_t)grp_count; + + ray_t *_h_sum = NULL, *_h_cnt = NULL; + da_val_t* dense_sum = sp_need_sum + ? (da_val_t*)scratch_alloc(&_h_sum, + (size_t)grp_count * n_aggs * sizeof(da_val_t)) + : NULL; + int64_t* dense_count = (int64_t*)scratch_alloc( + &_h_cnt, (size_t)grp_count * sizeof(int64_t)); + if ((sp_need_sum && !dense_sum) || !dense_count) { + scratch_free(_h_sum); scratch_free(_h_cnt); + scratch_free(range_sum_hdr); scratch_free(cnt_hdr); + ray_release(key_col); ray_release(result); + for (uint8_t a = 0; a < n_aggs; a++) + if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]); + for (uint8_t k = 0; k < n_keys; k++) + if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); + if (match_idx_block) ray_release(match_idx_block); + return ray_error("oom", NULL); + } + if (sp_need_sum && !range_sum) + memset(dense_sum, 0, + (size_t)grp_count * n_aggs * sizeof(da_val_t)); + + uint32_t gi = 0; + for (uint64_t off = 0; off <= max_seen; off++) { + uint32_t cnt = range_count[off]; + if ((int64_t)cnt < keep_min) { + if (!range_sum) range_count[off] = 0; + continue; + } + write_col_i64(ray_data(key_col), gi, (int64_t)off, + key_col->type, key_col->attrs); + dense_count[gi] = (int64_t)cnt; + if (range_sum) { + memcpy(&dense_sum[(size_t)gi * n_aggs], + &range_sum[(size_t)off * n_aggs], + (size_t)n_aggs * sizeof(da_val_t)); + } + if (!range_sum) range_count[off] = gi + 1u; + gi++; + } + + if (sp_need_sum && !range_sum) { +#define DYN_DENSE_SUM_ROW(row_expr) \ + do { \ + int64_t dyn_row = (row_expr); \ + int64_t key = read_by_esz(key_data[0], dyn_row, key_esz); \ + if (key < 0 || (uint64_t)key > max_seen) break; \ + uint32_t marker = range_count[(uint64_t)key]; \ + if (!marker) break; \ + da_val_t* sums = &dense_sum[(size_t)(marker - 1u) * n_aggs]; \ + for (uint8_t a = 0; a < n_aggs; a++) { \ + if (ext->agg_ops[a] == OP_COUNT || !agg_ptrs[a]) continue; \ + if (agg_strlen[a]) \ + sums[a].i += group_strlen_at_cached( \ 
+ agg_vecs[a], dyn_row, strlen_sym_strings, strlen_sym_count); \ + else if (agg_f64_mask & (1u << a)) \ + sums[a].f += ((const double*)agg_ptrs[a])[dyn_row]; \ + else \ + sums[a].i += read_col_i64(agg_ptrs[a], dyn_row, agg_types[a], 0);\ + } \ + } while (0) + if (match_idx) { + for (int64_t i = 0; i < n_scan; i++) + DYN_DENSE_SUM_ROW(match_idx[i]); + } else if (rowsel) { + ray_rowsel_t* m = ray_rowsel_meta(rowsel); + const uint8_t* flags = ray_rowsel_flags(rowsel); + const uint32_t* offs = ray_rowsel_offsets(rowsel); + const uint16_t* idx = ray_rowsel_idx(rowsel); + uint32_t nseg = (uint32_t)((m->nrows + RAY_MORSEL_ELEMS - 1) / + RAY_MORSEL_ELEMS); + for (uint32_t seg = 0; seg < nseg; seg++) { + int64_t base = (int64_t)seg * RAY_MORSEL_ELEMS; + if (flags[seg] == RAY_SEL_NONE) continue; + if (flags[seg] == RAY_SEL_ALL) { + int64_t end = base + RAY_MORSEL_ELEMS; + if (end > m->nrows) end = m->nrows; + for (int64_t r = base; r < end; r++) + DYN_DENSE_SUM_ROW(r); + } else { + for (uint32_t p = offs[seg]; p < offs[seg + 1]; p++) + DYN_DENSE_SUM_ROW(base + idx[p]); + } + } + } else { + for (int64_t r = 0; r < n_scan; r++) + DYN_DENSE_SUM_ROW(r); + } +#undef DYN_DENSE_SUM_ROW + } + + ray_op_ext_t* key_ext = find_ext(g, ext->keys[0]->id); + int64_t name_id = key_ext ? 
key_ext->sym : 0; + result = ray_table_add_col(result, name_id, key_col); + ray_release(key_col); + emit_agg_columns(&result, g, ext, agg_vecs, grp_count, n_aggs, + (double*)dense_sum, (int64_t*)dense_sum, + NULL, NULL, NULL, NULL, + dense_count, agg_affine, NULL); + + scratch_free(_h_sum); scratch_free(_h_cnt); + scratch_free(range_sum_hdr); scratch_free(cnt_hdr); + for (uint8_t a = 0; a < n_aggs; a++) + if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]); + for (uint8_t k = 0; k < n_keys; k++) + if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); + if (match_idx_block) ray_release(match_idx_block); + return result; + } + + scratch_free(range_sum_hdr); + scratch_free(cnt_hdr); + } + + if (use_emit_filter && + (emit_filter.min_count_exclusive > 0 || + emit_filter.top_count_take > 0) && + key_types[0] != RAY_SYM && n_scan <= UINT32_MAX) { + bool have_key = false; + int64_t min_key = 0, max_key = 0; + for (int64_t i = 0; i < n_scan; i++) { + int64_t r = match_idx ? match_idx[i] : i; + if (!match_idx && rowsel && !group_rowsel_pass(rowsel, r)) + continue; + int64_t key = read_by_esz(key_data[0], r, key_esz); + if (!have_key) { + min_key = max_key = key; + have_key = true; + } else { + if (key < min_key) min_key = key; + if (key > max_key) max_key = key; + } + } + + uint64_t key_range = have_key + ? 
(uint64_t)((uint64_t)max_key - (uint64_t)min_key + 1u) + : 0u; + if (have_key && key_range > 0 && key_range <= (1u << 26)) { + ray_t *cnt_hdr = NULL, *range_sum_hdr = NULL; + ray_t *_h_sum = NULL, *_h_cnt = NULL; + uint32_t* range_count = (uint32_t*)scratch_calloc( + &cnt_hdr, (size_t)key_range * sizeof(uint32_t)); + if (!range_count) + goto ht_path; + da_val_t* range_sum = NULL; + if (sp_need_sum && key_range <= (1u << 24)) { + range_sum = (da_val_t*)scratch_calloc( + &range_sum_hdr, + (size_t)key_range * n_aggs * sizeof(da_val_t)); + if (!range_sum) { + scratch_free(cnt_hdr); + goto ht_path; + } + } + + for (int64_t i = 0; i < n_scan; i++) { + int64_t r = match_idx ? match_idx[i] : i; + if (!match_idx && rowsel && !group_rowsel_pass(rowsel, r)) + continue; + int64_t key = read_by_esz(key_data[0], r, key_esz); + uint64_t off = (uint64_t)((uint64_t)key - (uint64_t)min_key); + if (range_count[off] != UINT32_MAX) + range_count[off]++; + if (range_sum) { + da_val_t* sums = &range_sum[(size_t)off * n_aggs]; + for (uint8_t a = 0; a < n_aggs; a++) { + if (ext->agg_ops[a] == OP_COUNT || !agg_ptrs[a]) + continue; + if (agg_strlen[a]) + sums[a].i += group_strlen_at_cached( + agg_vecs[a], r, strlen_sym_strings, + strlen_sym_count); + else if (agg_f64_mask & (1u << a)) + sums[a].f += ((const double*)agg_ptrs[a])[r]; + else + sums[a].i += read_col_i64(agg_ptrs[a], r, + agg_types[a], 0); + } + } + } + + uint32_t total_groups = 0; + for (uint64_t off = 0; off < key_range; off++) { + if (range_count[off] > 0) + total_groups++; + } + int64_t keep_min = da_count_emit_keep_min_u32( + range_count, key_range, total_groups, emit_filter); + uint32_t grp_count = 0; + for (uint64_t off = 0; off < key_range; off++) { + if ((int64_t)range_count[off] >= keep_min) + grp_count++; + } + + ray_t* result = ray_table_new((int64_t)n_keys + n_aggs); + if (!result || RAY_IS_ERR(result)) { + scratch_free(range_sum_hdr); + scratch_free(cnt_hdr); + for (uint8_t a = 0; a < n_aggs; a++) + if (agg_owned[a] 
&& agg_vecs[a]) ray_release(agg_vecs[a]); + for (uint8_t k = 0; k < n_keys; k++) + if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); + if (match_idx_block) ray_release(match_idx_block); + return result ? result : ray_error("oom", NULL); + } + + ray_t* key_col = col_vec_new(key_vecs[0], (int64_t)grp_count); + if (!key_col || RAY_IS_ERR(key_col)) { + scratch_free(range_sum_hdr); + scratch_free(cnt_hdr); + ray_release(result); + for (uint8_t a = 0; a < n_aggs; a++) + if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]); + for (uint8_t k = 0; k < n_keys; k++) + if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); + if (match_idx_block) ray_release(match_idx_block); + return key_col ? key_col : ray_error("oom", NULL); + } + key_col->len = (int64_t)grp_count; + + da_val_t* dense_sum = sp_need_sum + ? (da_val_t*)scratch_calloc(&_h_sum, + (size_t)grp_count * n_aggs * sizeof(da_val_t)) + : NULL; + int64_t* dense_count = (int64_t*)scratch_alloc( + &_h_cnt, (size_t)grp_count * sizeof(int64_t)); + if ((sp_need_sum && !dense_sum) || !dense_count) { + scratch_free(_h_sum); scratch_free(_h_cnt); + scratch_free(range_sum_hdr); + scratch_free(cnt_hdr); + ray_release(key_col); ray_release(result); + for (uint8_t a = 0; a < n_aggs; a++) + if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]); + for (uint8_t k = 0; k < n_keys; k++) + if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); + if (match_idx_block) ray_release(match_idx_block); + return ray_error("oom", NULL); + } + + uint32_t gi = 0; + for (uint64_t off = 0; off < key_range; off++) { + uint32_t cnt = range_count[off]; + if ((int64_t)cnt < keep_min) { + range_count[off] = 0; + continue; + } + int64_t key = (int64_t)((uint64_t)min_key + off); + write_col_i64(ray_data(key_col), gi, key, + key_col->type, key_col->attrs); + dense_count[gi] = (int64_t)cnt; + if (range_sum) { + memcpy(&dense_sum[(size_t)gi * n_aggs], + &range_sum[(size_t)off * n_aggs], + (size_t)n_aggs * sizeof(da_val_t)); + } + 
range_count[off] = gi + 1u; + gi++; + } + + if (sp_need_sum && !range_sum) { + for (int64_t i = 0; i < n_scan; i++) { + int64_t r = match_idx ? match_idx[i] : i; + if (!match_idx && rowsel && !group_rowsel_pass(rowsel, r)) + continue; + int64_t key = read_by_esz(key_data[0], r, key_esz); + uint64_t off = (uint64_t)((uint64_t)key - (uint64_t)min_key); + uint32_t marker = range_count[off]; + if (!marker) continue; + da_val_t* sums = &dense_sum[(size_t)(marker - 1u) * n_aggs]; + for (uint8_t a = 0; a < n_aggs; a++) { + if (ext->agg_ops[a] == OP_COUNT || !agg_ptrs[a]) + continue; + if (agg_strlen[a]) + sums[a].i += group_strlen_at_cached( + agg_vecs[a], r, strlen_sym_strings, + strlen_sym_count); + else if (agg_f64_mask & (1u << a)) + sums[a].f += ((const double*)agg_ptrs[a])[r]; + else + sums[a].i += read_col_i64(agg_ptrs[a], r, + agg_types[a], 0); + } + } + } + + scratch_free(range_sum_hdr); + scratch_free(cnt_hdr); + ray_op_ext_t* key_ext = find_ext(g, ext->keys[0]->id); + int64_t name_id = key_ext ? 
key_ext->sym : 0; + result = ray_table_add_col(result, name_id, key_col); + ray_release(key_col); + + emit_agg_columns(&result, g, ext, agg_vecs, grp_count, n_aggs, + (double*)dense_sum, (int64_t*)dense_sum, + NULL, NULL, NULL, NULL, + dense_count, agg_affine, NULL); + + scratch_free(_h_sum); + scratch_free(_h_cnt); + for (uint8_t a = 0; a < n_aggs; a++) + if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]); + for (uint8_t k = 0; k < n_keys; k++) + if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); + if (match_idx_block) ray_release(match_idx_block); + return result; + } + } + + sparse_i64_ht_t sp_ht; + memset(&sp_ht, 0, sizeof(sp_ht)); + bool sp_ok = true; + + if (use_emit_filter && + (emit_filter.min_count_exclusive > 0 || + emit_filter.top_count_take > 0)) { + uint64_t expected = (uint64_t)nrows / 64u; + if (expected < 4096) expected = 4096; + if (expected > (1u << 20)) expected = (1u << 20); + if (!sparse_i64_init(&sp_ht, (uint32_t)expected, n_aggs, false)) + goto ht_path; + + for (int64_t i = 0; i < n_scan; i++) { + int64_t r = match_idx ? match_idx[i] : i; + if (!match_idx && rowsel && !group_rowsel_pass(rowsel, r)) + continue; + int64_t key = read_by_esz(key_data[0], r, key_esz); + int32_t slot; + if (!sparse_i64_touch(&sp_ht, key, n_aggs, false, &slot)) { + sp_ok = false; + break; + } + sp_ht.counts[slot]++; + } + } else { + uint64_t expected = (uint64_t)nrows / 64u; + if (expected < 4096) expected = 4096; + if (expected > (1u << 20)) expected = (1u << 20); + if (!sparse_i64_init(&sp_ht, (uint32_t)expected, n_aggs, sp_need_sum)) + goto ht_path; + + for (int64_t i = 0; i < n_scan; i++) { + int64_t r = match_idx ? 
match_idx[i] : i; + if (!match_idx && rowsel && !group_rowsel_pass(rowsel, r)) + continue; + int64_t key = read_by_esz(key_data[0], r, key_esz); + int32_t slot; + if (!sparse_i64_touch(&sp_ht, key, n_aggs, sp_need_sum, &slot)) { + sp_ok = false; + break; + } + sp_ht.counts[slot]++; + if (!sp_need_sum) continue; + da_val_t* sums = &sp_ht.sums[(size_t)slot * n_aggs]; + for (uint8_t a = 0; a < n_aggs; a++) { + if (ext->agg_ops[a] == OP_COUNT || !agg_ptrs[a]) + continue; + if (agg_strlen[a]) + sums[a].i += group_strlen_at_cached( + agg_vecs[a], r, strlen_sym_strings, + strlen_sym_count); + else if (agg_f64_mask & (1u << a)) + sums[a].f += ((const double*)agg_ptrs[a])[r]; + else + sums[a].i += read_col_i64(agg_ptrs[a], r, agg_types[a], 0); + } + } + } + if (!sp_ok) { + sparse_i64_free(&sp_ht); + goto ht_path; + } + + uint32_t total_groups = 0; + for (uint32_t s = 0; s < sp_ht.cap; s++) { + if (!sp_ht.used[s]) continue; + total_groups++; + } + int64_t keep_min = use_emit_filter + ? da_count_emit_keep_min(sp_ht.counts, sp_ht.cap, + total_groups, emit_filter) + : 1; + uint32_t grp_count = 0; + for (uint32_t s = 0; s < sp_ht.cap; s++) { + if (!sp_ht.used[s]) continue; + if (sp_ht.counts[s] < keep_min) continue; + grp_count++; + } + ray_t* result = ray_table_new((int64_t)n_keys + n_aggs); + if (!result || RAY_IS_ERR(result)) { + sparse_i64_free(&sp_ht); + for (uint8_t a = 0; a < n_aggs; a++) + if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]); + for (uint8_t k = 0; k < n_keys; k++) + if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); + if (match_idx_block) ray_release(match_idx_block); + return result ? 
result : ray_error("oom", NULL); + } + + ray_t* key_col = col_vec_new(key_vecs[0], (int64_t)grp_count); + if (!key_col || RAY_IS_ERR(key_col)) { + sparse_i64_free(&sp_ht); + ray_release(result); + for (uint8_t a = 0; a < n_aggs; a++) + if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]); + for (uint8_t k = 0; k < n_keys; k++) + if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); + if (match_idx_block) ray_release(match_idx_block); + return key_col ? key_col : ray_error("oom", NULL); + } + key_col->len = (int64_t)grp_count; + + ray_t *_h_sum = NULL, *_h_cnt = NULL; + da_val_t* dense_sum = sp_need_sum + ? (da_val_t*)scratch_alloc(&_h_sum, + (size_t)grp_count * n_aggs * sizeof(da_val_t)) + : NULL; + int64_t* dense_count = (int64_t*)scratch_alloc(&_h_cnt, + (size_t)grp_count * sizeof(int64_t)); + if ((sp_need_sum && !dense_sum) || !dense_count) { + scratch_free(_h_sum); scratch_free(_h_cnt); + ray_release(key_col); ray_release(result); + sparse_i64_free(&sp_ht); + for (uint8_t a = 0; a < n_aggs; a++) + if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]); + for (uint8_t k = 0; k < n_keys; k++) + if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); + if (match_idx_block) ray_release(match_idx_block); + return ray_error("oom", NULL); + } + if (use_emit_filter && sp_need_sum) + memset(dense_sum, 0, (size_t)grp_count * n_aggs * sizeof(da_val_t)); + + sparse_i64_ht_t heavy_ht; + memset(&heavy_ht, 0, sizeof(heavy_ht)); + if (use_emit_filter && grp_count > 0) { + if (!sparse_i64_init(&heavy_ht, grp_count * 2u, n_aggs, false)) { + scratch_free(_h_sum); scratch_free(_h_cnt); + ray_release(key_col); ray_release(result); + sparse_i64_free(&sp_ht); + for (uint8_t a = 0; a < n_aggs; a++) + if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]); + for (uint8_t k = 0; k < n_keys; k++) + if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); + if (match_idx_block) ray_release(match_idx_block); + return ray_error("oom", NULL); + } + } + + uint32_t gi = 0; 
+ for (uint32_t s = 0; s < sp_ht.cap; s++) { + if (!sp_ht.used[s]) continue; + if (sp_ht.counts[s] < keep_min) continue; + write_col_i64(ray_data(key_col), gi, sp_ht.keys[s], + key_col->type, key_col->attrs); + dense_count[gi] = sp_ht.counts[s]; + if (use_emit_filter) { + int32_t hslot; + if (!sparse_i64_touch(&heavy_ht, sp_ht.keys[s], n_aggs, false, &hslot)) { + scratch_free(_h_sum); scratch_free(_h_cnt); + ray_release(key_col); ray_release(result); + sparse_i64_free(&heavy_ht); + sparse_i64_free(&sp_ht); + for (uint8_t a = 0; a < n_aggs; a++) + if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]); + for (uint8_t k = 0; k < n_keys; k++) + if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); + if (match_idx_block) ray_release(match_idx_block); + return ray_error("oom", NULL); + } + heavy_ht.counts[hslot] = gi; + } else if (sp_need_sum) { + memcpy(&dense_sum[(size_t)gi * n_aggs], + &sp_ht.sums[(size_t)s * n_aggs], + (size_t)n_aggs * sizeof(da_val_t)); + } + gi++; + } + sparse_i64_free(&sp_ht); + + if (use_emit_filter && sp_need_sum) { + for (int64_t i = 0; i < n_scan; i++) { + int64_t r = match_idx ? 
match_idx[i] : i; + if (!match_idx && rowsel && !group_rowsel_pass(rowsel, r)) + continue; + int64_t key = read_by_esz(key_data[0], r, key_esz); + int32_t hslot = sparse_i64_find_slot(&heavy_ht, key); + if (!heavy_ht.used[hslot] || heavy_ht.keys[hslot] != key) + continue; + uint32_t out_gi = (uint32_t)heavy_ht.counts[hslot]; + da_val_t* sums = &dense_sum[(size_t)out_gi * n_aggs]; + for (uint8_t a = 0; a < n_aggs; a++) { + if (ext->agg_ops[a] == OP_COUNT || !agg_ptrs[a]) + continue; + if (agg_strlen[a]) + sums[a].i += group_strlen_at_cached( + agg_vecs[a], r, strlen_sym_strings, + strlen_sym_count); + else if (agg_f64_mask & (1u << a)) + sums[a].f += ((const double*)agg_ptrs[a])[r]; + else + sums[a].i += read_col_i64(agg_ptrs[a], r, agg_types[a], 0); + } + } + } + sparse_i64_free(&heavy_ht); + ray_op_ext_t* key_ext = find_ext(g, ext->keys[0]->id); + int64_t name_id = key_ext ? key_ext->sym : 0; + result = ray_table_add_col(result, name_id, key_col); + ray_release(key_col); + + emit_agg_columns(&result, g, ext, agg_vecs, grp_count, n_aggs, + (double*)dense_sum, (int64_t*)dense_sum, + NULL, NULL, NULL, NULL, + dense_count, agg_affine, NULL); + + scratch_free(_h_sum); + scratch_free(_h_cnt); + for (uint8_t a = 0; a < n_aggs; a++) + if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]); + for (uint8_t k = 0; k < n_keys; k++) + if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); + if (match_idx_block) ray_release(match_idx_block); + return result; + } + } + ht_path:; /* Compute which accumulator arrays the HT needs based on agg ops. * COUNT only reads group row's count field — no accumulator needed. */ @@ -4340,7 +5538,9 @@ ht_path:; uint32_t n_total = pool ? 
ray_pool_total_workers(pool) : 1; group_ht_t single_ht; + group_ht_t top_ht; group_ht_t* final_ht = NULL; + bool top_ht_ready = false; ray_t* result = NULL; ray_t* radix_bufs_hdr = NULL; @@ -4348,7 +5548,495 @@ ht_path:; ray_t* part_hts_hdr = NULL; group_ht_t* part_hts = NULL; - if (pool && nrows >= RAY_PARALLEL_THRESHOLD && n_total > 1) { + if (use_emit_filter && emit_filter.top_count_take > 0 && + n_keys > 1) { + bool top_count_nonselective = false; + if (n_keys >= 2 && n_keys <= 5) { + bool supported = true; + bool nullable = false; + for (uint16_t k = 0; k < n_keys; k++) { + if (key_types[k] == RAY_F64 || key_types[k] == RAY_GUID || + key_types[k] == RAY_STR) { + supported = false; + break; + } + ray_t* src = key_vecs[k] && (key_vecs[k]->attrs & RAY_ATTR_SLICE) + ? key_vecs[k]->slice_parent : key_vecs[k]; + if (src && (src->attrs & RAY_ATTR_HAS_NULLS)) { + nullable = true; + break; + } + } + if (!nullable && n_scan > 0 && n_scan <= INT32_MAX) { + uint64_t want = ((uint64_t)n_scan * 4u) / 3u; + uint32_t cap = 256; + while ((uint64_t)cap < want && cap < (1u << 28)) cap <<= 1; + if (supported && (uint64_t)cap >= want) { + ray_t *hk[5] = { NULL, NULL, NULL, NULL, NULL }; + ray_t *hc = NULL; + int64_t* ck64[5] = { NULL, NULL, NULL, NULL, NULL }; + int32_t* ck32[5] = { NULL, NULL, NULL, NULL, NULL }; + uint8_t key_is_i32[5] = { 0, 0, 0, 0, 0 }; + for (uint16_t k = 0; k < n_keys; k++) { + int8_t kt = key_types[k]; + key_is_i32[k] = (kt == RAY_I32 || kt == RAY_DATE || + kt == RAY_TIME || kt == RAY_I16 || + kt == RAY_BOOL || kt == RAY_U8); + if (key_is_i32[k]) + ck32[k] = (int32_t*)scratch_alloc(&hk[k], (size_t)cap * sizeof(int32_t)); + else + ck64[k] = (int64_t*)scratch_alloc(&hk[k], (size_t)cap * sizeof(int64_t)); + } + uint16_t* cc = (uint16_t*)scratch_calloc(&hc, (size_t)cap * sizeof(uint16_t)); + bool keys_alloc_ok = true; + for (uint16_t k = 0; k < n_keys; k++) { + if (key_is_i32[k] ? 
(ck32[k] == NULL) : (ck64[k] == NULL)) { + keys_alloc_ok = false; + break; + } + } + if (keys_alloc_ok && cc) { + uint32_t mask = cap - 1; + bool counted_fast = false; + bool count_overflow = false; +#define TOP_COUNT2_FIXED_LOOP(T0, T1) do { \ + const T0* d0 = (const T0*)key_data[0]; \ + const T1* d1 = (const T1*)key_data[1]; \ + for (int64_t r = 0; r < n_scan; r++) { \ + int64_t v0 = (int64_t)d0[r]; \ + int64_t v1 = (int64_t)d1[r]; \ + uint64_t h = ray_hash_combine(ray_hash_i64(v0), \ + ray_hash_i64(v1));\ + uint32_t slot = (uint32_t)(h & mask); \ + while (cc[slot]) { \ + int64_t s0 = key_is_i32[0] \ + ? (int64_t)ck32[0][slot] \ + : ck64[0][slot]; \ + int64_t s1 = key_is_i32[1] \ + ? (int64_t)ck32[1][slot] \ + : ck64[1][slot]; \ + if (s0 == v0 && s1 == v1) break; \ + slot = (slot + 1) & mask; \ + } \ + if (!cc[slot]) { \ + if (key_is_i32[0]) ck32[0][slot] = (int32_t)v0; \ + else ck64[0][slot] = v0; \ + if (key_is_i32[1]) ck32[1][slot] = (int32_t)v1; \ + else ck64[1][slot] = v1; \ + cc[slot] = 1; \ + } else if (cc[slot] != UINT16_MAX) { \ + cc[slot]++; \ + } else { \ + count_overflow = true; \ + } \ + } \ + counted_fast = true; \ + } while (0) + if (!rowsel && !match_idx && + n_keys == 2 && key_types[0] != RAY_SYM && + key_types[1] != RAY_SYM) { + bool k0_64 = (key_types[0] == RAY_I64 || + key_types[0] == RAY_TIMESTAMP); + bool k1_64 = (key_types[1] == RAY_I64 || + key_types[1] == RAY_TIMESTAMP); + bool k0_32 = (key_types[0] == RAY_I32 || + key_types[0] == RAY_DATE || + key_types[0] == RAY_TIME); + bool k1_32 = (key_types[1] == RAY_I32 || + key_types[1] == RAY_DATE || + key_types[1] == RAY_TIME); + if (k0_64 && k1_64) TOP_COUNT2_FIXED_LOOP(int64_t, int64_t); + else if (k0_64 && k1_32) TOP_COUNT2_FIXED_LOOP(int64_t, int32_t); + else if (k0_32 && k1_64) TOP_COUNT2_FIXED_LOOP(int32_t, int64_t); + else if (k0_32 && k1_32) TOP_COUNT2_FIXED_LOOP(int32_t, int32_t); + } +#undef TOP_COUNT2_FIXED_LOOP + if (!counted_fast) { + for (int64_t i = 0; i < n_scan; i++) { + int64_t 
r = match_idx ? match_idx[i] : i; + if (!match_idx && rowsel && !group_rowsel_pass(rowsel, r)) + continue; + int64_t vals[5] = { 0, 0, 0, 0, 0 }; + uint64_t h = 0; + for (uint16_t k = 0; k < n_keys; k++) { + vals[k] = read_col_i64(key_data[k], r, key_types[k], key_attrs[k]); + uint64_t kh = ray_hash_i64(vals[k]); + h = (k == 0) ? kh : ray_hash_combine(h, kh); + } + uint32_t slot = (uint32_t)(h & mask); + while (cc[slot]) { + bool same = true; + for (uint16_t k = 0; k < n_keys; k++) { + int64_t stored = key_is_i32[k] + ? (int64_t)ck32[k][slot] + : ck64[k][slot]; + if (stored != vals[k]) { + same = false; + break; + } + } + if (same) break; + slot = (slot + 1) & mask; + } + if (!cc[slot]) { + for (uint16_t k = 0; k < n_keys; k++) { + if (key_is_i32[k]) + ck32[k][slot] = (int32_t)vals[k]; + else + ck64[k][slot] = vals[k]; + } + cc[slot] = 1; + } else if (cc[slot] != UINT16_MAX) { + cc[slot]++; + } else { + count_overflow = true; + } + } + } + + if (!count_overflow) { + int64_t k_take = emit_filter.top_count_take; + int64_t heap_n = 0; + int64_t heap[1024]; + uint32_t heap_slots[1024]; + if (k_take > (int64_t)(sizeof(heap) / sizeof(heap[0]))) + k_take = (int64_t)(sizeof(heap) / sizeof(heap[0])); + uint32_t total_groups = 0; + for (uint32_t i = 0; i < cap; i++) { + if (!cc[i]) continue; + total_groups++; + int64_t cnt = cc[i]; + if (heap_n < k_take) { + int64_t j = heap_n++; + heap[j] = cnt; + heap_slots[j] = i; + while (j > 0) { + int64_t parent = (j - 1) >> 1; + if (heap[parent] <= heap[j]) break; + int64_t tmp = heap[parent]; heap[parent] = heap[j]; heap[j] = tmp; + uint32_t stmp = heap_slots[parent]; heap_slots[parent] = heap_slots[j]; heap_slots[j] = stmp; + j = parent; + } + } else if (cnt > heap[0]) { + heap[0] = cnt; + heap_slots[0] = i; + int64_t j = 0; + for (;;) { + int64_t l = j * 2 + 1, rr = l + 1, m = j; + if (l < heap_n && heap[l] < heap[m]) m = l; + if (rr < heap_n && heap[rr] < heap[m]) m = rr; + if (m == j) break; + int64_t tmp = heap[m]; heap[m] = 
heap[j]; heap[j] = tmp; + uint32_t stmp = heap_slots[m]; heap_slots[m] = heap_slots[j]; heap_slots[j] = stmp; + j = m; + } + } + } + uint32_t heavy_count = (uint32_t)heap_n; + + if (heavy_count > 0 && total_groups > 0) { + uint32_t hcap = 256; + while (hcap < heavy_count * 2u && hcap < (1u << 30)) hcap <<= 1; + memset(&top_ht, 0, sizeof(top_ht)); + if (group_ht_init_sized(&top_ht, hcap, &ght_layout, heavy_count)) { + top_ht_ready = true; + group_ht_set_key_data(&top_ht, key_data); + int64_t keys[6] = { 0, 0, 0, 0, 0, 0 }; + for (uint32_t hi = 0; hi < heavy_count && !top_ht.oom; hi++) { + uint32_t i = heap_slots[hi]; + for (uint16_t k = 0; k < n_keys; k++) + keys[k] = key_is_i32[k] + ? (int64_t)ck32[k][i] + : ck64[k][i]; + keys[n_keys] = 0; + uint32_t gid = top_ht.grp_count; + if (!group_ht_insert_empty_group(&top_ht, keys, key_types)) + break; + if (gid < top_ht.grp_count) { + char* row = top_ht.rows + (size_t)gid * ght_layout.row_stride; + *(int64_t*)row = (int64_t)cc[i]; + } + } + if (!top_ht.oom) { + bool count_only = true; + for (uint8_t a = 0; a < n_aggs; a++) { + if (ext->agg_ops[a] != OP_COUNT) { + count_only = false; + break; + } + } + if (count_only) { + for (uint16_t k = 0; k < n_keys; k++) + scratch_free(hk[k]); + scratch_free(hc); + final_ht = &top_ht; + goto build_from_final_ht; + } + bool direct_ok = (heavy_count <= 64); + for (uint8_t a = 0; a < n_aggs && direct_ok; a++) { + uint16_t aop = ext->agg_ops[a]; + if (aop == OP_COUNT) continue; + if ((aop == OP_SUM || aop == OP_AVG) && + agg_vecs[a] && !agg_strlen[a] && + agg_vecs[a]->type != RAY_STR && + agg_vecs[a]->type != RAY_GUID) + continue; + direct_ok = false; + } + if (direct_ok) { + int64_t sel_keys[64][5]; + for (uint32_t hi = 0; hi < heavy_count; hi++) { + uint32_t i = heap_slots[hi]; + for (uint16_t k = 0; k < n_keys; k++) + sel_keys[hi][k] = key_is_i32[k] + ? 
(int64_t)ck32[k][i] + : ck64[k][i]; + } + bool unique_first_key = true; + for (uint32_t i = 0; i < heavy_count && unique_first_key; i++) { + for (uint32_t j = i + 1; j < heavy_count; j++) { + if (sel_keys[i][0] == sel_keys[j][0]) { + unique_first_key = false; + break; + } + } + } + uint8_t lk_used[256]; + uint32_t lk_idx[256]; + int64_t lk_key[256]; + if (unique_first_key) { + memset(lk_used, 0, sizeof(lk_used)); + for (uint32_t hi = 0; hi < heavy_count; hi++) { + uint32_t pos = (uint32_t)ray_hash_i64(sel_keys[hi][0]) & 255u; + while (lk_used[pos]) + pos = (pos + 1u) & 255u; + lk_used[pos] = 1; + lk_key[pos] = sel_keys[hi][0]; + lk_idx[pos] = hi; + } + } + for (uint16_t k = 0; k < n_keys; k++) + scratch_free(hk[k]); + scratch_free(hc); + + for (int64_t i = 0; i < n_scan; i++) { + int64_t r = match_idx ? match_idx[i] : i; + if (!match_idx && rowsel && !group_rowsel_pass(rowsel, r)) + continue; + int64_t v0 = read_col_i64(key_data[0], r, key_types[0], key_attrs[0]); + uint32_t hit = UINT32_MAX; + if (unique_first_key) { + uint32_t pos = (uint32_t)ray_hash_i64(v0) & 255u; + while (lk_used[pos]) { + if (lk_key[pos] == v0) { + hit = lk_idx[pos]; + break; + } + pos = (pos + 1u) & 255u; + } + if (hit == UINT32_MAX) continue; + int64_t v1 = read_col_i64(key_data[1], r, key_types[1], key_attrs[1]); + if (sel_keys[hit][1] != v1) continue; + if (n_keys >= 3) { + int64_t v2 = read_col_i64(key_data[2], r, key_types[2], key_attrs[2]); + if (sel_keys[hit][2] != v2) continue; + } + if (n_keys >= 4) { + int64_t v3 = read_col_i64(key_data[3], r, key_types[3], key_attrs[3]); + if (sel_keys[hit][3] != v3) continue; + } + if (n_keys == 5) { + int64_t v4 = read_col_i64(key_data[4], r, key_types[4], key_attrs[4]); + if (sel_keys[hit][4] != v4) continue; + } + } else { + int64_t v1 = read_col_i64(key_data[1], r, key_types[1], key_attrs[1]); + int64_t v2 = 0; + int64_t v3 = 0; + int64_t v4 = 0; + if (n_keys >= 3) + v2 = read_col_i64(key_data[2], r, key_types[2], key_attrs[2]); + if (n_keys 
>= 4) + v3 = read_col_i64(key_data[3], r, key_types[3], key_attrs[3]); + if (n_keys == 5) + v4 = read_col_i64(key_data[4], r, key_types[4], key_attrs[4]); + for (uint32_t hi = 0; hi < heavy_count; hi++) { + if (sel_keys[hi][0] == v0 && sel_keys[hi][1] == v1 && + (n_keys == 2 || sel_keys[hi][2] == v2) && + (n_keys < 4 || sel_keys[hi][3] == v3) && + (n_keys < 5 || sel_keys[hi][4] == v4)) { + hit = hi; + break; + } + } + if (hit == UINT32_MAX) continue; + } + char* row = top_ht.rows + (size_t)hit * ght_layout.row_stride; + (*(int64_t*)row)++; + for (uint8_t a = 0; a < n_aggs; a++) { + uint16_t aop = ext->agg_ops[a]; + if (aop == OP_COUNT || !agg_vecs[a]) continue; + int8_t s = ght_layout.agg_val_slot[a]; + if (s < 0) continue; + if (agg_vecs[a]->type == RAY_F64) { + double* dst = &ROW_WR_F64(row, ght_layout.off_sum, s); + *dst += ((const double*)ray_data(agg_vecs[a]))[r]; + } else { + int64_t* dst = &ROW_WR_I64(row, ght_layout.off_sum, s); + *dst += read_col_i64(ray_data(agg_vecs[a]), r, + agg_vecs[a]->type, + agg_vecs[a]->attrs); + } + } + } + final_ht = &top_ht; + goto build_from_final_ht; + } + for (uint16_t k = 0; k < n_keys; k++) + scratch_free(hk[k]); + scratch_free(hc); + group_rows_range_existing(&top_ht, key_data, key_types, + key_attrs, key_vecs, agg_vecs, agg_strlen, rowsel, + 0, n_scan, match_idx); + if (ray_interrupted()) { + result = ray_error("cancel", "interrupted"); + goto cleanup; + } + final_ht = &top_ht; + goto build_from_final_ht; + } + } + } + } + } + for (uint16_t k = 0; k < n_keys; k++) + scratch_free(hk[k]); + scratch_free(hc); + if (top_ht_ready) { + group_ht_free(&top_ht); + top_ht_ready = false; + } + } + } + } + + if (top_count_nonselective) + goto skip_top_count_filter; + if (rowsel || match_idx) + goto skip_top_count_filter; + + uint16_t cnt_op = OP_COUNT; + ray_t* cnt_vecs[1] = { NULL }; + ght_layout_t cnt_layout = + ght_compute_layout(n_keys, 1, cnt_vecs, 0, &cnt_op, key_types); + pivot_ingest_t cnt_ingest; + if 
(pivot_ingest_run(&cnt_ingest, &cnt_layout, key_data, key_types, + key_attrs, key_vecs, cnt_vecs, n_scan)) { + if (ray_interrupted()) { + pivot_ingest_free(&cnt_ingest); + result = ray_error("cancel", "interrupted"); + goto cleanup; + } + + int64_t k_take = emit_filter.top_count_take; + int64_t heap_n = 0; + int64_t heap[1024]; + if (k_take > (int64_t)(sizeof(heap) / sizeof(heap[0]))) + k_take = (int64_t)(sizeof(heap) / sizeof(heap[0])); + int64_t total_count_groups = 0; + for (uint32_t p = 0; p < cnt_ingest.n_parts; p++) { + group_ht_t* ph = &cnt_ingest.part_hts[p]; + uint16_t rs = ph->layout.row_stride; + total_count_groups += ph->grp_count; + for (uint32_t gi = 0; gi < ph->grp_count; gi++) { + const char* row = ph->rows + (size_t)gi * rs; + int64_t cnt = *(const int64_t*)(const void*)row; + if (heap_n < k_take) { + int64_t j = heap_n++; + heap[j] = cnt; + while (j > 0) { + int64_t parent = (j - 1) >> 1; + if (heap[parent] <= heap[j]) break; + int64_t tmp = heap[parent]; heap[parent] = heap[j]; heap[j] = tmp; + j = parent; + } + } else if (cnt > heap[0]) { + heap[0] = cnt; + int64_t j = 0; + for (;;) { + int64_t l = j * 2 + 1, r = l + 1, m = j; + if (l < heap_n && heap[l] < heap[m]) m = l; + if (r < heap_n && heap[r] < heap[m]) m = r; + if (m == j) break; + int64_t tmp = heap[m]; heap[m] = heap[j]; heap[j] = tmp; + j = m; + } + } + } + } + + int64_t threshold = (heap_n == k_take) ? 
heap[0] : 1; + uint32_t heavy_count = 0; + for (uint32_t p = 0; p < cnt_ingest.n_parts; p++) { + group_ht_t* ph = &cnt_ingest.part_hts[p]; + uint16_t rs = ph->layout.row_stride; + for (uint32_t gi = 0; gi < ph->grp_count; gi++) { + const char* row = ph->rows + (size_t)gi * rs; + int64_t cnt = *(const int64_t*)(const void*)row; + if (cnt >= threshold) heavy_count++; + } + } + + if (threshold <= 1 || + (uint64_t)heavy_count * 4u >= (uint64_t)total_count_groups * 3u) { + pivot_ingest_free(&cnt_ingest); + goto skip_top_count_filter; + } + + if (heavy_count > 0 && total_count_groups > 0) { + uint32_t cap = 256; + while (cap < heavy_count * 2u && cap < (1u << 30)) cap <<= 1; + memset(&top_ht, 0, sizeof(top_ht)); + if (group_ht_init_sized(&top_ht, cap, &ght_layout, heavy_count)) { + top_ht_ready = true; + group_ht_set_key_data(&top_ht, key_data); + for (uint32_t p = 0; p < cnt_ingest.n_parts && !top_ht.oom; p++) { + group_ht_t* ph = &cnt_ingest.part_hts[p]; + uint16_t rs = ph->layout.row_stride; + for (uint32_t gi = 0; gi < ph->grp_count; gi++) { + const char* row = ph->rows + (size_t)gi * rs; + int64_t cnt = *(const int64_t*)(const void*)row; + if (cnt < threshold) continue; + const int64_t* keys = (const int64_t*)(const void*)(row + 8); + if (!group_ht_insert_empty_group(&top_ht, keys, key_types)) + break; + } + } + if (!top_ht.oom) { + pivot_ingest_free(&cnt_ingest); + group_rows_range_existing(&top_ht, key_data, key_types, + key_attrs, key_vecs, agg_vecs, agg_strlen, rowsel, + 0, n_scan, match_idx); + if (ray_interrupted()) { + result = ray_error("cancel", "interrupted"); + goto cleanup; + } + final_ht = &top_ht; + goto build_from_final_ht; + } + } + } + pivot_ingest_free(&cnt_ingest); + if (top_ht_ready) { + group_ht_free(&top_ht); + top_ht_ready = false; + } + } + } + +skip_top_count_filter: + + if (pool && nrows >= RAY_PARALLEL_THRESHOLD && n_total > 1 && !rowsel) { size_t n_bufs = (size_t)n_total * RADIX_P; radix_bufs = 
(radix_buf_t*)scratch_calloc(&radix_bufs_hdr, n_bufs * sizeof(radix_buf_t)); @@ -4394,9 +6082,11 @@ ht_path:; .key_vecs = key_vecs, .nullable_mask = p1_nullable, .agg_vecs = agg_vecs, + .agg_strlen = agg_strlen, .n_workers = n_total, .bufs = radix_bufs, .layout = ght_layout, + .rowsel = rowsel, .match_idx = match_idx, }; ray_pool_dispatch(pool, radix_phase1_fn, &p1ctx, n_scan); @@ -4438,6 +6128,16 @@ ht_path:; ray_pool_dispatch_n(pool, radix_phase2_fn, &p2ctx, RADIX_P); CHECK_CANCEL_GOTO(pool, cleanup); + if (radix_bufs) { + size_t n_bufs_free = (size_t)n_total * RADIX_P; + for (size_t i = 0; i < n_bufs_free; i++) + scratch_free(radix_bufs[i]._hdr); + scratch_free(radix_bufs_hdr); + radix_bufs = NULL; + radix_bufs_hdr = NULL; + ray_heap_gc(); + } + /* Prefix offsets */ uint32_t part_offsets[RADIX_P + 1]; part_offsets[0] = 0; @@ -4640,12 +6340,14 @@ sequential_fallback:; goto cleanup; } group_rows_range(&single_ht, key_data, key_types, key_attrs, key_vecs, agg_vecs, + agg_strlen, rowsel, 0, n_scan, match_idx); final_ht = &single_ht; if (ray_interrupted()) { result = ray_error("cancel", "interrupted"); goto cleanup; } if (single_ht.oom) { result = ray_error("oom", NULL); goto cleanup; } /* Build result from sequential HT (inline row layout) */ +build_from_final_ht: { uint32_t grp_count = final_ht->grp_count; const ght_layout_t* ly = &final_ht->layout; @@ -4854,6 +6556,9 @@ sequential_fallback:; if (final_ht == &single_ht) { group_ht_free(&single_ht); } + if (top_ht_ready) { + group_ht_free(&top_ht); + } if (radix_bufs) { size_t n_bufs = (size_t)n_total * RADIX_P; for (size_t i = 0; i < n_bufs; i++) scratch_free(radix_bufs[i]._hdr); @@ -4871,6 +6576,8 @@ sequential_fallback:; if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); if (match_idx_block) ray_release(match_idx_block); + ray_heap_gc(); + return result; } @@ -5448,7 +7155,7 @@ static void pivot_ingest_sequential(pivot_ingest_t* out, const ght_layout_t* ly, out->n_parts = 1; out->row_stride = 
ly->row_stride; group_rows_range(scratch_ht, key_data, key_types, key_attrs, key_vecs, - agg_vecs, 0, n_scan, NULL); + agg_vecs, NULL, NULL, 0, n_scan, NULL); out->total_grps = scratch_ht->grp_count; out->part_offsets[0] = 0; out->part_offsets[1] = scratch_ht->grp_count; diff --git a/src/ops/internal.h b/src/ops/internal.h index 33b7218f..658bc0cf 100644 --- a/src/ops/internal.h +++ b/src/ops/internal.h @@ -896,6 +896,14 @@ ght_layout_t ght_compute_layout(uint8_t n_keys, uint8_t n_aggs, const int8_t* key_types); bool group_ht_init(group_ht_t* ht, uint32_t cap, const ght_layout_t* ly); void group_ht_free(group_ht_t* ht); +typedef struct { + int enabled; + uint8_t agg_index; + int64_t min_count_exclusive; + int64_t top_count_take; +} ray_group_emit_filter_t; +ray_group_emit_filter_t ray_group_emit_filter_get(void); +void ray_group_emit_filter_set(ray_group_emit_filter_t filter); /* Hash-aggregate rows [start, end) into ht. * * When match_idx is non-NULL, the loop iterates `i` in [start, end) @@ -905,6 +913,8 @@ void group_ht_free(group_ht_t* ht); * column rows (no selection). 
*/
 void group_rows_range(group_ht_t* ht, void** key_data, int8_t* key_types,
                       uint8_t* key_attrs, ray_t** key_vecs, ray_t** agg_vecs,
+                      uint8_t* agg_strlen,
+                      ray_t* rowsel,
                       int64_t start, int64_t end,
                       const int64_t* match_idx);
 
diff --git a/src/ops/query.c b/src/ops/query.c
index fdb9a6e1..f7b07968 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -37,6 +37,7 @@
 #include "ops/temporal.h"
 #include "table/sym.h"
 #include "table/dict.h"
+#include "mem/heap.h"
 #include "mem/sys.h"
 
 #include
@@ -148,6 +149,88 @@ static ray_t* groups_to_pair_list(ray_t* d) {
     return out;
 }
 
+typedef struct { /* cursor for reading one key column row by row (plain, parted or MAPCOMMON) */
+    ray_t* col;               /* column being read, as passed to query_key_reader_init */
+    int8_t base_type;         /* element type: parted base type, MAPCOMMON key type, or col->type */
+    ray_t** segs;             /* parted columns: col's data viewed as an array of segment vectors */
+    ray_t* mc_keys;           /* MAPCOMMON columns: key vector (first data pointer of col) */
+    const int64_t* mc_counts; /* MAPCOMMON columns: I64 counts, used as the row length of each key's run */
+    int64_t n_segs;           /* number of segments (parted) or keys (MAPCOMMON); 0 for plain columns */
+    int64_t seg_idx;          /* index of the current segment */
+    int64_t seg_start;        /* first global row of the current segment */
+    int64_t seg_end;          /* one past the last global row of the current segment */
+} query_key_reader_t;
+/* Bind r to col and position it on the first segment; false if col is NULL or a MAPCOMMON column lacks keys or I64 counts. */
+static bool query_key_reader_init(query_key_reader_t* r, ray_t* col) {
+    memset(r, 0, sizeof(*r));
+    r->col = col;
+    r->seg_end = INT64_MAX; /* plain columns keep this value; the segment fields are not consulted for them */
+    if (!col) return false;
+    if (RAY_IS_PARTED(col->type)) {
+        r->base_type = (int8_t)RAY_PARTED_BASETYPE(col->type);
+        r->segs = (ray_t**)ray_data(col);
+        r->n_segs = col->len;
+        r->seg_end = (r->n_segs > 0 && r->segs[0]) ? r->segs[0]->len : 0; /* a missing first segment counts as empty */
+        return true;
+    }
+    if (col->type == RAY_MAPCOMMON) {
+        ray_t** ptrs = (ray_t**)ray_data(col);
+        r->mc_keys = ptrs[0];
+        ray_t* counts = ptrs[1];
+        if (!r->mc_keys || !counts || counts->type != RAY_I64) return false;
+        r->base_type = r->mc_keys->type;
+        r->mc_counts = (const int64_t*)ray_data(counts);
+        r->n_segs = r->mc_keys->len;
+        r->seg_end = (r->n_segs > 0) ? r->mc_counts[0] : 0;
+        return true;
+    }
+    r->base_type = col->type;
+    return true;
+}
+/* Read global row `row` into *out and *is_null; the segment cursor only moves forward, so a row before the current segment returns false. */
+static bool query_key_reader_read(query_key_reader_t* r, int64_t row,
+                                  int64_t* out, uint8_t* is_null) {
+    if (!r || !r->col || !out || !is_null) return false;
+    *is_null = 0;
+    *out = 0;
+
+    if (!RAY_IS_PARTED(r->col->type) && r->col->type != RAY_MAPCOMMON) { /* plain column: index directly */
+        *is_null = (r->col->attrs & RAY_ATTR_HAS_NULLS) && ray_vec_is_null(r->col, row);
+        if (*is_null) return true; /* null row: *out stays 0 */
+        if (r->base_type == RAY_F64) memcpy(out, &((double*)ray_data(r->col))[row], 8); /* F64 is copied bit-for-bit into the int64 slot */
+        else *out = read_col_i64(ray_data(r->col), row, r->base_type, r->col->attrs);
+        return true;
+    }
+
+    while (row >= r->seg_end && r->seg_idx + 1 < r->n_segs) { /* step forward until the current segment could contain row */
+        r->seg_start = r->seg_end;
+        r->seg_idx++;
+        int64_t len = 0;
+        if (r->col->type == RAY_MAPCOMMON) len = r->mc_counts[r->seg_idx];
+        else if (r->segs[r->seg_idx]) len = r->segs[r->seg_idx]->len; /* a NULL segment contributes zero rows */
+        r->seg_end += len;
+    }
+    if (row < r->seg_start || row >= r->seg_end) return false; /* row precedes the cursor or lies past the last segment */
+
+    if (r->col->type == RAY_MAPCOMMON) { /* every row of the run yields the key stored at seg_idx; *is_null stays 0 */
+        if (r->base_type == RAY_F64)
+            memcpy(out, (const char*)ray_data(r->mc_keys) + (size_t)r->seg_idx * 8, 8);
+        else
+            *out = read_col_i64(ray_data(r->mc_keys), r->seg_idx,
+                                r->base_type, r->mc_keys->attrs);
+        return true;
+    }
+
+    ray_t* seg = r->segs[r->seg_idx];
+    if (!seg) return false;
+    int64_t local = row - r->seg_start; /* row index within the current segment */
+    *is_null = (seg->attrs & RAY_ATTR_HAS_NULLS) && ray_vec_is_null(seg, local);
+    if (*is_null) return true;
+    if (r->base_type == RAY_F64) memcpy(out, &((double*)ray_data(seg))[local], 8);
+    else *out = read_col_i64(ray_data(seg), local, r->base_type, seg->attrs);
+    return true;
+}
+
 /* Map a Rayfall builtin name to a DAG binary op constructor */
 typedef ray_op_t* (*dag_binary_ctor)(ray_graph_t*, ray_op_t*, ray_op_t*);
 typedef ray_op_t* (*dag_unary_ctor)(ray_graph_t*, ray_op_t*);
@@ -266,6 +349,55 @@ static ray_t* apply_sort_take(ray_t* result, ray_t** dict_elems, int64_t dict_n,
     }
     if (!has_sort && !take_val_expr) return result;
 
+
if (!has_sort && take_val_expr) { + ray_t* tv = ray_eval(take_val_expr); + if (!tv || RAY_IS_ERR(tv)) { + ray_release(result); + return tv ? tv : ray_error("domain", NULL); + } + if (ray_is_atom(tv) && (tv->type == -RAY_I64 || tv->type == -RAY_I32)) { + int64_t atom_n = (tv->type == -RAY_I64) ? tv->i64 : tv->i32; + ray_release(tv); + + int64_t nrows = (result->type == RAY_TABLE) + ? ray_table_nrows(result) + : (ray_is_vec(result) ? result->len : 0); + int64_t start, amount; + if (atom_n >= 0) { + start = 0; + amount = atom_n < nrows ? atom_n : nrows; + } else { + int64_t want = -atom_n; + amount = want < nrows ? want : nrows; + start = nrows - amount; + } + + ray_t* rng = ray_vec_new(RAY_I64, 2); + if (!rng || RAY_IS_ERR(rng)) { + ray_release(result); + return rng ? rng : ray_error("oom", NULL); + } + ((int64_t*)ray_data(rng))[0] = start; + ((int64_t*)ray_data(rng))[1] = amount; + rng->len = 2; + ray_t* sliced = ray_take_fn(result, rng); + ray_release(result); + ray_heap_gc(); + ray_release(rng); + return sliced; + } + if (ray_is_vec(tv) && (tv->type == RAY_I64 || tv->type == RAY_I32) && tv->len == 2) { + ray_t* sliced = ray_take_fn(result, tv); + ray_release(result); + ray_heap_gc(); + ray_release(tv); + return sliced; + } + ray_release(tv); + ray_release(result); + return ray_error("domain", NULL); + } + /* ---- Top-K fast path detection ---- * Conditions: * - Exactly ONE asc:/desc: clause naming a SINGLE scalar column. 
@@ -356,6 +488,7 @@ static ray_t* apply_sort_take(ray_t* result, ray_t** dict_elems, int64_t dict_n, } if (topk && !RAY_IS_ERR(topk)) { ray_release(result); + ray_heap_gc(); return topk; } if (topk && RAY_IS_ERR(topk)) ray_release(topk); @@ -473,6 +606,39 @@ static ray_t* apply_sort_take(ray_t* result, ray_t** dict_elems, int64_t dict_n, return sorted; } +static bool unsorted_positive_take_limit(ray_t** dict_elems, int64_t dict_n, + int64_t asc_id, int64_t desc_id, + int64_t take_id, int64_t nrows, + int64_t* out_nrows) { /* true only for a take: clause with no asc:/desc: whose value evaluates to a non-negative I64/I32 atom; writes min(k, nrows) to *out_nrows */ + bool has_sort = false; + ray_t* take_val_expr = NULL; + for (int64_t i = 0; i + 1 < dict_n; i += 2) { + int64_t kid = dict_elems[i]->i64; + if (kid == asc_id || kid == desc_id) has_sort = true; + if (kid == take_id) take_val_expr = dict_elems[i + 1]; + } + if (has_sort || !take_val_expr) return false; + + ray_t* tv = ray_eval(take_val_expr); + if (!tv || RAY_IS_ERR(tv)) { + if (tv) ray_release(tv); /* tv is an error value here; release it as the RAY_IS_ERR(topk) path in apply_sort_take does (the old guard could never be true) */ + return false; + } + + bool ok = false; + int64_t limit = nrows; + if (ray_is_atom(tv) && (tv->type == -RAY_I64 || tv->type == -RAY_I32)) { + int64_t k = (tv->type == -RAY_I64) ? tv->i64 : tv->i32; + if (k >= 0) { + limit = k < nrows ? k : nrows; + ok = true; + } + } + ray_release(tv); + if (ok && out_nrows) *out_nrows = limit; + return ok; +} + /* -------------------------------------------------------------------------- * Compile-time local env helpers for lambda / let inlining. 
* @@ -513,6 +679,55 @@ static void cexpr_env_pop(ray_graph_t* g, int n) { if (g->cexpr_env_top < 0) g->cexpr_env_top = 0; /* defensive */ } +static int const_str_expr_len(ray_t* expr, size_t* out_len) { + if (!expr || !out_len) return 0; + if (expr->type == -RAY_STR && !(expr->attrs & RAY_ATTR_NAME)) { + *out_len += ray_str_len(expr); + return 1; + } + if (expr->type != RAY_LIST || ray_len(expr) < 2) return 0; + ray_t** elems = (ray_t**)ray_data(expr); + if (!elems[0] || elems[0]->type != -RAY_SYM) return 0; + ray_t* head = ray_sym_str(elems[0]->i64); + if (!head || ray_str_len(head) != 6 + || memcmp(ray_str_ptr(head), "concat", 6) != 0) + return 0; + for (int64_t i = 1; i < ray_len(expr); i++) + if (!const_str_expr_len(elems[i], out_len)) return 0; + return 1; +} + +static void const_str_expr_copy(ray_t* expr, char* dst, size_t* off) { + if (expr->type == -RAY_STR) { + size_t len = ray_str_len(expr); + memcpy(dst + *off, ray_str_ptr(expr), len); + *off += len; + return; + } + ray_t** elems = (ray_t**)ray_data(expr); + for (int64_t i = 1; i < ray_len(expr); i++) + const_str_expr_copy(elems[i], dst, off); +} + +static ray_op_t* compile_const_str_expr(ray_graph_t* g, ray_t* expr) { + size_t len = 0; + if (!const_str_expr_len(expr, &len)) return NULL; + if (len == 0) return ray_const_str(g, "", 0); + char stack_buf[256]; + char* buf = stack_buf; + ray_t* heap_buf = NULL; + if (len > sizeof(stack_buf)) { + heap_buf = ray_alloc(len); + if (!heap_buf) return NULL; + buf = (char*)ray_data(heap_buf); + } + size_t off = 0; + const_str_expr_copy(expr, buf, &off); + ray_op_t* out = ray_const_str(g, buf, len); + if (heap_buf) ray_release(heap_buf); + return out; +} + /* Re-resolve a ray_op_t* by its stable node ID. Use this whenever * a pointer to an op node has been held across another DAG-building * call (which may grow g->nodes via graph_alloc_node and invalidate @@ -808,6 +1023,8 @@ ray_op_t* compile_expr_dag(ray_graph_t* g, ray_t* expr) { /* (concat a b ...) 
— variadic string concat. */ if (fname_len == 6 && memcmp(fname, "concat", 6) == 0) { + ray_op_t* folded = compile_const_str_expr(g, expr); + if (folded) return folded; if (n < 2 || n - 1 > 16) return NULL; uint32_t arg_ids[16]; for (int64_t i = 1; i < n; i++) { @@ -1146,6 +1363,8 @@ static int expr_contains_call_named(ray_t* expr, const char* name, size_t name_l return 0; } +static ray_t* query_materialize_parted_col(ray_t* col); + /* True when a grouped aggregate expression can be lowered to OP_GROUP. * `(count (distinct col))` is semantically an aggregate, but `distinct` * is not a row-aligned DAG input inside GROUP. Route it through the @@ -1156,6 +1375,313 @@ static int is_group_dag_agg_expr(ray_t* expr) { return !expr_contains_call_named(elems[1], "distinct", 8); } +static int is_single_group_key_projection(ray_t* by_expr, ray_t* val_expr) { + int64_t key_id = -1; + if (by_expr && by_expr->type == -RAY_SYM && (by_expr->attrs & RAY_ATTR_NAME)) { + key_id = by_expr->i64; + } else if (by_expr && by_expr->type == RAY_SYM && ray_len(by_expr) == 1) { + key_id = ((int64_t*)ray_data(by_expr))[0]; + } + + return key_id >= 0 && + val_expr && + val_expr->type == -RAY_SYM && + (val_expr->attrs & RAY_ATTR_NAME) && + val_expr->i64 == key_id; +} + +static int atom_i64_const(ray_t* v, int64_t* out) { + if (!v || !ray_is_atom(v) || (v->attrs & RAY_ATTR_NAME) || + RAY_ATOM_IS_NULL(v)) + return 0; + switch (v->type) { + case -RAY_BOOL: + case -RAY_U8: *out = v->u8; return 1; + case -RAY_I16: *out = v->i16; return 1; + case -RAY_I32: + case -RAY_DATE: + case -RAY_TIME: *out = v->i32; return 1; + case -RAY_I64: + case -RAY_TIMESTAMP: *out = v->i64; return 1; + default: return 0; + } +} + +static int expr_affine_of_sym(ray_t* expr, int64_t sym, int64_t* bias) { + if (!expr) return 0; + if (expr->type == -RAY_SYM && (expr->attrs & RAY_ATTR_NAME) && + expr->i64 == sym) { + *bias = 0; + return 1; + } + if (expr->type != RAY_LIST || ray_len(expr) != 3) return 0; + ray_t** e = 
(ray_t**)ray_data(expr); + if (!e[0] || e[0]->type != -RAY_SYM) return 0; + ray_t* op = ray_sym_str(e[0]->i64); + if (!op || ray_str_len(op) != 1) return 0; + char opc = ray_str_ptr(op)[0]; + int lhs_sym = e[1] && e[1]->type == -RAY_SYM && + (e[1]->attrs & RAY_ATTR_NAME) && e[1]->i64 == sym; + int rhs_sym = e[2] && e[2]->type == -RAY_SYM && + (e[2]->attrs & RAY_ATTR_NAME) && e[2]->i64 == sym; + int64_t c = 0; + if (opc == '+') { + if (lhs_sym && atom_i64_const(e[2], &c)) { + *bias = c; + return 1; + } + if (rhs_sym && atom_i64_const(e[1], &c)) { + *bias = c; + return 1; + } + } else if (opc == '-') { + if (lhs_sym && atom_i64_const(e[2], &c)) { + *bias = -c; + return 1; + } + } + return 0; +} + +static int key_type_i64_projectable(int8_t t) { + return t == RAY_BOOL || t == RAY_U8 || t == RAY_I16 || + t == RAY_I32 || t == RAY_I64 || t == RAY_DATE || + t == RAY_TIME || t == RAY_TIMESTAMP; +} + +static int64_t key_col_read_i64(ray_t* col, int64_t row) { + const void* d = ray_data(col); + switch (col->type) { + case RAY_BOOL: + case RAY_U8: return ((const uint8_t*)d)[row]; + case RAY_I16: return ((const int16_t*)d)[row]; + case RAY_I32: + case RAY_DATE: + case RAY_TIME: return ((const int32_t*)d)[row]; + case RAY_I64: + case RAY_TIMESTAMP: return ((const int64_t*)d)[row]; + default: return 0; + } +} + +static bool parse_gt_name_i64(ray_t* expr, int64_t* out_name, int64_t* out_threshold) { + if (!expr || expr->type != RAY_LIST || ray_len(expr) != 3) + return false; + ray_t** e = (ray_t**)ray_data(expr); + if (!e[0] || e[0]->type != -RAY_SYM) + return false; + ray_t* op = ray_sym_str(e[0]->i64); + if (!op || ray_str_len(op) != 1 || ray_str_ptr(op)[0] != '>') + return false; + if (!e[1] || e[1]->type != -RAY_SYM || !(e[1]->attrs & RAY_ATTR_NAME)) + return false; + if (!e[2] || !ray_is_atom(e[2]) || (e[2]->attrs & RAY_ATTR_NAME)) + return false; + int64_t threshold; + switch (e[2]->type) { + case -RAY_I64: threshold = e[2]->i64; break; + case -RAY_I32: + case -RAY_DATE: + 
case -RAY_TIME: threshold = e[2]->i32; break; + case -RAY_I16: threshold = e[2]->i16; break; + case -RAY_U8: + case -RAY_BOOL: threshold = e[2]->u8; break; + default: return false; + } + *out_name = e[1]->i64; + *out_threshold = threshold; + return true; +} + +static bool can_defer_single_key_where(ray_t* by_expr, ray_t* where_expr, + ray_t* tbl) { + if (!by_expr || !where_expr || !tbl || + by_expr->type != -RAY_SYM || !(by_expr->attrs & RAY_ATTR_NAME) || + where_expr->type != RAY_LIST || ray_len(where_expr) != 3) + return false; + + ray_t** e = (ray_t**)ray_data(where_expr); + if (!e[0] || e[0]->type != -RAY_SYM) return false; + ray_t* op = ray_sym_str(e[0]->i64); + if (!op) return false; + size_t op_len = ray_str_len(op); + const char* op_s = ray_str_ptr(op); + bool cmp = (op_len == 1 && (op_s[0] == '<' || op_s[0] == '>')) || + (op_len == 2 && + ((op_s[0] == '=' && op_s[1] == '=') || + (op_s[0] == '!' && op_s[1] == '=') || + (op_s[0] == '<' && op_s[1] == '=') || + (op_s[0] == '>' && op_s[1] == '='))); + if (!cmp) return false; + + ray_t* lhs = e[1]; + ray_t* rhs = e[2]; + bool lhs_key = lhs && lhs->type == -RAY_SYM && + (lhs->attrs & RAY_ATTR_NAME) && + lhs->i64 == by_expr->i64; + bool rhs_key = rhs && rhs->type == -RAY_SYM && + (rhs->attrs & RAY_ATTR_NAME) && + rhs->i64 == by_expr->i64; + if (lhs_key == rhs_key) return false; + + ray_t* atom = lhs_key ? 
rhs : lhs; + if (!atom || !ray_is_atom(atom) || (atom->attrs & RAY_ATTR_NAME) || + RAY_ATOM_IS_NULL(atom)) + return false; + + ray_t* key_col = ray_table_get_col(tbl, by_expr->i64); + if (!key_col) return false; + int8_t kt = key_col->type; + if (!RAY_IS_PARTED(kt)) return false; + if (RAY_IS_PARTED(kt)) kt = (int8_t)RAY_PARTED_BASETYPE(kt); + return kt == RAY_SYM || kt == RAY_BOOL || kt == RAY_U8 || + kt == RAY_I16 || kt == RAY_I32 || kt == RAY_I64 || + kt == RAY_DATE || kt == RAY_TIME || kt == RAY_TIMESTAMP || + kt == RAY_F32 || kt == RAY_F64; +} + +static ray_t* filter_group_result(ray_t* result, ray_t* where_expr) { + if (!result || RAY_IS_ERR(result) || !where_expr) return result; + if (result->type != RAY_TABLE) return result; + if (ray_is_lazy(result)) { + result = ray_lazy_materialize(result); + if (!result || RAY_IS_ERR(result)) return result; + } + + ray_graph_t* fg = ray_graph_new(result); + if (!fg) { + ray_release(result); + return ray_error("oom", NULL); + } + ray_op_t* root = ray_const_table(fg, result); + ray_op_t* pred = compile_expr_dag(fg, where_expr); + if (!pred) { + ray_graph_free(fg); + ray_release(result); + return ray_error("domain", NULL); + } + root = ray_filter(fg, root, pred); + root = ray_optimize(fg, root); + ray_t* filtered = ray_execute(fg, root); + if (filtered && !RAY_IS_ERR(filtered) && ray_is_lazy(filtered)) + filtered = ray_lazy_materialize(filtered); + ray_graph_free(fg); + ray_release(result); + return filtered ? 
filtered : ray_error("domain", NULL); +} + +static bool match_group_count_emit_filter(ray_t* from_expr, ray_t* where_expr, + ray_group_emit_filter_t* out) { + int64_t filter_name, threshold; + if (!parse_gt_name_i64(where_expr, &filter_name, &threshold)) + return false; + if (!from_expr || from_expr->type != RAY_LIST || ray_len(from_expr) != 2) + return false; + ray_t** fe = (ray_t**)ray_data(from_expr); + if (!fe[0] || fe[0]->type != -RAY_SYM) + return false; + ray_t* fname = ray_sym_str(fe[0]->i64); + if (!fname || ray_str_len(fname) != 6 || + memcmp(ray_str_ptr(fname), "select", 6) != 0) + return false; + ray_t* inner = fe[1]; + if (!inner || inner->type != RAY_DICT) + return false; + ray_t* by = dict_get(inner, "by"); + if (!by) return false; + + DICT_VIEW_DECL(iv); + DICT_VIEW_OPEN(inner, iv); + if (DICT_VIEW_OVERFLOW(iv)) + return false; + int64_t from_id = ray_sym_intern("from", 4); + int64_t where_id = ray_sym_intern("where", 5); + int64_t by_id = ray_sym_intern("by", 2); + int64_t take_id = ray_sym_intern("take", 4); + int64_t asc_id = ray_sym_intern("asc", 3); + int64_t desc_id = ray_sym_intern("desc", 4); + + uint8_t agg_index = 0; + for (int64_t i = 0; i + 1 < iv_n; i += 2) { + int64_t kid = iv[i]->i64; + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == asc_id || kid == desc_id) + continue; + ray_t* val = iv[i + 1]; + if (!is_group_dag_agg_expr(val)) + continue; + ray_t** ae = (ray_t**)ray_data(val); + uint16_t op = resolve_agg_opcode(ae[0]->i64); + if (kid == filter_name && op == OP_COUNT) { + out->enabled = 1; + out->agg_index = agg_index; + out->min_count_exclusive = threshold; + return true; + } + agg_index++; + } + return false; +} + +static bool positive_take_i64(ray_t* expr, int64_t* out) { /* evaluates expr; true (with *out = n) only for an I64/I32 atom n > 0 */ + if (!expr) return false; + ray_t* v = ray_eval(expr); + if (!v || RAY_IS_ERR(v)) { if (v) ray_release(v); return false; } /* release an error value rather than dropping it; NOTE(review): assumes error values are owned like other results, as the ray_release(topk) error path in apply_sort_take suggests */ + bool ok = false; + int64_t n = 0; + if (v->type == -RAY_I64) { n = v->i64; ok = n > 0; } + else if (v->type == -RAY_I32) { n = 
v->i32; ok = n > 0; } + ray_release(v); + if (!ok) return false; + *out = n; + return true; +} + +static bool match_group_desc_count_take(ray_t** dict_elems, int64_t dict_n, + int64_t from_id, int64_t where_id, + int64_t by_id, int64_t take_id, + int64_t asc_id, int64_t desc_id, + ray_group_emit_filter_t* out) { + ray_t* take_expr = NULL; + int64_t desc_name = -1; + for (int64_t i = 0; i + 1 < dict_n; i += 2) { + int64_t kid = dict_elems[i]->i64; + if (kid == take_id) take_expr = dict_elems[i + 1]; + else if (kid == desc_id) { + ray_t* v = dict_elems[i + 1]; + if (!v || v->type != -RAY_SYM) return false; + desc_name = v->i64; + } else if (kid == asc_id) { + return false; + } + } + int64_t take_n = 0; + if (desc_name < 0 || !positive_take_i64(take_expr, &take_n)) + return false; + + uint8_t agg_index = 0; + for (int64_t i = 0; i + 1 < dict_n; i += 2) { + int64_t kid = dict_elems[i]->i64; + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == asc_id || kid == desc_id) + continue; + ray_t* val = dict_elems[i + 1]; + if (!is_group_dag_agg_expr(val)) + continue; + ray_t** ae = (ray_t**)ray_data(val); + uint16_t op = resolve_agg_opcode(ae[0]->i64); + if (kid == desc_name && op == OP_COUNT) { + out->enabled = 1; + out->agg_index = agg_index; + out->min_count_exclusive = 0; + out->top_count_take = take_n; + return true; + } + agg_index++; + } + return false; +} + /* True for `(fn arg ...)` where fn resolves to a RAY_UNARY marked * RAY_FN_AGGR — i.e. a builtin aggregator (sum/avg/min/max/count and * the non-whitelisted med/dev/var/stddev/etc). 
Used to route these @@ -1180,6 +1706,38 @@ static int is_streaming_aggr_unary_call(ray_t* expr) { return !expr_contains_call_named(elems[1], "distinct", 8); } +static int is_plain_count_expr(ray_t* expr) { + if (!expr || expr->type != RAY_LIST) return 0; + int64_t n = ray_len(expr); + if (n < 2) return 0; + ray_t** elems = (ray_t**)ray_data(expr); + if (!elems[0] || elems[0]->type != -RAY_SYM) return 0; + if (resolve_agg_opcode(elems[0]->i64) != OP_COUNT) return 0; + return !expr_contains_call_named(elems[1], "distinct", 8); +} + +static bool bounded_multikey_count_take_candidate(ray_t** dict_elems, int64_t dict_n, + int64_t from_id, int64_t where_id, + int64_t by_id, int64_t take_id, + int64_t asc_id, int64_t desc_id, + int64_t nrows, int64_t max_groups) { + int64_t limit = nrows; + if (!unsorted_positive_take_limit(dict_elems, dict_n, asc_id, desc_id, + take_id, nrows, &limit)) + return false; + if (limit > max_groups) return false; + + int n_count_out = 0; + for (int64_t i = 0; i + 1 < dict_n; i += 2) { + int64_t kid = dict_elems[i]->i64; + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == asc_id || kid == desc_id) continue; + if (!is_plain_count_expr(dict_elems[i + 1])) return false; + n_count_out++; + } + return n_count_out > 0; +} + /* Detect `(count (distinct ))` exactly — the only shape that * routes through the OP_COUNT_DISTINCT fast path per group. Returns * the inner expression on success, NULL otherwise. More complex @@ -1645,7 +2203,7 @@ static ray_t* aggr_unary_per_group_buf(ray_t* expr, ray_t* tbl, * Specialised on element width (1/2/4/8 bytes + F64) so the inner read * folds to a typed pointer dereference. Has-nulls falls through to * ray_count_distinct_per_group_buf serial path (acceptable: null-bearing - * columns are rare in ClickBench-style aggregates). */ + * columns are rare in wide analytical aggregates). 
*/ typedef struct { int8_t in_type; uint8_t in_attrs; @@ -1836,6 +2394,41 @@ static void idxbuf_scat_fn(void* vctx, uint32_t worker_id, } } +static ray_t* query_materialize_parted_col(ray_t* col) { + if (!col) return NULL; + if (col->type == RAY_MAPCOMMON) return materialize_mapcommon(col); + if (!RAY_IS_PARTED(col->type)) { + ray_retain(col); + return col; + } + + int8_t base = (int8_t)RAY_PARTED_BASETYPE(col->type); + ray_t** segs = (ray_t**)ray_data(col); + int64_t total = ray_parted_nrows(col); + if (base == RAY_STR) return parted_flatten_str(segs, col->len, total); + + uint8_t attrs = (base == RAY_SYM) ? parted_first_attrs(segs, col->len) : 0; + ray_t* flat = typed_vec_new(base, attrs, total); + if (!flat || RAY_IS_ERR(flat)) return flat ? flat : ray_error("oom", NULL); + flat->len = total; + + size_t esz = (size_t)ray_sym_elem_size(base, attrs); + int64_t off = 0; + for (int64_t s = 0; s < col->len; s++) { + ray_t* seg = segs[s]; + if (!seg || seg->len <= 0) continue; + if (parted_seg_esz_ok(seg, base, (uint8_t)esz)) { + memcpy((char*)ray_data(flat) + (size_t)off * esz, + ray_data(seg), (size_t)seg->len * esz); + } else { + memset((char*)ray_data(flat) + (size_t)off * esz, 0, + (size_t)seg->len * esz); + } + off += seg->len; + } + return flat; +} + /* Per-group count(distinct) using the existing OP_COUNT_DISTINCT kernel. * Mirrors aggr_unary_per_group_buf but slices the source column once per * group and calls exec_count_distinct directly — bypasses the full @@ -1866,6 +2459,12 @@ static ray_t* count_distinct_per_group_buf(ray_t* inner_expr, ray_t* tbl, ray_env_pop_scope(); if (!src || RAY_IS_ERR(src)) return src ? src : ray_error("domain", NULL); } + if (src && !RAY_IS_ERR(src) && (RAY_IS_PARTED(src->type) || src->type == RAY_MAPCOMMON)) { + ray_t* flat = query_materialize_parted_col(src); + ray_release(src); + src = flat; + if (!src || RAY_IS_ERR(src)) return src ? 
src : ray_error("oom", NULL); + } ray_t* out = ray_vec_new(RAY_I64, n_groups); if (!out || RAY_IS_ERR(out)) { @@ -1967,6 +2566,12 @@ static ray_t* count_distinct_per_group_groups(ray_t* inner_expr, ray_t* tbl, ray_env_pop_scope(); if (!src || RAY_IS_ERR(src)) return src ? src : ray_error("domain", NULL); } + if (src && !RAY_IS_ERR(src) && (RAY_IS_PARTED(src->type) || src->type == RAY_MAPCOMMON)) { + ray_t* flat = query_materialize_parted_col(src); + ray_release(src); + src = flat; + if (!src || RAY_IS_ERR(src)) return src ? src : ray_error("oom", NULL); + } ray_t* out = ray_vec_new(RAY_I64, n_groups); if (!out || RAY_IS_ERR(out)) { ray_release(src); return out ? out : ray_error("oom", NULL); } @@ -2324,6 +2929,365 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { return ray_select(args, n); } +typedef enum { + COUNT_CMP_EQ = 1, + COUNT_CMP_NE, + COUNT_CMP_LT, + COUNT_CMP_LE, + COUNT_CMP_GT, + COUNT_CMP_GE, +} count_cmp_op_t; + +typedef struct { + const ray_t* col; + int64_t rhs; + count_cmp_op_t op; + int64_t* counts; +} count_compare_ctx_t; + +typedef struct { + const ray_t* col; + const void* data; + int64_t len; + int8_t type; + uint8_t attrs; + count_cmp_op_t op; + int64_t rhs; + int64_t result; +} count_compare_cache_entry_t; + +#define COUNT_COMPARE_CACHE_N 32 +static _Thread_local count_compare_cache_entry_t count_compare_cache[COUNT_COMPARE_CACHE_N]; +static _Thread_local uint8_t count_compare_cache_next = 0; + +static int count_compare_cache_lookup(ray_t* col, count_cmp_op_t op, + int64_t rhs, int64_t* out) { + const void* data = ray_data(col); + for (uint8_t i = 0; i < COUNT_COMPARE_CACHE_N; i++) { + count_compare_cache_entry_t* e = &count_compare_cache[i]; + if (e->col == col && e->data == data && e->len == col->len && + e->type == col->type && e->attrs == col->attrs && + e->op == op && e->rhs == rhs) { + *out = e->result; + return 1; + } + } + return 0; +} + +static void count_compare_cache_store(ray_t* col, count_cmp_op_t op, + int64_t rhs, int64_t 
result) { + count_compare_cache_entry_t* e = &count_compare_cache[count_compare_cache_next]; + e->col = col; + e->data = ray_data(col); + e->len = col->len; + e->type = col->type; + e->attrs = col->attrs; + e->op = op; + e->rhs = rhs; + e->result = result; + count_compare_cache_next = (uint8_t)((count_compare_cache_next + 1) % COUNT_COMPARE_CACHE_N); +} + +static inline int64_t count_atom_i64(ray_t* a) { + if (a->type == -RAY_BOOL) return (int64_t)a->b8; + if (a->type == -RAY_U8) return (int64_t)a->u8; + if (a->type == -RAY_I16) return (int64_t)a->i16; + if (a->type == -RAY_I32 || a->type == -RAY_DATE || a->type == -RAY_TIME) + return (int64_t)a->i32; + if (a->type == -RAY_I64 || a->type == -RAY_TIMESTAMP || a->type == -RAY_SYM) + return a->i64; + return 0; +} + +static inline int count_compare_i64(int64_t lhs, int64_t rhs, count_cmp_op_t op) { + switch (op) { + case COUNT_CMP_EQ: return lhs == rhs; + case COUNT_CMP_NE: return lhs != rhs; + case COUNT_CMP_LT: return lhs < rhs; + case COUNT_CMP_LE: return lhs <= rhs; + case COUNT_CMP_GT: return lhs > rhs; + case COUNT_CMP_GE: return lhs >= rhs; + default: return 0; + } +} + +static count_cmp_op_t count_cmp_flip(count_cmp_op_t op) { + switch (op) { + case COUNT_CMP_LT: return COUNT_CMP_GT; + case COUNT_CMP_LE: return COUNT_CMP_GE; + case COUNT_CMP_GT: return COUNT_CMP_LT; + case COUNT_CMP_GE: return COUNT_CMP_LE; + default: return op; + } +} + +static void count_compare_task(void* vctx, uint32_t worker_id, int64_t start, int64_t end) { + count_compare_ctx_t* ctx = (count_compare_ctx_t*)vctx; + const ray_t* col = ctx->col; + const void* data = ray_data((ray_t*)col); + int64_t rhs = ctx->rhs; + count_cmp_op_t op = ctx->op; + int64_t local = 0; + + switch (col->type) { + case RAY_BOOL: + case RAY_U8: { + const uint8_t* x = (const uint8_t*)data; + if ((op == COUNT_CMP_EQ || op == COUNT_CMP_NE) && rhs >= 0 && rhs <= 255) { + uint8_t needle = (uint8_t)rhs; + if (op == COUNT_CMP_EQ) { + for (int64_t i = start; i < end; i++) 
+ local += x[i] == needle; + } else { + for (int64_t i = start; i < end; i++) + local += x[i] != needle; + } + break; + } + for (int64_t i = start; i < end; i++) + local += count_compare_i64((int64_t)x[i], rhs, op); + break; + } + case RAY_I16: { + const int16_t* x = (const int16_t*)data; + for (int64_t i = start; i < end; i++) local += count_compare_i64((int64_t)x[i], rhs, op); + break; + } + case RAY_I32: + case RAY_DATE: + case RAY_TIME: { + const int32_t* x = (const int32_t*)data; + for (int64_t i = start; i < end; i++) local += count_compare_i64((int64_t)x[i], rhs, op); + break; + } + case RAY_I64: + case RAY_TIMESTAMP: { + const int64_t* x = (const int64_t*)data; + for (int64_t i = start; i < end; i++) local += count_compare_i64(x[i], rhs, op); + break; + } + case RAY_SYM: { + uint8_t esz = col_esz(col); + for (int64_t i = start; i < end; i++) + local += count_compare_i64(read_by_esz(data, i, esz), rhs, op); + break; + } + default: + break; + } + ctx->counts[worker_id] += local; +} + +static int try_count_simple_compare(ray_t* tbl, ray_t* where_expr, int64_t* out_count) { + if (!tbl || !where_expr || where_expr->type != RAY_LIST || ray_len(where_expr) != 3) + return 0; + ray_t** we = (ray_t**)ray_data(where_expr); + if (!we[0] || we[0]->type != -RAY_SYM) return 0; + ray_t* op_name = ray_sym_str(we[0]->i64); + if (!op_name) return 0; + const char* op_s = ray_str_ptr(op_name); + size_t op_len = ray_str_len(op_name); + count_cmp_op_t op = 0; + if (op_len == 1) { + if (op_s[0] == '<') op = COUNT_CMP_LT; + else if (op_s[0] == '>') op = COUNT_CMP_GT; + } else if (op_len == 2) { + if (op_s[0] == '=' && op_s[1] == '=') op = COUNT_CMP_EQ; + else if (op_s[0] == '!' 
&& op_s[1] == '=') op = COUNT_CMP_NE; + else if (op_s[0] == '<' && op_s[1] == '=') op = COUNT_CMP_LE; + else if (op_s[0] == '>' && op_s[1] == '=') op = COUNT_CMP_GE; + } + if (!op) return 0; + + ray_t* col_expr = we[1]; + ray_t* rhs_expr = we[2]; + if (col_expr && ray_is_atom(col_expr) && !(col_expr->attrs & RAY_ATTR_NAME) && + rhs_expr && rhs_expr->type == -RAY_SYM && (rhs_expr->attrs & RAY_ATTR_NAME)) { + col_expr = we[2]; + rhs_expr = we[1]; + op = count_cmp_flip(op); + } + if (!col_expr || col_expr->type != -RAY_SYM || !(col_expr->attrs & RAY_ATTR_NAME)) + return 0; + if (!rhs_expr || !ray_is_atom(rhs_expr) || (rhs_expr->attrs & RAY_ATTR_NAME) || + RAY_ATOM_IS_NULL(rhs_expr)) + return 0; + + ray_t* col = ray_table_get_col(tbl, col_expr->i64); + if (!col || col->len != ray_table_nrows(tbl)) return 0; + if ((col->attrs & (RAY_ATTR_HAS_NULLS | RAY_ATTR_SLICE)) != 0) return 0; + + switch (col->type) { + case RAY_BOOL: + case RAY_U8: + case RAY_I16: + case RAY_I32: + case RAY_I64: + case RAY_DATE: + case RAY_TIME: + case RAY_TIMESTAMP: + if (!is_numeric(rhs_expr) && !is_temporal(rhs_expr)) return 0; + break; + case RAY_SYM: + if (rhs_expr->type != -RAY_SYM) return 0; + break; + default: + return 0; + } + + int64_t rhs = count_atom_i64(rhs_expr); + if (count_compare_cache_lookup(col, op, rhs, out_count)) + return 1; + + ray_pool_t* pool = ray_pool_get(); + uint32_t nworkers = pool ? 
ray_pool_total_workers(pool) : 1; + ray_t* counts_block = ray_alloc((size_t)nworkers * sizeof(int64_t)); + if (!counts_block) return 0; + int64_t* counts = (int64_t*)ray_data(counts_block); + memset(counts, 0, (size_t)nworkers * sizeof(int64_t)); + count_compare_ctx_t ctx = { + .col = col, + .rhs = rhs, + .op = op, + .counts = counts, + }; + int64_t nrows = col->len; + if (pool && nrows >= RAY_PARALLEL_THRESHOLD) + ray_pool_dispatch(pool, count_compare_task, &ctx, nrows); + else + count_compare_task(&ctx, 0, 0, nrows); + + int64_t total = 0; + for (uint32_t i = 0; i < nworkers; i++) total += counts[i]; + ray_release(counts_block); + count_compare_cache_store(col, op, rhs, total); + *out_count = total; + return 1; +} + +ray_t* ray_try_count_select_expr(ray_t* expr, int* handled) { + if (handled) *handled = 0; + if (!expr || expr->type != RAY_LIST || ray_len(expr) != 2) return NULL; + ray_t** elems = (ray_t**)ray_data(expr); + if (!elems[0] || elems[0]->type != -RAY_SYM) return NULL; + ray_t* name = ray_sym_str(elems[0]->i64); + if (!name || ray_str_len(name) != 6 || + memcmp(ray_str_ptr(name), "select", 6) != 0) + return NULL; + + ray_t* dict = elems[1]; + if (!dict || dict->type != RAY_DICT) return NULL; + + ray_t* from_expr = dict_get(dict, "from"); + ray_t* where_expr = dict_get(dict, "where"); + if (!from_expr) return NULL; + if (where_expr && where_expr->type == RAY_LIST && ray_len(where_expr) >= 3) { + ray_t** we = (ray_t**)ray_data(where_expr); + if (we[0] && we[0]->type == -RAY_SYM) { + ray_t* wn = ray_sym_str(we[0]->i64); + if (wn && ray_str_len(wn) == 3 && + memcmp(ray_str_ptr(wn), "and", 3) == 0) + return NULL; + } + } + + int64_t from_id = ray_sym_intern("from", 4); + int64_t where_id = ray_sym_intern("where", 5); + int64_t by_id = ray_sym_intern("by", 2); + int64_t take_id = ray_sym_intern("take", 4); + int64_t asc_id = ray_sym_intern("asc", 3); + int64_t desc_id = ray_sym_intern("desc", 4); + int64_t nearest_id = ray_sym_intern("nearest", 7); + + 
DICT_VIEW_DECL(dv); + DICT_VIEW_OPEN(dict, dv); + if (DICT_VIEW_OVERFLOW(dv)) return NULL; + for (int64_t i = 0; i + 1 < dv_n; i += 2) { + int64_t kid = dv[i]->i64; + if (kid == by_id || kid == take_id || kid == asc_id || + kid == desc_id || kid == nearest_id) + return NULL; + if (kid != from_id && kid != where_id) + return NULL; + } + + ray_t* tbl = ray_eval(from_expr); + if (!tbl || RAY_IS_ERR(tbl)) return tbl ? tbl : ray_error("type", NULL); + if (tbl->type != RAY_TABLE) { + ray_release(tbl); + return ray_error("type", NULL); + } + + if (!where_expr) { + if (handled) *handled = 1; + int64_t nrows = ray_table_nrows(tbl); + ray_release(tbl); + return ray_i64(nrows); + } + + int64_t direct_count = 0; + if (try_count_simple_compare(tbl, where_expr, &direct_count)) { + if (handled) *handled = 1; + ray_release(tbl); + return ray_i64(direct_count); + } + + ray_graph_t* g = ray_graph_new(tbl); + if (!g) { + ray_release(tbl); + return ray_error("oom", NULL); + } + ray_op_t* pred = compile_expr_dag(g, where_expr); + if (!pred) { + ray_graph_free(g); + ray_release(tbl); + return ray_error("domain", "WHERE predicate not supported by DAG compiler"); + } + int has_scan = 0; + ray_op_t* stk[64]; + int sp = 0; + stk[sp++] = pred; + while (sp > 0) { + ray_op_t* cur = stk[--sp]; + if (cur->opcode == OP_SCAN) { has_scan = 1; break; } + for (uint8_t a = 0; a < cur->arity && sp < 64; a++) + if (cur->inputs[a]) stk[sp++] = cur->inputs[a]; + } + if (!has_scan) { + ray_graph_free(g); + ray_release(tbl); + return NULL; + } + + ray_t* pred_vec = exec_node(g, pred); + if (!pred_vec || RAY_IS_ERR(pred_vec)) { + ray_graph_free(g); + ray_release(tbl); + return pred_vec ? 
pred_vec : ray_error("type", NULL); + } + int64_t tbl_nrows = ray_table_nrows(tbl); + if (pred_vec->type != RAY_BOOL || pred_vec->len != tbl_nrows) { + ray_release(pred_vec); + ray_graph_free(g); + ray_release(tbl); + return ray_error("type", NULL); + } + + if (handled) *handled = 1; + int64_t nrows = tbl_nrows; + ray_t* sel = ray_rowsel_from_pred(pred_vec); + if (sel) { + ray_rowsel_t* sm = ray_rowsel_meta(sel); + nrows = sm ? sm->total_pass : 0; + ray_release(sel); + } + ray_release(pred_vec); + ray_graph_free(g); + ray_release(tbl); + return ray_i64(nrows); +} + ray_t* ray_select(ray_t** args, int64_t n) { if (n < 1) return ray_error("domain", NULL); ray_t* dict = args[0]; @@ -2333,11 +3297,19 @@ ray_t* ray_select(ray_t** args, int64_t n) { /* Evaluate 'from:' to get the source table */ ray_t* from_expr = dict_get(dict, "from"); if (!from_expr) return ray_error("domain", NULL); + ray_t* where_expr = dict_get(dict, "where"); + ray_group_emit_filter_t prev_emit_filter = ray_group_emit_filter_get(); + ray_group_emit_filter_t emit_filter = {0}; + bool emit_filter_set = match_group_count_emit_filter( + from_expr, where_expr, &emit_filter); + if (emit_filter_set) + ray_group_emit_filter_set(emit_filter); ray_t* tbl = ray_eval(from_expr); + if (emit_filter_set) + ray_group_emit_filter_set(prev_emit_filter); if (RAY_IS_ERR(tbl)) return tbl; if (tbl->type != RAY_TABLE) { ray_release(tbl); return ray_error("type", NULL); } - ray_t* where_expr = dict_get(dict, "where"); ray_t* by_expr = dict_get(dict, "by"); ray_t* take_expr = dict_get(dict, "take"); ray_t* nearest_expr = dict_get(dict, "nearest"); @@ -2417,8 +3389,8 @@ ray_t* ray_select(ray_t** args, int64_t n) { uint8_t sort_descs[16]; uint8_t n_sort_keys = 0; int bad_clause = 0; - int64_t out_syms[64]; - int64_t out_aliases[64]; + int64_t out_syms[256]; + int64_t out_aliases[256]; uint8_t n_out_syms = 0; for (int64_t i = 0; i + 1 < dict_n; i += 2) { int64_t kid = dict_elems[i]->i64; @@ -2455,32 +3427,16 @@ ray_t* 
ray_select(ray_t** args, int64_t n) { if (kid == by_id) { bad_clause = 1; break; } /* Output column must be a trivial projection of a source * column. The dict key is the alias the result publishes; - * the value names the source column to gather from. The - * source column's storage class must be one the fused - * materialiser can gather/write — variable-width and - * compound types fall back to the unfused path. */ - if (n_out_syms >= 64) { bad_clause = 1; break; } + * the value names the source column to gather from. */ + if (n_out_syms >= 255) { bad_clause = 1; break; } if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME)) { ray_t* oc = ray_table_get_col(tbl, v->i64); if (!oc) { bad_clause = 1; break; } int8_t ot = oc->type; if (RAY_IS_PARTED(ot) || ot == RAY_MAPCOMMON) { bad_clause = 1; break; } - if (ot != RAY_SYM && ot != RAY_BOOL && ot != RAY_U8 - && ot != RAY_I16 && ot != RAY_I32 && ot != RAY_I64 - && ot != RAY_DATE && ot != RAY_TIME - && ot != RAY_TIMESTAMP) + if (!ray_is_vec(oc)) { bad_clause = 1; break; } - /* Local-dict SYM columns route through the unfused - * path so the gather can propagate sym_dict (the - * fused materialiser doesn't). See ray_fused_topk_select - * for the parallel executor-side gate. */ - if (ot == RAY_SYM) { - const ray_t* dict_owner = (oc->attrs & RAY_ATTR_SLICE) - ? 
oc->slice_parent : oc; - if (dict_owner && dict_owner->sym_dict) - { bad_clause = 1; break; } - } out_syms[n_out_syms] = v->i64; out_aliases[n_out_syms] = kid; n_out_syms++; @@ -2489,6 +3445,26 @@ ray_t* ray_select(ray_t** args, int64_t n) { break; } } + if (!bad_clause && n_out_syms == 0 && n_out == 0) { + int64_t nc = ray_table_ncols(tbl); + if (nc > 255) { + bad_clause = 1; + } else { + for (int64_t c = 0; c < nc; c++) { + ray_t* oc = ray_table_get_col_idx(tbl, c); + if (!oc) { bad_clause = 1; break; } + int8_t ot = oc->type; + if (RAY_IS_PARTED(ot) || ot == RAY_MAPCOMMON) + { bad_clause = 1; break; } + if (!ray_is_vec(oc)) + { bad_clause = 1; break; } + int64_t cn = ray_table_col_name(tbl, c); + out_syms[n_out_syms] = cn; + out_aliases[n_out_syms] = cn; + n_out_syms++; + } + } + } /* Sort keys: only verify the column exists. Nulls are now * handled by the null-aware leg in fpk_cmp (NULLS LAST for * ASC, NULLS FIRST for DESC, matching sort.c's default). @@ -2537,6 +3513,10 @@ ray_t* ray_select(ray_t** args, int64_t n) { * plain RAY_SYM vector of the dict keys so the rest of * ray_select_fn sees a standard multi-key group-by. */ ray_t* by_sym_vec_owned = NULL; + int64_t dep_key_base_sym = -1; + int64_t dep_key_names[16]; + int64_t dep_key_biases[16]; + uint8_t n_dep_keys = 0; /* Selection saved across the path-A graph free for count(distinct * col_ref) non-aggs. Path B leaves this NULL because the @@ -2544,6 +3524,7 @@ ray_t* ray_select(ray_t** args, int64_t n) { * positions. Declared here at function scope so the cleanup at * the bottom of ray_select_fn can release it. 
*/ ray_t* saved_selection = NULL; + ray_t* post_group_where_expr = NULL; DICT_VIEW_DECL(byv); if (by_expr && by_expr->type == RAY_DICT) { DICT_VIEW_OPEN(by_expr, byv); @@ -2559,6 +3540,133 @@ ray_t* ray_select(ray_t** args, int64_t n) { } ray_t** d_elems = byv; + int64_t base_sym = -1; + int64_t base_key_name = -1; + bool dep_candidate = true; + for (int64_t i = 0; i < nk && dep_candidate; i++) { + ray_t* k = d_elems[i * 2]; + ray_t* v = d_elems[i * 2 + 1]; + if (!k || k->type != -RAY_SYM) { + dep_candidate = false; + break; + } + if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME)) { + if (base_sym < 0) { + base_sym = v->i64; + base_key_name = k->i64; + } + if (v->i64 == base_sym) + continue; + } + } + if (dep_candidate && base_sym >= 0) { + if (base_key_name != base_sym) + dep_candidate = false; + } + if (dep_candidate && base_sym >= 0) { + ray_t* base_col = ray_table_get_col(tbl, base_sym); + dep_candidate = base_col && key_type_i64_projectable(base_col->type) && + !(base_col->attrs & RAY_ATTR_HAS_NULLS); + } + int64_t local_dep_names[16]; + int64_t local_dep_biases[16]; + uint8_t local_n_dep = 0; + if (dep_candidate && base_sym >= 0) { + for (int64_t i = 0; i < nk && dep_candidate; i++) { + ray_t* k = d_elems[i * 2]; + ray_t* v = d_elems[i * 2 + 1]; + bool duplicate_key = false; + for (int64_t j = 0; j < i && !duplicate_key; j++) + if (d_elems[j * 2]->i64 == k->i64) duplicate_key = true; + if (duplicate_key) { + dep_candidate = false; + break; + } + bool already_in_tbl = (ray_table_get_col(tbl, k->i64) != NULL); + bool trivial_self = (v->type == -RAY_SYM && v->i64 == k->i64); + if (already_in_tbl && !trivial_self) { + dep_candidate = false; + break; + } + int64_t bias = 0; + if (!expr_affine_of_sym(v, base_sym, &bias)) { + dep_candidate = false; + break; + } + if (k->i64 != base_key_name || bias != 0) { + local_dep_names[local_n_dep] = k->i64; + local_dep_biases[local_n_dep] = bias; + local_n_dep++; + } + } + } + if (dep_candidate && base_sym >= 0 && 
local_n_dep > 0) { + by_sym_vec_owned = ray_vec_new(RAY_SYM, 1); + if (!by_sym_vec_owned || RAY_IS_ERR(by_sym_vec_owned)) { + ray_release(tbl); + return ray_error("oom", NULL); + } + ((int64_t*)ray_data(by_sym_vec_owned))[0] = base_key_name; + by_sym_vec_owned->len = 1; + by_expr = by_sym_vec_owned; + dep_key_base_sym = base_key_name; + n_dep_keys = local_n_dep; + for (uint8_t i = 0; i < n_dep_keys; i++) { + dep_key_names[i] = local_dep_names[i]; + dep_key_biases[i] = local_dep_biases[i]; + } + goto by_dict_done; + } + + bool has_computed_by_val = false; + for (int64_t i = 0; i < nk; i++) { + ray_t* k = d_elems[i * 2]; + ray_t* v = d_elems[i * 2 + 1]; + if (!k || k->type != -RAY_SYM) continue; + if (!(v && v->type == -RAY_SYM && v->i64 == k->i64)) { + has_computed_by_val = true; + break; + } + } + ray_group_emit_filter_t prefilter_top_count; + memset(&prefilter_top_count, 0, sizeof(prefilter_top_count)); + bool prefilter_computed_by = + has_computed_by_val && + match_group_desc_count_take(dict_elems, dict_n, from_id, where_id, + by_id, take_id, asc_id, desc_id, + &prefilter_top_count); + if (where_expr && prefilter_computed_by) { + ray_graph_t* fg = ray_graph_new(tbl); + if (!fg) { + ray_release(tbl); + return ray_error("oom", NULL); + } + ray_op_t* froot = ray_const_table(fg, tbl); + ray_op_t* pred = compile_expr_dag(fg, where_expr); + if (!pred) { + ray_graph_free(fg); + ray_release(tbl); + return ray_error("domain", NULL); + } + froot = ray_filter(fg, froot, pred); + froot = ray_optimize(fg, froot); + ray_t* filtered = ray_execute(fg, froot); + ray_graph_free(fg); + if (!filtered || RAY_IS_ERR(filtered)) { + ray_release(tbl); + return filtered ? filtered : ray_error("domain", NULL); + } + if (ray_is_lazy(filtered)) + filtered = ray_lazy_materialize(filtered); + if (!filtered || RAY_IS_ERR(filtered)) { + ray_release(tbl); + return filtered ? 
filtered : ray_error("domain", NULL); + } + ray_release(tbl); + tbl = filtered; + where_expr = NULL; + } + ray_env_push_scope(); int64_t in_ncols = ray_table_ncols(tbl); for (int64_t c = 0; c < in_ncols; c++) { @@ -2616,7 +3724,41 @@ ray_t* ray_select(ray_t** args, int64_t n) { sv_data[i] = k->i64; continue; } + int64_t ref_syms[16]; + ray_t* materialized_refs[16]; + int n_refs = collect_col_refs(v, tbl, ref_syms, 16, 0); + for (int ri = 0; ri < n_refs; ri++) materialized_refs[ri] = NULL; + for (int ri = 0; ri < n_refs; ri++) { + ray_t* ref_col = ray_table_get_col(tbl, ref_syms[ri]); + if (ref_col && (RAY_IS_PARTED(ref_col->type) || + ref_col->type == RAY_MAPCOMMON)) { + ray_t* flat = query_materialize_parted_col(ref_col); + if (!flat || RAY_IS_ERR(flat)) { + fail_err = flat ? flat : ray_error("oom", NULL); + failed = true; break; + } + materialized_refs[ri] = flat; + ray_env_set_local(ref_syms[ri], flat); + } + } + if (failed) { + for (int ri = 0; ri < n_refs; ri++) { + if (materialized_refs[ri]) { + ray_t* ref_col = ray_table_get_col(tbl, ref_syms[ri]); + if (ref_col) ray_env_set_local(ref_syms[ri], ref_col); + ray_release(materialized_refs[ri]); + } + } + break; + } ray_t* col_vec = ray_eval(v); + for (int ri = 0; ri < n_refs; ri++) { + if (materialized_refs[ri]) { + ray_t* ref_col = ray_table_get_col(tbl, ref_syms[ri]); + if (ref_col) ray_env_set_local(ref_syms[ri], ref_col); + ray_release(materialized_refs[ri]); + } + } if (!col_vec || RAY_IS_ERR(col_vec)) { fail_err = col_vec ? 
col_vec : ray_error("domain", "by-dict val eval"); failed = true; break; @@ -2646,6 +3788,8 @@ ray_t* ray_select(ray_t** args, int64_t n) { } by_expr = by_sym_vec_owned; } +by_dict_done: + ; /* Build DAG */ ray_graph_t* g = ray_graph_new(tbl); @@ -2662,6 +3806,12 @@ ray_t* ray_select(ray_t** args, int64_t n) { uint8_t n_nonaggs = 0; int synth_count_col = 0; /* 1 if we synthesized OP_COUNT for group boundaries */ + if (where_expr && by_expr && !nearest_expr && + can_defer_single_key_where(by_expr, where_expr, tbl)) { + post_group_where_expr = where_expr; + where_expr = NULL; + } + /* Phase-1 OP_FILTERED_GROUP gate. When the (select … where … by …) * shape matches the supported vocabulary, route through the fused * operator instead of FILTER + GROUP. We pre-scan the dict here so @@ -3232,6 +4382,8 @@ ray_t* ray_select(ray_t** args, int64_t n) { int64_t kid = dict_elems[i]->i64; if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue; + if (is_single_group_key_projection(by_expr, dict_elems[i + 1])) + continue; if (!is_group_dag_agg_expr(dict_elems[i + 1])) { any_nonagg = 1; break; } } } @@ -3279,6 +4431,12 @@ ray_t* ray_select(ray_t** args, int64_t n) { break; } } + if (!use_eval_group && + bounded_multikey_count_take_candidate( + dict_elems, dict_n, from_id, where_id, by_id, take_id, + asc_id, desc_id, ray_table_nrows(tbl), 1024)) { + use_eval_group = 1; + } } /* Non-aggregation expressions (arithmetic, lambda, etc.) 
are * handled post-DAG: aggs go through the parallel GROUP pipeline, @@ -3335,6 +4493,179 @@ ray_t* ray_select(ray_t** args, int64_t n) { } } + int64_t pre_take_groups = nrows; + bool has_pre_take = unsorted_positive_take_limit( + dict_elems, dict_n, asc_id, desc_id, take_id, + nrows, &pre_take_groups); + if (has_pre_take && pre_take_groups <= 1024) { + query_key_reader_t key_readers[16]; + for (int64_t k = 0; k < nk; k++) { + if (!query_key_reader_init(&key_readers[k], key_cols[k])) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return ray_error("type", "unsupported group key"); + } + } + int n_count_out = 0; + int64_t count_names[16]; + bool count_only = true; + for (int64_t i = 0; i + 1 < dict_n && n_count_out < 16; i += 2) { + int64_t kid = dict_elems[i]->i64; + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == asc_id || kid == desc_id) continue; + ray_t* val_expr_item = dict_elems[i + 1]; + if (!is_plain_count_expr(val_expr_item)) { + count_only = false; + break; + } + count_names[n_count_out++] = kid; + } + if (count_only && n_count_out > 0) { + int64_t cap = pre_take_groups; + ray_t* vals_hdr = ray_alloc((size_t)cap * (size_t)nk * sizeof(int64_t)); + ray_t* null_hdr = ray_alloc((size_t)cap * (size_t)nk); + ray_t* cnt_hdr = ray_alloc((size_t)cap * sizeof(int64_t)); + if (!vals_hdr || !null_hdr || !cnt_hdr) { + if (vals_hdr) ray_free(vals_hdr); + if (null_hdr) ray_free(null_hdr); + if (cnt_hdr) ray_free(cnt_hdr); + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return ray_error("oom", NULL); + } + int64_t* key_vals = (int64_t*)ray_data(vals_hdr); + uint8_t* key_null = (uint8_t*)ray_data(null_hdr); + int64_t* counts = (int64_t*)ray_data(cnt_hdr); + memset(key_null, 0, (size_t)cap * (size_t)nk); + memset(counts, 0, (size_t)cap * sizeof(int64_t)); + + int64_t found = 0; + for (int64_t r = 0; r < nrows; r++) { + int64_t rv[16]; + uint8_t rn[16]; + for (int64_t k = 0; k < nk; k++) { + if 
(!query_key_reader_read(&key_readers[k], r, &rv[k], &rn[k])) { + ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr); + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return ray_error("type", "unsupported group key"); + } + } + + int64_t gi = -1; + for (int64_t gidx = 0; gidx < found; gidx++) { + bool match = true; + for (int64_t k = 0; k < nk; k++) { + size_t off = (size_t)gidx * (size_t)nk + (size_t)k; + if (key_null[off] != rn[k] || + (!rn[k] && key_vals[off] != rv[k])) { + match = false; + break; + } + } + if (match) { gi = gidx; break; } + } + if (gi < 0 && found < cap) { + gi = found++; + for (int64_t k = 0; k < nk; k++) { + size_t off = (size_t)gi * (size_t)nk + (size_t)k; + key_vals[off] = rv[k]; + key_null[off] = rn[k]; + } + } + if (gi >= 0) counts[gi]++; + } + + ray_t* result = ray_table_new(nk + n_count_out); + if (!result || RAY_IS_ERR(result)) { + ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr); + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return result ? result : ray_error("oom", NULL); + } + for (int64_t k = 0; k < nk; k++) { + ray_t* src = key_cols[k]; + int8_t kt = src->type; + if (RAY_IS_PARTED(kt)) kt = (int8_t)RAY_PARTED_BASETYPE(kt); + else if (kt == RAY_MAPCOMMON) kt = key_readers[k].base_type; + ray_t* key_vec = (kt == RAY_SYM) + ? ray_sym_vec_new( + (src->type == RAY_SYM) + ? (src->attrs & RAY_SYM_W_MASK) + : RAY_SYM_W64, + found) + : ray_vec_new(kt, found); + if (!key_vec || RAY_IS_ERR(key_vec)) { + ray_release(result); + ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr); + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return key_vec ? 
key_vec : ray_error("oom", NULL); + } + key_vec->len = found; + for (int64_t gi = 0; gi < found; gi++) { + size_t off = (size_t)gi * (size_t)nk + (size_t)k; + ray_t* atom = NULL; + switch (kt) { + case RAY_SYM: atom = ray_sym(key_vals[off]); break; + case RAY_I64: + case RAY_TIMESTAMP: atom = ray_i64(key_vals[off]); break; + case RAY_I32: + case RAY_DATE: + case RAY_TIME: atom = ray_i32((int32_t)key_vals[off]); break; + case RAY_I16: atom = ray_i16((int16_t)key_vals[off]); break; + case RAY_BOOL: + case RAY_U8: atom = ray_u8((uint8_t)key_vals[off]); break; + case RAY_F64: { + double dv; + memcpy(&dv, &key_vals[off], 8); + atom = ray_f64(dv); + break; + } + default: atom = ray_i64(key_vals[off]); break; + } + if (atom) { + if (key_null[off]) atom->nullmap[0] |= 1; + store_typed_elem(key_vec, gi, atom); + ray_release(atom); + } + } + result = ray_table_add_col(result, key_syms[k], key_vec); + ray_release(key_vec); + if (RAY_IS_ERR(result)) { + ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr); + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return result; + } + } + for (int ai = 0; ai < n_count_out; ai++) { + ray_t* cv = ray_vec_new(RAY_I64, found); + if (!cv || RAY_IS_ERR(cv)) { + ray_release(result); + ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr); + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return cv ? 
cv : ray_error("oom", NULL); + } + cv->len = found; + memcpy(ray_data(cv), counts, (size_t)found * sizeof(int64_t)); + result = ray_table_add_col(result, count_names[ai], cv); + ray_release(cv); + if (RAY_IS_ERR(result)) { + ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr); + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return result; + } + } + ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr); + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return result; + } + } + ray_t* composite_keys = ray_list_new(nrows); if (!composite_keys || RAY_IS_ERR(composite_keys)) { if (eval_tbl != tbl) ray_release(eval_tbl); @@ -3392,6 +4723,10 @@ ray_t* ray_select(ray_t** args, int64_t n) { return groups ? groups : ray_error("domain", NULL); } int64_t n_groups = ray_len(groups) / 2; + int64_t out_groups = n_groups; + bool take_preapplied = unsorted_positive_take_limit( + dict_elems, dict_n, asc_id, desc_id, take_id, + n_groups, &out_groups); int n_agg_out = 0; int64_t agg_names[16]; @@ -3406,10 +4741,10 @@ ray_t* ray_select(ray_t** args, int64_t n) { * group and dispatch directly to exec_count_distinct on * each group's slice. Same kernel the standalone * `(count (distinct col))` fast path uses. 
*/ - ray_t* cd_inner = match_count_distinct(val_expr_item); + ray_t* cd_inner = match_count_distinct(val_expr_item); if (cd_inner) { ray_t* per_group = count_distinct_per_group_groups( - cd_inner, eval_tbl, groups, n_groups); + cd_inner, eval_tbl, groups, out_groups); if (!per_group || RAY_IS_ERR(per_group)) { for (int ai = 0; ai < n_agg_out; ai++) if (agg_results[ai]) ray_release(agg_results[ai]); ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); @@ -3441,7 +4776,7 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_t* agg_vec = NULL; ray_t** grp_items = (ray_t**)ray_data(groups); - for (int64_t gi = 0; gi < n_groups; gi++) { + for (int64_t gi = 0; gi < out_groups; gi++) { ray_t* idx_list = grp_items[gi * 2 + 1]; ray_t* subset = ray_at_fn(src_col_val, idx_list); if (!subset || RAY_IS_ERR(subset)) continue; @@ -3455,9 +4790,9 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (!agg_val || RAY_IS_ERR(agg_val)) continue; if (!agg_vec) { int8_t vt = -(agg_val->type); - agg_vec = ray_vec_new(vt, n_groups); + agg_vec = ray_vec_new(vt, out_groups); if (!agg_vec || RAY_IS_ERR(agg_vec)) { ray_release(agg_val); break; } - agg_vec->len = n_groups; + agg_vec->len = out_groups; } store_typed_elem(agg_vec, gi, agg_val); ray_release(agg_val); @@ -3492,8 +4827,8 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (RAY_IS_PARTED(kt)) kt = (int8_t)RAY_PARTED_BASETYPE(kt); ray_t* key_vec = NULL; if (kt == RAY_STR) { - key_vec = ray_vec_new(RAY_STR, n_groups); - for (int64_t gi = 0; gi < n_groups && key_vec && !RAY_IS_ERR(key_vec); gi++) { + key_vec = ray_vec_new(RAY_STR, out_groups); + for (int64_t gi = 0; gi < out_groups && key_vec && !RAY_IS_ERR(key_vec); gi++) { ray_t* row_key = grp_items[gi * 2]; ray_t* cell = (row_key && row_key->type == RAY_LIST && k < row_key->len) ? ((ray_t**)ray_data(row_key))[k] : NULL; @@ -3503,12 +4838,12 @@ ray_t* ray_select(ray_t** args, int64_t n) { } } else { key_vec = (kt == RAY_SYM) - ? 
ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, n_groups) - : ray_vec_new(kt, n_groups); + ? ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, out_groups) + : ray_vec_new(kt, out_groups); if (key_vec && !RAY_IS_ERR(key_vec)) { - key_vec->len = n_groups; - memset(ray_data(key_vec), 0, (size_t)n_groups * ray_sym_elem_size(kt, key_vec->attrs)); - for (int64_t gi = 0; gi < n_groups; gi++) { + key_vec->len = out_groups; + memset(ray_data(key_vec), 0, (size_t)out_groups * ray_sym_elem_size(kt, key_vec->attrs)); + for (int64_t gi = 0; gi < out_groups; gi++) { ray_t* row_key = grp_items[gi * 2]; ray_t* cell = (row_key && row_key->type == RAY_LIST && k < row_key->len) ? ((ray_t**)ray_data(row_key))[k] : NULL; @@ -3540,6 +4875,8 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + if (take_preapplied) + return result; return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); } @@ -4107,12 +5444,22 @@ ray_t* ray_select(ray_t** args, int64_t n) { * count(distinct col_ref) doesn't need the materialization. * That's worth ~100 ms on Q14 (937 K rows × 105 cols filtered * → 937 K rows × 105 cols copy). 
*/ + int table_is_parted = 0; + { + int64_t ncols = ray_table_ncols(tbl); + for (int64_t c = 0; c < ncols; c++) { + ray_t* col = ray_table_get_col_idx(tbl, c); + if (col && RAY_IS_PARTED(col->type)) { table_is_parted = 1; break; } + } + } int has_nonagg_needing_flat = 0; for (int64_t i = 0; i + 1 < dict_n; i += 2) { int64_t kid = dict_elems[i]->i64; if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue; ray_t* expr = dict_elems[i + 1]; + if (is_single_group_key_projection(by_expr, expr)) + continue; if (is_group_dag_agg_expr(expr)) continue; ray_t* cd_inner = match_count_distinct(expr); int is_simple_cd = cd_inner && cd_inner->type == -RAY_SYM && @@ -4120,21 +5467,47 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (!is_simple_cd) { has_nonagg_needing_flat = 1; break; } } - /* The post-DAG scatter needs a flat single-segment table: it - * reads key columns directly and runs ray_eval over the whole - * input. Detect parted tables up front — if the source is - * parted and there's no WHERE to materialize it, return nyi. */ - int table_is_parted = 0; - if (has_nonagg_needing_flat) { + /* The post-DAG scatter reads key columns directly and runs ray_eval + * over the whole input. Simple count(distinct col_ref) is handled + * below by materializing only the group key and distinct column when + * needed; other non-aggs still require a flat table view. */ + if (has_nonagg_needing_flat && table_is_parted) { + ray_t* flat_tbl = ray_table_new(ray_table_ncols(tbl)); + if (!flat_tbl || RAY_IS_ERR(flat_tbl)) { + ray_graph_free(g); ray_release(tbl); + return flat_tbl ? 
flat_tbl : ray_error("oom", NULL); + } int64_t ncols = ray_table_ncols(tbl); for (int64_t c = 0; c < ncols; c++) { ray_t* col = ray_table_get_col_idx(tbl, c); - if (col && RAY_IS_PARTED(col->type)) { table_is_parted = 1; break; } + int64_t name = ray_table_col_name(tbl, c); + ray_t* flat_col = query_materialize_parted_col(col); + if (!flat_col || RAY_IS_ERR(flat_col)) { + ray_release(flat_tbl); ray_graph_free(g); ray_release(tbl); + return flat_col ? flat_col : ray_error("oom", NULL); + } + flat_tbl = ray_table_add_col(flat_tbl, name, flat_col); + ray_release(flat_col); + if (!flat_tbl || RAY_IS_ERR(flat_tbl)) { + ray_graph_free(g); ray_release(tbl); + return flat_tbl ? flat_tbl : ray_error("oom", NULL); + } } - if (table_is_parted && !where_expr) { - ray_graph_free(g); ray_release(tbl); - return ray_error("nyi", "non-agg expression on parted table without WHERE"); + ray_graph_free(g); + ray_release(tbl); + tbl = flat_tbl; + g = ray_graph_new(tbl); + if (!g) { ray_release(tbl); return ray_error("oom", NULL); } + root = ray_const_table(g, tbl); + if (where_expr) { + ray_op_t* pred = compile_expr_dag(g, where_expr); + if (!pred) { + ray_graph_free(g); ray_release(tbl); + return ray_error("domain", NULL); + } + root = ray_filter(g, root, pred); } + table_is_parted = 0; } /* WHERE + BY handling. 
Two paths: @@ -4248,6 +5621,8 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (!agg_ins[n_aggs]) { ray_graph_free(g); ray_release(tbl); return ray_error("domain", NULL); } n_aggs++; } else if (!is_group_dag_agg_expr(val_expr) && n_nonaggs < 16) { + if (is_single_group_key_projection(by_expr, val_expr)) + continue; nonagg_names[n_nonaggs] = kid; nonagg_exprs[n_nonaggs] = val_expr; n_nonaggs++; @@ -5052,9 +6427,28 @@ ray_t* ray_select(ray_t** args, int64_t n) { } } + ray_group_emit_filter_t prev_self_emit = {0}; + bool self_emit_set = false; + if (by_expr) { + ray_group_emit_filter_t cur_emit = ray_group_emit_filter_get(); + ray_group_emit_filter_t top_emit = {0}; + if (!cur_emit.enabled && + match_group_desc_count_take(dict_elems, dict_n, from_id, where_id, + by_id, take_id, asc_id, desc_id, + &top_emit)) { + prev_self_emit = cur_emit; + ray_group_emit_filter_set(top_emit); + self_emit_set = true; + } + } + /* Optimize and execute */ root = ray_optimize(g, root); ray_t* result = ray_execute(g, root); + if (self_emit_set) + ray_group_emit_filter_set(prev_self_emit); + if (post_group_where_expr && result && !RAY_IS_ERR(result)) + result = filter_group_result(result, post_group_where_expr); ray_graph_free(g); /* The nearest-query buffer was only referenced by ext->rerank.query_vec @@ -5275,7 +6669,7 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_t* orig_key = ray_table_get_col(tbl, ks); ray_t* grp_key = ray_table_get_col(result, ks); - int64_t nrows = orig_key ? orig_key->len : 0; + int64_t nrows = orig_key ? 
ray_parted_nrows(orig_key) : 0; if (!orig_key || !grp_key) { ray_release(result); ray_release(tbl); @@ -5283,7 +6677,21 @@ ray_t* ray_select(ray_t** args, int64_t n) { } if (n_groups > 0 && nrows > 0) { - int8_t okt = orig_key->type; + ray_t* scan_key = orig_key; + int scan_key_owned = 0; + if (RAY_IS_PARTED(scan_key->type) || scan_key->type == RAY_MAPCOMMON) { + scan_key = query_materialize_parted_col(scan_key); + if (!scan_key || RAY_IS_ERR(scan_key)) { + ray_release(result); ray_release(tbl); + return scan_key ? scan_key : ray_error("oom", NULL); + } + scan_key_owned = 1; + } + #define RELEASE_SCAN_KEY() do { \ + if (scan_key_owned && scan_key) ray_release(scan_key); \ + } while (0) + + int8_t okt = scan_key->type; int8_t gkt = grp_key->type; if (RAY_IS_PARTED(okt)) okt = (int8_t)RAY_PARTED_BASETYPE(okt); if (RAY_IS_PARTED(gkt)) gkt = (int8_t)RAY_PARTED_BASETYPE(gkt); @@ -5336,6 +6744,7 @@ ray_t* ray_select(ray_t** args, int64_t n) { okt == RAY_DATE || okt == RAY_TIME || okt == RAY_TIMESTAMP || okt == RAY_SYM); if (!key_supported) { + RELEASE_SCAN_KEY(); ray_release(result); ray_release(tbl); return ray_error("nyi", "non-agg scatter: unsupported group key type"); } @@ -5345,6 +6754,7 @@ ray_t* ray_select(ray_t** args, int64_t n) { * unexpectedly, fall back to error rather than mis- * compare. 
*/ if (okt != gkt) { + RELEASE_SCAN_KEY(); ray_release(result); ray_release(tbl); return ray_error("type", "group key type mismatch"); } @@ -5362,6 +6772,7 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (cnt_hdr) ray_free(cnt_hdr); if (off_hdr) ray_free(off_hdr); if (pos_hdr) ray_free(pos_hdr); + RELEASE_SCAN_KEY(); ray_release(result); ray_release(tbl); return ray_error("oom", NULL); } @@ -5393,6 +6804,7 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (!c) { ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); ray_free(off_hdr); ray_free(pos_hdr); + RELEASE_SCAN_KEY(); ray_release(result); ray_release(tbl); return ray_error("oom", NULL); } @@ -5409,6 +6821,7 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (gk_idx_hdr) scratch_free(gk_idx_hdr); ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); ray_free(off_hdr); ray_free(pos_hdr); + RELEASE_SCAN_KEY(); ray_release(result); ray_release(tbl); return ray_error("oom", NULL); } @@ -5426,6 +6839,7 @@ ray_t* ray_select(ray_t** args, int64_t n) { scratch_free(gk_keys_hdr); scratch_free(gk_idx_hdr); ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); ray_free(off_hdr); ray_free(pos_hdr); + RELEASE_SCAN_KEY(); ray_release(result); ray_release(tbl); return ray_error("oom", NULL); } @@ -5462,9 +6876,9 @@ ray_t* ray_select(ray_t** args, int64_t n) { .hk_gid_p1 = use_i64_gid ? NULL : hk_gid_p1, .hk_gid64 = use_i64_gid ? 
hk_gid64 : NULL, .mask = mask, - .orig_key_data = ray_data(orig_key), + .orig_key_data = ray_data(scan_key), .okt = okt, - .okt_attrs = orig_key->attrs, + .okt_attrs = scan_key->attrs, .row_gid = row_gid, .selection = saved_selection, .sel_flg = NULL, @@ -5484,7 +6898,7 @@ ray_t* ray_select(ray_t** args, int64_t n) { } else { for (int64_t r = 0; r < nrows; r++) { int64_t rv; - KEY_READ(rv, orig_key, okt, r); + KEY_READ(rv, scan_key, okt, r); uint64_t h = (uint64_t)rv * 0x9E3779B97F4A7C15ULL; h ^= h >> 33; uint64_t s = h & mask; @@ -5639,6 +7053,7 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); ray_free(off_hdr); ray_free(pos_hdr); + RELEASE_SCAN_KEY(); ray_release(result); ray_release(tbl); return ray_error("oom", NULL); } @@ -5664,6 +7079,7 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (!idx_hdr) { ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); ray_free(off_hdr); ray_free(pos_hdr); + RELEASE_SCAN_KEY(); ray_release(result); ray_release(tbl); return ray_error("oom", NULL); } @@ -5704,14 +7120,30 @@ ray_t* ray_select(ray_t** args, int64_t n) { (cd_inner->attrs & RAY_ATTR_NAME)) { src_for_global = ray_table_get_col(tbl, cd_inner->i64); } + if (src_for_global && n_groups > 50000) { + if (RAY_IS_PARTED(src_for_global->type) || + src_for_global->type == RAY_MAPCOMMON) { + ray_t* flat = query_materialize_parted_col(src_for_global); + if (!flat || RAY_IS_ERR(flat)) { + col = flat ? flat : ray_error("oom", NULL); + src_for_global = NULL; + } else { + src_for_global = flat; + src_owned = 1; + } + } + } if (src_for_global) { /* Path selection: global-hash kernel scales * with n_rows (per-row probe of one shared * hash table); per-group-slice scales with * n_groups (per-group setup + small dedup). * Empirically the cross-over is around 50 K - * groups on the local hardware — beyond - * that, per-group setup overhead dominates. */ + * groups on the local hardware. 
Partitioned + * high-cardinality columns are flattened above, + * so keep them on the single-pass kernel and + * avoid slicing through the partition layout + * again. */ if (n_groups <= 50000) { col = count_distinct_per_group_buf( cd_inner, tbl, idx_buf, offsets, grp_cnt, n_groups); @@ -5871,12 +7303,14 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); ray_free(off_hdr); ray_free(pos_hdr); if (idx_hdr) ray_free(idx_hdr); + RELEASE_SCAN_KEY(); if (scatter_err) { if (result) ray_release(result); ray_release(tbl); return scatter_err; } + #undef RELEASE_SCAN_KEY } else { /* Empty group set: add empty LIST columns so the * output schema still includes the user-declared @@ -5896,6 +7330,44 @@ ray_t* ray_select(ray_t** args, int64_t n) { } } + if (n_dep_keys > 0 && result && !RAY_IS_ERR(result) && + result->type == RAY_TABLE) { + if (ray_is_lazy(result)) + result = ray_lazy_materialize(result); + if (result && RAY_IS_ERR(result)) { + ray_release(tbl); + return result; + } + if (result && result->type == RAY_TABLE) { + ray_t* base_col = ray_table_get_col(result, dep_key_base_sym); + if (!base_col || !key_type_i64_projectable(base_col->type)) { + ray_release(result); + ray_release(tbl); + return ray_error("domain", "dependent group key base missing"); + } + int64_t n_groups = ray_table_nrows(result); + for (uint8_t dk = 0; dk < n_dep_keys; dk++) { + ray_t* col = ray_vec_new(RAY_I64, n_groups); + if (!col || RAY_IS_ERR(col)) { + ray_release(result); + ray_release(tbl); + return col ? 
col : ray_error("oom", NULL); + } + col->len = n_groups; + int64_t* out = (int64_t*)ray_data(col); + int64_t bias = dep_key_biases[dk]; + for (int64_t i = 0; i < n_groups; i++) + out[i] = key_col_read_i64(base_col, i) + bias; + result = ray_table_add_col(result, dep_key_names[dk], col); + ray_release(col); + if (RAY_IS_ERR(result)) { + ray_release(tbl); + return result; + } + } + } + } + ray_release(tbl); /* Post-process: apply sort/take for group-by queries. Runs diff --git a/src/ops/string.c b/src/ops/string.c index a3b0ced3..84e72a35 100644 --- a/src/ops/string.c +++ b/src/ops/string.c @@ -33,7 +33,7 @@ /* Parallelism crossover thresholds. Below these row counts the * pool dispatch + per-task setup cost outweighs the parallel speedup. - * Determined empirically against ClickBench-shaped workloads. STR + * Determined empirically against wide analytical workloads. STR * scans set their threshold higher because the pattern is matched * per row (no dict-shared prefix); SYM is per-dict-entry so the work * scales with cardinality, not row count, and parallelises well at @@ -333,6 +333,14 @@ static void str_like_par_fn(void* vctx, uint32_t worker_id, } } +static int64_t parted_row_count(ray_t* input) { + ray_t** segs = (ray_t**)ray_data(input); + int64_t total = 0; + for (int64_t s = 0; s < input->len; s++) + if (segs[s]) total += segs[s]->len; + return total; +} + static void like_resolve_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) { (void)worker_id; @@ -350,8 +358,180 @@ static void like_resolve_fn(void* ctx, uint32_t worker_id, } } +static void exec_like_parted_str(ray_t* input, uint8_t* dst, + const ray_glob_compiled_t* pc, + bool use_simple, + const char* pat_str, size_t pat_len) { + ray_t** segs = (ray_t**)ray_data(input); + ray_pool_t* pool = ray_pool_get(); + int64_t out_off = 0; + + for (int64_t s = 0; s < input->len; s++) { + ray_t* seg = segs[s]; + if (!seg) continue; + int64_t seg_len = seg->len; + const ray_str_t* elems; + const char* 
pool_data; + str_resolve(seg, &elems, &pool_data); + + str_like_par_ctx_t lctx = { + .elems = elems, + .pool_data = pool_data, + .dst = dst + out_off, + .pc = pc, + .use_simple = use_simple, + .pat_str = pat_str, + .pat_len = pat_len, + }; + if (pool && seg_len >= LIKE_PAR_MIN_ROWS_STR && + ray_pool_total_workers(pool) >= 2) { + ray_pool_dispatch(pool, str_like_par_fn, &lctx, seg_len); + } else { + str_like_par_fn(&lctx, 0, 0, seg_len); + } + out_off += seg_len; + } +} + +static void exec_like_parted_sym(ray_t* input, uint8_t* dst, + const ray_glob_compiled_t* pc, + bool use_simple, + const char* pat_str, size_t pat_len, + int64_t total_len) { + ray_t** segs = (ray_t**)ray_data(input); + ray_t** sym_strings = NULL; + uint32_t dict_n = 0; + ray_sym_strings_borrow(&sym_strings, &dict_n); + ray_t* lut_hdr = NULL; + ray_t* seen_hdr = NULL; + uint8_t* lut = NULL; + uint8_t* seen = NULL; + if (dict_n > 0) { + lut = (uint8_t*)scratch_alloc (&lut_hdr, (size_t)dict_n); + seen = (uint8_t*)scratch_calloc(&seen_hdr, (size_t)dict_n); + } + + ray_pool_t* pool = ray_pool_get(); + if (lut && seen) { + for (int64_t s = 0; s < input->len; s++) { + ray_t* seg = segs[s]; + if (!seg) continue; + int64_t seg_len = seg->len; + like_seen_ctx_t sctx = { + .base = ray_data(seg), + .seen = seen, + .dict_n = (uint64_t)dict_n, + .sym_w = (int)(seg->attrs & RAY_SYM_W_MASK), + .sel_flg = NULL, + .sel_offs = NULL, + .sel_idx = NULL, + .sel_n_segs = 0, + .total_rows = seg_len, + }; + if (pool && seg_len >= LIKE_PAR_MIN_ROWS_SYM && + ray_pool_total_workers(pool) >= 2) { + ray_pool_dispatch(pool, like_seen_fn, &sctx, seg_len); + } else { + like_seen_fn(&sctx, 0, 0, seg_len); + } + } + + like_resolve_ctx_t rctx = { + .sym_strings = sym_strings, .seen = seen, .lut = lut, + .pc = pc, .use_simple = use_simple, + .pat_str = pat_str, .pat_len = pat_len, + }; + if (pool && (int64_t)dict_n >= 16384) { + ray_pool_dispatch(pool, like_resolve_fn, &rctx, (int64_t)dict_n); + } else { + like_resolve_fn(&rctx, 0, 
0, (int64_t)dict_n); + } + + int64_t out_off = 0; + for (int64_t s = 0; s < input->len; s++) { + ray_t* seg = segs[s]; + if (!seg) continue; + int64_t seg_len = seg->len; + like_proj_ctx_t pctx = { + .base = ray_data(seg), + .dst = dst + out_off, + .lut = lut, + .dict_n = (uint64_t)dict_n, + .sym_w = (int)(seg->attrs & RAY_SYM_W_MASK), + .sel_flg = NULL, + .sel_offs = NULL, + .sel_idx = NULL, + .sel_n_segs = 0, + }; + if (pool && seg_len >= LIKE_PAR_MIN_ROWS_SYM && + ray_pool_total_workers(pool) >= 2) { + ray_pool_dispatch(pool, like_proj_fn, &pctx, seg_len); + } else { + like_proj_fn(&pctx, 0, 0, seg_len); + } + out_off += seg_len; + } + scratch_free(lut_hdr); + scratch_free(seen_hdr); + return; + } + + if (lut_hdr) scratch_free(lut_hdr); + if (seen_hdr) scratch_free(seen_hdr); + + int64_t out_off = 0; + for (int64_t s = 0; s < input->len; s++) { + ray_t* seg = segs[s]; + if (!seg) continue; + const void* base = ray_data(seg); + for (int64_t i = 0; i < seg->len; i++) { + int64_t sym_id = ray_read_sym(base, i, seg->type, seg->attrs); + ray_t* str = (sym_strings && (uint64_t)sym_id < (uint64_t)dict_n) + ? sym_strings[sym_id] : NULL; + if (!str) { dst[out_off + i] = 0; continue; } + const char* sp = ray_str_ptr(str); + size_t sl = ray_str_len(str); + dst[out_off + i] = (use_simple + ? ray_glob_match_compiled(pc, sp, sl) + : ray_glob_match(sp, sl, pat_str, pat_len)) + ? 
1 : 0;
+        }
+        out_off += seg->len;
+    }
+    if (out_off < total_len) /* rows beyond the last segment are reported as non-matching */
+        memset(dst + out_off, 0, (size_t)(total_len - out_off));
+}
+
+static ray_t* exec_like_input(ray_graph_t* g, ray_op_t* input_op) { /* Resolves the first LIKE operand: when input_op is an OP_SCAN of a parted STR/SYM column, returns that column itself (retained); every other case defers to exec_node(g, input_op). */
+    if (!input_op || input_op->opcode != OP_SCAN)
+        return exec_node(g, input_op);
+
+    ray_op_ext_t* ext = find_ext(g, input_op->id);
+    if (!ext) return exec_node(g, input_op);
+
+    uint16_t stored_table_id = 0;
+    memcpy(&stored_table_id, ext->base.pad, sizeof(uint16_t)); /* table id is read from the pad bytes of the op: 0 selects g->table, otherwise it is a 1-based index into g->tables */
+    ray_t* scan_tbl = NULL;
+    if (stored_table_id > 0 && g->tables && (stored_table_id - 1) < g->n_tables)
+        scan_tbl = g->tables[stored_table_id - 1];
+    else /* NOTE(review): a non-zero id that is out of range also lands here and silently uses g->table -- confirm intended */
+        scan_tbl = g->table;
+    if (!scan_tbl) return exec_node(g, input_op);
+
+    ray_t* col = ray_table_get_col(scan_tbl, ext->sym);
+    if (!col) return exec_node(g, input_op);
+    if (RAY_IS_PARTED(col->type)) {
+        int8_t base = (int8_t)RAY_PARTED_BASETYPE(col->type);
+        if (base == RAY_STR || RAY_IS_SYM(base)) {
+            ray_retain(col); /* the caller owns this reference; exec_like releases input on its error paths. NOTE(review): the column is returned without going through exec_node -- confirm no selection or filter state is bypassed */
+            return col;
+        }
+    }
+    return exec_node(g, input_op);
+}
+
 ray_t* exec_like(ray_graph_t* g, ray_op_t* op) {
-    ray_t* input = exec_node(g, op->inputs[0]);
+    ray_t* input = exec_like_input(g, op->inputs[0]);
     ray_t* pat_v = exec_node(g, op->inputs[1]);
     if (!input || RAY_IS_ERR(input)) {
         if (pat_v && !RAY_IS_ERR(pat_v)) ray_release(pat_v);
         return input;
     }
     if (!pat_v || RAY_IS_ERR(pat_v)) { ray_release(input); return pat_v; }
@@ -367,7 +547,10 @@ ray_t* exec_like(ray_graph_t* g, ray_op_t* op) {
     ray_glob_compiled_t pc = ray_glob_compile(pat_str, pat_len);
     bool use_simple = pc.shape != RAY_GLOB_SHAPE_NONE;
-    int64_t len = input->len;
+    int8_t in_type = input->type;
+    bool in_parted = RAY_IS_PARTED(in_type);
+    int8_t base_type = in_parted ? (int8_t)RAY_PARTED_BASETYPE(in_type) : in_type;
+    int64_t len = in_parted ? 
parted_row_count(input) : input->len; ray_t* result = ray_vec_new(RAY_BOOL, len); if (!result || RAY_IS_ERR(result)) { ray_release(input); ray_release(pat_v); @@ -376,11 +559,14 @@ ray_t* exec_like(ray_graph_t* g, ray_op_t* op) { result->len = len; uint8_t* dst = (uint8_t*)ray_data(result); - int8_t in_type = input->type; - if (in_type == RAY_STR) { - /* Parallel substring/glob match over RAY_STR. Q22/Q23 ClickBench - * are 5 M URL/Title columns × ~80 chars/row = ~400 MB scan, - * memory-bandwidth bound; the worker pool gives a 5-10× speedup + if (in_parted && base_type == RAY_STR) { + exec_like_parted_str(input, dst, &pc, use_simple, pat_str, pat_len); + } else if (in_parted && RAY_IS_SYM(base_type)) { + exec_like_parted_sym(input, dst, &pc, use_simple, pat_str, pat_len, len); + } else if (in_type == RAY_STR) { + /* Parallel substring/glob match over RAY_STR. Wide text scans + * over URL/title-like columns are memory-bandwidth bound; the + * worker pool gives a 5-10× speedup * since glob_match is independent per row. */ const ray_str_t* elems; const char* pool_data; str_resolve(input, &elems, &pool_data); @@ -434,7 +620,7 @@ ray_t* exec_like(ray_graph_t* g, ray_op_t* op) { ray_pool_t* pool = ray_pool_get(); /* Phase 1: mark used sym_ids. Parallelised because for - * high-cardinality columns (URL on ClickBench) the seen- + * high-cardinality text columns the seen- * mark scan was a 5 ms-class serial pass. Multiple workers * may write 1 to the same byte concurrently — the value is * idempotent so the race is benign. 
*/
diff --git a/src/ops/strop.c b/src/ops/strop.c
index 89aaac18..4f68ba18 100644
--- a/src/ops/strop.c
+++ b/src/ops/strop.c
@@ -24,12 +24,133 @@
 #include "lang/internal.h"
 #include "ops/internal.h"
 #include "table/sym.h"
+#include "table/table.h"
 #include "ops/glob.h"
 /* ══════════════════════════════════════════
  * String builtins
  * ══════════════════════════════════════════ */
+static bool strlen_atom_value(ray_t* x, int64_t* out) { /* Atom case of strlen: for a -RAY_STR or -RAY_SYM typed value stores its ray_str_len in *out and returns true; any other type returns false and leaves *out unwritten. */
+    if (x->type == -RAY_STR) {
+        *out = (int64_t)ray_str_len(x);
+        return true;
+    }
+    if (x->type == -RAY_SYM) {
+        ray_t* s = ray_sym_str(x->i64);
+        *out = s ? (int64_t)ray_str_len(s) : 0; /* a symbol id with no string counts as length 0 */
+        return true;
+    }
+    return false;
+}
+
+static bool strlen_vec_value(ray_t* x, int64_t row, int64_t* out) { /* Length of element row of a RAY_STR or RAY_SYM vector; returns false without writing *out for other types. The null bitmap is not consulted here: the callers test ray_vec_is_null first. */
+    if (x->type == RAY_STR) {
+        size_t slen = 0;
+        const char* s = ray_str_vec_get(x, row, &slen);
+        *out = s ? (int64_t)slen : 0; /* NULL string pointer counts as length 0 */
+        return true;
+    }
+    if (x->type == RAY_SYM) {
+        int64_t sid = ray_read_sym(ray_data(x), row, x->type, x->attrs);
+        ray_t* s = ray_sym_str(sid);
+        *out = s ? (int64_t)ray_str_len(s) : 0; /* unresolved symbol id counts as length 0 */
+        return true;
+    }
+    return false;
+}
+
+static ray_t* strlen_vec(ray_t* x) { /* strlen over a plain STR/SYM vector: returns a new RAY_I64 vector with x->len lengths, or an error object (type error for other element types, oom when allocation yields NULL). */
+    if (x->type != RAY_STR && x->type != RAY_SYM)
+        return ray_error("type", "strlen: expected string or symbol");
+
+    int64_t n = x->len;
+    ray_t* out = ray_vec_new(RAY_I64, n);
+    if (!out || RAY_IS_ERR(out)) return out ? 
out : ray_error("oom", NULL);
+    out->len = n;
+    int64_t* dst = (int64_t*)ray_data(out);
+    bool has_nulls = (x->attrs & RAY_ATTR_HAS_NULLS) != 0;
+
+    for (int64_t i = 0; i < n; i++) {
+        if (has_nulls && ray_vec_is_null(x, i)) { /* null input row: store 0 and flag the output row null */
+            dst[i] = 0;
+            ray_vec_set_null(out, i, true);
+            continue;
+        }
+        strlen_vec_value(x, i, &dst[i]); /* result ignored: the element type was checked on entry */
+    }
+    return out;
+}
+
+static ray_t* strlen_mapcommon(ray_t* x) { /* strlen for a RAY_MAPCOMMON column: slot 0 of x holds one key per partition, slot 1 the per-partition row counts; the length of key p is written cnt[p] times into a new RAY_I64 vector of ray_parted_nrows(x) rows. */
+    ray_t** ptrs = (ray_t**)ray_data(x);
+    ray_t* keys = ptrs[0];
+    ray_t* counts = ptrs[1];
+    if (!keys || !counts || RAY_IS_ERR(keys) || RAY_IS_ERR(counts))
+        return ray_error("domain", "strlen: invalid partition column");
+    if (keys->type != RAY_STR && keys->type != RAY_SYM)
+        return ray_error("type", "strlen: expected string or symbol");
+
+    int64_t total = ray_parted_nrows(x);
+    ray_t* out = ray_vec_new(RAY_I64, total);
+    if (!out || RAY_IS_ERR(out)) return out ? out : ray_error("oom", NULL);
+    out->len = total;
+    int64_t* dst = (int64_t*)ray_data(out);
+    const int64_t* cnt = (const int64_t*)ray_data(counts); /* NOTE(review): counts is read as int64 without checking counts->type -- confirm the MAPCOMMON layout guarantees it */
+
+    int64_t off = 0;
+    for (int64_t p = 0; p < counts->len; p++) { /* NOTE(review): assumes keys has at least counts->len entries and that the counts sum to total; off is never bounded by total, and rows past the sum stay unwritten */
+        int64_t v = 0;
+        bool is_null = (keys->attrs & RAY_ATTR_HAS_NULLS) && ray_vec_is_null(keys, p);
+        if (!is_null) strlen_vec_value(keys, p, &v); /* a null key keeps v == 0 */
+        for (int64_t r = 0; r < cnt[p]; r++) { /* repeat the key length (and null flag) for every row of the partition */
+            dst[off] = v;
+            if (is_null) ray_vec_set_null(out, off, true);
+            off++;
+        }
+    }
+    return out;
+}
+
+static ray_t* strlen_parted(ray_t* x) { /* strlen for a parted STR/SYM column: walks the segment array of x and writes per-row lengths, back to back, into a new RAY_I64 vector of ray_parted_nrows(x) rows. */
+    int8_t base = (int8_t)RAY_PARTED_BASETYPE(x->type);
+    if (base != RAY_STR && base != RAY_SYM)
+        return ray_error("type", "strlen: expected string or symbol");
+
+    int64_t total = ray_parted_nrows(x);
+    ray_t* out = ray_vec_new(RAY_I64, total);
+    if (!out || RAY_IS_ERR(out)) return out ? 
out : ray_error("oom", NULL);
+    out->len = total;
+    int64_t* dst = (int64_t*)ray_data(out);
+
+    ray_t** segs = (ray_t**)ray_data(x);
+    int64_t off = 0;
+    for (int64_t s = 0; s < x->len; s++) {
+        ray_t* seg = segs[s];
+        if (!seg || RAY_IS_ERR(seg)) continue; /* missing or error segments contribute no rows */
+        bool has_nulls = (seg->attrs & RAY_ATTR_HAS_NULLS) != 0;
+        for (int64_t i = 0; i < seg->len; i++) {
+            if (has_nulls && ray_vec_is_null(seg, i)) { /* null row: store 0 and flag the output row null */
+                dst[off] = 0;
+                ray_vec_set_null(out, off, true);
+            } else {
+                strlen_vec_value(seg, i, &dst[off]); /* NOTE(review): result ignored -- only the parted base type was checked, so a segment whose own type is not RAY_STR/RAY_SYM would leave this slot unwritten; off is also not bounded by total */
+            }
+            off++;
+        }
+    }
+    return out;
+}
+
+ray_t* ray_strlen_fn(ray_t* x) { /* strlen builtin entry point: tries, in order, atom, plain vector, RAY_MAPCOMMON and parted column; anything else yields a type error object. x is dereferenced without a NULL check. */
+    int64_t len = 0;
+    if (strlen_atom_value(x, &len)) return ray_i64(len);
+    if (ray_is_vec(x)) return strlen_vec(x); /* strlen_vec itself rejects vectors that are not STR/SYM */
+    if (x->type == RAY_MAPCOMMON) return strlen_mapcommon(x);
+    if (RAY_IS_PARTED(x->type)) return strlen_parted(x);
+    return ray_error("type", "strlen: expected string or symbol");
+}
+
 ray_t* ray_split_fn(ray_t* str, ray_t* delim) {
     /* List split: (split list indices) → list of sub-lists */
     if (str->type == RAY_LIST &&
@@ -203,7 +324,7 @@ ray_t* ray_like_fn(ray_t* x, ray_t* pattern) {
     const char* pat = ray_str_ptr(pattern);
     size_t pat_len = ray_str_len(pattern);
-    /* Pre-compile the pattern once. Most ClickBench LIKE shapes are
+    /* Pre-compile the pattern once. Most benchmark LIKE shapes are
      * `*literal*` (substring) which collapses to a memmem call — the
      * libc-provided implementation is SIMD on glibc/Apple/BSD. When the
      * shape is RAY_GLOB_SHAPE_NONE we keep the iterative matcher. 
*/ diff --git a/src/ops/system.c b/src/ops/system.c index 6b0df37d..9ccc90b4 100644 --- a/src/ops/system.c +++ b/src/ops/system.c @@ -112,7 +112,7 @@ ray_t* ray_get_splayed_fn(ray_t** args, int64_t n) { else sym_path = splay_default_sym(dir, sym, sizeof(sym), true); - return ray_splay_load(dir, sym_path); + return ray_read_splayed(dir, sym_path); } /* (.db.parted.get "db_root" `table_name) -- load partitioned table */ diff --git a/src/store/col.c b/src/store/col.c index 4ee2e370..3275c668 100644 --- a/src/store/col.c +++ b/src/store/col.c @@ -28,6 +28,7 @@ #include "store/fileio.h" #include "table/sym.h" #include "ops/idxop.h" +#include "vec/str.h" #include #include #include @@ -86,6 +87,8 @@ static ray_err_t validate_sym_bounds(const void* data, int64_t len, #define LIST_MAGIC 0x4754534CU /* "LSTG" */ #define TABLE_MAGIC 0x4C425454U /* "TTBL" */ +static size_t col_str_pool_payload_len(const ray_t* vec); + /* -------------------------------------------------------------------------- * Column file format: * Bytes 0-15: nullmap (inline) or zeroed (ext_nullmap / no nulls) @@ -97,14 +100,16 @@ static ray_err_t validate_sym_bounds(const void* data, int64_t len, * -------------------------------------------------------------------------- */ /* Explicit allowlist of types that are safe to serialize as raw bytes. - * Only fixed-size scalar types -- pointer-bearing types (STR, LIST, TABLE) - * and non-scalar types are excluded. */ + * Fixed-size scalar types plus RAY_STR. RAY_STR has a pointer-bearing pool + * in memory, but the column format stores the fixed 16-byte descriptors and + * the byte pool as adjacent raw regions so mmap can restore the pointer + * without deserializing string contents. 
*/ static bool is_serializable_type(int8_t t) { switch (t) { case RAY_BOOL: case RAY_U8: case RAY_I16: case RAY_I32: case RAY_I64: case RAY_F64: case RAY_DATE: case RAY_TIME: case RAY_TIMESTAMP: case RAY_GUID: - case RAY_SYM: + case RAY_SYM: case RAY_STR: return true; default: return false; @@ -188,34 +193,6 @@ static ray_t* col_load_str_list(const uint8_t* ptr, size_t remaining) { return list; } -/* -------------------------------------------------------------------------- - * col_save_str_vec -- serialize a RAY_STR vector with Rayforce serde - * - * RAY_STR columns carry a string pool through the header union, so they cannot - * use the raw 32-byte column layout. Reuse the object wire format here; it - * already preserves pooled strings and external null bitmaps. - * -------------------------------------------------------------------------- */ - -static ray_err_t col_save_str_vec(ray_t* vec, FILE* f) { - uint32_t magic = STR_VEC_MAGIC; - if (fwrite(&magic, 4, 1, f) != 1) return RAY_ERR_IO; - - int64_t len = ray_serde_size(vec); - if (len <= 0) return RAY_ERR_IO; - ray_t* bytes = ray_vec_new(RAY_U8, len); - if (!bytes || RAY_IS_ERR(bytes)) return RAY_ERR_OOM; - - int64_t wrote = ray_ser_raw((uint8_t*)ray_data(bytes), vec); - if (wrote != len) { - ray_release(bytes); - return RAY_ERR_IO; - } - - size_t out = fwrite(ray_data(bytes), 1, (size_t)len, f); - ray_release(bytes); - return out == (size_t)len ? RAY_OK : RAY_ERR_IO; -} - static ray_t* col_load_str_vec(const uint8_t* ptr, size_t remaining) { if (remaining > (size_t)INT64_MAX) return ray_error("range", NULL); int64_t len = (int64_t)remaining; @@ -266,10 +243,11 @@ static ray_err_t col_write_recursive(ray_t* obj, FILE* f) { if (is_serializable_type(type)) { /* Fixed-size vector: write len + raw data. - * RAY_SYM: also write attrs byte (adaptive width W8/W16/W32/W64). */ + * RAY_SYM: also write attrs byte (adaptive width W8/W16/W32/W64). + * RAY_STR: also write attrs byte and the adjacent byte pool. 
*/ int64_t len = obj->len; if (fwrite(&len, 8, 1, f) != 1) return RAY_ERR_IO; - if (type == RAY_SYM) { + if (type == RAY_SYM || type == RAY_STR) { uint8_t attrs = obj->attrs; if (fwrite(&attrs, 1, 1, f) != 1) return RAY_ERR_IO; } @@ -277,6 +255,13 @@ static ray_err_t col_write_recursive(ray_t* obj, FILE* f) { size_t data_size = (size_t)len * esz; if (data_size > 0 && fwrite(ray_data(obj), 1, data_size, f) != data_size) return RAY_ERR_IO; + if (type == RAY_STR) { + uint64_t pool_size = (uint64_t)col_str_pool_payload_len(obj); + if (fwrite(&pool_size, 8, 1, f) != 1) return RAY_ERR_IO; + if (pool_size > 0 && + fwrite(ray_data(obj->str_pool), 1, (size_t)pool_size, f) != pool_size) + return RAY_ERR_IO; + } return RAY_OK; } @@ -352,9 +337,9 @@ static ray_t* col_read_recursive(const uint8_t** pp, size_t* remaining) { *pp += 8; *remaining -= 8; if (len < 0) return ray_error("corrupt", NULL); - /* RAY_SYM: read attrs byte for adaptive width */ + /* RAY_SYM / RAY_STR: read attrs byte for adaptive width/nulls */ uint8_t attrs = 0; - if (type == RAY_SYM) { + if (type == RAY_SYM || type == RAY_STR) { if (*remaining < 1) return ray_error("corrupt", NULL); memcpy(&attrs, *pp, 1); *pp += 1; *remaining -= 1; @@ -375,6 +360,31 @@ static ray_t* col_read_recursive(const uint8_t** pp, size_t* remaining) { memcpy(ray_data(vec), *pp, data_size); *pp += data_size; *remaining -= data_size; + if (type == RAY_STR) { + if (*remaining < 8) { ray_release(vec); return ray_error("corrupt", NULL); } + uint64_t pool_size; + memcpy(&pool_size, *pp, 8); + *pp += 8; *remaining -= 8; + if (pool_size > *remaining || pool_size > (uint64_t)INT64_MAX) { + ray_release(vec); + return ray_error("corrupt", NULL); + } + if (pool_size > 0) { + vec->str_pool = ray_alloc((size_t)pool_size); + if (!vec->str_pool || RAY_IS_ERR(vec->str_pool)) { + ray_t* err = vec->str_pool ? 
vec->str_pool : ray_error("oom", NULL); + vec->str_pool = NULL; + ray_release(vec); + return err; + } + vec->str_pool->type = RAY_U8; + vec->str_pool->len = (int64_t)pool_size; + memcpy(ray_data(vec->str_pool), *pp, (size_t)pool_size); + } + *pp += pool_size; *remaining -= (size_t)pool_size; + vec->attrs = attrs; + } + if (type == RAY_SYM) { uint32_t sc = ray_sym_count(); ray_err_t ve = validate_sym_bounds(ray_data(vec), len, attrs, sc); @@ -489,7 +499,7 @@ static void try_load_link_sidecar(ray_t* vec, const char* path) { * ray_col_save -- write a vector to a column file * -------------------------------------------------------------------------- */ -ray_err_t ray_col_save(ray_t* vec, const char* path) { +static ray_err_t col_save_impl(ray_t* vec, const char* path, bool durable) { if (!vec || RAY_IS_ERR(vec)) return RAY_ERR_TYPE; if (!path) return RAY_ERR_IO; @@ -508,16 +518,6 @@ ray_err_t ray_col_save(ray_t* vec, const char* path) { goto fsync_and_rename; } - /* String vector */ - if (vec->type == RAY_STR) { - FILE* f = fopen(tmp_path, "wb"); - if (!f) return RAY_ERR_IO; - ray_err_t err = col_save_str_vec(vec, f); - fclose(f); - if (err != RAY_OK) { remove(tmp_path); return err; } - goto fsync_and_rename; - } - /* Generic list */ if (vec->type == RAY_LIST) { FILE* f = fopen(tmp_path, "wb"); @@ -633,6 +633,32 @@ ray_err_t ray_col_save(ray_t* vec, const char* path) { if (written != data_size) { fclose(f); remove(tmp_path); return RAY_ERR_IO; } } + if (vec->type == RAY_STR) { + size_t pool_size = col_str_pool_payload_len(vec); + ray_t pool_header; + memset(&pool_header, 0, sizeof(pool_header)); + if (vec->str_pool && !RAY_IS_ERR(vec->str_pool)) { + memcpy(&pool_header, vec->str_pool, 32); + } + pool_header.mmod = 0; + pool_header.order = 0; + pool_header.type = RAY_U8; + pool_header.attrs = 0; + pool_header.rc = 0; + pool_header.len = (int64_t)pool_size; + written = fwrite(&pool_header, 1, 32, f); + if (written != 32) { fclose(f); remove(tmp_path); return RAY_ERR_IO; 
} + if (pool_size > 0) { + if (!vec->str_pool || RAY_IS_ERR(vec->str_pool)) { + fclose(f); + remove(tmp_path); + return RAY_ERR_CORRUPT; + } + written = fwrite(ray_data(vec->str_pool), 1, pool_size, f); + if (written != pool_size) { fclose(f); remove(tmp_path); return RAY_ERR_IO; } + } + } + /* Append external nullmap bitmap after data. Use header.attrs * (rebased above for HAS_INDEX) and ext_for_append (the * effective ext_nullmap pointer, possibly extracted from the @@ -648,12 +674,15 @@ ray_err_t ray_col_save(ray_t* vec, const char* path) { } fsync_and_rename:; - /* Fsync temp file for durability */ - ray_fd_t tmp_fd = ray_file_open(tmp_path, RAY_OPEN_READ | RAY_OPEN_WRITE); - if (tmp_fd == RAY_FD_INVALID) { remove(tmp_path); return RAY_ERR_IO; } - ray_err_t err = ray_file_sync(tmp_fd); - ray_file_close(tmp_fd); - if (err != RAY_OK) { remove(tmp_path); return err; } + ray_err_t err = RAY_OK; + if (durable) { + /* Fsync temp file for durability */ + ray_fd_t tmp_fd = ray_file_open(tmp_path, RAY_OPEN_READ | RAY_OPEN_WRITE); + if (tmp_fd == RAY_FD_INVALID) { remove(tmp_path); return RAY_ERR_IO; } + err = ray_file_sync(tmp_fd); + ray_file_close(tmp_fd); + if (err != RAY_OK) { remove(tmp_path); return err; } + } /* Atomic rename: tmp -> final path */ err = ray_file_rename(tmp_path, path); @@ -700,6 +729,14 @@ fsync_and_rename:; return RAY_OK; } +ray_err_t ray_col_save(ray_t* vec, const char* path) { + return col_save_impl(vec, path, true); +} + +ray_err_t ray_col_save_bulk(ray_t* vec, const char* path) { + return col_save_impl(vec, path, false); +} + /* -------------------------------------------------------------------------- * col_validate_mapped -- shared validation for ray_col_load / ray_col_mmap * @@ -715,10 +752,75 @@ typedef struct { ray_t* header; /* pointer into mapped region */ uint8_t esz; size_t data_size; + bool has_str_pool; + size_t str_pool_offset; + size_t str_pool_size; + size_t bitmap_offset; bool has_ext_nullmap; size_t bitmap_len; + uint32_t 
saved_sym_count;
 } col_mapped_t;
+static size_t col_str_pool_payload_len(const ray_t* vec) { /* Byte length of the string pool of vec; 0 when vec is NULL, is not RAY_STR, has no usable pool (NULL or error object), or the pool length is not positive. */
+    if (!vec || vec->type != RAY_STR || !vec->str_pool || RAY_IS_ERR(vec->str_pool))
+        return 0;
+    return vec->str_pool->len > 0 ? (size_t)vec->str_pool->len : 0;
+}
+
+static ray_t* col_copy_str_pool(const col_mapped_t* cm) { /* Copies the string pool of a mapped column (32-byte header plus payload) into a fresh allocation. Returns NULL when the file has no pool or the pool is empty, and an error object when allocation fails. */
+    if (!cm->has_str_pool) return NULL;
+
+    const ray_t* src = (const ray_t*)((const char*)cm->mapped + cm->str_pool_offset);
+    if (cm->str_pool_size == 0) return NULL;
+
+    ray_t* pool = ray_alloc(cm->str_pool_size); /* assumes ray_alloc(n) yields a 32-byte header followed by n payload bytes -- TODO confirm */
+    if (!pool || RAY_IS_ERR(pool)) return pool ? pool : ray_error("oom", NULL);
+
+    uint8_t saved_order = pool->order; /* keep the order/mmod of the new allocation: the memcpy below overwrites the whole header */
+    uint8_t saved_mmod = pool->mmod;
+    memcpy(pool, src, 32 + cm->str_pool_size);
+    pool->order = saved_order;
+    pool->mmod = saved_mmod;
+    pool->attrs &= (uint8_t)~RAY_ATTR_SLICE;
+    ray_atomic_store(&pool->rc, 1); /* the header copied from the file does not carry a live reference count */
+    return pool;
+}
+
+static ray_err_t col_validate_str_region(ray_t* hdr, const void* ptr, /* Checks the pool region that follows the descriptor array of a mapped RAY_STR column and records its layout in *out. out->data_size must already be set. Returns RAY_OK or RAY_ERR_CORRUPT. */
+                                         size_t mapped_size, col_mapped_t* out) {
+    size_t offset = 32 + out->data_size; /* pool header sits right after the 32-byte column header and the descriptors */
+    if (offset > mapped_size || mapped_size - offset < 32) /* need room for the 32-byte pool header */
+        return RAY_ERR_CORRUPT;
+
+    ray_t* pool = (ray_t*)((char*)ptr + offset);
+    if (pool->type != RAY_U8 || pool->len < 0)
+        return RAY_ERR_CORRUPT;
+
+    size_t pool_size = (size_t)pool->len;
+    if (pool_size > mapped_size - offset - 32) /* payload must fit in what is left of the mapping */
+        return RAY_ERR_CORRUPT;
+
+    const ray_str_t* elems = (const ray_str_t*)((const char*)ptr + 32);
+    for (int64_t i = 0; i < hdr->len; i++) { /* NOTE(review): this walks every descriptor and reads pool bytes for pooled strings; col_validate_mapped calls it for every RAY_STR file it maps -- confirm that cost is acceptable given the lazy-open rationale stated for RAY_SYM in col_mmap_impl */
+        uint32_t len = elems[i].len;
+        if (len <= RAY_STR_INLINE_MAX) continue; /* short strings are not checked against the pool */
+        if (pool_size == 0 || elems[i].pool_off > pool_size || /* [pool_off, pool_off + len) must lie inside the pool; the subtraction form avoids overflow */
+            len > pool_size - elems[i].pool_off)
+            return RAY_ERR_CORRUPT;
+        if (len >= 4) { /* the prefix stored in the descriptor must equal the first 4 pool bytes of the string */
+            const char* p = (const char*)ptr + offset + 32 + elems[i].pool_off;
+            if (memcmp(elems[i].prefix, p, 4) != 0)
+                return RAY_ERR_CORRUPT;
+        }
+    }
+
+    out->has_str_pool = true;
+    out->str_pool_offset = offset;
+    out->str_pool_size = pool_size;
+    out->bitmap_offset = offset + 32 + pool_size; /* an external nullmap, when present, is expected right after the pool payload */
+    return RAY_OK;
+}
+
static ray_t* col_validate_mapped(const char* path, col_mapped_t* out) { size_t mapped_size = 0; void* ptr = ray_vm_map_file(path, &mapped_size); @@ -777,11 +879,28 @@ static ray_t* col_validate_mapped(const char* path, col_mapped_t* out) { return ray_error("corrupt", NULL); } + out->data_size = data_size; + size_t bitmap_offset = 32 + data_size; + if (hdr->type == RAY_STR) { + ray_err_t se = col_validate_str_region(hdr, ptr, mapped_size, out); + if (se != RAY_OK) { + ray_vm_unmap_file(ptr, mapped_size); + return ray_error(ray_err_code_str(se), NULL); + } + bitmap_offset = out->bitmap_offset; + } else { + out->has_str_pool = false; + out->str_pool_offset = 0; + out->str_pool_size = 0; + out->bitmap_offset = bitmap_offset; + } + /* Check for appended ext_nullmap bitmap */ bool has_ext_nullmap = (hdr->attrs & RAY_ATTR_HAS_NULLS) && (hdr->attrs & RAY_ATTR_NULLMAP_EXT); size_t bitmap_len = has_ext_nullmap ? ((size_t)hdr->len + 7) / 8 : 0; - if (has_ext_nullmap && 32 + data_size + bitmap_len > mapped_size) { + if (has_ext_nullmap && (bitmap_offset > mapped_size || + bitmap_len > mapped_size - bitmap_offset)) { ray_vm_unmap_file(ptr, mapped_size); return ray_error("corrupt", NULL); } @@ -791,6 +910,7 @@ static ray_t* col_validate_mapped(const char* path, col_mapped_t* out) { if (hdr->type == RAY_SYM) { uint32_t saved_sc; memcpy(&saved_sc, (const char*)ptr + offsetof(ray_t, rc), sizeof(saved_sc)); + out->saved_sym_count = saved_sc; uint32_t cur_sc = ray_sym_count(); if (saved_sc > 0 && cur_sc > 0 && cur_sc < saved_sc) { ray_vm_unmap_file(ptr, mapped_size); @@ -803,6 +923,7 @@ static ray_t* col_validate_mapped(const char* path, col_mapped_t* out) { out->header = hdr; out->esz = esz; out->data_size = data_size; + out->bitmap_offset = bitmap_offset; out->has_ext_nullmap = has_ext_nullmap; out->bitmap_len = bitmap_len; return NULL; /* success */ @@ -819,7 +940,7 @@ static ray_t* col_restore_ext_nullmap(ray_t* vec, const col_mapped_t* cm) { ray_t* ext = ray_vec_new(RAY_U8, 
(int64_t)cm->bitmap_len); if (!ext || RAY_IS_ERR(ext)) return ray_error("oom", NULL); ext->len = (int64_t)cm->bitmap_len; - memcpy(ray_data(ext), (char*)cm->mapped + 32 + cm->data_size, cm->bitmap_len); + memcpy(ray_data(ext), (char*)cm->mapped + cm->bitmap_offset, cm->bitmap_len); vec->ext_nullmap = ext; return NULL; /* success */ } @@ -876,6 +997,16 @@ ray_t* ray_col_load(const char* path) { uint8_t saved_order = vec->order; /* preserve buddy order */ memcpy(vec, cm.mapped, 32 + cm.data_size); + if (vec->type == RAY_STR) { + ray_t* pool = col_copy_str_pool(&cm); + if (pool && RAY_IS_ERR(pool)) { + ray_vm_unmap_file(cm.mapped, cm.mapped_size); + ray_free(vec); + return pool; + } + vec->str_pool = pool; + } + /* Restore external nullmap if present */ if (cm.has_ext_nullmap) { ray_t* ext_err = col_restore_ext_nullmap(vec, &cm); @@ -920,7 +1051,7 @@ ray_t* ray_col_load(const char* path) { * ray_release -> ray_free -> munmap. * -------------------------------------------------------------------------- */ -ray_t* ray_col_mmap(const char* path) { +static ray_t* col_mmap_impl(const char* path, bool trust_splayed_sym_count) { if (!path) return ray_error("io", NULL); col_mapped_t cm = {0}; @@ -929,7 +1060,7 @@ ray_t* ray_col_mmap(const char* path) { /* Validate that file size matches expected layout exactly. * ray_free() reconstructs the munmap size using the same formula. */ - size_t expected = 32 + cm.data_size + cm.bitmap_len; + size_t expected = cm.bitmap_offset + cm.bitmap_len; if (expected != cm.mapped_size) { ray_vm_unmap_file(cm.mapped, cm.mapped_size); return ray_error("io", NULL); @@ -937,13 +1068,27 @@ ray_t* ray_col_mmap(const char* path) { ray_t* vec = cm.header; - /* RAY_SYM: bounds check on data */ + /* RAY_SYM: generic mmap validates indices by scanning the data. 
Splayed + * reads are the trusted local table format, so opening them must only map + * files and validate headers; walking every symbol column would turn mmap + * open into an eager cold-disk scan. New files carry a saved symbol count + * for an O(1) reject. Older files have zero there, so they rely on the + * loaded global symfile contract instead. */ if (vec->type == RAY_SYM) { - ray_err_t sym_err = validate_sym_bounds( - (const char*)cm.mapped + 32, vec->len, vec->attrs, ray_sym_count()); - if (sym_err != RAY_OK) { + uint32_t cur_sc = ray_sym_count(); + uint32_t trusted_sc = trust_splayed_sym_count ? ray_sym_persisted_count() : cur_sc; + if (trust_splayed_sym_count && cm.saved_sym_count > trusted_sc) { ray_vm_unmap_file(cm.mapped, cm.mapped_size); - return ray_error(ray_err_code_str(sym_err), NULL); + return ray_error("corrupt", NULL); + } + bool skip_bounds = trust_splayed_sym_count && trusted_sc > 0; + if (!skip_bounds) { + ray_err_t sym_err = validate_sym_bounds( + (const char*)cm.mapped + 32, vec->len, vec->attrs, cur_sc); + if (sym_err != RAY_OK) { + ray_vm_unmap_file(cm.mapped, cm.mapped_size); + return ray_error(ray_err_code_str(sym_err), NULL); + } } } @@ -965,6 +1110,15 @@ ray_t* ray_col_mmap(const char* path) { vec->attrs &= ~RAY_ATTR_NULLMAP_EXT; ray_atomic_store(&vec->rc, 1); + if (vec->type == RAY_STR) { + ray_t* pool = (ray_t*)((char*)cm.mapped + cm.str_pool_offset); + pool->mmod = 2; + pool->order = 0; + pool->attrs &= (uint8_t)~RAY_ATTR_SLICE; + ray_atomic_store(&pool->rc, 1); + vec->str_pool = pool; + } + /* Reattach link sidecar if present. Without this, linked columns * round-tripped through splay-mmap (splay.c:184) lose HAS_LINK * even though ray_col_load restores it. 
*/ @@ -972,3 +1126,11 @@ ray_t* ray_col_mmap(const char* path) { return vec; } + +ray_t* ray_col_mmap(const char* path) { + return col_mmap_impl(path, false); +} + +ray_t* ray_col_mmap_splayed(const char* path) { + return col_mmap_impl(path, true); +} diff --git a/src/store/col.h b/src/store/col.h index 55f492b0..d4a79020 100644 --- a/src/store/col.h +++ b/src/store/col.h @@ -28,7 +28,9 @@ /* Column file I/O */ ray_err_t ray_col_save(ray_t* vec, const char* path); +ray_err_t ray_col_save_bulk(ray_t* vec, const char* path); ray_t* ray_col_load(const char* path); ray_t* ray_col_mmap(const char* path); +ray_t* ray_col_mmap_splayed(const char* path); #endif /* RAY_COL_H */ diff --git a/src/store/part.c b/src/store/part.c index a00648a5..c8f5caaf 100644 --- a/src/store/part.c +++ b/src/store/part.c @@ -27,13 +27,14 @@ #define _GNU_SOURCE #endif #include "part.h" -#include "core/platform.h" #include "mem/sys.h" #include "ops/ops.h" #include "store/splay.h" #include "table/sym.h" #include #include +#include +#include #include #include @@ -126,6 +127,7 @@ static ray_err_t collect_part_dirs(const char* db_root, char*** out_dirs, char** part_dirs = NULL; int64_t part_count = 0; int64_t part_cap = 0; + ray_err_t err = RAY_OK; struct dirent* ent; while ((ent = readdir(d)) != NULL) { @@ -144,18 +146,26 @@ static ray_err_t collect_part_dirs(const char* db_root, char*** out_dirs, if (part_count >= part_cap) { part_cap = part_cap == 0 ? 
16 : part_cap * 2; - char** tmp = (char**)ray_sys_realloc(part_dirs, (size_t)part_cap * sizeof(char*)); - if (!tmp) break; + char** tmp = (char**)realloc(part_dirs, (size_t)part_cap * sizeof(char*)); + if (!tmp) { err = RAY_ERR_OOM; break; } part_dirs = tmp; } - char* dup = ray_sys_strdup(ent->d_name); - if (!dup) break; + size_t len = strlen(ent->d_name); + char* dup = (char*)malloc(len + 1); + if (!dup) { err = RAY_ERR_OOM; break; } + memcpy(dup, ent->d_name, len + 1); part_dirs[part_count++] = dup; } closedir(d); + if (err != RAY_OK) { + for (int64_t i = 0; i < part_count; i++) free(part_dirs[i]); + free(part_dirs); + return err; + } + if (part_count == 0) { - ray_sys_free(part_dirs); + free(part_dirs); return RAY_ERR_IO; } @@ -186,6 +196,9 @@ static ray_err_t collect_part_dirs(const char* db_root, char*** out_dirs, ray_t* ray_read_parted(const char* db_root, const char* table_name) { if (!db_root || !table_name) return ray_error("io", NULL); + bool trace = getenv("RAY_CSV_TRACE") != NULL; + if (trace) + fprintf(stderr, "parted.get: root=%s table=%s\n", db_root, table_name); /* Validate table_name: no path separators or traversal */ if (strchr(table_name, '/') || strchr(table_name, '\\') || @@ -212,7 +225,14 @@ ray_t* ray_read_parted(const char* db_root, const char* table_name) { char** part_dirs = NULL; int64_t part_count = 0; ray_err_t collect_err = collect_part_dirs(db_root, &part_dirs, &part_count, true); - if (collect_err != RAY_OK) return ray_error("io", NULL); + if (collect_err != RAY_OK) { + if (trace) + fprintf(stderr, "parted.get: collect dirs failed err=%s\n", + ray_err_code_str(collect_err)); + return ray_error("io", NULL); + } + if (trace) + fprintf(stderr, "parted.get: parts=%" PRId64 "\n", part_count); /* Open each partition via ray_read_splayed */ ray_t** part_tables = (ray_t**)ray_sys_alloc((size_t)part_count * sizeof(ray_t*)); @@ -228,6 +248,11 @@ ray_t* ray_read_parted(const char* db_root, const char* table_name) { } part_tables[p] = 
ray_read_splayed(path, NULL); if (!part_tables[p] || RAY_IS_ERR(part_tables[p])) { + if (trace) + fprintf(stderr, "parted.get: splayed load failed part=%" PRId64 " path=%s err=%s\n", + p, path, + part_tables[p] && RAY_IS_ERR(part_tables[p]) + ? ray_err_code(part_tables[p]) : "io"); part_tables[p] = NULL; goto fail_tables; } @@ -235,7 +260,13 @@ ray_t* ray_read_parted(const char* db_root, const char* table_name) { /* Get schema from first partition */ int64_t ncols = ray_table_ncols(part_tables[0]); - if (ncols <= 0) goto fail_tables; + if (ncols <= 0) { + if (trace) + fprintf(stderr, "parted.get: empty first partition\n"); + goto fail_tables; + } + if (trace) + fprintf(stderr, "parted.get: ncols=%" PRId64 "\n", ncols); /* Infer MAPCOMMON sub-type from partition directory names */ uint8_t mc_type = infer_mc_type(part_dirs, part_count); @@ -322,6 +353,8 @@ ray_t* ray_read_parted(const char* db_root, const char* table_name) { ray_t* parted = ray_alloc((size_t)part_count * sizeof(ray_t*)); if (!parted || RAY_IS_ERR(parted)) { + if (trace) + fprintf(stderr, "parted.get: alloc failed col=%" PRId64 "\n", c); ray_release(result); goto fail_tables; } @@ -339,8 +372,6 @@ ray_t* ray_read_parted(const char* db_root, const char* table_name) { } ray_retain(seg); segs[p] = seg; - ray_vm_advise_willneed(ray_data(seg), - (size_t)seg->len * ray_sym_elem_size(seg->type, seg->attrs)); } result = ray_table_add_col(result, name_id, parted); @@ -351,14 +382,16 @@ ray_t* ray_read_parted(const char* db_root, const char* table_name) { /* Release partition sub-tables (segment vectors survive via retain) */ for (int64_t p = 0; p < part_count; p++) { if (part_tables[p]) ray_release(part_tables[p]); - ray_sys_free(part_dirs[p]); + free(part_dirs[p]); } ray_sys_free(part_tables); - ray_sys_free(part_dirs); + free(part_dirs); return result; fail_tables: + if (trace) + fprintf(stderr, "parted.get: failed\n"); for (int64_t p = 0; p < part_count; p++) { if (part_tables[p] && 
!RAY_IS_ERR(part_tables[p])) ray_release(part_tables[p]); @@ -367,8 +400,8 @@ ray_t* ray_read_parted(const char* db_root, const char* table_name) { fail_dirs: for (int64_t p = 0; p < part_count; p++) - ray_sys_free(part_dirs[p]); - ray_sys_free(part_dirs); + free(part_dirs[p]); + free(part_dirs); return ray_error("io", NULL); } diff --git a/src/store/splay.c b/src/store/splay.c index 91417bd5..2b71586e 100644 --- a/src/store/splay.c +++ b/src/store/splay.c @@ -24,8 +24,11 @@ #include "splay.h" #include "store/col.h" #include "store/fileio.h" +#include "table/sym.h" #include #include +#include +#include /* -------------------------------------------------------------------------- * Splayed table: directory of column files + .d schema file @@ -59,7 +62,8 @@ static ray_err_t validate_sym_columns(ray_t* tbl, int64_t schema_ncols) { * ray_splay_save — save a table to a splayed table directory * -------------------------------------------------------------------------- */ -ray_err_t ray_splay_save(ray_t* tbl, const char* dir, const char* sym_path) { +static ray_err_t splay_save_impl(ray_t* tbl, const char* dir, const char* sym_path, + bool durable) { if (!tbl || RAY_IS_ERR(tbl)) return RAY_ERR_TYPE; if (!dir) return RAY_ERR_IO; @@ -71,7 +75,7 @@ ray_err_t ray_splay_save(ray_t* tbl, const char* dir, const char* sym_path) { /* Save symbol table if sym_path provided */ if (sym_path) { - ray_err_t sym_err = ray_sym_save(sym_path); + ray_err_t sym_err = durable ? ray_sym_save(sym_path) : ray_sym_save_bulk(sym_path); if (sym_err != RAY_OK) return sym_err; } @@ -83,7 +87,7 @@ ray_err_t ray_splay_save(ray_t* tbl, const char* dir, const char* sym_path) { char path[1024]; int path_len = snprintf(path, sizeof(path), "%s/.d", dir); if (path_len < 0 || (size_t)path_len >= sizeof(path)) return RAY_ERR_RANGE; - ray_err_t err = ray_col_save(schema, path); + ray_err_t err = durable ? 
ray_col_save(schema, path) : ray_col_save_bulk(schema, path); if (err != RAY_OK) return err; } @@ -110,7 +114,7 @@ ray_err_t ray_splay_save(ray_t* tbl, const char* dir, const char* sym_path) { int path_len = snprintf(path, sizeof(path), "%s/%.*s", dir, (int)name_len, name); if (path_len < 0 || (size_t)path_len >= sizeof(path)) return RAY_ERR_RANGE; - ray_err_t err = ray_col_save(col, path); + ray_err_t err = durable ? ray_col_save(col, path) : ray_col_save_bulk(col, path); /* On partial failure, columns 0..c-1 remain on disk. * Caller should clean up or use atomic rename for safe writes. */ if (err != RAY_OK) return err; @@ -119,6 +123,14 @@ ray_err_t ray_splay_save(ray_t* tbl, const char* dir, const char* sym_path) { return RAY_OK; } +ray_err_t ray_splay_save(ray_t* tbl, const char* dir, const char* sym_path) { + return splay_save_impl(tbl, dir, sym_path, true); +} + +ray_err_t ray_splay_save_bulk(ray_t* tbl, const char* dir, const char* sym_path) { + return splay_save_impl(tbl, dir, sym_path, false); +} + /* -------------------------------------------------------------------------- * splay_load_impl — shared implementation for ray_splay_load / ray_read_splayed * @@ -129,6 +141,9 @@ ray_err_t ray_splay_save(ray_t* tbl, const char* dir, const char* sym_path) { static ray_t* splay_load_impl(const char* dir, const char* sym_path, bool use_mmap) { if (!dir) return ray_error("io", NULL); + bool trace = getenv("RAY_CSV_TRACE") != NULL; + if (trace) + fprintf(stderr, "splayed.get: dir=%s mmap=%d\n", dir, use_mmap ? 
1 : 0); /* Load symbol table if sym_path provided */ if (sym_path) { @@ -142,7 +157,12 @@ static ray_t* splay_load_impl(const char* dir, const char* sym_path, bool use_mm if (path_len < 0 || (size_t)path_len >= sizeof(path)) return ray_error("range", NULL); ray_t* schema = ray_col_load(path); - if (!schema || RAY_IS_ERR(schema)) return schema; + if (!schema || RAY_IS_ERR(schema)) { + if (trace) + fprintf(stderr, "splayed.get: schema load failed path=%s err=%s\n", + path, schema && RAY_IS_ERR(schema) ? ray_err_code(schema) : "io"); + return schema; + } int64_t ncols = schema->len; int64_t* name_ids = (int64_t*)ray_data(schema); @@ -160,6 +180,9 @@ static ray_t* splay_load_impl(const char* dir, const char* sym_path, bool use_mm if (!name_atom) { /* Schema references a sym ID that doesn't exist — sym table * is stale or wrong for this data. */ + if (trace) + fprintf(stderr, "splayed.get: missing schema symbol col=%" PRId64 " id=%" PRId64 "\n", + c, name_id); ray_release(schema); ray_release(tbl); return ray_error("corrupt", NULL); @@ -185,7 +208,7 @@ static ray_t* splay_load_impl(const char* dir, const char* sym_path, bool use_mm return ray_error("range", NULL); } - ray_t* col = use_mmap ? ray_col_mmap(path) : ray_col_load(path); + ray_t* col = use_mmap ? ray_col_mmap_splayed(path) : ray_col_load(path); if (use_mmap && col && RAY_IS_ERR(col) && strcmp(ray_err_code(col), "nyi") == 0) { /* ray_release on an error object is a no-op (rayforce.h:180); @@ -195,6 +218,9 @@ static ray_t* splay_load_impl(const char* dir, const char* sym_path, bool use_mm col = ray_col_load(path); } if (!col || RAY_IS_ERR(col)) { + if (trace) + fprintf(stderr, "splayed.get: col load failed path=%s err=%s\n", + path, col && RAY_IS_ERR(col) ? ray_err_code(col) : "io"); ray_release(schema); ray_release(tbl); return col ? 
col : ray_error("io", NULL); diff --git a/src/store/splay.h b/src/store/splay.h index 8648bf14..7b69529b 100644 --- a/src/store/splay.h +++ b/src/store/splay.h @@ -28,6 +28,7 @@ /* Splayed table I/O */ ray_err_t ray_splay_save(ray_t* tbl, const char* dir, const char* sym_path); +ray_err_t ray_splay_save_bulk(ray_t* tbl, const char* dir, const char* sym_path); ray_t* ray_splay_load(const char* dir, const char* sym_path); ray_t* ray_read_splayed(const char* dir, const char* sym_path); diff --git a/src/table/sym.c b/src/table/sym.c index 30b4b928..e7a859fb 100644 --- a/src/table/sym.c +++ b/src/table/sym.c @@ -41,6 +41,8 @@ #define SYM_INIT_CAP 256 #define SYM_LOAD_FACTOR 0.7 +#define SYM_STRL_MAGIC 0x4C525453U /* "STRL" */ +#define SYM_LAZY_LOAD_MIN_BYTES (64u * 1024u * 1024u) /* Cached segment list for a dotted sym: nsegs sym_ids that together make up * the dotted path. segs is arena-allocated (same lifetime as sym table). */ @@ -76,6 +78,13 @@ typedef struct { /* Persistence: entries [0..persisted_count-1] are known on disk */ uint32_t persisted_count; + /* Large on-disk dictionaries stay mapped and are materialized by id. */ + uint8_t* lazy_map; + size_t lazy_size; + uint32_t lazy_next_id; + const uint8_t* lazy_ptr; + size_t lazy_remaining; + /* Arena for string atoms — avoids per-string buddy allocator calls */ ray_arena_t* arena; } sym_table_t; @@ -235,6 +244,15 @@ ray_err_t ray_sym_init(void) { void ray_sym_destroy(void) { if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return; + if (g_sym.lazy_map) { + ray_vm_unmap_file(g_sym.lazy_map, g_sym.lazy_size); + g_sym.lazy_map = NULL; + g_sym.lazy_size = 0; + g_sym.lazy_next_id = 0; + g_sym.lazy_ptr = NULL; + g_sym.lazy_remaining = 0; + } + /* Arena-backed strings: ray_release is a no-op (RAY_ATTR_ARENA). * Destroy the arena to free all string atoms at once. * segments[i].segs pointers are arena-allocated too, freed with it. 
*/ @@ -312,6 +330,7 @@ static bool sym_grow_str_cap(uint32_t new_cap) { ray_t** new_strings = (ray_t**)ray_sys_realloc(g_sym.strings, (size_t)new_cap * sizeof(ray_t*)); if (!new_strings) return false; + memset(new_strings + old_cap, 0, (size_t)(new_cap - old_cap) * sizeof(ray_t*)); g_sym.strings = new_strings; uint32_t old_bm_words = (old_cap + 63) / 64; @@ -351,6 +370,7 @@ static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len); static int64_t sym_probe(uint32_t hash, const char* str, size_t len); static int64_t sym_commit_new(uint32_t hash, const char* str, size_t len); static bool sym_reserve_capacity(uint32_t new_sym_count, size_t arena_bytes); +static bool sym_lazy_materialize_to_locked(uint32_t target_id); /* -------------------------------------------------------------------------- * sym_cache_segments — idempotent cache-and-apply for an EXISTING sym. @@ -572,6 +592,51 @@ static bool sym_reserve_capacity(uint32_t new_sym_count, size_t arena_bytes) { return true; } +static void sym_lazy_unmap_locked(void) { + if (!g_sym.lazy_map) return; + ray_vm_unmap_file(g_sym.lazy_map, g_sym.lazy_size); + g_sym.lazy_map = NULL; + g_sym.lazy_size = 0; + g_sym.lazy_next_id = 0; + g_sym.lazy_ptr = NULL; + g_sym.lazy_remaining = 0; +} + +static bool sym_lazy_materialize_to_locked(uint32_t target_id) { + if (!g_sym.lazy_map) return false; + if (target_id >= g_sym.persisted_count) return false; + if (target_id < g_sym.lazy_next_id) return g_sym.strings[target_id] != NULL; + + while (g_sym.lazy_next_id <= target_id) { + if (g_sym.lazy_remaining < 4) return false; + uint32_t slen; + memcpy(&slen, g_sym.lazy_ptr, 4); + g_sym.lazy_ptr += 4; + g_sym.lazy_remaining -= 4; + if ((size_t)slen > g_sym.lazy_remaining) return false; + + uint32_t id = g_sym.lazy_next_id; + const char* sp = (const char*)g_sym.lazy_ptr; + ray_t* existing = g_sym.strings[id]; + if (existing) { + if (ray_str_len(existing) != (size_t)slen || + memcmp(ray_str_ptr(existing), sp, slen) != 0) + 
return false; + } else { + ray_t* s = sym_str_arena(g_sym.arena, sp, (size_t)slen); + if (!s) return false; + g_sym.strings[id] = s; + ht_insert(g_sym.buckets, g_sym.bucket_cap, + (uint32_t)ray_hash_bytes(sp, (size_t)slen), id); + } + + g_sym.lazy_ptr += slen; + g_sym.lazy_remaining -= slen; + g_sym.lazy_next_id++; + } + return true; +} + /* -------------------------------------------------------------------------- * sym_intern_nolock — fully atomic intern. * @@ -744,6 +809,12 @@ int64_t ray_sym_intern_no_split(const char* str, size_t len) { return id; } +int64_t ray_sym_intern_no_split_unlocked(const char* str, size_t len) { + if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return -1; + uint32_t hash = (uint32_t)ray_hash_bytes(str, len); + return sym_intern_nolock_noseg(hash, str, len); +} + /* -------------------------------------------------------------------------- * ray_sym_rebuild_segments — populate dotted cache for any not-yet-cached * entries. Must follow a batch of ray_sym_intern_no_split calls. @@ -844,6 +915,12 @@ ray_t* ray_sym_str(int64_t id) { /* Lock required: concurrent ray_sym_intern may realloc g_sym.strings. 
*/ sym_lock(); if (id < 0 || (uint32_t)id >= g_sym.str_count) { sym_unlock(); return NULL; } + if (!g_sym.strings[id] && (uint32_t)id < g_sym.persisted_count) { + if (!sym_lazy_materialize_to_locked((uint32_t)id)) { + sym_unlock(); + return NULL; + } + } ray_t* s = g_sym.strings[id]; sym_unlock(); return s; @@ -863,6 +940,15 @@ uint32_t ray_sym_count(void) { return count; } +uint32_t ray_sym_persisted_count(void) { + if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return 0; + + sym_lock(); + uint32_t count = g_sym.persisted_count; + sym_unlock(); + return count; +} + /* -------------------------------------------------------------------------- * ray_sym_strings_borrow * @@ -884,6 +970,9 @@ void ray_sym_strings_borrow(ray_t*** out_strings, uint32_t* out_count) { if (out_count) *out_count = 0; if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return; sym_lock(); + if (g_sym.lazy_map && g_sym.persisted_count > 0) { + (void)sym_lazy_materialize_to_locked(g_sym.persisted_count - 1); + } if (out_strings) *out_strings = g_sym.strings; if (out_count) *out_count = g_sym.str_count; sym_unlock(); @@ -950,7 +1039,7 @@ bool ray_sym_ensure_cap(uint32_t needed) { * when persisted_count == str_count. 
* -------------------------------------------------------------------------- */ -ray_err_t ray_sym_save(const char* path) { +static ray_err_t sym_save_impl(const char* path, bool durable) { if (!path) return RAY_ERR_IO; if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return RAY_ERR_IO; @@ -1076,29 +1165,39 @@ ray_err_t ray_sym_save(const char* path) { memcpy(snap, g_sym.strings, snap_sz); sym_unlock(); - /* Build RAY_LIST of -RAY_STR from snapshot */ - ray_t* list = ray_list_new((int64_t)count); - if (!list || RAY_IS_ERR(list)) { - ray_free(snap_block); - ray_file_unlock(lock_fd); - ray_file_close(lock_fd); - return RAY_ERR_OOM; - } - - for (uint32_t i = 0; i < count; i++) { - list = ray_list_append(list, snap[i]); - if (!list || RAY_IS_ERR(list)) { + /* Save STRL directly instead of first materializing a giant RAY_LIST. */ + { + FILE* f = fopen(tmp_path, "wb"); + if (!f) { ray_free(snap_block); ray_file_unlock(lock_fd); ray_file_close(lock_fd); - return RAY_ERR_OOM; + return RAY_ERR_IO; } + uint32_t magic = SYM_STRL_MAGIC; + int64_t n64 = (int64_t)count; + err = RAY_OK; + if (fwrite(&magic, 4, 1, f) != 1 || + fwrite(&n64, 8, 1, f) != 1) { + err = RAY_ERR_IO; + } else { + for (uint32_t i = 0; i < count; i++) { + ray_t* s = snap[i]; + if (!s || s->type != -RAY_STR) { err = RAY_ERR_CORRUPT; break; } + const char* sp = ray_str_ptr(s); + size_t slen = ray_str_len(s); + if (slen > UINT32_MAX) { err = RAY_ERR_RANGE; break; } + uint32_t len32 = (uint32_t)slen; + if (fwrite(&len32, 4, 1, f) != 1 || + (slen > 0 && fwrite(sp, 1, slen, f) != slen)) { + err = RAY_ERR_IO; + break; + } + } + } + if (fclose(f) != 0 && err == RAY_OK) err = RAY_ERR_IO; } ray_free(snap_block); - - /* Save to temp file via ray_col_save (writes STRL format) */ - err = ray_col_save(list, tmp_path); - ray_release(list); if (err != RAY_OK) { remove(tmp_path); ray_file_unlock(lock_fd); @@ -1106,21 +1205,23 @@ ray_err_t ray_sym_save(const char* path) { return err; } - /* Fsync temp file for 
durability */ - ray_fd_t tmp_fd = ray_file_open(tmp_path, RAY_OPEN_READ | RAY_OPEN_WRITE); - if (tmp_fd == RAY_FD_INVALID) { - remove(tmp_path); - ray_file_unlock(lock_fd); - ray_file_close(lock_fd); - return RAY_ERR_IO; - } - err = ray_file_sync(tmp_fd); - ray_file_close(tmp_fd); - if (err != RAY_OK) { - remove(tmp_path); - ray_file_unlock(lock_fd); - ray_file_close(lock_fd); - return err; + if (durable) { + /* Fsync temp file for durability */ + ray_fd_t tmp_fd = ray_file_open(tmp_path, RAY_OPEN_READ | RAY_OPEN_WRITE); + if (tmp_fd == RAY_FD_INVALID) { + remove(tmp_path); + ray_file_unlock(lock_fd); + ray_file_close(lock_fd); + return RAY_ERR_IO; + } + err = ray_file_sync(tmp_fd); + ray_file_close(tmp_fd); + if (err != RAY_OK) { + remove(tmp_path); + ray_file_unlock(lock_fd); + ray_file_close(lock_fd); + return err; + } } /* Atomic rename: tmp -> final path */ @@ -1132,13 +1233,15 @@ ray_err_t ray_sym_save(const char* path) { return err; } - /* Fsync parent directory so the new directory entry is durable. - * Without this, a crash after rename can lose the new file. */ - err = ray_file_sync_dir(path); - if (err != RAY_OK) { - ray_file_unlock(lock_fd); - ray_file_close(lock_fd); - return err; + if (durable) { + /* Fsync parent directory so the new directory entry is durable. + * Without this, a crash after rename can lose the new file. 
*/ + err = ray_file_sync_dir(path); + if (err != RAY_OK) { + ray_file_unlock(lock_fd); + ray_file_close(lock_fd); + return err; + } } /* Update persisted count */ @@ -1151,6 +1254,14 @@ ray_err_t ray_sym_save(const char* path) { return RAY_OK; } +ray_err_t ray_sym_save(const char* path) { + return sym_save_impl(path, true); +} + +ray_err_t ray_sym_save_bulk(const char* path) { + return sym_save_impl(path, false); +} + /* -------------------------------------------------------------------------- * ray_sym_load -- load symbol table from RAY_LIST file (STRL format) * @@ -1189,22 +1300,90 @@ ray_err_t ray_sym_load(const char* path) { if (err != RAY_OK) { ray_file_close(lock_fd); return err; } } - /* Load the sym file as a RAY_LIST of -RAY_STR */ - ray_t* list = ray_col_load(path); - if (!list || RAY_IS_ERR(list)) { - ray_err_t code = RAY_IS_ERR(list) ? ray_err_from_obj(list) : RAY_ERR_IO; + size_t mapped_size = 0; + uint8_t* mapped = (uint8_t*)ray_vm_map_file(path, &mapped_size); + if (!mapped) { ray_file_unlock(lock_fd); ray_file_close(lock_fd); - return code; + return RAY_ERR_IO; + } + if (mapped_size < 12) { + ray_vm_unmap_file(mapped, mapped_size); + ray_file_unlock(lock_fd); + ray_file_close(lock_fd); + return RAY_ERR_CORRUPT; + } + uint32_t magic; + memcpy(&magic, mapped, 4); + if (magic != SYM_STRL_MAGIC) { + ray_vm_unmap_file(mapped, mapped_size); + ray_file_unlock(lock_fd); + ray_file_close(lock_fd); + return RAY_ERR_CORRUPT; } - if (list->type != RAY_LIST || list->len > UINT32_MAX) { - ray_release(list); + int64_t disk_count; + memcpy(&disk_count, mapped + 4, 8); + if (disk_count < 0 || disk_count > UINT32_MAX) { + ray_vm_unmap_file(mapped, mapped_size); ray_file_unlock(lock_fd); ray_file_close(lock_fd); return RAY_ERR_CORRUPT; } + if (mapped_size >= SYM_LAZY_LOAD_MIN_BYTES) { + sym_lock(); + uint32_t current = g_sym.str_count; + uint32_t persisted = g_sym.persisted_count; + uint32_t disk_u = (uint32_t)disk_count; + if (disk_count < (int64_t)persisted) { + 
sym_unlock(); + ray_vm_unmap_file(mapped, mapped_size); + ray_file_unlock(lock_fd); + ray_file_close(lock_fd); + return RAY_ERR_CORRUPT; + } + + uint32_t target_count = current > disk_u ? current : disk_u; + if (target_count > current && !sym_reserve_capacity(target_count - current, 0)) { + sym_unlock(); + ray_vm_unmap_file(mapped, mapped_size); + ray_file_unlock(lock_fd); + ray_file_close(lock_fd); + return RAY_ERR_OOM; + } + if (disk_u > current) { + memset(g_sym.strings + current, 0, + ((size_t)disk_u - current) * sizeof(ray_t*)); + } + + sym_lazy_unmap_locked(); + g_sym.lazy_map = mapped; + g_sym.lazy_size = mapped_size; + g_sym.lazy_next_id = 0; + g_sym.lazy_ptr = mapped + 12; + g_sym.lazy_remaining = mapped_size - 12; + g_sym.str_count = target_count; + g_sym.persisted_count = disk_u; + + uint32_t validate_count = current < disk_u ? current : disk_u; + bool ok = validate_count == 0 || + sym_lazy_materialize_to_locked(validate_count - 1); + sym_unlock(); + if (!ok) { + sym_lock(); + sym_lazy_unmap_locked(); + sym_unlock(); + ray_file_unlock(lock_fd); + ray_file_close(lock_fd); + return RAY_ERR_CORRUPT; + } + + ray_file_unlock(lock_fd); + ray_file_close(lock_fd); + return RAY_OK; + } + /* Validate existing entries match, then intern remaining. * Use persisted_count (not str_count) as the already-loaded prefix: * runtime code may ray_sym_intern transient names that were never @@ -1213,71 +1392,71 @@ ray_err_t ray_sym_load(const char* path) { sym_lock(); uint32_t already = g_sym.persisted_count; sym_unlock(); - ray_t** slots = (ray_t**)ray_data(list); /* Reject stale/truncated sym file: if disk has fewer entries than what * we previously loaded from disk, the file is outdated or truncated. 
*/ - if (already > 0 && list->len < (int64_t)already) { - ray_release(list); + if (already > 0 && disk_count < (int64_t)already) { + ray_vm_unmap_file(mapped, mapped_size); ray_file_unlock(lock_fd); ray_file_close(lock_fd); return RAY_ERR_CORRUPT; } - /* Validate entries [0..already-1] match the persisted prefix */ - for (int64_t i = 0; i < (int64_t)already && i < list->len; i++) { - ray_t* s = slots[i]; - if (!s || RAY_IS_ERR(s) || s->type != -RAY_STR) { - ray_release(list); + const uint8_t* ptr = mapped + 12; + size_t remaining = mapped_size - 12; + for (int64_t i = 0; i < disk_count; i++) { + if (remaining < 4) { + ray_vm_unmap_file(mapped, mapped_size); ray_file_unlock(lock_fd); ray_file_close(lock_fd); return RAY_ERR_CORRUPT; } - ray_t* mem_s = ray_sym_str(i); - if (!mem_s || ray_str_len(mem_s) != ray_str_len(s) || - memcmp(ray_str_ptr(mem_s), ray_str_ptr(s), ray_str_len(s)) != 0) { - ray_release(list); + uint32_t slen; + memcpy(&slen, ptr, 4); + ptr += 4; + remaining -= 4; + if ((size_t)slen > remaining) { + ray_vm_unmap_file(mapped, mapped_size); ray_file_unlock(lock_fd); ray_file_close(lock_fd); return RAY_ERR_CORRUPT; } - } - /* Intern entries beyond what's already in memory. - * Verify each entry's in-memory ID matches its disk position: - * if transient runtime-interned symbols already occupy these - * slots, the disk entries would get wrong IDs, causing RAY_SYM - * columns to resolve the wrong strings. */ - for (int64_t i = (int64_t)already; i < list->len; i++) { - ray_t* s = slots[i]; - if (!s || RAY_IS_ERR(s) || s->type != -RAY_STR) { - ray_release(list); - ray_file_unlock(lock_fd); - ray_file_close(lock_fd); - return RAY_ERR_CORRUPT; - } - /* Bulk load MUST use the no-split variant so that loading a disk - * entry like "user.name" doesn't recursively intern "user" + "name" - * mid-loop and shift subsequent disk positions — that would break - * the id==i contract below. Segment cache is populated in one - * pass after the loop finishes. 
*/ - int64_t id = ray_sym_intern_no_split(ray_str_ptr(s), ray_str_len(s)); - if (id < 0) { - ray_release(list); - ray_file_unlock(lock_fd); - ray_file_close(lock_fd); - return RAY_ERR_OOM; - } - if (id != i) { - /* ID mismatch: disk position i was assigned in-memory - * id != i, meaning a transient symbol occupies the slot. - * The sym table has diverged from disk; continuing would - * cause RAY_SYM columns to resolve wrong strings. */ - ray_release(list); - ray_file_unlock(lock_fd); - ray_file_close(lock_fd); - return RAY_ERR_CORRUPT; + const char* sp = (const char*)ptr; + if (i < (int64_t)already) { + ray_t* mem_s = ray_sym_str(i); + if (!mem_s || ray_str_len(mem_s) != (size_t)slen || + memcmp(ray_str_ptr(mem_s), sp, slen) != 0) { + ray_vm_unmap_file(mapped, mapped_size); + ray_file_unlock(lock_fd); + ray_file_close(lock_fd); + return RAY_ERR_CORRUPT; + } + } else { + /* Bulk load uses no-split interning so dotted names cannot append + * segment symbols mid-stream and shift disk-position IDs. */ + int64_t id = ray_sym_intern_no_split(sp, (size_t)slen); + if (id < 0) { + ray_vm_unmap_file(mapped, mapped_size); + ray_file_unlock(lock_fd); + ray_file_close(lock_fd); + return RAY_ERR_OOM; + } + if (id != i) { + ray_vm_unmap_file(mapped, mapped_size); + ray_file_unlock(lock_fd); + ray_file_close(lock_fd); + return RAY_ERR_CORRUPT; + } } + ptr += slen; + remaining -= slen; + } + if (remaining != 0) { + ray_vm_unmap_file(mapped, mapped_size); + ray_file_unlock(lock_fd); + ray_file_close(lock_fd); + return RAY_ERR_CORRUPT; } /* Populate dotted cache for every loaded (and previously-loaded) sym. @@ -1287,7 +1466,7 @@ ray_err_t ray_sym_load(const char* path) { * namespace semantics on anything the user stored with a '.' in it. 
*/ ray_err_t rebuild_err = ray_sym_rebuild_segments(); if (rebuild_err != RAY_OK) { - ray_release(list); + ray_vm_unmap_file(mapped, mapped_size); ray_file_unlock(lock_fd); ray_file_close(lock_fd); return rebuild_err; @@ -1297,10 +1476,10 @@ ray_err_t ray_sym_load(const char* path) { * Use list->len (not str_count) because transient runtime-interned * symbols may exist beyond the persisted prefix. */ sym_lock(); - g_sym.persisted_count = (uint32_t)list->len; + g_sym.persisted_count = (uint32_t)disk_count; sym_unlock(); - ray_release(list); + ray_vm_unmap_file(mapped, mapped_size); ray_file_unlock(lock_fd); ray_file_close(lock_fd); return RAY_OK; diff --git a/src/table/sym.h b/src/table/sym.h index e55734c8..67c159bc 100644 --- a/src/table/sym.h +++ b/src/table/sym.h @@ -109,6 +109,7 @@ int ray_sym_segs(int64_t sym_id, const int64_t** out_segs); * appending entries mid-sequence. Callers MUST follow a batch of these * with ray_sym_rebuild_segments to populate the dotted cache. */ int64_t ray_sym_intern_no_split(const char* str, size_t len); +int64_t ray_sym_intern_no_split_unlocked(const char* str, size_t len); /* Walk the intern table and cache segment sym_ids for any dotted name * that hasn't been cached yet. Idempotent — safe to call multiple times. @@ -117,6 +118,15 @@ int64_t ray_sym_intern_no_split(const char* str, size_t len); * paths can abort instead of leaving dotted names silently un-cached. */ ray_err_t ray_sym_rebuild_segments(void); +/* Number of symbols loaded from or saved to the current on-disk dictionary. + * Runtime-only interned symbols may exist above this prefix. */ +uint32_t ray_sym_persisted_count(void); + +/* Save the same on-disk symbol format as ray_sym_save, but skip durability + * syncs. Intended for generated bulk-import caches where throughput matters + * more than crash recovery of a half-written target. 
*/ +ray_err_t ray_sym_save_bulk(const char* path); + /* Upper bound on the arena bytes that sym_str_arena consumes for a name * of the given length. Used by the three-phase atomic intern to pre- * reserve arena capacity, so the commit phase cannot fail partway. diff --git a/src/vec/vec.c b/src/vec/vec.c index dce29e09..16491f73 100644 --- a/src/vec/vec.c +++ b/src/vec/vec.c @@ -959,9 +959,11 @@ void ray_vec_set_null(ray_t* vec, int64_t idx, bool is_null) { static ray_t* str_pool_cow(ray_t* vec) { if (!vec->str_pool || RAY_IS_ERR(vec->str_pool)) return vec; uint32_t pool_rc = ray_atomic_load(&vec->str_pool->rc); - if (pool_rc <= 1) return vec; + if (pool_rc <= 1 && vec->str_pool->mmod == 0) return vec; - size_t pool_data_size = ((size_t)1 << vec->str_pool->order) - 32; + size_t pool_data_size = vec->str_pool->mmod == 0 + ? ((size_t)1 << vec->str_pool->order) - 32 + : (vec->str_pool->len > 64 ? (size_t)vec->str_pool->len : 64); ray_t* new_pool = ray_alloc(pool_data_size); if (!new_pool || RAY_IS_ERR(new_pool)) return NULL; diff --git a/test/rfl/strop/strlen.rfl b/test/rfl/strop/strlen.rfl new file mode 100644 index 00000000..80aa48eb --- /dev/null +++ b/test/rfl/strop/strlen.rfl @@ -0,0 +1,4 @@ +(strlen "abc") -- 3 +(strlen "") -- 0 +(strlen 'alpha) -- 5 +(sum (strlen ["aa" "bbb" ""])) -- 5 diff --git a/test/rfl/system/part.rfl b/test/rfl/system/part.rfl index bc928a4c..ddba95be 100644 --- a/test/rfl/system/part.rfl +++ b/test/rfl/system/part.rfl @@ -16,7 +16,7 @@ ;; on the second open. 
;; ────────────── pre-flight cleanup ────────────── -(.sys.exec "rm -rf /tmp/rfl_part_date /tmp/rfl_part_int /tmp/rfl_part_sym /tmp/rfl_part_single /tmp/rfl_part_empty /tmp/rfl_part_missing /tmp/rfl_part_three") +(.sys.exec "rm -rf /tmp/rfl_part_date /tmp/rfl_part_int /tmp/rfl_part_sym /tmp/rfl_part_single /tmp/rfl_part_empty /tmp/rfl_part_missing /tmp/rfl_part_three /tmp/rfl_part_cd /tmp/rfl_part_cd_sym /tmp/rfl_part_minute /tmp/rfl_part_like") (.sys.exec "mkdir -p /tmp/rfl_part_empty") ;; ────────────── date-partition path: RAY_MC_DATE branch ────────────── @@ -24,20 +24,26 @@ ;; table. Drives is_date_dir → all_date branch of infer_mc_type, ;; parse_date_dir for both partitions, the RAY_MC_DATE arm of ;; ray_read_parted (kv_type = RAY_DATE, mc_name = "date"). -(set DT-A (table [id val] (list [1 2 3] [10.0 20.0 30.0]))) -(set DT-B (table [id val] (list [4 5] [40.0 50.0]))) +(set DT-A (table [id d val] (list [1 2 3] [2024.01.01 2024.01.01 2024.01.02] [10.0 20.0 30.0]))) +(set DT-B (table [id d val] (list [4 5] [2024.01.03 2024.01.04] [40.0 50.0]))) (.db.splayed.set "/tmp/rfl_part_date/2024.01.01/t/" DT-A) (.db.splayed.set "/tmp/rfl_part_date/2024.01.02/t/" DT-B) (set Pd (.db.parted.get "/tmp/rfl_part_date/" 't)) -;; total rows = 3 + 2; columns = 1 partition-key + 2 data +;; total rows = 3 + 2; columns = 1 partition-key + 3 data (count Pd) -- 5 -(count (key Pd)) -- 3 +(count (key Pd)) -- 4 ;; partition-key column is named 'date when partitions are dates (first (key Pd)) -- 'date ;; data column names survive in their original order after the ;; partition-key prefix -(key Pd) -- ['date 'id 'val] +(key Pd) -- ['date 'id 'd 'val] +(sum (at Pd 'id)) -- 15 +(avg (at Pd 'val)) -- 30.0 +(min (at Pd 'd)) -- 2024.01.01 +(max (at Pd 'd)) -- 2024.01.04 +(count (distinct (at Pd 'id))) -- 5 +(sum (+ (at Pd 'id) 1)) -- 20 ;; ────────────── int-partition path: RAY_MC_I64 branch ────────────── ;; Pure-integer partition names — drives is_integer_str → all_int @@ -78,7 +84,7 @@ ;; 
────────────── single-partition root ────────────── ;; ray_read_parted has no part_count==1 fast-return; it always runs ;; the full builder. This case still covers the for-loop bodies with a -;; single iteration and the segment retain/willneed call. +;; single iteration and segment retaining. (set SP (table [x] (list [7 8 9 10]))) (.db.splayed.set "/tmp/rfl_part_single/2024.06.15/t/" SP) (set Psp (.db.parted.get "/tmp/rfl_part_single/" 't)) @@ -102,6 +108,50 @@ (count (key Pt)) -- 2 (first (key Pt)) -- 'date +;; ────────────── grouped count(distinct) over parted input ────────────── +;; Regression for the non-agg scatter path used by analytical grouped +;; count-distinct queries: parted inputs should not require flattening the +;; whole table, and the row->group map must still span all partitions. +(set CD-A (table [RegionID UserID payload] (list [1 1 2 2] [10 10 20 21] ["a" "b" "c" "d"]))) +(set CD-B (table [RegionID UserID payload] (list [2 2 3 3] [20 22 30 31] ["e" "f" "g" "h"]))) +(.db.splayed.set "/tmp/rfl_part_cd/2024.01.01/t/" CD-A) +(.db.splayed.set "/tmp/rfl_part_cd/2024.01.02/t/" CD-B) +(set Pcd (.db.parted.get "/tmp/rfl_part_cd/" 't)) +(set Rcd (select {RegionID: RegionID u: (count (distinct UserID)) from: Pcd by: RegionID desc: u take: 3})) +(at Rcd 'RegionID) -- [2 3 1] +(at Rcd 'u) -- [3 2 1] +(set CD-S1 (table [phrase user] (list ['skip 'alpha 'alpha 'beta] [1 2 3 4]))) +(set CD-S2 (table [phrase user] (list ['skip 'beta 'gamma 'gamma] [5 6 7 8]))) +(.db.splayed.set "/tmp/rfl_part_cd_sym/2024.02.01/s/" CD-S1) +(.db.splayed.set "/tmp/rfl_part_cd_sym/2024.02.02/s/" CD-S2) +(set Pcs (.db.parted.get "/tmp/rfl_part_cd_sym/" 's)) +(set Rcs (select {phrase: phrase c: (count phrase) from: Pcs where: (!= phrase 'skip) by: phrase desc: c take: 3})) +(at Rcs 'phrase) -- ['alpha 'beta 'gamma] +(at Rcs 'c) -- [2 2 2] + +(set TM-A (table [UserID EventTime phrase] (list [1 1 2] [00:01:00 00:01:30 00:02:00] ['a 'a 'b]))) +(set TM-B (table [UserID EventTime phrase] 
(list [1 3 3] [00:01:45 00:03:00 00:03:30] ['a 'c 'c]))) +(.db.splayed.set "/tmp/rfl_part_minute/2024.03.01/t/" TM-A) +(.db.splayed.set "/tmp/rfl_part_minute/2024.03.02/t/" TM-B) +(set Ptm (.db.parted.get "/tmp/rfl_part_minute/" 't)) +(set Rtm (select {c: (count UserID) from: Ptm by: {UserID: UserID m: (minute EventTime) phrase: phrase} desc: c take: 3})) +(at Rtm 'UserID) -- [1 3 2] +(at Rtm 'm) -- [1 3 2] +(at Rtm 'phrase) -- ['a 'c 'b] +(at Rtm 'c) -- [3 2 1] + +;; ────────────── LIKE over parted text columns ────────────── +;; Regression for predicates over parted STR/SYM columns: the boolean +;; result must be sized to total rows, not number of partitions. +(set LK-A (table [url title] (list ['alpha 'google 'docs] ["alpha page" "search google" "manual"]))) +(set LK-B (table [url title] (list ['googlemaps 'plain 'news] ["maps" "plain page" "google news"]))) +(.db.splayed.set "/tmp/rfl_part_like/2024.04.01/t/" LK-A) +(.db.splayed.set "/tmp/rfl_part_like/2024.04.02/t/" LK-B) +(set Plk (.db.parted.get "/tmp/rfl_part_like/" 't)) +(count (select {from: Plk where: (like url "*google*")})) -- 2 +(count (select {from: Plk where: (like title "*google*")})) -- 2 +(count (select {from: Plk where: (and (!= url 'plain) (like url "*google*"))})) -- 2 + ;; ────────────── .db.parted.mount over int partitions ────────────── ;; Mount needs at least one digit/dot subdir (dir_is_parted_root) and ;; uses the first partition's subdirs as table names. 
This double- @@ -149,4 +199,4 @@ (.db.parted.get "/tmp/rfl_part_date/") !- domain ;; ────────────── teardown ────────────── -(.sys.exec "rm -rf /tmp/rfl_part_date /tmp/rfl_part_int /tmp/rfl_part_sym /tmp/rfl_part_single /tmp/rfl_part_empty /tmp/rfl_part_missing /tmp/rfl_part_three") +(.sys.exec "rm -rf /tmp/rfl_part_date /tmp/rfl_part_int /tmp/rfl_part_sym /tmp/rfl_part_single /tmp/rfl_part_empty /tmp/rfl_part_missing /tmp/rfl_part_three /tmp/rfl_part_cd /tmp/rfl_part_cd_sym /tmp/rfl_part_minute /tmp/rfl_part_like") diff --git a/test/rfl/system/read_csv.rfl b/test/rfl/system/read_csv.rfl index 1293a90b..9e1c8e52 100644 --- a/test/rfl/system/read_csv.rfl +++ b/test/rfl/system/read_csv.rfl @@ -19,6 +19,45 @@ (.sys.exec "rm -f rf_test_syms.csv") -- 0 +;; Explicit names + types loads a no-header CSV and binds user names. +(.sys.exec "rm -f rf_test_named_no_header.csv") -- 0 +(.sys.exec "printf '1,alice\\n2,bob\\n' > rf_test_named_no_header.csv") -- 0 +(set _named (.csv.read [id name] [I64 STR] "rf_test_named_no_header.csv")) +(count _named) -- 2 +(sum (at _named 'id)) -- 3 +(first (at _named 'name)) -- "alice" +(.sys.exec "rm -f rf_test_named_no_header.csv") -- 0 + +;; CSV storage helpers keep parsing in .csv but return disk-backed tables. +(.sys.exec "rm -rf rf_test_csv_splayed rf_test_csv_parted rf_test_csv_store.csv") -- 0 +(.sys.exec "printf 'id,val\\n1,10\\n2,20\\n' > rf_test_csv_store.csv") -- 0 +(set _csv_sp (.csv.splayed "rf_test_csv_store.csv" "rf_test_csv_splayed/")) +(count _csv_sp) -- 2 +(sum (at _csv_sp 'id)) -- 3 +(count (.db.splayed.get "rf_test_csv_splayed/")) -- 2 +(set _csv_pt (.csv.parted "rf_test_csv_store.csv" "rf_test_csv_parted/" 'hits)) +(count _csv_pt) -- 2 +(sum (at _csv_pt 'id)) -- 3 +(sum (+ (at _csv_pt 'id) 1)) -- 5 +(.sys.exec "rm -rf rf_test_csv_splayed rf_test_csv_parted rf_test_csv_store.csv") -- 0 + +;; The stored variants support the explicit names + types no-header form too. 
+(.sys.exec "rm -rf rf_test_csv_parted2 rf_test_csv_no_header.csv") -- 0 +(.sys.exec "printf '1,10\\n2,20\\n' > rf_test_csv_no_header.csv") -- 0 +(set _csv_pn (.csv.parted [id val] [I64 I64] "rf_test_csv_no_header.csv" "rf_test_csv_parted2/" 't)) +(count _csv_pn) -- 2 +(sum (at _csv_pn 'val)) -- 30 +(.sys.exec "rm -rf rf_test_csv_parted2 rf_test_csv_no_header.csv") -- 0 + +;; .csv.parted accepts an optional rows-per-part argument before root/table. +(.sys.exec "rm -rf rf_test_csv_parted_rows rf_test_csv_rows.csv") -- 0 +(.sys.exec "printf '1,10\\n2,20\\n3,30\\n' > rf_test_csv_rows.csv") -- 0 +(set _csv_pr (.csv.parted [id val] [I64 I64] "rf_test_csv_rows.csv" 1 "rf_test_csv_parted_rows/" 't)) +(count _csv_pr) -- 3 +(count (.db.parted.get "rf_test_csv_parted_rows/" 't)) -- 3 +(.sys.exec "test $(find rf_test_csv_parted_rows -mindepth 1 -maxdepth 1 -type d | wc -l) -eq 3") -- 0 +(.sys.exec "rm -rf rf_test_csv_parted_rows rf_test_csv_rows.csv") -- 0 + ;; ── R6 regression: empty TSV/CSV fields → empty SYM (not null sentinel) ── ;; CSV format conflates "missing" and "empty"; the loader treats empty ;; SYM cells as the interned empty string so SQL-style `(!= col "")` diff --git a/test/rfl/system/reserved_namespace.rfl b/test/rfl/system/reserved_namespace.rfl index acceef7c..3f7577f5 100644 --- a/test/rfl/system/reserved_namespace.rfl +++ b/test/rfl/system/reserved_namespace.rfl @@ -26,8 +26,8 @@ ;; getenv, setenv, size, list (count .ipc) -- 3 ;; open, close, send -(count .csv) -- 2 -;; read, write +(count .csv) -- 4 +;; read, splayed, parted, write ;; .db.* — three-level reserved namespace for storage I/O. The ;; root resolves to a dict whose values are themselves dicts ;; ({splayed: …, parted: …}); each format dict carries set/get/ @@ -54,6 +54,8 @@ (nil? (resolve '.sys.info)) -- false (nil? (resolve '.os.getenv)) -- false (nil? (resolve '.csv.read)) -- false +(nil? (resolve '.csv.splayed)) -- false +(nil? 
(resolve '.csv.parted)) -- false ;; OS env round-trip: set a key, read it back. (.os.setenv "RAYF_NS_TEST" "hello") (.os.getenv "RAYF_NS_TEST") -- "hello" diff --git a/test/rfl/table/select.rfl b/test/rfl/table/select.rfl index e47b48a4..caac1950 100644 --- a/test/rfl/table/select.rfl +++ b/test/rfl/table/select.rfl @@ -27,6 +27,14 @@ (count (select {s: (sum size) from: trades by: sym})) -- 3 (sum (at (select {s: (sum size) from: trades by: sym}) 's)) -- 1240 (sum (at (select {s: (sum size) from: trades by: sym where: (== sym 'AAPL)}) 's)) -- 460 +(set grp-key-proj (select {sym: sym s: (sum size) c: (count sym) u: (count (distinct price)) from: trades by: sym desc: c})) +(count (key grp-key-proj)) -- 4 +(type (at grp-key-proj 'sym)) -- 'SYM +(count (select {sym: sym c: (count sym) from: trades by: sym desc: c take: 2})) -- 2 +(first (at (select {sym: sym c: (count sym) from: trades by: sym desc: c take: 2}) 'c)) -- 4 +(sum (at (select {sym: sym c: (count sym) from: trades by: sym desc: c take: 2}) 'c)) -- 7 +(count (select {sym: sym c: (count sym) from: trades where: (> size 50) by: sym desc: c take: 2})) -- 2 +(sum (at (select {sym: sym c: (count sym) from: trades where: (> size 50) by: sym desc: c take: 2}) 'c)) -- 7 ;; ── scalar aggregation (no `by:`) must honour `where:` — the lazy WHERE ;; rowsel is consumed by exec_reduction over the column vector. 
Two-step @@ -121,6 +129,12 @@ (count (select {s: (sum qty) from: trades-mk by: [sector side]})) -- 6 (sum (at (select {s: (sum qty) from: trades-mk by: [sector side]}) 's)) -- 1165 (sum (at (select {s: (sum qty) from: trades-mk by: [sector side] where: (and (== sector 'Tech) (== side 'Buy))}) 's)) -- 515 +(count (select {c: (count qty) from: trades-mk by: [sector side] take: 3})) -- 3 + +;; ── grouped string length aggregation streams from the source column +(set page-lens (table [bucket url] (list [1 1 2 2] (list "aa" "bbbb" "ccc" "")))) +(first (at (select {l: (avg (strlen url)) from: page-lens by: bucket where: (== bucket 1)} ) 'l)) -- 3.0 +(first (at (select {l: (sum (strlen url)) from: page-lens by: bucket where: (== bucket 2)} ) 'l)) -- 3 ;; ── three-key (count (select {s: (sum qty) from: trades-mk by: [sym sector side]})) -- 12 @@ -130,6 +144,13 @@ (count (select {s: (sum price) from: trades-mk by: qty})) -- 11 (sum (at (select {s: (sum price) from: trades-mk by: qty}) 's)) -- 15346.5 +;; ── dependent integer keys are equivalent to grouping by the base key +(set dep-keys (table [id] (list [1 1 2 3 3]))) +(count (select {c: (count id) from: dep-keys by: {id: id m1: (- id 1) p2: (+ id 2)}})) -- 3 +(sum (at (select {c: (count id) from: dep-keys by: {id: id m1: (- id 1) p2: (+ id 2)}}) 'c)) -- 5 +(sum (at (select {c: (count id) from: dep-keys by: {id: id m1: (- id 1) p2: (+ id 2)}}) 'm1)) -- 3 +(sum (at (select {c: (count id) from: dep-keys by: {id: id m1: (- id 1) p2: (+ id 2)}}) 'p2)) -- 12 + ;; ── filter-then-group preserves filtered totals (count (select {s: (sum qty) from: trades-mk by: sym where: (== side 'Buy)})) -- 6 (sum (at (select {s: (sum qty) from: trades-mk by: sym where: (== side 'Buy)}) 's)) -- 830 diff --git a/test/test_csv.c b/test/test_csv.c index e97bcdb3..59d24873 100644 --- a/test/test_csv.c +++ b/test/test_csv.c @@ -280,9 +280,8 @@ static test_result_t test_csv_null_sym(void) { /* CSV format conflates "empty field" and "missing field" — 
both * appear as a zero-length cell. The Rayforce loader interns empty * SYM cells as the empty SYM (not the null sentinel) so SQL-style - * `(!= col "")` filters work the way users expect. See R6 in - * ClickBench/rayforce/REMAINING_FIXES.md. RAY_STR columns and - * non-string types preserve the null distinction. */ + * `(!= col "")` filters work the way users expect. RAY_STR columns + * and non-string types preserve the null distinction. */ ray_heap_init(); (void)ray_sym_init(); @@ -1296,6 +1295,30 @@ static test_result_t test_csv_explicit_u8_schema_serial(void) { PASS(); } +static test_result_t test_csv_infer_high_cardinality_str(void) { + ray_heap_init(); + (void)ray_sym_init(); + + FILE* f = fopen(TMP_CSV, "w"); + fprintf(f, "payload\n"); + for (int i = 0; i < 100; i++) + fprintf(f, "unique_payload_%03d\n", i); + fclose(f); + + ray_t* loaded = ray_read_csv(TMP_CSV); + TEST_ASSERT_FALSE(RAY_IS_ERR(loaded)); + ray_t* col = ray_table_get_col_idx(loaded, 0); + TEST_ASSERT_NOT_NULL(col); + TEST_ASSERT_EQ_I(col->type, RAY_STR); + TEST_ASSERT_EQ_I(ray_table_nrows(loaded), 100); + + ray_release(loaded); + unlink(TMP_CSV); + ray_sym_destroy(); + ray_heap_destroy(); + PASS(); +} + const test_entry_t csv_entries[] = { { "csv/roundtrip_i64", test_csv_roundtrip_i64, NULL, NULL }, { "csv/roundtrip_guid", test_csv_guid_roundtrip, NULL, NULL }, @@ -1332,6 +1355,7 @@ const test_entry_t csv_entries[] = { { "csv/header_needs_quoting", test_csv_header_needs_quoting, NULL, NULL }, { "csv/parallel_parse", test_csv_parallel_parse, NULL, NULL }, { "csv/sym_narrowing", test_csv_sym_narrowing, NULL, NULL }, + { "csv/infer_high_cardinality_str", test_csv_infer_high_cardinality_str, NULL, NULL }, /* Narrow-int explicit schema (regression for missing parse_types map * entries that routed U8/I16/I32 to STR and corrupted the heap). 
*/ { "csv/explicit_u8_schema", test_csv_explicit_u8_schema, NULL, NULL }, diff --git a/test/test_fused_group.c b/test/test_fused_group.c index e5e29095..f83902db 100644 --- a/test/test_fused_group.c +++ b/test/test_fused_group.c @@ -33,6 +33,7 @@ #include "test.h" #include #include "mem/heap.h" +#include "ops/internal.h" #include "ops/ops.h" #include "ops/fused_group.h" #include "table/sym.h" @@ -236,6 +237,65 @@ static test_result_t test_ne_const_out_of_range_u8(void) { PASS(); } +static test_result_t test_count1_i16_direct_counts_negative_keys(void) { + ray_heap_init(); + (void)ray_sym_init(); + + int16_t kv[] = { -2, -1, -2, 0, 1, 1, 1 }; + ray_t* col = ray_vec_new(RAY_I16, 7); + TEST_ASSERT_NOT_NULL(col); + col->len = 7; + memcpy(ray_data(col), kv, sizeof(kv)); + + int64_t k_sym = ray_sym_intern("k16", 3); + ray_t* tbl = ray_table_new(1); + tbl = ray_table_add_col(tbl, k_sym, col); + ray_release(col); + + ray_graph_t* g = ray_graph_new(tbl); + TEST_ASSERT_NOT_NULL(g); + ray_op_t* scan_k = ray_scan(g, "k16"); + ray_op_t* scan_pred = ray_scan(g, "k16"); + ray_op_t* min_key = ray_const_i64(g, -2); + ray_op_t* pred = ray_binop(g, OP_GE, scan_pred, min_key); + uint16_t agg_ops[] = { OP_COUNT }; + ray_op_t* agg_ins[] = { scan_k }; + ray_op_t* keys[] = { scan_k }; + ray_op_t* fused = ray_filtered_group(g, pred, keys, 1, agg_ops, agg_ins, 1); + TEST_ASSERT_NOT_NULL(fused); + + ray_t* res = ray_execute(g, fused); + TEST_ASSERT_FALSE(RAY_IS_ERR(res)); + TEST_ASSERT_EQ_I(ray_table_nrows(res), 4); + + int64_t cnt_sym = ray_sym_intern("count", 5); + ray_t* key_col = ray_table_get_col(res, k_sym); + ray_t* cnt_col = ray_table_get_col(res, cnt_sym); + TEST_ASSERT_NOT_NULL(key_col); + TEST_ASSERT_NOT_NULL(cnt_col); + + int64_t got_m2 = -1, got_m1 = -1, got_0 = -1, got_1 = -1; + int16_t* ks = (int16_t*)ray_data(key_col); + int64_t* cs = (int64_t*)ray_data(cnt_col); + for (int64_t i = 0; i < ray_table_nrows(res); i++) { + if (ks[i] == -2) got_m2 = cs[i]; + else if (ks[i] == -1) 
got_m1 = cs[i]; + else if (ks[i] == 0) got_0 = cs[i]; + else if (ks[i] == 1) got_1 = cs[i]; + } + TEST_ASSERT_EQ_I(got_m2, 2); + TEST_ASSERT_EQ_I(got_m1, 1); + TEST_ASSERT_EQ_I(got_0, 1); + TEST_ASSERT_EQ_I(got_1, 3); + + ray_release(res); + ray_graph_free(g); + ray_release(tbl); + ray_sym_destroy(); + ray_heap_destroy(); + PASS(); +} + /* Finding #4 (signed narrow agg read) regression: SUM of an I16 * column with negative values must produce the correct signed sum, * not a sum where -1 is read as 65535. */ @@ -704,6 +764,219 @@ static test_result_t test_multi_agg_multi_key(void) { PASS(); } +static test_result_t test_multi_key_in_pred(void) { + ray_heap_init(); + (void)ray_sym_init(); + + ray_t* g1c = ray_vec_new(RAY_I32, 8); g1c->len = 8; + ray_t* g2c = ray_vec_new(RAY_I32, 8); g2c->len = 8; + ray_t* vc = ray_vec_new(RAY_I64, 8); vc->len = 8; + ray_t* sc = ray_vec_new(RAY_I64, 2); sc->len = 2; + int32_t g1[] = {1, 1, 2, 2, 1, 1, 2, 2}; + int32_t g2[] = {1, 2, 1, 2, 1, 2, 1, 2}; + int64_t v[] = {10, 20, 30, 40, 50, 60, 70, 80}; + int64_t set_vals[] = {20, 70}; + memcpy(ray_data(g1c), g1, sizeof(g1)); + memcpy(ray_data(g2c), g2, sizeof(g2)); + memcpy(ray_data(vc), v, sizeof(v)); + memcpy(ray_data(sc), set_vals, sizeof(set_vals)); + + int64_t s_g1 = ray_sym_intern("g1", 2); + int64_t s_g2 = ray_sym_intern("g2", 2); + int64_t s_v = ray_sym_intern("v", 1); + ray_t* tbl = ray_table_new(3); + tbl = ray_table_add_col(tbl, s_g1, g1c); ray_release(g1c); + tbl = ray_table_add_col(tbl, s_g2, g2c); ray_release(g2c); + tbl = ray_table_add_col(tbl, s_v, vc); ray_release(vc); + + ray_graph_t* g = ray_graph_new(tbl); + ray_op_t* scan_g1 = ray_scan(g, "g1"); + ray_op_t* scan_g2 = ray_scan(g, "g2"); + ray_op_t* scan_v = ray_scan(g, "v"); + ray_op_t* scan_vp = ray_scan(g, "v"); + ray_op_t* set_op = ray_const_vec(g, sc); + ray_op_t* pred = ray_binop(g, OP_IN, scan_vp, set_op); + ray_release(sc); + + uint16_t agg_ops[] = { OP_COUNT }; + ray_op_t* agg_ins[] = { scan_v }; + ray_op_t* 
keys[] = { scan_g1, scan_g2 }; + ray_op_t* fused = ray_filtered_group(g, pred, keys, 2, agg_ops, agg_ins, 1); + TEST_ASSERT_NOT_NULL(fused); + + ray_t* res = ray_execute(g, fused); + TEST_ASSERT_FALSE(RAY_IS_ERR(res)); + TEST_ASSERT_EQ_I(ray_table_nrows(res), 2); + + int64_t cnt_sym = ray_sym_intern("count", 5); + ray_t* k1_col = ray_table_get_col(res, s_g1); + ray_t* k2_col = ray_table_get_col(res, s_g2); + ray_t* c_col = ray_table_get_col(res, cnt_sym); + TEST_ASSERT_NOT_NULL(k1_col); + TEST_ASSERT_NOT_NULL(k2_col); + TEST_ASSERT_NOT_NULL(c_col); + int64_t got_12 = 0, got_21 = 0; + for (int64_t i = 0; i < ray_table_nrows(res); i++) { + int32_t a = ((int32_t*)ray_data(k1_col))[i]; + int32_t b = ((int32_t*)ray_data(k2_col))[i]; + int64_t c = ((int64_t*)ray_data(c_col))[i]; + if (a == 1 && b == 2) got_12 = c; + if (a == 2 && b == 1) got_21 = c; + } + TEST_ASSERT_EQ_I(got_12, 1); + TEST_ASSERT_EQ_I(got_21, 1); + + ray_release(res); ray_graph_free(g); ray_release(tbl); + ray_sym_destroy(); ray_heap_destroy(); + PASS(); +} + +static test_result_t test_wide_multi_key_in_pred(void) { + ray_heap_init(); + (void)ray_sym_init(); + + ray_t* k1c = ray_vec_new(RAY_I64, 8); k1c->len = 8; + ray_t* k2c = ray_vec_new(RAY_I32, 8); k2c->len = 8; + ray_t* vc = ray_vec_new(RAY_I64, 8); vc->len = 8; + ray_t* sc = ray_vec_new(RAY_I64, 2); sc->len = 2; + int64_t k1[] = {10, 10, 20, 20, 10, 10, 20, 20}; + int32_t k2[] = {1, 2, 1, 2, 1, 2, 1, 2}; + int64_t v[] = {10, 20, 30, 40, 50, 60, 70, 80}; + int64_t set_vals[] = {20, 70}; + memcpy(ray_data(k1c), k1, sizeof(k1)); + memcpy(ray_data(k2c), k2, sizeof(k2)); + memcpy(ray_data(vc), v, sizeof(v)); + memcpy(ray_data(sc), set_vals, sizeof(set_vals)); + + int64_t s_k1 = ray_sym_intern("k1", 2); + int64_t s_k2 = ray_sym_intern("k2", 2); + int64_t s_v = ray_sym_intern("v", 1); + ray_t* tbl = ray_table_new(3); + tbl = ray_table_add_col(tbl, s_k1, k1c); ray_release(k1c); + tbl = ray_table_add_col(tbl, s_k2, k2c); ray_release(k2c); + tbl = 
ray_table_add_col(tbl, s_v, vc); ray_release(vc); + + ray_graph_t* g = ray_graph_new(tbl); + ray_op_t* scan_k1 = ray_scan(g, "k1"); + ray_op_t* scan_k2 = ray_scan(g, "k2"); + ray_op_t* scan_v = ray_scan(g, "v"); + ray_op_t* scan_vp = ray_scan(g, "v"); + ray_op_t* set_op = ray_const_vec(g, sc); + ray_op_t* pred = ray_binop(g, OP_IN, scan_vp, set_op); + ray_release(sc); + + uint16_t agg_ops[] = { OP_COUNT }; + ray_op_t* agg_ins[] = { scan_v }; + ray_op_t* keys[] = { scan_k1, scan_k2 }; + ray_op_t* fused = ray_filtered_group(g, pred, keys, 2, agg_ops, agg_ins, 1); + TEST_ASSERT_NOT_NULL(fused); + + ray_t* res = ray_execute(g, fused); + TEST_ASSERT_FALSE(RAY_IS_ERR(res)); + TEST_ASSERT_EQ_I(ray_table_nrows(res), 2); + + int64_t cnt_sym = ray_sym_intern("count", 5); + ray_t* out_k1 = ray_table_get_col(res, s_k1); + ray_t* out_k2 = ray_table_get_col(res, s_k2); + ray_t* c_col = ray_table_get_col(res, cnt_sym); + TEST_ASSERT_NOT_NULL(out_k1); + TEST_ASSERT_NOT_NULL(out_k2); + TEST_ASSERT_NOT_NULL(c_col); + int64_t got_102 = 0, got_201 = 0; + for (int64_t i = 0; i < ray_table_nrows(res); i++) { + int64_t a = ((int64_t*)ray_data(out_k1))[i]; + int32_t b = ((int32_t*)ray_data(out_k2))[i]; + int64_t c = ((int64_t*)ray_data(c_col))[i]; + if (a == 10 && b == 2) got_102 = c; + if (a == 20 && b == 1) got_201 = c; + } + TEST_ASSERT_EQ_I(got_102, 1); + TEST_ASSERT_EQ_I(got_201, 1); + + ray_release(res); ray_graph_free(g); ray_release(tbl); + ray_sym_destroy(); ray_heap_destroy(); + PASS(); +} + +static test_result_t test_wide_multi_key_top_count_emit_filter(void) { + ray_heap_init(); + (void)ray_sym_init(); + + enum { N = 15 }; + ray_t* k1c = ray_vec_new(RAY_I64, N); k1c->len = N; + ray_t* k2c = ray_vec_new(RAY_I32, N); k2c->len = N; + ray_t* vc = ray_vec_new(RAY_I64, N); vc->len = N; + int64_t k1[] = { + 10,10,10,10,10, 20,20,20,20, 30,30,30, 40,40, 50 + }; + int32_t k2[] = { + 1,1,1,1,1, 2,2,2,2, 3,3,3, 4,4, 5 + }; + int64_t v[] = { + 0,1,2,3,4, 5,6,7,8, 9,10,11, 12,13, 14 + }; + 
memcpy(ray_data(k1c), k1, sizeof(k1)); + memcpy(ray_data(k2c), k2, sizeof(k2)); + memcpy(ray_data(vc), v, sizeof(v)); + + int64_t s_k1 = ray_sym_intern("k1", 2); + int64_t s_k2 = ray_sym_intern("k2", 2); + int64_t s_v = ray_sym_intern("v", 1); + ray_t* tbl = ray_table_new(3); + tbl = ray_table_add_col(tbl, s_k1, k1c); ray_release(k1c); + tbl = ray_table_add_col(tbl, s_k2, k2c); ray_release(k2c); + tbl = ray_table_add_col(tbl, s_v, vc); ray_release(vc); + + ray_graph_t* g = ray_graph_new(tbl); + ray_op_t* scan_k1 = ray_scan(g, "k1"); + ray_op_t* scan_k2 = ray_scan(g, "k2"); + ray_op_t* scan_v = ray_scan(g, "v"); + ray_op_t* scan_vp = ray_scan(g, "v"); + ray_op_t* zero = ray_const_i64(g, 0); + ray_op_t* pred = ray_binop(g, OP_GE, scan_vp, zero); + + uint16_t agg_ops[] = { OP_COUNT }; + ray_op_t* agg_ins[] = { scan_v }; + ray_op_t* keys[] = { scan_k1, scan_k2 }; + ray_op_t* fused = ray_filtered_group(g, pred, keys, 2, agg_ops, agg_ins, 1); + TEST_ASSERT_NOT_NULL(fused); + + ray_group_emit_filter_t prev = ray_group_emit_filter_get(); + ray_group_emit_filter_t filter = {0}; + filter.enabled = 1; + filter.agg_index = 0; + filter.top_count_take = 2; + ray_group_emit_filter_set(filter); + ray_t* res = ray_execute(g, fused); + ray_group_emit_filter_set(prev); + + TEST_ASSERT_FALSE(RAY_IS_ERR(res)); + TEST_ASSERT_EQ_I(ray_table_nrows(res), 2); + + int64_t cnt_sym = ray_sym_intern("count", 5); + ray_t* out_k1 = ray_table_get_col(res, s_k1); + ray_t* out_k2 = ray_table_get_col(res, s_k2); + ray_t* c_col = ray_table_get_col(res, cnt_sym); + TEST_ASSERT_NOT_NULL(out_k1); + TEST_ASSERT_NOT_NULL(out_k2); + TEST_ASSERT_NOT_NULL(c_col); + + int64_t got_101 = 0, got_202 = 0; + for (int64_t i = 0; i < ray_table_nrows(res); i++) { + int64_t a = ((int64_t*)ray_data(out_k1))[i]; + int32_t b = ((int32_t*)ray_data(out_k2))[i]; + int64_t c = ((int64_t*)ray_data(c_col))[i]; + if (a == 10 && b == 1) got_101 = c; + if (a == 20 && b == 2) got_202 = c; + } + TEST_ASSERT_EQ_I(got_101, 5); + 
TEST_ASSERT_EQ_I(got_202, 4); + + ray_release(res); ray_graph_free(g); ray_release(tbl); + ray_sym_destroy(); ray_heap_destroy(); + PASS(); +} + /* Chunk 2 + 3 + 4: wide multi-key path (mk_shard_grow wide branch, * mk_compose_key2, mk_hash_lo_hi, mk_state_merge AVG). * @@ -1113,6 +1386,7 @@ const test_entry_t fused_group_entries[] = { { "fused_group/eq_no_match", test_eq_no_match, NULL, NULL }, { "fused_group/eq_const_out_of_range_u8", test_eq_const_out_of_range_u8, NULL, NULL }, { "fused_group/ne_const_out_of_range_u8", test_ne_const_out_of_range_u8, NULL, NULL }, + { "fused_group/count1_i16_direct_negative", test_count1_i16_direct_counts_negative_keys, NULL, NULL }, { "fused_group/sum_negative_i16", test_sum_negative_i16, NULL, NULL }, { "fused_group/fallback_filter_honored", test_fallback_filter_honored, NULL, NULL }, { "fused_group/count1_rejects_nullable_key", test_count1_rejects_nullable_key, NULL, NULL }, @@ -1125,6 +1399,9 @@ const test_entry_t fused_group_entries[] = { { "fused_group/fold_i32_eq_above", test_fold_i32_eq_above, NULL, NULL }, { "fused_group/fold_i32_ne_above", test_fold_i32_ne_above, NULL, NULL }, { "fused_group/multi_agg_multi_key", test_multi_agg_multi_key, NULL, NULL }, + { "fused_group/multi_key_in_pred", test_multi_key_in_pred, NULL, NULL }, + { "fused_group/wide_multi_key_in_pred", test_wide_multi_key_in_pred, NULL, NULL }, + { "fused_group/wide_multi_key_top_count", test_wide_multi_key_top_count_emit_filter, NULL, NULL }, { "fused_group/wide_multi_key", test_wide_multi_key, NULL, NULL }, { "fused_group/count1_parallel_combine", test_count1_parallel_combine, NULL, NULL }, { "fused_group/count1_shard_grow", test_count1_shard_grow, NULL, NULL }, diff --git a/test/test_fused_topk.c b/test/test_fused_topk.c index 523b3c54..9e4e853f 100644 --- a/test/test_fused_topk.c +++ b/test/test_fused_topk.c @@ -428,9 +428,10 @@ static test_result_t test_topk_gate_k_ge_nrows(void) { PASS(); } -static test_result_t 
test_topk_gate_unsupported_out_col_type(void) { - /* Build a table with an F64 column — F64 is not in the supported - * output set, so the out-col gate must reject. */ +static test_result_t test_topk_gathers_f64_out_col_type(void) { + /* Output columns use the shared gather helper, so F64 projections + * are safe even though the sort key still uses the fixed-width + * comparator gate. */ int64_t N = 50; ray_t* fc = ray_vec_new(RAY_F64, N); fc->len = N; double* fd = (double*)ray_data(fc); @@ -447,12 +448,21 @@ static test_result_t test_topk_gate_unsupported_out_col_type(void) { ray_t* where_expr = ray_parse("(>= sel 0)"); int64_t sort_keys[1] = { s_sel }; uint8_t sort_descs[1] = { 0 }; - int64_t out_syms[2] = { s_sel, s_f }; /* f is F64 → unsupported */ + int64_t out_syms[2] = { s_sel, s_f }; ray_t* res = ray_fused_topk_select(tbl, where_expr, sort_keys, sort_descs, 1, 5, out_syms, NULL, 2); - TEST_ASSERT_NULL(res); - + TEST_ASSERT_NOT_NULL(res); + TEST_ASSERT_EQ_I(res->type, RAY_TABLE); + TEST_ASSERT_EQ_I(ray_table_nrows(res), 5); + ray_t* out_f = ray_table_get_col(res, s_f); + TEST_ASSERT_NOT_NULL(out_f); + TEST_ASSERT_EQ_I(out_f->type, RAY_F64); + double* out_fd = (double*)ray_data(out_f); + TEST_ASSERT_FMT(out_fd[0] == 0.0, "expected first F64 output to be 0"); + TEST_ASSERT_FMT(out_fd[4] == 4.0, "expected fifth F64 output to be 4"); + + ray_release(res); ray_release(where_expr); ray_release(tbl); PASS(); } @@ -571,6 +581,36 @@ static test_result_t test_topk_basic_i64_asc(void) { PASS(); } +static test_result_t test_topk_in_pred(void) { + int64_t N = 100; + ray_t* tbl = make_i64_table(N); + ray_t* where_expr = ray_parse("(in g [1 3])"); + int64_t s_v = ray_sym_intern("v", 1); + int64_t s_g = ray_sym_intern("g", 1); + int64_t sort_keys[1] = { s_v }; + uint8_t sort_descs[1] = { 0 }; + int64_t out_syms[2] = { s_v, s_g }; + ray_t* res = ray_fused_topk_select(tbl, where_expr, + sort_keys, sort_descs, 1, 4, + out_syms, NULL, 2); + TEST_ASSERT_NOT_NULL(res); + 
TEST_ASSERT_FALSE(RAY_IS_ERR(res)); + TEST_ASSERT_EQ_I(ray_table_nrows(res), 4); + ray_t* v_col = ray_table_get_col(res, s_v); + ray_t* g_col = ray_table_get_col(res, s_g); + TEST_ASSERT_NOT_NULL(v_col); + TEST_ASSERT_NOT_NULL(g_col); + int64_t expect_v[] = {1, 3, 5, 7}; + int64_t expect_g[] = {1, 3, 1, 3}; + for (int64_t i = 0; i < 4; i++) { + TEST_ASSERT_EQ_I(((int64_t*)ray_data(v_col))[i], expect_v[i]); + TEST_ASSERT_EQ_I(((int64_t*)ray_data(g_col))[i], expect_g[i]); + } + + ray_release(res); ray_release(where_expr); ray_release(tbl); + PASS(); +} + /* Aliased output columns — alias array non-NULL. Exercises line 414 * `int64_t alias = out_alias_syms ? out_alias_syms[c] : cs;`. */ static test_result_t test_topk_aliased_out(void) { @@ -703,13 +743,14 @@ const test_entry_t fused_topk_entries[] = { { "fused_topk/gate_k_too_large", test_topk_gate_k_too_large, topk_setup, topk_teardown }, { "fused_topk/gate_zero_sort_keys", test_topk_gate_zero_sort_keys, topk_setup, topk_teardown }, { "fused_topk/gate_k_ge_nrows", test_topk_gate_k_ge_nrows, topk_setup, topk_teardown }, - { "fused_topk/gate_unsupported_out_col", test_topk_gate_unsupported_out_col_type, topk_setup, topk_teardown }, + { "fused_topk/gather_f64_out_col", test_topk_gathers_f64_out_col_type, topk_setup, topk_teardown }, { "fused_topk/gate_unsupported_sort_key", test_topk_gate_unsupported_sort_key_type, topk_setup, topk_teardown }, { "fused_topk/gate_n_out_zero", test_topk_gate_n_out_zero, topk_setup, topk_teardown }, { "fused_topk/gate_too_many_sort_keys", test_topk_gate_too_many_sort_keys, topk_setup, topk_teardown }, { "fused_topk/gate_negative_k", test_topk_gate_negative_k, topk_setup, topk_teardown }, /* Happy paths */ { "fused_topk/basic_i64_asc", test_topk_basic_i64_asc, topk_setup, topk_teardown }, + { "fused_topk/in_pred", test_topk_in_pred, topk_setup, topk_teardown }, { "fused_topk/aliased_out", test_topk_aliased_out, topk_setup, topk_teardown }, { "fused_topk/propagates_nullmap", 
test_topk_propagates_nullmap, topk_setup, topk_teardown }, { "fused_topk/three_keys", test_topk_three_keys, topk_setup, topk_teardown }, diff --git a/test/test_group_extra.c b/test/test_group_extra.c index 729fb1d9..8d512596 100644 --- a/test/test_group_extra.c +++ b/test/test_group_extra.c @@ -1040,6 +1040,223 @@ static test_result_t test_count_distinct_per_group_parallel(void) { PASS(); } +static test_result_t test_i16_group_top_count_emit_filter(void) { + ray_heap_init(); + (void)ray_sym_init(); + + int16_t keys_data[] = { + 1,1,1,1,1, + 2,2,2,2, + 3,3,3, + 4,4, + 5 + }; + enum { R = (int)(sizeof(keys_data) / sizeof(keys_data[0])) }; + + ray_t* keys_vec = ray_vec_new(RAY_I16, R); + TEST_ASSERT_NOT_NULL(keys_vec); + keys_vec->len = R; + memcpy(ray_data(keys_vec), keys_data, sizeof(keys_data)); + + int64_t key_sym = ray_sym_intern("k", 1); + ray_t* tbl = ray_table_new(1); + tbl = ray_table_add_col(tbl, key_sym, keys_vec); + ray_release(keys_vec); + + ray_graph_t* g = ray_graph_new(tbl); + TEST_ASSERT_NOT_NULL(g); + ray_op_t* scan_key = ray_scan(g, "k"); + uint16_t ops[] = { OP_COUNT }; + ray_op_t* ins[] = { scan_key }; + ray_op_t* keys[] = { scan_key }; + ray_op_t* grp = ray_group(g, keys, 1, ops, ins, 1); + TEST_ASSERT_NOT_NULL(grp); + + ray_group_emit_filter_t prev = ray_group_emit_filter_get(); + ray_group_emit_filter_t filter = {0}; + filter.enabled = 1; + filter.agg_index = 0; + filter.top_count_take = 2; + ray_group_emit_filter_set(filter); + ray_t* res = ray_execute(g, grp); + ray_group_emit_filter_set(prev); + + TEST_ASSERT_FALSE(RAY_IS_ERR(res)); + TEST_ASSERT_EQ_I(ray_table_nrows(res), 2); + + ray_t* out_key = ray_table_get_col(res, key_sym); + ray_t* out_cnt = ray_table_get_col_idx(res, 1); + TEST_ASSERT_NOT_NULL(out_key); + TEST_ASSERT_NOT_NULL(out_cnt); + + int got_1 = 0, got_2 = 0; + for (int64_t i = 0; i < ray_table_nrows(res); i++) { + int16_t k = ((int16_t*)ray_data(out_key))[i]; + int64_t c = ((int64_t*)ray_data(out_cnt))[i]; + if (k == 1 && c == 
5) got_1 = 1; + if (k == 2 && c == 4) got_2 = 1; + } + TEST_ASSERT_TRUE(got_1 && got_2); + + ray_release(res); + ray_graph_free(g); + ray_release(tbl); + ray_sym_destroy(); + ray_heap_destroy(); + PASS(); +} + +static test_result_t test_sym_group_top_count_emit_filter(void) { + ray_heap_init(); + (void)ray_sym_init(); + + int64_t sym_a = ray_sym_intern("alpha", 5); + int64_t sym_b = ray_sym_intern("beta", 4); + int64_t sym_c = ray_sym_intern("gamma", 5); + int64_t sym_d = ray_sym_intern("delta", 5); + uint32_t keys_data[] = { + (uint32_t)sym_a, (uint32_t)sym_a, (uint32_t)sym_a, (uint32_t)sym_a, + (uint32_t)sym_b, (uint32_t)sym_b, (uint32_t)sym_b, + (uint32_t)sym_c, (uint32_t)sym_c, + (uint32_t)sym_d + }; + enum { R = (int)(sizeof(keys_data) / sizeof(keys_data[0])) }; + + ray_t* keys_vec = ray_sym_vec_new(RAY_SYM_W32, R); + TEST_ASSERT_NOT_NULL(keys_vec); + keys_vec->len = R; + memcpy(ray_data(keys_vec), keys_data, sizeof(keys_data)); + + int64_t key_sym = ray_sym_intern("s", 1); + ray_t* tbl = ray_table_new(1); + tbl = ray_table_add_col(tbl, key_sym, keys_vec); + ray_release(keys_vec); + + ray_graph_t* g = ray_graph_new(tbl); + TEST_ASSERT_NOT_NULL(g); + ray_op_t* scan_key = ray_scan(g, "s"); + uint16_t ops[] = { OP_COUNT }; + ray_op_t* ins[] = { scan_key }; + ray_op_t* keys[] = { scan_key }; + ray_op_t* grp = ray_group(g, keys, 1, ops, ins, 1); + TEST_ASSERT_NOT_NULL(grp); + + ray_group_emit_filter_t prev = ray_group_emit_filter_get(); + ray_group_emit_filter_t filter = {0}; + filter.enabled = 1; + filter.agg_index = 0; + filter.top_count_take = 2; + ray_group_emit_filter_set(filter); + ray_t* res = ray_execute(g, grp); + ray_group_emit_filter_set(prev); + + TEST_ASSERT_FALSE(RAY_IS_ERR(res)); + TEST_ASSERT_EQ_I(ray_table_nrows(res), 2); + + ray_t* out_key = ray_table_get_col(res, key_sym); + ray_t* out_cnt = ray_table_get_col_idx(res, 1); + TEST_ASSERT_NOT_NULL(out_key); + TEST_ASSERT_NOT_NULL(out_cnt); + + int got_a = 0, got_b = 0; + for (int64_t i = 0; i < 
ray_table_nrows(res); i++) { + int64_t k = ray_read_sym(ray_data(out_key), i, out_key->type, out_key->attrs); + int64_t c = ((int64_t*)ray_data(out_cnt))[i]; + if (k == sym_a && c == 4) got_a = 1; + if (k == sym_b && c == 3) got_b = 1; + } + TEST_ASSERT_TRUE(got_a && got_b); + + ray_release(res); + ray_graph_free(g); + ray_release(tbl); + ray_sym_destroy(); + ray_heap_destroy(); + PASS(); +} + +static test_result_t test_five_key_group_top_count_emit_filter(void) { + ray_heap_init(); + (void)ray_sym_init(); + + int16_t rows[][5] = { + {1, 10, 20, 30, 40}, {1, 10, 20, 30, 40}, + {1, 10, 20, 30, 40}, {1, 10, 20, 30, 40}, + {2, 11, 21, 31, 41}, {2, 11, 21, 31, 41}, + {2, 11, 21, 31, 41}, + {3, 12, 22, 32, 42}, {3, 12, 22, 32, 42}, + {4, 13, 23, 33, 43} + }; + enum { R = (int)(sizeof(rows) / sizeof(rows[0])) }; + const char* names[5] = { "k0", "k1", "k2", "k3", "k4" }; + int64_t syms[5]; + + ray_t* tbl = ray_table_new(5); + TEST_ASSERT_NOT_NULL(tbl); + for (int col = 0; col < 5; col++) { + ray_t* vec = ray_vec_new(RAY_I16, R); + TEST_ASSERT_NOT_NULL(vec); + vec->len = R; + int16_t* data = (int16_t*)ray_data(vec); + for (int row = 0; row < R; row++) + data[row] = rows[row][col]; + syms[col] = ray_sym_intern(names[col], 2); + tbl = ray_table_add_col(tbl, syms[col], vec); + ray_release(vec); + } + + ray_graph_t* g = ray_graph_new(tbl); + TEST_ASSERT_NOT_NULL(g); + ray_op_t* scans[5]; + for (int i = 0; i < 5; i++) { + scans[i] = ray_scan(g, names[i]); + TEST_ASSERT_NOT_NULL(scans[i]); + } + uint16_t ops[] = { OP_COUNT }; + ray_op_t* ins[] = { scans[0] }; + ray_op_t* grp = ray_group(g, scans, 5, ops, ins, 1); + TEST_ASSERT_NOT_NULL(grp); + + ray_group_emit_filter_t prev = ray_group_emit_filter_get(); + ray_group_emit_filter_t filter = {0}; + filter.enabled = 1; + filter.agg_index = 0; + filter.top_count_take = 2; + ray_group_emit_filter_set(filter); + ray_t* res = ray_execute(g, grp); + ray_group_emit_filter_set(prev); + + TEST_ASSERT_FALSE(RAY_IS_ERR(res)); + 
TEST_ASSERT_EQ_I(ray_table_nrows(res), 2); + + ray_t* out_k0 = ray_table_get_col(res, syms[0]); + ray_t* out_k1 = ray_table_get_col(res, syms[1]); + ray_t* out_k4 = ray_table_get_col(res, syms[4]); + ray_t* out_cnt = ray_table_get_col_idx(res, 5); + TEST_ASSERT_NOT_NULL(out_k0); + TEST_ASSERT_NOT_NULL(out_k1); + TEST_ASSERT_NOT_NULL(out_k4); + TEST_ASSERT_NOT_NULL(out_cnt); + + int got_1 = 0, got_2 = 0; + for (int64_t i = 0; i < ray_table_nrows(res); i++) { + int16_t k0 = ((int16_t*)ray_data(out_k0))[i]; + int16_t k1 = ((int16_t*)ray_data(out_k1))[i]; + int16_t k4 = ((int16_t*)ray_data(out_k4))[i]; + int64_t c = ((int64_t*)ray_data(out_cnt))[i]; + if (k0 == 1 && k1 == 10 && k4 == 40 && c == 4) got_1 = 1; + if (k0 == 2 && k1 == 11 && k4 == 41 && c == 3) got_2 = 1; + } + TEST_ASSERT_TRUE(got_1 && got_2); + + ray_release(res); + ray_graph_free(g); + ray_release(tbl); + ray_sym_destroy(); + ray_heap_destroy(); + PASS(); +} + /* -------------------------------------------------------------------------- * Test registry * -------------------------------------------------------------------------- */ @@ -1059,5 +1276,8 @@ const test_entry_t group_extra_entries[] = { { "group_extra/reduction_var_i64_parallel", test_reduction_var_i64_parallel, NULL, NULL }, { "group_extra/count_distinct_parallel_types", test_count_distinct_parallel_types, NULL, NULL }, { "group_extra/count_distinct_per_group_parallel", test_count_distinct_per_group_parallel, NULL, NULL }, + { "group_extra/i16_group_top_count_emit_filter", test_i16_group_top_count_emit_filter, NULL, NULL }, + { "group_extra/sym_group_top_count_emit_filter", test_sym_group_top_count_emit_filter, NULL, NULL }, + { "group_extra/five_key_group_top_count_emit_filter", test_five_key_group_top_count_emit_filter, NULL, NULL }, { NULL, NULL, NULL, NULL }, }; diff --git a/test/test_lang.c b/test/test_lang.c index 7b7a725c..7440a76c 100644 --- a/test/test_lang.c +++ b/test/test_lang.c @@ -915,6 +915,37 @@ static test_result_t 
test_eval_count_table(void) { PASS(); } +/* ---- Test: count filtered table ---- */ +static test_result_t test_eval_count_select_where(void) { + ray_t* result = ray_eval_str( + "(do (set t (table ['a 'b] (list [1 2 3 4] [10 20 30 40]))) " + "(count (select {from: t where: (> a 2)})))"); + TEST_ASSERT_NOT_NULL(result); + TEST_ASSERT_FALSE(RAY_IS_ERR(result)); + TEST_ASSERT_EQ_I(result->type, -RAY_I64); + TEST_ASSERT_EQ_I(result->i64, 2); + ray_release(result); + + result = ray_eval_str( + "(do (set t (table ['a 'b] (list [1 2 3 4] [10 20 30 40]))) " + "(count (select {from: t where: (!= a 2)})))"); + TEST_ASSERT_NOT_NULL(result); + TEST_ASSERT_FALSE(RAY_IS_ERR(result)); + TEST_ASSERT_EQ_I(result->type, -RAY_I64); + TEST_ASSERT_EQ_I(result->i64, 3); + ray_release(result); + + result = ray_eval_str( + "(do (set t (table ['a 'b] (list [1 2 3 4] [10 20 30 40]))) " + "(count (select {from: t where: (<= 3 a)})))"); + TEST_ASSERT_NOT_NULL(result); + TEST_ASSERT_FALSE(RAY_IS_ERR(result)); + TEST_ASSERT_EQ_I(result->type, -RAY_I64); + TEST_ASSERT_EQ_I(result->i64, 2); + ray_release(result); + PASS(); +} + /* ---- Test: select all ---- */ static test_result_t test_eval_select_all(void) { ray_t* result = ray_eval_str( @@ -6610,6 +6641,7 @@ const test_entry_t lang_entries[] = { { "lang/eval/at_table", test_eval_at_table, lang_setup, lang_teardown }, { "lang/eval/key_table", test_eval_key_table, lang_setup, lang_teardown }, { "lang/eval/count_table", test_eval_count_table, lang_setup, lang_teardown }, + { "lang/eval/count_select_where", test_eval_count_select_where, lang_setup, lang_teardown }, { "lang/eval/select_all", test_eval_select_all, lang_setup, lang_teardown }, { "lang/eval/select_where", test_eval_select_where, lang_setup, lang_teardown }, { "lang/eval/select_where_in_sym", test_eval_select_where_in_sym, lang_setup, lang_teardown }, diff --git a/test/test_store.c b/test/test_store.c index 98111db5..62760870 100644 --- a/test/test_store.c +++ b/test/test_store.c @@ 
-369,8 +369,10 @@ static test_result_t test_splay_str_column_roundtrip(void) { TEST_ASSERT_NOT_NULL(loaded_ids); TEST_ASSERT_NOT_NULL(loaded_names); TEST_ASSERT_EQ_U(loaded_ids->mmod, 1); - TEST_ASSERT_EQ_U(loaded_names->mmod, 0); + TEST_ASSERT_EQ_U(loaded_names->mmod, 1); TEST_ASSERT_EQ_I(loaded_names->type, RAY_STR); + TEST_ASSERT_NOT_NULL(loaded_names->str_pool); + TEST_ASSERT_EQ_U(loaded_names->str_pool->mmod, 2); TEST_ASSERT_TRUE(loaded_names->attrs & RAY_ATTR_HAS_NULLS); TEST_ASSERT_TRUE(ray_vec_is_null(loaded_names, 2)); @@ -392,12 +394,9 @@ static test_result_t test_splay_str_column_roundtrip(void) { } /* ---- test_splay_short_strv_roundtrip ---------------------------------- - * Regression: 0-row STRV columns serialize to 14 bytes and 1-row STRV - * columns with content < 10 bytes serialize under 32 bytes total. The - * splay reader uses ray_col_mmap which falls through col_validate_mapped; - * before the magic-aware fix, mapped_size < 32 returned "corrupt" and - * the splay loader's "nyi" fallback to ray_col_load never fired, - * making short STRV tables unreadable via ray_read_splayed. + * Regression: short string columns must remain readable through + * ray_read_splayed. Older files used STRV and could be smaller than the + * raw header; newer files use the raw RAY_STR layout and mmap directly. * ---------------------------------------------------------------------- */ static test_result_t test_splay_short_strv_roundtrip(void) {