diff --git a/.gitignore b/.gitignore
index 5d65ab34b1..259888fdff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,24 @@
 *.parquet
 hits.csv
 hits.tsv
+
+# Per-system runtime artifacts produced by benchmark.sh
+result.csv
+log.txt
+load_out.txt
+server.log
+server.pid
+arc_token.txt
+data-size.txt
+.doris_home
+.sirius_env
+
+# Per-system data files
+hits.db
+mydb
+hits.hyper
+hits.vortex
+*.vortex
+
+# Python venvs created by install scripts
+myenv/
diff --git a/arc/benchmark.sh b/arc/benchmark.sh
index d1f13caa72..b851876173 100755
--- a/arc/benchmark.sh
+++ b/arc/benchmark.sh
@@ -1,204 +1,5 @@
 #!/bin/bash
-# Arc ClickBench Complete Benchmark Script (Go Binary Version)
-set -e
-
-# ============================================================
-# 1. INSTALL ARC FROM .DEB PACKAGE
-# ============================================================
-echo "Installing Arc from .deb package..."
-
-# Fetch latest Arc version from GitHub releases
-echo "Fetching latest Arc version..."
-ARC_VERSION=$(curl -s https://api.github.com/repos/Basekick-Labs/arc/releases/latest | grep -oP '"tag_name": "v\K[^"]+')
-if [ -z "$ARC_VERSION" ]; then
-    echo "Error: Could not fetch latest Arc version from GitHub"
-    exit 1
-fi
-echo "Latest Arc version: $ARC_VERSION"
-
-ARCH=$(uname -m)
-if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
-    DEB_URL="https://github.com/Basekick-Labs/arc/releases/download/v${ARC_VERSION}/arc_${ARC_VERSION}_arm64.deb"
-    DEB_FILE="arc_${ARC_VERSION}_arm64.deb"
-else
-    DEB_URL="https://github.com/Basekick-Labs/arc/releases/download/v${ARC_VERSION}/arc_${ARC_VERSION}_amd64.deb"
-    DEB_FILE="arc_${ARC_VERSION}_amd64.deb"
-fi
-
-echo "Detected architecture: $ARCH -> $DEB_FILE"
-
-if [ ! -f "$DEB_FILE" ]; then
-    wget -q "$DEB_URL" -O "$DEB_FILE"
-fi
-
-sudo dpkg -i "$DEB_FILE" || sudo apt-get install -f -y
-echo "[OK] Arc installed"
-
-# ============================================================
-# 2. PRINT SYSTEM INFO (Arc defaults)
-# ============================================================
-CORES=$(nproc)
-TOTAL_MEM_KB=$(grep MemTotal /proc/meminfo | awk '{print $2}')
-TOTAL_MEM_GB=$((TOTAL_MEM_KB / 1024 / 1024))
-MEM_LIMIT_GB=$((TOTAL_MEM_GB * 80 / 100)) # 80% of system RAM
-
-echo ""
-echo "System Configuration:"
-echo " CPU cores: $CORES"
-echo " Connections: $((CORES * 2)) (cores × 2)"
-echo " Threads: $CORES (same as cores)"
-echo " Memory limit: ${MEM_LIMIT_GB}GB (80% of ${TOTAL_MEM_GB}GB total)"
-echo ""
-
-# ============================================================
-# 3. START ARC AND CAPTURE TOKEN FROM LOGS
-# ============================================================
-echo "Starting Arc service..."
-
-# Check if we already have a valid token from a previous run
-if [ -f "arc_token.txt" ]; then
-    EXISTING_TOKEN=$(cat arc_token.txt)
-    echo "Found existing token file, will verify after Arc starts..."
-fi
-
-sudo systemctl start arc
-
-# Wait for Arc to be ready
-echo "Waiting for Arc to be ready..."
-for i in {1..30}; do
-    if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
-        echo "[OK] Arc is ready!"
-        break
-    fi
-    if [ $i -eq 30 ]; then
-        echo "Error: Arc failed to start within 30 seconds"
-        sudo journalctl -u arc --no-pager | tail -50
-        exit 1
-    fi
-    sleep 1
-done
-
-# Try to get token - either from existing file or from logs (first run)
-ARC_TOKEN=""
-
-# First, check if existing token works
-if [ -n "$EXISTING_TOKEN" ]; then
-    if curl -sf http://localhost:8000/health -H "x-api-key: $EXISTING_TOKEN" > /dev/null 2>&1; then
-        ARC_TOKEN="$EXISTING_TOKEN"
-        echo "[OK] Using existing token from arc_token.txt"
-    else
-        echo "Existing token invalid, looking for new token in logs..."
-    fi
-fi
-
-# If no valid token yet, try to extract from logs (first run scenario)
-if [ -z "$ARC_TOKEN" ]; then
-    ARC_TOKEN=$(sudo journalctl -u arc --no-pager | grep -oP '(?:Initial admin API token|Admin API token): \K[^\s]+' | head -1)
-    if [ -n "$ARC_TOKEN" ]; then
-        echo "[OK] Captured new token from logs"
-        echo "$ARC_TOKEN" > arc_token.txt
-    else
-        echo "Error: Could not find or validate API token"
-        echo "If this is not the first run, Arc's database may need to be reset:"
-        echo " sudo rm -rf /var/lib/arc/data/arc.db"
-        exit 1
-    fi
-fi
-
-echo "Token: ${ARC_TOKEN:0:20}..."
-
-# ============================================================
-# 4. DOWNLOAD DATASET
-# ============================================================
-DATASET_FILE="hits.parquet"
-DATASET_URL="https://datasets.clickhouse.com/hits_compatible/hits.parquet"
-EXPECTED_SIZE=14779976446
-
-if [ -f "$DATASET_FILE" ]; then
-    CURRENT_SIZE=$(stat -c%s "$DATASET_FILE" 2>/dev/null || stat -f%z "$DATASET_FILE" 2>/dev/null)
-    if [ "$CURRENT_SIZE" -eq "$EXPECTED_SIZE" ]; then
-        echo "[OK] Dataset already downloaded (14GB)"
-    else
-        echo "Re-downloading dataset (size mismatch)..."
-        rm -f "$DATASET_FILE"
-        wget --continue --progress=dot:giga "$DATASET_URL"
-    fi
-else
-    echo "Downloading ClickBench dataset (14GB)..."
-    wget --continue --progress=dot:giga "$DATASET_URL"
-fi
-
-# ============================================================
-# 5. LOAD DATA INTO ARC
-# ============================================================
-echo "Loading data into Arc..."
-
-# Determine Arc's data directory (default: /var/lib/arc/data)
-ARC_DATA_DIR="/var/lib/arc/data"
-TARGET_DIR="$ARC_DATA_DIR/clickbench/hits"
-TARGET_FILE="$TARGET_DIR/hits.parquet"
-
-sudo mkdir -p "$TARGET_DIR"
-
-if [ -f "$TARGET_FILE" ]; then
-    SOURCE_SIZE=$(stat -c%s "$DATASET_FILE" 2>/dev/null || stat -f%z "$DATASET_FILE" 2>/dev/null)
-    TARGET_SIZE=$(stat -c%s "$TARGET_FILE" 2>/dev/null || stat -f%z "$TARGET_FILE" 2>/dev/null)
-    if [ "$SOURCE_SIZE" -eq "$TARGET_SIZE" ]; then
-        echo "[OK] Data already loaded"
-    else
-        echo "Reloading data (size mismatch)..."
-        sudo cp "$DATASET_FILE" "$TARGET_FILE"
-    fi
-else
-    sudo cp "$DATASET_FILE" "$TARGET_FILE"
-    echo "[OK] Data loaded to $TARGET_FILE"
-fi
-
-# ============================================================
-# 6. SET ENVIRONMENT AND RUN BENCHMARK
-# ============================================================
-export ARC_URL="http://localhost:8000"
-export ARC_API_KEY="$ARC_TOKEN"
-export DATABASE="clickbench"
-export TABLE="hits"
-
-echo ""
-echo "Running ClickBench queries (true cold runs)..."
-echo "================================================"
-./run.sh 2>&1 | tee log.txt
-
-# ============================================================
-# 7. STOP ARC AND FORMAT RESULTS
-# ============================================================
-echo "Stopping Arc..."
-sudo systemctl stop arc
-
-# Format results as proper JSON array
-cat log.txt | grep -oE '^[0-9]+\.[0-9]+|^null' | \
-    awk '{
-        if (NR % 3 == 1) printf "[";
-        printf "%s", $1;
-        if (NR % 3 == 0) print "],";
-        else printf ", ";
-    }' > results.txt
-
-echo ""
-echo "[OK] Benchmark complete!"
-echo "================================================"
-echo "Load time: 0"
-echo "Data size: $EXPECTED_SIZE"
-cat results.txt
-echo "================================================"
-
-# ============================================================
-# 8. CLEANUP
-# ============================================================
-echo "Cleaning up..."
-
-# Uninstall Arc package
-sudo dpkg -r arc || true
-
-# Remove Arc data directory
-sudo rm -rf /var/lib/arc
-
-echo "[OK] Cleanup complete"
+# Thin shim — actual flow is in lib/benchmark-common.sh.
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single"
+export BENCH_RESTARTABLE=yes
+exec ../lib/benchmark-common.sh
diff --git a/arc/check b/arc/check
new file mode 100755
index 0000000000..2ba2f88519
--- /dev/null
+++ b/arc/check
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -e
+
+ARC_URL="${ARC_URL:-http://localhost:8000}"
+TOKEN=$(cat arc_token.txt 2>/dev/null || true)
+
+if [ -n "$TOKEN" ]; then
+    curl -sf "$ARC_URL/health" -H "x-api-key: $TOKEN" >/dev/null
+else
+    curl -sf "$ARC_URL/health" >/dev/null
+fi
diff --git a/arc/data-size b/arc/data-size
new file mode 100755
index 0000000000..d37e32e8e1
--- /dev/null
+++ b/arc/data-size
@@ -0,0 +1,10 @@
+#!/bin/bash
+set -e
+
+# Source parquet file size (loaded into Arc's data directory).
+F="/var/lib/arc/data/clickbench/hits/hits.parquet"
+if [ -f "$F" ]; then
+    sudo stat -c%s "$F"
+else
+    echo 14779976446
+fi
diff --git a/arc/install b/arc/install
new file mode 100755
index 0000000000..eb79fb4bf6
--- /dev/null
+++ b/arc/install
@@ -0,0 +1,28 @@
+#!/bin/bash
+set -e
+
+# Install Arc from a .deb release. Idempotent.
+if dpkg -l arc 2>/dev/null | grep -q '^ii '; then
+    exit 0
+fi
+
+ARC_VERSION=$(curl -s https://api.github.com/repos/Basekick-Labs/arc/releases/latest \
+    | grep -oP '"tag_name": "v\K[^"]+')
+if [ -z "$ARC_VERSION" ]; then
+    echo "Error: Could not fetch latest Arc version from GitHub" >&2
+    exit 1
+fi
+
+ARCH=$(uname -m)
+if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
+    DEB_FILE="arc_${ARC_VERSION}_arm64.deb"
+else
+    DEB_FILE="arc_${ARC_VERSION}_amd64.deb"
+fi
+DEB_URL="https://github.com/Basekick-Labs/arc/releases/download/v${ARC_VERSION}/${DEB_FILE}"
+
+if [ ! -f "$DEB_FILE" ]; then
+    wget -q "$DEB_URL" -O "$DEB_FILE"
+fi
+
+sudo dpkg -i "$DEB_FILE" || sudo apt-get install -f -y
diff --git a/arc/load b/arc/load
new file mode 100755
index 0000000000..b46a4e3265
--- /dev/null
+++ b/arc/load
@@ -0,0 +1,20 @@
+#!/bin/bash
+set -e
+
+# Arc loads the parquet file into its data directory and indexes it on startup.
+ARC_DATA_DIR="/var/lib/arc/data"
+TARGET_DIR="$ARC_DATA_DIR/clickbench/hits"
+TARGET_FILE="$TARGET_DIR/hits.parquet"
+
+sudo mkdir -p "$TARGET_DIR"
+
+if [ -f "$TARGET_FILE" ] && \
+   [ "$(stat -c%s hits.parquet)" -eq "$(stat -c%s "$TARGET_FILE")" ]; then
+    : # already loaded
+else
+    sudo cp hits.parquet "$TARGET_FILE"
+fi
+
+# Free up local space.
+rm -f hits.parquet
+sync
diff --git a/arc/query b/arc/query
new file mode 100755
index 0000000000..da3619df9f
--- /dev/null
+++ b/arc/query
@@ -0,0 +1,49 @@
+#!/bin/bash
+# Reads a SQL query from stdin, POSTs it to Arc's HTTP API.
+# Stdout: query response body (JSON).
+# Stderr: query runtime in fractional seconds on the last line (extracted +# from Arc's journal log line `execution_time_ms=N`). +# Exit non-zero on error. +set -e + +ARC_URL="${ARC_URL:-http://localhost:8000}" +ARC_API_KEY="${ARC_API_KEY:-$(cat arc_token.txt 2>/dev/null)}" + +query=$(cat) + +# Build JSON payload with proper escaping. +JSON_PAYLOAD=$(jq -Rs '{sql: .}' <<<"$query") + +# Mark journal position so we can locate the matching execution_time_ms entry. +LOG_MARKER=$(date -u +"%Y-%m-%dT%H:%M:%S") + +RESPONSE=$(curl -s -w "\n%{http_code}" \ + -X POST "$ARC_URL/api/v1/query" \ + -H "x-api-key: $ARC_API_KEY" \ + -H "Content-Type: application/json" \ + -d "$JSON_PAYLOAD" \ + --max-time 300) + +HTTP_CODE=$(printf '%s\n' "$RESPONSE" | tail -1) +BODY=$(printf '%s\n' "$RESPONSE" | head -n -1) + +if [ "$HTTP_CODE" != "200" ]; then + printf 'arc query failed: HTTP %s\n%s\n' "$HTTP_CODE" "$BODY" >&2 + exit 1 +fi + +# Result body to stdout. +printf '%s\n' "$BODY" + +# Extract execution_time_ms from Arc's journal — give it a moment to flush. +sleep 0.1 +EXEC_MS=$(sudo journalctl -u arc --since="$LOG_MARKER" --no-pager 2>/dev/null \ + | grep -oP 'execution_time_ms=\K[0-9]+' | tail -1) + +if [ -z "$EXEC_MS" ]; then + echo "Could not extract execution_time_ms from arc journal" >&2 + exit 1 +fi + +# Convert ms -> seconds and emit on stderr. +awk -v ms="$EXEC_MS" 'BEGIN { printf "%.4f\n", ms / 1000 }' >&2 diff --git a/arc/results/20260509/c6a.4xlarge.json b/arc/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..5ad3953393 --- /dev/null +++ b/arc/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Arc", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Go","column-oriented","time-series"], + "load_time": 57, + "data_size": 14779976446, + "result": [ + [0.066, 0.001, 0.002], + [0.097, 0.018, 0.018], + [0.142, 0.046, 0.045], + [0.348, 0.047, 0.045], + [0.481, 0.316, 0.312], + [0.832, 0.504, 0.464], + [0.084, 0.025, 0.025], + [0.102, 0.02, 0.02], + [0.765, 0.417, 0.413], + [1.082, 0.515, 0.51], + [0.424, 0.109, 0.11], + [0.453, 0.128, 0.13], + [0.838, 0.501, 0.508], + [2.379, 0.832, 0.816], + [0.899, 0.558, 0.552], + [0.533, 0.368, 0.369], + [2.307, 0.934, 0.936], + [2.083, 0.718, 0.7], + [4.467, 1.587, 1.581], + [0.219, 0.016, 0.015], + [9.431, 0.855, 0.854], + [11.015, 0.816, 0.804], + [19.94, 1.709, 1.671], + [2.655, 0.41, 0.399], + [0.213, 0.133, 0.136], + [0.738, 0.266, 0.266], + [0.259, 0.109, 0.106], + [9.458, 0.733, 0.74], + [8.369, 4.222, 4.206], + [0.135, 0.047, 0.049], + [2.178, 0.527, 0.541], + [5.704, 0.622, 0.627], + [5.051, 1.706, 1.699], + [9.95, 2.035, 2.007], + [9.967, 2.099, 2.086], + [0.612, 0.488, 0.442], + [0.229, 0.115, 0.111], + [0.159, 0.091, 0.091], + [0.168, 0.068, 0.056], + [0.417, 0.23, 0.224], + [0.12, 0.027, 0.026], + [0.094, 0.025, 0.025], + [0.095, 0.034, 0.029] +] +} + diff --git a/arc/results/20260509/c6a.metal.json b/arc/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..cdba52c52a --- /dev/null +++ b/arc/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Arc", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Go","column-oriented","time-series"], + "load_time": 63, + "data_size": 14779976446, + "result": [ + [0.08, 0.001, 0.001], + [0.109, 0.009, 0.011], + [0.131, 0.018, 0.015], + [0.297, 0.024, 0.018], + 
[0.403, 0.124, 0.116], + [0.799, 0.196, 0.19], + [0.104, 0.008, 0.01], + [0.133, 0.017, 0.011], + [0.713, 0.249, 0.226], + [0.958, 0.237, 0.282], + [0.391, 0.07, 0.059], + [0.418, 0.078, 0.061], + [0.788, 0.208, 0.21], + [2.162, 0.39, 0.385], + [0.862, 0.223, 0.223], + [0.473, 0.175, 0.153], + [2.085, 0.292, 0.261], + [2.21, 0.341, 0.334], + [4.088, 0.698, 0.618], + [0.168, 0.018, 0.015], + [9.417, 0.193, 0.186], + [10.954, 0.191, 0.174], + [19.812, 0.581, 0.41], + [11.28, 0.433, 0.385], + [2.044, 0.13, 0.095], + [0.693, 0.103, 0.082], + [2.016, 0.099, 0.089], + [9.446, 0.209, 0.196], + [8.249, 0.817, 0.789], + [0.161, 0.033, 0.025], + [2.066, 0.202, 0.209], + [5.495, 0.278, 0.209], + [4.999, 0.997, 0.894], + [9.882, 0.8, 0.761], + [9.902, 0.811, 0.759], + [0.453, 0.192, 0.165], + [0.297, 0.119, 0.114], + [0.228, 0.094, 0.092], + [0.258, 0.063, 0.06], + [0.571, 0.224, 0.217], + [0.221, 0.038, 0.031], + [0.172, 0.033, 0.029], + [0.158, 0.037, 0.033] +] +} + diff --git a/arc/run.sh b/arc/run.sh deleted file mode 100755 index 4145c9c5b4..0000000000 --- a/arc/run.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/bin/bash -# Arc ClickBench Benchmark Runner - TRUE COLD RUNS -# Restarts Arc service and clears OS cache before EACH QUERY (not each run) -# Pattern: restart -> run query 3 times -> next query - -TRIES=3 -ARC_URL="${ARC_URL:-http://localhost:8000}" -ARC_API_KEY="${ARC_API_KEY:-$(cat arc_token.txt 2>/dev/null)}" - -echo "Running benchmark with TRUE COLD RUNS (restart + cache clear before each query)" >&2 -echo "API endpoint: $ARC_URL" >&2 -echo "API key: ${ARC_API_KEY:0:20}..." >&2 - -# Function to restart Arc and clear caches -restart_arc() { - # Stop Arc - sudo systemctl stop arc - - # Clear OS page cache - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - # Start Arc - sudo systemctl start arc - - # Wait for Arc to be ready - for i in {1..30}; do - if curl -sf "$ARC_URL/health" > /dev/null 2>&1; then - sleep 0.2 # Extra delay to ensure server is fully ready - return 0 - fi - sleep 0.5 - done - echo "Error: Arc failed to restart" >&2 - return 1 -} - -# Read queries line by line -cat queries.sql | while read -r query; do - # Skip empty lines and comments - [[ -z "$query" || "$query" =~ ^-- ]] && continue - - # TRUE COLD RUN: Restart Arc and clear OS cache ONCE per query - restart_arc - - echo "$query" >&2 - - # Run the query 3 times (first is cold, 2-3 benefit from warm DB caches) - for i in $(seq 1 $TRIES); do - # Mark the log position before query - LOG_MARKER=$(date -u +"%Y-%m-%dT%H:%M:%S") - - # Build JSON payload properly using printf to escape the query - JSON_PAYLOAD=$(printf '{"sql": %s}' "$(echo "$query" | jq -Rs .)") - - # Execute query - RESPONSE=$(curl -s -w "\n%{http_code}" \ - -X POST "$ARC_URL/api/v1/query" \ - -H "x-api-key: $ARC_API_KEY" \ - -H "Content-Type: application/json" \ - -d "$JSON_PAYLOAD" \ - --max-time 300 2>/dev/null) - - HTTP_CODE=$(echo "$RESPONSE" | tail -1) - - if [ "$HTTP_CODE" = "200" ]; then - # Extract execution_time_ms from Arc logs - # Log format: 2025-11-28T14:20:44Z INF ... execution_time_ms=97 ... 
- sleep 0.1 # Small delay to ensure log is written - EXEC_TIME_MS=$(sudo journalctl -u arc --since="$LOG_MARKER" --no-pager 2>/dev/null | \ - grep -oP 'execution_time_ms=\K[0-9]+' | tail -1) - - if [ -n "$EXEC_TIME_MS" ]; then - # Convert ms to seconds with 4 decimal places - EXEC_TIME_SEC=$(echo "scale=4; $EXEC_TIME_MS / 1000" | bc) - printf "%.4f\n" "$EXEC_TIME_SEC" - else - echo "null" - echo "Warning: Could not extract execution_time_ms from logs" >&2 - fi - else - echo "null" - if [ "$i" -eq 1 ]; then - echo "Query failed (HTTP $HTTP_CODE): ${query:0:50}..." >&2 - echo "Response: $(echo "$RESPONSE" | head -n -1 | head -c 200)" >&2 - fi - fi - done -done - -echo "Benchmark complete!" >&2 diff --git a/arc/start b/arc/start new file mode 100755 index 0000000000..d06f81cab1 --- /dev/null +++ b/arc/start @@ -0,0 +1,35 @@ +#!/bin/bash +set -e + +ARC_URL="${ARC_URL:-http://localhost:8000}" + +# Idempotent: if already up and we have a working token, do nothing. +if [ -f arc_token.txt ]; then + TOKEN=$(cat arc_token.txt) + if curl -sf "$ARC_URL/health" -H "x-api-key: $TOKEN" >/dev/null 2>&1; then + exit 0 + fi +fi + +sudo systemctl start arc + +# Wait for the HTTP endpoint to come up before we try to read the token. +for _ in $(seq 1 30); do + if curl -sf "$ARC_URL/health" >/dev/null 2>&1; then + break + fi + sleep 1 +done + +# On first start, Arc prints its admin token to its journal; capture it. +if [ ! -f arc_token.txt ] || \ + ! curl -sf "$ARC_URL/health" -H "x-api-key: $(cat arc_token.txt)" >/dev/null 2>&1; then + TOKEN=$(sudo journalctl -u arc --no-pager \ + | grep -oP '(?:Initial admin API token|Admin API token): \K[^\s]+' \ + | head -1) + if [ -z "$TOKEN" ]; then + echo "Error: Could not extract Arc admin API token from journal" >&2 + exit 1 + fi + echo "$TOKEN" > arc_token.txt +fi diff --git a/arc/stop b/arc/stop new file mode 100755 index 0000000000..98db475d8e --- /dev/null +++ b/arc/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo systemctl stop arc || true diff --git a/byconity/benchmark.sh b/byconity/benchmark.sh index af59bdff20..02755da37b 100755 --- a/byconity/benchmark.sh +++ b/byconity/benchmark.sh @@ -1,45 +1,10 @@ -#!/bin/bash -e - -sudo apt-get update -y -sudo apt-get install -y ca-certificates curl gnupg -sudo install -m 0755 -d /etc/apt/keyrings -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --yes --dearmor -o /etc/apt/keyrings/docker.gpg -sudo chmod a+r /etc/apt/keyrings/docker.gpg -echo \ - "deb [arch="$(dpkg --print-architecture)" signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ - "$(. 
/etc/os-release && echo "$VERSION_CODENAME")" stable" | \ - sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - -sudo apt-get update -y -sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - -# Make sure docker is running -sudo systemctl start docker - -docker compose up -d -sleep 5 - -hdfs/create_users.sh - -function byconity() -{ - docker compose exec -T server clickhouse-client --port 52145 "$@" -} -export -f byconity - -byconity --time -n < create.sql -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' -pigz -fkd hits.tsv.gz - -START=$(date +%s) -byconity --database bench --query "INSERT INTO hits FORMAT TSV" < hits.tsv -END=$(date +%s) -echo "Load time: $(echo "$END - $START" | bc)" - -# NOTE: sometimes may hung due to docker-compose, using docker directly may help -./run.sh - -echo -n "Data size: " -byconity --enable_multiple_tables_for_cnch_parts=1 --query "SELECT sum(bytes_on_disk) FROM system.cnch_parts WHERE table = 'hits' AND database = 'bench'" - -docker compose down --volumes +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +# byconity boots a chain of containers (fdb -> tso -> server -> workers +# / daemon-manager). Each later step waits up to 600s for its +# dependency, so the worst-case cold start is several minutes; the +# lib's 300s default has timed out before server is up. +export BENCH_CHECK_TIMEOUT=1200 +exec ../lib/benchmark-common.sh diff --git a/byconity/check b/byconity/check new file mode 100755 index 0000000000..55e22e34f3 --- /dev/null +++ b/byconity/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +docker compose exec -T server clickhouse-client --port 52145 \ + --query "SELECT 1" >/dev/null diff --git a/byconity/data-size b/byconity/data-size new file mode 100755 index 0000000000..30383c1dda --- /dev/null +++ b/byconity/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +docker compose exec -T server clickhouse-client --port 52145 \ + --enable_multiple_tables_for_cnch_parts=1 \ + --query "SELECT sum(bytes_on_disk) FROM system.cnch_parts WHERE table = 'hits' AND database = 'bench'" diff --git a/byconity/docker-compose.yml b/byconity/docker-compose.yml index b65597524b..68710c03ff 100644 --- a/byconity/docker-compose.yml +++ b/byconity/docker-compose.yml @@ -1,6 +1,6 @@ --- -version: "3" - +# `version: "3"` was removed — Compose v2 ignores it and warns. +# # NOTE: # - you cannot use network_mode=host (to optimize out network overhead) because hadoop does not work without DNS. 
@@ -45,7 +45,7 @@ services: # byconity: tso: - image: byconity/byconity:0.1.0-GA + image: byconity/byconity:1.0.1-hotfix1 environment: PATH: /usr/sbin:/usr/bin:/sbin:/bin:/opt/byconity/bin command: bash -c "fdbcli -C /config/fdb.cluster --exec \"configure new single ssd\"; tso-server --config-file /config/tso.yml" @@ -57,10 +57,10 @@ services: container_name: tso-0 server: - image: byconity/byconity:0.1.0-GA + image: byconity/byconity:1.0.1-hotfix1 environment: PATH: /usr/sbin:/usr/bin:/sbin:/bin:/opt/byconity/bin - command: bash -c "curl --retry 10 --retry-delay 5 --retry-connrefused --retry-max-time 120 --max-time 120 tso-0:18845 && clickhouse-server --config-file /config/server.yml" + command: bash -c "curl --retry 10 --retry-delay 5 --retry-connrefused --retry-max-time 600 --max-time 600 tso-0:18845 && clickhouse-server --config-file /config/server.yml" depends_on: - tso - hdfs @@ -72,10 +72,10 @@ services: - ./simple/:/config/:ro worker-write: - image: byconity/byconity:0.1.0-GA + image: byconity/byconity:1.0.1-hotfix1 environment: PATH: /usr/sbin:/usr/bin:/sbin:/bin:/opt/byconity/bin - command: bash -c "curl --retry 10 --retry-delay 5 --retry-connrefused --retry-max-time 120 --max-time 120 server:21557 && clickhouse-server --config-file /config/worker.yml" + command: bash -c "curl --retry 10 --retry-delay 5 --retry-connrefused --retry-max-time 600 --max-time 600 server:21557 && clickhouse-server --config-file /config/worker.yml" depends_on: - server container_name: worker-write-0 @@ -83,10 +83,10 @@ services: - ./simple/:/config/:ro worker-default: - image: byconity/byconity:0.1.0-GA + image: byconity/byconity:1.0.1-hotfix1 environment: PATH: /usr/sbin:/usr/bin:/sbin:/bin:/opt/byconity/bin - command: bash -c "curl --retry 10 --retry-delay 5 --retry-connrefused --retry-max-time 120 --max-time 120 server:21557 && clickhouse-server --config-file /config/worker.yml" + command: bash -c "curl --retry 10 --retry-delay 5 --retry-connrefused --retry-max-time 600 --max-time 600 server:21557 && clickhouse-server --config-file /config/worker.yml" depends_on: - server container_name: worker-default-0 @@ -94,10 +94,10 @@ services: - ./simple/:/config/:ro daemon-manager: - image: byconity/byconity:0.1.0-GA + image: byconity/byconity:1.0.1-hotfix1 environment: PATH: /usr/sbin:/usr/bin:/sbin:/bin:/opt/byconity/bin - command: bash -c "curl --retry 10 --retry-delay 5 --retry-connrefused --retry-max-time 120 --max-time 120 server:21557 && daemon-manager --config-file ./config/daemon-manager.yml" + command: bash -c "curl --retry 10 --retry-delay 5 --retry-connrefused --retry-max-time 600 --max-time 600 server:21557 && daemon-manager --config-file ./config/daemon-manager.yml" depends_on: - server container_name: daemon-manager-0 diff --git a/byconity/hdfs/create_users.sh b/byconity/hdfs/create_users.sh index ad6ecc6873..93b02ac5f3 100755 --- a/byconity/hdfs/create_users.sh +++ b/byconity/hdfs/create_users.sh @@ -1,6 +1,8 @@ #!/bin/bash set -e -x -docker exec hdfs-namenode hdfs dfs -mkdir /user -docker exec hdfs-namenode hdfs dfs -mkdir /user/clickhouse + +# Idempotent (-p), so re-running across BENCH_RESTARTABLE iterations +# doesn't fail on "File exists" once the dirs are set up. 
+docker exec hdfs-namenode hdfs dfs -mkdir -p /user/clickhouse
 docker exec hdfs-namenode hdfs dfs -chown clickhouse /user/clickhouse
-docker exec hdfs-namenode hdfs dfs -chmod -R 775 /user/clickhouse
+docker exec hdfs-namenode hdfs dfs -chmod -R 775 /user/clickhouse
diff --git a/byconity/install b/byconity/install
new file mode 100755
index 0000000000..96464aef64
--- /dev/null
+++ b/byconity/install
@@ -0,0 +1,35 @@
+#!/bin/bash
+set -e
+
+# Install Docker (required by byconity's compose stack). Idempotent.
+#
+# Two checks: the `docker` CLI itself, and the v2 `docker compose` plugin.
+# Ubuntu's `docker.io` ships docker without the compose plugin, so a box can
+# satisfy `command -v docker` but still fail `docker compose up`.
+
+if ! command -v docker >/dev/null 2>&1; then
+    sudo apt-get update -y
+    sudo apt-get install -y ca-certificates curl gnupg
+    sudo install -m 0755 -d /etc/apt/keyrings
+    curl -fsSL https://download.docker.com/linux/ubuntu/gpg \
+        | sudo gpg --yes --dearmor -o /etc/apt/keyrings/docker.gpg
+    sudo chmod a+r /etc/apt/keyrings/docker.gpg
+    {
+        echo -n "deb [arch=$(dpkg --print-architecture) "
+        echo -n "signed-by=/etc/apt/keyrings/docker.gpg] "
+        echo -n "https://download.docker.com/linux/ubuntu "
+        echo "$(. /etc/os-release && echo "$VERSION_CODENAME") stable"
+    } | sudo tee /etc/apt/sources.list.d/docker.list >/dev/null
+
+    sudo apt-get update -y
+    sudo apt-get install -y docker-ce docker-ce-cli containerd.io \
+        docker-buildx-plugin docker-compose-plugin
+elif ! docker compose version >/dev/null 2>&1; then
+    sudo apt-get update -y
+    # Docker.com's apt repo ships the plugin as `docker-compose-plugin`;
+    # Ubuntu's own repo (24.04+) ships it as `docker-compose-v2`. Try both.
+    sudo apt-get install -y docker-compose-plugin \
+        || sudo apt-get install -y docker-compose-v2
+fi
+
+sudo systemctl start docker
diff --git a/byconity/load b/byconity/load
new file mode 100755
index 0000000000..8a262cfd48
--- /dev/null
+++ b/byconity/load
@@ -0,0 +1,21 @@
+#!/bin/bash
+set -e
+
+byconity() {
+    docker compose exec -T server clickhouse-client --port 52145 "$@"
+}
+
+# HDFS user setup is required for ingestion. Done here (rather than in
+# ./start, which races the namenode coming up) — by the time ./check
+# has gated us through, hdfs-namenode is reliably ready.
+hdfs/create_users.sh
+
+# Schema (creates database `bench` and table `bench.hits`).
+byconity --multiquery < create.sql
+
+# Ingest from the downloaded TSV. lib/download-hits-tsv already
+# decompresses .gz to hits.tsv, so no pigz step here.
+byconity --database bench --query "INSERT INTO hits FORMAT TSV" < hits.tsv
+
+rm -f hits.tsv
+sync
diff --git a/byconity/query b/byconity/query
new file mode 100755
index 0000000000..eb1df09b0d
--- /dev/null
+++ b/byconity/query
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Reads a SQL query from stdin, runs it via clickhouse-client inside the
+# byconity server container.
+# Stdout: query result (default format).
+# Stderr: query runtime in fractional seconds on the last line (from --time).
+# Exit non-zero on error.
+set -e + +query=$(cat) +docker compose exec -T server clickhouse-client --port 52145 \ + --database bench --time --query="$query" diff --git a/byconity/run.sh b/byconity/run.sh deleted file mode 100755 index 720c678601..0000000000 --- a/byconity/run.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -cat queries.sql | while read -r query; do - [ -z "$FQDN" ] && sync - [ -z "$FQDN" ] && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(byconity --database bench --time --format=Null --query="$query" &1 ||:) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/byconity/start b/byconity/start new file mode 100755 index 0000000000..53fd3bce52 --- /dev/null +++ b/byconity/start @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +# Bring the byconity stack up. Idempotent: if the server already responds, +# do nothing. +if docker compose exec -T server clickhouse-client --port 52145 \ + --query "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi + +docker compose up -d + +# HDFS user setup runs in load (after ./check confirms the stack is up +# — hdfs-namenode is a dependency of server, so when SELECT 1 works, +# namenode has been ready long enough to accept dfs commands too). Doing +# it here racing the bring-up was failing on cold instances when sleep 5 +# wasn't long enough. diff --git a/byconity/stop b/byconity/stop new file mode 100755 index 0000000000..a6a2c3661d --- /dev/null +++ b/byconity/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +docker compose down --volumes || true diff --git a/bytehouse/README.md b/bytehouse/README.md index f7242cb0f1..c395bce2f3 100644 --- a/bytehouse/README.md +++ b/bytehouse/README.md @@ -1,6 +1,15 @@ Bytehouse is a derivative of ClickHouse. It is based on very old ClickHouse version (20.4.54418) and many features are unsupported. +## Status + +ByteHouse's international cloud (bytehouse.cloud) is no longer reachable +from outside the China region. The service still operates within China +via Volcengine. All existing results in this directory were collected +against the international cloud and have been re-tagged with +`"historical"`. Future submissions running against a self-managed +ByteHouse instance (or via Volcengine) should not be tagged historical. + https://bytehouse.cloud/signup Sign Up. Only Asia-Pacific South-East 1 AWS region is available. Verify email. diff --git a/bytehouse/results/20220716/l.json b/bytehouse/results/20220716/l.json index 1cd40293d8..8715df88fc 100644 --- a/bytehouse/results/20220716/l.json +++ b/bytehouse/results/20220716/l.json @@ -9,7 +9,8 @@ "managed", "column-oriented", "ClickHouse derivative", - "C++" + "C++", + "historical" ], "load_time": 745, "data_size": 27190000000, @@ -229,5 +230,6 @@ null, null ] - ] + ], + "comment": "Historical: ByteHouse cloud (bytehouse.cloud) result; the service is no longer reachable from outside the China region." 
} diff --git a/bytehouse/results/20220716/m.json b/bytehouse/results/20220716/m.json index 7aba5d100d..2d04945d4d 100644 --- a/bytehouse/results/20220716/m.json +++ b/bytehouse/results/20220716/m.json @@ -9,7 +9,8 @@ "managed", "column-oriented", "ClickHouse derivative", - "C++" + "C++", + "historical" ], "load_time": 745, "data_size": 27190000000, @@ -229,5 +230,6 @@ null, null ] - ] + ], + "comment": "Historical: ByteHouse cloud (bytehouse.cloud) result; the service is no longer reachable from outside the China region." } diff --git a/bytehouse/results/20220716/s.json b/bytehouse/results/20220716/s.json index 51599e1d35..6ad08a7794 100644 --- a/bytehouse/results/20220716/s.json +++ b/bytehouse/results/20220716/s.json @@ -9,7 +9,8 @@ "managed", "column-oriented", "ClickHouse derivative", - "C++" + "C++", + "historical" ], "load_time": 745, "data_size": 27190000000, @@ -229,5 +230,6 @@ null, null ] - ] + ], + "comment": "Historical: ByteHouse cloud (bytehouse.cloud) result; the service is no longer reachable from outside the China region." } diff --git a/bytehouse/results/20220716/xs.json b/bytehouse/results/20220716/xs.json index e2da646efc..7d4c81f593 100644 --- a/bytehouse/results/20220716/xs.json +++ b/bytehouse/results/20220716/xs.json @@ -9,7 +9,8 @@ "managed", "column-oriented", "ClickHouse derivative", - "C++" + "C++", + "historical" ], "load_time": 745, "data_size": 27190000000, @@ -229,5 +230,6 @@ null, null ] - ] + ], + "comment": "Historical: ByteHouse cloud (bytehouse.cloud) result; the service is no longer reachable from outside the China region." } diff --git a/bytehouse/results/20220727/l.json b/bytehouse/results/20220727/l.json index afd17de199..aab69a15bd 100644 --- a/bytehouse/results/20220727/l.json +++ b/bytehouse/results/20220727/l.json @@ -6,55 +6,231 @@ "proprietary": "yes", "hardware": "cpu", "tuned": "no", - - "tags": ["managed", "column-oriented", "ClickHouse derivative", "C++"], - + "tags": [ + "managed", + "column-oriented", + "ClickHouse derivative", + "C++", + "historical" + ], "load_time": 745, "data_size": 27190000000, - "result": [ -[0.239,0.087,0.222], -[0.182,0.105,0.131], -[0.200,0.138,0.121], -[0.380,0.140,0.106], -[0.345,0.315,0.348], -[0.499,0.346,0.376], -[0.163,0.199,0.104], -[0.135,0.121,0.132], -[0.498,0.443,0.451], -[0.925,0.931,0.938], -[0.208,0.138,0.154], -[0.184,0.166,0.189], -[0.375,0.440,0.408], -[2.876,2.879,2.856], -[0.514,0.481,0.525], -[0.379,0.403,0.409], -[1.078,0.981,0.998], -[0.970,0.991,1.019], -[2.329,2.028,2.058], -[0.121,0.087,0.117], -[1.242,0.409,0.381], -[0.333,0.362,0.368], -[0.724,0.616,0.609], -[8.372,1.940,1.919], -[0.215,0.166,0.183], -[0.159,0.173,0.160], -[0.173,0.217,0.178], -[0.364,0.392,0.392], -[3.258,3.055,3.158], -[0.969,0.944,1.001], -[0.335,0.344,0.435], -[0.563,0.597,0.546], -[2.824,2.784,2.773], -[2.042,1.985,1.963], -[1.972,1.905,2.000], -[0.660,0.638,0.638], -[0.273,0.291,0.275], -[0.197,0.204,0.245], -[0.158,0.190,0.220], -[0.420,0.410,0.391], -[0.185,0.152,0.138], -[0.302,0.146,0.132], -[0.153,0.138,0.140] -] + [ + 0.239, + 0.087, + 0.222 + ], + [ + 0.182, + 0.105, + 0.131 + ], + [ + 0.2, + 0.138, + 0.121 + ], + [ + 0.38, + 0.14, + 0.106 + ], + [ + 0.345, + 0.315, + 0.348 + ], + [ + 0.499, + 0.346, + 0.376 + ], + [ + 0.163, + 0.199, + 0.104 + ], + [ + 0.135, + 0.121, + 0.132 + ], + [ + 0.498, + 0.443, + 0.451 + ], + [ + 0.925, + 0.931, + 0.938 + ], + [ + 0.208, + 0.138, + 0.154 + ], + [ + 0.184, + 0.166, + 0.189 + ], + [ + 0.375, + 0.44, + 0.408 + ], + [ + 2.876, + 2.879, + 2.856 + ], + [ + 0.514, + 
0.481, + 0.525 + ], + [ + 0.379, + 0.403, + 0.409 + ], + [ + 1.078, + 0.981, + 0.998 + ], + [ + 0.97, + 0.991, + 1.019 + ], + [ + 2.329, + 2.028, + 2.058 + ], + [ + 0.121, + 0.087, + 0.117 + ], + [ + 1.242, + 0.409, + 0.381 + ], + [ + 0.333, + 0.362, + 0.368 + ], + [ + 0.724, + 0.616, + 0.609 + ], + [ + 8.372, + 1.94, + 1.919 + ], + [ + 0.215, + 0.166, + 0.183 + ], + [ + 0.159, + 0.173, + 0.16 + ], + [ + 0.173, + 0.217, + 0.178 + ], + [ + 0.364, + 0.392, + 0.392 + ], + [ + 3.258, + 3.055, + 3.158 + ], + [ + 0.969, + 0.944, + 1.001 + ], + [ + 0.335, + 0.344, + 0.435 + ], + [ + 0.563, + 0.597, + 0.546 + ], + [ + 2.824, + 2.784, + 2.773 + ], + [ + 2.042, + 1.985, + 1.963 + ], + [ + 1.972, + 1.905, + 2.0 + ], + [ + 0.66, + 0.638, + 0.638 + ], + [ + 0.273, + 0.291, + 0.275 + ], + [ + 0.197, + 0.204, + 0.245 + ], + [ + 0.158, + 0.19, + 0.22 + ], + [ + 0.42, + 0.41, + 0.391 + ], + [ + 0.185, + 0.152, + 0.138 + ], + [ + 0.302, + 0.146, + 0.132 + ], + [ + 0.153, + 0.138, + 0.14 + ] + ], + "comment": "Historical: ByteHouse cloud (bytehouse.cloud) result; the service is no longer reachable from outside the China region." } diff --git a/bytehouse/results/20220727/m.json b/bytehouse/results/20220727/m.json index 421e7f13f4..9dea02fb9d 100644 --- a/bytehouse/results/20220727/m.json +++ b/bytehouse/results/20220727/m.json @@ -6,55 +6,231 @@ "proprietary": "yes", "hardware": "cpu", "tuned": "no", - - "tags": ["managed", "column-oriented", "ClickHouse derivative", "C++"], - + "tags": [ + "managed", + "column-oriented", + "ClickHouse derivative", + "C++", + "historical" + ], "load_time": 745, "data_size": 27190000000, - "result": [ -[0.204,0.124,0.111], -[0.162,0.120,0.129], -[0.138,0.124,0.100], -[0.424,0.193,0.148], -[0.626,0.636,0.701], -[0.700,0.614,0.563], -[0.226,0.107,0.105], -[0.115,0.129,0.135], -[0.908,0.890,0.930], -[1.072,1.196,1.085], -[0.185,0.229,0.228], -[0.210,0.220,0.229], -[0.777,0.751,0.808], -[3.443,3.529,3.431], -[0.918,0.941,0.964], -[0.814,0.826,0.830], -[2.154,2.196,2.145], -[2.002,2.010,1.968], -[5.072,5.071,4.954], -[0.127,0.107,0.105], -[1.344,0.761,0.738], -[0.847,0.866,0.790], -[2.173,1.909,1.956], -[6.798,6.030,5.961], -[0.232,0.241,0.225], -[0.224,0.213,0.165], -[0.249,0.236,0.234], -[0.886,0.895,0.934], -[8.905,8.909,9.119], -[2.729,2.700,2.805], -[0.599,0.679,0.657], -[1.120,1.130,1.114], -[6.566,6.552,6.493], -[4.601,4.529,4.554], -[4.544,4.497,4.527], -[1.338,1.347,1.313], -[0.298,0.295,0.279], -[0.163,0.188,0.158], -[0.210,0.197,0.140], -[0.395,0.434,0.453], -[0.175,0.175,0.145], -[0.159,0.126,0.119], -[0.188,0.140,0.117] -] + [ + 0.204, + 0.124, + 0.111 + ], + [ + 0.162, + 0.12, + 0.129 + ], + [ + 0.138, + 0.124, + 0.1 + ], + [ + 0.424, + 0.193, + 0.148 + ], + [ + 0.626, + 0.636, + 0.701 + ], + [ + 0.7, + 0.614, + 0.563 + ], + [ + 0.226, + 0.107, + 0.105 + ], + [ + 0.115, + 0.129, + 0.135 + ], + [ + 0.908, + 0.89, + 0.93 + ], + [ + 1.072, + 1.196, + 1.085 + ], + [ + 0.185, + 0.229, + 0.228 + ], + [ + 0.21, + 0.22, + 0.229 + ], + [ + 0.777, + 0.751, + 0.808 + ], + [ + 3.443, + 3.529, + 3.431 + ], + [ + 0.918, + 0.941, + 0.964 + ], + [ + 0.814, + 0.826, + 0.83 + ], + [ + 2.154, + 2.196, + 2.145 + ], + [ + 2.002, + 2.01, + 1.968 + ], + [ + 5.072, + 5.071, + 4.954 + ], + [ + 0.127, + 0.107, + 0.105 + ], + [ + 1.344, + 0.761, + 0.738 + ], + [ + 0.847, + 0.866, + 0.79 + ], + [ + 2.173, + 1.909, + 1.956 + ], + [ + 6.798, + 6.03, + 5.961 + ], + [ + 0.232, + 0.241, + 0.225 + ], + [ + 0.224, + 0.213, + 0.165 + ], + [ + 0.249, + 0.236, + 0.234 + ], + [ + 0.886, + 0.895, + 0.934 + 
], + [ + 8.905, + 8.909, + 9.119 + ], + [ + 2.729, + 2.7, + 2.805 + ], + [ + 0.599, + 0.679, + 0.657 + ], + [ + 1.12, + 1.13, + 1.114 + ], + [ + 6.566, + 6.552, + 6.493 + ], + [ + 4.601, + 4.529, + 4.554 + ], + [ + 4.544, + 4.497, + 4.527 + ], + [ + 1.338, + 1.347, + 1.313 + ], + [ + 0.298, + 0.295, + 0.279 + ], + [ + 0.163, + 0.188, + 0.158 + ], + [ + 0.21, + 0.197, + 0.14 + ], + [ + 0.395, + 0.434, + 0.453 + ], + [ + 0.175, + 0.175, + 0.145 + ], + [ + 0.159, + 0.126, + 0.119 + ], + [ + 0.188, + 0.14, + 0.117 + ] + ], + "comment": "Historical: ByteHouse cloud (bytehouse.cloud) result; the service is no longer reachable from outside the China region." } diff --git a/bytehouse/results/20220727/s.json b/bytehouse/results/20220727/s.json index 59d9438bc0..dd6a868f50 100644 --- a/bytehouse/results/20220727/s.json +++ b/bytehouse/results/20220727/s.json @@ -6,55 +6,231 @@ "proprietary": "yes", "hardware": "cpu", "tuned": "no", - - "tags": ["managed", "column-oriented", "ClickHouse derivative", "C++"], - + "tags": [ + "managed", + "column-oriented", + "ClickHouse derivative", + "C++", + "historical" + ], "load_time": 745, "data_size": 27190000000, - "result": [ -[0.465,0.116,0.103], -[0.126,0.120,0.178], -[0.190,0.153,0.150], -[0.241,0.125,0.139], -[1.069,1.034,1.106], -[1.562,1.206,1.221], -[0.152,0.122,0.117], -[0.130,0.117,0.132], -[1.599,1.514,1.542], -[1.341,1.417,1.509], -[0.273,0.302,0.371], -[0.323,0.317,0.346], -[1.350,1.463,1.394], -[6.526,6.562,6.603], -[1.657,1.617,1.658], -[1.301,1.328,1.417], -[4.200,4.116,4.145], -[4.083,3.939,4.009], -[7.324,7.220,7.239], -[0.124,0.102,0.120], -[1.358,0.815,0.825], -[0.950,0.890,0.994], -[2.508,1.891,1.924], -[9.532,6.816,6.810], -[0.288,0.257,0.323], -[0.287,0.252,0.301], -[0.352,0.289,0.325], -[0.888,0.853,0.902], -[11.996,11.335,10.928], -[3.223,3.274,3.039], -[1.076,1.121,1.066], -[1.584,1.634,1.636], -[11.265,11.250,11.265], -[6.284,6.419,6.503], -[6.301,6.381,6.450], -[1.645,1.660,1.591], -[0.290,0.268,0.259], -[0.209,0.200,0.271], -[0.168,0.156,0.173], -[0.459,0.450,0.456], -[0.151,0.149,0.137], -[0.130,0.127,0.131], -[0.159,0.147,0.111] -] + [ + 0.465, + 0.116, + 0.103 + ], + [ + 0.126, + 0.12, + 0.178 + ], + [ + 0.19, + 0.153, + 0.15 + ], + [ + 0.241, + 0.125, + 0.139 + ], + [ + 1.069, + 1.034, + 1.106 + ], + [ + 1.562, + 1.206, + 1.221 + ], + [ + 0.152, + 0.122, + 0.117 + ], + [ + 0.13, + 0.117, + 0.132 + ], + [ + 1.599, + 1.514, + 1.542 + ], + [ + 1.341, + 1.417, + 1.509 + ], + [ + 0.273, + 0.302, + 0.371 + ], + [ + 0.323, + 0.317, + 0.346 + ], + [ + 1.35, + 1.463, + 1.394 + ], + [ + 6.526, + 6.562, + 6.603 + ], + [ + 1.657, + 1.617, + 1.658 + ], + [ + 1.301, + 1.328, + 1.417 + ], + [ + 4.2, + 4.116, + 4.145 + ], + [ + 4.083, + 3.939, + 4.009 + ], + [ + 7.324, + 7.22, + 7.239 + ], + [ + 0.124, + 0.102, + 0.12 + ], + [ + 1.358, + 0.815, + 0.825 + ], + [ + 0.95, + 0.89, + 0.994 + ], + [ + 2.508, + 1.891, + 1.924 + ], + [ + 9.532, + 6.816, + 6.81 + ], + [ + 0.288, + 0.257, + 0.323 + ], + [ + 0.287, + 0.252, + 0.301 + ], + [ + 0.352, + 0.289, + 0.325 + ], + [ + 0.888, + 0.853, + 0.902 + ], + [ + 11.996, + 11.335, + 10.928 + ], + [ + 3.223, + 3.274, + 3.039 + ], + [ + 1.076, + 1.121, + 1.066 + ], + [ + 1.584, + 1.634, + 1.636 + ], + [ + 11.265, + 11.25, + 11.265 + ], + [ + 6.284, + 6.419, + 6.503 + ], + [ + 6.301, + 6.381, + 6.45 + ], + [ + 1.645, + 1.66, + 1.591 + ], + [ + 0.29, + 0.268, + 0.259 + ], + [ + 0.209, + 0.2, + 0.271 + ], + [ + 0.168, + 0.156, + 0.173 + ], + [ + 0.459, + 0.45, + 0.456 + ], + [ + 0.151, + 0.149, + 0.137 + ], + [ 
+ 0.13, + 0.127, + 0.131 + ], + [ + 0.159, + 0.147, + 0.111 + ] + ], + "comment": "Historical: ByteHouse cloud (bytehouse.cloud) result; the service is no longer reachable from outside the China region." } diff --git a/bytehouse/results/20220727/xs.json b/bytehouse/results/20220727/xs.json index d7aedadb90..2fc3736c89 100644 --- a/bytehouse/results/20220727/xs.json +++ b/bytehouse/results/20220727/xs.json @@ -6,55 +6,231 @@ "proprietary": "yes", "hardware": "cpu", "tuned": "no", - - "tags": ["managed", "column-oriented", "ClickHouse derivative", "C++"], - + "tags": [ + "managed", + "column-oriented", + "ClickHouse derivative", + "C++", + "historical" + ], "load_time": 745, "data_size": 27190000000, - "result": [ -[0.334,0.140,0.090], -[0.122,0.142,0.121], -[0.263,0.251,0.159], -[0.501,0.200,0.219], -[2.543,2.468,2.505], -[2.105,1.866,1.868], -[0.157,0.138,0.110], -[0.158,0.144,0.128], -[3.764,3.757,3.743], -[5.197,5.259,5.139], -[0.533,0.525,0.503], -[0.625,0.555,0.562], -[2.032,2.090,2.048], -[9.476,9.242,9.257], -[2.594,2.517,2.540], -[2.787,2.553,2.685], -[7.066,7.059,6.987], -[7.108,7.044,7.038], -[null,null,null], -[null,0.209,0.104], -[2.984,1.572,1.647], -[1.726,1.633,1.798], -[5.146,3.894,3.656], -[18.920,13.498,13.139], -[0.510,0.499,0.483], -[0.365,0.425,0.369], -[0.534,0.504,0.524], -[1.537,1.582,1.604], -[23.250,22.080,21.513], -[5.463,5.731,5.901], -[2.113,2.060,2.033], -[3.506,3.609,3.409], -[20.424,19.854,19.813], -[10.042,10.562,10.160], -[10.287,10.111,10.401], -[3.369,3.553,3.392], -[0.249,0.232,0.272], -[0.193,0.176,0.190], -[0.136,0.175,0.146], -[0.446,0.419,0.430], -[0.163,0.191,0.180], -[0.187,0.132,0.121], -[0.150,0.167,0.139] -] + [ + 0.334, + 0.14, + 0.09 + ], + [ + 0.122, + 0.142, + 0.121 + ], + [ + 0.263, + 0.251, + 0.159 + ], + [ + 0.501, + 0.2, + 0.219 + ], + [ + 2.543, + 2.468, + 2.505 + ], + [ + 2.105, + 1.866, + 1.868 + ], + [ + 0.157, + 0.138, + 0.11 + ], + [ + 0.158, + 0.144, + 0.128 + ], + [ + 3.764, + 3.757, + 3.743 + ], + [ + 5.197, + 5.259, + 5.139 + ], + [ + 0.533, + 0.525, + 0.503 + ], + [ + 0.625, + 0.555, + 0.562 + ], + [ + 2.032, + 2.09, + 2.048 + ], + [ + 9.476, + 9.242, + 9.257 + ], + [ + 2.594, + 2.517, + 2.54 + ], + [ + 2.787, + 2.553, + 2.685 + ], + [ + 7.066, + 7.059, + 6.987 + ], + [ + 7.108, + 7.044, + 7.038 + ], + [ + null, + null, + null + ], + [ + null, + 0.209, + 0.104 + ], + [ + 2.984, + 1.572, + 1.647 + ], + [ + 1.726, + 1.633, + 1.798 + ], + [ + 5.146, + 3.894, + 3.656 + ], + [ + 18.92, + 13.498, + 13.139 + ], + [ + 0.51, + 0.499, + 0.483 + ], + [ + 0.365, + 0.425, + 0.369 + ], + [ + 0.534, + 0.504, + 0.524 + ], + [ + 1.537, + 1.582, + 1.604 + ], + [ + 23.25, + 22.08, + 21.513 + ], + [ + 5.463, + 5.731, + 5.901 + ], + [ + 2.113, + 2.06, + 2.033 + ], + [ + 3.506, + 3.609, + 3.409 + ], + [ + 20.424, + 19.854, + 19.813 + ], + [ + 10.042, + 10.562, + 10.16 + ], + [ + 10.287, + 10.111, + 10.401 + ], + [ + 3.369, + 3.553, + 3.392 + ], + [ + 0.249, + 0.232, + 0.272 + ], + [ + 0.193, + 0.176, + 0.19 + ], + [ + 0.136, + 0.175, + 0.146 + ], + [ + 0.446, + 0.419, + 0.43 + ], + [ + 0.163, + 0.191, + 0.18 + ], + [ + 0.187, + 0.132, + 0.121 + ], + [ + 0.15, + 0.167, + 0.139 + ] + ], + "comment": "Historical: ByteHouse cloud (bytehouse.cloud) result; the service is no longer reachable from outside the China region." 
} diff --git a/cedardb-parquet/benchmark.sh b/cedardb-parquet/benchmark.sh index 3416523328..b851876173 100755 --- a/cedardb-parquet/benchmark.sh +++ b/cedardb-parquet/benchmark.sh @@ -1,39 +1,5 @@ -#!/bin/bash -e - -# docker -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client - -# download dataset -../download-hits-parquet-single data -chmod -R 777 data -rm -rf db -mkdir db - -# get and configure CedarDB image -echo "Starting CedarDB..." -docker run --rm -p 5432:5432 -v ./data:/data -v ./db:/var/lib/cedardb/data -e CEDAR_PASSWORD=test --name cedardb cedardb/cedardb:latest > /dev/null 2>&1 & - -# wait for container to start -until pg_isready -h localhost --dbname postgres -U postgres > /dev/null 2>&1; do sleep 1; done - -# create view over the parquet file -PGPASSWORD=test psql -h localhost -U postgres -t < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -# data size = parquet file size; load time = 0 (no ingestion) -echo -n "Data size: " -stat -c%s data/hits.parquet -echo "Load time: 0" - -# run benchmark -echo "running benchmark..." -./run.sh 2>&1 | tee log.txt - -cat log.txt | \ - grep -oP 'Time: \d+\.\d+ ms|psql: error' | \ - sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | \ - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/cedardb-parquet/check b/cedardb-parquet/check new file mode 100755 index 0000000000..f161a08203 --- /dev/null +++ b/cedardb-parquet/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null diff --git a/cedardb-parquet/data-size b/cedardb-parquet/data-size new file mode 100755 index 0000000000..7a49e44749 --- /dev/null +++ b/cedardb-parquet/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +# No ingestion — reported size is the parquet file itself. +stat -c%s data/hits.parquet diff --git a/cedardb-parquet/install b/cedardb-parquet/install new file mode 100755 index 0000000000..2c6f09a048 --- /dev/null +++ b/cedardb-parquet/install @@ -0,0 +1,10 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull cedardb/cedardb:latest + +mkdir -p data db +chmod -R 777 data db diff --git a/cedardb-parquet/load b/cedardb-parquet/load new file mode 100755 index 0000000000..ee17527ef5 --- /dev/null +++ b/cedardb-parquet/load @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +# Stage parquet file under ./data so the docker container sees /data/hits.parquet. +mkdir -p data +mv hits.parquet data/ +chmod -R 777 data + +# create.sql defines a view over the parquet file — no ingestion needed. +PGPASSWORD=test psql -h localhost -U postgres -t < create.sql + +sync diff --git a/cedardb-parquet/query b/cedardb-parquet/query new file mode 100755 index 0000000000..3261388dc6 --- /dev/null +++ b/cedardb-parquet/query @@ -0,0 +1,26 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against CedarDB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# psql's `\timing` "Time: ms" output). +# Exit non-zero on error. 
+set -e + +query=$(cat) + +raw=$(PGPASSWORD=test psql -h localhost -U postgres -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/cedardb-parquet/results/20260509/c6a.4xlarge.json b/cedardb-parquet/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..f8d81aed25 --- /dev/null +++ b/cedardb-parquet/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "CedarDB (Parquet)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","PostgreSQL compatible","stateless"], + "load_time": 2, + "data_size": 14779976446, + "result": [ + [0.071, 0.032, 0.052], + [0.4, 0.071, 0.049], + [0.146, 0.091, 0.091], + [0.272, 0.077, 0.077], + [0.402, 0.204, 0.179], + [0.696, 0.373, 0.346], + [0.258, 0.227, 0.25], + [0.101, 0.074, 0.053], + [0.575, 0.242, 0.272], + [0.952, 0.392, 0.394], + [0.392, 0.13, 0.154], + [0.184, 0.153, 0.18], + [0.707, 0.372, 0.345], + [0.759, 0.504, 0.526], + [0.511, 0.396, 0.371], + [0.408, 0.228, 0.207], + [0.819, 0.548, 0.523], + [0.707, 0.523, 0.496], + [4.031, 1.32, 1.29], + [0.26, 0.07, 0.07], + [9.353, 0.727, 0.775], + [0.935, 0.85, 0.823], + [21.57, 1.567, 1.566], + [55.493, 5.568, 5.356], + [0.38, 0.353, 0.386], + [0.638, 0.27, 0.246], + [0.76, 0.38, 0.378], + [9.469, 0.739, 0.797], + [7.894, 3.753, 3.806], + [0.13, 0.099, 0.076], + [2.122, 0.429, 0.406], + [5.54, 0.501, 0.458], + [4.817, 1.418, 1.37], + [9.781, 1.389, 1.372], + [9.771, 1.423, 1.415], + [0.373, 0.195, 0.2], + [9.619, 0.708, 0.654], + [8.894, 0.808, 0.857], + [9.455, 0.656, 0.706], + [18.502, 1.181, 1.178], + [2.723, 0.268, 0.242], + [0.318, 0.265, 0.242], + [0.68, 0.19, 0.191] +] +} + diff --git a/cedardb-parquet/results/20260509/c6a.metal.json b/cedardb-parquet/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..b591f90e85 --- /dev/null +++ b/cedardb-parquet/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "CedarDB (Parquet)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","PostgreSQL compatible","stateless"], + "load_time": 5, + "data_size": 14779976446, + "result": [ + [0.074, 0.035, 0.052], + [0.117, 0.065, 0.052], + [0.164, 0.054, 0.065], + [0.275, 0.079, 0.057], + [0.399, 0.292, 0.154], + [0.751, 0.333, 0.207], + [0.216, 0.083, 0.066], + [0.196, 0.067, 0.057], + [0.608, 0.331, 0.155], + [1.048, 0.362, 0.21], + [0.431, 0.113, 0.086], + [0.488, 0.095, 0.103], + [0.729, 0.199, 0.186], + [2.173, 0.309, 0.321], + [0.796, 0.189, 0.184], + [0.394, 0.294, 0.157], + [1.96, 0.238, 0.26], + [1.958, 0.405, 0.264], + [3.986, 0.433, 0.488], + [0.278, 0.061, 0.063], + [9.471, 0.25, 0.237], + [11.005, 0.249, 0.325], + [21.764, 0.307, 0.403], + [55.53, 2.206, 2.199], + [2.379, 0.114, 0.137], + [0.683, 0.106, 0.135], + [2.373, 0.151, 0.162], + [9.598, 0.262, 0.263], + [8.226, 0.694, 0.683], + [0.171, 0.061, 0.074], + [2.118, 0.223, 0.194], + [5.733, 0.306, 0.25], + [4.59, 0.699, 0.687], + [9.694, 0.551, 0.525], + [9.727, 0.586, 
0.555], + [0.395, 0.187, 0.136], + [9.759, 0.259, 0.267], + [8.962, 0.229, 0.277], + [9.794, 0.251, 0.249], + [18.678, 0.388, 0.37], + [2.785, 0.114, 0.146], + [1.981, 0.112, 0.141], + [1.019, 0.152, 0.136] +] +} + diff --git a/cedardb-parquet/run.sh b/cedardb-parquet/run.sh deleted file mode 100755 index f76d6409ee..0000000000 --- a/cedardb-parquet/run.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - docker restart $(docker ps -a -q) - - # wait for the server quietly so retry-loop messages don't pollute log.txt - # (the awk filter in benchmark.sh treats any `psql: error` line as a failed query) - until pg_isready -h localhost --dbname postgres -U postgres > /dev/null 2>&1; do sleep 1; done - until PGPASSWORD=test psql -h localhost -U postgres -c "SELECT 'Ok';" > /dev/null 2>&1; do sleep 1; done - - echo "$query"; - for i in $(seq 1 $TRIES); do - PGPASSWORD=test psql -h localhost -U postgres -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done -done diff --git a/cedardb-parquet/start b/cedardb-parquet/start new file mode 100755 index 0000000000..ad1d714394 --- /dev/null +++ b/cedardb-parquet/start @@ -0,0 +1,19 @@ +#!/bin/bash +set -eu + +if PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +sudo docker stop cedardb >/dev/null 2>&1 || true +sudo docker rm cedardb >/dev/null 2>&1 || true + +sudo docker run -d --rm -p 5432:5432 \ + -v "$(pwd)/data:/data" \ + -v "$(pwd)/db:/var/lib/cedardb/data" \ + -e CEDAR_PASSWORD=test \ + --name cedardb cedardb/cedardb:latest >/dev/null + +until pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1; do + sleep 1 +done diff --git a/cedardb-parquet/stop b/cedardb-parquet/stop new file mode 100755 index 0000000000..5d6ade0a89 --- /dev/null +++ b/cedardb-parquet/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo docker stop cedardb >/dev/null 2>&1 || true diff --git a/cedardb/benchmark.sh b/cedardb/benchmark.sh index a6e1c25022..531bd65038 100755 --- a/cedardb/benchmark.sh +++ b/cedardb/benchmark.sh @@ -1,43 +1,5 @@ -#!/bin/bash -e - -# docker -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client gzip - -# download dataset -../download-hits-tsv -mkdir data -mv hits.tsv data -chmod -R 777 data -rm -rf db -mkdir db - -# get and configure CedarDB image -echo "Starting CedarDB..." -docker run --rm -p 5432:5432 -v ./data:/data -v ./db:/var/lib/cedardb/data -e CEDAR_PASSWORD=test --name cedardb cedardb/cedardb:latest > /dev/null 2>&1 & - -# wait for container to start -until pg_isready -h localhost --dbname postgres -U postgres > /dev/null 2>&1; do sleep 1; done - -# create table and ingest data -PGPASSWORD=test psql -h localhost -U postgres -t < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi -echo "Inserting data..." -echo -n "Load time: " -PGPASSWORD=test command time -f '%e' psql -h localhost -U postgres -q -t -c "COPY hits FROM '/data/hits.tsv';" - -# get ingested data size -echo -n "Data size: " -PGPASSWORD=test psql -h localhost -U postgres -q -t -c "SELECT pg_total_relation_size('hits');" - -# run benchmark -echo "running benchmark..." 
-./run.sh 2>&1 | tee log.txt - -cat log.txt | \ - grep -oP 'Time: \d+\.\d+ ms|psql: error' | \ - sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | \ - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/cedardb/check b/cedardb/check new file mode 100755 index 0000000000..f161a08203 --- /dev/null +++ b/cedardb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null diff --git a/cedardb/data-size b/cedardb/data-size new file mode 100755 index 0000000000..da53638e42 --- /dev/null +++ b/cedardb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +PGPASSWORD=test psql -h localhost -U postgres -q -t -A -c "SELECT pg_total_relation_size('hits');" diff --git a/cedardb/install b/cedardb/install new file mode 100755 index 0000000000..0043c4c84e --- /dev/null +++ b/cedardb/install @@ -0,0 +1,10 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client gzip + +sudo docker pull cedardb/cedardb:latest + +mkdir -p data db +chmod -R 777 data db diff --git a/cedardb/load b/cedardb/load new file mode 100755 index 0000000000..eeb6fd3a77 --- /dev/null +++ b/cedardb/load @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +mkdir -p data +mv hits.tsv data/ +chmod -R 777 data + +PGPASSWORD=test psql -h localhost -U postgres -t < create.sql + +PGPASSWORD=test psql -h localhost -U postgres -q -t -c "COPY hits FROM '/data/hits.tsv';" + +rm -f data/hits.tsv +sync diff --git a/cedardb/query b/cedardb/query new file mode 100755 index 0000000000..3261388dc6 --- /dev/null +++ b/cedardb/query @@ -0,0 +1,26 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against CedarDB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# psql's `\timing` "Time: ms" output). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(PGPASSWORD=test psql -h localhost -U postgres -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? 
+ +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/cedardb/results/20260509/c6a.4xlarge.json b/cedardb/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..0f8cf5dbd3 --- /dev/null +++ b/cedardb/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "CedarDB", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","PostgreSQL compatible"], + "load_time": 711, + "data_size": 27846842128, + "result": [ + [0.084, 0.011, 0.011], + [0.09, 0.004, 0.005], + [0.94, 0.027, 0.027], + [1.335, 0.026, 0.026], + [1.434, 0.14, 0.131], + [1.69, 0.262, 0.264], + [0.078, 0.022, 0.022], + [0.098, 0.005, 0.005], + [2.245, 0.198, 0.193], + [3.802, 0.259, 0.255], + [2.388, 0.032, 0.031], + [2.76, 0.034, 0.035], + [1.697, 0.217, 0.216], + [3.89, 0.367, 0.356], + [2.087, 0.232, 0.234], + [1.45, 0.162, 0.156], + [3.821, 0.539, 0.452], + [3.817, 0.443, 0.415], + [14.081, 11.88, 11.034], + [0.901, 0.002, 0.001], + [10.963, 0.633, 0.632], + [13.245, 0.218, 0.222], + [21.126, 0.387, 0.379], + [15.539, 0.067, 0.07], + [4.687, 0.006, 0.005], + [1.369, 0.01, 0.01], + [4.677, 0.006, 0.006], + [11.934, 0.673, 0.671], + [8.759, 3.576, 3.571], + [0.304, 0.026, 0.026], + [4.681, 0.095, 0.094], + [7.621, 0.142, 0.139], + [20.039, 14.766, 14.903], + [11.416, 1.541, 1.408], + [11.421, 1.537, 1.387], + [0.747, 0.147, 0.146], + [0.421, 0.061, 0.062], + [0.448, 0.058, 0.057], + [0.049, 0.004, 0.004], + [0.617, 0.092, 0.09], + [0.037, 0.004, 0.004], + [0.03, 0.003, 0.003], + [0.438, 0.035, 0.037] +] +} + diff --git a/cedardb/run.sh b/cedardb/run.sh deleted file mode 100755 index f76d6409ee..0000000000 --- a/cedardb/run.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - docker restart $(docker ps -a -q) - - # wait for the server quietly so retry-loop messages don't pollute log.txt - # (the awk filter in benchmark.sh treats any `psql: error` line as a failed query) - until pg_isready -h localhost --dbname postgres -U postgres > /dev/null 2>&1; do sleep 1; done - until PGPASSWORD=test psql -h localhost -U postgres -c "SELECT 'Ok';" > /dev/null 2>&1; do sleep 1; done - - echo "$query"; - for i in $(seq 1 $TRIES); do - PGPASSWORD=test psql -h localhost -U postgres -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done -done diff --git a/cedardb/start b/cedardb/start new file mode 100755 index 0000000000..0f4c8b56f6 --- /dev/null +++ b/cedardb/start @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +if PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +# `docker run --rm` cleans up container on exit; we run detached. 
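+# Stop/remove any leftover container of the same name first; otherwise the
+# `docker run --name cedardb` below would fail with a name conflict.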
+sudo docker stop cedardb >/dev/null 2>&1 || true +sudo docker rm cedardb >/dev/null 2>&1 || true + +sudo docker run -d --rm -p 5432:5432 \ + -v "$(pwd)/data:/data" \ + -v "$(pwd)/db:/var/lib/cedardb/data" \ + -e CEDAR_PASSWORD=test \ + --name cedardb cedardb/cedardb:latest >/dev/null + +until pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1; do + sleep 1 +done diff --git a/cedardb/stop b/cedardb/stop new file mode 100755 index 0000000000..5d6ade0a89 --- /dev/null +++ b/cedardb/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo docker stop cedardb >/dev/null 2>&1 || true diff --git a/chdb-dataframe/benchmark.sh b/chdb-dataframe/benchmark.sh index 0bb86a8ea8..fc4bacc8f3 100755 --- a/chdb-dataframe/benchmark.sh +++ b/chdb-dataframe/benchmark.sh @@ -1,20 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install pandas pyarrow -pip install chdb - -# Download the data -../download-hits-parquet-single - -# Run the queries - -/usr/bin/time -f "Memory usage: %M KB" ./query.py 2>&1 | tee log.txt - -echo -n "Data size: " -grep -F "Memory usage" log.txt | grep -o -P '\d+ KB' | sed 's/KB/*1024/' | bc -l +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/chdb-dataframe/check b/chdb-dataframe/check new file mode 100755 index 0000000000..0c4b301a2d --- /dev/null +++ b/chdb-dataframe/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/chdb-dataframe/data-size b/chdb-dataframe/data-size new file mode 100755 index 0000000000..365ad4ecc8 --- /dev/null +++ b/chdb-dataframe/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/chdb-dataframe/install b/chdb-dataframe/install new file mode 100755 index 0000000000..d1c83816b3 --- /dev/null +++ b/chdb-dataframe/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet pandas pyarrow chdb fastapi uvicorn diff --git a/chdb-dataframe/load b/chdb-dataframe/load new file mode 100755 index 0000000000..ceba6becac --- /dev/null +++ b/chdb-dataframe/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Server reads hits.parquet from CWD into memory. +elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +rm -f hits.parquet +sync diff --git a/chdb-dataframe/query b/chdb-dataframe/query new file mode 100755 index 0000000000..a4fe4abfb0 --- /dev/null +++ b/chdb-dataframe/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running chdb server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. 
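+#
+# Hypothetical usage (assumes ./start has brought the server up and ./load has run):
+#   echo "SELECT COUNT(*) FROM Python(hits);" | ./query
+# prints the JSON body on stdout and a value such as "0.213" on stderr.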
+set -e + +query=$(cat) + +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/chdb-dataframe/query.py b/chdb-dataframe/query.py deleted file mode 100755 index f350c8ad58..0000000000 --- a/chdb-dataframe/query.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 - -import pandas as pd -import timeit -import datetime -import json -import subprocess -import chdb - -start = timeit.default_timer() -hits = pd.read_parquet("hits.parquet") -end = timeit.default_timer() -load_time = round(end - start, 3) -print(f"Load time: {load_time}") - -dataframe_size = hits.memory_usage().sum() - -# print("Dataframe(numpy) size:", dataframe_size, "bytes") - -# fix some types -hits["EventTime"] = pd.to_datetime(hits["EventTime"], unit="s") -hits["EventDate"] = pd.to_datetime(hits["EventDate"], unit="D") - -# fix all object columns to string -start = timeit.default_timer() -for col in hits.columns: - if hits[col].dtype == "O": - hits[col] = hits[col].astype(str) - -print("Dataframe(numpy) normalization time:", timeit.default_timer() - start) - -queries = [] -with open("queries.sql") as f: - queries = f.readlines() - -# conn = chdb.connect("./tmp?verbose&log-level=test") -conn = chdb.connect("./tmp") -for q in queries: - # Flush OS page cache before first run of each query - subprocess.run(['sync'], check=True) - subprocess.run(['sudo', 'tee', '/proc/sys/vm/drop_caches'], input=b'3', check=True, stdout=subprocess.DEVNULL) - - times = [] - for _ in range(3): - start = timeit.default_timer() - result = conn.query(q, "Null") - end = timeit.default_timer() - times.append(round(end - start, 3)) - print(times) diff --git a/chdb-dataframe/results/20240909/c6a.metal.json b/chdb-dataframe/results/20240909/c6a.metal.json index 7de75d568b..869268a2eb 100644 --- a/chdb-dataframe/results/20240909/c6a.metal.json +++ b/chdb-dataframe/results/20240909/c6a.metal.json @@ -12,7 +12,6 @@ "ClickHouse derivative", "embedded", "stateless", - "serverless", "Python", "dataframe", "in-memory", diff --git a/chdb-dataframe/results/20241212/c6a.metal.json b/chdb-dataframe/results/20241212/c6a.metal.json index abeb232b08..794d2131cb 100644 --- a/chdb-dataframe/results/20241212/c6a.metal.json +++ b/chdb-dataframe/results/20241212/c6a.metal.json @@ -12,7 +12,6 @@ "ClickHouse derivative", "embedded", "stateless", - "serverless", "Python", "dataframe", "in-memory", diff --git a/chdb-dataframe/results/20250907/c6a.metal.json b/chdb-dataframe/results/20250907/c6a.metal.json index dd3b8a23cb..6aac5bb17f 100644 --- a/chdb-dataframe/results/20250907/c6a.metal.json +++ b/chdb-dataframe/results/20250907/c6a.metal.json @@ -12,7 +12,6 @@ "ClickHouse derivative", "embedded", "stateless", - "serverless", "Python", "dataframe", "in-memory", diff --git a/chdb-dataframe/results/20250907/c7a.metal-48xl.json b/chdb-dataframe/results/20250907/c7a.metal-48xl.json index 5329eff2e6..731ebd0ad2 100644 --- a/chdb-dataframe/results/20250907/c7a.metal-48xl.json +++ b/chdb-dataframe/results/20250907/c7a.metal-48xl.json @@ -12,7 +12,6 @@ "ClickHouse derivative", "embedded", "stateless", - "serverless", "Python", "dataframe", "in-memory", diff --git a/chdb-dataframe/results/20250907/c8g.metal-48xl.json 
b/chdb-dataframe/results/20250907/c8g.metal-48xl.json index eb9bc2367f..f074fa407c 100644 --- a/chdb-dataframe/results/20250907/c8g.metal-48xl.json +++ b/chdb-dataframe/results/20250907/c8g.metal-48xl.json @@ -12,7 +12,6 @@ "ClickHouse derivative", "embedded", "stateless", - "serverless", "Python", "dataframe", "in-memory", diff --git a/chdb-dataframe/results/20251110/c6a.metal.json b/chdb-dataframe/results/20251110/c6a.metal.json index edfe844cbe..b721784855 100644 --- a/chdb-dataframe/results/20251110/c6a.metal.json +++ b/chdb-dataframe/results/20251110/c6a.metal.json @@ -12,7 +12,6 @@ "ClickHouse derivative", "embedded", "stateless", - "serverless", "Python", "dataframe", "in-memory", diff --git a/chdb-dataframe/results/20251110/c7i.metal-48xl.json b/chdb-dataframe/results/20251110/c7i.metal-48xl.json index f35315b58b..8fe7719e83 100644 --- a/chdb-dataframe/results/20251110/c7i.metal-48xl.json +++ b/chdb-dataframe/results/20251110/c7i.metal-48xl.json @@ -12,7 +12,6 @@ "ClickHouse derivative", "embedded", "stateless", - "serverless", "Python", "dataframe", "in-memory", diff --git a/chdb-dataframe/results/20251215/c6a.metal.json b/chdb-dataframe/results/20251215/c6a.metal.json index a29e858a2b..aeb0f580c9 100644 --- a/chdb-dataframe/results/20251215/c6a.metal.json +++ b/chdb-dataframe/results/20251215/c6a.metal.json @@ -12,7 +12,6 @@ "ClickHouse derivative", "embedded", "stateless", - "serverless", "Python", "dataframe", "in-memory", diff --git a/chdb-dataframe/results/20251215/c7a.metal-48xl.json b/chdb-dataframe/results/20251215/c7a.metal-48xl.json index 5e447c5675..ff5c5209b5 100644 --- a/chdb-dataframe/results/20251215/c7a.metal-48xl.json +++ b/chdb-dataframe/results/20251215/c7a.metal-48xl.json @@ -12,7 +12,6 @@ "ClickHouse derivative", "embedded", "stateless", - "serverless", "Python", "dataframe", "in-memory", diff --git a/chdb-dataframe/results/20251215/c8g.metal-48xl.json b/chdb-dataframe/results/20251215/c8g.metal-48xl.json index a087552249..acc0a72099 100644 --- a/chdb-dataframe/results/20251215/c8g.metal-48xl.json +++ b/chdb-dataframe/results/20251215/c8g.metal-48xl.json @@ -12,7 +12,6 @@ "ClickHouse derivative", "embedded", "stateless", - "serverless", "Python", "dataframe", "in-memory", diff --git a/chdb-dataframe/results/20260205/c6a.metal.json b/chdb-dataframe/results/20260205/c6a.metal.json index 187c6a7a89..5c7489a72c 100644 --- a/chdb-dataframe/results/20260205/c6a.metal.json +++ b/chdb-dataframe/results/20260205/c6a.metal.json @@ -12,7 +12,6 @@ "ClickHouse derivative", "embedded", "stateless", - "serverless", "Python", "dataframe", "in-memory", diff --git a/chdb-dataframe/results/20260205/c7a.metal-48xl.json b/chdb-dataframe/results/20260205/c7a.metal-48xl.json index ee40c75a4c..1fa07b9f0a 100644 --- a/chdb-dataframe/results/20260205/c7a.metal-48xl.json +++ b/chdb-dataframe/results/20260205/c7a.metal-48xl.json @@ -12,7 +12,6 @@ "ClickHouse derivative", "embedded", "stateless", - "serverless", "Python", "dataframe", "in-memory", diff --git a/chdb-dataframe/results/20260205/c7i.metal-48xl.json b/chdb-dataframe/results/20260205/c7i.metal-48xl.json index dd2eb34459..3f0faaf436 100644 --- a/chdb-dataframe/results/20260205/c7i.metal-48xl.json +++ b/chdb-dataframe/results/20260205/c7i.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++", "column-oriented", "ClickHouse derivative", "embedded", "stateless", "serverless", "Python", "dataframe", "in-memory", "lukewarm-cold-run", "historical"], + "tags": ["C++", "column-oriented", 
"ClickHouse derivative", "embedded", "stateless", "Python", "dataframe", "in-memory", "lukewarm-cold-run", "historical"], "load_time": 84, "data_size": 153826500608, "result": [ diff --git a/chdb-dataframe/results/20260205/c8g.metal-48xl.json b/chdb-dataframe/results/20260205/c8g.metal-48xl.json index d53b4f4d22..af448669d4 100644 --- a/chdb-dataframe/results/20260205/c8g.metal-48xl.json +++ b/chdb-dataframe/results/20260205/c8g.metal-48xl.json @@ -12,7 +12,6 @@ "ClickHouse derivative", "embedded", "stateless", - "serverless", "Python", "dataframe", "in-memory", diff --git a/chdb-dataframe/results/20260221/c6a.metal.json b/chdb-dataframe/results/20260221/c6a.metal.json index 9e3d84772f..742cfd52d9 100644 --- a/chdb-dataframe/results/20260221/c6a.metal.json +++ b/chdb-dataframe/results/20260221/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless","Python","dataframe","in-memory","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","Python","dataframe","in-memory","lukewarm-cold-run"], "load_time": 103, "data_size": 154990247936, "result": [ diff --git a/chdb-dataframe/results/20260221/c7a.metal-48xl.json b/chdb-dataframe/results/20260221/c7a.metal-48xl.json index 1f98935660..4c50376853 100644 --- a/chdb-dataframe/results/20260221/c7a.metal-48xl.json +++ b/chdb-dataframe/results/20260221/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless","Python","dataframe","in-memory","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","Python","dataframe","in-memory","lukewarm-cold-run"], "load_time": 98, "data_size": 141248589824, "result": [ diff --git a/chdb-dataframe/results/20260221/c8g.metal-48xl.json b/chdb-dataframe/results/20260221/c8g.metal-48xl.json index 682e134e33..8caab427f3 100644 --- a/chdb-dataframe/results/20260221/c8g.metal-48xl.json +++ b/chdb-dataframe/results/20260221/c8g.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless","Python","dataframe","in-memory","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","Python","dataframe","in-memory","lukewarm-cold-run"], "load_time": 78, "data_size": 150402093056, "result": [ diff --git a/chdb-dataframe/results/20260309/c6a.metal.json b/chdb-dataframe/results/20260309/c6a.metal.json index ef181af540..f183180e07 100644 --- a/chdb-dataframe/results/20260309/c6a.metal.json +++ b/chdb-dataframe/results/20260309/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless","Python","dataframe","in-memory","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","Python","dataframe","in-memory","lukewarm-cold-run"], "load_time": 26, "data_size": 260567318528, "result": [ diff --git a/chdb-dataframe/results/20260309/c7a.metal-48xl.json b/chdb-dataframe/results/20260309/c7a.metal-48xl.json index c44bf6f3d2..1e5b5e926a 100644 --- a/chdb-dataframe/results/20260309/c7a.metal-48xl.json +++ b/chdb-dataframe/results/20260309/c7a.metal-48xl.json @@ -6,7 +6,7 
@@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless","Python","dataframe","in-memory","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","Python","dataframe","in-memory","lukewarm-cold-run"], "load_time": 24, "data_size": 249856888832, "result": [ diff --git a/chdb-dataframe/results/20260309/c8g.metal-48xl.json b/chdb-dataframe/results/20260309/c8g.metal-48xl.json index f26d48c679..78b1c78609 100644 --- a/chdb-dataframe/results/20260309/c8g.metal-48xl.json +++ b/chdb-dataframe/results/20260309/c8g.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless","Python","dataframe","in-memory","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","Python","dataframe","in-memory","lukewarm-cold-run"], "load_time": 15, "data_size": 262742663168, "result": [ diff --git a/chdb-dataframe/server.py b/chdb-dataframe/server.py new file mode 100644 index 0000000000..38365b7867 --- /dev/null +++ b/chdb-dataframe/server.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +"""FastAPI wrapper around chDB so it conforms to the ClickBench +install/start/check/stop/load/query interface. + +Routes: + GET /health -> 200 OK once the server is up + POST /load -> reads hits.parquet from the working directory, fixes + column types, holds the DataFrame in memory, and + returns {"elapsed": seconds} + POST /query -> body: SQL text. Looks it up in QUERIES, runs it via + chdb against the loaded DataFrame, returns + {"elapsed": seconds, "index": i}. + GET /data-size -> bytes the DataFrame currently occupies (memory_usage) + +The query strings (43 of them, addressing Python(hits)) match the previous +chdb-dataframe/queries.sql, exposed over HTTP. +""" + +import os +import timeit + +import chdb +import pandas as pd +import uvicorn +from fastapi import FastAPI, HTTPException, Request + +app = FastAPI() +hits: pd.DataFrame | None = None # noqa: F841 — referenced by chdb's Python() table function +conn = None + + +def _make_runner(sql: str): + return lambda _df: conn.query(sql, "Null") + + +# 43 ClickBench queries — chdb addresses the in-process pandas DataFrame named +# `hits` via the Python() table function. SQL strings come straight from the +# prior chdb-dataframe/queries.sql.
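+# NOTE: POST /query matches the incoming SQL text verbatim (after strip()) against
+# these strings, so they are assumed to stay byte-for-byte in sync with the query
+# list the shared harness sends (queries.sql).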
+_SQL_LIST: list[str] = [ + "SELECT COUNT(*) FROM Python(hits);", + "SELECT COUNT(*) FROM Python(hits) WHERE AdvEngineID <> 0;", + "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM Python(hits);", + "SELECT AVG(UserID) FROM Python(hits);", + "SELECT COUNT(DISTINCT UserID) FROM Python(hits);", + "SELECT COUNT(DISTINCT SearchPhrase) FROM Python(hits);", + "SELECT MIN(EventDate), MAX(EventDate) FROM Python(hits);", + "SELECT AdvEngineID, COUNT(*) FROM Python(hits) WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", + "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM Python(hits) GROUP BY RegionID ORDER BY u DESC LIMIT 10;", + "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM Python(hits) GROUP BY RegionID ORDER BY c DESC LIMIT 10;", + "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(*) AS c FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", + "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT UserID, COUNT(*) FROM Python(hits) GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, SearchPhrase LIMIT 10;", + "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID FROM Python(hits) WHERE UserID = 435090932899640449;", + "SELECT COUNT(*) FROM Python(hits) WHERE URL LIKE '%google%';", + "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM Python(hits) WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM Python(hits) WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT * FROM Python(hits) WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", + "SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", + "SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM Python(hits) WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM Python(hits) WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 
4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM Python(hits);", + "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS c FROM Python(hits) GROUP BY URL ORDER BY c DESC LIMIT 10;", + "SELECT 1, URL, COUNT(*) AS c FROM Python(hits) GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", + "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM Python(hits) GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", + "SELECT Title, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND 
EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", + "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", + "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000", +] + +QUERIES: list[tuple[str, callable]] = [(sql, _make_runner(sql)) for sql in _SQL_LIST] +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +@app.get("/health") +def health(): + return {"ok": True} + + +@app.post("/load") +def load(): + global hits, conn + start = timeit.default_timer() + df = pd.read_parquet("hits.parquet") + df["EventTime"] = pd.to_datetime(df["EventTime"], unit="s") + df["EventDate"] = pd.to_datetime(df["EventDate"], unit="D") + for col in df.columns: + if df[col].dtype == "O": + df[col] = df[col].astype(str) + hits = df + # chdb addresses `hits` via Python(hits); the connection picks up the + # variable from the module globals at query time. 
+ conn = chdb.connect("./tmp") + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + + +@app.post("/query") +async def query(request: Request): + if hits is None: + raise HTTPException(status_code=409, detail="DataFrame not loaded; POST /load first") + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + sql = QUERIES[idx][0] + start = timeit.default_timer() + conn.query(sql, "Null") + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed, "index": idx} + + +@app.get("/data-size") +def data_size(): + if hits is None: + return {"bytes": 0} + return {"bytes": int(hits.memory_usage().sum())} + + +if __name__ == "__main__": + port = int(os.environ.get("BENCH_CHDB_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/chdb-dataframe/start b/chdb-dataframe/start new file mode 100755 index 0000000000..e3fab72731 --- /dev/null +++ b/chdb-dataframe/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! > server.pid diff --git a/chdb-dataframe/stop b/chdb-dataframe/stop new file mode 100755 index 0000000000..787b35abcc --- /dev/null +++ b/chdb-dataframe/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/chdb-dataframe/template.json b/chdb-dataframe/template.json index fb88daef2f..d1ee44e3cc 100644 --- a/chdb-dataframe/template.json +++ b/chdb-dataframe/template.json @@ -3,5 +3,5 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless","Python","dataframe","in-memory","lukewarm-cold-run"] + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","Python","dataframe","in-memory","lukewarm-cold-run"] } diff --git a/chdb-parquet-partitioned/benchmark.sh b/chdb-parquet-partitioned/benchmark.sh index db3290d0c8..3b63e772a6 100755 --- a/chdb-parquet-partitioned/benchmark.sh +++ b/chdb-parquet-partitioned/benchmark.sh @@ -1,23 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install psutil pyarrow -pip install chdb - -# Load the data -../download-hits-parquet-partitioned - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -echo "Load time: 0" -echo "Data size: $(du -bcs hits*.parquet | grep total)" - -cat log.txt | grep -P '^\d|Killed|Segmentation' | sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
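+# BENCH_DOWNLOAD_SCRIPT names the dataset fetcher in the repository root;
+# BENCH_RESTARTABLE=no presumably tells the common driver that there is no
+# server process to restart between cold runs (chdb is embedded).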
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/chdb-parquet-partitioned/check b/chdb-parquet-partitioned/check new file mode 100755 index 0000000000..cd67e7c07c --- /dev/null +++ b/chdb-parquet-partitioned/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c "import chdb; chdb.query('SELECT 1')" >/dev/null diff --git a/chdb-parquet-partitioned/data-size b/chdb-parquet-partitioned/data-size new file mode 100755 index 0000000000..2d6921ab6d --- /dev/null +++ b/chdb-parquet-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits_*.parquet | awk '/total$/ { print $1 }' diff --git a/chdb-parquet-partitioned/install b/chdb-parquet-partitioned/install new file mode 100755 index 0000000000..5232a3de51 --- /dev/null +++ b/chdb-parquet-partitioned/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate + +pip install --upgrade pip +pip install psutil pyarrow chdb diff --git a/chdb-parquet-partitioned/load b/chdb-parquet-partitioned/load new file mode 100755 index 0000000000..5a0c7a9c76 --- /dev/null +++ b/chdb-parquet-partitioned/load @@ -0,0 +1,5 @@ +#!/bin/bash +# chdb-parquet-partitioned queries the parquet files directly via file(). +# Nothing to load; the dataset is already in CWD as hits_*.parquet. +set -e +sync diff --git a/chdb-parquet-partitioned/query b/chdb-parquet-partitioned/query new file mode 100755 index 0000000000..e32521b589 --- /dev/null +++ b/chdb-parquet-partitioned/query @@ -0,0 +1,40 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via chdb against the partitioned +# parquet files in CWD. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +# Stage stdin into a temp file: `python3 - <<'PY'` already consumes stdin to +# read the program, so sys.stdin.read() inside the heredoc returns "". 
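+# Passing the temp file path as argv[1] also keeps the query intact regardless
+# of quoting, instead of interpolating it into the Python source.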
+query_file=$(mktemp) +trap 'rm -f "$query_file"' EXIT +cat > "$query_file" + +python3 - "$query_file" <<'PY' +import sys +import timeit +import chdb + +with open(sys.argv[1]) as f: + query = f.read() + +conn = chdb.connect() + +start = timeit.default_timer() +try: + res = conn.query(query, "CSV") + out = str(res) + end = timeit.default_timer() + if out: + sys.stdout.write(out) + if not out.endswith("\n"): + sys.stdout.write("\n") +finally: + conn.close() + +print(f"{end - start:.3f}", file=sys.stderr) +PY diff --git a/chdb-parquet-partitioned/query.py b/chdb-parquet-partitioned/query.py deleted file mode 100755 index 1f9c3a0484..0000000000 --- a/chdb-parquet-partitioned/query.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python3 - -import chdb -import timeit -import sys - -query = sys.stdin.read() -print(query) - -conn = chdb.connect() -for try_num in range(3): - start = timeit.default_timer() - conn.query(query, "Null") - end = timeit.default_timer() - print(round(end - start, 3)) - -conn.close() diff --git a/chdb-parquet-partitioned/results/20231203/c6a.4xlarge.json b/chdb-parquet-partitioned/results/20231203/c6a.4xlarge.json index d9a1443631..ff02613c84 100644 --- a/chdb-parquet-partitioned/results/20231203/c6a.4xlarge.json +++ b/chdb-parquet-partitioned/results/20231203/c6a.4xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737670832, diff --git a/chdb-parquet-partitioned/results/20231203/c6a.metal.json b/chdb-parquet-partitioned/results/20231203/c6a.metal.json index 1fe9b0e9a3..d79288ce02 100644 --- a/chdb-parquet-partitioned/results/20231203/c6a.metal.json +++ b/chdb-parquet-partitioned/results/20231203/c6a.metal.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737670832, diff --git a/chdb-parquet-partitioned/results/20241212/c6a.4xlarge.json b/chdb-parquet-partitioned/results/20241212/c6a.4xlarge.json index f8cb36ec01..ffa11ebf27 100644 --- a/chdb-parquet-partitioned/results/20241212/c6a.4xlarge.json +++ b/chdb-parquet-partitioned/results/20241212/c6a.4xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737670832, diff --git a/chdb-parquet-partitioned/results/20241212/c6a.metal.json b/chdb-parquet-partitioned/results/20241212/c6a.metal.json index b0702c274c..e82642a42c 100644 --- a/chdb-parquet-partitioned/results/20241212/c6a.metal.json +++ b/chdb-parquet-partitioned/results/20241212/c6a.metal.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737670832, diff --git a/chdb-parquet-partitioned/results/20250710/c6a.2xlarge.json b/chdb-parquet-partitioned/results/20250710/c6a.2xlarge.json index 45f64c3c59..a605beb90f 100644 --- a/chdb-parquet-partitioned/results/20250710/c6a.2xlarge.json +++ b/chdb-parquet-partitioned/results/20250710/c6a.2xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737666736, diff --git a/chdb-parquet-partitioned/results/20250710/c6a.4xlarge.json b/chdb-parquet-partitioned/results/20250710/c6a.4xlarge.json index 12487c5b88..5d84c9825c 100644 --- a/chdb-parquet-partitioned/results/20250710/c6a.4xlarge.json +++ 
b/chdb-parquet-partitioned/results/20250710/c6a.4xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737666736, diff --git a/chdb-parquet-partitioned/results/20250711/c6a.large.json b/chdb-parquet-partitioned/results/20250711/c6a.large.json index f822c52d89..86cd53b1d9 100644 --- a/chdb-parquet-partitioned/results/20250711/c6a.large.json +++ b/chdb-parquet-partitioned/results/20250711/c6a.large.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737666736, diff --git a/chdb-parquet-partitioned/results/20250711/c6a.xlarge.json b/chdb-parquet-partitioned/results/20250711/c6a.xlarge.json index cdb7a6e0d7..8e69d77f35 100644 --- a/chdb-parquet-partitioned/results/20250711/c6a.xlarge.json +++ b/chdb-parquet-partitioned/results/20250711/c6a.xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737666736, diff --git a/chdb-parquet-partitioned/results/20250712/c8g.4xlarge.json b/chdb-parquet-partitioned/results/20250712/c8g.4xlarge.json index 66ff5270b9..d3f7bbaaa3 100644 --- a/chdb-parquet-partitioned/results/20250712/c8g.4xlarge.json +++ b/chdb-parquet-partitioned/results/20250712/c8g.4xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737666736, diff --git a/chdb-parquet-partitioned/results/20250712/t3a.small.json b/chdb-parquet-partitioned/results/20250712/t3a.small.json index f30ff37790..98610af8de 100644 --- a/chdb-parquet-partitioned/results/20250712/t3a.small.json +++ b/chdb-parquet-partitioned/results/20250712/t3a.small.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737666736, diff --git a/chdb-parquet-partitioned/results/20250830/c7a.metal-48xl.json b/chdb-parquet-partitioned/results/20250830/c7a.metal-48xl.json index 4d2ce17f8c..f26f352e64 100644 --- a/chdb-parquet-partitioned/results/20250830/c7a.metal-48xl.json +++ b/chdb-parquet-partitioned/results/20250830/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20250831/c6a.2xlarge.json b/chdb-parquet-partitioned/results/20250831/c6a.2xlarge.json index 6160169f85..499d3a3fa7 100644 --- a/chdb-parquet-partitioned/results/20250831/c6a.2xlarge.json +++ b/chdb-parquet-partitioned/results/20250831/c6a.2xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737666736, diff --git a/chdb-parquet-partitioned/results/20250831/c6a.4xlarge.json b/chdb-parquet-partitioned/results/20250831/c6a.4xlarge.json index 38692de09f..de8fa2d7ed 100644 --- a/chdb-parquet-partitioned/results/20250831/c6a.4xlarge.json +++ b/chdb-parquet-partitioned/results/20250831/c6a.4xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], 
"load_time": 0, "data_size": 14737666736, diff --git a/chdb-parquet-partitioned/results/20250831/c6a.large.json b/chdb-parquet-partitioned/results/20250831/c6a.large.json index 8173ba192a..2da157da03 100644 --- a/chdb-parquet-partitioned/results/20250831/c6a.large.json +++ b/chdb-parquet-partitioned/results/20250831/c6a.large.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737666736, diff --git a/chdb-parquet-partitioned/results/20250831/c6a.metal.json b/chdb-parquet-partitioned/results/20250831/c6a.metal.json index ff5f9c467b..953c9cc401 100644 --- a/chdb-parquet-partitioned/results/20250831/c6a.metal.json +++ b/chdb-parquet-partitioned/results/20250831/c6a.metal.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737666736, diff --git a/chdb-parquet-partitioned/results/20250831/c6a.xlarge.json b/chdb-parquet-partitioned/results/20250831/c6a.xlarge.json index 15d29bfaf8..b8c895b133 100644 --- a/chdb-parquet-partitioned/results/20250831/c6a.xlarge.json +++ b/chdb-parquet-partitioned/results/20250831/c6a.xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737666736, diff --git a/chdb-parquet-partitioned/results/20250831/t3a.small.json b/chdb-parquet-partitioned/results/20250831/t3a.small.json index 329e01c047..e8fa498869 100644 --- a/chdb-parquet-partitioned/results/20250831/t3a.small.json +++ b/chdb-parquet-partitioned/results/20250831/t3a.small.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20251110/c6a.2xlarge.json b/chdb-parquet-partitioned/results/20251110/c6a.2xlarge.json index bca7393acd..b5b3d19440 100644 --- a/chdb-parquet-partitioned/results/20251110/c6a.2xlarge.json +++ b/chdb-parquet-partitioned/results/20251110/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20251110/c6a.4xlarge.json b/chdb-parquet-partitioned/results/20251110/c6a.4xlarge.json index fcdc260da4..278ef4460f 100644 --- a/chdb-parquet-partitioned/results/20251110/c6a.4xlarge.json +++ b/chdb-parquet-partitioned/results/20251110/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20251110/c6a.large.json b/chdb-parquet-partitioned/results/20251110/c6a.large.json index c266850cce..ea24bbba09 100644 --- a/chdb-parquet-partitioned/results/20251110/c6a.large.json +++ b/chdb-parquet-partitioned/results/20251110/c6a.large.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": 
"cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20251110/c6a.metal.json b/chdb-parquet-partitioned/results/20251110/c6a.metal.json index f05dfe24c4..5413383997 100644 --- a/chdb-parquet-partitioned/results/20251110/c6a.metal.json +++ b/chdb-parquet-partitioned/results/20251110/c6a.metal.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14737666736, diff --git a/chdb-parquet-partitioned/results/20251110/c6a.xlarge.json b/chdb-parquet-partitioned/results/20251110/c6a.xlarge.json index 83486be761..6a2c394a0a 100644 --- a/chdb-parquet-partitioned/results/20251110/c6a.xlarge.json +++ b/chdb-parquet-partitioned/results/20251110/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20251110/c7i.metal-48xl.json b/chdb-parquet-partitioned/results/20251110/c7i.metal-48xl.json index c69c7bbcd3..aab0137eff 100644 --- a/chdb-parquet-partitioned/results/20251110/c7i.metal-48xl.json +++ b/chdb-parquet-partitioned/results/20251110/c7i.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++", "column-oriented", "ClickHouse derivative", "embedded", "stateless", "serverless", "historical"], + "tags": ["C++", "column-oriented", "ClickHouse derivative", "embedded", "stateless", "historical"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20251110/c8g.4xlarge.json b/chdb-parquet-partitioned/results/20251110/c8g.4xlarge.json index 4ce7ee6b34..754ee60310 100644 --- a/chdb-parquet-partitioned/results/20251110/c8g.4xlarge.json +++ b/chdb-parquet-partitioned/results/20251110/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260221/c6a.2xlarge.json b/chdb-parquet-partitioned/results/20260221/c6a.2xlarge.json index 6319d7e68d..fecde75336 100644 --- a/chdb-parquet-partitioned/results/20260221/c6a.2xlarge.json +++ b/chdb-parquet-partitioned/results/20260221/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260221/c6a.4xlarge.json b/chdb-parquet-partitioned/results/20260221/c6a.4xlarge.json index f213a29693..bb6859720c 100644 --- a/chdb-parquet-partitioned/results/20260221/c6a.4xlarge.json +++ b/chdb-parquet-partitioned/results/20260221/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - 
"tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260221/c6a.large.json b/chdb-parquet-partitioned/results/20260221/c6a.large.json index 2c2f41d790..d935780c16 100644 --- a/chdb-parquet-partitioned/results/20260221/c6a.large.json +++ b/chdb-parquet-partitioned/results/20260221/c6a.large.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260221/c6a.metal.json b/chdb-parquet-partitioned/results/20260221/c6a.metal.json index d44fcdaf3e..9818f80494 100644 --- a/chdb-parquet-partitioned/results/20260221/c6a.metal.json +++ b/chdb-parquet-partitioned/results/20260221/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260221/c6a.xlarge.json b/chdb-parquet-partitioned/results/20260221/c6a.xlarge.json index 051854949f..3657ba20e8 100644 --- a/chdb-parquet-partitioned/results/20260221/c6a.xlarge.json +++ b/chdb-parquet-partitioned/results/20260221/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260221/c7a.metal-48xl.json b/chdb-parquet-partitioned/results/20260221/c7a.metal-48xl.json index 1aa56789d7..396146d607 100644 --- a/chdb-parquet-partitioned/results/20260221/c7a.metal-48xl.json +++ b/chdb-parquet-partitioned/results/20260221/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260221/c8g.4xlarge.json b/chdb-parquet-partitioned/results/20260221/c8g.4xlarge.json index 292837add8..5c29c5a82b 100644 --- a/chdb-parquet-partitioned/results/20260221/c8g.4xlarge.json +++ b/chdb-parquet-partitioned/results/20260221/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260221/c8g.metal-48xl.json b/chdb-parquet-partitioned/results/20260221/c8g.metal-48xl.json index 5733d5c235..8a119569d5 100644 --- a/chdb-parquet-partitioned/results/20260221/c8g.metal-48xl.json +++ 
b/chdb-parquet-partitioned/results/20260221/c8g.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260309/c6a.2xlarge.json b/chdb-parquet-partitioned/results/20260309/c6a.2xlarge.json index e0f5c2d47b..51a85fc688 100644 --- a/chdb-parquet-partitioned/results/20260309/c6a.2xlarge.json +++ b/chdb-parquet-partitioned/results/20260309/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260309/c6a.4xlarge.json b/chdb-parquet-partitioned/results/20260309/c6a.4xlarge.json index 4e3c19e1f1..5c44ee8ddc 100644 --- a/chdb-parquet-partitioned/results/20260309/c6a.4xlarge.json +++ b/chdb-parquet-partitioned/results/20260309/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260309/c6a.large.json b/chdb-parquet-partitioned/results/20260309/c6a.large.json index 29215c846b..27679ef9da 100644 --- a/chdb-parquet-partitioned/results/20260309/c6a.large.json +++ b/chdb-parquet-partitioned/results/20260309/c6a.large.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260309/c6a.metal.json b/chdb-parquet-partitioned/results/20260309/c6a.metal.json index 23f87063c3..2ff2d68828 100644 --- a/chdb-parquet-partitioned/results/20260309/c6a.metal.json +++ b/chdb-parquet-partitioned/results/20260309/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260309/c6a.xlarge.json b/chdb-parquet-partitioned/results/20260309/c6a.xlarge.json index 74b3cbcd46..e13a713d80 100644 --- a/chdb-parquet-partitioned/results/20260309/c6a.xlarge.json +++ b/chdb-parquet-partitioned/results/20260309/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260309/c7a.metal-48xl.json b/chdb-parquet-partitioned/results/20260309/c7a.metal-48xl.json index 896a10e08b..1c7821813b 100644 --- 
a/chdb-parquet-partitioned/results/20260309/c7a.metal-48xl.json +++ b/chdb-parquet-partitioned/results/20260309/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260309/c8g.4xlarge.json b/chdb-parquet-partitioned/results/20260309/c8g.4xlarge.json index 69110d3ba3..aae739e30f 100644 --- a/chdb-parquet-partitioned/results/20260309/c8g.4xlarge.json +++ b/chdb-parquet-partitioned/results/20260309/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260309/c8g.metal-48xl.json b/chdb-parquet-partitioned/results/20260309/c8g.metal-48xl.json index 3f8c0844cf..c42d31a556 100644 --- a/chdb-parquet-partitioned/results/20260309/c8g.metal-48xl.json +++ b/chdb-parquet-partitioned/results/20260309/c8g.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260309/t3a.small.json b/chdb-parquet-partitioned/results/20260309/t3a.small.json index eaddc59186..446c553c94 100644 --- a/chdb-parquet-partitioned/results/20260309/t3a.small.json +++ b/chdb-parquet-partitioned/results/20260309/t3a.small.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/chdb-parquet-partitioned/results/20260509/c6a.4xlarge.json b/chdb-parquet-partitioned/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..43ad317cec --- /dev/null +++ b/chdb-parquet-partitioned/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "chDB (Parquet, partitioned)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], + "load_time": 21, + "data_size": 14737666736, + "result": [ + [0.068, 0.025, 0.025], + [0.166, 0.046, 0.046], + [0.246, 0.067, 0.071], + [0.644, 0.104, 0.103], + [0.777, 0.44, 0.45], + [1.156, 0.553, 0.544], + [0.149, 0.045, 0.044], + [0.194, 0.046, 0.049], + [1.121, 0.686, 0.685], + [1.789, 0.791, 0.788], + [0.981, 0.21, 0.21], + [0.945, 0.218, 0.214], + [1.313, 0.681, 0.687], + [2.494, 1.036, 1.007], + [1.179, 0.792, 0.807], + [0.7, 0.534, 0.524], + [2.938, 2.118, 1.94], + [2.55, 1.55, 1.317], + [5.675, 3.877, 3.911], + [0.283, 0.093, 0.094], + [9.562, 1.173, 1.171], + [11.28, 1.42, 1.417], + [21.656, 2.473, 2.496], + [53.621, 2.845, 2.827], + [2.705, 0.385, 0.383], + [0.853, 0.383, 0.378], + [2.696, 0.355, 0.362], + [9.799, 1.592, 1.711], + 
[9.614, 9.27, 9.365], + [2.519, 2.42, 2.419], + [2.659, 0.738, 0.731], + [6.325, 1.062, 1.072], + [7.638, 5.452, 5.503], + [10.686, 3.082, 3.111], + [10.723, 3.102, 3.127], + [0.52, 0.366, 0.37], + [0.334, 0.12, 0.131], + [0.24, 0.079, 0.09], + [0.279, 0.059, 0.06], + [0.461, 0.2, 0.204], + [0.188, 0.05, 0.045], + [0.178, 0.044, 0.044], + [0.16, 0.042, 0.04] +] +} + diff --git a/chdb-parquet-partitioned/results/20260509/c6a.metal.json b/chdb-parquet-partitioned/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..d1bdc99168 --- /dev/null +++ b/chdb-parquet-partitioned/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "chDB (Parquet, partitioned)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], + "load_time": 19, + "data_size": 14737666736, + "result": [ + [0.061, 0.013, 0.014], + [0.114, 0.06, 0.059], + [0.127, 0.084, 0.072], + [0.43, 0.077, 0.087], + [1.032, 0.529, 0.532], + [1.421, 0.651, 0.726], + [0.101, 0.045, 0.049], + [0.154, 0.083, 0.086], + [1.41, 0.87, 0.889], + [1.976, 0.925, 0.907], + [1.15, 0.682, 0.669], + [0.904, 0.445, 0.421], + [1.381, 0.652, 0.646], + [3.741, 1.641, 1.709], + [1.096, 0.311, 0.296], + [0.599, 0.217, 0.222], + [2.452, 0.406, 0.39], + [2.425, 0.37, 0.381], + [4.5, 0.879, 0.854], + [0.191, 0.068, 0.066], + [9.501, 0.324, 0.33], + [11.619, 0.723, 0.762], + [22.171, 1.038, 1.02], + [53.679, 5.459, 5.508], + [2.719, 0.189, 0.211], + [0.864, 0.178, 0.18], + [2.715, 0.183, 0.198], + [9.835, 0.593, 0.594], + [9.357, 2.355, 2.278], + [0.475, 0.685, 0.489], + [2.91, 0.644, 0.655], + [6.203, 0.376, 0.368], + [5.404, 1.329, 1.194], + [10.062, 0.926, 0.925], + [10.035, 0.919, 0.979], + [0.308, 0.227, 0.216], + [0.261, 0.136, 0.142], + [0.195, 0.097, 0.105], + [0.216, 0.078, 0.075], + [0.397, 0.197, 0.199], + [0.143, 0.044, 0.043], + [0.119, 0.039, 0.039], + [0.115, 0.033, 0.034] +] +} + diff --git a/chdb-parquet-partitioned/run.sh b/chdb-parquet-partitioned/run.sh deleted file mode 100755 index 02cb4f6d7c..0000000000 --- a/chdb-parquet-partitioned/run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/chdb-parquet-partitioned/start b/chdb-parquet-partitioned/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/chdb-parquet-partitioned/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/chdb-parquet-partitioned/stop b/chdb-parquet-partitioned/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/chdb-parquet-partitioned/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/chdb-parquet-partitioned/template.json b/chdb-parquet-partitioned/template.json index 88cf63402b..16512d6202 100644 --- a/chdb-parquet-partitioned/template.json +++ b/chdb-parquet-partitioned/template.json @@ -3,5 +3,5 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"] + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"] } diff --git a/chdb/benchmark.sh b/chdb/benchmark.sh index 3f888cbfab..b0b9f4775a 100755 --- a/chdb/benchmark.sh +++ b/chdb/benchmark.sh @@ -1,25 +1,5 @@ #!/bin/bash - -# Install -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 
-m venv myenv -source myenv/bin/activate -pip install psutil pyarrow -pip install chdb - -# Load the data -../download-hits-csv - -echo -n "Load time: " -command time -f '%e' ./load.py - -# Run the queries -./run.sh 2>&1 | tee log.txt - -# Process the log.txt -cat log.txt | grep -P '^\d|Killed|Segmentation' | sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo -n "Data size: " -du -bcs .clickbench | grep total +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/chdb/check b/chdb/check new file mode 100755 index 0000000000..cd67e7c07c --- /dev/null +++ b/chdb/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c "import chdb; chdb.query('SELECT 1')" >/dev/null diff --git a/chdb/data-size b/chdb/data-size new file mode 100755 index 0000000000..226c121b8f --- /dev/null +++ b/chdb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs .clickbench | awk '/total$/ { print $1 }' diff --git a/chdb/install b/chdb/install new file mode 100755 index 0000000000..6dcb72afdb --- /dev/null +++ b/chdb/install @@ -0,0 +1,16 @@ +#!/bin/bash +set -e + +# chdb is a Python package (ClickHouse embedded). Install it into a venv. +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate + +pip install --upgrade pip +pip install psutil pyarrow chdb diff --git a/chdb/load.py b/chdb/load similarity index 51% rename from chdb/load.py rename to chdb/load index 4b780538ef..8a516b3e30 100755 --- a/chdb/load.py +++ b/chdb/load @@ -1,5 +1,13 @@ -#!/usr/bin/env python3 +#!/bin/bash +set -e +# shellcheck disable=SC1091 +source myenv/bin/activate + +# Idempotent: blow away any prior data dir. +rm -rf .clickbench + +python3 - <<'PY' from chdb import dbapi con = dbapi.connect(path=".clickbench") @@ -10,3 +18,7 @@ cur.close() con.close() +PY + +rm -f hits.csv +sync diff --git a/chdb/query b/chdb/query new file mode 100755 index 0000000000..25d2dc57fc --- /dev/null +++ b/chdb/query @@ -0,0 +1,42 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via chdb against the .clickbench dir. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +# Stage stdin into a temp file: `python3 - <<'PY'` already consumes stdin to +# read the program, so sys.stdin.read() inside the heredoc returns "". +query_file=$(mktemp) +trap 'rm -f "$query_file"' EXIT +cat > "$query_file" + +python3 - "$query_file" <<'PY' +import sys +import timeit +from chdb import dbapi + +with open(sys.argv[1]) as f: + query = f.read() + +con = dbapi.connect(path=".clickbench") +cur = con.cursor() + +start = timeit.default_timer() +try: + cur._cursor.execute(query) + rows = cur.fetchall() if cur.description else [] + end = timeit.default_timer() +finally: + cur.close() + con.close() + +for row in rows: + print(row) + +# Last line of stderr: fractional seconds. 
+print(f"{end - start:.3f}", file=sys.stderr) +PY diff --git a/chdb/query.py b/chdb/query.py deleted file mode 100755 index cca7676a18..0000000000 --- a/chdb/query.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 - -import timeit -import sys -import os -import glob -from chdb import dbapi - -def main(): - query = sys.stdin.read() - print(query) - - con = dbapi.connect(path=".clickbench") - cur = con.cursor() - - for try_num in range(3): - start = timeit.default_timer() - cur._cursor.execute(query) - end = timeit.default_timer() - print(round(end - start, 3)) - - cur.close() - con.close() - -if __name__ == "__main__": - main() diff --git a/chdb/results/20230403/c6a.metal.json b/chdb/results/20230403/c6a.metal.json index fe0959b0c3..8f12c04b6c 100644 --- a/chdb/results/20230403/c6a.metal.json +++ b/chdb/results/20230403/c6a.metal.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14779976446, diff --git a/chdb/results/20230505/c6a.4xlarge.json b/chdb/results/20230505/c6a.4xlarge.json index 178082ddb8..0798ef4972 100644 --- a/chdb/results/20230505/c6a.4xlarge.json +++ b/chdb/results/20230505/c6a.4xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14779976446, diff --git a/chdb/results/20230505/c6a.metal.json b/chdb/results/20230505/c6a.metal.json index cf1b086984..b0a3a507b8 100644 --- a/chdb/results/20230505/c6a.metal.json +++ b/chdb/results/20230505/c6a.metal.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14779976446, diff --git a/chdb/results/20231203/c6a.4xlarge.json b/chdb/results/20231203/c6a.4xlarge.json index b57c9db3f1..3053cf76d3 100644 --- a/chdb/results/20231203/c6a.4xlarge.json +++ b/chdb/results/20231203/c6a.4xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 637, "data_size": 14737670832, diff --git a/chdb/results/20231203/c6a.metal.json b/chdb/results/20231203/c6a.metal.json index 529cf0de7d..3939f187fa 100644 --- a/chdb/results/20231203/c6a.metal.json +++ b/chdb/results/20231203/c6a.metal.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 584, "data_size": 14737670832, diff --git a/chdb/results/20240704/c6a.metal.json b/chdb/results/20240704/c6a.metal.json index c7cce8457f..da5fb4b1e4 100644 --- a/chdb/results/20240704/c6a.metal.json +++ b/chdb/results/20240704/c6a.metal.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 122, "data_size": 14737670832, diff --git a/chdb/results/20241212/c6a.4xlarge.json b/chdb/results/20241212/c6a.4xlarge.json index 7c10a2fddd..cd99cc01c2 100644 --- a/chdb/results/20241212/c6a.4xlarge.json +++ b/chdb/results/20241212/c6a.4xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 514, "data_size": 14737670832, diff --git a/chdb/results/20241212/c6a.metal.json b/chdb/results/20241212/c6a.metal.json index c6c16e874b..1d6577b6e9 100644 --- a/chdb/results/20241212/c6a.metal.json +++ b/chdb/results/20241212/c6a.metal.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse 
derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 133, "data_size": 14737670832, diff --git a/chdb/results/20250710/c6a.2xlarge.json b/chdb/results/20250710/c6a.2xlarge.json index 95ce06cd38..0e03f12326 100644 --- a/chdb/results/20250710/c6a.2xlarge.json +++ b/chdb/results/20250710/c6a.2xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 560, "data_size": 14464129336, diff --git a/chdb/results/20250710/c6a.4xlarge.json b/chdb/results/20250710/c6a.4xlarge.json index 2df442cc8b..d5908a8fd2 100644 --- a/chdb/results/20250710/c6a.4xlarge.json +++ b/chdb/results/20250710/c6a.4xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 516, "data_size": 14476359987, diff --git a/chdb/results/20250711/c6a.large.json b/chdb/results/20250711/c6a.large.json index 65866e8778..f5faaf9556 100644 --- a/chdb/results/20250711/c6a.large.json +++ b/chdb/results/20250711/c6a.large.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 840, "data_size": 14464183809, diff --git a/chdb/results/20250711/c6a.xlarge.json b/chdb/results/20250711/c6a.xlarge.json index 136370b78f..c55b127d29 100644 --- a/chdb/results/20250711/c6a.xlarge.json +++ b/chdb/results/20250711/c6a.xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 602, "data_size": 14464160379, diff --git a/chdb/results/20250712/c8g.4xlarge.json b/chdb/results/20250712/c8g.4xlarge.json index 6240df4779..74c95b0beb 100644 --- a/chdb/results/20250712/c8g.4xlarge.json +++ b/chdb/results/20250712/c8g.4xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 526, "data_size": 14466469883, diff --git a/chdb/results/20250712/t3a.small.json b/chdb/results/20250712/t3a.small.json index db91f03bb7..01f5672a3f 100644 --- a/chdb/results/20250712/t3a.small.json +++ b/chdb/results/20250712/t3a.small.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 1501, "data_size": 14464183809, diff --git a/chdb/results/20250830/c7a.metal-48xl.json b/chdb/results/20250830/c7a.metal-48xl.json index 0d7b97fe7e..bbb36c9622 100644 --- a/chdb/results/20250830/c7a.metal-48xl.json +++ b/chdb/results/20250830/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 121, "data_size": 14470455838, "result": [ diff --git a/chdb/results/20250831/c6a.2xlarge.json b/chdb/results/20250831/c6a.2xlarge.json index 75c0235536..71a06d0b1a 100644 --- a/chdb/results/20250831/c6a.2xlarge.json +++ b/chdb/results/20250831/c6a.2xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 562, "data_size": 14464129336, diff --git a/chdb/results/20250831/c6a.4xlarge.json b/chdb/results/20250831/c6a.4xlarge.json index f6ccd6859b..243db8d513 100644 --- a/chdb/results/20250831/c6a.4xlarge.json +++ b/chdb/results/20250831/c6a.4xlarge.json @@ -11,8 
+11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 519, "data_size": 14476296852, diff --git a/chdb/results/20250831/c6a.large.json b/chdb/results/20250831/c6a.large.json index 289ead57db..d4958d4b14 100644 --- a/chdb/results/20250831/c6a.large.json +++ b/chdb/results/20250831/c6a.large.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 850, "data_size": 14464160379, diff --git a/chdb/results/20250831/c6a.metal.json b/chdb/results/20250831/c6a.metal.json index d9fe492d36..b2b1e612f7 100644 --- a/chdb/results/20250831/c6a.metal.json +++ b/chdb/results/20250831/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 134, "data_size": 14469728876, "result": [ diff --git a/chdb/results/20250831/c6a.xlarge.json b/chdb/results/20250831/c6a.xlarge.json index 3809f0c1f0..bba8718a20 100644 --- a/chdb/results/20250831/c6a.xlarge.json +++ b/chdb/results/20250831/c6a.xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 625, "data_size": 14464160379, diff --git a/chdb/results/20250831/t3a.small.json b/chdb/results/20250831/t3a.small.json index e00b7262fb..a0dcfc56e8 100644 --- a/chdb/results/20250831/t3a.small.json +++ b/chdb/results/20250831/t3a.small.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 1490, "data_size": 14464160379, "result": [ diff --git a/chdb/results/20251110/c6a.2xlarge.json b/chdb/results/20251110/c6a.2xlarge.json index 6a3eaec215..15ee81b686 100644 --- a/chdb/results/20251110/c6a.2xlarge.json +++ b/chdb/results/20251110/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 561, "data_size": 14477033643, "result": [ diff --git a/chdb/results/20251110/c6a.4xlarge.json b/chdb/results/20251110/c6a.4xlarge.json index c0b4f0a81b..2ea7677f38 100644 --- a/chdb/results/20251110/c6a.4xlarge.json +++ b/chdb/results/20251110/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 516, "data_size": 14477072504, "result": [ diff --git a/chdb/results/20251110/c6a.large.json b/chdb/results/20251110/c6a.large.json index 8e25ff49d2..d22e22f319 100644 --- a/chdb/results/20251110/c6a.large.json +++ b/chdb/results/20251110/c6a.large.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 818, "data_size": 14467595822, "result": [ diff --git 
a/chdb/results/20251110/c6a.metal.json b/chdb/results/20251110/c6a.metal.json index e4b32b5cee..e37c9dd1ae 100644 --- a/chdb/results/20251110/c6a.metal.json +++ b/chdb/results/20251110/c6a.metal.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 134, "data_size": 14477033643, diff --git a/chdb/results/20251110/c6a.xlarge.json b/chdb/results/20251110/c6a.xlarge.json index e0f838a68a..5ff30e7216 100644 --- a/chdb/results/20251110/c6a.xlarge.json +++ b/chdb/results/20251110/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 608, "data_size": 14465782664, "result": [ diff --git a/chdb/results/20251110/c7i.metal-48xl.json b/chdb/results/20251110/c7i.metal-48xl.json index 7a64693847..a5731f88d2 100644 --- a/chdb/results/20251110/c7i.metal-48xl.json +++ b/chdb/results/20251110/c7i.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++", "column-oriented", "ClickHouse derivative", "embedded", "stateless", "serverless", "historical"], + "tags": ["C++", "column-oriented", "ClickHouse derivative", "embedded", "stateless", "historical"], "load_time": 449, "data_size": 14477033643, "result": [ diff --git a/chdb/results/20251110/c8g.4xlarge.json b/chdb/results/20251110/c8g.4xlarge.json index bf8fceddbe..a52061820c 100644 --- a/chdb/results/20251110/c8g.4xlarge.json +++ b/chdb/results/20251110/c8g.4xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 0, "data_size": 14470258174, diff --git a/chdb/results/20260221/c6a.2xlarge.json b/chdb/results/20260221/c6a.2xlarge.json index 189e118560..4bbda87457 100644 --- a/chdb/results/20260221/c6a.2xlarge.json +++ b/chdb/results/20260221/c6a.2xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 577, "data_size": 14477368661, diff --git a/chdb/results/20260221/c6a.4xlarge.json b/chdb/results/20260221/c6a.4xlarge.json index 7b5bb357ed..3355edac16 100644 --- a/chdb/results/20260221/c6a.4xlarge.json +++ b/chdb/results/20260221/c6a.4xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 529, "data_size": 14477065232, diff --git a/chdb/results/20260221/c6a.large.json b/chdb/results/20260221/c6a.large.json index 2ac56f3860..4b44096c05 100644 --- a/chdb/results/20260221/c6a.large.json +++ b/chdb/results/20260221/c6a.large.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 744, "data_size": 14467595822, diff --git a/chdb/results/20260221/c6a.metal.json b/chdb/results/20260221/c6a.metal.json index 24a6b81fb1..4a63845217 100644 --- a/chdb/results/20260221/c6a.metal.json +++ b/chdb/results/20260221/c6a.metal.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 150, "data_size": 14469507443, diff --git a/chdb/results/20260221/c6a.xlarge.json b/chdb/results/20260221/c6a.xlarge.json index 7e0db44bf8..55223e5b18 100644 --- a/chdb/results/20260221/c6a.xlarge.json +++ 
b/chdb/results/20260221/c6a.xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 611, "data_size": 14465799207, diff --git a/chdb/results/20260221/c7a.metal-48xl.json b/chdb/results/20260221/c7a.metal-48xl.json index ba9604f829..4f9331f11a 100644 --- a/chdb/results/20260221/c7a.metal-48xl.json +++ b/chdb/results/20260221/c7a.metal-48xl.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 138, "data_size": 14474217084, diff --git a/chdb/results/20260221/c8g.4xlarge.json b/chdb/results/20260221/c8g.4xlarge.json index 02d13d6fee..9d7e8794ab 100644 --- a/chdb/results/20260221/c8g.4xlarge.json +++ b/chdb/results/20260221/c8g.4xlarge.json @@ -11,8 +11,7 @@ "column-oriented", "ClickHouse derivative", "embedded", - "stateless", - "serverless" + "stateless" ], "load_time": 617, "data_size": 14477072504, diff --git a/chdb/results/20260221/c8g.metal-48xl.json b/chdb/results/20260221/c8g.metal-48xl.json index 4251a3123d..ec48f3ba47 100644 --- a/chdb/results/20260221/c8g.metal-48xl.json +++ b/chdb/results/20260221/c8g.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 122, "data_size": 14478415627, "result": [ diff --git a/chdb/results/20260221/t3a.small.json b/chdb/results/20260221/t3a.small.json index dbe10e7a60..2800366f62 100644 --- a/chdb/results/20260221/t3a.small.json +++ b/chdb/results/20260221/t3a.small.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 1310, "data_size": 14469197557, "result": [ diff --git a/chdb/results/20260309/c6a.2xlarge.json b/chdb/results/20260309/c6a.2xlarge.json index b8da23dac3..a75a410f78 100644 --- a/chdb/results/20260309/c6a.2xlarge.json +++ b/chdb/results/20260309/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 558, "data_size": 15259769402, "result": [ diff --git a/chdb/results/20260309/c6a.4xlarge.json b/chdb/results/20260309/c6a.4xlarge.json index fc48a82d22..d20283e914 100644 --- a/chdb/results/20260309/c6a.4xlarge.json +++ b/chdb/results/20260309/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 484, "data_size": 15262232797, "result": [ diff --git a/chdb/results/20260309/c6a.large.json b/chdb/results/20260309/c6a.large.json index eeaff0de8c..57f6068b44 100644 --- a/chdb/results/20260309/c6a.large.json +++ b/chdb/results/20260309/c6a.large.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse 
derivative","embedded","stateless"], "load_time": 779, "data_size": 15256492059, "result": [ diff --git a/chdb/results/20260309/c6a.metal.json b/chdb/results/20260309/c6a.metal.json index 6c22221169..6acb2a53b8 100644 --- a/chdb/results/20260309/c6a.metal.json +++ b/chdb/results/20260309/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 144, "data_size": 15262873227, "result": [ diff --git a/chdb/results/20260309/c6a.xlarge.json b/chdb/results/20260309/c6a.xlarge.json index 05614c206a..3306614988 100644 --- a/chdb/results/20260309/c6a.xlarge.json +++ b/chdb/results/20260309/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 597, "data_size": 15256819035, "result": [ diff --git a/chdb/results/20260309/c7a.metal-48xl.json b/chdb/results/20260309/c7a.metal-48xl.json index bda8e77304..2cdecbb68d 100644 --- a/chdb/results/20260309/c7a.metal-48xl.json +++ b/chdb/results/20260309/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 138, "data_size": 15268011342, "result": [ diff --git a/chdb/results/20260309/c8g.4xlarge.json b/chdb/results/20260309/c8g.4xlarge.json index 0ad8ef5793..64a27096f0 100644 --- a/chdb/results/20260309/c8g.4xlarge.json +++ b/chdb/results/20260309/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 508, "data_size": 15260292485, "result": [ diff --git a/chdb/results/20260309/c8g.metal-48xl.json b/chdb/results/20260309/c8g.metal-48xl.json index f48905a10a..d96153aabb 100644 --- a/chdb/results/20260309/c8g.metal-48xl.json +++ b/chdb/results/20260309/c8g.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 122, "data_size": 15264266838, "result": [ diff --git a/chdb/results/20260505/c6a.2xlarge.json b/chdb/results/20260505/c6a.2xlarge.json index 9429e7da0b..fcaeb82d00 100644 --- a/chdb/results/20260505/c6a.2xlarge.json +++ b/chdb/results/20260505/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 554, "data_size": 15259769402, "result": [ diff --git a/chdb/results/20260505/c6a.4xlarge.json b/chdb/results/20260505/c6a.4xlarge.json index ad3bc4869a..7c5d62a426 100644 --- a/chdb/results/20260505/c6a.4xlarge.json +++ b/chdb/results/20260505/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": 
"no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 486, "data_size": 15262238989, "result": [ diff --git a/chdb/results/20260505/c6a.large.json b/chdb/results/20260505/c6a.large.json index 3fdac9e6ff..6d42b2184b 100644 --- a/chdb/results/20260505/c6a.large.json +++ b/chdb/results/20260505/c6a.large.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 787, "data_size": 15256492059, "result": [ diff --git a/chdb/results/20260505/c6a.metal.json b/chdb/results/20260505/c6a.metal.json index a075c33776..d193ff64e6 100644 --- a/chdb/results/20260505/c6a.metal.json +++ b/chdb/results/20260505/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 145, "data_size": 15266642610, "result": [ diff --git a/chdb/results/20260505/c6a.xlarge.json b/chdb/results/20260505/c6a.xlarge.json index 453504762f..b0753b622e 100644 --- a/chdb/results/20260505/c6a.xlarge.json +++ b/chdb/results/20260505/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 602, "data_size": 15257125379, "result": [ diff --git a/chdb/results/20260505/c7a.metal-48xl.json b/chdb/results/20260505/c7a.metal-48xl.json index 7f823088ba..54ec3b564b 100644 --- a/chdb/results/20260505/c7a.metal-48xl.json +++ b/chdb/results/20260505/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 137, "data_size": 15265398942, "result": [ diff --git a/chdb/results/20260505/c8g.4xlarge.json b/chdb/results/20260505/c8g.4xlarge.json index 6daffc0c64..f3c93f79c6 100644 --- a/chdb/results/20260505/c8g.4xlarge.json +++ b/chdb/results/20260505/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 528, "data_size": 15260279858, "result": [ diff --git a/chdb/results/20260505/c8g.metal-48xl.json b/chdb/results/20260505/c8g.metal-48xl.json index dd727e041f..b8be8773ee 100644 --- a/chdb/results/20260505/c8g.metal-48xl.json +++ b/chdb/results/20260505/c8g.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 128, "data_size": 15261573895, "result": [ diff --git a/chdb/results/20260505/t3a.small.json b/chdb/results/20260505/t3a.small.json index 5968d0b175..377c804b09 100644 --- 
a/chdb/results/20260505/t3a.small.json +++ b/chdb/results/20260505/t3a.small.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"], + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], "load_time": 1520, "data_size": 15263718473, "result": [ diff --git a/chdb/results/20260509/c6a.4xlarge.json b/chdb/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..cb92d7f67c --- /dev/null +++ b/chdb/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "chDB", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], + "load_time": 504, + "data_size": 15260139166, + "result": [ + [0.012, 0.002, 0.002], + [0.051, 0.011, 0.011], + [0.107, 0.026, 0.026], + [0.159, 0.033, 0.033], + [0.464, 0.388, 0.391], + [0.912, 0.5, 0.513], + [0.045, 0.012, 0.014], + [0.064, 0.015, 0.015], + [0.666, 0.507, 0.519], + [0.857, 0.602, 0.607], + [0.302, 0.168, 0.179], + [0.282, 0.172, 0.175], + [1.047, 0.651, 0.638], + [1.991, 0.906, 0.906], + [1.239, 0.667, 0.619], + [0.597, 0.46, 0.462], + [2.57, 1.857, 1.86], + [1.873, 1.166, 1.163], + [5.017, 3.775, 3.803], + [0.092, 0.022, 0.022], + [10.751, 0.541, 0.538], + [12.35, 0.688, 0.712], + [14.592, 1.126, 1.151], + [11.772, 0.624, 0.628], + [2.271, 0.147, 0.147], + [0.721, 0.178, 0.18], + [2.278, 0.151, 0.145], + [0.238, 0.086, 0.085], + [9.701, 5.278, 5.305], + [0.095, 0.036, 0.038], + [0.539, 0.383, 0.367], + [3.876, 0.727, 0.735], + [6.426, 4.841, 4.872], + [12.048, 3.284, 3.325], + [12.132, 3.269, 3.192], + [0.391, 0.319, 0.325], + [0.122, 0.052, 0.054], + [0.099, 0.031, 0.031], + [0.104, 0.026, 0.026], + [0.17, 0.092, 0.092], + [0.094, 0.022, 0.022], + [0.086, 0.015, 0.015], + [0.079, 0.015, 0.014] +] +} + diff --git a/chdb/results/20260509/c6a.metal.json b/chdb/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..9f7ae0d3fb --- /dev/null +++ b/chdb/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "chDB", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"], + "load_time": 164, + "data_size": 15261599392, + "result": [ + [0.014, 0.004, 0.004], + [0.082, 0.045, 0.048], + [0.108, 0.049, 0.048], + [0.184, 0.028, 0.027], + [0.605, 0.543, 0.535], + [0.995, 0.529, 0.521], + [0.047, 0.021, 0.022], + [0.136, 0.074, 0.068], + [0.96, 0.819, 0.798], + [1.181, 0.857, 0.807], + [0.318, 0.245, 0.23], + [0.541, 0.293, 0.294], + [1.055, 0.63, 0.652], + [2.927, 1.612, 1.626], + [0.87, 0.31, 0.335], + [0.252, 0.188, 0.182], + [1.587, 0.379, 0.379], + [1.572, 0.343, 0.347], + [3.182, 0.876, 0.883], + [0.099, 0.02, 0.02], + [9.861, 0.277, 0.243], + [11.81, 1.187, 1.249], + [14.517, 1.453, 1.493], + [11.151, 0.598, 0.628], + [2.021, 0.274, 0.264], + [0.532, 0.101, 0.095], + [2.089, 0.262, 0.268], + [0.174, 0.05, 0.045], + [9.112, 1.629, 1.594], + [0.119, 0.064, 0.059], + [0.713, 0.48, 0.509], + [3.366, 0.261, 0.274], + [3.69, 1.112, 1.152], + [10.365, 0.956, 0.963], + [10.455, 0.926, 0.931], + [0.235, 0.197, 0.203], + [0.104, 0.063, 0.059], + [0.095, 0.033, 0.034], + [0.104, 0.026, 0.027], + [0.156, 0.088, 0.079], + [0.09, 0.026, 0.026], + [0.083, 0.019, 0.02], + 
[0.08, 0.018, 0.018] +] +} + diff --git a/chdb/run.sh b/chdb/run.sh deleted file mode 100755 index 02cb4f6d7c..0000000000 --- a/chdb/run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/chdb/start b/chdb/start new file mode 100755 index 0000000000..71836e5f8f --- /dev/null +++ b/chdb/start @@ -0,0 +1,3 @@ +#!/bin/bash +# chdb is an embedded library — no daemon to start. +exit 0 diff --git a/chdb/stop b/chdb/stop new file mode 100755 index 0000000000..1e5556318f --- /dev/null +++ b/chdb/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# chdb is an embedded library — no daemon to stop. +exit 0 diff --git a/chdb/template.json b/chdb/template.json index 106dc9db4c..e9b90a7f32 100644 --- a/chdb/template.json +++ b/chdb/template.json @@ -3,5 +3,5 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless","serverless"] + "tags": ["C++","column-oriented","ClickHouse derivative","embedded","stateless"] } diff --git a/chyt/benchmark.sh b/chyt/benchmark.sh index d4ee82c46b..21a189925f 100755 --- a/chyt/benchmark.sh +++ b/chyt/benchmark.sh @@ -1,28 +1,8 @@ #!/bin/bash - +# Thin shim — actual flow is in lib/benchmark-common.sh. +# CHYT executes against a remote YT cluster ($YT_PROXY); no local download. export YT_USE_HOSTS=0 -export CHYT_ALIAS=*ch_public - -echo "----------------" -# Create table -echo "Creating table" -command time -f '%e' yt clickhouse execute "$(cat create.sql)" --alias $CHYT_ALIAS --proxy $YT_PROXY -echo "----------------" - -echo "----------------" -# Fill table -echo -n "Load time: " -command time -f '%e' yt clickhouse execute "$(cat fill_data.sql)" --alias $CHYT_ALIAS --proxy $YT_PROXY -echo "----------------" - -echo "----------------" -# Sort table -echo -n "Load time: " -command time -f '%e' yt sort --src //home/hits --dst //home/hits --sort-by "CounterID" --sort-by "EventDate" --sort-by "UserID" --sort-by "EventTime" --sort-by "WatchID" --proxy $YT_PROXY -echo "----------------" - -echo "----------------" -# Run benchmark -echo "Starting benchmark" -./run.sh -echo "----------------" +export CHYT_ALIAS="${CHYT_ALIAS:-*ch_public}" +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/chyt/check b/chyt/check new file mode 100755 index 0000000000..a31144aa42 --- /dev/null +++ b/chyt/check @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +: "${YT_PROXY:?YT_PROXY is required}" +: "${CHYT_ALIAS:=*ch_public}" + +yt clickhouse execute "SELECT 1" --alias "$CHYT_ALIAS" --proxy "$YT_PROXY" >/dev/null diff --git a/chyt/data-size b/chyt/data-size new file mode 100755 index 0000000000..723231ca8e --- /dev/null +++ b/chyt/data-size @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +: "${YT_PROXY:?YT_PROXY is required}" + +# Report the byte size of the //home/hits table on the YT cluster. +yt get "//home/hits/@uncompressed_data_size" --proxy "$YT_PROXY" diff --git a/chyt/install b/chyt/install new file mode 100755 index 0000000000..d7512fad18 --- /dev/null +++ b/chyt/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +# CHYT runs on a remote YT cluster — install the YT Python client locally to +# drive it. Idempotent. 
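Aside (not part of the patch): once a clique is reachable, the CHYT hooks added here can be exercised by hand. The proxy address below is a placeholder; everything else mirrors what the scripts in this directory already do.

    # Illustration only. The YT_PROXY value is a placeholder for your YT deployment.
    export YT_PROXY=yt.example.com
    export CHYT_ALIAS='*ch_public'   # same default the scripts fall back to
    ./install                        # builds ./myenv with the ytsaurus client
    ./check                          # SELECT 1 against the clique; fails fast if unreachable
    ./load                           # create + fill + sort //home/hits on the cluster
    echo 'SELECT count(*) FROM `//home/hits`' | ./query   # result on stdout, seconds on stderr
    ./data-size                      # uncompressed size of //home/hits in bytes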
+if [ -d myenv ] && [ -x myenv/bin/yt ]; then + exit 0 +fi + +sudo apt-get install -y python3-pip python3-venv +python3 -m venv myenv +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --upgrade pip +pip install ytsaurus-client ytsaurus-yson diff --git a/chyt/load b/chyt/load new file mode 100755 index 0000000000..2e23011d93 --- /dev/null +++ b/chyt/load @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +: "${YT_PROXY:?YT_PROXY is required}" +: "${CHYT_ALIAS:=*ch_public}" + +# Create the table on the remote cluster. +yt clickhouse execute "$(cat create.sql)" --alias "$CHYT_ALIAS" --proxy "$YT_PROXY" + +# Fill the table from a public dataset URL (see fill_data.sql). +yt clickhouse execute "$(cat fill_data.sql)" --alias "$CHYT_ALIAS" --proxy "$YT_PROXY" + +# Sort the resulting table to give CHYT useful primary-key ordering. +yt sort --src //home/hits --dst //home/hits \ + --sort-by CounterID --sort-by EventDate --sort-by UserID \ + --sort-by EventTime --sort-by WatchID \ + --proxy "$YT_PROXY" + +sync diff --git a/chyt/query b/chyt/query new file mode 100755 index 0000000000..ff5f826464 --- /dev/null +++ b/chyt/query @@ -0,0 +1,24 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via `yt clickhouse execute` against a +# remote CHYT clique. Stdout: query result. Stderr: query runtime in fractional +# seconds on the last line (extracted from the JSON `statistics.elapsed`). +# Exit non-zero on error. +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +: "${YT_PROXY:?YT_PROXY is required}" +: "${CHYT_ALIAS:=*ch_public}" + +query=$(cat) + +# `yt clickhouse execute --format JSON` returns a JSON envelope with timing +# info. We need to capture both data (stdout) and elapsed (stderr). +out=$(yt clickhouse execute "$query" --alias "$CHYT_ALIAS" --proxy "$YT_PROXY" --format JSON) + +# Result body to stdout. +printf '%s\n' "$out" + +# elapsed seconds → stderr. +printf '%s\n' "$out" | jq -r '.statistics.elapsed' >&2 diff --git a/chyt/run.sh b/chyt/run.sh deleted file mode 100755 index 5de7977fbf..0000000000 --- a/chyt/run.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash - -trap ctrl_c INT - -CPU_HIGH=12 -CPU_LOW=10 -RAM_HIGH=51539607552 -RAM_LOW=42949672960 - -apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install ytsaurus-client --break-system-packages -pip install ytsaurus-yson --break-system-packages - -function ctrl_c() { - echo "Exiting benchmark. Ctrl+C" - exit 1 -} - -throbber() { - local pid=$1 - local sp="\|/-" - local i=0 - - while kill -0 $pid > /dev/null; do - printf "\rWaiting... %c" "${sp:$i:1}" - ((i = (i + 1) % 4)) - sleep 0.1 - done - printf "\rWaiting... 
Done!\n" -} - - -stop_public_clique() { - yt clickhouse ctl --address $YT_CONTROLLER --proxy $YT_PROXY --cluster-name $CLUSTER_NAME stop *ch_public -} - -create_clique() { - yt clickhouse ctl create --speclet-options "{"active" = %true;"enable_geodata" = %false;"family" = "chyt";"instance_count" = 1;"instance_cpu" = 12;"instance_total_memory" = 51539607552;"pool" = "research";"restart_on_speclet_change" = %true;"stage" = "production";}" --address $YT_CONTROLLER --cluster-name $CLUSTER_NAME clickbench -} - -insert_data() { - yt query --settings '{"clique"="clickbench"}' --format json --async chyt "$(cat fill_data.sql)" > fill_query_id -} - -data_filling_waiting() { - for _ in {1..300} - do - COUNT=$(yt clickhouse execute --alias *clickbench 'select count(*) as c from `//home/hits`') - if [[ "$COUNT" == 99997497 ]]; then - yt abort-query $(cat fill_query_id) - break - else - sleep 60 - fi - done -} - - -fill_data() { -command time -f '%e' yt clickhouse execute "$(cat create.sql)" --alias *clickbench --proxy $YT_PROXY -insert_data -data_filling_waiting & -throbber $! -yt sort --src //home/hits --dst //home/hits --sort-by "CounterID" --sort-by "EventDate" --sort-by "UserID" --sort-by "EventTime" --sort-by "WatchID" --proxy $YT_PROXY -} - - -check_ready() { - TOTAL_JOBS=$(yt list-operations --proxy $YT_PROXY --filter clique --filter clickbench --state running --format json | jq .operations[0].brief_progress.jobs.total) - RUNNING_JOBS=$(yt list-operations --proxy $YT_PROXY --filter clique --filter clickbench --state running --format json | jq .operations[0].brief_progress.jobs.running) - STATE=$(yt clickhouse ctl status --address $YT_CONTROLLER --cluster-name $CLUSTER_NAME clickbench | grep "state" | head -n 1 | sed "s/^[ \t]*//") - HEALTH=$(yt clickhouse ctl status --address $YT_CONTROLLER --cluster-name $CLUSTER_NAME clickbench | grep "health" | head -n 1 | sed "s/^[ \t]*//") - - if [[ "$STATE" == "\"state\" = \"active\";" && "$HEALTH" == "\"health\" = \"good\";" && "$TOTAL_JOBS" -eq "$RUNNING_JOBS" ]]; then - return 0 - else - return 1 - - fi -} - -change_clique_size() { - echo "Changing size. Instance count $1, vCPU $2, RAM $3 " - yt clickhouse ctl set-speclet --address $YT_CONTROLLER --cluster-name $CLUSTER_NAME --alias clickbench "{"active" = %true;"enable_geodata" = %false;"family" = "chyt";"instance_count" = $1;"instance_cpu" = $2;"instance_total_memory" = $3;"pool" = "research";"restart_on_speclet_change" = %true;"stage" = "production";}" -} - - -run() { - TRIES=3 - QUERY_NUM=1 - TOTAL_LINES=$(wc -l < queries.sql) - cat queries.sql | while read query; do - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(yt clickhouse execute "$query" --alias *clickbench@0 --proxy $YT_PROXY --format JSON | jq .statistics.elapsed 2>&1) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - done - if [[ $QUERY_NUM == $TOTAL_LINES ]] - then echo "]" - else - echo "]," - fi - QUERY_NUM=$((QUERY_NUM + 1)) - done - -} - -clique_waiting() { - for _ in {1..300} - do - if check_ready; then - echo "Clique is almost ready. Waiting 1 minute to stabilize" - sleep 60 - break - else - echo "Clique not ready. 
Waiting for 10 seconds" - sleep 10 - fi - done -} - -echo "-------------------------------------" -echo "Stopping public clique" -stop_public_clique - -create_clique - -change_clique_size 1 $CPU_HIGH $RAM_HIGH - -clique_waiting -echo "-------------------------------------" -echo -n "Load time: " -command time -f '%e' fill_data -echo "-------------------------------------" - -for i in "1 $CPU_HIGH $RAM_HIGH 48GB" "2 $CPU_HIGH $RAM_HIGH 96GB" "4 $CPU_HIGH $RAM_HIGH 192GB" "9 $CPU_LOW $RAM_LOW 360GB" -do - set -- $i - echo "Running test for $4 clique" - change_clique_size $1 $2 $3 - clique_waiting - run -done diff --git a/chyt/start b/chyt/start new file mode 100755 index 0000000000..d1ada551c4 --- /dev/null +++ b/chyt/start @@ -0,0 +1,3 @@ +#!/bin/bash +# CHYT runs on a remote YT cluster ($YT_PROXY); nothing to start locally. +exit 0 diff --git a/chyt/stop b/chyt/stop new file mode 100755 index 0000000000..43bcdcb09e --- /dev/null +++ b/chyt/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# CHYT runs on a remote YT cluster; nothing to stop locally. +exit 0 diff --git a/citus/benchmark.sh b/citus/benchmark.sh index b082a31df5..531bd65038 100755 --- a/citus/benchmark.sh +++ b/citus/benchmark.sh @@ -1,33 +1,5 @@ #!/bin/bash - -sudo apt-get update -y -sudo apt-get install -y docker.io -sudo apt-get install -y postgresql-client - -export PGPASSWORD=mypass -sudo docker run -d --name citus -p 5432:5432 -e POSTGRES_PASSWORD=$PGPASSWORD citusdata/citus:11.0 - -../download-hits-tsv - -echo "*:*:*:*:mypass" > .pgpass -chmod 400 .pgpass - -psql -U postgres -h localhost -d postgres -t -c 'CREATE DATABASE test' -psql -U postgres -h localhost -d postgres test -t < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi -echo -n "Load time: " -command time -f '%e' psql -U postgres -h localhost -d postgres test -q -t -c "\\copy hits FROM 'hits.tsv'" - -# COPY 99997497 -# Time: 1579203.482 ms (26:19.203) - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo docker exec -i citus du -bcs /var/lib/postgresql/data | grep total - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
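Context for the shim pattern used throughout this PR: each per-system benchmark.sh now only exports a couple of knobs (BENCH_DOWNLOAD_SCRIPT, BENCH_RESTARTABLE) and execs ../lib/benchmark-common.sh, which drives the per-system hooks (install, start, load, check, query, data-size, stop). That library is not part of this hunk; the sketch below is only an illustration of the flow implied by those hooks and knobs, not its actual contents.

    #!/bin/bash
    # Hypothetical outline of lib/benchmark-common.sh, illustrative only.
    set -eo pipefail

    TRIES=3

    ./install
    if [ -n "${BENCH_DOWNLOAD_SCRIPT:-}" ]; then
        "../${BENCH_DOWNLOAD_SCRIPT}"
    fi

    ./start
    echo -n "Load time: "
    command time -f '%e' ./load
    ./check

    # Each ./query prints the result on stdout and the elapsed seconds on the
    # last line of stderr; collect three runs per query into one bracketed row.
    while read -r query; do
        # For BENCH_RESTARTABLE=yes systems the real driver presumably also
        # syncs, drops the page cache, and restarts the server here so that
        # the first of the three runs is a cold one.
        echo -n "["
        for i in $(seq 1 "$TRIES"); do
            t=$(./query <<< "$query" 2>&1 >/dev/null | tail -n1) || t=null
            echo -n "$t"
            [ "$i" -lt "$TRIES" ] && echo -n ","
        done
        echo "],"
    done < queries.sql

    echo -n "Data size: "
    ./data-size
    ./stop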
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/citus/check b/citus/check new file mode 100755 index 0000000000..d8b098776a --- /dev/null +++ b/citus/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +export PGPASSWORD=${PGPASSWORD:-mypass} +psql -U postgres -h localhost -d postgres -t -c 'SELECT 1' >/dev/null diff --git a/citus/data-size b/citus/data-size new file mode 100755 index 0000000000..73535f867d --- /dev/null +++ b/citus/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-citus} +sudo docker exec -i "$CONTAINER_NAME" du -bcs /var/lib/postgresql/data | grep total | awk '{print $1}' diff --git a/citus/install b/citus/install new file mode 100755 index 0000000000..0caf5deac8 --- /dev/null +++ b/citus/install @@ -0,0 +1,26 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-citus} +CITUS_VERSION=${CITUS_VERSION:-11.0} +PGPASSWORD=${PGPASSWORD:-mypass} + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull "citusdata/citus:$CITUS_VERSION" + +# (Re)create container so install is idempotent. +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +sudo docker run -d \ + --name "$CONTAINER_NAME" \ + -p 5432:5432 \ + -e POSTGRES_PASSWORD="$PGPASSWORD" \ + "citusdata/citus:$CITUS_VERSION" + +# Persist the password for psql clients invoked from this directory. +echo "*:*:*:*:$PGPASSWORD" > .pgpass +chmod 600 .pgpass diff --git a/citus/load b/citus/load new file mode 100755 index 0000000000..e91fbb13c9 --- /dev/null +++ b/citus/load @@ -0,0 +1,18 @@ +#!/bin/bash +set -eu + +export PGPASSWORD=${PGPASSWORD:-mypass} + +psql -U postgres -h localhost -d postgres -t -c "DROP DATABASE IF EXISTS test" +psql -U postgres -h localhost -d postgres -t -c "CREATE DATABASE test" + +# create.sql for citus relies on the columnar access method, which the citus +# extension provides. Ensure it's enabled in the test DB. +psql -U postgres -h localhost -d test -t -c "CREATE EXTENSION IF NOT EXISTS citus" +psql -U postgres -h localhost -d test -v ON_ERROR_STOP=1 -t < create.sql + +psql -U postgres -h localhost -d test -v ON_ERROR_STOP=1 -t -c "\\copy hits FROM 'hits.tsv'" +psql -U postgres -h localhost -d test -v ON_ERROR_STOP=1 -t -c 'VACUUM ANALYZE hits' + +rm -f hits.tsv +sync diff --git a/citus/query b/citus/query new file mode 100755 index 0000000000..349a1d48f6 --- /dev/null +++ b/citus/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the `test` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +export PGPASSWORD=${PGPASSWORD:-mypass} +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | psql -U postgres -h localhost -d test -t 2>&1) +status=$? 
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/citus/results/20260509/c6a.4xlarge.json b/citus/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..5ad19e70ac --- /dev/null +++ b/citus/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Citus", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C","PostgreSQL compatible","column-oriented","lukewarm-cold-run"], + "load_time": 1546, + "data_size": 18982266741, + "result": [ + [5.302, 4.339, 4.341], + [5.815, 4.069, 3.929], + [10.749, 6.738, 6.658], + [10.834, 5.945, 6.133], + [34.32, 30.816, 31.208], + [105.386, 101.513, 101.689], + [8.326, 5.268, 5.164], + [5.891, 4.108, 3.913], + [63.51, 59.494, 59.288], + [70.089, 63.262, 63.032], + [13.266, 8.127, 8.113], + [14.648, 9.456, 9.44], + [16.988, 13.536, 13.534], + [72.851, 66.372, 66.397], + [18.291, 14.123, 14.13], + [39.243, 34.924, 34.288], + [52.469, 46.482, 46.575], + [28.742, 22.266, 22.254], + [85.914, 75.418, 75.068], + [7.645, 4.27, 4.245], + [32.043, 23.282, 23.335], + [35.424, 24.432, 24.327], + [50.119, 31.285, 31.229], + [167.547, 122.941, 122.807], + [15.884, 8.494, 8.487], + [10.862, 6.918, 6.919], + [15.91, 8.608, 8.552], + [45.354, 35.156, 35.232], + [452.928, 449.598, 460.734], + [56.739, 53.16, 51.636], + [25.317, 16.311, 16.323], + [34.066, 19.996, 20.024], + [99.002, 87.573, 87.632], + [88.574, 77.008, 75.216], + [92.17, 80.576, 80.469], + [43.302, 40.638, 40.289], + [28.96, 16.257, 16.438], + [26.366, 14.768, 14.727], + [27.767, 15.265, 15.361], + [41.567, 23.095, 23.196], + [22.071, 11.849, 11.756], + [20.022, 9.789, 9.755], + [15.1, 8.062, 8.074] +] +} + diff --git a/citus/results/20260510/c6a.metal.json b/citus/results/20260510/c6a.metal.json new file mode 100644 index 0000000000..d5044302ad --- /dev/null +++ b/citus/results/20260510/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Citus", + "date": "2026-05-10", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C","PostgreSQL compatible","column-oriented","lukewarm-cold-run"], + "load_time": 1523, + "data_size": 18982258549, + "result": [ + [5.025, 4.202, 4.488], + [5.448, 4.052, 3.788], + [11.292, 6.557, 6.397], + [8.869, 5.722, 5.722], + [32.645, 29.705, 29.676], + [100.702, 97.265, 97.456], + [7.027, 5.032, 5.269], + [5.391, 3.782, 3.753], + [60.255, 56.288, 56.177], + [66.05, 60.09, 60.174], + [12.309, 7.829, 7.765], + [13.64, 9.008, 8.991], + [15.891, 12.85, 12.883], + [69.79, 63.564, 63.889], + [17.082, 13.673, 13.447], + [36.217, 33.784, 33.531], + [50.137, 44.421, 45.609], + [27.153, 21.343, 21.265], + [81.615, 71.709, 71.475], + [6.905, 4.062, 4.04], + [29.784, 22.394, 22.487], + [33.279, 23.596, 23.292], + [47.182, 29.821, 30.136], + [156.746, 124.499, 123.257], + [14.838, 8.745, 8.141], + [10.082, 6.591, 6.57], + [14.851, 8.718, 8.173], + [42.83, 33.849, 33.645], + [428.676, 427.566, 431.146], + [51.999, 51.398, 49.277], + [23.616, 15.352, 15.399], + [31.846, 19.125, 19.17], + [93.943, 83.742, 83.554], + [78.926, 70.471, 70.602], + [82.577, 80.614, 74.516], 
+ [42.642, 38.862, 39.371], + [27.077, 15.77, 15.777], + [23.909, 13.218, 13.308], + [26.042, 14.507, 14.335], + [39.052, 22.298, 22.382], + [19.677, 10.861, 10.795], + [19.072, 9.12, 9.287], + [13.936, 7.712, 7.645] +] +} + diff --git a/citus/run.sh b/citus/run.sh deleted file mode 100755 index 0952225f61..0000000000 --- a/citus/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - psql -U postgres -h localhost -d postgres --no-password -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/citus/start b/citus/start new file mode 100755 index 0000000000..f72133d4dd --- /dev/null +++ b/citus/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-citus} + +if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" +fi diff --git a/citus/stop b/citus/stop new file mode 100755 index 0000000000..a1bdaef323 --- /dev/null +++ b/citus/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +CONTAINER_NAME=${CONTAINER_NAME:-citus} +sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true diff --git a/clickhouse-datalake-partitioned/benchmark.sh b/clickhouse-datalake-partitioned/benchmark.sh index c4fec4d808..33e6ce27ba 100755 --- a/clickhouse-datalake-partitioned/benchmark.sh +++ b/clickhouse-datalake-partitioned/benchmark.sh @@ -1,19 +1,6 @@ #!/bin/bash - -# Install - -curl https://clickhouse.com/ | sh - -# Configure - -RAM=$(awk '/MemTotal/ {print int($2 * 0.8 * 1024)}' /proc/meminfo) -> clickhouse-local.yaml echo " -page_cache_max_size: ${RAM} -" - -# Run the queries - -./run.sh - -echo "Load time: 0" -echo "Data size: 14737666736" +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Data is read directly from S3, no local download. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/clickhouse-datalake-partitioned/check b/clickhouse-datalake-partitioned/check new file mode 100755 index 0000000000..39c3b45706 --- /dev/null +++ b/clickhouse-datalake-partitioned/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./clickhouse local --path . --query "SELECT 1" >/dev/null diff --git a/clickhouse-datalake-partitioned/data-size b/clickhouse-datalake-partitioned/data-size new file mode 100755 index 0000000000..ec242ca744 --- /dev/null +++ b/clickhouse-datalake-partitioned/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Hits dataset stored in S3 — fixed size (100 partitioned parquet files). +echo 14737666736 diff --git a/clickhouse-datalake-partitioned/install b/clickhouse-datalake-partitioned/install new file mode 100755 index 0000000000..ee46804c16 --- /dev/null +++ b/clickhouse-datalake-partitioned/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +if [ ! -x ./clickhouse ]; then + curl https://clickhouse.com/ | sh +fi + +# Use a userspace page cache sized to ~80% of RAM for S3 object reads. +RAM=$(awk '/MemTotal/ {print int($2 * 0.8 * 1024)}' /proc/meminfo) +cat > clickhouse-local.yaml <> result.csv - i=$((i+1)) - done <<< "$(./clickhouse local --path . --time --format Null --use_page_cache_for_object_storage 1 --query "$query; $query; $query" 2>&1)" - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done -./clickhouse local --path . 
--query="DROP TABLE hits" diff --git a/clickhouse-datalake-partitioned/start b/clickhouse-datalake-partitioned/start new file mode 100755 index 0000000000..a726d93477 --- /dev/null +++ b/clickhouse-datalake-partitioned/start @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to start. +exit 0 diff --git a/clickhouse-datalake-partitioned/stop b/clickhouse-datalake-partitioned/stop new file mode 100755 index 0000000000..7661285688 --- /dev/null +++ b/clickhouse-datalake-partitioned/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to stop. +exit 0 diff --git a/clickhouse-datalake/benchmark.sh b/clickhouse-datalake/benchmark.sh index 8b4d0718d0..33e6ce27ba 100755 --- a/clickhouse-datalake/benchmark.sh +++ b/clickhouse-datalake/benchmark.sh @@ -1,19 +1,6 @@ #!/bin/bash - -# Install - -curl https://clickhouse.com/ | sh - -# Configure - -RAM=$(awk '/MemTotal/ {print int($2 * 0.8 * 1024)}' /proc/meminfo) -> clickhouse-local.yaml echo " -page_cache_max_size: ${RAM} -" - -# Run the queries - -./run.sh - -echo "Load time: 0" -echo "Data size: 14779976446" +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Data is read directly from S3, no local download. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/clickhouse-datalake/check b/clickhouse-datalake/check new file mode 100755 index 0000000000..39c3b45706 --- /dev/null +++ b/clickhouse-datalake/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./clickhouse local --path . --query "SELECT 1" >/dev/null diff --git a/clickhouse-datalake/data-size b/clickhouse-datalake/data-size new file mode 100755 index 0000000000..eeeeea5605 --- /dev/null +++ b/clickhouse-datalake/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Hits dataset stored in S3 — fixed size (single parquet). +echo 14779976446 diff --git a/clickhouse-datalake/install b/clickhouse-datalake/install new file mode 100755 index 0000000000..ee46804c16 --- /dev/null +++ b/clickhouse-datalake/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +if [ ! -x ./clickhouse ]; then + curl https://clickhouse.com/ | sh +fi + +# Use a userspace page cache sized to ~80% of RAM for S3 object reads. +RAM=$(awk '/MemTotal/ {print int($2 * 0.8 * 1024)}' /proc/meminfo) +cat > clickhouse-local.yaml <> result.csv - i=$((i+1)) - done <<< "$(./clickhouse local --path . --time --format Null --use_page_cache_for_object_storage 1 --query "$query; $query; $query" 2>&1)" - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done -./clickhouse local --path . --query="DROP TABLE hits" diff --git a/clickhouse-datalake/start b/clickhouse-datalake/start new file mode 100755 index 0000000000..a726d93477 --- /dev/null +++ b/clickhouse-datalake/start @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to start. +exit 0 diff --git a/clickhouse-datalake/stop b/clickhouse-datalake/stop new file mode 100755 index 0000000000..7661285688 --- /dev/null +++ b/clickhouse-datalake/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to stop. 
+exit 0 diff --git a/clickhouse-parquet-partitioned/benchmark.sh b/clickhouse-parquet-partitioned/benchmark.sh index 6ade867598..3b63e772a6 100755 --- a/clickhouse-parquet-partitioned/benchmark.sh +++ b/clickhouse-parquet-partitioned/benchmark.sh @@ -1,17 +1,5 @@ #!/bin/bash - -# Install - -curl https://clickhouse.com/ | sh - -../download-hits-parquet-partitioned - -# Run the queries - -./run.sh - -echo "Load time: 0" -echo "Data size: $(du -bcs hits*.parquet | grep total)" - -# Use for ClickHouse (Parquet, single) -# du -b hits.parquet +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/clickhouse-parquet-partitioned/check b/clickhouse-parquet-partitioned/check new file mode 100755 index 0000000000..86d2609b68 --- /dev/null +++ b/clickhouse-parquet-partitioned/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./clickhouse local --query "SELECT 1" >/dev/null diff --git a/clickhouse-parquet-partitioned/data-size b/clickhouse-parquet-partitioned/data-size new file mode 100755 index 0000000000..2d6921ab6d --- /dev/null +++ b/clickhouse-parquet-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits_*.parquet | awk '/total$/ { print $1 }' diff --git a/clickhouse-parquet-partitioned/install b/clickhouse-parquet-partitioned/install new file mode 100755 index 0000000000..43a2ea1c30 --- /dev/null +++ b/clickhouse-parquet-partitioned/install @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +if [ ! -x ./clickhouse ]; then + curl https://clickhouse.com/ | sh +fi diff --git a/clickhouse-parquet-partitioned/load b/clickhouse-parquet-partitioned/load new file mode 100755 index 0000000000..e496175596 --- /dev/null +++ b/clickhouse-parquet-partitioned/load @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# clickhouse-local with File(Parquet, 'hits_*.parquet') reads the parquet files +# in place, so there's no separate ingest step. Keep the downloaded +# hits_*.parquet files in this directory. +sync diff --git a/clickhouse-parquet-partitioned/query b/clickhouse-parquet-partitioned/query new file mode 100755 index 0000000000..a157a84bf3 --- /dev/null +++ b/clickhouse-parquet-partitioned/query @@ -0,0 +1,8 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via clickhouse-local with the table +# definition from create.sql prepended. Stdout: query result. Stderr: query +# runtime in fractional seconds on the last line. Exit non-zero on error. 
+set -e + +query=$(cat) +./clickhouse local --time --query="$(cat create.sql); ${query}" diff --git a/clickhouse-parquet-partitioned/results/20260509/c6a.4xlarge.json b/clickhouse-parquet-partitioned/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..8b34f68861 --- /dev/null +++ b/clickhouse-parquet-partitioned/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "ClickHouse (Parquet, partitioned)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","embedded","stateless","ClickHouse derivative"], + "load_time": 21, + "data_size": 14737666736, + "result": [ + [0.113, 0.01, 0.01], + [0.175, 0.028, 0.028], + [0.207, 0.05, 0.051], + [0.633, 0.08, 0.079], + [1.178, 0.335, 0.338], + [1.212, 0.679, 0.704], + [0.178, 0.031, 0.03], + [0.189, 0.03, 0.029], + [1.094, 0.504, 0.497], + [1.701, 0.583, 0.607], + [0.899, 0.173, 0.182], + [1.214, 0.186, 0.19], + [1.563, 0.639, 0.648], + [2.844, 0.915, 0.938], + [1.426, 0.727, 0.742], + [0.883, 0.478, 0.477], + [3.464, 1.878, 1.808], + [2.927, 1.419, 1.179], + [6.075, 3.648, 3.657], + [0.324, 0.072, 0.073], + [9.532, 1.126, 1.125], + [11.207, 1.177, 1.187], + [21.47, 1.525, 1.52], + [53.664, 2.872, 2.867], + [2.746, 0.339, 0.344], + [0.851, 0.238, 0.238], + [2.77, 0.343, 0.34], + [9.753, 1.795, 1.812], + [9.735, 9.6, 9.531], + [0.216, 0.051, 0.052], + [2.63, 0.587, 0.587], + [6.329, 0.971, 0.975], + [6.852, 4.399, 4.41], + [10.593, 3.007, 2.997], + [10.594, 3.025, 3.006], + [0.447, 0.329, 0.324], + [0.273, 0.1, 0.097], + [0.25, 0.059, 0.052], + [0.281, 0.046, 0.046], + [0.436, 0.184, 0.188], + [0.221, 0.037, 0.036], + [0.209, 0.035, 0.032], + [0.194, 0.024, 0.023] +] +} + diff --git a/clickhouse-parquet-partitioned/results/20260509/c6a.metal.json b/clickhouse-parquet-partitioned/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..99eeafb8f0 --- /dev/null +++ b/clickhouse-parquet-partitioned/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "ClickHouse (Parquet, partitioned)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","embedded","stateless","ClickHouse derivative"], + "load_time": 64, + "data_size": 14737666736, + "result": [ + [0.092, 0.033, 0.012], + [0.156, 0.055, 0.059], + [0.218, 0.076, 0.078], + [0.838, 0.074, 0.082], + [1.278, 0.134, 0.136], + [1.486, 0.153, 0.169], + [0.139, 0.039, 0.053], + [0.153, 0.065, 0.057], + [1.267, 0.365, 0.346], + [1.75, 0.357, 0.357], + [0.937, 0.164, 0.174], + [1.336, 0.149, 0.149], + [1.558, 0.222, 0.215], + [2.592, 0.275, 0.277], + [1.135, 0.259, 0.242], + [0.784, 0.157, 0.164], + [2.877, 0.435, 0.412], + [2.524, 0.331, 0.325], + [4.636, 0.824, 0.834], + [0.329, 0.071, 0.07], + [10.645, 0.378, 0.4], + [11.714, 0.38, 0.38], + [22.073, 0.404, 0.414], + [53.81, 0.919, 0.877], + [2.755, 0.153, 0.145], + [0.866, 0.129, 0.119], + [2.762, 0.136, 0.146], + [9.798, 0.506, 0.546], + [8.815, 1.715, 1.716], + [0.206, 0.107, 0.088], + [2.568, 0.212, 0.208], + [6.256, 0.267, 0.279], + [5.312, 1.159, 1.135], + [9.899, 0.758, 0.792], + [9.873, 0.834, 0.749], + [0.313, 0.126, 0.118], + [0.267, 0.102, 0.103], + [0.195, 0.07, 0.068], + [0.223, 0.056, 0.056], + [0.402, 0.15, 0.158], + [0.197, 0.042, 0.042], + [0.196, 0.039, 0.038], + [0.168, 0.03, 0.03] +] +} + diff --git a/clickhouse-parquet-partitioned/run.sh 
b/clickhouse-parquet-partitioned/run.sh deleted file mode 100755 index b0fd1faa55..0000000000 --- a/clickhouse-parquet-partitioned/run.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(./clickhouse local --time --format Null --query="$(cat create.sql); $query" 2>&1 | tail -n1) # (*) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - # (*) --format=Null is client-side formatting. The query result is still sent back to the client. - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/clickhouse-parquet-partitioned/start b/clickhouse-parquet-partitioned/start new file mode 100755 index 0000000000..a726d93477 --- /dev/null +++ b/clickhouse-parquet-partitioned/start @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to start. +exit 0 diff --git a/clickhouse-parquet-partitioned/stop b/clickhouse-parquet-partitioned/stop new file mode 100755 index 0000000000..7661285688 --- /dev/null +++ b/clickhouse-parquet-partitioned/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to stop. +exit 0 diff --git a/clickhouse-parquet/benchmark.sh b/clickhouse-parquet/benchmark.sh index d6845a14b5..fc4bacc8f3 100755 --- a/clickhouse-parquet/benchmark.sh +++ b/clickhouse-parquet/benchmark.sh @@ -1,14 +1,5 @@ #!/bin/bash - -# Install - -curl https://clickhouse.com/ | sh - -../download-hits-parquet-single - -# Run the queries - -./run.sh - -echo "Load time: 0" -echo "Data size: $(du -bcs hits.parquet)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/clickhouse-parquet/check b/clickhouse-parquet/check new file mode 100755 index 0000000000..86d2609b68 --- /dev/null +++ b/clickhouse-parquet/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./clickhouse local --query "SELECT 1" >/dev/null diff --git a/clickhouse-parquet/data-size b/clickhouse-parquet/data-size new file mode 100755 index 0000000000..1a38db62ca --- /dev/null +++ b/clickhouse-parquet/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits.parquet | awk 'END { print $1 }' diff --git a/clickhouse-parquet/install b/clickhouse-parquet/install new file mode 100755 index 0000000000..43a2ea1c30 --- /dev/null +++ b/clickhouse-parquet/install @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +if [ ! -x ./clickhouse ]; then + curl https://clickhouse.com/ | sh +fi diff --git a/clickhouse-parquet/load b/clickhouse-parquet/load new file mode 100755 index 0000000000..0d7d5fc290 --- /dev/null +++ b/clickhouse-parquet/load @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# clickhouse-local with File(Parquet) engine reads the parquet file in place, +# so there's no separate ingest step. The "load" is implicit — just keep the +# downloaded hits.parquet in this directory. +sync diff --git a/clickhouse-parquet/query b/clickhouse-parquet/query new file mode 100755 index 0000000000..a157a84bf3 --- /dev/null +++ b/clickhouse-parquet/query @@ -0,0 +1,8 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via clickhouse-local with the table +# definition from create.sql prepended. Stdout: query result. Stderr: query +# runtime in fractional seconds on the last line. Exit non-zero on error. 
+set -e + +query=$(cat) +./clickhouse local --time --query="$(cat create.sql); ${query}" diff --git a/clickhouse-parquet/results/20260509/c6a.4xlarge.json b/clickhouse-parquet/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..7cdd72212e --- /dev/null +++ b/clickhouse-parquet/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "ClickHouse (Parquet, single)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","embedded","stateless","ClickHouse derivative"], + "load_time": 5, + "data_size": 14779976446, + "result": [ + [0.136, 0.026, 0.026], + [0.211, 0.053, 0.069], + [0.29, 0.095, 0.093], + [0.377, 0.105, 0.102], + [0.683, 0.36, 0.361], + [0.884, 0.647, 0.645], + [0.211, 0.056, 0.054], + [0.229, 0.056, 0.059], + [0.831, 0.527, 0.521], + [1.389, 0.604, 0.596], + [0.693, 0.24, 0.239], + [0.648, 0.259, 0.263], + [1.096, 0.624, 0.629], + [2.569, 0.932, 0.929], + [1.167, 0.717, 0.712], + [0.699, 0.509, 0.505], + [3, 1.771, 1.762], + [2.353, 1.148, 1.142], + [5.806, 3.579, 3.602], + [0.375, 0.095, 0.096], + [10.99, 1.168, 1.166], + [12.672, 1.344, 1.362], + [24.547, 1.991, 1.993], + [62.402, 4.629, 4.62], + [2.481, 0.486, 0.483], + [0.773, 0.266, 0.267], + [2.461, 0.486, 0.485], + [9.672, 1.813, 1.789], + [9.613, 9.326, 9.37], + [0.27, 0.085, 0.085], + [2.333, 0.78, 0.774], + [5.922, 1.11, 1.13], + [6.425, 4.399, 4.392], + [10.583, 3.09, 3.071], + [10.631, 3.074, 3.058], + [0.556, 0.352, 0.356], + [0.417, 0.124, 0.104], + [0.3, 0.094, 0.08], + [0.379, 0.073, 0.075], + [0.415, 0.137, 0.134], + [0.28, 0.061, 0.058], + [0.259, 0.052, 0.053], + [0.242, 0.043, 0.043] +] +} + diff --git a/clickhouse-parquet/results/20260509/c6a.metal.json b/clickhouse-parquet/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..422bc506ba --- /dev/null +++ b/clickhouse-parquet/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "ClickHouse (Parquet, single)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","embedded","stateless","ClickHouse derivative"], + "load_time": 2, + "data_size": 14779976446, + "result": [ + [0.12, 0.027, 0.028], + [0.19, 0.071, 0.072], + [0.21, 0.081, 0.084], + [0.347, 0.089, 0.095], + [0.452, 0.151, 0.151], + [0.756, 0.179, 0.18], + [0.162, 0.069, 0.067], + [0.174, 0.077, 0.077], + [0.831, 0.392, 0.398], + [1.25, 0.422, 0.408], + [0.558, 0.189, 0.184], + [0.577, 0.18, 0.174], + [0.825, 0.225, 0.23], + [2.054, 0.279, 0.279], + [0.889, 0.225, 0.218], + [0.373, 0.166, 0.171], + [2.08, 0.404, 0.404], + [1.989, 0.322, 0.336], + [4.012, 0.968, 0.912], + [0.315, 0.085, 0.082], + [9.508, 0.534, 1.017], + [11.262, 1.14, 1.293], + [21.615, 2.084, 1.518], + [54.628, 6.521, 6.735], + [2.48, 0.158, 0.154], + [0.724, 0.134, 0.132], + [2.448, 0.16, 0.158], + [9.752, 1.977, 1.935], + [8.123, 1.457, 1.401], + [0.229, 0.098, 0.096], + [2.207, 0.241, 0.236], + [5.668, 0.32, 0.411], + [4.896, 1.158, 1.232], + [9.861, 0.871, 0.861], + [9.894, 1.05, 0.942], + [0.364, 0.156, 0.148], + [0.317, 0.124, 0.139], + [0.29, 0.105, 0.11], + [0.353, 0.098, 0.086], + [0.388, 0.147, 0.129], + [0.256, 0.061, 0.069], + [0.238, 0.057, 0.061], + [0.22, 0.052, 0.05] +] +} + diff --git a/clickhouse-parquet/run.sh b/clickhouse-parquet/run.sh deleted file mode 100755 index b0fd1faa55..0000000000 --- a/clickhouse-parquet/run.sh +++ 
/dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(./clickhouse local --time --format Null --query="$(cat create.sql); $query" 2>&1 | tail -n1) # (*) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - # (*) --format=Null is client-side formatting. The query result is still sent back to the client. - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/clickhouse-parquet/start b/clickhouse-parquet/start new file mode 100755 index 0000000000..a726d93477 --- /dev/null +++ b/clickhouse-parquet/start @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to start. +exit 0 diff --git a/clickhouse-parquet/stop b/clickhouse-parquet/stop new file mode 100755 index 0000000000..7661285688 --- /dev/null +++ b/clickhouse-parquet/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to stop. +exit 0 diff --git a/clickhouse-tencent/benchmark.sh b/clickhouse-tencent/benchmark.sh index 755ab5e26b..6a7f45d3a1 100755 --- a/clickhouse-tencent/benchmark.sh +++ b/clickhouse-tencent/benchmark.sh @@ -1,32 +1,5 @@ #!/bin/bash - -if [ ! -x /usr/bin/clickhouse ] -then -wget --continue --progress=dot:giga https://clickhouse-builds.s3.amazonaws.com/PRs/81944/e3a48c0de6d188232cc544244ba6862b63eb4762/build_amd_release/clickhouse-common-static-25.9.1.1-amd64.tgz -O clickhouse-tencent.tgz - mkdir -p clickhouse-tencent && tar -xzf clickhouse-tencent.tgz -C clickhouse-tencent - sudo clickhouse-tencent/clickhouse-common-static-25.9.1.1/usr/bin/clickhouse install --noninteractive -fi - -sudo clickhouse start - -for _ in {1..300} -do - clickhouse-client --query "SELECT 1" && break - sleep 1 -done - -clickhouse-client < create.sql - -../download-hits-parquet-partitioned -sudo mv hits_*.parquet /var/lib/clickhouse/user_files/ -sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet - -echo -n "Load time: " -clickhouse-client --time --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads $(( $(nproc) / 4 )) - -# Run the queries - -./run.sh "$1" - -echo -n "Data size: " -clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/clickhouse-tencent/check b/clickhouse-tencent/check new file mode 100755 index 0000000000..febe4e0de2 --- /dev/null +++ b/clickhouse-tencent/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +clickhouse-client --query "SELECT 1" >/dev/null diff --git a/clickhouse-tencent/data-size b/clickhouse-tencent/data-size new file mode 100755 index 0000000000..7770f6efb6 --- /dev/null +++ b/clickhouse-tencent/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" diff --git a/clickhouse-tencent/install b/clickhouse-tencent/install new file mode 100755 index 0000000000..aef34d30a9 --- /dev/null +++ b/clickhouse-tencent/install @@ -0,0 +1,18 @@ +#!/bin/bash +set -e + +# Install a specific Tencent-built clickhouse package (PR build). +if [ ! 
-x /usr/bin/clickhouse ]; then + wget --continue --progress=dot:giga \ + https://clickhouse-builds.s3.amazonaws.com/PRs/81944/e3a48c0de6d188232cc544244ba6862b63eb4762/build_amd_release/clickhouse-common-static-25.9.1.1-amd64.tgz \ + -O clickhouse-tencent.tgz + mkdir -p clickhouse-tencent && tar -xzf clickhouse-tencent.tgz -C clickhouse-tencent + sudo clickhouse-tencent/clickhouse-common-static-25.9.1.1/usr/bin/clickhouse install --noninteractive +fi + +# See clickhouse/install for the rationale — without this, ./check +# passes on SELECT 1 before user-database parts are loaded, and the +# first cold query stalls on the part loader. +sudo mkdir -p /etc/clickhouse-server/config.d +echo 'async_load_databases: false' \ + | sudo tee /etc/clickhouse-server/config.d/async_load_databases.yaml >/dev/null diff --git a/clickhouse-tencent/load b/clickhouse-tencent/load new file mode 100755 index 0000000000..4a423a9b42 --- /dev/null +++ b/clickhouse-tencent/load @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +clickhouse-client < create.sql + +sudo mv hits_*.parquet /var/lib/clickhouse/user_files/ +sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet + +clickhouse-client --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads "$(( $(nproc) / 4 ))" + +sudo rm -f /var/lib/clickhouse/user_files/hits_*.parquet +sync diff --git a/clickhouse-tencent/query b/clickhouse-tencent/query new file mode 100755 index 0000000000..9ef756b1f8 --- /dev/null +++ b/clickhouse-tencent/query @@ -0,0 +1,9 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via clickhouse-client. +# Stdout: query result (default format). +# Stderr: query runtime in fractional seconds on the last line (from --time). +# Exit non-zero on error. +set -e + +query=$(cat) +clickhouse-client --time --query="$query" diff --git a/clickhouse-tencent/run.sh b/clickhouse-tencent/run.sh deleted file mode 100755 index 0dce71cf96..0000000000 --- a/clickhouse-tencent/run.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -cat queries.sql | while read -r query; do - [ -z "$FQDN" ] && sync - [ -z "$FQDN" ] && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(clickhouse-client --host "${FQDN:=localhost}" --password "${PASSWORD:=}" ${PASSWORD:+--secure} --time --format=Null --query="$query" --progress 0 2>&1 ||:) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/clickhouse-tencent/start b/clickhouse-tencent/start new file mode 100755 index 0000000000..54819af9cc --- /dev/null +++ b/clickhouse-tencent/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# Idempotent: if already up, do nothing. +if clickhouse-client --query "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi +sudo clickhouse start diff --git a/clickhouse-tencent/stop b/clickhouse-tencent/stop new file mode 100755 index 0000000000..ea9d529c3e --- /dev/null +++ b/clickhouse-tencent/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo clickhouse stop || true diff --git a/clickhouse-web/benchmark.sh b/clickhouse-web/benchmark.sh index eb927e5ef1..21c0f79d7e 100755 --- a/clickhouse-web/benchmark.sh +++ b/clickhouse-web/benchmark.sh @@ -1,32 +1,6 @@ #!/bin/bash - -# The benchmark should be run in the eu-central-1 (Frankfurt) region. -# Allocate a network-optimized ("n") machine, e.g. c5n.4xlarge. 
- -# Install - -curl https://clickhouse.com/ | sh -sudo ./clickhouse install --noninteractive -sudo clickhouse start - -for _ in {1..300} -do - clickhouse-client --query "SELECT 1" && break - sleep 1 -done - -# A directory for cache -sudo mkdir /dev/shm/clickhouse -sudo chown clickhouse:clickhouse /dev/shm/clickhouse - -# Load the data - -echo -n "Load time: " -clickhouse-client --time < create.sql - -# Run the queries - -./run.sh - -echo -n "Data size: " -clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Data is read from a remote ClickHouse-hosted web disk; no local download. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/clickhouse-web/check b/clickhouse-web/check new file mode 100755 index 0000000000..febe4e0de2 --- /dev/null +++ b/clickhouse-web/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +clickhouse-client --query "SELECT 1" >/dev/null diff --git a/clickhouse-web/data-size b/clickhouse-web/data-size new file mode 100755 index 0000000000..7770f6efb6 --- /dev/null +++ b/clickhouse-web/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" diff --git a/clickhouse-web/install b/clickhouse-web/install new file mode 100755 index 0000000000..eb23629536 --- /dev/null +++ b/clickhouse-web/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +# Note: this benchmark expects to run in eu-central-1 (Frankfurt) on an +# n-class network-optimized machine (e.g. c5n.4xlarge), since data is fetched +# over HTTP from a public ClickHouse-hosted dataset. + +if [ ! -x /usr/bin/clickhouse ]; then + curl https://clickhouse.com/ | sh + sudo ./clickhouse install --noninteractive +fi + +# Cache directory used by the web disk. +sudo mkdir -p /dev/shm/clickhouse +sudo chown clickhouse:clickhouse /dev/shm/clickhouse diff --git a/clickhouse-web/load b/clickhouse-web/load new file mode 100755 index 0000000000..8b928b8f5b --- /dev/null +++ b/clickhouse-web/load @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# create.sql is an ATTACH TABLE that points to a remote web disk; nothing is +# downloaded or written here, the table is materialized on-demand at query +# time, with /dev/shm/clickhouse/ as a local cache. +clickhouse-client < create.sql +sync diff --git a/clickhouse-web/query b/clickhouse-web/query new file mode 100755 index 0000000000..72a6eda1e8 --- /dev/null +++ b/clickhouse-web/query @@ -0,0 +1,12 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via clickhouse-client. +# Stdout: query result (default format). +# Stderr: query runtime in fractional seconds on the last line (from --time). +# Exit non-zero on error. +# +# The web-disk cache is dropped before the query so timings are cold. 
+set -e + +query=$(cat) +clickhouse-client --query "SYSTEM DROP FILESYSTEM CACHE" >/dev/null +clickhouse-client --time --query="$query" diff --git a/clickhouse-web/results/20230226/c5n.4xlarge.json b/clickhouse-web/results/20230226/c5n.4xlarge.json index 961c8101ea..2c28d8dab3 100644 --- a/clickhouse-web/results/20230226/c5n.4xlarge.json +++ b/clickhouse-web/results/20230226/c5n.4xlarge.json @@ -9,7 +9,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "historical" ], diff --git a/clickhouse-web/results/20230423/c5n.4xlarge.json b/clickhouse-web/results/20230423/c5n.4xlarge.json index c478fcaeb1..6c616bcae1 100644 --- a/clickhouse-web/results/20230423/c5n.4xlarge.json +++ b/clickhouse-web/results/20230423/c5n.4xlarge.json @@ -9,7 +9,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "historical" ], diff --git a/clickhouse-web/results/20230423/c6a.metal.json b/clickhouse-web/results/20230423/c6a.metal.json index de040176f6..fe99184619 100644 --- a/clickhouse-web/results/20230423/c6a.metal.json +++ b/clickhouse-web/results/20230423/c6a.metal.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20230609/c5n.4xlarge.json b/clickhouse-web/results/20230609/c5n.4xlarge.json index 9e72cc4daf..8d08779fe2 100644 --- a/clickhouse-web/results/20230609/c5n.4xlarge.json +++ b/clickhouse-web/results/20230609/c5n.4xlarge.json @@ -9,7 +9,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "historical" ], diff --git a/clickhouse-web/results/20230901/c5n.4xlarge.json b/clickhouse-web/results/20230901/c5n.4xlarge.json index 7212a930ad..9a3046db24 100644 --- a/clickhouse-web/results/20230901/c5n.4xlarge.json +++ b/clickhouse-web/results/20230901/c5n.4xlarge.json @@ -9,7 +9,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "historical" ], diff --git a/clickhouse-web/results/20230901/c6a.metal.json b/clickhouse-web/results/20230901/c6a.metal.json index bbcc3b19ab..a3166caf5b 100644 --- a/clickhouse-web/results/20230901/c6a.metal.json +++ b/clickhouse-web/results/20230901/c6a.metal.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20231209/c5n.4xlarge.json b/clickhouse-web/results/20231209/c5n.4xlarge.json index b059eabaff..110aab5e89 100644 --- a/clickhouse-web/results/20231209/c5n.4xlarge.json +++ b/clickhouse-web/results/20231209/c5n.4xlarge.json @@ -9,7 +9,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "historical" ], diff --git a/clickhouse-web/results/20231209/c6a.metal.json b/clickhouse-web/results/20231209/c6a.metal.json index d97712df5c..255e2f3307 100644 --- a/clickhouse-web/results/20231209/c6a.metal.json +++ b/clickhouse-web/results/20231209/c6a.metal.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20240131/c5n.4xlarge.json b/clickhouse-web/results/20240131/c5n.4xlarge.json index 8d5218e79d..f1959141bc 100644 --- a/clickhouse-web/results/20240131/c5n.4xlarge.json +++ b/clickhouse-web/results/20240131/c5n.4xlarge.json @@ -9,7 +9,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "historical" ], diff --git a/clickhouse-web/results/20240131/c6a.metal.json 
b/clickhouse-web/results/20240131/c6a.metal.json index ae836ea542..bd4395cdda 100644 --- a/clickhouse-web/results/20240131/c6a.metal.json +++ b/clickhouse-web/results/20240131/c6a.metal.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20241106/c5n.4xlarge.json b/clickhouse-web/results/20241106/c5n.4xlarge.json index 6a59e107b1..62c43455c7 100644 --- a/clickhouse-web/results/20241106/c5n.4xlarge.json +++ b/clickhouse-web/results/20241106/c5n.4xlarge.json @@ -5,7 +5,7 @@ "cluster_size": 1, "tuned": "no", "comment": "", - "tags": ["C++", "column-oriented", "ClickHouse derivative", "serverless", "stateless", "historical"], + "tags": ["C++", "column-oriented", "ClickHouse derivative", "stateless", "historical"], "load_time": 1, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20241106/c6a.metal.json b/clickhouse-web/results/20241106/c6a.metal.json index 7c4fb534b4..cc0e43445f 100644 --- a/clickhouse-web/results/20241106/c6a.metal.json +++ b/clickhouse-web/results/20241106/c6a.metal.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250409/c6a.metal.json b/clickhouse-web/results/20250409/c6a.metal.json index a2eeb92ad1..33f0ef2e5c 100644 --- a/clickhouse-web/results/20250409/c6a.metal.json +++ b/clickhouse-web/results/20250409/c6a.metal.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250608/c6a.4xlarge.json b/clickhouse-web/results/20250608/c6a.4xlarge.json index deb52ac64a..3ddde070c2 100644 --- a/clickhouse-web/results/20250608/c6a.4xlarge.json +++ b/clickhouse-web/results/20250608/c6a.4xlarge.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250608/c6a.metal.json b/clickhouse-web/results/20250608/c6a.metal.json index 9d6b1dacdf..358e69ead9 100644 --- a/clickhouse-web/results/20250608/c6a.metal.json +++ b/clickhouse-web/results/20250608/c6a.metal.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250609/c6a.4xlarge.json b/clickhouse-web/results/20250609/c6a.4xlarge.json index 72128d84ca..1b30aa628b 100644 --- a/clickhouse-web/results/20250609/c6a.4xlarge.json +++ b/clickhouse-web/results/20250609/c6a.4xlarge.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250609/c6a.metal.json b/clickhouse-web/results/20250609/c6a.metal.json index f290d04946..b01ebd7328 100644 --- a/clickhouse-web/results/20250609/c6a.metal.json +++ b/clickhouse-web/results/20250609/c6a.metal.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250620/c6a.4xlarge.json b/clickhouse-web/results/20250620/c6a.4xlarge.json index 2b2805758d..25637f1846 100644 --- a/clickhouse-web/results/20250620/c6a.4xlarge.json +++ b/clickhouse-web/results/20250620/c6a.4xlarge.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git 
a/clickhouse-web/results/20250620/c6a.metal.json b/clickhouse-web/results/20250620/c6a.metal.json index 7da9c0ae5c..96e4345356 100644 --- a/clickhouse-web/results/20250620/c6a.metal.json +++ b/clickhouse-web/results/20250620/c6a.metal.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250710/c6a.4xlarge.json b/clickhouse-web/results/20250710/c6a.4xlarge.json index 37b9cc0609..a95ffe7608 100644 --- a/clickhouse-web/results/20250710/c6a.4xlarge.json +++ b/clickhouse-web/results/20250710/c6a.4xlarge.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250712/c8g.4xlarge.json b/clickhouse-web/results/20250712/c8g.4xlarge.json index 58682a4263..f8725772e2 100644 --- a/clickhouse-web/results/20250712/c8g.4xlarge.json +++ b/clickhouse-web/results/20250712/c8g.4xlarge.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250817/c8g.metal-48xl.json b/clickhouse-web/results/20250817/c8g.metal-48xl.json index efcc9a2b9d..415bcb16e7 100644 --- a/clickhouse-web/results/20250817/c8g.metal-48xl.json +++ b/clickhouse-web/results/20250817/c8g.metal-48xl.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250830/c6a.2xlarge.json b/clickhouse-web/results/20250830/c6a.2xlarge.json index 7dee891491..203e79d320 100644 --- a/clickhouse-web/results/20250830/c6a.2xlarge.json +++ b/clickhouse-web/results/20250830/c6a.2xlarge.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250830/c6a.4xlarge.json b/clickhouse-web/results/20250830/c6a.4xlarge.json index cb833263a9..62dd3093b8 100644 --- a/clickhouse-web/results/20250830/c6a.4xlarge.json +++ b/clickhouse-web/results/20250830/c6a.4xlarge.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250830/c6a.metal.json b/clickhouse-web/results/20250830/c6a.metal.json index 0b24c43bbb..927304ce9a 100644 --- a/clickhouse-web/results/20250830/c6a.metal.json +++ b/clickhouse-web/results/20250830/c6a.metal.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250830/c6a.xlarge.json b/clickhouse-web/results/20250830/c6a.xlarge.json index 245cc9c8b5..284921dfc6 100644 --- a/clickhouse-web/results/20250830/c6a.xlarge.json +++ b/clickhouse-web/results/20250830/c6a.xlarge.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250830/c7a.metal-48xl.json b/clickhouse-web/results/20250830/c7a.metal-48xl.json index 6580462c3c..5a37892b10 100644 --- a/clickhouse-web/results/20250830/c7a.metal-48xl.json +++ b/clickhouse-web/results/20250830/c7a.metal-48xl.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250830/c8g.4xlarge.json b/clickhouse-web/results/20250830/c8g.4xlarge.json index f0c0d7147f..6ff5ff5dc6 
100644 --- a/clickhouse-web/results/20250830/c8g.4xlarge.json +++ b/clickhouse-web/results/20250830/c8g.4xlarge.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250830/c8g.metal-48xl.json b/clickhouse-web/results/20250830/c8g.metal-48xl.json index 8b16e3b345..48e4168894 100644 --- a/clickhouse-web/results/20250830/c8g.metal-48xl.json +++ b/clickhouse-web/results/20250830/c8g.metal-48xl.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250831/c6a.2xlarge.json b/clickhouse-web/results/20250831/c6a.2xlarge.json index c20e00e2a2..2521d9c8b9 100644 --- a/clickhouse-web/results/20250831/c6a.2xlarge.json +++ b/clickhouse-web/results/20250831/c6a.2xlarge.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250831/c6a.4xlarge.json b/clickhouse-web/results/20250831/c6a.4xlarge.json index 2eab562526..1c89b25d10 100644 --- a/clickhouse-web/results/20250831/c6a.4xlarge.json +++ b/clickhouse-web/results/20250831/c6a.4xlarge.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250831/c6a.metal.json b/clickhouse-web/results/20250831/c6a.metal.json index ef8dab9375..9a5d515885 100644 --- a/clickhouse-web/results/20250831/c6a.metal.json +++ b/clickhouse-web/results/20250831/c6a.metal.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250831/c6a.xlarge.json b/clickhouse-web/results/20250831/c6a.xlarge.json index 269e562bbf..773db0dd62 100644 --- a/clickhouse-web/results/20250831/c6a.xlarge.json +++ b/clickhouse-web/results/20250831/c6a.xlarge.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20250907/c6a.metal.json b/clickhouse-web/results/20250907/c6a.metal.json index a3e66538be..f4888570c4 100644 --- a/clickhouse-web/results/20250907/c6a.metal.json +++ b/clickhouse-web/results/20250907/c6a.metal.json @@ -10,7 +10,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ], diff --git a/clickhouse-web/results/20251009/c6a.2xlarge.json b/clickhouse-web/results/20251009/c6a.2xlarge.json index 46ecdce4a8..a9f1a355a7 100644 --- a/clickhouse-web/results/20251009/c6a.2xlarge.json +++ b/clickhouse-web/results/20251009/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless", "lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless", "lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20251009/c6a.4xlarge.json b/clickhouse-web/results/20251009/c6a.4xlarge.json index 9b918039fb..a5bbcd2881 100644 --- a/clickhouse-web/results/20251009/c6a.4xlarge.json +++ b/clickhouse-web/results/20251009/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless", "lukewarm-cold-run"], + "tags": 
["C++","column-oriented","ClickHouse derivative","stateless", "lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20251009/c6a.metal.json b/clickhouse-web/results/20251009/c6a.metal.json index cad6b69912..15e2405c6b 100644 --- a/clickhouse-web/results/20251009/c6a.metal.json +++ b/clickhouse-web/results/20251009/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless", "lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless", "lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20251009/c6a.xlarge.json b/clickhouse-web/results/20251009/c6a.xlarge.json index a94a1150e2..a14ddbf5dc 100644 --- a/clickhouse-web/results/20251009/c6a.xlarge.json +++ b/clickhouse-web/results/20251009/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless", "lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless", "lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20251009/c7a.metal-48xl.json b/clickhouse-web/results/20251009/c7a.metal-48xl.json index 02b4cf1b2b..2ae5c0514a 100644 --- a/clickhouse-web/results/20251009/c7a.metal-48xl.json +++ b/clickhouse-web/results/20251009/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless", "lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless", "lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20251009/c8g.4xlarge.json b/clickhouse-web/results/20251009/c8g.4xlarge.json index 67a5d691fc..852e934a45 100644 --- a/clickhouse-web/results/20251009/c8g.4xlarge.json +++ b/clickhouse-web/results/20251009/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless", "lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless", "lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20251009/c8g.metal-48xl.json b/clickhouse-web/results/20251009/c8g.metal-48xl.json index f94c54b3e8..e45c98f6c6 100644 --- a/clickhouse-web/results/20251009/c8g.metal-48xl.json +++ b/clickhouse-web/results/20251009/c8g.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless", "lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless", "lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20251220/c6a.2xlarge.json b/clickhouse-web/results/20251220/c6a.2xlarge.json index a1dba6c150..4fe70f21c1 100644 --- a/clickhouse-web/results/20251220/c6a.2xlarge.json +++ b/clickhouse-web/results/20251220/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": 
["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20251220/c6a.4xlarge.json b/clickhouse-web/results/20251220/c6a.4xlarge.json index 17dc0ac301..90b4fe5d63 100644 --- a/clickhouse-web/results/20251220/c6a.4xlarge.json +++ b/clickhouse-web/results/20251220/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20251220/c6a.metal.json b/clickhouse-web/results/20251220/c6a.metal.json index d9fd3383a9..8ceb261d78 100644 --- a/clickhouse-web/results/20251220/c6a.metal.json +++ b/clickhouse-web/results/20251220/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20251220/c6a.xlarge.json b/clickhouse-web/results/20251220/c6a.xlarge.json index f74d7b00db..45279aa469 100644 --- a/clickhouse-web/results/20251220/c6a.xlarge.json +++ b/clickhouse-web/results/20251220/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20251220/c7a.metal-48xl.json b/clickhouse-web/results/20251220/c7a.metal-48xl.json index 6570c567b2..9e046043e7 100644 --- a/clickhouse-web/results/20251220/c7a.metal-48xl.json +++ b/clickhouse-web/results/20251220/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20251220/c8g.4xlarge.json b/clickhouse-web/results/20251220/c8g.4xlarge.json index 925996d8ac..ad60b64d9a 100644 --- a/clickhouse-web/results/20251220/c8g.4xlarge.json +++ b/clickhouse-web/results/20251220/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20251220/c8g.metal-48xl.json b/clickhouse-web/results/20251220/c8g.metal-48xl.json index 2cedafd527..bbc09eda59 100644 --- a/clickhouse-web/results/20251220/c8g.metal-48xl.json +++ b/clickhouse-web/results/20251220/c8g.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse 
derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260309/c6a.2xlarge.json b/clickhouse-web/results/20260309/c6a.2xlarge.json index eeedb8332c..be29b323b3 100644 --- a/clickhouse-web/results/20260309/c6a.2xlarge.json +++ b/clickhouse-web/results/20260309/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260309/c6a.4xlarge.json b/clickhouse-web/results/20260309/c6a.4xlarge.json index b9e0644e29..948d78238a 100644 --- a/clickhouse-web/results/20260309/c6a.4xlarge.json +++ b/clickhouse-web/results/20260309/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260309/c6a.metal.json b/clickhouse-web/results/20260309/c6a.metal.json index dc62a7c82f..35da44319d 100644 --- a/clickhouse-web/results/20260309/c6a.metal.json +++ b/clickhouse-web/results/20260309/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260309/c6a.xlarge.json b/clickhouse-web/results/20260309/c6a.xlarge.json index 5927210400..6af8c9a8b9 100644 --- a/clickhouse-web/results/20260309/c6a.xlarge.json +++ b/clickhouse-web/results/20260309/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260309/c7a.metal-48xl.json b/clickhouse-web/results/20260309/c7a.metal-48xl.json index a297c45b0c..ae3aaf6ebf 100644 --- a/clickhouse-web/results/20260309/c7a.metal-48xl.json +++ b/clickhouse-web/results/20260309/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260309/c8g.4xlarge.json b/clickhouse-web/results/20260309/c8g.4xlarge.json index 57fc2660a9..06ad78037b 100644 --- a/clickhouse-web/results/20260309/c8g.4xlarge.json +++ b/clickhouse-web/results/20260309/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], 
"load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260309/c8g.metal-48xl.json b/clickhouse-web/results/20260309/c8g.metal-48xl.json index f6f05c221b..3e28d55a9e 100644 --- a/clickhouse-web/results/20260309/c8g.metal-48xl.json +++ b/clickhouse-web/results/20260309/c8g.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260327/c6a.2xlarge.json b/clickhouse-web/results/20260327/c6a.2xlarge.json index 086322f032..2d4a294a96 100644 --- a/clickhouse-web/results/20260327/c6a.2xlarge.json +++ b/clickhouse-web/results/20260327/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260327/c6a.4xlarge.json b/clickhouse-web/results/20260327/c6a.4xlarge.json index f3b38ca5ff..daaa0675cc 100644 --- a/clickhouse-web/results/20260327/c6a.4xlarge.json +++ b/clickhouse-web/results/20260327/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260327/c6a.metal.json b/clickhouse-web/results/20260327/c6a.metal.json index 5972bb231a..53ecd60a00 100644 --- a/clickhouse-web/results/20260327/c6a.metal.json +++ b/clickhouse-web/results/20260327/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260327/c6a.xlarge.json b/clickhouse-web/results/20260327/c6a.xlarge.json index 3be92c7ff0..81949011ba 100644 --- a/clickhouse-web/results/20260327/c6a.xlarge.json +++ b/clickhouse-web/results/20260327/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260327/c7a.metal-48xl.json b/clickhouse-web/results/20260327/c7a.metal-48xl.json index 0f623e51bf..9956b2bb47 100644 --- a/clickhouse-web/results/20260327/c7a.metal-48xl.json +++ b/clickhouse-web/results/20260327/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, 
"result": [ diff --git a/clickhouse-web/results/20260327/c8g.4xlarge.json b/clickhouse-web/results/20260327/c8g.4xlarge.json index 554dab3170..d4a0aa1bcf 100644 --- a/clickhouse-web/results/20260327/c8g.4xlarge.json +++ b/clickhouse-web/results/20260327/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260327/c8g.metal-48xl.json b/clickhouse-web/results/20260327/c8g.metal-48xl.json index 1623e0bc5a..457f053e0b 100644 --- a/clickhouse-web/results/20260327/c8g.metal-48xl.json +++ b/clickhouse-web/results/20260327/c8g.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260501/c6a.2xlarge.json b/clickhouse-web/results/20260501/c6a.2xlarge.json index 8f99d5ddce..b14a134774 100644 --- a/clickhouse-web/results/20260501/c6a.2xlarge.json +++ b/clickhouse-web/results/20260501/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260501/c6a.4xlarge.json b/clickhouse-web/results/20260501/c6a.4xlarge.json index 0d45c40d17..f60ac1a4d8 100644 --- a/clickhouse-web/results/20260501/c6a.4xlarge.json +++ b/clickhouse-web/results/20260501/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260501/c6a.metal.json b/clickhouse-web/results/20260501/c6a.metal.json index 49984f2d2a..cac32d7f58 100644 --- a/clickhouse-web/results/20260501/c6a.metal.json +++ b/clickhouse-web/results/20260501/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260501/c6a.xlarge.json b/clickhouse-web/results/20260501/c6a.xlarge.json index 853f437034..662d3955ec 100644 --- a/clickhouse-web/results/20260501/c6a.xlarge.json +++ b/clickhouse-web/results/20260501/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git 
a/clickhouse-web/results/20260501/c7a.metal-48xl.json b/clickhouse-web/results/20260501/c7a.metal-48xl.json index 2dad6d6d30..0b2bf2a965 100644 --- a/clickhouse-web/results/20260501/c7a.metal-48xl.json +++ b/clickhouse-web/results/20260501/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260501/c8g.4xlarge.json b/clickhouse-web/results/20260501/c8g.4xlarge.json index 5a81f4e4d1..b05f69cd8a 100644 --- a/clickhouse-web/results/20260501/c8g.4xlarge.json +++ b/clickhouse-web/results/20260501/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260501/c8g.metal-48xl.json b/clickhouse-web/results/20260501/c8g.metal-48xl.json index 3a836406dc..e11803534e 100644 --- a/clickhouse-web/results/20260501/c8g.metal-48xl.json +++ b/clickhouse-web/results/20260501/c8g.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["C++","column-oriented","ClickHouse derivative","serverless","stateless","lukewarm-cold-run"], + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], "load_time": 0, "data_size": 14557009492, "result": [ diff --git a/clickhouse-web/results/20260509/c6a.4xlarge.json b/clickhouse-web/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..fbc8104e8e --- /dev/null +++ b/clickhouse-web/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "ClickHouse (web)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], + "load_time": 1, + "data_size": 14557009492, + "result": [ + [0.002, 0.002, 0.002], + [0.166, 0.049, 0.084], + [0.315, 0.149, 0.203], + [0.396, 0.242, 0.253], + [0.499, 0.419, 0.43], + [0.993, 0.836, 1.181], + [0.095, 0.037, 0.035], + [0.111, 0.044, 0.048], + [0.795, 0.654, 0.655], + [0.87, 1.28, 0.757], + [0.62, 0.53, 0.515], + [0.622, 0.564, 0.55], + [0.933, 0.818, 0.822], + [1.376, 1.414, 1.58], + [1.194, 1.037, 1.031], + [0.71, 0.535, 0.549], + [2.089, 1.945, 1.921], + [1.557, 1.415, 1.48], + [4.035, 3.554, 3.403], + [0.638, 0.313, 0.387], + [4.469, 3.542, 3.539], + [3.757, 1.884, 1.909], + [5.06, 3.33, 3.454], + [5.305, 2.203, 1.36], + [0.603, 0.565, 1.182], + [0.546, 0.714, 0.505], + [0.576, 0.58, 1.152], + [3.527, 3.415, 3.406], + [10.71, 10.228, 10.151], + [0.368, 0.195, 0.172], + [1.34, 1.006, 1.385], + [2.507, 2.249, 2.211], + [4.911, 4.772, 4.847], + [5.142, 5.054, 5.18], + [5.194, 5.283, 5.301], + [0.523, 0.45, 0.405], + [0.259, 0.159, 0.159], + [0.337, 0.149, 0.139], + [0.313, 0.131, 0.133], + [0.432, 0.198, 0.199], + [0.266, 0.102, 0.105], + [0.255, 0.096, 0.097], + [0.185, 0.153, 0.104] +] +} + diff --git a/clickhouse-web/results/20260509/c6a.metal.json b/clickhouse-web/results/20260509/c6a.metal.json new file mode 100644 index 
0000000000..53e750ae8d --- /dev/null +++ b/clickhouse-web/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "ClickHouse (web)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","ClickHouse derivative","stateless","lukewarm-cold-run"], + "load_time": 1, + "data_size": 14557009492, + "result": [ + [0.002, 0.001, 0.001], + [0.199, 0.086, 0.072], + [0.313, 0.165, 0.142], + [0.312, 0.172, 0.272], + [0.303, 0.21, 0.217], + [0.736, 0.455, 0.425], + [0.127, 0.074, 0.037], + [0.175, 0.079, 0.116], + [0.623, 0.476, 0.461], + [0.594, 0.449, 0.466], + [0.429, 0.308, 0.3], + [0.405, 0.291, 0.324], + [0.573, 0.466, 0.501], + [0.681, 0.584, 0.565], + [0.605, 0.509, 0.485], + [0.277, 0.221, 0.245], + [0.673, 0.566, 0.568], + [0.565, 0.505, 0.498], + [1.096, 0.836, 0.832], + [0.593, 0.269, 0.332], + [1.298, 1.08, 1.263], + [1.055, 1.289, 1.24], + [1.367, 1.543, 1.55], + [5.208, 0.667, 0.841], + [0.241, 1.151, 0.368], + [0.454, 0.417, 0.422], + [0.305, 0.24, 0.336], + [1.29, 0.908, 0.924], + [1.898, 1.713, 1.683], + [0.213, 0.136, 0.148], + [0.714, 0.504, 0.585], + [0.865, 0.759, 1.357], + [2.211, 1.489, 1.403], + [1.949, 1.683, 1.612], + [1.552, 1.551, 1.702], + [0.254, 0.178, 0.187], + [0.363, 0.172, 0.182], + [0.21, 0.204, 0.183], + [0.348, 0.146, 0.125], + [0.471, 0.202, 0.225], + [0.321, 0.087, 0.088], + [0.25, 0.087, 0.086], + [0.22, 0.094, 0.124] +] +} + diff --git a/clickhouse-web/run.sh b/clickhouse-web/run.sh deleted file mode 100755 index 502332872c..0000000000 --- a/clickhouse-web/run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -cat queries.sql | while read -r query; do - clickhouse-client --query "SYSTEM DROP FILESYSTEM CACHE" - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(clickhouse-client --host "${FQDN:=localhost}" --password "${PASSWORD:=}" ${PASSWORD:+--secure} --time --format=Null --query="$query" --progress 0 2>&1 ||:) # (*) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - # (*) --format=Null is client-side formatting. The query result is still sent back to the client. - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/clickhouse-web/start b/clickhouse-web/start new file mode 100755 index 0000000000..54819af9cc --- /dev/null +++ b/clickhouse-web/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# Idempotent: if already up, do nothing. +if clickhouse-client --query "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi +sudo clickhouse start diff --git a/clickhouse-web/stop b/clickhouse-web/stop new file mode 100755 index 0000000000..ea9d529c3e --- /dev/null +++ b/clickhouse-web/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo clickhouse stop || true diff --git a/clickhouse-web/template.json b/clickhouse-web/template.json index 9686fefe69..4caddd16fd 100644 --- a/clickhouse-web/template.json +++ b/clickhouse-web/template.json @@ -7,7 +7,6 @@ "C++", "column-oriented", "ClickHouse derivative", - "serverless", "stateless", "lukewarm-cold-run" ] diff --git a/clickhouse/benchmark.sh b/clickhouse/benchmark.sh index 18fcf86ef6..6a7f45d3a1 100755 --- a/clickhouse/benchmark.sh +++ b/clickhouse/benchmark.sh @@ -1,52 +1,5 @@ #!/bin/bash - -# Install - -if [ ! 
-x /usr/bin/clickhouse ] -then - curl https://clickhouse.com/ | sh - sudo ./clickhouse install --noninteractive -fi - -# Optional: if you want to use higher compression: -if (( 0 )); then - echo " -compression: - case: - method: zstd - " | sudo tee /etc/clickhouse-server/config.d/compression.yaml -fi; - -sudo clickhouse start - -for _ in {1..300} -do - clickhouse-client --query "SELECT 1" && break - sleep 1 -done - -# Determine which set of files to use depending on the type of run -if [ "$1" != "" ] && [ "$1" != "tuned" ] && [ "$1" != "tuned-memory" ]; then - echo "Error: command line argument must be one of {'', 'tuned', 'tuned-memory'}" - exit 1 -elif [ ! -z "$1" ]; then - SUFFIX="-$1" -fi - -# Load the data - -clickhouse-client < create"$SUFFIX".sql - -../download-hits-parquet-partitioned -sudo mv hits_*.parquet /var/lib/clickhouse/user_files/ -sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet - -echo -n "Load time: " -clickhouse-client --time --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads $(( $(nproc) / 4 )) - -# Run the queries - -./run.sh "$1" - -echo -n "Data size: " -clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/clickhouse/check b/clickhouse/check new file mode 100755 index 0000000000..febe4e0de2 --- /dev/null +++ b/clickhouse/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +clickhouse-client --query "SELECT 1" >/dev/null diff --git a/clickhouse/data-size b/clickhouse/data-size new file mode 100755 index 0000000000..7770f6efb6 --- /dev/null +++ b/clickhouse/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" diff --git a/clickhouse/install b/clickhouse/install new file mode 100755 index 0000000000..4d05fc61f8 --- /dev/null +++ b/clickhouse/install @@ -0,0 +1,19 @@ +#!/bin/bash +set -e + +if [ ! -x /usr/bin/clickhouse ]; then + curl https://clickhouse.com/ | sh + sudo ./clickhouse install --noninteractive +fi + +# Force synchronous database load on startup. With the default +# async_load_databases=1, the server binds its listen port and answers +# SELECT 1 before user-database parts have finished loading — so the +# lib's ./check passes, drop_caches+restart looks "ready", and the +# first real query (e.g. Q40) then stalls 2–3 s waiting for the part +# loader to finish. That manifested as a ~2 s floor on every cold run +# on c6a.4xlarge (Q40 going 2.89 s -> 0.25 s here once parts are +# already loaded when the cold timer starts). 
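+#
+# (Optional sanity check, not part of the benchmark flow: after restarting
+# the server, `clickhouse-client --query "SELECT name, value FROM
+# system.server_settings WHERE name = 'async_load_databases'"` should show
+# the setting disabled, on versions that expose server settings there.)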
+sudo mkdir -p /etc/clickhouse-server/config.d +echo 'async_load_databases: false' \ + | sudo tee /etc/clickhouse-server/config.d/async_load_databases.yaml >/dev/null diff --git a/clickhouse/load b/clickhouse/load new file mode 100755 index 0000000000..4a423a9b42 --- /dev/null +++ b/clickhouse/load @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +clickhouse-client < create.sql + +sudo mv hits_*.parquet /var/lib/clickhouse/user_files/ +sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet + +clickhouse-client --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads "$(( $(nproc) / 4 ))" + +sudo rm -f /var/lib/clickhouse/user_files/hits_*.parquet +sync diff --git a/clickhouse/query b/clickhouse/query new file mode 100755 index 0000000000..c6abe5b818 --- /dev/null +++ b/clickhouse/query @@ -0,0 +1,8 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via clickhouse-client. +# Stdout: query result (default format). +# Stderr: query runtime in fractional seconds on the last line (from --time). +# Exit non-zero on error. +set -e + +clickhouse-client --time diff --git a/clickhouse/results/20260509/c6a.4xlarge.json b/clickhouse/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..53e4126cdd --- /dev/null +++ b/clickhouse/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "ClickHouse", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","ClickHouse derivative","lukewarm-cold-run"], + "load_time": 296, + "data_size": 15296909015, + "result": [ + [0.6, 0.105, 0.002], + [0.965, 0.008, 0.021], + [0.742, 0.206, 0.034], + [1.586, 0.028, 0.026], + [1.848, 0.299, 0.29], + [2.515, 0.627, 0.576], + [1.247, 0.028, 0.01], + [1.237, 0.012, 0.021], + [2.14, 0.498, 0.467], + [2.837, 0.533, 0.481], + [1.986, 0.169, 0.145], + [2.888, 0.144, 0.145], + [3.498, 0.524, 0.531], + [4.125, 0.793, 0.751], + [3.37, 0.59, 0.517], + [1.824, 0.618, 0.392], + [4.525, 1.591, 1.596], + [4.047, 0.987, 0.966], + [7.018, 3.312, 2.887], + [2.918, 0.003, 0.003], + [11.602, 0.306, 0.312], + [13.141, 0.077, 0.077], + [16.411, 0.657, 0.669], + [4.84, 0.109, 0.108], + [4.478, 0.033, 0.03], + [2.956, 0.289, 0.143], + [4.546, 0.068, 0.032], + [3.535, 0.089, 0.088], + [12.687, 9.546, 9.639], + [0.781, 0.428, 0.035], + [2.279, 0.408, 0.326], + [4.669, 0.585, 0.566], + [7.26, 4.257, 4.375], + [11.876, 3.172, 3.205], + [12.136, 3.186, 3.223], + [1.422, 1.351, 0.28], + [2.412, 0.057, 0.048], + [2.342, 0.038, 0.022], + [2.472, 0.018, 0.017], + [2.417, 0.081, 0.08], + [2.571, 0.024, 0.015], + [2.656, 0.011, 0.011], + [2.399, 0.009, 0.01] +] +} + diff --git a/clickhouse/results/20260509/c6a.metal.json b/clickhouse/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..7ab8656376 --- /dev/null +++ b/clickhouse/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "ClickHouse", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","ClickHouse derivative","lukewarm-cold-run"], + "load_time": 226, + "data_size": 15313552322, + "result": [ + [0.006, 0.001, 0.002], + [0.209, 0.04, 0.01], + [0.823, 0.025, 0.012], + [0.845, 0.122, 0.017], + [1.282, 0.074, 0.071], + [2.422, 0.119, 0.121], + [0.376, 0.128, 0.01], + [0.407, 0.014, 0.026], + [1.413, 0.304, 0.286], + [2.636, 0.302, 0.301], + [1.317, 0.362, 0.248], + [1.863, 
0.107, 0.112], + [2.74, 0.197, 0.262], + [3.519, 0.249, 0.275], + [2.126, 0.378, 0.248], + [1.157, 0.71, 0.327], + [2.809, 0.327, 0.335], + [2.482, 0.759, 0.416], + [3.926, 0.826, 0.814], + [1.605, 0.003, 0.003], + [10.566, 0.084, 0.116], + [11.734, 0.149, 0.16], + [14.939, 0.259, 0.281], + [4.574, 0.12, 0.08], + [3.147, 0.09, 0.036], + [1.883, 0.654, 0.066], + [3.133, 0.504, 0.032], + [1.174, 0.461, 0.399], + [9.043, 1.388, 1.479], + [1.919, 0.051, 0.039], + [3.36, 0.1, 0.11], + [5.694, 0.186, 0.235], + [5.874, 1.214, 1.295], + [10.18, 0.7, 0.77], + [11.402, 0.709, 0.789], + [2.874, 0.095, 0.133], + [3.155, 0.122, 0.089], + [2.617, 0.055, 0.034], + [3.013, 0.029, 0.027], + [3.526, 0.138, 0.14], + [2.47, 0.034, 0.016], + [2.235, 0.023, 0.013], + [2.154, 0.113, 0.013] +] +} + diff --git a/clickhouse/run.sh b/clickhouse/run.sh deleted file mode 100755 index 820a39e3bc..0000000000 --- a/clickhouse/run.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -# Determine which set of files to use depending on the type of run -if [ "$1" != "" ] && [ "$1" != "tuned" ] && [ "$1" != "tuned-memory" ]; then - echo "Error: command line argument must be one of {'', 'tuned', 'tuned-memory'}" - exit 1 -else if [ ! -z "$1" ]; then - SUFFIX="-$1" -fi -fi - -TRIES=3 -QUERY_NUM=1 -cat queries"$SUFFIX".sql | while read -r query; do - [ -z "$FQDN" ] && sync - [ -z "$FQDN" ] && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(clickhouse-client --host "${FQDN:=localhost}" --password "${PASSWORD:=}" ${PASSWORD:+--secure} --time --format=Null --query="$query" --progress 0 2>&1 ||:) # (*) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - # (*) --format=Null is client-side formatting. The query result is still sent back to the client. - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/clickhouse/start b/clickhouse/start new file mode 100755 index 0000000000..a3aa66fe75 --- /dev/null +++ b/clickhouse/start @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo clickhouse start diff --git a/clickhouse/stop b/clickhouse/stop new file mode 100755 index 0000000000..ea9d529c3e --- /dev/null +++ b/clickhouse/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo clickhouse stop || true diff --git a/cloud-init.sh.in b/cloud-init.sh.in index b65a808c96..f44d208dfd 100644 --- a/cloud-init.sh.in +++ b/cloud-init.sh.in @@ -2,6 +2,27 @@ # This is the script for cloud-init, to run on a VM in unattended fashion. See run-benchmark.sh +# Cloud-init runs scripts as root with no HOME exported. Tools that follow +# XDG-ish conventions (DuckDB extensions in ~/.duckdb, the GizmoSQL one-line +# installer that runs `sh -u`, etc.) then fall over with messages like +# `Can't find the home directory at ''` or `HOME: parameter not set`. Set it +# once here so every per-system install/start/load/query inherits it. +export HOME="${HOME:-/root}" + +# c6a.4xlarge has 32 GB RAM with no swap. Loading the 75 GB hits.tsv into +# row-oriented databases (mysql, mariadb, postgres, mongodb, cratedb) and +# in-process Python servers (pandas/polars/duckdb-dataframe) fills RAM +# during ingest, and earlyoom (or the kernel OOM killer) then takes the +# DB out and the run dies with `Lost connection during query`. A 16 GB +# swapfile gives those loads the headroom they need without affecting +# query-time numbers (queries don't touch swap once the data is hot). +if [ ! 
-f /swapfile ]; then + fallocate -l 16G /swapfile + chmod 600 /swapfile + mkswap /swapfile >/dev/null + swapon /swapfile +fi + export DEBIAN_FRONTEND=noninteractive apt-get update -y apt-get install -y wget curl git jq earlyoom @@ -25,7 +46,14 @@ jq -c -r '.tags' template.json | tee -a log echo -n 'Disk usage before: ' | tee -a log df -B1 / | tail -n1 | awk '{ print $3 }' | tee -a log -timeout 20000 ./benchmark.sh 2>&1 | tee -a log +# 20000s (~5.5h) wasn't enough for slow OLTP-style systems (mysql, +# mariadb, postgresql{,-indexed,-orioledb}, mongodb, cratedb, sqlite, +# turso, timescaledb-no-columnstore, mysql-myisam) — they hit the +# timeout mid-load or a few queries in. 36000s (10h) clears the +# observed worst case while still capping a runaway run. Override at +# render time on the operator side by exporting `timeout` before +# run-benchmark.sh. +timeout @timeout@ ./benchmark.sh 2>&1 | tee -a log echo -n 'Disk usage after: ' | tee -a log df -B1 / | tail -n1 | awk '{ print $3 }' | tee -a log diff --git a/cloudberry/README.md b/cloudberry/README.md index 999ff99284..518724a787 100644 --- a/cloudberry/README.md +++ b/cloudberry/README.md @@ -1,3 +1,15 @@ Cloudberry DB is a fork of Greenplum DB, based on PG 14. -To run the test, put all files in a single directory and run benchmark.sh under root user, then follow the instructions (you will need to run the script multiple times with different options). +The benchmark runs Cloudberry inside a privileged Rocky 9 docker container, +so the host can be Ubuntu/Debian/RHEL/anything that runs docker. The install +script builds Cloudberry from source inside the container (~10–20 min on +first install) and initializes a single-segment cluster via gpinitsystem. + +The other scripts (start/stop/check/load/query) `docker exec` into the +running container. 
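+
+The container is named `clickbench-cloudberry`. If you want to poke at the
+cluster by hand (purely optional; the benchmark itself only goes through the
+scripts), essentially the same command the `check` script runs also works
+interactively:
+
+```
+sudo docker exec clickbench-cloudberry \
+    sudo -iu gpadmin bash -lc 'psql -d postgres -t -c "SELECT 1"'
+```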
+ +To run the test: + +``` +./benchmark.sh +``` diff --git a/cloudberry/benchmark.sh b/cloudberry/benchmark.sh index 4271c60306..531bd65038 100755 --- a/cloudberry/benchmark.sh +++ b/cloudberry/benchmark.sh @@ -1,126 +1,5 @@ #!/bin/bash -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -if [[ $1 == '' ]]; then - echo "SELINUX=disabled" > /etc/selinux/config - SHMALL=$(expr $(getconf _PHYS_PAGES) / 2) - SHMAX=$(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE)) - echo "Using shmall=$SHMALL, shmax=$SHMAX" - echo " -kernel.shmall = $SHMALL -kernel.shmmax = $SHMAX -kernel.shmmni = 4096 -vm.overcommit_memory = 2 # See Segment Host Memory -vm.overcommit_ratio = 95 # See Segment Host Memory -net.ipv4.ip_local_port_range = 10000 65535 # See Port Settings -kernel.sem = 250 2048000 200 8192 -kernel.sysrq = 1 -kernel.core_uses_pid = 1 -kernel.msgmnb = 65536 -kernel.msgmax = 65536 -kernel.msgmni = 2048 -net.ipv4.tcp_syncookies = 1 -net.ipv4.conf.default.accept_source_route = 0 -net.ipv4.tcp_max_syn_backlog = 4096 -net.ipv4.conf.all.arp_filter = 1 -net.ipv4.ipfrag_high_thresh = 41943040 -net.ipv4.ipfrag_low_thresh = 31457280 -net.ipv4.ipfrag_time = 60 -net.core.netdev_max_backlog = 10000 -net.core.rmem_max = 2097152 -net.core.wmem_max = 2097152 -vm.swappiness = 10 -vm.zone_reclaim_mode = 0 -vm.dirty_expire_centisecs = 500 -vm.dirty_writeback_centisecs = 100 -vm.dirty_background_ratio = 0 # See System Memory -vm.dirty_ratio = 0 -vm.dirty_background_bytes = 1610612736 -vm.dirty_bytes = 4294967296 -" >> /etc/sysctl.conf - sysctl -p - - echo " -* soft nofile 524288 -* hard nofile 524288 -* soft nproc 131072 -* hard nproc 131072 -" > /etc/security/limits.conf - - echo " -RemoveIPC=no -" > /etc/systemd/logind.conf - - groupadd gpadmin - useradd gpadmin -r -m -g gpadmin - sudo -u gpadmin ssh-keygen -t rsa -b 4096 - usermod -aG wheel gpadmin - echo "%wheel ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers - - grubby --update-kernel=ALL --args="transparent_hugepage=never" - - echo "Please reboot now. Then launch the script with the 'db-install' parameter." 
- -elif [[ $1 == 'db-install' ]]; then - echo "Database installation" - yum install -y go - export GOPROXY=https://goproxy.io,direct - yum -y install R apr apr-devel apr-util automake autoconf bash bison bison-devel bzip2 bzip2-devel flex flex-devel gcc gcc-c++ git gdb iproute krb5-devel less libevent libevent-devel libxml2 libxml2-devel libyaml libzstd-devel libzstd make openldap openssh openssh-clients openssh-server openssl openssl-devel openssl-libs perl python3-devel readline readline-devel rsync sed sudo tar vim wget which zip zlib python3-pip python3-venv python3-psycopg2 postgresql15 libpq-devel psutils - yum install curl libcurl-devel --allowerasing - yum install https://cdn.amazonlinux.com/2/core/2.0/x86_64/6b0225ccc542f3834c95733dcf321ab9f1e77e6ca6817469771a8af7c49efe6c/../../../../../blobstore/4846e71174e99f1b7f0985aa01631de003633d3a5f1a950812323c175214ae16/xerces-c-3.1.1-10.amzn2.x86_64.rpm - yum install https://cdn.amazonlinux.com/2/core/2.0/x86_64/6b0225ccc542f3834c95733dcf321ab9f1e77e6ca6817469771a8af7c49efe6c/../../../../../blobstore/53208ffe95cd1e38bba94984661e79134b3cc1b039922e828c40df7214ecaee8/xerces-c-devel-3.1.1-10.amzn2.x86_64.rpm - - python3 -m venv myenv - source myenv/bin/activate - pip install PygreSQL psutil - if [[ $2 != 'no_dl' ]]; then wget --continue --progress=dot:giga https://github.com/cloudberrydb/cloudberrydb/archive/refs/tags/1.5.3.tar.gz; fi - tar -xzf 1.5.3.tar.gz - cd cloudberrydb-1.5.3/ - echo -e "/usr/local/lib \n/usr/local/lib64" >> /etc/ld.so.conf - ldconfig - ./configure --prefix=/usr/local/cloudberrydb - make -j8 - make -j8 install - chown -R gpadmin:gpadmin /usr/local - chown -R gpadmin:gpadmin /usr/local/cloudberry* - echo "source /usr/local/cloudberrydb/greenplum_path.sh" >> /home/gpadmin/.bashrc - echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config - systemctl restart sshd - passwd gpadmin - sudo -iu gpadmin ssh-copy-id localhost - echo "localhost" > /home/gpadmin/hosts - mkdir -p /data0/primary/ - mkdir -p /data0/mirror/ - mkdir -p /data0/coordinator/ - chown -R gpadmin:gpadmin /data0 - echo "export COORDINATOR_DATA_DIRECTORY=/data0/coordinator/gpseg-1" >> /home/gpadmin/.bashrc - cp $SCRIPT_DIR/gpinitsystem_config /home/gpadmin/ - chown gpadmin:gpadmin /home/gpadmin/* - sudo -iu gpadmin gpinitsystem -c gpinitsystem_config -h hosts - echo "Database should be up. 
Run the script with the 'test' paramater to run the tests" - -elif [[ $1 == 'test' ]]; then - echo "Will run tests" - cd $SCRIPT_DIR - cp $SCRIPT_DIR/create.sql /home/gpadmin/ - cp $SCRIPT_DIR/queries.sql /home/gpadmin/ - cp $SCRIPT_DIR/run.sh /home/gpadmin/ - chmod +x /home/gpadmin/run.sh - chown gpadmin:gpadmin /home/gpadmin/* - if [[ $2 != 'no_dl' ]]; then sudo -iu gpadmin wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz'; fi - if [[ $2 != 'no_dl' ]]; then sudo -iu gpadmin gzip -d -f hits.tsv.gz; fi - sudo -iu gpadmin chmod 777 ~ hits.tsv - sudo -iu gpadmin psql -d postgres -f /home/gpadmin/create.sql 2>&1 | tee load_out.txt - if grep 'ERROR' load_out.txt - then - exit 1 - fi - sudo -iu gpadmin nohup gpfdist & - if [[ $2 != 'no_dl' ]]; then echo -n "Load time: " - command time -f '%e' sudo -iu gpadmin psql -d postgres -t -c "insert into hits select * from hits_ext;"; fi - if [[ $2 != 'no_dl' ]]; then echo -n "Load time: " - command time -f '%e' sudo -iu gpadmin psql -d postgres -t -c "ANALYZE hits;"; fi - du -sh /data0* - sudo -iu gpadmin /home/gpadmin/run.sh 2>&1 | tee log.txt - cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' |awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -fi +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/cloudberry/check b/cloudberry/check new file mode 100755 index 0000000000..55c5297e8c --- /dev/null +++ b/cloudberry/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +NAME=clickbench-cloudberry + +sudo docker exec "$NAME" sudo -iu gpadmin bash -lc \ + 'psql -d postgres -t -c "SELECT 1"' >/dev/null diff --git a/cloudberry/data-size b/cloudberry/data-size new file mode 100755 index 0000000000..92acac4590 --- /dev/null +++ b/cloudberry/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +sudo du -bcs /data0 2>/dev/null | grep total | awk '{print $1}' diff --git a/cloudberry/install b/cloudberry/install new file mode 100755 index 0000000000..54715cf7a5 --- /dev/null +++ b/cloudberry/install @@ -0,0 +1,179 @@ +#!/bin/bash +# Cloudberry needs a RHEL-family host (yum, grubby, the wheel group, /data0, +# etc.). To make the benchmark portable, we run everything inside a Rocky 9 +# container instead of bare-metal. The other scripts (start/stop/check/load/ +# query) docker-exec into this container. +# +# Idempotent: re-running this script after a successful install does nothing. +set -eu + +NAME=clickbench-cloudberry +IMAGE=rockylinux:9 +CBDB_VERSION=1.5.3 + +# 1. Make sure docker is installed and the daemon is up. +if ! command -v docker >/dev/null 2>&1; then + if command -v apt-get >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io + elif command -v yum >/dev/null 2>&1; then + sudo yum install -y docker + else + echo "cloudberry/install: install docker manually first" >&2 + exit 1 + fi +fi +sudo systemctl start docker 2>/dev/null || sudo service docker start || true + +# 2. Skip if the container is already created and the cluster is initialized. 
+if sudo docker container inspect "$NAME" >/dev/null 2>&1; then
+    if sudo docker exec "$NAME" test -d /data0/coordinator/gpseg-1 2>/dev/null; then
+        echo "cloudberry: container '$NAME' already initialized; nothing to do" >&2
+        exit 0
+    fi
+fi
+
+sudo docker pull "$IMAGE"
+
+# 3. Create a long-lived privileged container. We can't use --network host
+#    because gpinitsystem's ssh-to-localhost would land on the host's sshd.
+#    Bridge mode gives the container its own port 22 (isolated to the
+#    container's sshd); we publish 5432 so clients on the host can reach
+#    postgres.
+if ! sudo docker container inspect "$NAME" >/dev/null 2>&1; then
+    sudo docker run -d --privileged --hostname localhost -p 5432:5432 \
+        --name "$NAME" "$IMAGE" sleep infinity
+fi
+
+# 4. Inside the container: install build deps, kernel tuning, gpadmin user,
+#    sshd, build Cloudberry from source, init a single-segment cluster.
+#
+# The whole RHEL-specific install lives in a heredoc so this script remains
+# the single source of truth. It mirrors the original cloudberry/install
+# logic minus the host-only bits (grubby, systemctl restart sshd, reboot).
+sudo docker exec -i "$NAME" bash -s <<EOF
+set -eu
+
+# Kernel/IPC settings from the original host-level install.
+SHMALL=\$(expr \$(getconf _PHYS_PAGES) / 2)
+SHMAX=\$(expr \$(getconf _PHYS_PAGES) / 2 \* \$(getconf PAGE_SIZE))
+sysctl -w kernel.shmall=\$SHMALL >/dev/null
+sysctl -w kernel.shmmax=\$SHMAX >/dev/null
+sysctl -w kernel.shmmni=4096 >/dev/null
+sysctl -w vm.overcommit_memory=2 >/dev/null
+sysctl -w vm.overcommit_ratio=95 >/dev/null
+sysctl -w net.ipv4.ip_local_port_range="10000 65535" >/dev/null
+sysctl -w kernel.sem="250 2048000 200 8192" >/dev/null
+
+cat > /etc/security/limits.d/99-gpadmin.conf <<LIMITS
+* soft nofile 524288
+* hard nofile 524288
+* soft nproc 131072
+* hard nproc 131072
+LIMITS
+
+getent group gpadmin >/dev/null || groupadd gpadmin
+id gpadmin >/dev/null 2>&1 || useradd gpadmin -m -g gpadmin
+echo "gpadmin ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/gpadmin
+sudo -u gpadmin bash -c '
+    set -e
+    mkdir -p ~/.ssh
+    chmod 700 ~/.ssh
+    [ -f ~/.ssh/id_rsa ] || ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa
+    cat ~/.ssh/id_rsa.pub > ~/.ssh/authorized_keys
+    chmod 600 ~/.ssh/authorized_keys
+'
+
+# /bin/ping in this image has no caps and no setuid bit, so even root can't
+# open a raw socket. gpinitsystem pings localhost to verify reachability.
+setcap cap_net_raw+ep /bin/ping
+[ -e /bin/ping6 ] && setcap cap_net_raw+ep /bin/ping6 || true
+
+# sshd setup (ssh-keygen -A creates host keys).
+ssh-keygen -A
+mkdir -p /run/sshd
+echo "PasswordAuthentication no" >> /etc/ssh/sshd_config
+echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
+/usr/sbin/sshd
+
+# Build Cloudberry from source if not already built.
+if [ ! -x /usr/local/cloudberrydb/bin/postgres ]; then
+    cd /tmp
+    if [ ! -f cloudberry-${CBDB_VERSION}.tar.gz ]; then
+        curl -fsSL -o cloudberry-${CBDB_VERSION}.tar.gz \
+            https://github.com/cloudberrydb/cloudberrydb/archive/refs/tags/${CBDB_VERSION}.tar.gz
+    fi
+    rm -rf cloudberry-${CBDB_VERSION}
+    tar -xzf cloudberry-${CBDB_VERSION}.tar.gz
+    cd cloudberry-${CBDB_VERSION}
+    echo -e "/usr/local/lib\n/usr/local/lib64" >> /etc/ld.so.conf
+    ldconfig
+    ./configure --prefix=/usr/local/cloudberrydb
+    # Cap parallelism: cloudberry's build (especially the link phase) eats
+    # several GB per worker; on big boxes \$(nproc) can OOM the host.
+    JOBS=\$(nproc); JOBS=\$(( JOBS > 16 ? 16 : JOBS ))
+    make -j\$JOBS
+    make install
+    chown -R gpadmin:gpadmin /usr/local/cloudberrydb
+fi
+
+# Cloudberry's gpstart/gpstop import the python pgdb (PygreSQL) module.
+# Build PygreSQL against cloudberry's pg_config so the right libpq is
+# linked. 5.2.5 is the last release before the pgdb→pg rename.
+source /usr/local/cloudberrydb/greenplum_path.sh
+pip3 install --quiet 'PygreSQL==5.2.5' psutil
+
+# Data dirs and gpadmin shell env.
+mkdir -p /data0/primary /data0/coordinator +chown -R gpadmin:gpadmin /data0 +grep -q greenplum_path /home/gpadmin/.bashrc || \ + echo "source /usr/local/cloudberrydb/greenplum_path.sh" >> /home/gpadmin/.bashrc +grep -q COORDINATOR_DATA_DIRECTORY /home/gpadmin/.bashrc || \ + echo "export COORDINATOR_DATA_DIRECTORY=/data0/coordinator/gpseg-1" >> /home/gpadmin/.bashrc + +echo localhost > /home/gpadmin/hosts +chown gpadmin:gpadmin /home/gpadmin/hosts +EOF + +# 5. Drop gpinitsystem_config in and run gpinitsystem. +sudo docker cp "$(dirname "$0")/gpinitsystem_config" \ + "$NAME":/home/gpadmin/gpinitsystem_config +sudo docker exec "$NAME" chown gpadmin:gpadmin /home/gpadmin/gpinitsystem_config + +sudo docker exec "$NAME" sudo -iu gpadmin bash -lc \ + 'gpinitsystem -a -c gpinitsystem_config -h hosts' diff --git a/cloudberry/load b/cloudberry/load new file mode 100755 index 0000000000..85dd0cc740 --- /dev/null +++ b/cloudberry/load @@ -0,0 +1,32 @@ +#!/bin/bash +set -eu + +NAME=clickbench-cloudberry + +# Move hits.tsv and create.sql into the container so gpadmin/gpfdist can +# read them. `docker cp` is fine; the file is ~75GB but only copied once. +# Stream hits.tsv and create.sql into the container via tar so that any +# symlinks on the host (e.g. hits.tsv -> /elsewhere) are dereferenced. +# `tar -h` follows symlinks; piping through `docker exec -i ... tar` +# avoids `docker cp`'s symlink-passes-through behavior. +tar -ch hits.tsv create.sql | \ + sudo docker exec -i "$NAME" tar -xC /home/gpadmin +sudo docker exec "$NAME" chown gpadmin:gpadmin \ + /home/gpadmin/hits.tsv /home/gpadmin/create.sql + +sudo docker exec "$NAME" sudo -iu gpadmin bash -lc \ + 'psql -d postgres -v ON_ERROR_STOP=1 -f /home/gpadmin/create.sql' + +# gpfdist serves hits.tsv to the gpfdist:// foreign table referenced by +# hits_ext. Spawn it once; subsequent loads reuse the running daemon. +sudo docker exec "$NAME" sudo -iu gpadmin bash -lc \ + 'pgrep -u gpadmin gpfdist || nohup gpfdist -d /home/gpadmin >/tmp/gpfdist.log 2>&1 &' + +sudo docker exec "$NAME" sudo -iu gpadmin bash -lc \ + 'psql -d postgres -v ON_ERROR_STOP=1 -t -c "INSERT INTO hits SELECT * FROM hits_ext;"' +sudo docker exec "$NAME" sudo -iu gpadmin bash -lc \ + 'psql -d postgres -v ON_ERROR_STOP=1 -t -c "ANALYZE hits;"' + +sudo docker exec "$NAME" rm -f /home/gpadmin/hits.tsv +rm -f hits.tsv +sync diff --git a/cloudberry/query b/cloudberry/query new file mode 100755 index 0000000000..ba22bd7a93 --- /dev/null +++ b/cloudberry/query @@ -0,0 +1,31 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the postgres DB +# inside the clickbench-cloudberry container as gpadmin. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +NAME=clickbench-cloudberry +query=$(cat) + +# Pipe '\timing' + query through psql inside the container under gpadmin. +out=$(printf '\\timing\n%s\n' "$query" | \ + sudo docker exec -i "$NAME" sudo -iu gpadmin bash -lc 'psql -d postgres -t' 2>&1) +status=$? 
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/cloudberry/run.sh b/cloudberry/run.sh deleted file mode 100755 index 23a2756b7f..0000000000 --- a/cloudberry/run.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - echo '\timing' > /tmp/query_temp.sql - echo "$query" >> /tmp/query_temp.sql - psql -d postgres -t -f /tmp/query_temp.sql 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/cloudberry/start b/cloudberry/start new file mode 100755 index 0000000000..1edbbf305e --- /dev/null +++ b/cloudberry/start @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +NAME=clickbench-cloudberry + +# Bring the docker container up if it isn't already. +sudo docker start "$NAME" >/dev/null + +# sshd doesn't auto-start in the container; gpinitsystem/gpstart need ssh +# to localhost. Then bring the cluster up. gpstart is a no-op if it's +# already running. +sudo docker exec "$NAME" bash -c 'pgrep sshd >/dev/null || /usr/sbin/sshd' +sudo docker exec "$NAME" sudo -iu gpadmin bash -lc 'gpstart -a' || true diff --git a/cloudberry/stop b/cloudberry/stop new file mode 100755 index 0000000000..4debf4c4f0 --- /dev/null +++ b/cloudberry/stop @@ -0,0 +1,8 @@ +#!/bin/bash + +NAME=clickbench-cloudberry + +# Bring the cluster down inside the container, then pause the container. +# Both steps tolerate "already stopped". 
+sudo docker exec "$NAME" sudo -iu gpadmin bash -lc 'gpstop -a' 2>/dev/null || true +sudo docker stop "$NAME" >/dev/null 2>&1 || true diff --git a/cockroachdb/benchmark.sh b/cockroachdb/benchmark.sh index 4951f6cfe5..8ae96793b8 100755 --- a/cockroachdb/benchmark.sh +++ b/cockroachdb/benchmark.sh @@ -1,47 +1,9 @@ #!/bin/bash - -CRDBVERSION=25.1.6 -CRDBDATADIR=/var/lib/cockroach-data - -sudo apt-get update -y -# Includes unbuffer utility program -sudo apt-get install -y expect-dev - -wget --continue --progress=dot:giga https://binaries.cockroachdb.com/cockroach-v$CRDBVERSION.linux-$(dpkg --print-architecture).tgz -tar -xvzf cockroach-v$CRDBVERSION.linux-$(dpkg --print-architecture).tgz -sudo cp -r cockroach-v$CRDBVERSION.linux-$(dpkg --print-architecture)/* /usr/local/bin/ -# Build Tag: v25.1.6 -cockroach version | grep "^Build Tag" -sudo mkdir -p $CRDBDATADIR -# Increase cache size to 25% for better read performance -# For details see https://www.cockroachlabs.com/docs/v25.1/recommended-production-settings#cache-and-sql-memory-size -sudo cockroach start-single-node --insecure --listen-addr=localhost --background --store=$CRDBDATADIR --cache=.25 --pid-file=crdb.pid - -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.csv.gz' -O /tmp/hits.csv.gz -# Make data file available in "extern" directory, so it can be loaded via nodelocal -sudo mkdir -p $CRDBDATADIR/extern -gzip -d -c /tmp/hits.csv.gz | sudo tee $CRDBDATADIR/extern/hits.csv > /dev/null - -# Deactivate query plan cache -# For details see https://www.cockroachlabs.com/docs/v25.1/cost-based-optimizer#query-plan-cache -cockroach sql --insecure --host=localhost --execute='SET CLUSTER SETTING sql.query_cache.enabled = false;' - -cockroach sql --insecure --host=localhost --execute='CREATE DATABASE test;' -cockroach sql --insecure --host=localhost --database=test --file='create.sql' -START=$(date +%s) -cockroach sql --insecure --host=localhost --database=test --execute="IMPORT INTO hits(WatchID, JavaEnable, Title, GoodEvent, EventTime, EventDate, CounterID, ClientIP, RegionID, UserID, CounterClass, OS, UserAgent, URL, Referer, IsRefresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, ClientEventTime, SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, FUniqID, OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, LocalEventTime, Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, RefererHash, URLHash, CLID) CSV DATA ('nodelocal://1/hits.csv');" -END=$(date +%s) -echo "Load time: $(echo "$END - $START" | bc)" - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " 
-cockroach sql --insecure --host=localhost --database=test --execute="SELECT SUM(range_size) FROM [SHOW RANGES FROM TABLE hits WITH DETAILS];" | tail -n1 - -# Values might be given in ms or s, depending on their magnitude -grep -oP 'Time: \K[\d.]+s|Time: \K\d+ms' log.txt | - sed -E 's/([0-9]+(\.[0-9]+)?)s/\1/; s/([0-9]+)ms/\1\/1000/' | - awk '{if ($0 ~ /\//) {print $1/1000} else {print $0}}' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -sudo killall cockroach +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=yes +# cockroach replays its WAL on each restart; after the 60 GB+ IMPORT +# that takes long enough that the lib's default 300 s check window +# times out before SELECT 1 succeeds again. 900 s covers it. +export BENCH_CHECK_TIMEOUT=900 +exec ../lib/benchmark-common.sh diff --git a/cockroachdb/check b/cockroachdb/check new file mode 100755 index 0000000000..436921498d --- /dev/null +++ b/cockroachdb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +cockroach sql --insecure --host=localhost -e 'SELECT 1' >/dev/null 2>&1 diff --git a/cockroachdb/data-size b/cockroachdb/data-size new file mode 100755 index 0000000000..ee02785dae --- /dev/null +++ b/cockroachdb/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +set -eu + +cockroach sql --insecure --host=localhost --database=test --format=tsv \ + --execute="SELECT SUM(range_size) FROM [SHOW RANGES FROM TABLE hits WITH DETAILS];" \ + | tail -n1 diff --git a/cockroachdb/install b/cockroachdb/install new file mode 100755 index 0000000000..60cbf116ed --- /dev/null +++ b/cockroachdb/install @@ -0,0 +1,18 @@ +#!/bin/bash +set -eu + +CRDBVERSION=${CRDBVERSION:-25.1.6} + +sudo apt-get update -y +# expect-dev provides `unbuffer` (used in query script). +sudo apt-get install -y expect-dev wget bc + +if [ ! -x /usr/local/bin/cockroach ]; then + arch=$(dpkg --print-architecture) + wget --continue --progress=dot:giga \ + "https://binaries.cockroachdb.com/cockroach-v${CRDBVERSION}.linux-${arch}.tgz" + tar -xvzf "cockroach-v${CRDBVERSION}.linux-${arch}.tgz" + sudo cp -r "cockroach-v${CRDBVERSION}.linux-${arch}/"* /usr/local/bin/ +fi + +sudo mkdir -p /var/lib/cockroach-data diff --git a/cockroachdb/load b/cockroachdb/load new file mode 100755 index 0000000000..2afaf6709b --- /dev/null +++ b/cockroachdb/load @@ -0,0 +1,18 @@ +#!/bin/bash +set -eu + +CRDBDATADIR=/var/lib/cockroach-data + +# Stage data into cockroach's "extern" directory so it can be loaded via nodelocal://. 
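+# (In the IMPORT below, nodelocal://1/hits.csv resolves to node 1's external
+# I/O directory, which for this single-node, single-store setup is
+# $CRDBDATADIR/extern, where the file is staged above.)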
+sudo mkdir -p "$CRDBDATADIR/extern" +sudo cp hits.csv "$CRDBDATADIR/extern/hits.csv" + +cockroach sql --insecure --host=localhost --execute='DROP DATABASE IF EXISTS test CASCADE;' +cockroach sql --insecure --host=localhost --execute='CREATE DATABASE test;' +cockroach sql --insecure --host=localhost --database=test --file='create.sql' + +cockroach sql --insecure --host=localhost --database=test --execute="IMPORT INTO hits(WatchID, JavaEnable, Title, GoodEvent, EventTime, EventDate, CounterID, ClientIP, RegionID, UserID, CounterClass, OS, UserAgent, URL, Referer, IsRefresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, ClientEventTime, SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, FUniqID, OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, LocalEventTime, Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, RefererHash, URLHash, CLID) CSV DATA ('nodelocal://1/hits.csv');" + +sudo rm -f "$CRDBDATADIR/extern/hits.csv" +rm -f hits.csv +sync diff --git a/cockroachdb/query b/cockroachdb/query new file mode 100755 index 0000000000..7dc4737d1e --- /dev/null +++ b/cockroachdb/query @@ -0,0 +1,41 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via cockroach sql. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# the "Time:" footer; cockroach prints either "Ns" or "Nms"). +# Exit non-zero on error. +# +# Note: cockroach sql only emits the elapsed-time footer when stdout is a TTY. +# We use `unbuffer` to fool isatty(). +set -e + +query=$(cat) + +raw=$(unbuffer cockroach sql --insecure --host=localhost --database=test \ + --execute="$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qiE '^ERROR:|^pq:'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" + +# Pull the LAST "Time: " line. CockroachDB uses s, ms, µs. +# Magnitude examples: "Time: 1.23s", "Time: 45ms", "Time: 678µs". 
+t_line=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+(s|ms|µs|us)' | tail -n1) + +if [ -z "$t_line" ]; then + echo "no Time: footer in cockroach output" >&2 + exit 1 +fi + +awk -v t="$t_line" 'BEGIN { + if (match(t, /[0-9.]+/)) { + v = substr(t, RSTART, RLENGTH) + u = substr(t, RSTART+RLENGTH) + if (u == "ms") { printf "%.3f\n", v / 1000 } + else if (u == "µs" || u == "us") { printf "%.6f\n", v / 1000000 } + else { printf "%.3f\n", v } + } +}' >&2 diff --git a/cockroachdb/run.sh b/cockroachdb/run.sh deleted file mode 100755 index 896025cc7a..0000000000 --- a/cockroachdb/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - # Apparently, cockroach sql only writes the elapsed time of a statement to file descriptors that refer to a terminal (cf. isatty()). - # Since we *pipe* the output into grep, we need to use unbuffer. - unbuffer cockroach sql --insecure --host=localhost --database=test --execute="${query}" | grep 'Time' - done; -done; diff --git a/cockroachdb/start b/cockroachdb/start new file mode 100755 index 0000000000..b614e3ab44 --- /dev/null +++ b/cockroachdb/start @@ -0,0 +1,16 @@ +#!/bin/bash +set -eu + +CRDBDATADIR=/var/lib/cockroach-data + +if cockroach sql --insecure --host=localhost -e 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +# Cache=25% per CockroachDB production tuning recommendations. +sudo cockroach start-single-node --insecure --listen-addr=localhost --background \ + --store="$CRDBDATADIR" --cache=.25 --pid-file=crdb.pid + +# Disable plan cache to keep timings honest run-over-run. +cockroach sql --insecure --host=localhost \ + --execute='SET CLUSTER SETTING sql.query_cache.enabled = false;' diff --git a/cockroachdb/stop b/cockroachdb/stop new file mode 100755 index 0000000000..3440031f55 --- /dev/null +++ b/cockroachdb/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +# cockroach has no clean single-node stop; killall is the documented approach. +sudo killall cockroach 2>/dev/null || true diff --git a/collect-results.sh b/collect-results.sh index 2426433543..fe6f3af1a6 100755 --- a/collect-results.sh +++ b/collect-results.sh @@ -2,4 +2,12 @@ # Writes one JSON per (system, machine) under /results//.json, # where YYYYMMDD is taken from the recorded UTC time of the result. -clickhouse-client --query "SELECT format(\$\$SELECT output FROM sink.results WHERE system = '{0}' AND machine = '{1}' ORDER BY time DESC LIMIT 1 INTO OUTFILE '{0}/results/{2}/{1}.json' TRUNCATE FORMAT Raw SETTINGS into_outfile_create_parent_directories = 1;\$\$, system, machine, formatDateTime(time, '%Y%m%d', 'UTC')) FROM sink.results WHERE time >= today() - INTERVAL 1 WEEK LIMIT 1 BY system, machine FORMAT Raw" | clickhouse-client +# +# `LIMIT 1 BY system, machine` without an explicit `ORDER BY` returned an +# arbitrary row per (system, machine) — for systems with several runs in +# the past week, that picked an older row's date for the output directory +# while the inner `ORDER BY time DESC LIMIT 1` still wrote the latest +# content. The result was JSON files landing in the wrong dated subdir +# and `generate-results.sh` never seeing today's entries. Order +# explicitly so LIMIT 1 BY keeps the latest row for each pair. 
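+#
+# A made-up example: if sink.results holds rows for (clickhouse, c6a.4xlarge)
+# recorded on 2026-05-02 and 2026-05-09, the unordered LIMIT 1 BY could keep
+# the 05-02 row, writing the file under results/20260502/ even though the
+# inner query's content came from the 05-09 run; with ORDER BY time DESC the
+# 05-09 row is kept, so the dated directory and the content agree.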
+clickhouse-client --query "SELECT format(\$\$SELECT output FROM sink.results WHERE system = '{0}' AND machine = '{1}' ORDER BY time DESC LIMIT 1 INTO OUTFILE '{0}/results/{2}/{1}.json' TRUNCATE FORMAT Raw SETTINGS into_outfile_create_parent_directories = 1;\$\$, system, machine, formatDateTime(time, '%Y%m%d', 'UTC')) FROM sink.results WHERE time >= today() - INTERVAL 1 WEEK ORDER BY time DESC LIMIT 1 BY system, machine FORMAT Raw" | clickhouse-client diff --git a/cratedb/benchmark.sh b/cratedb/benchmark.sh index 7d5307b29f..a427fca3e2 100755 --- a/cratedb/benchmark.sh +++ b/cratedb/benchmark.sh @@ -1,79 +1,11 @@ #!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Set CRATEDB_MODE=tuned to use create-tuned.sql + queries-tuned.sql. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes -# Tuned execution if "tuned" is passed, default mode otherwise -MODE=$1 - -if [[ $MODE == "tuned" ]]; then - CREATE_FILE="create-tuned.sql" - EMPTY_STRING_AS_NULL=TRUE -else - CREATE_FILE="create.sql" - EMPTY_STRING_AS_NULL=FALSE -fi; - -# Install prerequisites. -sudo apt-get update -y -sudo apt-get install -y apt-transport-https apt-utils curl gnupg lsb-release - -# Import the public GPG key for verifying the package signatures. -curl -sS https://cdn.crate.io/downloads/debian/DEB-GPG-KEY-crate | \ - sudo tee /etc/apt/trusted.gpg.d/cratedb.asc - -# Add CrateDB repository to Apt -echo "deb https://cdn.crate.io/downloads/debian/testing/ default main" | \ - sudo tee /etc/apt/sources.list.d/crate-stable.list - -sudo apt-get update -y -sudo apt-get install -y postgresql-client crate - -sudo systemctl start crate - -for _ in {1..300} -do - psql -U crate -h localhost --no-password -t -c 'SELECT 1' && break - sleep 1 -done - -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' -O /tmp/hits.tsv.gz -gzip -d -f /tmp/hits.tsv.gz -chmod 444 /tmp/hits.tsv - -psql -U crate -h localhost --no-password -t < $CREATE_FILE 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 +if [ "${CRATEDB_MODE:-default}" = "tuned" ]; then + export BENCH_QUERIES_FILE="queries-tuned.sql" fi -START=$(date +%s) -command time -f '%e' psql -U crate -h localhost --no-password -q -t -c " - COPY hits - FROM 'file:///tmp/hits.tsv' - WITH - ( - "delimiter"=e'\t', - "format"='csv', - "header"=false, - "empty_string_as_null"=${EMPTY_STRING_AS_NULL} - ) - RETURN SUMMARY;" -END=$(date +%s) -echo "Load time: $(echo "$END - $START" | bc)" - -# One record did not load: -# 99997496 -# {"Missing closing quote for value\n at [Source: UNKNOWN; line: 1, column: 1069]":{"count":1,"line_numbers":[93557187]}} -# Time: 10687056.069 ms (02:58:07.056) - -if [[ $MODE == "tuned" ]]; then - psql -U crate -h localhost --no-password -t -c "REFRESH TABLE hits; OPTIMIZE TABLE hits;" -fi; - -# Some queries don't fit into the available heap space and raise an CircuitBreakingException -./run.sh "$MODE" 2>&1 | tee log.txt - -# Look up shard sizes from system tables. Only consider primary shards in case of multi-node setups with replication. -echo -n "Data size: " -psql -U crate -h localhost --no-password -q -t -c "SELECT SUM(size) FROM sys.shards WHERE table_name = 'hits' AND primary = TRUE;" - -grep -oP 'Time: \d+\.\d+ ms|ERROR' < log.txt | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/' | - awk '{ if ($1 == "ERROR") { skip = 1 } else { if (i % 3 == 0) { printf "[" }; printf skip ? 
"null" : ($1 / 1000); if (i % 3 != 2) { printf "," } else { print "]," }; ++i; skip = 0; } }' +exec ../lib/benchmark-common.sh diff --git a/cratedb/check b/cratedb/check new file mode 100755 index 0000000000..a29db6dad0 --- /dev/null +++ b/cratedb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +psql -U crate -h localhost --no-password -t -c 'SELECT 1' >/dev/null diff --git a/cratedb/data-size b/cratedb/data-size new file mode 100755 index 0000000000..66d3be5c91 --- /dev/null +++ b/cratedb/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +psql -U crate -h localhost --no-password -q -t -A \ + -c "SELECT SUM(size) FROM sys.shards WHERE table_name = 'hits' AND primary = TRUE;" diff --git a/cratedb/install b/cratedb/install new file mode 100755 index 0000000000..7f74b45695 --- /dev/null +++ b/cratedb/install @@ -0,0 +1,18 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y apt-transport-https apt-utils curl gnupg lsb-release + +if [ ! -f /etc/apt/trusted.gpg.d/cratedb.asc ]; then + curl -sS https://cdn.crate.io/downloads/debian/DEB-GPG-KEY-crate \ + | sudo tee /etc/apt/trusted.gpg.d/cratedb.asc >/dev/null +fi + +if [ ! -f /etc/apt/sources.list.d/crate-stable.list ]; then + echo "deb https://cdn.crate.io/downloads/debian/testing/ default main" \ + | sudo tee /etc/apt/sources.list.d/crate-stable.list + sudo apt-get update -y +fi + +sudo apt-get install -y postgresql-client crate diff --git a/cratedb/load b/cratedb/load new file mode 100755 index 0000000000..2b0b58c466 --- /dev/null +++ b/cratedb/load @@ -0,0 +1,37 @@ +#!/bin/bash +set -eu + +# CrateDB has two schemas (default vs tuned); MODE env selects. +MODE=${CRATEDB_MODE:-default} +if [[ $MODE == "tuned" ]]; then + CREATE_FILE="create-tuned.sql" + EMPTY_STRING_AS_NULL=TRUE +else + CREATE_FILE="create.sql" + EMPTY_STRING_AS_NULL=FALSE +fi + +# Stage data into a known location. +mv hits.tsv /tmp/hits.tsv +chmod 444 /tmp/hits.tsv + +psql -U crate -h localhost --no-password -t < "$CREATE_FILE" + +psql -U crate -h localhost --no-password -q -t -c " + COPY hits + FROM 'file:///tmp/hits.tsv' + WITH + ( + \"delimiter\"=e'\t', + \"format\"='csv', + \"header\"=false, + \"empty_string_as_null\"=${EMPTY_STRING_AS_NULL} + ) + RETURN SUMMARY;" + +if [[ $MODE == "tuned" ]]; then + psql -U crate -h localhost --no-password -t -c "REFRESH TABLE hits; OPTIMIZE TABLE hits;" +fi + +rm -f /tmp/hits.tsv +sync diff --git a/cratedb/query b/cratedb/query new file mode 100755 index 0000000000..eb208e78db --- /dev/null +++ b/cratedb/query @@ -0,0 +1,26 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against CrateDB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# psql's `\timing` "Time: ms" output). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(psql -U crate -h localhost --no-password -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? 
+ +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/cratedb/run.sh b/cratedb/run.sh deleted file mode 100755 index 89e6afbb11..0000000000 --- a/cratedb/run.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -MODE=$1 -TRIES=3 - -if [[ $MODE == "tuned" ]]; then - FILE_NAME="queries-tuned.sql" -else - FILE_NAME="queries.sql" -fi; - -cat $FILE_NAME | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - psql -U crate -h localhost --no-password -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/cratedb/start b/cratedb/start new file mode 100755 index 0000000000..4489d806b3 --- /dev/null +++ b/cratedb/start @@ -0,0 +1,7 @@ +#!/bin/bash +set -eu + +if psql -U crate -h localhost --no-password -t -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi +sudo systemctl start crate diff --git a/cratedb/stop b/cratedb/stop new file mode 100755 index 0000000000..181c6fa60c --- /dev/null +++ b/cratedb/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo systemctl stop crate || true diff --git a/daft-parquet-partitioned/benchmark.sh b/daft-parquet-partitioned/benchmark.sh index 4d1a3920ec..3b63e772a6 100755 --- a/daft-parquet-partitioned/benchmark.sh +++ b/daft-parquet-partitioned/benchmark.sh @@ -1,19 +1,5 @@ #!/bin/bash - -# Install -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install pandas -pip install packaging -pip install daft==0.7.4 - -../download-hits-parquet-partitioned - -mode=partitioned -echo "Running $mode mode..." -./run.sh $machine_name $mode 2>&1 | tee "daft_log_${mode}.txt" - -echo "Load time: 0" -echo "Data size: $(du -bcs hits*.parquet | grep total)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/daft-parquet-partitioned/check b/daft-parquet-partitioned/check new file mode 100755 index 0000000000..0c4b301a2d --- /dev/null +++ b/daft-parquet-partitioned/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/daft-parquet-partitioned/data-size b/daft-parquet-partitioned/data-size new file mode 100755 index 0000000000..365ad4ecc8 --- /dev/null +++ b/daft-parquet-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/daft-parquet-partitioned/install b/daft-parquet-partitioned/install new file mode 100755 index 0000000000..54c195180e --- /dev/null +++ b/daft-parquet-partitioned/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! 
-d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet pandas packaging 'daft==0.7.4' fastapi uvicorn diff --git a/daft-parquet-partitioned/load b/daft-parquet-partitioned/load new file mode 100755 index 0000000000..65019c6623 --- /dev/null +++ b/daft-parquet-partitioned/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Server reads hits.parquet from CWD into memory. Daft is lazy; reading just +# builds the plan and types — actual data is loaded on first query. +elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +sync diff --git a/daft-parquet-partitioned/query b/daft-parquet-partitioned/query new file mode 100755 index 0000000000..6366d71600 --- /dev/null +++ b/daft-parquet-partitioned/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running daft server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/daft-parquet-partitioned/query.py b/daft-parquet-partitioned/query.py deleted file mode 100755 index b761767507..0000000000 --- a/daft-parquet-partitioned/query.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 - -import daft -import os -import sys -import timeit -import traceback -from daft import col, DataType - -hits = None -current_dir = os.path.dirname(os.path.abspath(__file__)) -query_idx = int(sys.argv[1]) - 1 -is_single_mode = len(sys.argv) > 2 and sys.argv[2] == "single" -parquet_path = os.path.join( - current_dir, - "hits.parquet" if is_single_mode else "hits_*.parquet" -) - -with open("queries.sql") as f: - sql_list = [q.strip() for q in f.read().split(';') if q.strip()] - -def run_single_query(sql, i): - try: - start = timeit.default_timer() - - global hits - if hits is None: - hits = daft.read_parquet(parquet_path) - hits = hits.with_column("EventTime", col("EventTime").cast(DataType.timestamp("s"))) - hits = hits.with_column("EventDate", col("EventDate").cast(DataType.date())) - hits = hits.with_column("URL", col("URL").decode("utf-8")) - hits = hits.with_column("Title", col("Title").decode("utf-8")) - hits = hits.with_column("Referer", col("Referer").decode("utf-8")) - hits = hits.with_column("MobilePhoneModel", col("MobilePhoneModel").decode("utf-8")) - hits = hits.with_column("SearchPhrase", col("SearchPhrase").decode("utf-8")) - - result = daft.sql(sql) - result.collect() - - run_time = round(timeit.default_timer() - start, 3) - return run_time - except Exception as e: - print(f"Error executing query {query_idx}: {str(e)[:100]}", file=sys.stderr) - traceback.print_exc() - return None - -if __name__ == "__main__": - sql = sql_list[query_idx] - times = [] - for i in range(3): - elapsed = run_single_query(sql, i) - times.append(f"{elapsed}" if elapsed else "") - print(','.join(times)) diff --git a/daft-parquet-partitioned/results/20250410/c6a.4xlarge.json b/daft-parquet-partitioned/results/20250410/c6a.4xlarge.json index c0b80db43e..4d3094f63d 100644 --- 
a/daft-parquet-partitioned/results/20250410/c6a.4xlarge.json +++ b/daft-parquet-partitioned/results/20250410/c6a.4xlarge.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14779976446, diff --git a/daft-parquet-partitioned/results/20250410/c6a.metal.json b/daft-parquet-partitioned/results/20250410/c6a.metal.json index 02dbd26962..f103db6309 100644 --- a/daft-parquet-partitioned/results/20250410/c6a.metal.json +++ b/daft-parquet-partitioned/results/20250410/c6a.metal.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14779976446, diff --git a/daft-parquet-partitioned/results/20250506/c6a.4xlarge.json b/daft-parquet-partitioned/results/20250506/c6a.4xlarge.json index 14d1c5442e..3ec4a6d5e6 100644 --- a/daft-parquet-partitioned/results/20250506/c6a.4xlarge.json +++ b/daft-parquet-partitioned/results/20250506/c6a.4xlarge.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14779976446, diff --git a/daft-parquet-partitioned/results/20250507/c6a.metal.json b/daft-parquet-partitioned/results/20250507/c6a.metal.json index f8c8f0b177..e38dd30a0c 100644 --- a/daft-parquet-partitioned/results/20250507/c6a.metal.json +++ b/daft-parquet-partitioned/results/20250507/c6a.metal.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14779976446, diff --git a/daft-parquet-partitioned/results/20250710/c6a.2xlarge.json b/daft-parquet-partitioned/results/20250710/c6a.2xlarge.json index 8f2a69e143..fa25ca37e5 100644 --- a/daft-parquet-partitioned/results/20250710/c6a.2xlarge.json +++ b/daft-parquet-partitioned/results/20250710/c6a.2xlarge.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14737666736, diff --git a/daft-parquet-partitioned/results/20250710/c6a.4xlarge.json b/daft-parquet-partitioned/results/20250710/c6a.4xlarge.json index 9071991815..1a0821b5e9 100644 --- a/daft-parquet-partitioned/results/20250710/c6a.4xlarge.json +++ b/daft-parquet-partitioned/results/20250710/c6a.4xlarge.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14737666736, diff --git a/daft-parquet-partitioned/results/20250711/c6a.large.json b/daft-parquet-partitioned/results/20250711/c6a.large.json index c634e6fc76..63a7075daa 100644 --- a/daft-parquet-partitioned/results/20250711/c6a.large.json +++ b/daft-parquet-partitioned/results/20250711/c6a.large.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14737666736, diff --git a/daft-parquet-partitioned/results/20250711/c6a.xlarge.json b/daft-parquet-partitioned/results/20250711/c6a.xlarge.json index 4641be1192..ac61a9d963 100644 --- a/daft-parquet-partitioned/results/20250711/c6a.xlarge.json +++ b/daft-parquet-partitioned/results/20250711/c6a.xlarge.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14737666736, diff --git a/daft-parquet-partitioned/results/20250712/c8g.4xlarge.json b/daft-parquet-partitioned/results/20250712/c8g.4xlarge.json index 1734625db6..8480694649 100644 
--- a/daft-parquet-partitioned/results/20250712/c8g.4xlarge.json +++ b/daft-parquet-partitioned/results/20250712/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20250712/t3a.small.json b/daft-parquet-partitioned/results/20250712/t3a.small.json index 1ac0a3e787..cb0e712ee4 100644 --- a/daft-parquet-partitioned/results/20250712/t3a.small.json +++ b/daft-parquet-partitioned/results/20250712/t3a.small.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14737666736, diff --git a/daft-parquet-partitioned/results/20250830/c7a.metal-48xl.json b/daft-parquet-partitioned/results/20250830/c7a.metal-48xl.json index e47ef92d73..c4eceff457 100644 --- a/daft-parquet-partitioned/results/20250830/c7a.metal-48xl.json +++ b/daft-parquet-partitioned/results/20250830/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20250831/c6a.2xlarge.json b/daft-parquet-partitioned/results/20250831/c6a.2xlarge.json index f288aec5ca..d79d7ffab6 100644 --- a/daft-parquet-partitioned/results/20250831/c6a.2xlarge.json +++ b/daft-parquet-partitioned/results/20250831/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20250831/c6a.4xlarge.json b/daft-parquet-partitioned/results/20250831/c6a.4xlarge.json index e123d90235..eb79806414 100644 --- a/daft-parquet-partitioned/results/20250831/c6a.4xlarge.json +++ b/daft-parquet-partitioned/results/20250831/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20250831/c6a.large.json b/daft-parquet-partitioned/results/20250831/c6a.large.json index 7f41b53bdb..ec97d07db5 100644 --- a/daft-parquet-partitioned/results/20250831/c6a.large.json +++ b/daft-parquet-partitioned/results/20250831/c6a.large.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20250831/c6a.metal.json b/daft-parquet-partitioned/results/20250831/c6a.metal.json index bca7791d16..cc72389a02 100644 --- a/daft-parquet-partitioned/results/20250831/c6a.metal.json +++ b/daft-parquet-partitioned/results/20250831/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20250831/c6a.xlarge.json 
b/daft-parquet-partitioned/results/20250831/c6a.xlarge.json index e969b21653..e28df5594a 100644 --- a/daft-parquet-partitioned/results/20250831/c6a.xlarge.json +++ b/daft-parquet-partitioned/results/20250831/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20250831/t3a.small.json b/daft-parquet-partitioned/results/20250831/t3a.small.json index 9a78b8e62d..f65606c16e 100644 --- a/daft-parquet-partitioned/results/20250831/t3a.small.json +++ b/daft-parquet-partitioned/results/20250831/t3a.small.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20260330/c6a.4xlarge.json b/daft-parquet-partitioned/results/20260330/c6a.4xlarge.json index daf824454b..e6a5894b68 100644 --- a/daft-parquet-partitioned/results/20260330/c6a.4xlarge.json +++ b/daft-parquet-partitioned/results/20260330/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20260331/c6a.2xlarge.json b/daft-parquet-partitioned/results/20260331/c6a.2xlarge.json index a046045686..e8aae920c0 100644 --- a/daft-parquet-partitioned/results/20260331/c6a.2xlarge.json +++ b/daft-parquet-partitioned/results/20260331/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20260331/c6a.4xlarge.json b/daft-parquet-partitioned/results/20260331/c6a.4xlarge.json index 422e300cba..539e736efb 100644 --- a/daft-parquet-partitioned/results/20260331/c6a.4xlarge.json +++ b/daft-parquet-partitioned/results/20260331/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20260331/c6a.metal.json b/daft-parquet-partitioned/results/20260331/c6a.metal.json index 08a143409a..ef0e00fde3 100644 --- a/daft-parquet-partitioned/results/20260331/c6a.metal.json +++ b/daft-parquet-partitioned/results/20260331/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20260331/c6a.xlarge.json b/daft-parquet-partitioned/results/20260331/c6a.xlarge.json index 5d401ac851..42c440c0a3 100644 --- a/daft-parquet-partitioned/results/20260331/c6a.xlarge.json +++ b/daft-parquet-partitioned/results/20260331/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": 
["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20260331/c7a.metal-48xl.json b/daft-parquet-partitioned/results/20260331/c7a.metal-48xl.json index 8dcc81d4d2..d735730ba8 100644 --- a/daft-parquet-partitioned/results/20260331/c7a.metal-48xl.json +++ b/daft-parquet-partitioned/results/20260331/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20260331/c8g.4xlarge.json b/daft-parquet-partitioned/results/20260331/c8g.4xlarge.json index 2fafe2bbea..08ab3a076f 100644 --- a/daft-parquet-partitioned/results/20260331/c8g.4xlarge.json +++ b/daft-parquet-partitioned/results/20260331/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/results/20260331/t3a.small.json b/daft-parquet-partitioned/results/20260331/t3a.small.json index 445392718c..bc65fe31cb 100644 --- a/daft-parquet-partitioned/results/20260331/t3a.small.json +++ b/daft-parquet-partitioned/results/20260331/t3a.small.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/daft-parquet-partitioned/run.sh b/daft-parquet-partitioned/run.sh deleted file mode 100755 index 86d1a512c7..0000000000 --- a/daft-parquet-partitioned/run.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -mode=${1} - -TRIES=3 -QUERY_COUNT=43 - -declare -a results=() -for ((i=0; i /dev/null - - output=$(python3 query.py $q $mode 2>&1) - IFS=',' read -r t1 t2 t3 <<< "$(echo "$output" | tail -1)" - - results[$((q-1))]="[${t1:-null},${t2:-null},${t3:-null}]" -done - -IFS=, printf '%s,\n' "${results[@]}" | sed '$s/,$//' diff --git a/daft-parquet-partitioned/server.py b/daft-parquet-partitioned/server.py new file mode 100644 index 0000000000..40dd8f913a --- /dev/null +++ b/daft-parquet-partitioned/server.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +"""FastAPI wrapper around Daft (partitioned parquet) so it conforms to the +ClickBench install/start/check/stop/load/query interface. + +Routes: + GET /health -> 200 OK once the server is up + POST /load -> reads hits_*.parquet from the working directory, casts + types, holds the Daft DataFrame in memory, registers + it as `hits` for daft.sql, returns {"elapsed": ...} + POST /query -> body: SQL text. Looks it up in QUERIES, runs the + matching callable via daft.sql, returns + {"elapsed": }. + GET /data-size -> total file size of hits_*.parquet at load time. + +The 43 SQL strings come straight from the prior +daft-parquet-partitioned/queries.sql. +""" + +import os +import timeit + +import daft +import uvicorn +from daft import DataType, col +from fastapi import FastAPI, HTTPException, Request + +app = FastAPI() +hits = None +data_bytes = 0 + +PARQUET_GLOB = os.environ.get("BENCH_DAFT_PARQUET", "hits_*.parquet") + + +def _make_runner(sql: str): + return lambda _df: daft.sql(sql).collect() + + +# 43 ClickBench queries — daft.sql against the registered `hits` view. 
+_SQL_LIST: list[str] = [ + "SELECT COUNT(*) FROM hits;", + "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", + "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", + "SELECT AVG(UserID) FROM hits;", + "SELECT COUNT(DISTINCT UserID) FROM hits;", + "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", + "SELECT MIN(EventDate) as m1, MAX(EventDate) as m2 FROM hits;", + "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", + "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;", + "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;", + "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", + "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", + "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", + "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", + "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", + "SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) as m FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT SUM(ResolutionWidth) AS s0, SUM(ResolutionWidth + 1) AS s1, SUM(ResolutionWidth + 2) AS s2, SUM(ResolutionWidth + 3) AS s3, SUM(ResolutionWidth + 4) AS s4, SUM(ResolutionWidth + 5) AS s5, SUM(ResolutionWidth + 6) AS s6, SUM(ResolutionWidth + 7) AS s7, SUM(ResolutionWidth + 8) AS s8, SUM(ResolutionWidth + 9) AS s9, SUM(ResolutionWidth + 
10) AS s10, SUM(ResolutionWidth + 11) AS s11, SUM(ResolutionWidth + 12) AS s12, SUM(ResolutionWidth + 13) AS s13, SUM(ResolutionWidth + 14) AS s14, SUM(ResolutionWidth + 15) AS s15, SUM(ResolutionWidth + 16) AS s16, SUM(ResolutionWidth + 17) AS s17, SUM(ResolutionWidth + 18) AS s18, SUM(ResolutionWidth + 19) AS s19, SUM(ResolutionWidth + 20) AS s20, SUM(ResolutionWidth + 21) AS s21, SUM(ResolutionWidth + 22) AS s22, SUM(ResolutionWidth + 23) AS s23, SUM(ResolutionWidth + 24) AS s24, SUM(ResolutionWidth + 25) AS s25, SUM(ResolutionWidth + 26) AS s26, SUM(ResolutionWidth + 27) AS s27, SUM(ResolutionWidth + 28) AS s28, SUM(ResolutionWidth + 29) AS s29, SUM(ResolutionWidth + 30) AS s30, SUM(ResolutionWidth + 31) AS s31, SUM(ResolutionWidth + 32) AS s32, SUM(ResolutionWidth + 33) AS s33, SUM(ResolutionWidth + 34) AS s34, SUM(ResolutionWidth + 35) AS s35, SUM(ResolutionWidth + 36) AS s36, SUM(ResolutionWidth + 37) AS s37, SUM(ResolutionWidth + 38) AS s38, SUM(ResolutionWidth + 39) AS s39, SUM(ResolutionWidth + 40) AS s40, SUM(ResolutionWidth + 41) AS s41, SUM(ResolutionWidth + 42) AS s42, SUM(ResolutionWidth + 43) AS s43, SUM(ResolutionWidth + 44) AS s44, SUM(ResolutionWidth + 45) AS s45, SUM(ResolutionWidth + 46) AS s46, SUM(ResolutionWidth + 47) AS s47, SUM(ResolutionWidth + 48) AS s48, SUM(ResolutionWidth + 49) AS s49, SUM(ResolutionWidth + 50) AS s50, SUM(ResolutionWidth + 51) AS s51, SUM(ResolutionWidth + 52) AS s52, SUM(ResolutionWidth + 53) AS s53, SUM(ResolutionWidth + 54) AS s54, SUM(ResolutionWidth + 55) AS s55, SUM(ResolutionWidth + 56) AS s56, SUM(ResolutionWidth + 57) AS s57, SUM(ResolutionWidth + 58) AS s58, SUM(ResolutionWidth + 59) AS s59, SUM(ResolutionWidth + 60) AS s60, SUM(ResolutionWidth + 61) AS s61, SUM(ResolutionWidth + 62) AS s62, SUM(ResolutionWidth + 63) AS s63, SUM(ResolutionWidth + 64) AS s64, SUM(ResolutionWidth + 65) AS s65, SUM(ResolutionWidth + 66) AS s66, SUM(ResolutionWidth + 67) AS s67, SUM(ResolutionWidth + 68) AS s68, SUM(ResolutionWidth + 69) AS s69, SUM(ResolutionWidth + 70) AS s70, SUM(ResolutionWidth + 71) AS s71, SUM(ResolutionWidth + 72) AS s72, SUM(ResolutionWidth + 73) AS s73, SUM(ResolutionWidth + 74) AS s74, SUM(ResolutionWidth + 75) AS s75, SUM(ResolutionWidth + 76) AS s76, SUM(ResolutionWidth + 77) AS s77, SUM(ResolutionWidth + 78) AS s78, SUM(ResolutionWidth + 79) AS s79, SUM(ResolutionWidth + 80) AS s80, SUM(ResolutionWidth + 81) AS s81, SUM(ResolutionWidth + 82) AS s82, SUM(ResolutionWidth + 83) AS s83, SUM(ResolutionWidth + 84) AS s84, SUM(ResolutionWidth + 85) AS s85, SUM(ResolutionWidth + 86) AS s86, SUM(ResolutionWidth + 87) AS s87, SUM(ResolutionWidth + 88) AS s88, SUM(ResolutionWidth + 89) AS s89 FROM hits;", + "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;", + "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", + "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", + 
"SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", + "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", + "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", + "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", +] + +QUERIES: list[tuple[str, callable]] = [(sql, _make_runner(sql)) for sql in _SQL_LIST] +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +def _data_size_bytes() -> int: + import glob + total = 0 + for p in glob.glob(PARQUET_GLOB): + try: + total += os.path.getsize(p) + except OSError: + pass + return total + + +@app.get("/health") +def health(): + return {"ok": True} + + +@app.post("/load") +def load(): + global hits, data_bytes + start = timeit.default_timer() + data_bytes = _data_size_bytes() + df = daft.read_parquet(PARQUET_GLOB) + df = df.with_column("EventTime", col("EventTime").cast(DataType.timestamp("s"))) + df = df.with_column("EventDate", col("EventDate").cast(DataType.date())) + df = df.with_column("URL", col("URL").decode("utf-8")) + df = df.with_column("Title", col("Title").decode("utf-8")) + df = df.with_column("Referer", col("Referer").decode("utf-8")) + df = df.with_column("MobilePhoneModel", col("MobilePhoneModel").decode("utf-8")) + df = df.with_column("SearchPhrase", col("SearchPhrase").decode("utf-8")) + hits = df + # Register so daft.sql can see `hits`. 
+ try: + daft.catalog.register_table("hits", df) # type: ignore[attr-defined] + except Exception: + pass + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + + +@app.post("/query") +async def query(request: Request): + if hits is None: + raise HTTPException(status_code=409, detail="DataFrame not loaded; POST /load first") + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + fn = QUERIES[idx][1] + start = timeit.default_timer() + fn(hits) + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed, "index": idx} + + +@app.get("/data-size") +def data_size(): + if data_bytes: + return {"bytes": int(data_bytes)} + # Fall back to the on-disk size if /load hasn't run yet. + return {"bytes": _data_size_bytes()} + + +if __name__ == "__main__": + port = int(os.environ.get("BENCH_DAFT_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/daft-parquet-partitioned/start b/daft-parquet-partitioned/start new file mode 100755 index 0000000000..e3fab72731 --- /dev/null +++ b/daft-parquet-partitioned/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! > server.pid diff --git a/daft-parquet-partitioned/stop b/daft-parquet-partitioned/stop new file mode 100755 index 0000000000..787b35abcc --- /dev/null +++ b/daft-parquet-partitioned/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/daft-parquet-partitioned/template.json b/daft-parquet-partitioned/template.json index 2bb208ca8d..7168fbfeed 100644 --- a/daft-parquet-partitioned/template.json +++ b/daft-parquet-partitioned/template.json @@ -6,7 +6,7 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ] } diff --git a/daft-parquet/benchmark.sh b/daft-parquet/benchmark.sh index e456b9a730..fc4bacc8f3 100755 --- a/daft-parquet/benchmark.sh +++ b/daft-parquet/benchmark.sh @@ -1,20 +1,5 @@ #!/bin/bash - -# Install -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install pandas -pip install packaging -pip install daft - -../download-hits-parquet-single - -# Run the queries -mode=single -echo "Running $mode mode..." -./run.sh $mode 2>&1 | tee "daft_log_${mode}.txt" - -echo "Load time: 0" -echo "Data size: $(du -bcs hits.parquet)" +# Thin shim — actual flow is in lib/benchmark-common.sh. 
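+# The common flow drives the per-step scripts in this directory
+# (install, start, check, load, query, data-size, stop).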
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/daft-parquet/check b/daft-parquet/check new file mode 100755 index 0000000000..0c4b301a2d --- /dev/null +++ b/daft-parquet/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/daft-parquet/data-size b/daft-parquet/data-size new file mode 100755 index 0000000000..365ad4ecc8 --- /dev/null +++ b/daft-parquet/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/daft-parquet/install b/daft-parquet/install new file mode 100755 index 0000000000..8d49eef9f2 --- /dev/null +++ b/daft-parquet/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet pandas packaging daft fastapi uvicorn diff --git a/daft-parquet/load b/daft-parquet/load new file mode 100755 index 0000000000..65019c6623 --- /dev/null +++ b/daft-parquet/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Server reads hits.parquet from CWD into memory. Daft is lazy; reading just +# builds the plan and types — actual data is loaded on first query. +elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +sync diff --git a/daft-parquet/query b/daft-parquet/query new file mode 100755 index 0000000000..6366d71600 --- /dev/null +++ b/daft-parquet/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running daft server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. 
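+# Example: echo "SELECT COUNT(*) FROM hits;" | ./query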
+set -e + +query=$(cat) + +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/daft-parquet/query.py b/daft-parquet/query.py deleted file mode 100755 index b761767507..0000000000 --- a/daft-parquet/query.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 - -import daft -import os -import sys -import timeit -import traceback -from daft import col, DataType - -hits = None -current_dir = os.path.dirname(os.path.abspath(__file__)) -query_idx = int(sys.argv[1]) - 1 -is_single_mode = len(sys.argv) > 2 and sys.argv[2] == "single" -parquet_path = os.path.join( - current_dir, - "hits.parquet" if is_single_mode else "hits_*.parquet" -) - -with open("queries.sql") as f: - sql_list = [q.strip() for q in f.read().split(';') if q.strip()] - -def run_single_query(sql, i): - try: - start = timeit.default_timer() - - global hits - if hits is None: - hits = daft.read_parquet(parquet_path) - hits = hits.with_column("EventTime", col("EventTime").cast(DataType.timestamp("s"))) - hits = hits.with_column("EventDate", col("EventDate").cast(DataType.date())) - hits = hits.with_column("URL", col("URL").decode("utf-8")) - hits = hits.with_column("Title", col("Title").decode("utf-8")) - hits = hits.with_column("Referer", col("Referer").decode("utf-8")) - hits = hits.with_column("MobilePhoneModel", col("MobilePhoneModel").decode("utf-8")) - hits = hits.with_column("SearchPhrase", col("SearchPhrase").decode("utf-8")) - - result = daft.sql(sql) - result.collect() - - run_time = round(timeit.default_timer() - start, 3) - return run_time - except Exception as e: - print(f"Error executing query {query_idx}: {str(e)[:100]}", file=sys.stderr) - traceback.print_exc() - return None - -if __name__ == "__main__": - sql = sql_list[query_idx] - times = [] - for i in range(3): - elapsed = run_single_query(sql, i) - times.append(f"{elapsed}" if elapsed else "") - print(','.join(times)) diff --git a/daft-parquet/results/20250410/c6a.4xlarge.json b/daft-parquet/results/20250410/c6a.4xlarge.json index 94125be8d7..0556d8da88 100644 --- a/daft-parquet/results/20250410/c6a.4xlarge.json +++ b/daft-parquet/results/20250410/c6a.4xlarge.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14779976446, diff --git a/daft-parquet/results/20250410/c6a.metal.json b/daft-parquet/results/20250410/c6a.metal.json index dda7c6d41f..9dcbbf6963 100644 --- a/daft-parquet/results/20250410/c6a.metal.json +++ b/daft-parquet/results/20250410/c6a.metal.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14779976446, diff --git a/daft-parquet/results/20250506/c6a.4xlarge.json b/daft-parquet/results/20250506/c6a.4xlarge.json index 70d9f9155c..53dff1d63d 100644 --- a/daft-parquet/results/20250506/c6a.4xlarge.json +++ b/daft-parquet/results/20250506/c6a.4xlarge.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14779976446, diff --git a/daft-parquet/results/20250507/c6a.metal.json b/daft-parquet/results/20250507/c6a.metal.json index 26e3bd0444..4ed6a30c95 100644 --- 
a/daft-parquet/results/20250507/c6a.metal.json +++ b/daft-parquet/results/20250507/c6a.metal.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14779976446, diff --git a/daft-parquet/results/20250710/c6a.2xlarge.json b/daft-parquet/results/20250710/c6a.2xlarge.json index 2629ea2081..0cb2314f0b 100644 --- a/daft-parquet/results/20250710/c6a.2xlarge.json +++ b/daft-parquet/results/20250710/c6a.2xlarge.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14779976446, diff --git a/daft-parquet/results/20250710/c6a.4xlarge.json b/daft-parquet/results/20250710/c6a.4xlarge.json index 4a7c3d0dfd..93a9283f74 100644 --- a/daft-parquet/results/20250710/c6a.4xlarge.json +++ b/daft-parquet/results/20250710/c6a.4xlarge.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14779976446, diff --git a/daft-parquet/results/20250711/c6a.large.json b/daft-parquet/results/20250711/c6a.large.json index 4bb7b8a6c3..3ef1d740e5 100644 --- a/daft-parquet/results/20250711/c6a.large.json +++ b/daft-parquet/results/20250711/c6a.large.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14779976446, diff --git a/daft-parquet/results/20250711/c6a.xlarge.json b/daft-parquet/results/20250711/c6a.xlarge.json index 553077f33f..1169cc121e 100644 --- a/daft-parquet/results/20250711/c6a.xlarge.json +++ b/daft-parquet/results/20250711/c6a.xlarge.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14779976446, diff --git a/daft-parquet/results/20250712/c8g.4xlarge.json b/daft-parquet/results/20250712/c8g.4xlarge.json index bba10559b6..8d42d9bd23 100644 --- a/daft-parquet/results/20250712/c8g.4xlarge.json +++ b/daft-parquet/results/20250712/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20250712/t3a.small.json b/daft-parquet/results/20250712/t3a.small.json index 5ce7b03cec..5d6150a5fc 100644 --- a/daft-parquet/results/20250712/t3a.small.json +++ b/daft-parquet/results/20250712/t3a.small.json @@ -9,8 +9,8 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ], "load_time": 0, "data_size": 14779976446, diff --git a/daft-parquet/results/20250830/c7a.metal-48xl.json b/daft-parquet/results/20250830/c7a.metal-48xl.json index 1e7821be64..51d8abe1d7 100644 --- a/daft-parquet/results/20250830/c7a.metal-48xl.json +++ b/daft-parquet/results/20250830/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20250831/c6a.2xlarge.json b/daft-parquet/results/20250831/c6a.2xlarge.json index 0231b4586f..24e455ca7d 100644 --- a/daft-parquet/results/20250831/c6a.2xlarge.json +++ b/daft-parquet/results/20250831/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": 
["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20250831/c6a.4xlarge.json b/daft-parquet/results/20250831/c6a.4xlarge.json index 367facbb15..7ee36b1690 100644 --- a/daft-parquet/results/20250831/c6a.4xlarge.json +++ b/daft-parquet/results/20250831/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20250831/c6a.large.json b/daft-parquet/results/20250831/c6a.large.json index 5ac0a271b9..354ed1f663 100644 --- a/daft-parquet/results/20250831/c6a.large.json +++ b/daft-parquet/results/20250831/c6a.large.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20250831/c6a.metal.json b/daft-parquet/results/20250831/c6a.metal.json index 06474911dc..1a693fbf5f 100644 --- a/daft-parquet/results/20250831/c6a.metal.json +++ b/daft-parquet/results/20250831/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20250831/c6a.xlarge.json b/daft-parquet/results/20250831/c6a.xlarge.json index 88c6c5acba..5d7ea38285 100644 --- a/daft-parquet/results/20250831/c6a.xlarge.json +++ b/daft-parquet/results/20250831/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20250831/t3a.small.json b/daft-parquet/results/20250831/t3a.small.json index 50fc13a693..862f207d45 100644 --- a/daft-parquet/results/20250831/t3a.small.json +++ b/daft-parquet/results/20250831/t3a.small.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20260303/c6a.4xlarge.json b/daft-parquet/results/20260303/c6a.4xlarge.json index fc5a14838a..fae4f46b77 100644 --- a/daft-parquet/results/20260303/c6a.4xlarge.json +++ b/daft-parquet/results/20260303/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20260331/c6a.2xlarge.json b/daft-parquet/results/20260331/c6a.2xlarge.json index d4e281f137..dd9089efd1 100644 --- a/daft-parquet/results/20260331/c6a.2xlarge.json +++ b/daft-parquet/results/20260331/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git 
a/daft-parquet/results/20260331/c6a.4xlarge.json b/daft-parquet/results/20260331/c6a.4xlarge.json index abb9ea8566..80193e0848 100644 --- a/daft-parquet/results/20260331/c6a.4xlarge.json +++ b/daft-parquet/results/20260331/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20260331/c6a.large.json b/daft-parquet/results/20260331/c6a.large.json index a33eea6d71..bd28c614fb 100644 --- a/daft-parquet/results/20260331/c6a.large.json +++ b/daft-parquet/results/20260331/c6a.large.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20260331/c6a.metal.json b/daft-parquet/results/20260331/c6a.metal.json index 30c089b49c..54fc93dc02 100644 --- a/daft-parquet/results/20260331/c6a.metal.json +++ b/daft-parquet/results/20260331/c6a.metal.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20260331/c6a.xlarge.json b/daft-parquet/results/20260331/c6a.xlarge.json index 37eb1e2ceb..c2165f1eb8 100644 --- a/daft-parquet/results/20260331/c6a.xlarge.json +++ b/daft-parquet/results/20260331/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20260331/c7a.metal-48xl.json b/daft-parquet/results/20260331/c7a.metal-48xl.json index 2458af6f07..882b46aa6c 100644 --- a/daft-parquet/results/20260331/c7a.metal-48xl.json +++ b/daft-parquet/results/20260331/c7a.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20260331/c8g.4xlarge.json b/daft-parquet/results/20260331/c8g.4xlarge.json index 44cc87ac71..ae128a5263 100644 --- a/daft-parquet/results/20260331/c8g.4xlarge.json +++ b/daft-parquet/results/20260331/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20260331/t3a.small.json b/daft-parquet/results/20260331/t3a.small.json index cca641f53f..aa38f8d48a 100644 --- a/daft-parquet/results/20260331/t3a.small.json +++ b/daft-parquet/results/20260331/t3a.small.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","stateless","serverless","embedded"], + "tags": ["Rust","stateless","embedded","in-memory"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/daft-parquet/results/20260509/c6a.4xlarge.json b/daft-parquet/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..353f71a6ff --- /dev/null 
+++ b/daft-parquet/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Daft (Parquet, single)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","stateless","embedded","in-memory"], + "load_time": 0, + "data_size": 14779976446, + "result": [ + [0.281, 0.046, 0.045], + [0.136, 0.04, 0.039], + [0.204, 0.108, 0.098], + [0.267, 0.117, 0.114], + [0.95, 0.708, 0.7], + [0.971, 0.915, 0.924], + [0.066, 0.049, 0.038], + [0.107, 0.042, 0.041], + [2.494, 2.393, 2.455], + [1.778, 1.632, 1.627], + [0.676, 0.547, 0.546], + [0.705, 0.556, 0.551], + [1.443, 1.373, 1.343], + [3.519, 3.352, 3.396], + [1.355, 1.276, 1.271], + [0.985, 0.895, 0.898], + [2.805, 2.039, 2.034], + [2.64, 1.79, 1.788], + [5.475, 3.914, 3.927], + [0.219, 0.065, 0.065], + [9.842, 2.147, 2.097], + [11.025, 2.504, 2.514], + [21.609, 4.522, 4.546], + [55.563, 11.379, 11.381], + [2.383, 0.819, 0.797], + [0.862, 0.798, 0.785], + [2.381, 0.864, 0.883], + [9.661, 4.177, 4.162], + [16.539, 16.441, 16.498], + [1.602, 1.788, 1.883], + [2.3, 1.333, 1.338], + [5.87, 1.631, 1.625], + [7.522, 5.498, 5.544], + [11.781, 6.209, 6.181], + [11.01, 5.206, 5.246], + [1.687, 1.55, 1.558], + [0.303, 0.198, 0.212], + [0.167, 0.112, 0.105], + [0.219, 0.128, 0.128], + [0.462, 0.196, 0.175], + [0.097, 0.043, 0.049], + [0.09, 0.051, 0.048], + [0.063, 0.03, 0.034] +] +} + diff --git a/daft-parquet/results/20260509/c6a.metal.json b/daft-parquet/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..d1686d4a23 --- /dev/null +++ b/daft-parquet/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Daft (Parquet, single)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","stateless","embedded"], + "load_time": 7, + "data_size": 14779976446, + "result": [ + [0.208, 0.03, 0.031], + [0.091, 0.037, 0.035], + [0.1, 0.053, 0.052], + [0.299, 0.057, 0.054], + [0.833, 0.145, 0.154], + [1.073, 0.193, 0.196], + [0.051, 0.044, 0.042], + [0.062, 0.046, 0.042], + [1.516, 1.199, 1.193], + [1.501, 0.775, 0.747], + [0.587, 0.362, 0.369], + [0.57, 0.348, 0.332], + [0.847, 0.258, 0.263], + [2.307, 0.673, 0.65], + [0.905, 0.263, 0.251], + [0.52, 0.149, 0.15], + [2.436, 0.394, 0.409], + [2.043, 0.329, 0.336], + [4.008, 0.722, 0.72], + [0.158, 0.035, 0.035], + [10.016, 0.49, 0.48], + [11.026, 0.779, 0.779], + [21.593, 1.122, 1.097], + [55.497, 2.216, 2.154], + [2.376, 0.198, 0.192], + [0.971, 0.189, 0.179], + [2.683, 0.209, 0.219], + [9.905, 0.964, 0.972], + [23.087, 19.324, 19.164], + [0.329, 0.278, 0.282], + [2.083, 0.255, 0.27], + [5.476, 0.316, 0.328], + [4.423, 1.066, 1.074], + [9.845, 1.109, 1.089], + [9.795, 1.035, 1.007], + [0.297, 0.265, 0.269], + [0.264, 0.192, 0.175], + [0.143, 0.118, 0.098], + [0.18, 0.151, 0.079], + [0.37, 0.228, 0.213], + [0.095, 0.061, 0.054], + [0.072, 0.061, 0.041], + [0.063, 0.042, 0.041] +] +} + diff --git a/daft-parquet/run.sh b/daft-parquet/run.sh deleted file mode 100755 index 86d1a512c7..0000000000 --- a/daft-parquet/run.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -mode=${1} - -TRIES=3 -QUERY_COUNT=43 - -declare -a results=() -for ((i=0; i /dev/null - - output=$(python3 query.py $q $mode 2>&1) - IFS=',' read -r t1 t2 t3 <<< "$(echo "$output" | tail -1)" - - results[$((q-1))]="[${t1:-null},${t2:-null},${t3:-null}]" -done - -IFS=, printf '%s,\n' "${results[@]}" | sed '$s/,$//' diff --git 
a/daft-parquet/server.py b/daft-parquet/server.py new file mode 100644 index 0000000000..33b6cde92e --- /dev/null +++ b/daft-parquet/server.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +"""FastAPI wrapper around Daft (single-file parquet) so it conforms to the +ClickBench install/start/check/stop/load/query interface. + +Routes: + GET /health -> 200 OK once the server is up + POST /load -> reads hits.parquet from the working directory, casts + types, holds the Daft DataFrame in memory, registers + it as `hits` for daft.sql, returns {"elapsed": ...} + POST /query -> body: SQL text. Looks it up in QUERIES, runs the + matching callable via daft.sql, returns + {"elapsed": }. + GET /data-size -> file size of hits.parquet at load time. + +The 43 SQL strings come straight from the prior daft-parquet/queries.sql. +""" + +import os +import timeit + +import daft +import uvicorn +from daft import DataType, col +from fastapi import FastAPI, HTTPException, Request + +app = FastAPI() +hits = None +data_bytes = 0 + +PARQUET_GLOB = os.environ.get("BENCH_DAFT_PARQUET", "hits.parquet") + + +def _make_runner(sql: str): + return lambda _df: daft.sql(sql).collect() + + +# 43 ClickBench queries — daft.sql against the registered `hits` view. +_SQL_LIST: list[str] = [ + "SELECT COUNT(*) FROM hits;", + "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", + "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", + "SELECT AVG(UserID) FROM hits;", + "SELECT COUNT(DISTINCT UserID) FROM hits;", + "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", + "SELECT MIN(EventDate) as m1, MAX(EventDate) as m2 FROM hits;", + "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", + "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;", + "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;", + "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", + "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", + "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", + "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", + "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM 
hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", + "SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) as m FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT SUM(ResolutionWidth) AS s0, SUM(ResolutionWidth + 1) AS s1, SUM(ResolutionWidth + 2) AS s2, SUM(ResolutionWidth + 3) AS s3, SUM(ResolutionWidth + 4) AS s4, SUM(ResolutionWidth + 5) AS s5, SUM(ResolutionWidth + 6) AS s6, SUM(ResolutionWidth + 7) AS s7, SUM(ResolutionWidth + 8) AS s8, SUM(ResolutionWidth + 9) AS s9, SUM(ResolutionWidth + 10) AS s10, SUM(ResolutionWidth + 11) AS s11, SUM(ResolutionWidth + 12) AS s12, SUM(ResolutionWidth + 13) AS s13, SUM(ResolutionWidth + 14) AS s14, SUM(ResolutionWidth + 15) AS s15, SUM(ResolutionWidth + 16) AS s16, SUM(ResolutionWidth + 17) AS s17, SUM(ResolutionWidth + 18) AS s18, SUM(ResolutionWidth + 19) AS s19, SUM(ResolutionWidth + 20) AS s20, SUM(ResolutionWidth + 21) AS s21, SUM(ResolutionWidth + 22) AS s22, SUM(ResolutionWidth + 23) AS s23, SUM(ResolutionWidth + 24) AS s24, SUM(ResolutionWidth + 25) AS s25, SUM(ResolutionWidth + 26) AS s26, SUM(ResolutionWidth + 27) AS s27, SUM(ResolutionWidth + 28) AS s28, SUM(ResolutionWidth + 29) AS s29, SUM(ResolutionWidth + 30) AS s30, SUM(ResolutionWidth + 31) AS s31, SUM(ResolutionWidth + 32) AS s32, SUM(ResolutionWidth + 33) AS s33, SUM(ResolutionWidth + 34) AS s34, SUM(ResolutionWidth + 35) AS s35, SUM(ResolutionWidth + 36) AS s36, SUM(ResolutionWidth + 37) AS s37, SUM(ResolutionWidth + 38) AS s38, SUM(ResolutionWidth + 39) AS s39, SUM(ResolutionWidth + 40) AS s40, SUM(ResolutionWidth + 41) AS s41, SUM(ResolutionWidth + 42) AS s42, SUM(ResolutionWidth + 43) AS s43, SUM(ResolutionWidth + 44) AS s44, SUM(ResolutionWidth + 45) AS s45, SUM(ResolutionWidth + 46) AS s46, SUM(ResolutionWidth + 47) AS s47, SUM(ResolutionWidth + 48) AS s48, SUM(ResolutionWidth + 49) AS s49, SUM(ResolutionWidth + 50) AS s50, SUM(ResolutionWidth + 51) AS s51, SUM(ResolutionWidth + 52) AS s52, SUM(ResolutionWidth + 53) AS s53, SUM(ResolutionWidth + 54) AS s54, SUM(ResolutionWidth + 55) AS s55, SUM(ResolutionWidth + 56) AS s56, SUM(ResolutionWidth + 57) AS s57, SUM(ResolutionWidth + 58) AS s58, SUM(ResolutionWidth + 59) AS s59, SUM(ResolutionWidth + 60) AS s60, SUM(ResolutionWidth + 61) AS s61, SUM(ResolutionWidth + 62) AS s62, SUM(ResolutionWidth + 63) AS s63, SUM(ResolutionWidth + 64) AS s64, SUM(ResolutionWidth + 65) AS s65, SUM(ResolutionWidth + 66) AS s66, SUM(ResolutionWidth + 67) AS s67, SUM(ResolutionWidth + 68) AS s68, SUM(ResolutionWidth + 69) AS s69, SUM(ResolutionWidth + 70) AS s70, SUM(ResolutionWidth + 71) AS s71, SUM(ResolutionWidth + 72) AS s72, SUM(ResolutionWidth + 73) AS s73, SUM(ResolutionWidth + 74) AS s74, SUM(ResolutionWidth + 75) AS s75, SUM(ResolutionWidth + 76) AS s76, SUM(ResolutionWidth + 77) AS s77, SUM(ResolutionWidth + 
78) AS s78, SUM(ResolutionWidth + 79) AS s79, SUM(ResolutionWidth + 80) AS s80, SUM(ResolutionWidth + 81) AS s81, SUM(ResolutionWidth + 82) AS s82, SUM(ResolutionWidth + 83) AS s83, SUM(ResolutionWidth + 84) AS s84, SUM(ResolutionWidth + 85) AS s85, SUM(ResolutionWidth + 86) AS s86, SUM(ResolutionWidth + 87) AS s87, SUM(ResolutionWidth + 88) AS s88, SUM(ResolutionWidth + 89) AS s89 FROM hits;", + "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;", + "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", + "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", + "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", + "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", + "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", +] + +QUERIES: list[tuple[str, callable]] = [(sql, _make_runner(sql)) for sql in _SQL_LIST] +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +def _data_size_bytes() -> int: + import glob + total = 0 + for p in glob.glob(PARQUET_GLOB): + try: + total 
+= os.path.getsize(p) + except OSError: + pass + return total + + +@app.get("/health") +def health(): + return {"ok": True} + + +@app.post("/load") +def load(): + global hits, data_bytes + start = timeit.default_timer() + data_bytes = _data_size_bytes() + df = daft.read_parquet(PARQUET_GLOB) + df = df.with_column("EventTime", col("EventTime").cast(DataType.timestamp("s"))) + df = df.with_column("EventDate", col("EventDate").cast(DataType.date())) + df = df.with_column("URL", col("URL").decode("utf-8")) + df = df.with_column("Title", col("Title").decode("utf-8")) + df = df.with_column("Referer", col("Referer").decode("utf-8")) + df = df.with_column("MobilePhoneModel", col("MobilePhoneModel").decode("utf-8")) + df = df.with_column("SearchPhrase", col("SearchPhrase").decode("utf-8")) + hits = df + # Register so daft.sql can see `hits`. + try: + daft.catalog.register_table("hits", df) # type: ignore[attr-defined] + except Exception: + pass + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + + +@app.post("/query") +async def query(request: Request): + if hits is None: + raise HTTPException(status_code=409, detail="DataFrame not loaded; POST /load first") + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + fn = QUERIES[idx][1] + start = timeit.default_timer() + fn(hits) + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed, "index": idx} + + +@app.get("/data-size") +def data_size(): + if data_bytes: + return {"bytes": int(data_bytes)} + # Fall back to the on-disk size if /load hasn't run yet. + return {"bytes": _data_size_bytes()} + + +if __name__ == "__main__": + port = int(os.environ.get("BENCH_DAFT_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/daft-parquet/start b/daft-parquet/start new file mode 100755 index 0000000000..e3fab72731 --- /dev/null +++ b/daft-parquet/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! > server.pid diff --git a/daft-parquet/stop b/daft-parquet/stop new file mode 100755 index 0000000000..787b35abcc --- /dev/null +++ b/daft-parquet/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! 
kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/daft-parquet/template.json b/daft-parquet/template.json index 8aa439289a..0de625c1f2 100644 --- a/daft-parquet/template.json +++ b/daft-parquet/template.json @@ -6,7 +6,7 @@ "tags": [ "Rust", "stateless", - "serverless", - "embedded" + "embedded", + "in-memory" ] } diff --git a/databend/benchmark.sh b/databend/benchmark.sh index 1582a7ec06..739b26841c 100755 --- a/databend/benchmark.sh +++ b/databend/benchmark.sh @@ -1,74 +1,9 @@ #!/bin/bash - -curl -LJO 'https://github.com/datafuselabs/databend/releases/download/v0.9.53-nightly/databend-v0.9.53-nightly-x86_64-unknown-linux-musl.tar.gz' -tar xzvf 'databend-v0.9.53-nightly-x86_64-unknown-linux-musl.tar.gz' - -cat > config.toml << CONF -[storage] -type = "fs" - -[storage.fs] -data_path = "./_data" - -[meta] -endpoints = ["127.0.0.1:9191"] -username = "root" -password = "root" -client_timeout_in_second = 60 -auto_sync_interval = 60 -CONF - -# databend starts with meta service -./bin/databend-meta --single > meta.log 2>&1 & -./bin/databend-query -c config.toml > query.log 2>&1 & - -# Load the data -# Docs: https://databend.rs/doc/use-cases/analyze-hits-dataset-with-databend -for _ in {1..600} -do - curl -sS 'http://default@localhost:8124/' --data-binary @create.sql && break - sleep 1 -done - -../download-hits-tsv - -## Aws gp2 write performance is not stable, we must load the data when disk's write around ~500MB/s (Don't know much about the rules of gp2) -# Load Data -START=$(date +%s) -curl -sS -XPUT 'http://root:@127.0.0.1:8000/v1/streaming_load' -H 'insert_sql: insert into hits FILE_FORMAT = (type = TSV)' -F 'upload=@"./hits.tsv"' -END=$(date +%s) -echo "Load time: $(echo "$END - $START" | bc)" - -## in c5.4x large, it's 368s -# {"id":"17477ed9-9f1a-46d9-b6cf-12a5971f4450","state":"SUCCESS","stats":{"rows":99997497,"bytes":74807831229},"error":null,"files":["hits.tsv"]} -# real 6m8.975s -# user 0m4.327s -# sys 0m36.185s - -## in c6a.4xlarge it's ~360s -# {"id":"f7506581-a4da-4684-850c-4bd03530314d","state":"SUCCESS","stats":{"rows":99997497,"bytes":74807831229},"error":null,"files":["hits.tsv"]} -# real 5m57.800s -# user 0m2.106s -# sys 0m33.507s - -## in c6a.metal it's ~70s -# {"id":"2564bd91-1b36-4cf2-a95e-de46c5aff0c6","state":"SUCCESS","stats":{"rows":99997497,"bytes":74807831229},"error":null,"files":["hits.tsv"]} -# real 1m10.347s -# user 0m0.953s -# sys 0m20.401s - - - -## check data is correct -curl -sS 'http://default@localhost:8124/' --data-binary "select count() from hits" - -echo -n "Data size: " -du -bcs _data | grep total -# 20922561953 _data -# 20922561953 total - -# If you wants to get the data size(without metadata and indexes) -# curl 'http://default@localhost:8124/' --data-binary "select humanize_size(bytes_compressed) from fuse_snapshot('default', 'hits') order by timestamp desc limit 1" -# 18.48 GiB - -./run.sh 2>&1 | tee log.txt +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +# databend's first cold start (meta + query coming up together, opening +# every object-store dir) regularly runs past the lib's 300s default; +# the original benchmark used a 600s wait, restore that. 
+export BENCH_CHECK_TIMEOUT=600 +exec ../lib/benchmark-common.sh diff --git a/databend/check b/databend/check new file mode 100755 index 0000000000..18125ca66f --- /dev/null +++ b/databend/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sSf 'http://default@localhost:8124/' --data-binary 'SELECT 1' >/dev/null diff --git a/databend/data-size b/databend/data-size new file mode 100755 index 0000000000..2d6c829718 --- /dev/null +++ b/databend/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +du -bcs _data | grep total | awk '{print $1}' diff --git a/databend/install b/databend/install new file mode 100755 index 0000000000..e04c849bea --- /dev/null +++ b/databend/install @@ -0,0 +1,22 @@ +#!/bin/bash +set -eu + +if [ ! -d ./bin ]; then + curl -LJO 'https://github.com/datafuselabs/databend/releases/download/v0.9.53-nightly/databend-v0.9.53-nightly-x86_64-unknown-linux-musl.tar.gz' + tar xzvf 'databend-v0.9.53-nightly-x86_64-unknown-linux-musl.tar.gz' +fi + +cat > config.toml <<'CONF' +[storage] +type = "fs" + +[storage.fs] +data_path = "./_data" + +[meta] +endpoints = ["127.0.0.1:9191"] +username = "root" +password = "root" +client_timeout_in_second = 60 +auto_sync_interval = 60 +CONF diff --git a/databend/load b/databend/load new file mode 100755 index 0000000000..0c083afbcc --- /dev/null +++ b/databend/load @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +# Create the table. +curl -sS 'http://default@localhost:8124/' --data-binary @create.sql + +# Load via the streaming_load HTTP endpoint. +curl -sS -XPUT 'http://root:@127.0.0.1:8000/v1/streaming_load' \ + -H 'insert_sql: insert into hits FILE_FORMAT = (type = TSV)' \ + -F 'upload=@./hits.tsv' + +rm -f hits.tsv +sync diff --git a/databend/query b/databend/query new file mode 100755 index 0000000000..15698283e0 --- /dev/null +++ b/databend/query @@ -0,0 +1,25 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via databend's clickhouse-compatible HTTP. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (curl wall-clock). +# Exit non-zero on error. 
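+#
+# Illustrative only -- an assumed driver-side invocation (lib/benchmark-common.sh
+# is not shown in this hunk) exercising the stdin/stdout/stderr contract above:
+#   echo "SELECT COUNT(*) FROM hits" | ./query > result.txt 2> timing.txt
+#   tail -n1 timing.txt    # e.g. "0.042" -- curl wall-clock seconds for the query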
+set -e + +query=$(cat) + +body=$(mktemp) +trap 'rm -f "$body"' EXIT + +stats=$(curl -sS -o "$body" -w 'HTTP:%{http_code} TIME:%{time_total}\n' \ + 'http://default@localhost:8124' --data "$query") +http_code=$(echo "$stats" | grep -oP 'HTTP:\K[0-9]+') +res=$(echo "$stats" | grep -oP 'TIME:\K[0-9.]+') + +if [ "$http_code" != "200" ] || grep -qiE '"error"|exception|error code' "$body"; then + cat "$body" >&2 + exit 1 +fi + +cat "$body" + +awk -v t="$res" 'BEGIN { printf "%.3f\n", t }' >&2 diff --git a/databend/results/20260509/c6a.4xlarge.json b/databend/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..5e6a1fbe19 --- /dev/null +++ b/databend/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Databend", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","ClickHouse derivative","lukewarm-cold-run"], + "load_time": 391, + "data_size": 20916855030, + "result": [ + [0.025, 0.006, 0.006], + [0.271, 0.026, 0.025], + [0.629, 0.038, 0.039], + [1.022, 0.037, 0.037], + [1.268, 0.413, 0.405], + [2.263, 0.581, 0.565], + [0.222, 0.02, 0.02], + [0.24, 0.028, 0.028], + [2.25, 0.525, 0.526], + [3.656, 0.6, 0.599], + [1.959, 0.223, 0.225], + [2.277, 0.192, 0.197], + [2.295, 0.659, 0.661], + [3.98, 1.009, 1], + [2.548, 0.786, 0.775], + [1.399, 0.574, 0.567], + [4.703, 2.374, 2.388], + [3.701, 1.35, 1.35], + [8.299, 4.084, 4.133], + [0.249, 0.008, 0.007], + [11.467, 0.442, 0.438], + [13.751, 0.536, 0.535], + [24.468, 1.015, 1.014], + [7.647, 0.246, 0.246], + [0.649, 0.033, 0.032], + [2.034, 0.185, 0.187], + [0.651, 0.034, 0.033], + [11.504, 0.404, 0.402], + [12.141, 11.906, 11.905], + [0.446, 0.064, 0.063], + [5.73, 0.485, 0.485], + [8.043, 0.933, 0.92], + [8.49, 5.202, 5.177], + [12.976, 3.269, 3.287], + [12.958, 3.261, 3.294], + [1.1, 0.52, 0.512], + [0.289, 0.072, 0.069], + [0.216, 0.027, 0.027], + [0.211, 0.019, 0.019], + [0.28, 0.086, 0.086], + [0.224, 0.019, 0.022], + [0.198, 0.015, 0.015], + [0.202, 0.015, 0.014] +] +} + diff --git a/databend/results/20260509/c6a.metal.json b/databend/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..7a50364af2 --- /dev/null +++ b/databend/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Databend", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","ClickHouse derivative","lukewarm-cold-run"], + "load_time": 122, + "data_size": 21023276502, + "result": [ + [0.049, 0.008, 0.007], + [0.266, 0.017, 0.015], + [1.053, 0.022, 0.02], + [1.398, 0.02, 0.018], + [1.695, 0.225, 0.221], + [2.31, 0.254, 0.247], + [0.252, 0.017, 0.015], + [0.238, 0.021, 0.019], + [2.84, 0.334, 0.316], + [4.325, 0.339, 0.345], + [2.546, 0.35, 0.394], + [2.863, 0.205, 0.202], + [2.348, 0.274, 0.274], + [4.227, 0.556, 0.535], + [2.875, 0.273, 0.281], + [1.733, 0.238, 0.243], + [4.242, 0.489, 0.461], + [4.195, 0.437, 0.403], + [7.066, 0.639, 0.62], + [0.447, 0.009, 0.007], + [11.542, 0.077, 0.074], + [13.891, 0.091, 0.093], + [24.81, 0.477, 0.474], + [11.592, 0.267, 0.27], + [1.448, 0.045, 0.044], + [2.145, 0.04, 0.038], + [1.436, 0.046, 0.044], + [11.593, 0.08, 0.077], + [10.153, 1.541, 1.544], + [0.841, 0.053, 0.051], + [6.271, 0.228, 0.237], + [8.284, 0.284, 0.273], + [6.669, 1.018, 0.986], + [11.942, 0.657, 0.664], + [11.939, 0.665, 0.654], + [1.499, 0.239, 0.24], + [0.407, 0.104, 0.1], + 
[0.311, 0.03, 0.029], + [0.35, 0.023, 0.021], + [0.519, 0.068, 0.068], + [0.476, 0.024, 0.022], + [0.473, 0.017, 0.015], + [0.296, 0.016, 0.015] +] +} + diff --git a/databend/run.sh b/databend/run.sh deleted file mode 100755 index 98e7662426..0000000000 --- a/databend/run.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -cat queries.sql | while read -r query; do - [ -z "$FQDN" ] && sync - [ -z "$FQDN" ] && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - BODY=$(mktemp) - STATS=$(curl -sS -o "$BODY" -w 'HTTP:%{http_code} TIME:%{time_total}\n' "http://default@localhost:8124" -d "${query}" 2>&1) - CURL_EXIT=$? - HTTP_CODE=$(echo "$STATS" | grep -oP 'HTTP:\K[0-9]+') - RES=$(echo "$STATS" | grep -oP 'TIME:\K[0-9.]+') - - if [[ "$CURL_EXIT" == "0" && "$HTTP_CODE" == "200" && -n "${RES}" ]] && ! grep -qiE '"error"|exception|error code' "$BODY" - then - echo -n "${RES}" - else - echo -n "null" - RES="" - fi - rm -f "$BODY" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/databend/start b/databend/start new file mode 100755 index 0000000000..6b2afaf5fc --- /dev/null +++ b/databend/start @@ -0,0 +1,28 @@ +#!/bin/bash +set -eu + +# Idempotent: if HTTP API is already up, do nothing. +if curl -sSf 'http://default@localhost:8124/' --data-binary 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +# databend-query opens a lot of object-store handles during ingest and +# query; the default 1024 fd limit hit "No file descriptors available +# (os error 24)" mid-benchmark. +ulimit -n 65536 || true + +# databend has two daemons: meta service + query service. Query's first +# action is to dial the meta service; if we start them simultaneously +# query crashes on the connection refused, exits, and the bench loop +# then polls a dead process for 300s. Start meta first, wait for the +# raft port (9191) to bind, then bring up query. +nohup ./bin/databend-meta --single > meta.log 2>&1 & +disown + +for _ in $(seq 1 60); do + if (echo > /dev/tcp/127.0.0.1/9191) >/dev/null 2>&1; then break; fi + sleep 1 +done + +nohup ./bin/databend-query -c config.toml > query.log 2>&1 & +disown diff --git a/databend/stop b/databend/stop new file mode 100755 index 0000000000..49788ddc4a --- /dev/null +++ b/databend/stop @@ -0,0 +1,12 @@ +#!/bin/bash + +pkill -x databend-query 2>/dev/null || true +pkill -x databend-meta 2>/dev/null || true + +# Wait briefly, escalate to KILL if needed. +for _ in $(seq 1 15); do + pgrep -x databend-query >/dev/null 2>&1 || pgrep -x databend-meta >/dev/null 2>&1 || exit 0 + sleep 1 +done +pkill -9 -x databend-query 2>/dev/null || true +pkill -9 -x databend-meta 2>/dev/null || true diff --git a/datafusion-partitioned/benchmark.sh b/datafusion-partitioned/benchmark.sh index 1d59008b45..3b63e772a6 100755 --- a/datafusion-partitioned/benchmark.sh +++ b/datafusion-partitioned/benchmark.sh @@ -1,51 +1,5 @@ #!/bin/bash - -echo "Install Rust" -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh -bash rust-init.sh -y -export HOME=${HOME:=~} -source ~/.cargo/env - -WITH_SWAP=false - -if [ $(free -g | awk '/^Mem:/{print $2}') -lt 12 ]; then - echo "LOW MEMORY MODE" - # Enable swap if not already enabled. 
This is needed both for rustc and until we have a better - # solution for low memory machines, see - # https://github.com/apache/datafusion/issues/18473 - if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then - echo "Enabling 8G swap" - sudo fallocate -l 8G /swapfile - sudo chmod 600 /swapfile - sudo mkswap /swapfile - sudo swapon /swapfile - WITH_SWAP=true - fi -fi - -echo "Install Dependencies" -sudo apt-get update -y -sudo apt-get install -y gcc - -echo "Install DataFusion main branch" -git clone https://github.com/apache/arrow-datafusion.git -cd arrow-datafusion/ -git checkout 53.1.0 -CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli -export PATH="`pwd`/target/release:$PATH" -cd .. - -echo "Download benchmark target data, partitioned" -../download-hits-parquet-partitioned partitioned - -echo "Run benchmarks for partitioned" -./run.sh - -echo "Load time: 0" -echo "Data size: $(du -bcs partitioned | grep total)" - -if [ "$WITH_SWAP" = true ]; then - echo "Disable swap" - sudo swapoff /swapfile - sudo rm /swapfile -fi +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/datafusion-partitioned/check b/datafusion-partitioned/check new file mode 100755 index 0000000000..52f2d25863 --- /dev/null +++ b/datafusion-partitioned/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +DF=arrow-datafusion/target/release/datafusion-cli +"$DF" -c "SELECT 1" >/dev/null diff --git a/datafusion-partitioned/data-size b/datafusion-partitioned/data-size new file mode 100755 index 0000000000..7be44d7921 --- /dev/null +++ b/datafusion-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs partitioned | awk '/total$/ { print $1 }' diff --git a/datafusion-partitioned/install b/datafusion-partitioned/install new file mode 100755 index 0000000000..20f376193e --- /dev/null +++ b/datafusion-partitioned/install @@ -0,0 +1,32 @@ +#!/bin/bash +set -e + +if [ ! -x arrow-datafusion/target/release/datafusion-cli ]; then + if ! command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh + bash rust-init.sh -y + fi + export HOME=${HOME:=~} + # shellcheck disable=SC1091 + source "$HOME/.cargo/env" + + if [ "$(free -g | awk '/^Mem:/{print $2}')" -lt 12 ]; then + if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then + sudo fallocate -l 8G /swapfile + sudo chmod 600 /swapfile + sudo mkswap /swapfile + sudo swapon /swapfile + fi + fi + + sudo apt-get update -y + sudo apt-get install -y gcc git + + if [ ! -d arrow-datafusion ]; then + git clone https://github.com/apache/arrow-datafusion.git + fi + cd arrow-datafusion + git checkout 53.1.0 + CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" \ + cargo build --release --package datafusion-cli --bin datafusion-cli +fi diff --git a/datafusion-partitioned/load b/datafusion-partitioned/load new file mode 100755 index 0000000000..275c2c6ae5 --- /dev/null +++ b/datafusion-partitioned/load @@ -0,0 +1,9 @@ +#!/bin/bash +# datafusion queries the parquet files via an external table at LOCATION +# 'partitioned' (see create.sql). The shared bench_download fetches the +# parquet files into CWD; move them into the expected subdir. 
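+#
+# Resulting layout that create.sql's LOCATION 'partitioned' points at (file
+# names are whatever the shared download script produced, e.g.):
+#   partitioned/hits_0.parquet
+#   partitioned/hits_1.parquet
+#   ...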
+set -e + +mkdir -p partitioned +mv hits_*.parquet partitioned/ 2>/dev/null || true +sync diff --git a/datafusion-partitioned/query b/datafusion-partitioned/query new file mode 100755 index 0000000000..c3625f4b1b --- /dev/null +++ b/datafusion-partitioned/query @@ -0,0 +1,24 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via datafusion-cli using create.sql +# to define the hits view, then the query. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +set -e + +DF=arrow-datafusion/target/release/datafusion-cli + +query=$(cat) +tmp=$(mktemp /tmp/datafusion.XXXXXX.sql) +trap 'rm -f "$tmp"' EXIT +printf '%s\n' "$query" > "$tmp" + +out=$("$DF" -f create.sql "$tmp" 2>&1) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + printf '%s\n' "$out" >&2 + exit "$status" +fi + +printf '%s\n' "$out" | grep -v 'Elapsed' || true + +printf '%s\n' "$out" | awk '/Elapsed/ { e = $2 } END { print e }' >&2 diff --git a/datafusion-partitioned/results/20260509/c6a.4xlarge.json b/datafusion-partitioned/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..696ecda107 --- /dev/null +++ b/datafusion-partitioned/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "DataFusion (Parquet, partitioned)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 18, + "data_size": 14737666736, + "result": [ + [0.043, 0.002, 0.002], + [0.095, 0.022, 0.022], + [0.202, 0.081, 0.081], + [0.467, 0.075, 0.076], + [1, 0.737, 0.751], + [1.02, 0.647, 0.653], + [0.061, 0.006, 0.006], + [0.107, 0.024, 0.025], + [0.941, 0.815, 0.816], + [1.645, 1.002, 1.021], + [0.569, 0.175, 0.173], + [0.74, 0.19, 0.19], + [1.251, 0.707, 0.718], + [2.587, 1.435, 1.25], + [1.122, 0.7, 0.693], + [0.886, 0.755, 0.766], + [2.575, 1.603, 1.714], + [2.67, 1.474, 1.474], + [5.564, 3.635, 3.182], + [0.239, 0.064, 0.066], + [10.068, 0.916, 0.912], + [11.456, 1.1, 1.093], + [22.24, 2.199, 2.447], + [52.641, 8.003, 9.193], + [0.367, 0.111, 0.108], + [1.119, 0.202, 0.197], + [0.324, 0.111, 0.113], + [10.413, 1.264, 1.339], + [9.372, 8.935, 8.838], + [0.576, 0.49, 0.475], + [3.067, 0.658, 0.675], + [6.819, 0.822, 0.803], + [4.935, 3.331, 3.107], + [10.146, 3.211, 3.413], + [10.169, 3.248, 3.762], + [1.333, 1.128, 1.018], + [0.31, 0.173, 0.172], + [0.152, 0.053, 0.063], + [0.233, 0.094, 0.094], + [0.527, 0.306, 0.304], + [0.136, 0.029, 0.028], + [0.125, 0.025, 0.024], + [0.117, 0.022, 0.026] +] +} + diff --git a/datafusion-partitioned/run.sh b/datafusion-partitioned/run.sh deleted file mode 100755 index 2e1c36109e..0000000000 --- a/datafusion-partitioned/run.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -echo $1 -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo "$query" > /tmp/query.sql - - echo -n "[" - for i in $(seq 1 $TRIES); do - # 1. there will be two query result, one for creating table another for executing the select statement - # 2. each query contains a "Query took xxx seconds", we just grep these 2 lines - # 3. use sed to take the second line - # 4. 
use awk to take the number we want - RES=$(datafusion-cli -f create.sql /tmp/query.sql 2>&1 | grep "Elapsed" |tail -1| awk '{ print $2 }') - [[ $RES != "" ]] && \ - echo -n "$RES" || \ - echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/datafusion-partitioned/start b/datafusion-partitioned/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/datafusion-partitioned/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/datafusion-partitioned/stop b/datafusion-partitioned/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/datafusion-partitioned/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/datafusion-vortex-partitioned/benchmark.sh b/datafusion-vortex-partitioned/benchmark.sh index a257b10eb3..7af8d95433 100755 --- a/datafusion-vortex-partitioned/benchmark.sh +++ b/datafusion-vortex-partitioned/benchmark.sh @@ -1,33 +1,6 @@ #!/bin/bash - -set -euo pipefail - -# Install Rust -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh -bash rust-init.sh -y -export HOME=${HOME:=~} -source ~/.cargo/env - -# Install Dependencies -sudo apt-get update -y -sudo apt-get install -y gcc jq build-essential - -# Install Vortex from latest release main branch -git clone https://github.com/spiraldb/vortex.git || true -cd vortex -git checkout 0.44.0 -git submodule update --init -# We build a release version of the benchmarking utility using mimalloc, just like the datafusion-cli -cargo build --release --bin query_bench --package bench-vortex -export PATH="`pwd`/target/release:$PATH" -cd .. - -# Vortex's benchmarking utility generates appropriate Vortex files by itself, so we just run it to make sure they exist before we start measuring. -# This will download parquet files (with time and string columns already converted to the logically correct datatype) and generate Vortex files from them. -echo -n "Load time: " -command time -f '%e' query_bench clickbench -i 1 --targets datafusion:vortex --display-format gh-json -q 0 --hide-progress-bar --flavor partitioned - -# Run benchmarks for single parquet and partitioned, our CLI generates the relevant vortex files. -./run.sh partitioned - -echo "Data size: $(find . -name '*.vortex' | xargs wc -c | grep total)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +# query_bench (the vortex driver) handles its own dataset download/conversion. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/datafusion-vortex-partitioned/check b/datafusion-vortex-partitioned/check new file mode 100755 index 0000000000..36fd549725 --- /dev/null +++ b/datafusion-vortex-partitioned/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +[ -x vortex/target/release/query_bench ] diff --git a/datafusion-vortex-partitioned/data-size b/datafusion-vortex-partitioned/data-size new file mode 100755 index 0000000000..ec6675b0ed --- /dev/null +++ b/datafusion-vortex-partitioned/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Sum the byte counts of all generated .vortex files. +find . -name '*.vortex' -printf '%s\n' | awk '{s+=$1} END {print s+0}' diff --git a/datafusion-vortex-partitioned/install b/datafusion-vortex-partitioned/install new file mode 100755 index 0000000000..a6ec131cc4 --- /dev/null +++ b/datafusion-vortex-partitioned/install @@ -0,0 +1,32 @@ +#!/bin/bash +set -e + +VORTEX_VERSION=0.44.0 + +if ! 
command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh + bash rust-init.sh -y +fi +export HOME=${HOME:=~} +# shellcheck disable=SC1091 +source ~/.cargo/env + +sudo apt-get update -y +# vortex-duckdb's build.rs runs bindgen, which needs libclang plus the +# clang freestanding headers (stdbool.h etc.); without libclang-dev the +# build fails with `'stdbool.h' file not found`. +sudo apt-get install -y gcc jq build-essential git clang libclang-dev + +if [ ! -d vortex ]; then + git clone https://github.com/spiraldb/vortex.git +fi +( + cd vortex + git fetch --tags + git checkout "$VORTEX_VERSION" + # See datafusion-vortex/install — submodule update isn't idempotent + # without sync + --force when a previous run left a partial clone. + git submodule sync --recursive + git submodule update --init --recursive --force + cargo build --release --bin query_bench --package bench-vortex +) diff --git a/datafusion-vortex-partitioned/load b/datafusion-vortex-partitioned/load new file mode 100755 index 0000000000..73fa9f9a45 --- /dev/null +++ b/datafusion-vortex-partitioned/load @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +# query_bench fetches Parquet and converts to .vortex on first invocation. +export PATH="$PWD/vortex/target/release:$PATH" + +query_bench clickbench -i 1 \ + --targets datafusion:vortex \ + --display-format gh-json \ + --queries-file ./queries.sql \ + --flavor partitioned \ + --hide-progress-bar \ + -q 0 >/dev/null + +sync diff --git a/datafusion-vortex-partitioned/query b/datafusion-vortex-partitioned/query new file mode 100755 index 0000000000..8a40f770a1 --- /dev/null +++ b/datafusion-vortex-partitioned/query @@ -0,0 +1,31 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via vortex's `query_bench clickbench`. +# Stdout: JSON result emitted by the driver (gh-json display format). +# Stderr: query runtime in fractional seconds on the last line, derived +# from .value (nanoseconds) in the JSON. +# Exit non-zero on error. 
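+#
+# The gh-json schema is not spelled out in this change; the only field used
+# below is `.value`, assumed to be elapsed nanoseconds, e.g.:
+#   {"value": 1234567890, ...}   ->   "1.235" printed to stderr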
+set -e + +export PATH="$PWD/vortex/target/release:$PATH" + +tmp=$(mktemp --suffix=.sql) +trap 'rm -f "$tmp"' EXIT +cat > "$tmp" + +raw=$(RUST_LOG=off query_bench clickbench -i 1 \ + --targets datafusion:vortex \ + --display-format gh-json \ + --queries-file "$tmp" \ + --flavor partitioned \ + --hide-progress-bar \ + -q 0) + +ns=$(printf '%s' "$raw" | jq -r '.value // empty') +if [ -z "$ns" ]; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" + +awk -v n="$ns" 'BEGIN { printf "%.3f\n", n / 1e9 }' >&2 diff --git a/datafusion-vortex-partitioned/results/20260509/c6a.4xlarge.json b/datafusion-vortex-partitioned/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..7de07a53a6 --- /dev/null +++ b/datafusion-vortex-partitioned/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "DataFusion (Vortex, partitioned)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 161, + "data_size": 15812419580, + "result": [ + [0.217, 0.025, 0.025], + [0.21, 0.04, 0.039], + [0.258, 0.076, 0.075], + [0.78, 0.118, 0.118], + [1.274, 0.763, 0.74], + [1.504, 0.77, 0.775], + [0.185, 0.026, 0.025], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [13.425, 0.577, 0.576], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [0.684, 0.428, 0.441], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null] +] +} + diff --git a/datafusion-vortex-partitioned/run.sh b/datafusion-vortex-partitioned/run.sh deleted file mode 100755 index fff2523007..0000000000 --- a/datafusion-vortex-partitioned/run.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# Check if an argument is provided -if [ "$#" -ne 1 ]; then - echo "Usage: $0 [single|partitioned]" - exit 1 -fi - -# Set the SQL file based on the argument -if [ "$1" == "single" ] || [ "$1" == "partitioned" ]; then - FLAVOR=$1 - echo "Running benchmark for $FLAVOR" -else - echo "Invalid argument. Please use 'single' or 'partitioned'." 
- exit 1 -fi - -# clear results file -touch results.csv -> results.csv - -TRIES=3 -OS=$(uname -s) - -for query_num in $(seq 0 42); do - sync - - if [ "$OS" = "Linux" ]; then - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - elif [ "$OS" = "Darwin" ]; then - sudo purge - fi - - echo -n "[" - for i in $(seq 1 $TRIES); do - # Parse query results out of the JSON output, which reports the time in ns - RES=$(RUST_LOG=off query_bench clickbench -i 1 --flavor $FLAVOR --targets datafusion:vortex --display-format gh-json --queries-file ./queries.sql -q $query_num --hide-progress-bar | jq ".value / 1000000000") - - [[ $RES != "" ]] && \ - echo -n "$RES" || \ - echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - echo "${query_num},${i},${RES}" >> results.csv - done - echo "]," -done diff --git a/datafusion-vortex-partitioned/start b/datafusion-vortex-partitioned/start new file mode 100755 index 0000000000..cdc9ab3ccf --- /dev/null +++ b/datafusion-vortex-partitioned/start @@ -0,0 +1,3 @@ +#!/bin/bash +# Embedded CLI — no server lifecycle. +exit 0 diff --git a/datafusion-vortex-partitioned/stop b/datafusion-vortex-partitioned/stop new file mode 100755 index 0000000000..cdc9ab3ccf --- /dev/null +++ b/datafusion-vortex-partitioned/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# Embedded CLI — no server lifecycle. +exit 0 diff --git a/datafusion-vortex/benchmark.sh b/datafusion-vortex/benchmark.sh index 4160f145c9..f2216ef3b0 100755 --- a/datafusion-vortex/benchmark.sh +++ b/datafusion-vortex/benchmark.sh @@ -1,33 +1,6 @@ #!/bin/bash - -set -euo pipefail - -# Install Rust -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh -bash rust-init.sh -y -export HOME=${HOME:=~} -source ~/.cargo/env - -# Install Dependencies -sudo apt-get update -y -sudo apt-get install -y gcc jq build-essential - -# Install Vortex from latest release main branch -git clone https://github.com/spiraldb/vortex.git || true -cd vortex -git checkout 0.34.0 -git submodule update --init -# We build a release version of the benchmarking utility using mimalloc, just like the datafusion-cli -cargo build --release --bin clickbench --package bench-vortex -export PATH="`pwd`/target/release:$PATH" -cd .. - -# Vortex's benchmarking utility generates appropriate Vortex files by itself, so we just run it to make sure they exist before we start measuring. -# This will download parquet files (with time and string columns already converted to the logically correct datatype) and generate Vortex files from them. -echo -n "Load time: " -command time -f '%e' clickbench -i 1 --targets datafusion:vortex --display-format gh-json -q 0 --hide-progress-bar --flavor single - -# Run benchmarks for single parquet and partitioned, our CLI generates the relevant vortex files. -./run.sh single - -echo "Data size: $(find . -name '*.vortex' | xargs wc -c | grep total)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +# clickbench (the vortex driver) handles its own dataset download/conversion. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/datafusion-vortex/check b/datafusion-vortex/check new file mode 100755 index 0000000000..68ed417de6 --- /dev/null +++ b/datafusion-vortex/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Verify the clickbench binary is on PATH. 
+[ -x vortex/target/release/clickbench ] diff --git a/datafusion-vortex/data-size b/datafusion-vortex/data-size new file mode 100755 index 0000000000..ec6675b0ed --- /dev/null +++ b/datafusion-vortex/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Sum the byte counts of all generated .vortex files. +find . -name '*.vortex' -printf '%s\n' | awk '{s+=$1} END {print s+0}' diff --git a/datafusion-vortex/install b/datafusion-vortex/install new file mode 100755 index 0000000000..b04187dc3e --- /dev/null +++ b/datafusion-vortex/install @@ -0,0 +1,42 @@ +#!/bin/bash +set -e + +# 0.34.0 referenced two private spiraldb-owned submodules +# (spiraldb/duckdb and spiraldb/duckdb-rs) under duckdb-vortex/, which +# now 404 on GitHub. From 0.41.0 onward the duckdb dep moved to the +# upstream duckdb/duckdb repo, and 0.42.0+ ship without a .gitmodules +# file at all (vendored / Cargo registry deps). 0.44.0 matches what +# datafusion-vortex-partitioned uses. +VORTEX_VERSION=0.44.0 + +if ! command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh + bash rust-init.sh -y +fi +export HOME=${HOME:=~} +# shellcheck disable=SC1091 +source ~/.cargo/env + +sudo apt-get update -y +# vortex-duckdb's build.rs runs bindgen, which needs libclang plus the +# clang freestanding headers (stdbool.h etc.); without libclang-dev the +# build fails with `'stdbool.h' file not found`. +sudo apt-get install -y gcc jq build-essential git clang libclang-dev + +if [ ! -d vortex ]; then + git clone https://github.com/spiraldb/vortex.git +fi +( + cd vortex + git fetch --tags + git checkout "$VORTEX_VERSION" + # `git submodule update --init` fails with + # "fatal: destination path 'duckdb-vortex/duckdb' exists and is not an + # empty directory" once the submodule has been cloned but isn't fully + # registered (a partial state previous runs leave behind). `sync` + # refreshes the configured URLs and `--force` re-checkouts cleanly, + # which is what we want for an idempotent setup. + git submodule sync --recursive + git submodule update --init --recursive --force + cargo build --release --bin clickbench --package bench-vortex +) diff --git a/datafusion-vortex/load b/datafusion-vortex/load new file mode 100755 index 0000000000..69d6b4c0f2 --- /dev/null +++ b/datafusion-vortex/load @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +# clickbench fetches Parquet and converts to .vortex on first invocation. +# Run query 0 once with the bundled queries.sql to trigger conversion; +# the wrapping driver will time the load via wall clock around this script. +export PATH="$PWD/vortex/target/release:$PATH" + +clickbench -i 1 \ + --targets datafusion:vortex \ + --display-format gh-json \ + --queries-file ./queries.sql \ + --flavor single \ + --hide-progress-bar \ + -q 0 >/dev/null + +sync diff --git a/datafusion-vortex/query b/datafusion-vortex/query new file mode 100755 index 0000000000..d3518227ad --- /dev/null +++ b/datafusion-vortex/query @@ -0,0 +1,33 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via the vortex `clickbench` driver. +# Stdout: JSON result emitted by clickbench (gh-json display format). +# Stderr: query runtime in fractional seconds on the last line, derived +# from .value (nanoseconds) in the JSON. +# Exit non-zero on error. +set -e + +export PATH="$PWD/vortex/target/release:$PATH" + +# clickbench addresses queries by index in a queries-file. Stage the input +# query as the only entry in a temp file and pass -q 0. 
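+# (The deleted run.sh iterated -q 0..42 over the full queries.sql; with a
+# one-line temp file, index 0 always selects the staged query.)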
+tmp=$(mktemp --suffix=.sql) +trap 'rm -f "$tmp"' EXIT +cat > "$tmp" + +raw=$(RUST_LOG=off clickbench -i 1 \ + --targets datafusion:vortex \ + --display-format gh-json \ + --queries-file "$tmp" \ + --flavor single \ + --hide-progress-bar \ + -q 0) + +ns=$(printf '%s' "$raw" | jq -r '.value // empty') +if [ -z "$ns" ]; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" + +awk -v n="$ns" 'BEGIN { printf "%.3f\n", n / 1e9 }' >&2 diff --git a/datafusion-vortex/run.sh b/datafusion-vortex/run.sh deleted file mode 100755 index 62b38d886e..0000000000 --- a/datafusion-vortex/run.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# Check if an argument is provided -if [ "$#" -ne 1 ]; then - echo "Usage: $0 [single|partitioned]" - exit 1 -fi - -# Set the SQL file based on the argument -if [ "$1" == "single" ] || [ "$1" == "partitioned" ]; then - FLAVOR=$1 - echo "Running benchmark for $FLAVOR" -else - echo "Invalid argument. Please use 'single' or 'partitioned'." - exit 1 -fi - -# clear results file -touch results.csv -> results.csv - -TRIES=3 -OS=$(uname -s) - -for query_num in $(seq 0 42); do - sync - - if [ "$OS" = "Linux" ]; then - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - elif [ "$OS" = "Darwin" ]; then - sudo purge - fi - - echo -n "[" - for i in $(seq 1 $TRIES); do - # Parse query results out of the JSON output, which reports the time in ns - RES=$(RUST_LOG=off clickbench -i 1 --flavor $FLAVOR --targets datafusion:vortex --display-format gh-json --queries-file ./queries.sql -q $query_num --hide-progress-bar | jq ".value / 1000000000") - - [[ $RES != "" ]] && \ - echo -n "$RES" || \ - echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - echo "${query_num},${i},${RES}" >> results.csv - done - echo "]," -done diff --git a/datafusion-vortex/start b/datafusion-vortex/start new file mode 100755 index 0000000000..cdc9ab3ccf --- /dev/null +++ b/datafusion-vortex/start @@ -0,0 +1,3 @@ +#!/bin/bash +# Embedded CLI — no server lifecycle. +exit 0 diff --git a/datafusion-vortex/stop b/datafusion-vortex/stop new file mode 100755 index 0000000000..cdc9ab3ccf --- /dev/null +++ b/datafusion-vortex/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# Embedded CLI — no server lifecycle. +exit 0 diff --git a/datafusion/benchmark.sh b/datafusion/benchmark.sh index 3296289e72..fc4bacc8f3 100755 --- a/datafusion/benchmark.sh +++ b/datafusion/benchmark.sh @@ -1,52 +1,5 @@ #!/bin/bash - -echo "Install Rust" -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh -bash rust-init.sh -y -export HOME=${HOME:=~} -source ~/.cargo/env - -WITH_SWAP=false - -if [ $(free -g | awk '/^Mem:/{print $2}') -lt 12 ]; then - echo "LOW MEMORY MODE" - # Enable swap if not already enabled. This is needed both for rustc and until we have a better - # solution for low memory machines, see - # https://github.com/apache/datafusion/issues/18473 - if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then - echo "Enabling 8G swap" - sudo fallocate -l 8G /swapfile - sudo chmod 600 /swapfile - sudo mkswap /swapfile - sudo swapon /swapfile - WITH_SWAP=true - fi -fi - - -echo "Install Dependencies" -sudo apt-get update -y -sudo apt-get install -y gcc - -echo "Install DataFusion main branch" -git clone https://github.com/apache/arrow-datafusion.git -cd arrow-datafusion/ -git checkout 53.1.0 -CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli -export PATH="`pwd`/target/release:$PATH" -cd .. 
- -echo "Download benchmark target data, single file" -../download-hits-parquet-single - -echo "Run benchmarks" -./run.sh - -echo "Load time: 0" -echo "Data size: $(du -bcs hits.parquet)" - -if [ "$WITH_SWAP" = true ]; then - echo "Disable swap" - sudo swapoff /swapfile - sudo rm /swapfile -fi +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/datafusion/check b/datafusion/check new file mode 100755 index 0000000000..52f2d25863 --- /dev/null +++ b/datafusion/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +DF=arrow-datafusion/target/release/datafusion-cli +"$DF" -c "SELECT 1" >/dev/null diff --git a/datafusion/data-size b/datafusion/data-size new file mode 100755 index 0000000000..708c0b72e7 --- /dev/null +++ b/datafusion/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < hits.parquet diff --git a/datafusion/install b/datafusion/install new file mode 100755 index 0000000000..8f4cee6f1e --- /dev/null +++ b/datafusion/install @@ -0,0 +1,37 @@ +#!/bin/bash +set -e + +# Build datafusion-cli from source. Idempotent: only build if the binary +# isn't already in arrow-datafusion/target/release/datafusion-cli. + +if [ ! -x arrow-datafusion/target/release/datafusion-cli ]; then + # Rust toolchain. + if ! command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh + bash rust-init.sh -y + fi + export HOME=${HOME:=~} + # shellcheck disable=SC1091 + source "$HOME/.cargo/env" + + # Low-memory hosts need swap to compile datafusion-cli. + if [ "$(free -g | awk '/^Mem:/{print $2}')" -lt 12 ]; then + if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then + sudo fallocate -l 8G /swapfile + sudo chmod 600 /swapfile + sudo mkswap /swapfile + sudo swapon /swapfile + fi + fi + + sudo apt-get update -y + sudo apt-get install -y gcc git + + if [ ! -d arrow-datafusion ]; then + git clone https://github.com/apache/arrow-datafusion.git + fi + cd arrow-datafusion + git checkout 53.1.0 + CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" \ + cargo build --release --package datafusion-cli --bin datafusion-cli +fi diff --git a/datafusion/load b/datafusion/load new file mode 100755 index 0000000000..96ed6eea58 --- /dev/null +++ b/datafusion/load @@ -0,0 +1,6 @@ +#!/bin/bash +# datafusion queries hits.parquet directly via an external table created +# inline in each query (see create.sql, executed by ./query). No persistent +# database to load. +set -e +sync diff --git a/datafusion/query b/datafusion/query new file mode 100755 index 0000000000..65cc944ea0 --- /dev/null +++ b/datafusion/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via datafusion-cli using create.sql +# to define the hits view, then the query. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +DF=arrow-datafusion/target/release/datafusion-cli + +query=$(cat) +tmp=$(mktemp /tmp/datafusion.XXXXXX.sql) +trap 'rm -f "$tmp"' EXIT +printf '%s\n' "$query" > "$tmp" + +out=$("$DF" -f create.sql "$tmp" 2>&1) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + printf '%s\n' "$out" >&2 + exit "$status" +fi + +# Print everything that's not an "Elapsed" timing line as the result. +printf '%s\n' "$out" | grep -v 'Elapsed' || true + +# datafusion-cli prints `... Elapsed X.YYY seconds.` for each statement; the +# last one is for the actual query. 
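+#
+# Illustrative output (timings made up): create.sql plus the query yields two
+# timing lines, and the awk below keeps the last one:
+#   Elapsed 0.012 seconds.      <- CREATE EXTERNAL TABLE from create.sql
+#   Elapsed 1.384 seconds.      <- the benchmark query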
+printf '%s\n' "$out" | awk '/Elapsed/ { e = $2 } END { print e }' >&2 diff --git a/datafusion/results/20260509/c6a.4xlarge.json b/datafusion/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..de464ccb39 --- /dev/null +++ b/datafusion/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "DataFusion (Parquet, single)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 10, + "data_size": 14779976446, + "result": [ + [0.064, 0.001, 0.001], + [0.141, 0.035, 0.036], + [0.194, 0.063, 0.062], + [0.347, 0.067, 0.068], + [0.943, 0.675, 0.668], + [0.871, 0.724, 0.718], + [0.1, 0.006, 0.006], + [0.16, 0.038, 0.037], + [0.996, 0.814, 0.839], + [1.317, 0.928, 0.907], + [0.476, 0.183, 0.181], + [0.544, 0.199, 0.202], + [0.963, 0.759, 0.763], + [2.446, 1.127, 1.133], + [0.977, 0.742, 0.737], + [0.941, 0.781, 0.78], + [2.525, 1.515, 1.512], + [2.479, 1.485, 1.503], + [4.858, 2.946, 2.969], + [0.238, 0.078, 0.077], + [9.914, 0.968, 0.951], + [11.295, 1.208, 1.177], + [22.273, 2.921, 2.952], + [55.791, 9.639, 9.645], + [2.638, 0.406, 0.402], + [0.787, 0.317, 0.316], + [2.641, 0.419, 0.414], + [9.774, 1.227, 1.177], + [9.977, 9.316, 9.296], + [0.584, 0.484, 0.471], + [2.78, 0.738, 0.741], + [6.244, 0.89, 0.886], + [4.484, 3.026, 3.018], + [10.016, 3.313, 3.3], + [10.068, 3.301, 3.298], + [1.173, 1.052, 1.059], + [0.338, 0.162, 0.159], + [0.228, 0.109, 0.111], + [0.324, 0.112, 0.11], + [0.591, 0.288, 0.284], + [0.202, 0.042, 0.042], + [0.18, 0.039, 0.039], + [0.174, 0.038, 0.038] +] +} + diff --git a/datafusion/run.sh b/datafusion/run.sh deleted file mode 100755 index cd1059ac31..0000000000 --- a/datafusion/run.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -echo $1 -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo "$query" > /tmp/query.sql - - echo -n "[" - for i in $(seq 1 $TRIES); do - # 1. there will be two query result, one for creating table another for executing the select statement - # 2. each query contains a "Query took xxx seconds", we just grep these 2 lines - # 3. use sed to take the second line - # 4. use awk to take the number we want - RES=$(datafusion-cli -f create.sql /tmp/query.sql 2>&1 | grep "Elapsed" |tail -1 | awk '{ print $2 }') - [[ $RES != "" ]] && \ - echo -n "$RES" || \ - echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/datafusion/start b/datafusion/start new file mode 100755 index 0000000000..e53151aba8 --- /dev/null +++ b/datafusion/start @@ -0,0 +1,3 @@ +#!/bin/bash +# datafusion-cli is an embedded CLI tool — no daemon to start. +exit 0 diff --git a/datafusion/stop b/datafusion/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/datafusion/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/doris-parquet/benchmark.sh b/doris-parquet/benchmark.sh index e287ac3e1c..6a7f45d3a1 100755 --- a/doris-parquet/benchmark.sh +++ b/doris-parquet/benchmark.sh @@ -1,94 +1,5 @@ #!/bin/bash - -set -e - -# This benchmark should run on Ubuntu 22.04 - -# Install -url='https://apache-doris-releases.oss-accelerate.aliyuncs.com/apache-doris-3.0.5-bin-x64.tar.gz' -# Download -file_name="$(basename ${url})" -if [[ "$url" == "http"* ]]; then - if [[ ! 
-f $file_name ]]; then - wget --continue --progress=dot:giga ${url} - else - echo "$file_name already exists, no need to download." - fi -fi -dir_name="${file_name/.tar.gz/}" - -# Try to stop Doris and remove it first if execute this script multiple times -set +e -"$dir_name"/apache-doris-3.0.5-bin-x64/fe/bin/stop_fe.sh -"$dir_name"/apache-doris-3.0.5-bin-x64/be/bin/stop_be.sh -rm -rf "$dir_name" -set -e - -# Uncompress -mkdir "$dir_name" -tar zxf "$file_name" -C "$dir_name" -DORIS_HOME="$dir_name/apache-doris-3.0.5-bin-x64" -export DORIS_HOME - -# Install dependencies -sudo apt-get update -y -sudo apt-get install -y openjdk-17-jdk -sudo apt-get install -y mysql-client -export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" -export PATH=$JAVA_HOME/bin:$PATH - -sudo systemctl disable unattended-upgrades -sudo systemctl stop unattended-upgrades - -"$DORIS_HOME"/fe/bin/start_fe.sh --daemon - -# Start Backend -sudo sysctl -w vm.max_map_count=2000000 -ulimit -n 65535 -"$DORIS_HOME"/be/bin/start_be.sh --daemon - -# Wait for Frontend ready -for _ in {1..300} -do - fe_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show frontends' | cut -f16 | sed -n '2,$p') - if [[ -n "${fe_version}" ]] && [[ "${fe_version}" != "NULL" ]]; then - echo "Frontend version: ${fe_version}" - break - else - echo 'Wait for Frontend ready ...' - sleep 2 - fi -done - -# Setup cluster, add Backend to cluster -mysql -h 127.0.0.1 -P9030 -uroot -e "ALTER SYSTEM ADD BACKEND '127.0.0.1:9050' " - -# Wait for Backend ready -for _ in {1..300} -do - be_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show backends' | cut -f22 | sed -n '2,$p') - if [[ -n "${be_version}" ]]; then - echo "Backend version: ${be_version}" - break - else - echo 'Wait for Backend ready ...' - sleep 2 - fi -done - -# Download Parquet files -../download-hits-parquet-partitioned "$DORIS_HOME/be" - -# Run the queries -mysql -h127.1 -P9030 -uroot -vvv < create.sql - -./run.sh 2>&1 | tee -a log.txt - -cat log.txt | - grep -P 'rows? in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo "Load time: 0" -echo "Data size: $(find "$DORIS_HOME/be/" -name '*.parquet' | xargs wc -c | grep total)" +# Thin shim — actual flow is in lib/benchmark-common.sh. 
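+# (Assumed contract with the shared driver, inferred from the other shims in
+# this change rather than documented here:
+#   BENCH_DOWNLOAD_SCRIPT  shared dataset download helper; "" means the system
+#                          fetches its own data
+#   BENCH_RESTARTABLE      yes for server systems managed via ./start and ./stop,
+#                          no for embedded CLIs
+#   BENCH_CHECK_TIMEOUT    optional; seconds to poll ./check after ./start,
+#                          defaulting to 300 in the lib.)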
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/doris-parquet/check b/doris-parquet/check new file mode 100755 index 0000000000..c6e836c8c1 --- /dev/null +++ b/doris-parquet/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null diff --git a/doris-parquet/data-size b/doris-parquet/data-size new file mode 100755 index 0000000000..992250bc68 --- /dev/null +++ b/doris-parquet/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +DORIS_HOME=$(cat .doris_home) +find "$DORIS_HOME/be/" -name 'hits_*.parquet' -printf '%s\n' \ + | awk '{ s += $1 } END { print s+0 }' diff --git a/doris-parquet/install b/doris-parquet/install new file mode 100755 index 0000000000..7dc603e916 --- /dev/null +++ b/doris-parquet/install @@ -0,0 +1,27 @@ +#!/bin/bash +set -e + +# This benchmark runs on Ubuntu 22.04+ +ROOT=$(pwd) +URL='https://apache-doris-releases.oss-accelerate.aliyuncs.com/apache-doris-3.0.5-bin-x64.tar.gz' + +file_name="$(basename "$URL")" +dir_name="${file_name/.tar.gz/}" +DORIS_HOME="$ROOT/$dir_name/apache-doris-3.0.5-bin-x64" + +if [ ! -d "$DORIS_HOME" ]; then + if [ ! -f "$file_name" ]; then + wget --continue --progress=dot:giga "$URL" + fi + mkdir -p "$dir_name" + tar zxf "$file_name" -C "$dir_name" +fi + +sudo apt-get update -y +sudo apt-get install -y openjdk-17-jdk mysql-client bc + +sudo systemctl disable unattended-upgrades 2>/dev/null || true +sudo systemctl stop unattended-upgrades 2>/dev/null || true +sudo sysctl -w vm.max_map_count=2000000 + +echo "$DORIS_HOME" > .doris_home diff --git a/doris-parquet/load b/doris-parquet/load new file mode 100755 index 0000000000..def51c13f0 --- /dev/null +++ b/doris-parquet/load @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +ROOT=$(pwd) +DORIS_HOME=$(cat .doris_home) +export DORIS_HOME + +# The dataset must be visible to the BE process (TVF reads local files +# relative to the BE working dir). +"$ROOT/../lib/download-hits-parquet-partitioned" "$DORIS_HOME/be" + +# Create the view that wraps a local() TVF over the parquet files. Idempotent +# (CREATE OR REPLACE / IF NOT EXISTS). +mysql -h127.0.0.1 -P9030 -uroot < "$ROOT/create.sql" + +# Pre-set parquet flags to match original benchmark. +mysql -h127.0.0.1 -P9030 -uroot \ + -e 'set global enable_parquet_filter_by_min_max=true; set global enable_parquet_lazy_materialization=true;' + +# Note: data files remain — for parquet-as-storage there's no separate copy, +# the TVF reads them directly. +sync diff --git a/doris-parquet/query b/doris-parquet/query new file mode 100755 index 0000000000..67aebec9ca --- /dev/null +++ b/doris-parquet/query @@ -0,0 +1,33 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via mysql client against Doris's `hits` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Clear the FE/BE caches before each query (parquet path). +curl -sS http://127.0.0.1:8040/api/clear_cache/all >/dev/null 2>&1 || true + +out=$(mysql -vvv -h127.0.0.1 -P9030 -uroot hits -e "$query" 2>&1) || status=$? +status=${status:-0} + +printf '%s\n' "$out" | grep -vP '^\([0-9.]+\s+sec\)$|rows? 
in set|Empty set' + +if [ "$status" -ne 0 ] || printf '%s\n' "$out" | grep -qE '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +secs=$(printf '%s\n' "$out" \ + | grep -oP '\((?:([0-9.]+)\s+min\s+)?([0-9.]+)\s+sec\)' \ + | tail -n1 \ + | sed -r 's/\((([0-9.]+) min )?([0-9.]+) sec\)/\2 \3/' \ + | awk '{ if ($2 != "") print $1*60 + $2; else print $1 }') + +if [ -z "$secs" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi +printf '%s\n' "$secs" >&2 diff --git a/doris-parquet/results/20260509/c6a.4xlarge.json b/doris-parquet/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..e043a316e0 --- /dev/null +++ b/doris-parquet/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Apache Doris (Parquet, partitioned)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","MySQL compatible","ClickHouse derivative","lukewarm-cold-run"], + "load_time": 72, + "data_size": 14737666736, + "result": [ + [3.15, 0.23, 0.23], + [0.43, 0.11, 0.12], + [0.41, 0.19, 0.17], + [0.69, 0.21, 0.2], + [1.03, 0.77, 0.74], + [1.29, 1.03, 1.01], + [0.35, 0.14, 0.12], + [0.46, 0.14, 0.12], + [1.33, 1.08, 1.12], + [1.42, 0.91, 0.87], + [0.92, 0.43, 0.42], + [0.86, 0.47, 0.45], + [1.14, 0.75, 0.7], + [2.5, 1.63, 1.61], + [1.49, 1.07, 1.06], + [1.19, 1.01, 0.94], + [3.82, 2.87, 2.97], + [2.41, 0.44, 0.42], + [6.43, 3.2, 3.24], + [0.42, 0.14, 0.12], + [9.81, 1.02, 1.01], + [11.55, 1.27, 1.22], + [22.35, 2.67, 2.71], + [53.98, 6.66, 6.61], + [3.01, 0.47, 0.46], + [1.01, 0.3, 0.3], + [3.02, 0.47, 0.47], + [9.89, 1.08, 1.1], + [10.12, 9.78, 9.83], + [0.5, 0.19, 0.16], + [2.63, 0.9, 0.87], + [6.4, 1.18, 1.17], + [5.71, 4.62, 4.75], + [11.74, 5.06, 5.06], + [11.69, 5.03, 4.89], + [1.14, 0.77, 0.75], + [0.45, 0.13, 0.13], + [0.38, 0.13, 0.13], + [0.36, 0.11, 0.09], + [0.33, 0.11, 0.08], + [0.36, 0.11, 0.1], + [0.35, 0.1, 0.09], + [0.33, 0.1, 0.09] +] +} + diff --git a/doris-parquet/run.sh b/doris-parquet/run.sh deleted file mode 100755 index a6438a8b70..0000000000 --- a/doris-parquet/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -mysql -h127.1 -P9030 -uroot -e 'set global enable_parquet_filter_by_min_max=true; set global enable_parquet_lazy_materialization=true;' -while read -r query; do - curl -sS http://127.0.0.1:8040/api/clear_cache/all - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - for i in $(seq 1 $TRIES); do - mysql -vvv -h127.1 -P9030 -uroot hits -e "${query}" - done -done < queries.sql diff --git a/doris-parquet/start b/doris-parquet/start new file mode 100755 index 0000000000..79ac837273 --- /dev/null +++ b/doris-parquet/start @@ -0,0 +1,35 @@ +#!/bin/bash +set -e + +DORIS_HOME=$(cat .doris_home) +export DORIS_HOME +export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" +export PATH=$JAVA_HOME/bin:$PATH + +if mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +ulimit -n 65535 + +"$DORIS_HOME"/fe/bin/start_fe.sh --daemon +"$DORIS_HOME"/be/bin/start_be.sh --daemon + +for _ in $(seq 1 300); do + fe_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show frontends' 2>/dev/null | cut -f16 | sed -n '2,$p') + if [ -n "$fe_version" ] && [ "$fe_version" != "NULL" ]; then + break + fi + sleep 2 +done + +mysql -h127.0.0.1 -P9030 -uroot \ + -e "ALTER SYSTEM ADD BACKEND '127.0.0.1:9050'" 2>/dev/null || true + +for _ in $(seq 1 300); do + be_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 
'show backends' 2>/dev/null | cut -f22 | sed -n '2,$p') + if [ -n "$be_version" ]; then + break + fi + sleep 2 +done diff --git a/doris-parquet/stop b/doris-parquet/stop new file mode 100755 index 0000000000..d8d0385b7a --- /dev/null +++ b/doris-parquet/stop @@ -0,0 +1,6 @@ +#!/bin/bash + +DORIS_HOME=$(cat .doris_home 2>/dev/null) || exit 0 +"$DORIS_HOME"/fe/bin/stop_fe.sh 2>/dev/null || true +"$DORIS_HOME"/be/bin/stop_be.sh 2>/dev/null || true +exit 0 diff --git a/doris/benchmark.sh b/doris/benchmark.sh index 3f445758cf..6a7f45d3a1 100755 --- a/doris/benchmark.sh +++ b/doris/benchmark.sh @@ -1,255 +1,5 @@ #!/bin/bash -set -e - -# This benchmark should run on Ubuntu 20.04 - -# Install -ROOT=$(pwd) - -if [[ -n "$1" ]]; then - url="$1" -else - url='https://apache-doris-releases.oss-accelerate.aliyuncs.com/apache-doris-4.1.0-rc01-bin-x64.tar.gz' -fi -# Download -file_name="$(basename ${url})" -if [[ "$url" == "http"* ]]; then - if [[ ! -f $file_name ]]; then - wget --continue --progress=dot:giga ${url} - else - echo "$file_name already exists, no need to download." - fi -fi -dir_name="${file_name/.tar.gz/}" - -# Try to stop Doris and remove it first if execute this script multiple times -set +e -"$dir_name"/"$dir_name"/fe/bin/stop_fe.sh -"$dir_name"/"$dir_name"/be/bin/stop_be.sh -rm -rf "$dir_name" -set -e - -# Uncompress -mkdir "$dir_name" -tar zxf "$file_name" -C "$dir_name" -DORIS_HOME="$ROOT/$dir_name/$dir_name" -export DORIS_HOME - -# Install dependencies -sudo apt-get update -y -sudo apt-get install -y openjdk-17-jdk mysql-client -export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" -export PATH=$JAVA_HOME/bin:$PATH - -sudo systemctl disable unattended-upgrades -sudo systemctl stop unattended-upgrades - -"$DORIS_HOME"/fe/bin/start_fe.sh --daemon - -# Start Backend -sudo sysctl -w vm.max_map_count=2000000 -ulimit -n 65535 -# Disable internal caches so that the cold run (1st of 3 tries) is actually cold. -# Without this, the BE process keeps decoded data in its own in-memory page cache -# (`storage_page_cache`, default ~20% of RAM) and segment cache, which `drop_caches` -# does not clear, so first-run timings reflect a warm cache and underreport -# cold-run latency. -printf "\ndisable_storage_page_cache = true\n" >> "$DORIS_HOME"/be/conf/be.conf -printf "\nsegment_cache_capacity = 0\n" >> "$DORIS_HOME"/be/conf/be.conf -"$DORIS_HOME"/be/bin/start_be.sh --daemon - -# Wait for Frontend ready -for _ in {1..300} -do - fe_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show frontends' | cut -f16 | sed -n '2,$p') - if [[ -n "${fe_version}" ]] && [[ "${fe_version}" != "NULL" ]]; then - echo "Frontend version: ${fe_version}" - break - else - echo 'Wait for Frontend ready ...' - sleep 2 - fi -done - -# Setup cluster, add Backend to cluster -mysql -h 127.0.0.1 -P9030 -uroot -e "ALTER SYSTEM ADD BACKEND '127.0.0.1:9050' " - -# Wait for Backend ready -for _ in {1..300} -do - be_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show backends' | cut -f22 | sed -n '2,$p') - if [[ -n "${be_version}" ]]; then - echo "Backend version: ${be_version}" - break - else - echo 'Wait for Backend ready ...' 
- sleep 2 - fi -done - -echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - -# Create Database and table -mysql -h 127.0.0.1 -P9030 -uroot -e "CREATE DATABASE hits" -sleep 5 -mysql -h 127.0.0.1 -P9030 -uroot hits <"$ROOT"/create.sql - -# Download data -BE_DATA_DIR="$DORIS_HOME/be/" - -"$ROOT"/../download-hits-parquet-partitioned "$BE_DATA_DIR/user_files_secure" - -BE_ID=$(mysql -h127.0.0.1 -P9030 -uroot -N -e 'show backends' | awk '{print $1}' | head -1) - -CORES=$(nproc) -PARALLEL_NUM=$((CORES / 4)) -if [ "$PARALLEL_NUM" -lt 1 ]; then - echo "Computed parallel_pipeline_task_num ($PARALLEL_NUM) is less than 1 based on $CORES cores; clamping to 1." - PARALLEL_NUM=1 -fi -echo "Setting parallel_pipeline_task_num to $PARALLEL_NUM (cpu cores: $CORES, computed as CORES/4 with min 1)" - -echo "start loading hits.parquet using TVF, estimated to take about 3 minutes ..." -START=$(date +%s) -mysql -h 127.0.0.1 -P9030 -uroot hits -e "SET parallel_pipeline_task_num = $PARALLEL_NUM;\ -INSERT INTO hits SELECT - CounterID, - DATE_ADD('1970-01-01', INTERVAL EventDate DAY) AS EventDate, - UserID, - FROM_UNIXTIME(EventTime) AS EventTime, - WatchID, - JavaEnable, - Title, - GoodEvent, - ClientIP, - RegionID, - CounterClass, - OS, - UserAgent, - URL, - Referer, - IsRefresh, - RefererCategoryID, - RefererRegionID, - URLCategoryID, - URLRegionID, - ResolutionWidth, - ResolutionHeight, - ResolutionDepth, - FlashMajor, - FlashMinor, - FlashMinor2, - NetMajor, - NetMinor, - UserAgentMajor, - UserAgentMinor, - CookieEnable, - JavascriptEnable, - IsMobile, - MobilePhone, - MobilePhoneModel, - Params, - IPNetworkID, - TraficSourceID, - SearchEngineID, - SearchPhrase, - AdvEngineID, - IsArtifical, - WindowClientWidth, - WindowClientHeight, - ClientTimeZone, - FROM_UNIXTIME(ClientEventTime) AS ClientEventTime, - SilverlightVersion1, - SilverlightVersion2, - SilverlightVersion3, - SilverlightVersion4, - PageCharset, - CodeVersion, - IsLink, - IsDownload, - IsNotBounce, - FUniqID, - OriginalURL, - HID, - IsOldCounter, - IsEvent, - IsParameter, - DontCountHits, - WithHash, - HitColor, - FROM_UNIXTIME(LocalEventTime) AS LocalEventTime, - Age, - Sex, - Income, - Interests, - Robotness, - RemoteIP, - WindowName, - OpenerName, - HistoryLength, - BrowserLanguage, - BrowserCountry, - SocialNetwork, - SocialAction, - HTTPError, - SendTiming, - DNSTiming, - ConnectTiming, - ResponseStartTiming, - ResponseEndTiming, - FetchTiming, - SocialSourceNetworkID, - SocialSourcePage, - ParamPrice, - ParamOrderID, - ParamCurrency, - ParamCurrencyID, - OpenstatServiceName, - OpenstatCampaignID, - OpenstatAdID, - OpenstatSourceID, - UTMSource, - UTMMedium, - UTMCampaign, - UTMContent, - UTMTerm, - FromTag, - HasGCLID, - RefererHash, - URLHash, - CLID -FROM local( - \"file_path\" = \"user_files_secure/hits_*.parquet\", - \"backend_id\" = \"$BE_ID\", - \"format\" = \"parquet\" -) -" -END=$(date +%s) -LOADTIME=$(echo "$END - $START" | bc) -echo "Load time: $LOADTIME" -echo "$LOADTIME" > loadtime - - -du -bs "$DORIS_HOME"/be/storage/ | cut -f1 | tee storage_size -echo "Data size: $(cat storage_size)" - -mysql -h 127.0.0.1 -P9030 -uroot hits -e "set global enable_sql_cache = false" -# Dataset contains 99997497 rows, storage size is about 13319588503 bytes -mysql -h 127.0.0.1 -P9030 -uroot hits -e "SELECT count(*) FROM hits" - -# Run queries -TRIES=3 -while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - for i in $(seq 1 $TRIES); do - mysql -vvv -h127.1 -P9030 -uroot hits -e "${query}" 2>&1 | tee -a log.txt 
- done -done /dev/null diff --git a/doris/data-size b/doris/data-size new file mode 100755 index 0000000000..e99a148afc --- /dev/null +++ b/doris/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +DORIS_HOME=$(cat .doris_home) +du -bs "$DORIS_HOME"/be/storage/ | cut -f1 diff --git a/doris/install b/doris/install new file mode 100755 index 0000000000..05c69d8f3d --- /dev/null +++ b/doris/install @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + +# This benchmark runs on Ubuntu 20.04+ +ROOT=$(pwd) +URL='https://apache-doris-releases.oss-accelerate.aliyuncs.com/apache-doris-4.1.0-rc01-bin-x64.tar.gz' + +file_name="$(basename "$URL")" +dir_name="${file_name/.tar.gz/}" +DORIS_HOME="$ROOT/$dir_name/$dir_name" + +# Idempotent: skip if already extracted. +if [ ! -d "$DORIS_HOME" ]; then + if [ ! -f "$file_name" ]; then + wget --continue --progress=dot:giga "$URL" + fi + mkdir -p "$dir_name" + tar zxf "$file_name" -C "$dir_name" + + # Disable internal caches so cold runs are actually cold. + printf "\ndisable_storage_page_cache = true\n" >> "$DORIS_HOME"/be/conf/be.conf + printf "\nsegment_cache_capacity = 0\n" >> "$DORIS_HOME"/be/conf/be.conf +fi + +# Install dependencies (idempotent — apt-get is fine to re-run). +sudo apt-get update -y +sudo apt-get install -y openjdk-17-jdk mysql-client bc + +sudo systemctl disable unattended-upgrades 2>/dev/null || true +sudo systemctl stop unattended-upgrades 2>/dev/null || true + +sudo sysctl -w vm.max_map_count=2000000 + +echo "$DORIS_HOME" > .doris_home diff --git a/doris/load b/doris/load new file mode 100755 index 0000000000..e836b0a034 --- /dev/null +++ b/doris/load @@ -0,0 +1,142 @@ +#!/bin/bash +set -e + +ROOT=$(pwd) +DORIS_HOME=$(cat .doris_home) +export DORIS_HOME +export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" +export PATH=$JAVA_HOME/bin:$PATH + +# Idempotent: drop+create database. 
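+# (The sleep 5 below is carried over from the original benchmark.sh; presumably
+# it gives the freshly created database a moment to settle before create.sql runs.)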
+mysql -h127.0.0.1 -P9030 -uroot -e "DROP DATABASE IF EXISTS hits" +mysql -h127.0.0.1 -P9030 -uroot -e "CREATE DATABASE hits" +sleep 5 +mysql -h127.0.0.1 -P9030 -uroot hits < "$ROOT/create.sql" + +BE_DATA_DIR="$DORIS_HOME/be/" +"$ROOT/../lib/download-hits-parquet-partitioned" "$BE_DATA_DIR/user_files_secure" + +BE_ID=$(mysql -h127.0.0.1 -P9030 -uroot -N -e 'show backends' | awk '{print $1}' | head -1) +CORES=$(nproc) +PARALLEL_NUM=$((CORES / 4)) +[ "$PARALLEL_NUM" -lt 1 ] && PARALLEL_NUM=1 + +mysql -h127.0.0.1 -P9030 -uroot hits -e "SET parallel_pipeline_task_num = $PARALLEL_NUM;\ +INSERT INTO hits SELECT + CounterID, + DATE_ADD('1970-01-01', INTERVAL EventDate DAY) AS EventDate, + UserID, + FROM_UNIXTIME(EventTime) AS EventTime, + WatchID, + JavaEnable, + Title, + GoodEvent, + ClientIP, + RegionID, + CounterClass, + OS, + UserAgent, + URL, + Referer, + IsRefresh, + RefererCategoryID, + RefererRegionID, + URLCategoryID, + URLRegionID, + ResolutionWidth, + ResolutionHeight, + ResolutionDepth, + FlashMajor, + FlashMinor, + FlashMinor2, + NetMajor, + NetMinor, + UserAgentMajor, + UserAgentMinor, + CookieEnable, + JavascriptEnable, + IsMobile, + MobilePhone, + MobilePhoneModel, + Params, + IPNetworkID, + TraficSourceID, + SearchEngineID, + SearchPhrase, + AdvEngineID, + IsArtifical, + WindowClientWidth, + WindowClientHeight, + ClientTimeZone, + FROM_UNIXTIME(ClientEventTime) AS ClientEventTime, + SilverlightVersion1, + SilverlightVersion2, + SilverlightVersion3, + SilverlightVersion4, + PageCharset, + CodeVersion, + IsLink, + IsDownload, + IsNotBounce, + FUniqID, + OriginalURL, + HID, + IsOldCounter, + IsEvent, + IsParameter, + DontCountHits, + WithHash, + HitColor, + FROM_UNIXTIME(LocalEventTime) AS LocalEventTime, + Age, + Sex, + Income, + Interests, + Robotness, + RemoteIP, + WindowName, + OpenerName, + HistoryLength, + BrowserLanguage, + BrowserCountry, + SocialNetwork, + SocialAction, + HTTPError, + SendTiming, + DNSTiming, + ConnectTiming, + ResponseStartTiming, + ResponseEndTiming, + FetchTiming, + SocialSourceNetworkID, + SocialSourcePage, + ParamPrice, + ParamOrderID, + ParamCurrency, + ParamCurrencyID, + OpenstatServiceName, + OpenstatCampaignID, + OpenstatAdID, + OpenstatSourceID, + UTMSource, + UTMMedium, + UTMCampaign, + UTMContent, + UTMTerm, + FromTag, + HasGCLID, + RefererHash, + URLHash, + CLID +FROM local( + \"file_path\" = \"user_files_secure/hits_*.parquet\", + \"backend_id\" = \"$BE_ID\", + \"format\" = \"parquet\" +) +" + +mysql -h127.0.0.1 -P9030 -uroot hits -e "set global enable_sql_cache = false" + +# Clean up downloaded parquet inputs. +rm -f "$BE_DATA_DIR"/user_files_secure/hits_*.parquet +sync diff --git a/doris/query b/doris/query new file mode 100755 index 0000000000..ed73900d97 --- /dev/null +++ b/doris/query @@ -0,0 +1,33 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via mysql client against Doris's `hits` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Use mysql -vvv which prints "X rows in set (Y.YY sec)" or "ERROR ...". +out=$(mysql -vvv -h127.0.0.1 -P9030 -uroot hits -e "$query" 2>&1) || status=$? +status=${status:-0} + +# Strip the timing line from stdout output. +printf '%s\n' "$out" | grep -vP '^\([0-9.]+\s+sec\)$|rows? in set|Empty set' + +if [ "$status" -ne 0 ] || printf '%s\n' "$out" | grep -qE '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +# Parse the last "(X.XX sec)" or "X min Y.ZZ sec" line. 
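+# e.g. "(2 min 3.50 sec)" -> sed emits "2 3.50" -> awk prints 123.5; "(0.42 sec)" -> " 0.42" -> 0.42.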
+secs=$(printf '%s\n' "$out" \ + | grep -oP '\((?:([0-9.]+)\s+min\s+)?([0-9.]+)\s+sec\)' \ + | tail -n1 \ + | sed -r 's/\((([0-9.]+) min )?([0-9.]+) sec\)/\2 \3/' \ + | awk '{ if ($2 != "") print $1*60 + $2; else print $1 }') + +if [ -z "$secs" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi +printf '%s\n' "$secs" >&2 diff --git a/doris/results/20260509/c6a.4xlarge.json b/doris/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..7ee132e186 --- /dev/null +++ b/doris/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Apache Doris", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","MySQL compatible","ClickHouse derivative","lukewarm-cold-run"], + "load_time": 200, + "data_size": 13781521519, + "result": [ + [0.39, 0.01, 0.01], + [1.22, 0.06, 0.06], + [1.54, 0.1, 0.1], + [1.98, 0.18, 0.18], + [1.99, 0.37, 0.41], + [2.43, 0.94, 0.94], + [0.36, 0.04, 0.01], + [1.24, 0.07, 0.07], + [3.47, 0.51, 0.52], + [5.83, 1.72, 1.7], + [3.34, 0.23, 0.23], + [3.54, 0.25, 0.25], + [2.97, 0.69, 0.71], + [5.27, 1.28, 1.26], + [3.63, 0.88, 0.9], + [2.07, 0.56, 0.57], + [4.68, 1.69, 1.7], + [4.55, 0.38, 0.39], + [6.96, 2.77, 2.9], + [0.25, 0.06, 0.03], + [8.54, 1.1, 1.09], + [10.64, 1.08, 1.1], + [21.32, 2.06, 2.08], + [9.46, 1.2, 1.22], + [4.97, 0.42, 0.41], + [2.84, 0.43, 0.41], + [5, 0.42, 0.42], + [9.01, 1.99, 1.96], + [10.1, 9.51, 9.54], + [1.18, 0.14, 0.12], + [6.83, 0.56, 0.56], + [9.38, 0.78, 0.83], + [6.59, 3.72, 3.81], + [9.05, 5.27, 5.3], + [9.13, 5.36, 5.34], + [1.78, 0.62, 0.63], + [2.99, 0.17, 0.13], + [2.88, 0.12, 0.1], + [2.65, 0.11, 0.1], + [3.03, 0.24, 0.25], + [2.52, 0.1, 0.09], + [2.82, 0.1, 0.1], + [2.29, 0.08, 0.09] +] +} + diff --git a/doris/results/20260509/c6a.metal.json b/doris/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..a0ac001c0a --- /dev/null +++ b/doris/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Apache Doris", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","MySQL compatible","ClickHouse derivative","lukewarm-cold-run"], + "load_time": 128, + "data_size": 13770189784, + "result": [ + [0.4, 0, 0.02], + [1.08, 0.05, 0.05], + [1.5, 0.07, 0.06], + [1.8, 0.09, 0.08], + [1.8, 0.1, 0.08], + [2.14, 0.21, 0.19], + [0.78, 0.02, 0.01], + [1, 0.07, 0.05], + [3.38, 0.16, 0.15], + [4.66, 0.73, 0.64], + [2.52, 0.11, 0.1], + [3.28, 0.12, 0.12], + [2.21, 0.19, 0.17], + [4.47, 0.35, 0.28], + [2.7, 0.2, 0.21], + [1.89, 0.15, 0.14], + [4.17, 0.37, 0.35], + [4.15, 0.12, 0.12], + [5.55, 0.56, 0.5], + [0.35, 0.03, 0.03], + [8.89, 0.36, 0.37], + [9.93, 0.37, 0.38], + [19.9, 0.75, 0.71], + [9.38, 0.46, 0.42], + [3.77, 0.11, 0.12], + [2.1, 0.1, 0.1], + [4.11, 0.13, 0.12], + [9.02, 0.53, 0.5], + [7.46, 1.04, 1.03], + [1.27, 0.11, 0.09], + [5.74, 0.18, 0.16], + [7.55, 0.21, 0.2], + [5.18, 0.78, 0.75], + [8.55, 1.15, 1.12], + [8.94, 1.09, 1.12], + [1.21, 0.18, 0.16], + [2.1, 0.07, 0.06], + [1.77, 0.08, 0.06], + [1.97, 0.07, 0.06], + [2.23, 0.13, 0.11], + [1.91, 0.06, 0.05], + [2.08, 0.06, 0.06], + [1.66, 0.07, 0.06] +] +} + diff --git a/doris/start b/doris/start new file mode 100755 index 0000000000..0ad1d90861 --- /dev/null +++ b/doris/start @@ -0,0 +1,39 @@ +#!/bin/bash +set -e + +DORIS_HOME=$(cat .doris_home) +export DORIS_HOME +export 
JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" +export PATH=$JAVA_HOME/bin:$PATH + +# Idempotent: if FE replies, do nothing. +if mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +ulimit -n 65535 + +"$DORIS_HOME"/fe/bin/start_fe.sh --daemon +"$DORIS_HOME"/be/bin/start_be.sh --daemon + +# Wait for FE. +for _ in $(seq 1 300); do + fe_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show frontends' 2>/dev/null | cut -f16 | sed -n '2,$p') + if [ -n "$fe_version" ] && [ "$fe_version" != "NULL" ]; then + break + fi + sleep 2 +done + +# Add backend to cluster (idempotent — ignore "already exists"). +mysql -h127.0.0.1 -P9030 -uroot \ + -e "ALTER SYSTEM ADD BACKEND '127.0.0.1:9050'" 2>/dev/null || true + +# Wait for BE. +for _ in $(seq 1 300); do + be_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show backends' 2>/dev/null | cut -f22 | sed -n '2,$p') + if [ -n "$be_version" ]; then + break + fi + sleep 2 +done diff --git a/doris/stop b/doris/stop new file mode 100755 index 0000000000..d8d0385b7a --- /dev/null +++ b/doris/stop @@ -0,0 +1,6 @@ +#!/bin/bash + +DORIS_HOME=$(cat .doris_home 2>/dev/null) || exit 0 +"$DORIS_HOME"/fe/bin/stop_fe.sh 2>/dev/null || true +"$DORIS_HOME"/be/bin/stop_be.sh 2>/dev/null || true +exit 0 diff --git a/drill/benchmark.sh b/drill/benchmark.sh index 4728ccaf78..fc4bacc8f3 100755 --- a/drill/benchmark.sh +++ b/drill/benchmark.sh @@ -1,14 +1,5 @@ -# Install - -sudo apt-get update -y -sudo apt-get install -y docker.io - -../download-hits-parquet-single - -./run.sh 2>&1 | tee log.txt - -cat log.txt | grep -P '\([\d\.]+ seconds\)|Errors' | sed -r -e 's/Errors:/null/; s/^.+\(([.0-9]+) seconds\)/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo "Data size: $(du -b hits.parquet)" -echo "Load time: 0" +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/drill/check b/drill/check new file mode 100755 index 0000000000..f73b795397 --- /dev/null +++ b/drill/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo docker image inspect apache/drill >/dev/null diff --git a/drill/data-size b/drill/data-size new file mode 100755 index 0000000000..708c0b72e7 --- /dev/null +++ b/drill/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < hits.parquet diff --git a/drill/install b/drill/install new file mode 100755 index 0000000000..cf1bd68686 --- /dev/null +++ b/drill/install @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io +fi + +# Pre-pull the apache/drill image so query timing isn't dominated by image +# pull on the first run. +sudo docker pull apache/drill diff --git a/drill/load b/drill/load new file mode 100755 index 0000000000..8ef141f24c --- /dev/null +++ b/drill/load @@ -0,0 +1,5 @@ +#!/bin/bash +# Drill queries hits.parquet directly via the dfs filesystem plugin (mounted +# into the docker container per query). No persistent DB to load. +set -e +sync diff --git a/drill/query b/drill/query new file mode 100755 index 0000000000..0215d8acca --- /dev/null +++ b/drill/query @@ -0,0 +1,56 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via Apache Drill (in a one-shot +# docker container) against hits.parquet (mounted as /hits.parquet). +# Stdout: query result. 
+# Stderr: query runtime in fractional seconds on the last line. +set -e + +query=$(cat) + +# Drill needs the full dfs path, not a bare table name. +HITS='dfs.`/hits.parquet`' +query=${query//hits/$HITS} + +# Hand the query to drill-embedded via a mounted SQL file. The previous +# approach piped the query to stdin terminated with \r, which sqlline +# doesn't treat as a line terminator on a non-tty pipe — drill saw the +# query as a single buffered line that was never finalized, hit EOF, +# and exited with the prompt still showing the query. Result: every +# query produced no `(... seconds)` line and the lib recorded null. +tmpdir=$(mktemp -d) +trap 'rm -rf "$tmpdir"' EXIT +# Ensure the script ends with `;` followed by a newline. +printf '%s\n' "${query%;};" > "$tmpdir/q.sql" + +out=$(sudo docker run --rm \ + -v "$(pwd)/hits.parquet:/hits.parquet:ro" \ + -v "$tmpdir/q.sql:/q.sql:ro" \ + --entrypoint /bin/sh apache/drill \ + -c '$DRILL_HOME/bin/drill-embedded --run=/q.sql' 2>&1) && status=0 || status=$? + +# Sniff for failure even if docker run returned 0: sqlline emits +# "Aborting command set", "Error", "No current connection", or a Java +# stack trace on a failed query but doesn't always propagate to the +# exit code. +if [ "$status" -ne 0 ] || printf '%s\n' "$out" | \ + grep -qE '^(Error|Aborting command set|No current connection|java\.|Could not initialize)'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +# sqlline echoes "1/1 SELECT ..." (the script prelude) and +# "(N rows in X.YYY seconds)" alongside the result rows; strip both +# for the result body, then pull the last "(... seconds)" for the timing. +printf '%s\n' "$out" \ + | grep -vE '^[0-9]+/[0-9]+ |\([0-9]+ rows? in [0-9.]+ seconds?\)|^Apache Drill|^"' \ + || true + +secs=$(printf '%s\n' "$out" | grep -oE '\([0-9]+ rows? in [0-9.]+ seconds?\)' \ + | grep -oE '[0-9.]+ seconds?' 
| grep -oE '[0-9.]+' | tail -n1) + +if [ -z "$secs" ]; then + echo "no '(N rows in X.YYY seconds)' marker in drill output" >&2 + exit 1 +fi + +echo "$secs" >&2 diff --git a/drill/run.sh b/drill/run.sh deleted file mode 100755 index f7168ac10f..0000000000 --- a/drill/run.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -cat queries.sql | sed -r -e 's@hits@dfs.`/hits.parquet`@g' | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo "${query}" - sudo docker run -i --rm --name drill -v $(pwd)/hits.parquet:/hits.parquet apache/drill <<< "${query}"$'\r'"${query}"$'\r'"${query}"$'\r' - echo -done diff --git a/drill/start b/drill/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/drill/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/drill/stop b/drill/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/drill/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/druid/benchmark.sh b/druid/benchmark.sh index c18e545dd2..ce50709e36 100755 --- a/druid/benchmark.sh +++ b/druid/benchmark.sh @@ -1,44 +1,8 @@ -#!/bin/bash -e - -sudo apt-get update -y -sudo apt-get install -y openjdk-11-jdk -sudo update-alternatives --config java - -# Install - -VERSION=33.0.0 - -wget -O"apache-druid-${VERSION}-bin.tar.gz" "https://dlcdn.apache.org/druid/${VERSION}/apache-druid-${VERSION}-bin.tar.gz" -tar xf apache-druid-${VERSION}-bin.tar.gz -./apache-druid-${VERSION}/bin/verify-java - -# Have to increase indexer memory limit -sed -i 's MaxDirectMemorySize=1g MaxDirectMemorySize=5g g' apache-druid-$VERSION/conf/druid/single-server/medium/middleManager/runtime.properties - -# Disable cache to test query performance -sed -i 's druid.historical.cache.useCache=true druid.historical.cache.useCache=false g' apache-druid-$VERSION/conf/druid/single-server/medium/historical/runtime.properties -sed -i 's druid.historical.cache.populateCache=true druid.historical.cache.populateCache=false g' apache-druid-$VERSION/conf/druid/single-server/medium/historical/runtime.properties -sed -i 's druid.processing.buffer.sizeBytes=500MiB druid.processing.buffer.sizeBytes=1000MiB g' apache-druid-$VERSION/conf/druid/single-server/medium/historical/runtime.properties - -echo "druid.query.groupBy.maxMergingDictionarySize=5000000000" >> apache-druid-$VERSION/conf/druid/single-server/medium/historical/runtime.properties -# Druid launcher does not start Druid as a daemon. Run it in background -./apache-druid-${VERSION}/bin/start-single-server-medium & - -# Load the data - -../download-hits-tsv - -echo -n "Load time: " -command time -f '%e' ./apache-druid-${VERSION}/bin/post-index-task --file ingest.json --url http://localhost:8081 - -# The command above will fail due to timeout but still continue to run in background. -# The loading time should be checked from the logs. - -# Run the queries -./run.sh - -# stop Druid services -kill %1 - -echo -n "Data size: " -du -bcs ./apache-druid-${VERSION}/var | grep total +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +# Druid degrades after some queries; the shared driver restarts between +# queries when restartable=yes (matches the original `pkill -f historical` +# hack now folded into stop). 
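+# (Assumed driver behavior: with BENCH_RESTARTABLE=yes it runs ./stop and ./start
+# between queries, so each query sees a freshly launched historical; see
+# lib/benchmark-common.sh for the authoritative flow.)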
+export BENCH_RESTARTABLE=yes
+exec ../lib/benchmark-common.sh
diff --git a/druid/check b/druid/check
new file mode 100755
index 0000000000..1c5210451b
--- /dev/null
+++ b/druid/check
@@ -0,0 +1,10 @@
+#!/bin/bash
+set -e
+
+# Trivial query against the Druid SQL endpoint.
+RES=$(curl -sf -o /dev/null -w '%{http_code}' \
+    -XPOST -H'Content-Type: application/json' \
+    http://localhost:8888/druid/v2/sql/ \
+    -d '{"query": "SELECT 1"}')
+
+[ "$RES" = "200" ]
diff --git a/druid/data-size b/druid/data-size
new file mode 100755
index 0000000000..3a668c8b8d
--- /dev/null
+++ b/druid/data-size
@@ -0,0 +1,7 @@
+#!/bin/bash
+set -e
+
+VERSION=37.0.0
+DRUID_DIR="apache-druid-${VERSION}"
+
+du -bcs "./${DRUID_DIR}/var" | awk '/total$/ {print $1}'
diff --git a/druid/install b/druid/install
new file mode 100755
index 0000000000..a0782c75e7
--- /dev/null
+++ b/druid/install
@@ -0,0 +1,39 @@
+#!/bin/bash
+set -e
+
+# 33.0.0 was retired from the dlcdn.apache.org mirror; bump to a
+# currently-published Druid release.
+VERSION=37.0.0
+DRUID_DIR="apache-druid-${VERSION}"
+
+if [ ! -d "$DRUID_DIR" ]; then
+    # Druid 33+ requires Java 17 or 21; openjdk-11 trips its
+    # `verify-java` check on startup with "Druid requires Java 17 or 21".
+    sudo apt-get update -y
+    sudo apt-get install -y openjdk-17-jdk curl
+
+    if [ ! -f "${DRUID_DIR}-bin.tar.gz" ]; then
+        wget --continue --progress=dot:giga -O"${DRUID_DIR}-bin.tar.gz" \
+            "https://dlcdn.apache.org/druid/${VERSION}/${DRUID_DIR}-bin.tar.gz"
+    fi
+    tar xf "${DRUID_DIR}-bin.tar.gz"
+    "./${DRUID_DIR}/bin/verify-java"
+
+    # Have to increase indexer memory limit
+    sed -i 's MaxDirectMemorySize=1g MaxDirectMemorySize=5g g' \
+        "${DRUID_DIR}/conf/druid/single-server/medium/middleManager/runtime.properties"
+
+    # Disable cache to test query performance
+    sed -i 's druid.historical.cache.useCache=true druid.historical.cache.useCache=false g' \
+        "${DRUID_DIR}/conf/druid/single-server/medium/historical/runtime.properties"
+    sed -i 's druid.historical.cache.populateCache=true druid.historical.cache.populateCache=false g' \
+        "${DRUID_DIR}/conf/druid/single-server/medium/historical/runtime.properties"
+    sed -i 's druid.processing.buffer.sizeBytes=500MiB druid.processing.buffer.sizeBytes=1000MiB g' \
+        "${DRUID_DIR}/conf/druid/single-server/medium/historical/runtime.properties"
+
+    if ! grep -q '^druid.query.groupBy.maxMergingDictionarySize' \
+        "${DRUID_DIR}/conf/druid/single-server/medium/historical/runtime.properties"; then
+        echo "druid.query.groupBy.maxMergingDictionarySize=5000000000" \
+            >> "${DRUID_DIR}/conf/druid/single-server/medium/historical/runtime.properties"
+    fi
+fi
diff --git a/druid/load b/druid/load
new file mode 100755
index 0000000000..c3984df395
--- /dev/null
+++ b/druid/load
@@ -0,0 +1,27 @@
+#!/bin/bash
+set -e
+
+VERSION=37.0.0
+DRUID_DIR="apache-druid-${VERSION}"
+
+# post-index-task posts the spec; ingestion runs asynchronously and the
+# command may exit non-zero on its own polling timeout while the task keeps
+# running. We treat that as success-with-warning; the poll below confirms
+# that the datasource actually became queryable. Idempotent: re-running is
+# safe (Druid replaces the datasource).
+"./${DRUID_DIR}/bin/post-index-task" --file ingest.json --url http://localhost:8081 || true
+
+# Wait until the hits datasource is queryable.
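+# (A successful COUNT(*) answer is a one-row JSON array, something like
+# [{"EXPR$0":99997497}]; the grep below only needs the first integer in it.)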
+for _ in $(seq 1 600); do
+    cnt=$(curl -sf -XPOST -H'Content-Type: application/json' \
+        http://localhost:8888/druid/v2/sql/ \
+        -d '{"query": "SELECT COUNT(*) FROM hits"}' 2>/dev/null \
+        | grep -oE '[0-9]+' | head -n1) || cnt=""
+    if [ -n "$cnt" ] && [ "$cnt" -gt 0 ]; then
+        break
+    fi
+    sleep 5
+done
+
+rm -f hits.tsv
+sync
diff --git a/druid/query b/druid/query
new file mode 100755
index 0000000000..c1001f6083
--- /dev/null
+++ b/druid/query
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Reads a SQL query from stdin, runs it via Druid's SQL HTTP endpoint.
+# Stdout: query result JSON.
+# Stderr: query runtime in fractional seconds on the last line.
+# Exit non-zero on error.
+set -e
+
+query=$(cat)
+# Druid uses __time and dislikes trailing semicolons.
+query=$(printf '%s' "$query" | sed -e 's EventTime __time g' | tr -d ';')
+
+# Build JSON request safely.
+req=$(printf '%s' "$query" | python3 -c '
+import json, sys
+q = sys.stdin.read()
+print(json.dumps({"query": q, "context": {"timeout": 1000000}}))
+')
+
+tmp_body=$(mktemp)
+trap 'rm -f "$tmp_body"' EXIT
+
+# -w prints http_code and time_total on the last line. -o sends the body to a file.
+status_line=$(curl -s -o "$tmp_body" \
+    -w '%{http_code} %{time_total}\n' \
+    -XPOST -H'Content-Type: application/json' \
+    http://localhost:8888/druid/v2/sql/ \
+    --data-binary "$req")
+
+http_code=$(echo "$status_line" | awk '{print $1}')
+time_total=$(echo "$status_line" | awk '{print $2}')
+
+cat "$tmp_body"
+
+if [ "$http_code" != "200" ]; then
+    echo "druid query failed: HTTP $http_code" >&2
+    exit 1
+fi
+
+# Print runtime in fractional seconds as the last stderr line.
+printf '%s\n' "$time_total" >&2
diff --git a/druid/run.sh b/druid/run.sh
deleted file mode 100755
index 2ffb72838d..0000000000
--- a/druid/run.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-TRIES=3
-cat queries.sql | while read -r query; do
-    sync
-    for i in $(seq 1 100); do
-        CHECK=$(curl -o /dev/null -w '%{http_code}' -s -XPOST -H'Content-Type: application/json' http://localhost:8888/druid/v2/sql/ -d @check.json })
-        [[ "$CHECK" == "200" ]] && break
-        sleep 1
-    done
-    echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
-    echo -n "["
-    for i in $(seq 1 $TRIES); do
-        echo "{\"query\":\"$query\", \"context\": {\"timeout\": 1000000} }"| sed -e 's EventTime __time g' | tr -d ';' > query.json
-        curl -w '%{http_code} %{time_total}\n' -s -XPOST -H'Content-Type: application/json' http://localhost:8888/druid/v2/sql/ -d @query.json | awk '{ if ($1!="200") { printf "null" } }'
-        [[ "$i" != $TRIES ]] && echo -n ", "
-    done
-    echo "],"
-
-    # Ugly hack to measure independently queries. Otherwise some queries make Druid degraded and results are incorrect. For example after Q13 even SELECT 1 works for 7 seconds
-    pkill -f historical
-    sleep 3
-done
diff --git a/druid/start b/druid/start
new file mode 100755
index 0000000000..bde3b1ead5
--- /dev/null
+++ b/druid/start
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -e
+
+VERSION=37.0.0
+DRUID_DIR="apache-druid-${VERSION}"
+
+# Idempotent: if router is responsive, do nothing.
+if curl -sf -o /dev/null http://localhost:8888/status 2>/dev/null; then
+    exit 0
+fi
+
+# The Druid launcher does not daemonize. Run it in the background, detached
+# with nohup/disown, so this script can exit and leave Druid running.
+nohup "./${DRUID_DIR}/bin/start-single-server-medium" \ + >> druid.log 2>&1 < /dev/null & +disown diff --git a/druid/stop b/druid/stop new file mode 100755 index 0000000000..e6ce67b959 --- /dev/null +++ b/druid/stop @@ -0,0 +1,8 @@ +#!/bin/bash + +# Kill all Druid processes (the launcher and child JVMs). +pkill -f 'start-single-server-medium' 2>/dev/null || true +pkill -f 'druid' 2>/dev/null || true +# Small grace period so subsequent ./start binds cleanly. +sleep 2 +exit 0 diff --git a/duckdb-dataframe/benchmark.sh b/duckdb-dataframe/benchmark.sh index 087381c449..fc4bacc8f3 100755 --- a/duckdb-dataframe/benchmark.sh +++ b/duckdb-dataframe/benchmark.sh @@ -1,19 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install pandas duckdb pyarrow - -# Download the data -../download-hits-parquet-single - -# Run the queries - -/usr/bin/time -f "Memory usage: %M KB" ./query.py 2>&1 | tee log.txt - -echo -n "Data size: " -grep -F "Memory usage" log.txt | grep -o -P '\d+ KB' | sed 's/KB/*1024/' | bc -l +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-dataframe/check b/duckdb-dataframe/check new file mode 100755 index 0000000000..0c4b301a2d --- /dev/null +++ b/duckdb-dataframe/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/duckdb-dataframe/data-size b/duckdb-dataframe/data-size new file mode 100755 index 0000000000..365ad4ecc8 --- /dev/null +++ b/duckdb-dataframe/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/duckdb-dataframe/install b/duckdb-dataframe/install new file mode 100755 index 0000000000..aa19117417 --- /dev/null +++ b/duckdb-dataframe/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet pandas duckdb pyarrow fastapi uvicorn diff --git a/duckdb-dataframe/load b/duckdb-dataframe/load new file mode 100755 index 0000000000..ceba6becac --- /dev/null +++ b/duckdb-dataframe/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Server reads hits.parquet from CWD into memory. +elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +rm -f hits.parquet +sync diff --git a/duckdb-dataframe/query b/duckdb-dataframe/query new file mode 100755 index 0000000000..a4e1524300 --- /dev/null +++ b/duckdb-dataframe/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running duckdb server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. 
+set -e + +query=$(cat) + +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/duckdb-dataframe/query.py b/duckdb-dataframe/query.py deleted file mode 100755 index 0139a83e5e..0000000000 --- a/duckdb-dataframe/query.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 - -import pandas as pd -import timeit -import datetime -import subprocess -import duckdb - -start = timeit.default_timer() -hits = pd.read_parquet("hits.parquet") -end = timeit.default_timer() -load_time = round(end - start, 3) -print(f"Load time: {load_time}") - -dataframe_size = hits.memory_usage().sum() - -# print("Dataframe(numpy) size:", dataframe_size, "bytes") - -# fix some types -hits["EventTime"] = pd.to_datetime(hits["EventTime"], unit="s") -hits["EventDate"] = pd.to_datetime(hits["EventDate"], unit="D") - -# fix all object columns to string -for col in hits.columns: - if hits[col].dtype == "O": - hits[col] = hits[col].astype(str) - -queries = [] -with open("queries.sql") as f: - queries = f.readlines() - -conn = duckdb.connect() -for q in queries: - # Flush OS page cache before first run of each query - subprocess.run(['sync'], check=True) - subprocess.run(['sudo', 'tee', '/proc/sys/vm/drop_caches'], input=b'3', check=True, stdout=subprocess.DEVNULL) - - times = [] - for _ in range(3): - start = timeit.default_timer() - result = conn.execute(q).fetchall() - end = timeit.default_timer() - times.append(round(end - start, 3)) - print(times) diff --git a/duckdb-dataframe/server.py b/duckdb-dataframe/server.py new file mode 100644 index 0000000000..4fe187577c --- /dev/null +++ b/duckdb-dataframe/server.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +"""FastAPI wrapper around DuckDB (running over a pandas DataFrame) so it +conforms to the ClickBench install/start/check/stop/load/query interface. + +Routes: + GET /health -> 200 OK once the server is up + POST /load -> reads hits.parquet from the working directory, fixes + column types, holds the DataFrame in memory, and + returns {"elapsed": } + POST /query -> body: SQL text. Looks it up in QUERIES, runs it via + DuckDB against the loaded DataFrame, returns + {"elapsed": }. + GET /data-size -> bytes the DataFrame currently occupies (memory_usage) + +The 43 SQL strings come straight from the prior duck-dataframe queries.sql. +""" + +import os +import timeit + +import duckdb +import pandas as pd +import uvicorn +from fastapi import FastAPI, HTTPException, Request + +app = FastAPI() +hits: pd.DataFrame | None = None # noqa: F841 — referenced by DuckDB by name +conn = None + + +def _make_runner(sql: str): + return lambda _df: conn.execute(sql).fetchall() + + +# 43 ClickBench queries — DuckDB resolves `hits` from the session by name. SQL +# strings come straight from the prior duckdb-dataframe/queries.sql. 
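+# NOTE: /query looks incoming SQL up in QUERY_INDEX by exact string (after .strip()),
+# so the driver must send each query byte-for-byte identical, trailing semicolon included.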
+_SQL_LIST: list[str] = [ + "SELECT COUNT(*) FROM hits;", + "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", + "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", + "SELECT AVG(UserID) FROM hits;", + "SELECT COUNT(DISTINCT UserID) FROM hits;", + "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", + "SELECT MIN(EventDate), MAX(EventDate) FROM hits;", + "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", + "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;", + "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;", + "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", + "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", + "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", + "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", + "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", + "SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), 
SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;", + "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;", + "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", + "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", + "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND 
IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", + "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", + "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", +] + +QUERIES: list[tuple[str, callable]] = [(sql, _make_runner(sql)) for sql in _SQL_LIST] +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +@app.get("/health") +def health(): + return {"ok": True} + + +@app.post("/load") +def load(): + global hits, conn + start = timeit.default_timer() + df = pd.read_parquet("hits.parquet") + df["EventTime"] = pd.to_datetime(df["EventTime"], unit="s") + df["EventDate"] = pd.to_datetime(df["EventDate"], unit="D") + for col in df.columns: + if df[col].dtype == "O": + df[col] = df[col].astype(str) + hits = df + # DuckDB picks up pandas DataFrames from globals by name; bind explicitly + # too so the connection sees `hits`. + conn = duckdb.connect() + conn.register("hits", hits) + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + + +@app.post("/query") +async def query(request: Request): + if hits is None: + raise HTTPException(status_code=409, detail="DataFrame not loaded; POST /load first") + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + sql = QUERIES[idx][0] + start = timeit.default_timer() + conn.execute(sql).fetchall() + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed, "index": idx} + + +@app.get("/data-size") +def data_size(): + if hits is None: + return {"bytes": 0} + return {"bytes": int(hits.memory_usage().sum())} + + +if __name__ == "__main__": + port = int(os.environ.get("BENCH_DUCKDB_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/duckdb-dataframe/start b/duckdb-dataframe/start new file mode 100755 index 0000000000..e3fab72731 --- /dev/null +++ b/duckdb-dataframe/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! 
> server.pid diff --git a/duckdb-dataframe/stop b/duckdb-dataframe/stop new file mode 100755 index 0000000000..787b35abcc --- /dev/null +++ b/duckdb-dataframe/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/duckdb-datalake-partitioned/benchmark.sh b/duckdb-datalake-partitioned/benchmark.sh index a24d2e2dd6..33e6ce27ba 100755 --- a/duckdb-datalake-partitioned/benchmark.sh +++ b/duckdb-datalake-partitioned/benchmark.sh @@ -1,22 +1,6 @@ #!/bin/bash - -# Install -export HOME=${HOME:=~} -curl https://install.duckdb.org | sh -export PATH=$HOME'/.duckdb/cli/latest':$PATH - -echo -n "Load time: " -command time -f '%e' duckdb hits.db -f create.sql - -echo "Data size: 14737666736" - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -wc -c hits.db - -cat log.txt | - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | - sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Data is read directly from S3, no local download. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-datalake-partitioned/check b/duckdb-datalake-partitioned/check new file mode 100755 index 0000000000..3c457f3f1e --- /dev/null +++ b/duckdb-datalake-partitioned/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +duckdb -c "SELECT 1" >/dev/null diff --git a/duckdb-datalake-partitioned/data-size b/duckdb-datalake-partitioned/data-size new file mode 100755 index 0000000000..7fcf527501 --- /dev/null +++ b/duckdb-datalake-partitioned/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Hits dataset stored remotely in S3 — fixed size (100 partitioned parquet files). +echo 14737666736 diff --git a/duckdb-datalake-partitioned/install b/duckdb-datalake-partitioned/install new file mode 100755 index 0000000000..d58738fec1 --- /dev/null +++ b/duckdb-datalake-partitioned/install @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +if ! command -v duckdb >/dev/null 2>&1; then + export HOME=${HOME:=~} + curl https://install.duckdb.org | sh + sudo ln -sf "$HOME/.duckdb/cli/latest/duckdb" /usr/local/bin/duckdb +fi diff --git a/duckdb-datalake-partitioned/load b/duckdb-datalake-partitioned/load new file mode 100755 index 0000000000..03aecdd6fc --- /dev/null +++ b/duckdb-datalake-partitioned/load @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# create.sql installs httpfs and defines a VIEW directly over S3 partitioned +# parquet — no local data is loaded. Persist the view in hits.db. +rm -f hits.db +duckdb hits.db -f create.sql +sync diff --git a/duckdb-datalake-partitioned/query b/duckdb-datalake-partitioned/query new file mode 100755 index 0000000000..0af71bda63 --- /dev/null +++ b/duckdb-datalake-partitioned/query @@ -0,0 +1,20 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via duckdb against hits.db (a VIEW +# over remote S3 partitioned parquet files). +# Stdout: query result. Stderr: runtime in fractional seconds on the last +# line. Exit non-zero on error. 
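+# Note: the "SET parquet_metadata_cache=true" below only lives for this single
+# duckdb process, so each ./query invocation starts with a cold metadata cache
+# and re-reads parquet footers from S3; nothing is cached across runs.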
+set -e + +query=$(cat) + +out=$(duckdb hits.db -c "SET parquet_metadata_cache=true" -c ".timer on" -c "$query" 2>/tmp/duckdb.err.$$) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + cat /tmp/duckdb.err.$$ >&2 + rm -f /tmp/duckdb.err.$$ + exit "$status" +fi +rm -f /tmp/duckdb.err.$$ + +printf '%s\n' "$out" | grep -v '^Run Time ' +printf '%s\n' "$out" | awk '/^Run Time/ { print $5 }' | tail -n1 >&2 diff --git a/duckdb-datalake-partitioned/results/20260509/c6a.metal.json b/duckdb-datalake-partitioned/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..7b8b6cbe6f --- /dev/null +++ b/duckdb-datalake-partitioned/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "DuckDB (data lake, partitioned)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","embedded","stateless"], + "load_time": 2, + "data_size": 14737666736, + "result": [ + [0.612, 0.617, 0.532], + [0.682, 0.649, 5.434], + [5.529, 5.475, 5.48], + [0.72, 5.409, 5.378], + [0.908, 0.832, 5.523], + [5.713, 0.984, 5.685], + [0.63, 0.715, 0.629], + [0.68, 0.65, 5.417], + [5.588, 0.869, 0.829], + [1.096, 5.587, 5.569], + [5.562, 5.507, 0.798], + [5.602, 0.887, 5.47], + [1.075, 0.94, 0.981], + [1.233, 1.126, 5.831], + [5.739, 0.991, 1.002], + [5.64, 0.87, 0.87], + [5.863, 1.149, 1.05], + [1.246, 1.117, 1.086], + [5.943, 5.94, 1.383], + [0.689, 0.61, 0.687], + [6.037, 1.427, 1.524], + [1.533, 1.646, 1.532], + [2.157, 2.015, 2.011], + [2.709, 2.247, 2.743], + [0.85, 5.626, 0.813], + [0.805, 0.804, 0.847], + [0.776, 0.695, 0.565], + [1.538, 5.905, 1.494], + [2.272, 6.631, 2.175], + [5.454, 5.383, 0.654], + [5.653, 1.02, 5.623], + [1.278, 1.064, 1.183], + [1.882, 6.305, 1.876], + [2.118, 6.066, 2.284], + [6.531, 2.191, 1.989], + [0.933, 5.57, 5.847], + [0.698, 0.651, 0.628], + [0.635, 0.578, 0.585], + [0.666, 0.606, 0.637], + [0.99, 0.923, 0.863], + [0.624, 0.531, 0.53], + [0.549, 0.508, 0.49], + [0.542, 0.438, 0.446] +] +} + diff --git a/duckdb-datalake-partitioned/run.sh b/duckdb-datalake-partitioned/run.sh deleted file mode 100755 index 61d016b4da..0000000000 --- a/duckdb-datalake-partitioned/run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=("SET parquet_metadata_cache=true") - cli_params+=("-c") - cli_params+=(".timer on") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("${query}") - done; - echo "${cli_params[@]}" - duckdb hits.db "${cli_params[@]}" -done; diff --git a/duckdb-datalake-partitioned/start b/duckdb-datalake-partitioned/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/duckdb-datalake-partitioned/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-datalake-partitioned/stop b/duckdb-datalake-partitioned/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/duckdb-datalake-partitioned/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-datalake/benchmark.sh b/duckdb-datalake/benchmark.sh index c0c96825c5..33e6ce27ba 100755 --- a/duckdb-datalake/benchmark.sh +++ b/duckdb-datalake/benchmark.sh @@ -1,22 +1,6 @@ #!/bin/bash - -# Install -export HOME=${HOME:=~} -curl https://install.duckdb.org | sh -export PATH=$HOME'/.duckdb/cli/latest':$PATH - -echo -n "Load time: " -command time -f '%e' duckdb hits.db -f 
create.sql - -echo "Data size: 14779976446" - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -wc -c hits.db - -cat log.txt | - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | - sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Data is read directly from S3, no local download. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-datalake/check b/duckdb-datalake/check new file mode 100755 index 0000000000..3c457f3f1e --- /dev/null +++ b/duckdb-datalake/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +duckdb -c "SELECT 1" >/dev/null diff --git a/duckdb-datalake/data-size b/duckdb-datalake/data-size new file mode 100755 index 0000000000..351ceea7bf --- /dev/null +++ b/duckdb-datalake/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Hits dataset stored remotely in S3 — fixed size (single parquet). +echo 14779976446 diff --git a/duckdb-datalake/install b/duckdb-datalake/install new file mode 100755 index 0000000000..d58738fec1 --- /dev/null +++ b/duckdb-datalake/install @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +if ! command -v duckdb >/dev/null 2>&1; then + export HOME=${HOME:=~} + curl https://install.duckdb.org | sh + sudo ln -sf "$HOME/.duckdb/cli/latest/duckdb" /usr/local/bin/duckdb +fi diff --git a/duckdb-datalake/load b/duckdb-datalake/load new file mode 100755 index 0000000000..376474284c --- /dev/null +++ b/duckdb-datalake/load @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# create.sql installs httpfs and defines a VIEW directly over S3 — no local +# data is loaded. Persist the view in hits.db. +rm -f hits.db +duckdb hits.db -f create.sql +sync diff --git a/duckdb-datalake/query b/duckdb-datalake/query new file mode 100755 index 0000000000..492e963569 --- /dev/null +++ b/duckdb-datalake/query @@ -0,0 +1,20 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via duckdb against hits.db (a VIEW +# over a remote S3 parquet). +# Stdout: query result. Stderr: runtime in fractional seconds on the last +# line. Exit non-zero on error. +set -e + +query=$(cat) + +out=$(duckdb hits.db -c "SET parquet_metadata_cache=true" -c ".timer on" -c "$query" 2>/tmp/duckdb.err.$$) && status=0 || status=$? 
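+# (The "&& status=0 || status=$?" dance keeps set -e from aborting on a failed
+# query so its stderr can be reported below.)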
+ +if [ "$status" -ne 0 ]; then + cat /tmp/duckdb.err.$$ >&2 + rm -f /tmp/duckdb.err.$$ + exit "$status" +fi +rm -f /tmp/duckdb.err.$$ + +printf '%s\n' "$out" | grep -v '^Run Time ' +printf '%s\n' "$out" | awk '/^Run Time/ { print $5 }' | tail -n1 >&2 diff --git a/duckdb-datalake/results/20260509/c6a.metal.json b/duckdb-datalake/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..4fca0731b4 --- /dev/null +++ b/duckdb-datalake/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "DuckDB (data lake, single)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","embedded","stateless"], + "load_time": 2, + "data_size": 14779976446, + "result": [ + [0.229, 0.145, 0.149], + [0.598, 5.257, 5.305], + [0.641, 0.575, 0.544], + [5.335, 0.566, 5.539], + [0.735, 0.679, 0.679], + [0.91, 0.871, 0.843], + [0.514, 0.489, 0.49], + [0.531, 0.502, 0.479], + [0.819, 0.768, 0.718], + [0.858, 0.808, 0.821], + [0.72, 5.322, 0.694], + [0.739, 0.667, 0.67], + [0.908, 0.898, 5.513], + [1.055, 0.98, 1.013], + [0.892, 5.425, 0.874], + [0.764, 5.398, 5.401], + [1.063, 5.493, 0.992], + [5.611, 0.941, 5.811], + [5.893, 1.29, 1.207], + [0.579, 0.526, 0.695], + [1.394, 1.384, 1.42], + [1.506, 1.386, 1.395], + [2.317, 2.016, 1.928], + [2.685, 6.793, 7.156], + [0.982, 0.801, 0.781], + [0.686, 5.308, 5.346], + [0.711, 0.752, 0.833], + [1.546, 1.35, 6.151], + [2.217, 2.158, 2.237], + [5.394, 5.309, 0.532], + [5.581, 0.85, 0.85], + [1.122, 1.133, 5.782], + [6.342, 1.528, 1.544], + [1.883, 1.784, 1.799], + [1.857, 1.727, 6.057], + [0.756, 0.667, 0.665], + [0.647, 0.586, 0.585], + [0.501, 0.426, 0.418], + [0.631, 0.559, 0.578], + [1.03, 0.937, 0.948], + [0.497, 0.35, 0.379], + [0.434, 0.354, 0.332], + [0.388, 0.318, 0.342] +] +} + diff --git a/duckdb-datalake/run.sh b/duckdb-datalake/run.sh deleted file mode 100755 index 61d016b4da..0000000000 --- a/duckdb-datalake/run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=("SET parquet_metadata_cache=true") - cli_params+=("-c") - cli_params+=(".timer on") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("${query}") - done; - echo "${cli_params[@]}" - duckdb hits.db "${cli_params[@]}" -done; diff --git a/duckdb-datalake/start b/duckdb-datalake/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/duckdb-datalake/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-datalake/stop b/duckdb-datalake/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/duckdb-datalake/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-memory/benchmark.sh b/duckdb-memory/benchmark.sh index 7c201d6bc0..fc4bacc8f3 100755 --- a/duckdb-memory/benchmark.sh +++ b/duckdb-memory/benchmark.sh @@ -1,25 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install duckdb psutil - -# Load the data -../download-hits-parquet-single - -# Run the queries - -/usr/bin/time -v ./query.py 2>&1 | tee log.txt - -echo -n "Load time: " -cat log.txt | grep -P '^\d|Killed|Segmentation' | head -n1 - -cat log.txt | grep -P '^\d|Killed|Segmentation' | tail -n+2 | sed -r -e 
's/^.*(Killed|Segmentation).*$/null\nnull\nnull/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo -n "Data size: " -grep -F 'Maximum resident set size' log.txt | grep -o -P '\d+$' | awk '{ print $1 * 1024 }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-memory/check b/duckdb-memory/check new file mode 100755 index 0000000000..0c4b301a2d --- /dev/null +++ b/duckdb-memory/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/duckdb-memory/data-size b/duckdb-memory/data-size new file mode 100755 index 0000000000..365ad4ecc8 --- /dev/null +++ b/duckdb-memory/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/duckdb-memory/install b/duckdb-memory/install new file mode 100755 index 0000000000..ac813e6168 --- /dev/null +++ b/duckdb-memory/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet duckdb psutil fastapi uvicorn diff --git a/duckdb-memory/load b/duckdb-memory/load new file mode 100755 index 0000000000..ceba6becac --- /dev/null +++ b/duckdb-memory/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Server reads hits.parquet from CWD into memory. +elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +rm -f hits.parquet +sync diff --git a/duckdb-memory/memory.py b/duckdb-memory/memory.py deleted file mode 100755 index ffe43331d4..0000000000 --- a/duckdb-memory/memory.py +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env python3 - -import duckdb - -# Load the data to determine the memory use. -# This load is not timed. -con = duckdb.connect(read_only=False) - -# enable the progress bar -con.execute('PRAGMA enable_progress_bar;') -con.execute('PRAGMA enable_print_progress_bar;') -# disable preservation of insertion order -con.execute("SET preserve_insertion_order = false;") - -con.execute(open("create.sql").read()) -con.execute(open("load.sql").read()) diff --git a/duckdb-memory/query b/duckdb-memory/query new file mode 100755 index 0000000000..a4e1524300 --- /dev/null +++ b/duckdb-memory/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running duckdb server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. 
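+#
+# Example invocation (illustrative; the real driver is lib/benchmark-common.sh,
+# which is not part of this diff):
+#   $ echo 'SELECT COUNT(*) FROM hits' | ./query
+#   {"elapsed": 0.123}      <- stdout (server response)
+#   0.123                   <- stderr (timing consumed by the harness)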
+set -e + +query=$(cat) + +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/duckdb-memory/query.py b/duckdb-memory/query.py deleted file mode 100755 index b22f1d650d..0000000000 --- a/duckdb-memory/query.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python3 - -import duckdb -import timeit -import sys -import os - -con = duckdb.connect(':memory:') - -# enable the progress bar -con.execute('PRAGMA enable_progress_bar;') -con.execute('PRAGMA enable_print_progress_bar;') -# disable preservation of insertion order -con.execute("SET preserve_insertion_order = false;") - -# perform the actual load -print("Will load the data") -start = timeit.default_timer() -con.execute(open("create.sql").read()) -con.execute(open("load.sql").read()) -end = timeit.default_timer() -print(round(end - start, 3)) - -with open('queries.sql', 'r') as file: - for query in file: - print(query) - - for try_num in range(3): - start = timeit.default_timer() - results = con.sql(query).fetchall() - end = timeit.default_timer() - print(round(end - start, 3)) - del results diff --git a/duckdb-memory/server.py b/duckdb-memory/server.py new file mode 100644 index 0000000000..3ec69eceef --- /dev/null +++ b/duckdb-memory/server.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +"""FastAPI wrapper around DuckDB's compressed in-memory storage so it +conforms to the ClickBench install/start/check/stop/load/query interface. + +Each /query runs against a long-lived DuckDB connection that holds the +hits table in compressed memory, so we don't pay re-load cost per query +(which is what made the previous "embedded Python per query" version +unusable — every query was measuring full parquet ingestion time). + +Routes: + GET /health -> 200 OK once the server is up. + POST /load -> reads hits.parquet from CWD into the + compressed_mem :memory: schema. Returns + {"elapsed": }. + POST /query -> body: SQL text. Runs against the loaded table. + Returns {"elapsed": }. + GET /data-size -> returns process RSS in bytes (proxy for the + in-memory compressed footprint). +""" + +import os +import resource +import timeit + +import duckdb +import psutil +import uvicorn +from fastapi import FastAPI, HTTPException, Request + +app = FastAPI() +conn: duckdb.DuckDBPyConnection | None = None + + +@app.get("/health") +def health(): + return {"ok": True} + + +@app.post("/load") +def load(): + global conn + start = timeit.default_timer() + conn = duckdb.connect() + # preserve_insertion_order=false lets the loader use a cheaper insert + # path. create.sql does its own `ATTACH ':memory:' AS compressed_mem + # (COMPRESS); USE compressed_mem;` to set up the compressed in-memory + # database that's the whole point of this entry vs. plain duckdb. 
+ conn.execute("SET preserve_insertion_order = false;") + conn.execute(open("create.sql").read()) + conn.execute(open("load.sql").read()) + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + + +@app.post("/query") +async def query(request: Request): + if conn is None: + raise HTTPException(status_code=409, detail="hits not loaded; POST /load first") + sql = (await request.body()).decode("utf-8").strip() + if not sql: + raise HTTPException(status_code=400, detail="empty query") + start = timeit.default_timer() + conn.execute(sql).fetchall() + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + + +@app.get("/data-size") +def data_size(): + # DuckDB's compressed_mem has no on-disk footprint, so report the + # server process RSS (peak so far). This mirrors what the previous + # memory.py + `time -v` did, and matches what clickbench + # convention expects (an integer byte count). + rss = psutil.Process().memory_info().rss + # Also check resource.getrusage — on Linux ru_maxrss is in kB and + # tracks the high-water mark across the process lifetime. + peak_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + peak = peak_kb * 1024 + return {"bytes": max(rss, peak)} + + +if __name__ == "__main__": + port = int(os.environ.get("BENCH_DUCKDB_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/duckdb-memory/start b/duckdb-memory/start new file mode 100755 index 0000000000..e3fab72731 --- /dev/null +++ b/duckdb-memory/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! > server.pid diff --git a/duckdb-memory/stop b/duckdb-memory/stop new file mode 100755 index 0000000000..787b35abcc --- /dev/null +++ b/duckdb-memory/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/duckdb-parquet-partitioned/benchmark.sh b/duckdb-parquet-partitioned/benchmark.sh index d2db40aacd..3b63e772a6 100755 --- a/duckdb-parquet-partitioned/benchmark.sh +++ b/duckdb-parquet-partitioned/benchmark.sh @@ -1,25 +1,5 @@ #!/bin/bash - -# Install -export HOME=${HOME:=~} -curl https://install.duckdb.org | sh -export PATH=$HOME'/.duckdb/cli/latest':$PATH - -# Load the data -../download-hits-parquet-partitioned - -echo -n "Load time: " -command time -f '%e' duckdb hits.db -f create.sql - -echo "Data size: $(du -bcs hits*.parquet | grep total)" - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -wc -c hits.db - -cat log.txt | - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | - sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
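+# (Assumed contract, since lib/benchmark-common.sh sits outside this diff: it
+# runs ./install, ./start and ./check, invokes the optional
+# $BENCH_DOWNLOAD_SCRIPT, then ./load, executes each query in queries.sql
+# three times through ./query with cache drops in between, restarts the
+# server between queries when BENCH_RESTARTABLE=yes, and finally records
+# ./data-size.)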
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-parquet-partitioned/check b/duckdb-parquet-partitioned/check new file mode 100755 index 0000000000..3c457f3f1e --- /dev/null +++ b/duckdb-parquet-partitioned/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +duckdb -c "SELECT 1" >/dev/null diff --git a/duckdb-parquet-partitioned/data-size b/duckdb-parquet-partitioned/data-size new file mode 100755 index 0000000000..2d6921ab6d --- /dev/null +++ b/duckdb-parquet-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits_*.parquet | awk '/total$/ { print $1 }' diff --git a/duckdb-parquet-partitioned/install b/duckdb-parquet-partitioned/install new file mode 100755 index 0000000000..d58738fec1 --- /dev/null +++ b/duckdb-parquet-partitioned/install @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +if ! command -v duckdb >/dev/null 2>&1; then + export HOME=${HOME:=~} + curl https://install.duckdb.org | sh + sudo ln -sf "$HOME/.duckdb/cli/latest/duckdb" /usr/local/bin/duckdb +fi diff --git a/duckdb-parquet-partitioned/load b/duckdb-parquet-partitioned/load new file mode 100755 index 0000000000..54176f57c7 --- /dev/null +++ b/duckdb-parquet-partitioned/load @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# create.sql defines a VIEW over hits_*.parquet — no ingestion happens. +rm -f hits.db +duckdb hits.db -f create.sql +sync diff --git a/duckdb-parquet-partitioned/query b/duckdb-parquet-partitioned/query new file mode 100755 index 0000000000..a21f567705 --- /dev/null +++ b/duckdb-parquet-partitioned/query @@ -0,0 +1,20 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via duckdb against hits.db (a VIEW +# over hits_*.parquet). +# Stdout: query result. Stderr: runtime in fractional seconds on the last +# line. Exit non-zero on error. +set -e + +query=$(cat) + +out=$(duckdb hits.db -c "SET parquet_metadata_cache=true" -c ".timer on" -c "$query" 2>/tmp/duckdb.err.$$) && status=0 || status=$? 
+ +if [ "$status" -ne 0 ]; then + cat /tmp/duckdb.err.$$ >&2 + rm -f /tmp/duckdb.err.$$ + exit "$status" +fi +rm -f /tmp/duckdb.err.$$ + +printf '%s\n' "$out" | grep -v '^Run Time ' +printf '%s\n' "$out" | awk '/^Run Time/ { print $5 }' | tail -n1 >&2 diff --git a/duckdb-parquet-partitioned/results/20260509/c6a.4xlarge.json b/duckdb-parquet-partitioned/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..38468d5841 --- /dev/null +++ b/duckdb-parquet-partitioned/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "DuckDB (Parquet, partitioned)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","embedded","stateless"], + "load_time": 22, + "data_size": 14737666736, + "result": [ + [0.077, 0.037, 0.037], + [0.098, 0.028, 0.028], + [0.169, 0.059, 0.06], + [0.611, 0.065, 0.062], + [1.648, 0.368, 0.366], + [1.45, 0.541, 0.505], + [0.097, 0.046, 0.046], + [0.1, 0.03, 0.03], + [1.126, 0.474, 0.475], + [1.671, 0.588, 0.58], + [0.723, 0.14, 0.132], + [1.505, 0.16, 0.151], + [1.746, 0.573, 0.566], + [3.156, 0.943, 0.946], + [1.328, 0.613, 0.62], + [0.83, 0.456, 0.454], + [2.953, 1.095, 1.085], + [2.732, 0.828, 0.832], + [5.221, 2.01, 1.974], + [0.155, 0.034, 0.032], + [10.668, 0.87, 0.869], + [11.935, 0.822, 0.821], + [20.646, 1.662, 1.658], + [4.25, 0.437, 0.433], + [0.218, 0.141, 0.136], + [1.683, 0.297, 0.29], + [0.318, 0.101, 0.098], + [10.94, 0.751, 0.754], + [10.102, 8.491, 8.487], + [0.144, 0.059, 0.06], + [2.302, 0.553, 0.554], + [6.066, 0.697, 0.679], + [5.397, 2.018, 2.027], + [10.055, 2.36, 2.375], + [10.045, 2.406, 2.435], + [0.78, 0.675, 0.681], + [0.191, 0.114, 0.116], + [0.147, 0.091, 0.087], + [0.133, 0.063, 0.062], + [0.385, 0.222, 0.211], + [0.114, 0.037, 0.038], + [0.102, 0.036, 0.033], + [0.114, 0.047, 0.045] +] +} + diff --git a/duckdb-parquet-partitioned/results/20260509/c6a.metal.json b/duckdb-parquet-partitioned/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..683582ba71 --- /dev/null +++ b/duckdb-parquet-partitioned/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "DuckDB (Parquet, partitioned)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","embedded","stateless"], + "load_time": 64, + "data_size": 14737666736, + "result": [ + [0.341, 0.336, 0.329], + [0.095, 0.068, 0.057], + [0.126, 0.078, 0.071], + [0.467, 0.107, 0.082], + [1.206, 0.324, 0.306], + [1.309, 0.383, 0.355], + [0.093, 0.053, 0.071], + [0.13, 0.092, 0.099], + [0.839, 0.387, 0.403], + [1.41, 0.533, 0.51], + [0.559, 0.225, 0.227], + [1.036, 0.264, 0.307], + [1.426, 0.454, 0.433], + [2.448, 0.685, 0.68], + [1.033, 0.472, 0.472], + [0.568, 0.32, 0.336], + [2.479, 0.628, 0.645], + [2.261, 0.673, 0.658], + [4.147, 0.913, 0.935], + [0.205, 0.067, 0.066], + [9.945, 0.295, 0.278], + [11.592, 0.312, 0.35], + [19.702, 0.92, 0.718], + [11.414, 0.624, 0.563], + [0.538, 0.14, 0.519], + [1.61, 0.165, 0.173], + [1.849, 0.149, 0.136], + [10.165, 0.32, 0.316], + [8.903, 1.583, 1.572], + [0.123, 0.087, 0.091], + [2.259, 0.427, 0.433], + [6.004, 0.59, 0.555], + [4.671, 1.37, 1.423], + [9.964, 1.009, 0.992], + [9.974, 0.94, 1.072], + [0.341, 0.359, 0.333], + [0.2, 0.144, 0.129], + [0.151, 0.089, 0.094], + [0.163, 0.084, 0.082], + [0.389, 0.24, 0.241], + [0.152, 0.096, 0.076], + [0.133, 0.101, 0.085], + [0.133, 0.079, 0.076] +] 
+} + diff --git a/duckdb-parquet-partitioned/run.sh b/duckdb-parquet-partitioned/run.sh deleted file mode 100755 index 61d016b4da..0000000000 --- a/duckdb-parquet-partitioned/run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=("SET parquet_metadata_cache=true") - cli_params+=("-c") - cli_params+=(".timer on") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("${query}") - done; - echo "${cli_params[@]}" - duckdb hits.db "${cli_params[@]}" -done; diff --git a/duckdb-parquet-partitioned/start b/duckdb-parquet-partitioned/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/duckdb-parquet-partitioned/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-parquet-partitioned/stop b/duckdb-parquet-partitioned/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/duckdb-parquet-partitioned/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-parquet/benchmark.sh b/duckdb-parquet/benchmark.sh index e8ad1d8940..fc4bacc8f3 100755 --- a/duckdb-parquet/benchmark.sh +++ b/duckdb-parquet/benchmark.sh @@ -1,25 +1,5 @@ #!/bin/bash - -# Install -export HOME=${HOME:=~} -curl https://install.duckdb.org | sh -export PATH=$HOME'/.duckdb/cli/latest':$PATH - -# Load the data -../download-hits-parquet-single - -echo -n "Load time: " -command time -f '%e' duckdb hits.db -f create.sql - -echo "Data size: $(du -bcs hits*.parquet | grep total)" - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -wc -c hits.db - -cat log.txt | - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | - sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-parquet/check b/duckdb-parquet/check new file mode 100755 index 0000000000..3c457f3f1e --- /dev/null +++ b/duckdb-parquet/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +duckdb -c "SELECT 1" >/dev/null diff --git a/duckdb-parquet/data-size b/duckdb-parquet/data-size new file mode 100755 index 0000000000..1aecba4a18 --- /dev/null +++ b/duckdb-parquet/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits*.parquet | awk '/total$/ { print $1 }' diff --git a/duckdb-parquet/install b/duckdb-parquet/install new file mode 100755 index 0000000000..d58738fec1 --- /dev/null +++ b/duckdb-parquet/install @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +if ! command -v duckdb >/dev/null 2>&1; then + export HOME=${HOME:=~} + curl https://install.duckdb.org | sh + sudo ln -sf "$HOME/.duckdb/cli/latest/duckdb" /usr/local/bin/duckdb +fi diff --git a/duckdb-parquet/load b/duckdb-parquet/load new file mode 100755 index 0000000000..99b8db36e2 --- /dev/null +++ b/duckdb-parquet/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# create.sql defines a VIEW over hits.parquet — no ingestion happens, the +# parquet file is read in place at query time. We persist the view in +# hits.db so subsequent query invocations see it. 
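+# For orientation, create.sql is expected to contain roughly:
+#   CREATE VIEW hits AS SELECT * FROM read_parquet('hits.parquet', binary_as_string=True);
+# (the file itself, untouched by this change, is authoritative).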
+rm -f hits.db +duckdb hits.db -f create.sql +sync diff --git a/duckdb-parquet/query b/duckdb-parquet/query new file mode 100755 index 0000000000..46a748cb7e --- /dev/null +++ b/duckdb-parquet/query @@ -0,0 +1,20 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via duckdb against hits.db (which +# contains a VIEW over hits.parquet). +# Stdout: query result. Stderr: query runtime in fractional seconds on the +# last line. Exit non-zero on error. +set -e + +query=$(cat) + +out=$(duckdb hits.db -c "SET parquet_metadata_cache=true" -c ".timer on" -c "$query" 2>/tmp/duckdb.err.$$) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + cat /tmp/duckdb.err.$$ >&2 + rm -f /tmp/duckdb.err.$$ + exit "$status" +fi +rm -f /tmp/duckdb.err.$$ + +printf '%s\n' "$out" | grep -v '^Run Time ' +printf '%s\n' "$out" | awk '/^Run Time/ { print $5 }' | tail -n1 >&2 diff --git a/duckdb-parquet/results/20260509/c6a.4xlarge.json b/duckdb-parquet/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..1005dfdbf3 --- /dev/null +++ b/duckdb-parquet/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "DuckDB (Parquet, single)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","embedded","stateless"], + "load_time": 1, + "data_size": 14779976446, + "result": [ + [0.086, 0.052, 0.053], + [0.145, 0.07, 0.069], + [0.198, 0.093, 0.093], + [0.507, 0.1, 0.101], + [1.377, 0.389, 0.39], + [1.215, 0.571, 0.535], + [0.15, 0.08, 0.081], + [0.163, 0.072, 0.07], + [1.032, 0.485, 0.491], + [1.435, 0.595, 0.59], + [0.621, 0.177, 0.176], + [1.176, 0.193, 0.195], + [1.506, 0.58, 0.582], + [2.801, 0.932, 0.939], + [0.953, 0.64, 0.634], + [0.599, 0.437, 0.436], + [2.405, 1.049, 1.046], + [2.163, 0.826, 0.835], + [4.529, 1.872, 1.862], + [0.205, 0.073, 0.073], + [9.719, 0.917, 0.911], + [11.094, 0.856, 0.857], + [20.327, 1.708, 1.722], + [2.686, 0.435, 0.441], + [0.286, 0.184, 0.18], + [0.806, 0.344, 0.339], + [0.269, 0.17, 0.167], + [9.531, 0.79, 0.801], + [8.945, 8.685, 8.677], + [0.194, 0.094, 0.089], + [2.233, 0.587, 0.602], + [5.802, 0.7, 0.701], + [5.13, 1.955, 1.96], + [10.011, 2.228, 2.218], + [10.041, 2.343, 2.318], + [0.686, 0.542, 0.546], + [0.3, 0.162, 0.175], + [0.237, 0.138, 0.141], + [0.237, 0.111, 0.112], + [0.508, 0.277, 0.277], + [0.2, 0.079, 0.078], + [0.168, 0.078, 0.075], + [0.179, 0.088, 0.092] +] +} + diff --git a/duckdb-parquet/results/20260509/c6a.metal.json b/duckdb-parquet/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..d7af05748e --- /dev/null +++ b/duckdb-parquet/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "DuckDB (Parquet, single)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14779976446, + "result": [ + [0.1, 0.048, 0.05], + [0.151, 0.083, 0.079], + [0.152, 0.089, 0.103], + [0.358, 0.123, 0.122], + [0.96, 0.306, 0.342], + [0.965, 0.415, 0.41], + [0.124, 0.078, 0.077], + [0.153, 0.08, 0.084], + [0.772, 0.407, 0.409], + [1.086, 0.507, 0.534], + [0.464, 0.237, 0.252], + [0.75, 0.28, 0.292], + [1.109, 0.409, 0.406], + [2.216, 0.641, 0.626], + [0.975, 0.445, 0.446], + [0.472, 0.371, 0.372], + [2.144, 0.66, 0.617], + [2.136, 0.64, 0.669], + [4.006, 0.982, 0.928], + [0.182, 0.1, 0.097], + [9.385, 0.276, 0.279], + [10.936, 0.354, 
0.359], + [19.562, 0.678, 0.667], + [10.695, 0.551, 0.643], + [2.035, 0.264, 0.247], + [0.693, 0.216, 0.233], + [2.026, 0.171, 0.226], + [9.701, 0.359, 0.348], + [8.206, 1.593, 1.632], + [0.154, 0.106, 0.091], + [2.027, 0.486, 0.463], + [5.51, 0.602, 0.674], + [4.678, 1.412, 1.658], + [9.83, 1.112, 1.041], + [9.755, 1.02, 1.055], + [0.371, 0.312, 0.347], + [0.271, 0.177, 0.175], + [0.212, 0.149, 0.146], + [0.226, 0.119, 0.117], + [0.491, 0.297, 0.302], + [0.203, 0.102, 0.089], + [0.159, 0.095, 0.086], + [0.177, 0.094, 0.097] +] +} + diff --git a/duckdb-parquet/run.sh b/duckdb-parquet/run.sh deleted file mode 100755 index 61d016b4da..0000000000 --- a/duckdb-parquet/run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=("SET parquet_metadata_cache=true") - cli_params+=("-c") - cli_params+=(".timer on") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("${query}") - done; - echo "${cli_params[@]}" - duckdb hits.db "${cli_params[@]}" -done; diff --git a/duckdb-parquet/start b/duckdb-parquet/start new file mode 100755 index 0000000000..c1d4b2fca8 --- /dev/null +++ b/duckdb-parquet/start @@ -0,0 +1,3 @@ +#!/bin/bash +# duckdb is an embedded CLI tool — no daemon to start. +exit 0 diff --git a/duckdb-parquet/stop b/duckdb-parquet/stop new file mode 100755 index 0000000000..7af43b828e --- /dev/null +++ b/duckdb-parquet/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# duckdb is an embedded CLI tool — no daemon to stop. +exit 0 diff --git a/duckdb-vortex-partitioned/benchmark.sh b/duckdb-vortex-partitioned/benchmark.sh index 0353f04f1d..3b63e772a6 100755 --- a/duckdb-vortex-partitioned/benchmark.sh +++ b/duckdb-vortex-partitioned/benchmark.sh @@ -1,53 +1,5 @@ #!/bin/bash - -# Install -sudo apt-get update -y -sudo apt-get install -y ninja-build cmake build-essential make ccache pip clang pkg-config - -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable --no-modify-path - -export CC=clang -export CXX=clang++ -git clone https://github.com/vortex-data/duckdb-vortex --recursive -cd duckdb-vortex -git fetch --tags -git checkout v0.44.0 -git submodule update --init --recursive -GEN=ninja NATIVE_ARCH=1 LTO=thin make -export PATH="`pwd`/build/release/:$PATH" -cd .. - -# Load the data -../download-hits-parquet-partitioned - -# Convert parquet files to vortex partitioned -echo -n "Load time: " -seq 0 99 | command time -f '%e' xargs -P"$(nproc)" -I{} bash -c ' - if [ ! 
-f "hits_{}.vortex" ]; then - duckdb -c " - COPY ( - SELECT * - REPLACE ( - make_date(EventDate) AS EventDate, - epoch_ms(EventTime * 1000) as EventTime - ) - FROM read_parquet('"'"'hits_{}.parquet'"'"', binary_as_string=True) - ) - TO '"'"'hits_{}.vortex'"'"' (FORMAT VORTEX) - " - fi -' - -echo -n "Load time: " -command time -f '%e' duckdb hits-partitioned.db -c "CREATE VIEW hits AS SELECT * FROM read_vortex('hits_*.vortex')"; - -# Run the queries -echo 'partitioned' - -./run.sh 'hits-partitioned.db' 2>&1 | tee log-p.txt -cat log-p.txt | - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | - sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo "Data size: $(du -bcs hits_*.vortex | grep total)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-vortex-partitioned/check b/duckdb-vortex-partitioned/check new file mode 100755 index 0000000000..43c60a61e2 --- /dev/null +++ b/duckdb-vortex-partitioned/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +DUCKDB="$(pwd)/duckdb-vortex/build/release/duckdb" +"$DUCKDB" -c "SELECT 1" >/dev/null diff --git a/duckdb-vortex-partitioned/data-size b/duckdb-vortex-partitioned/data-size new file mode 100755 index 0000000000..4bb0a059a6 --- /dev/null +++ b/duckdb-vortex-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits_*.vortex | awk '/total$/ { print $1 }' diff --git a/duckdb-vortex-partitioned/install b/duckdb-vortex-partitioned/install new file mode 100755 index 0000000000..9306aac5a4 --- /dev/null +++ b/duckdb-vortex-partitioned/install @@ -0,0 +1,38 @@ +#!/bin/bash +set -e + +# Build duckdb-vortex from source. Idempotent. +if [ -x duckdb-vortex/build/release/duckdb ]; then + exit 0 +fi + +# vcpkg fails the build with `unable to read $HOME` when cloud-init runs +# this script with HOME unset. cloud-init.sh.in already exports it, but +# stamp HOME locally too so the build works regardless of which +# cloud-init template generated the wrapper. +export HOME="${HOME:-/root}" + +sudo apt-get update -y +sudo apt-get install -y ninja-build cmake build-essential make ccache pip clang pkg-config + +if ! command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \ + | sh -s -- -y --default-toolchain stable --no-modify-path +fi +# shellcheck disable=SC1091 +[ -f "$HOME/.cargo/env" ] && source "$HOME/.cargo/env" + +export CC=clang +export CXX=clang++ + +if [ ! -d duckdb-vortex ]; then + git clone https://github.com/vortex-data/duckdb-vortex --recursive +fi + +( + cd duckdb-vortex + git fetch --tags + git checkout v0.44.0 + git submodule update --init --recursive + GEN=ninja NATIVE_ARCH=1 LTO=thin make +) diff --git a/duckdb-vortex-partitioned/load b/duckdb-vortex-partitioned/load new file mode 100755 index 0000000000..282755fc01 --- /dev/null +++ b/duckdb-vortex-partitioned/load @@ -0,0 +1,29 @@ +#!/bin/bash +set -e + +DUCKDB="$(pwd)/duckdb-vortex/build/release/duckdb" + +# Convert each parquet partition to a vortex file in parallel. +seq 0 99 | xargs -P"$(nproc)" -I{} bash -c ' + if [ ! 
-f "hits_{}.vortex" ]; then + "'"$DUCKDB"'" -c " + COPY ( + SELECT * + REPLACE ( + make_date(EventDate) AS EventDate, + epoch_ms(EventTime * 1000) AS EventTime + ) + FROM read_parquet('"'"'hits_{}.parquet'"'"', binary_as_string=True) + ) + TO '"'"'hits_{}.vortex'"'"' (FORMAT VORTEX) + " + fi +' + +# Build a persistent VIEW over the resulting vortex files. +rm -f hits.db +"$DUCKDB" hits.db -c "CREATE VIEW hits AS SELECT * FROM read_vortex('hits_*.vortex')" + +# Free the source parquet files. +rm -f hits_*.parquet +sync diff --git a/duckdb-vortex-partitioned/query b/duckdb-vortex-partitioned/query new file mode 100755 index 0000000000..3e67e19772 --- /dev/null +++ b/duckdb-vortex-partitioned/query @@ -0,0 +1,21 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via the custom-built duckdb-vortex +# binary against hits.db (a VIEW over hits_*.vortex). +# Stdout: query result. Stderr: runtime in fractional seconds on the last +# line. Exit non-zero on error. +set -e + +DUCKDB="$(pwd)/duckdb-vortex/build/release/duckdb" +query=$(cat) + +out=$("$DUCKDB" hits.db -c ".timer on" -c "$query" 2>/tmp/duckdb.err.$$) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + cat /tmp/duckdb.err.$$ >&2 + rm -f /tmp/duckdb.err.$$ + exit "$status" +fi +rm -f /tmp/duckdb.err.$$ + +printf '%s\n' "$out" | grep -v '^Run Time ' +printf '%s\n' "$out" | awk '/^Run Time/ { print $5 }' | tail -n1 >&2 diff --git a/duckdb-vortex-partitioned/run.sh b/duckdb-vortex-partitioned/run.sh deleted file mode 100755 index 71bd5c4a5c..0000000000 --- a/duckdb-vortex-partitioned/run.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=(".timer on") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("${query}") - done; - echo "${cli_params[@]}" - duckdb "$1" "${cli_params[@]}" -done; diff --git a/duckdb-vortex-partitioned/start b/duckdb-vortex-partitioned/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/duckdb-vortex-partitioned/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-vortex-partitioned/stop b/duckdb-vortex-partitioned/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/duckdb-vortex-partitioned/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-vortex/benchmark.sh b/duckdb-vortex/benchmark.sh index 5234cb9b82..fc4bacc8f3 100755 --- a/duckdb-vortex/benchmark.sh +++ b/duckdb-vortex/benchmark.sh @@ -1,31 +1,5 @@ #!/bin/bash - -set -Eeuo pipefail - -# Install -export HOME=${HOME:=~} -curl https://install.duckdb.org | sh -export PATH=$HOME'/.duckdb/cli/latest':$PATH - -duckdb -c "INSTALL vortex;" - -# Load the data -../download-hits-parquet-single - -# Convert parquet files to Vortex -echo -n "Load time: " -command time -f '%e' duckdb -c "LOAD vortex; COPY 'hits.parquet' TO 'hits.vortex' (FORMAT vortex);" - -# Create view and macro -echo -n "Load time: " -command time -f '%e' duckdb hits-single.db -f create.sql - -echo 'single' - -./run.sh 'hits-single.db' 2>&1 | tee log-s.txt -cat log-s.txt | - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | - sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo "Data size: $(du -b hits.vortex)" +# Thin shim — actual flow is in 
lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-vortex/check b/duckdb-vortex/check new file mode 100755 index 0000000000..3c457f3f1e --- /dev/null +++ b/duckdb-vortex/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +duckdb -c "SELECT 1" >/dev/null diff --git a/duckdb-vortex/data-size b/duckdb-vortex/data-size new file mode 100755 index 0000000000..2dbaf40fad --- /dev/null +++ b/duckdb-vortex/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits.vortex | awk 'END { print $1 }' diff --git a/duckdb-vortex/install b/duckdb-vortex/install new file mode 100755 index 0000000000..193fe2e6f8 --- /dev/null +++ b/duckdb-vortex/install @@ -0,0 +1,19 @@ +#!/bin/bash +set -e + +# duckdb writes its extension cache under $HOME/.duckdb/extensions. With +# HOME unset (cloud-init occasionally runs scripts that way) the +# `INSTALL vortex` below ends up writing to /.duckdb and later sibling +# scripts (load/query) can't find the extension. The previous +# `export HOME=${HOME:=~}` was inside the install-duckdb branch, so +# re-runs with duckdb already on PATH still hit the bug. Pin HOME +# unconditionally at the top. +export HOME="${HOME:-/root}" + +if ! command -v duckdb >/dev/null 2>&1; then + curl https://install.duckdb.org | sh + sudo ln -sf "$HOME/.duckdb/cli/latest/duckdb" /usr/local/bin/duckdb +fi + +# Install the vortex extension (idempotent — safe to re-run). +duckdb -c "INSTALL vortex;" diff --git a/duckdb-vortex/load b/duckdb-vortex/load new file mode 100755 index 0000000000..5fa24aacfd --- /dev/null +++ b/duckdb-vortex/load @@ -0,0 +1,13 @@ +#!/bin/bash +set -e + +# Convert source parquet to Vortex format. +duckdb -c "LOAD vortex; COPY 'hits.parquet' TO 'hits.vortex' (FORMAT vortex);" + +# Create the persistent VIEW (over hits.vortex) in hits.db. +rm -f hits.db +duckdb hits.db -f create.sql + +# Source parquet no longer needed. +rm -f hits.parquet +sync diff --git a/duckdb-vortex/query b/duckdb-vortex/query new file mode 100755 index 0000000000..5eef5331ae --- /dev/null +++ b/duckdb-vortex/query @@ -0,0 +1,20 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via duckdb against hits.db (a VIEW +# over hits.vortex). The vortex extension is loaded for each invocation. +# Stdout: query result. Stderr: runtime in fractional seconds on the last +# line. Exit non-zero on error. +set -e + +query=$(cat) + +out=$(duckdb hits.db -c "LOAD vortex;" -c ".timer on" -c "$query" 2>/tmp/duckdb.err.$$) && status=0 || status=$? 
+ +if [ "$status" -ne 0 ]; then + cat /tmp/duckdb.err.$$ >&2 + rm -f /tmp/duckdb.err.$$ + exit "$status" +fi +rm -f /tmp/duckdb.err.$$ + +printf '%s\n' "$out" | grep -v '^Run Time ' +printf '%s\n' "$out" | awk '/^Run Time/ { print $5 }' | tail -n1 >&2 diff --git a/duckdb-vortex/results/20260509/c6a.metal.json b/duckdb-vortex/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..bc3c7688dc --- /dev/null +++ b/duckdb-vortex/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "DuckDB (Vortex, single)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 227, + "data_size": 15731820628, + "result": [ + [0.054, 0.031, 0.031], + [0.096, 0.063, 0.061], + [0.321, 0.093, 0.091], + [0.884, 0.208, 0.198], + [0.985, 0.414, 0.421], + [1.05, 0.42, 0.395], + [0.057, 0.025, 0.026], + [0.112, 0.069, 0.069], + [1.25, 0.507, 0.486], + [1.561, 0.59, 0.614], + [1.052, 0.31, 0.316], + [1.026, 0.314, 0.32], + [1.055, 0.549, 0.624], + [2.859, 1.052, 1.087], + [1.297, 0.67, 0.677], + [1.004, 0.386, 0.386], + [2.651, 0.897, 0.891], + [2.638, 0.93, 0.9], + [3.733, 1.221, 1.118], + [0.702, 0.126, 0.127], + [14.876, 3.365, 3.287], + [17.04, 3.92, 2.619], + [22.002, 3.311, 4.366], + [44.907, 20.978, 21.33], + [0.826, 0.334, 0.376], + [0.995, 0.485, 0.376], + [0.874, 0.326, 0.37], + [14.992, 3.38, 3.405], + [12.702, 3.596, 3.226], + [0.321, 0.084, 0.083], + [2.409, 0.704, 0.704], + [5.434, 0.992, 1.01], + [4.132, 1.374, 1.365], + [15.087, 2.701, 2.885], + [15.045, 2.692, 2.761], + [0.696, 0.335, 0.357], + [0.785, 0.568, 0.58], + [0.564, 0.354, 0.347], + [0.727, 0.565, 0.585], + [1.493, 1.325, 1.264], + [0.33, 0.168, 0.17], + [0.15, 0.093, 0.098], + [0.123, 0.065, 0.063] +] +} + diff --git a/duckdb-vortex/run.sh b/duckdb-vortex/run.sh deleted file mode 100755 index 30484964b6..0000000000 --- a/duckdb-vortex/run.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -set -Eeuo pipefail - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=("LOAD vortex;") - cli_params+=("-c") - cli_params+=(".timer on") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("${query}") - done; - echo "${cli_params[@]}" - duckdb "$1" "${cli_params[@]}" -done; diff --git a/duckdb-vortex/start b/duckdb-vortex/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/duckdb-vortex/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-vortex/stop b/duckdb-vortex/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/duckdb-vortex/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb/benchmark.sh b/duckdb/benchmark.sh index 767d7fc536..fc4bacc8f3 100755 --- a/duckdb/benchmark.sh +++ b/duckdb/benchmark.sh @@ -1,24 +1,5 @@ #!/bin/bash - -# Install -export HOME=${HOME:=~} -curl https://install.duckdb.org | sh -export PATH=$HOME'/.duckdb/cli/latest':$PATH - -# Load the data -../download-hits-parquet-single - -echo -n "Load time: " -command time -f '%e' duckdb hits.db -storage_version latest -f create.sql -f load.sql - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -wc -c hits.db - -cat log.txt | - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | - sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/; s/^Run Time \(s\): 
real\s*([0-9.]+).*$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb/check b/duckdb/check new file mode 100755 index 0000000000..3c457f3f1e --- /dev/null +++ b/duckdb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +duckdb -c "SELECT 1" >/dev/null diff --git a/duckdb/data-size b/duckdb/data-size new file mode 100755 index 0000000000..b0e7eef3b9 --- /dev/null +++ b/duckdb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < hits.db diff --git a/duckdb/install b/duckdb/install new file mode 100755 index 0000000000..e7508ab285 --- /dev/null +++ b/duckdb/install @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +if ! command -v duckdb >/dev/null 2>&1; then + export HOME=${HOME:=~} + curl https://install.duckdb.org | sh + # The DuckDB installer drops the binary into ~/.duckdb/cli/latest/duckdb + # and only suggests adding it to PATH. Symlink into /usr/local/bin so + # sibling scripts (check, load, query) find it unconditionally. + sudo ln -sf "$HOME/.duckdb/cli/latest/duckdb" /usr/local/bin/duckdb +fi diff --git a/duckdb/load.sql b/duckdb/load old mode 100644 new mode 100755 similarity index 53% rename from duckdb/load.sql rename to duckdb/load index 24891835f9..ae8ee21ff0 --- a/duckdb/load.sql +++ b/duckdb/load @@ -1,3 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: blow away any prior partial DB. +rm -f hits.db + +# Build the database from the source parquet via inline create + load DDL. +duckdb hits.db -storage_version latest <<'SQL' +.read create.sql INSERT INTO hits SELECT * REPLACE ( make_date(EventDate) AS EventDate, @@ -5,3 +14,7 @@ SELECT * REPLACE ( epoch_ms(ClientEventTime * 1000) AS ClientEventTime, epoch_ms(LocalEventTime * 1000) AS LocalEventTime) FROM read_parquet('hits.parquet', binary_as_string=True); +SQL + +rm -f hits.parquet +sync diff --git a/duckdb/query b/duckdb/query new file mode 100755 index 0000000000..51d155afd8 --- /dev/null +++ b/duckdb/query @@ -0,0 +1,25 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via duckdb against hits.db. +# Stdout: query result (boxed format). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +# duckdb writes both the result and the `Run Time (s): real X.XXX` line to +# stdout; capture, split, redirect timing to stderr. +out=$(duckdb hits.db -c ".timer on" -c "$query" 2>/tmp/duckdb.err.$$) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + cat /tmp/duckdb.err.$$ >&2 + rm -f /tmp/duckdb.err.$$ + exit "$status" +fi +rm -f /tmp/duckdb.err.$$ + +# Stdout: everything except the Run Time line. +printf '%s\n' "$out" | grep -v '^Run Time ' + +# Stderr: the timing in seconds. 
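+# The CLI's timer line has the shape
+#   Run Time (s): real 1.234 user 9.876 sys 0.012
+# (the same pattern the old log-scraping pipeline grepped for), so the fifth
+# whitespace-separated field is the wall-clock seconds.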
+printf '%s\n' "$out" | awk '/^Run Time/ { print $5 }' | tail -n1 >&2 diff --git a/duckdb/results/20260509/c6a.4xlarge.json b/duckdb/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..976abb0fda --- /dev/null +++ b/duckdb/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "DuckDB", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","embedded"], + "load_time": 122, + "data_size": 20456681472, + "result": [ + [0.051, 0.018, 0.018], + [0.149, 0.041, 0.04], + [0.255, 0.076, 0.075], + [0.258, 0.089, 0.09], + [0.5, 0.352, 0.345], + [0.983, 0.343, 0.343], + [0.11, 0.032, 0.031], + [0.158, 0.038, 0.039], + [0.896, 0.456, 0.456], + [1.599, 0.618, 0.625], + [0.533, 0.155, 0.155], + [0.853, 0.179, 0.176], + [0.984, 0.415, 0.417], + [2.306, 0.799, 0.794], + [1.376, 0.478, 0.48], + [0.567, 0.394, 0.4], + [2.219, 0.891, 0.891], + [1.987, 0.657, 0.67], + [4.545, 1.656, 1.655], + [0.162, 0.051, 0.05], + [10.667, 0.722, 0.717], + [12.305, 0.77, 0.768], + [15.39, 1.601, 1.043], + [0.574, 0.363, 0.359], + [0.191, 0.074, 0.071], + [0.893, 0.167, 0.167], + [0.188, 0.07, 0.074], + [10.814, 0.661, 0.659], + [8.63, 6.489, 6.478], + [0.184, 0.068, 0.07], + [2.788, 0.413, 0.411], + [5.824, 0.621, 0.619], + [4.946, 2.024, 2.05], + [11.16, 2.049, 2.042], + [11.179, 2.211, 2.215], + [0.628, 0.473, 0.473], + [0.147, 0.055, 0.055], + [0.11, 0.038, 0.039], + [0.141, 0.043, 0.04], + [0.226, 0.099, 0.098], + [0.141, 0.041, 0.042], + [0.117, 0.041, 0.04], + [0.109, 0.039, 0.04] +] +} + diff --git a/duckdb/results/20260509/c6a.metal.json b/duckdb/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..6ac3fef9b5 --- /dev/null +++ b/duckdb/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "DuckDB", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","embedded"], + "load_time": 102, + "data_size": 20635201536, + "result": [ + [0.047, 0.017, 0.018], + [0.101, 0.047, 0.044], + [0.133, 0.066, 0.066], + [0.972, 0.081, 0.083], + [0.965, 0.278, 0.291], + [1.737, 0.395, 0.399], + [0.086, 0.032, 0.03], + [0.088, 0.042, 0.04], + [1.473, 0.35, 0.357], + [2.493, 0.478, 0.515], + [1.147, 0.27, 0.255], + [1.498, 0.293, 0.29], + [1.712, 0.392, 0.392], + [3.295, 0.585, 0.597], + [2.256, 0.434, 0.438], + [0.65, 0.301, 0.281], + [3.263, 0.561, 0.573], + [3.271, 0.577, 0.572], + [6.186, 1.009, 1.062], + [0.14, 0.056, 0.059], + [15.567, 0.597, 0.598], + [17.581, 0.782, 0.769], + [16.145, 0.986, 0.892], + [2.826, 0.991, 0.992], + [0.503, 0.182, 0.181], + [1.757, 0.199, 0.196], + [0.49, 0.182, 0.166], + [15.552, 0.656, 0.741], + [11.912, 1.182, 1.156], + [0.117, 0.071, 0.07], + [4.394, 0.468, 0.479], + [8.396, 0.819, 0.774], + [6.258, 1.279, 1.268], + [15.421, 1.207, 1.191], + [15.585, 1.151, 1.194], + [0.299, 0.258, 0.26], + [0.117, 0.055, 0.056], + [0.078, 0.045, 0.045], + [0.115, 0.046, 0.044], + [0.21, 0.112, 0.112], + [0.117, 0.054, 0.054], + [0.106, 0.046, 0.044], + [0.096, 0.044, 0.042] +] +} + diff --git a/duckdb/run.sh b/duckdb/run.sh deleted file mode 100755 index 25aee48efe..0000000000 --- a/duckdb/run.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=(".timer 
on") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("${query}") - done; - echo "${cli_params[@]}" - duckdb hits.db "${cli_params[@]}" -done; diff --git a/duckdb/start b/duckdb/start new file mode 100755 index 0000000000..c1d4b2fca8 --- /dev/null +++ b/duckdb/start @@ -0,0 +1,3 @@ +#!/bin/bash +# duckdb is an embedded CLI tool — no daemon to start. +exit 0 diff --git a/duckdb/stop b/duckdb/stop new file mode 100755 index 0000000000..7af43b828e --- /dev/null +++ b/duckdb/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# duckdb is an embedded CLI tool — no daemon to stop. +exit 0 diff --git a/elasticsearch/benchmark.sh b/elasticsearch/benchmark.sh index e50248e128..eec9d34a89 100755 --- a/elasticsearch/benchmark.sh +++ b/elasticsearch/benchmark.sh @@ -1,80 +1,6 @@ #!/bin/bash - -# Install prerequisite packages -sudo apt-get update -y -sudo apt-get install -y apt-transport-https ca-certificates wget gpg time jq bc - -# Add Elastic's signing key -wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo gpg --dearmor -o /usr/share/keyrings/elasticsearch-keyring.gpg - -# Add the repository for version 9.x -echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/9.x/apt stable main" | sudo tee /etc/apt/sources.list.d/elastic-9.x.list - -# Update package list and install -sudo apt-get update -y -sudo apt-get install -y elasticsearch - -sudo /bin/systemctl daemon-reload -sudo /bin/systemctl enable elasticsearch.service -sudo systemctl start elasticsearch.service - -# Disable security (all other are default configs) -cat << EOF > elasticsearch.yml -path.data: /var/lib/elasticsearch -path.logs: /var/log/elasticsearch - -# Disable security features -xpack.security.enabled: false -xpack.security.http.ssl.enabled: false -xpack.security.transport.ssl.enabled: false - -cluster.initial_master_nodes: ["clickbench"] -http.host: 0.0.0.0 -EOF - -sudo cp elasticsearch.yml /etc/elasticsearch/elasticsearch.yml - -# Restart Elasticsearch with the updated configs -sudo systemctl restart elasticsearch.service - - -# Check Elasticsearch is alive - you should get a JSON response -curl -sS -X GET 'http://localhost:9200' - - -###### Create index with mappings mirroring data types in ClickHouse - -# Note: Field types were mapped as closely as possible to https://github.com/ClickHouse/ClickBench/blob/main/clickhouse/create.sql I chose "keyword" because queries are not taking advantage of freetext search. - -# Note: Elasticsearch does not have the concept of a primary key, but it does have an "index sorting" feature, which is intended to help in analytical use cases where sort order on disk matters. 
I set it to the same parameters as primary key for the ClickHouse tests https://github.com/ClickHouse/ClickBench/blob/main/clickhouse/create.sql - -curl -sS -X PUT "http://localhost:9200/hits?pretty" -H 'Content-Type: application/json' -d @mapping.json - - -###### Data loading (JSON dump via ES Bulk API insert) - -# Download the data -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' - -START=$(date +%s) - -# Reads and loads all the data into Elasticsearch -python3 load.py - -# check on progress -curl -sS -X GET "http://localhost:9200/hits/_stats/docs?pretty" - -# Makes sure all data is flushed to disk -curl -sS -X GET "http://localhost:9200/_flush?pretty" - -# when data loading is finished, to get all stats run -# For Load time, look at: bulk.total_time_in_millis -# For Data size, look at: store.total_data_set_size_in_bytes -curl -sS -X GET "http://localhost:9200/hits/_stats?pretty" | tee stats.json -echo "Data size: $(jq -r '._all.total.store.total_data_set_size_in_bytes' stats.json)" - -END=$(date +%s) -echo "Load time: $(echo "$END - $START" | bc)" - -###### Run the queries -./run.sh +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Source data is gzipped NDJSON, fetched directly inside ./load. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/elasticsearch/check b/elasticsearch/check new file mode 100755 index 0000000000..ca1249471d --- /dev/null +++ b/elasticsearch/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sSf 'http://localhost:9200' >/dev/null diff --git a/elasticsearch/data-size b/elasticsearch/data-size new file mode 100755 index 0000000000..5a1637919f --- /dev/null +++ b/elasticsearch/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +curl -sS -X GET 'http://localhost:9200/hits/_stats' \ + | jq -r '._all.total.store.total_data_set_size_in_bytes' diff --git a/elasticsearch/install b/elasticsearch/install new file mode 100755 index 0000000000..532a63d6d3 --- /dev/null +++ b/elasticsearch/install @@ -0,0 +1,39 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y apt-transport-https ca-certificates wget gpg time jq bc python3 python3-pip + +# Elastic signing key + repo (idempotent if already present). +if [ ! -f /usr/share/keyrings/elasticsearch-keyring.gpg ]; then + wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch \ + | sudo gpg --dearmor -o /usr/share/keyrings/elasticsearch-keyring.gpg +fi + +if [ ! -f /etc/apt/sources.list.d/elastic-9.x.list ]; then + echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/9.x/apt stable main" \ + | sudo tee /etc/apt/sources.list.d/elastic-9.x.list + sudo apt-get update -y +fi + +sudo apt-get install -y elasticsearch + +# load.py uses requests. +pip3 install --quiet --break-system-packages requests || pip3 install --quiet requests + +# Disable security; bind on all interfaces (matches original config). 
+cat <<EOF | sudo tee /etc/elasticsearch/elasticsearch.yml >/dev/null
+path.data: /var/lib/elasticsearch
+path.logs: /var/log/elasticsearch
+
+xpack.security.enabled: false
+xpack.security.http.ssl.enabled: false
+xpack.security.transport.ssl.enabled: false
+
+cluster.initial_master_nodes: ["clickbench"]
+http.host: 0.0.0.0
+EOF
+
+sudo /bin/systemctl daemon-reload
+sudo systemctl enable elasticsearch.service
+sudo systemctl restart elasticsearch.service
diff --git a/elasticsearch/load b/elasticsearch/load
new file mode 100755
index 0000000000..f9c5074805
--- /dev/null
+++ b/elasticsearch/load
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -eu
+
+# Fetch source data (NDJSON, gzipped). load.py reads it directly.
+wget --continue --progress=dot:giga \
+    'https://datasets.clickhouse.com/hits_compatible/hits.json.gz'
+
+# Idempotent: drop existing index.
+curl -sS -X DELETE "http://localhost:9200/hits" >/dev/null 2>&1 || true
+
+# Create index with explicit field types (mirrors ClickHouse).
+curl -sS -X PUT "http://localhost:9200/hits?pretty" \
+    -H 'Content-Type: application/json' -d @mapping.json >/dev/null
+
+# Bulk load NDJSON (gzipped) via the ES Bulk API.
+python3 load.py
+
+# Force a flush so on-disk size is final.
+curl -sS -X GET "http://localhost:9200/_flush?pretty" >/dev/null
+
+rm -f hits.json.gz
+sync
diff --git a/elasticsearch/load.py b/elasticsearch/load.py
index 43cbd72131..5fa9800fa2 100644
--- a/elasticsearch/load.py
+++ b/elasticsearch/load.py
@@ -11,7 +11,9 @@
 # Precompute action metadata line once
 ACTION_META_BYTES = (json.dumps({"index": {"_index": INDEX}}) + "\n").encode("utf-8")
 
-REQUEST_TIMEOUT = 30 # seconds
+# 30 s wasn't enough — bulk inserts hit `requests.exceptions.ReadTimeout`
+# once the index grew large and ES had to flush + merge mid-batch.
+REQUEST_TIMEOUT = 300 # seconds
 
 
 def bulk_stream(docs):
diff --git a/elasticsearch/query b/elasticsearch/query
new file mode 100755
index 0000000000..ffd08d7311
--- /dev/null
+++ b/elasticsearch/query
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Reads a SQL query from stdin, translates it to Elasticsearch DSL via the
+# /_sql/translate endpoint, then runs it against /_search.
+#
+# Stdout: ES JSON response.
+# Stderr: query runtime in fractional seconds on the last line (derived from
+# the search response's "took" field, which is reported in milliseconds).
+# Exit non-zero on error.
+set -e
+
+query=$(cat)
+
+# Clear query cache to keep tries comparable.
+curl -sS -X POST 'http://localhost:9200/hits/_cache/clear' >/dev/null 2>&1 || true
+
+# Translate SQL -> DSL.
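+# (The DSL is then executed through /_search rather than via /_sql directly;
+# the old run.sh made the same choice because the SQL API stalled on some of
+# the heavier queries without feedback or a way to cancel.)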
+sql_json=$(jq -nc --arg q "$query" '{query: $q}') +dsl=$(curl -sS -X POST 'http://localhost:9200/_sql/translate' \ + -H 'Content-Type: application/json' -d "$sql_json") + +if printf '%s\n' "$dsl" | jq -e 'has("error")' >/dev/null 2>&1; then + printf '%s\n' "$dsl" >&2 + exit 1 +fi + +resp=$(curl -sS -X GET 'http://localhost:9200/hits/_search' \ + -H 'Content-Type: application/json' -d "$dsl") + +if printf '%s\n' "$resp" | jq -e 'has("error")' >/dev/null 2>&1; then + printf '%s\n' "$resp" >&2 + exit 1 +fi + +printf '%s\n' "$resp" + +took_ms=$(printf '%s\n' "$resp" | jq -r '.took // empty') +if [ -z "$took_ms" ] || [ "$took_ms" = "null" ]; then + echo "no .took in elasticsearch response" >&2 + exit 1 +fi + +awk -v m="$took_ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/elasticsearch/run.sh b/elasticsearch/run.sh deleted file mode 100755 index 5c2607aadf..0000000000 --- a/elasticsearch/run.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat 'queries.sql' | while read -r QUERY; do - sync && echo 3 | sudo tee /proc/sys/vm/drop_caches - - echo -n "[" - - for i in $(seq 1 $TRIES); do - - # clear query cache between runs - curl -X POST 'http://localhost:9200/hits/_cache/clear?pretty' &>/dev/null - - JSON="{\"query\" : \"$QUERY\" }" - - # translate query to DSL - DSL=$(curl -s -X POST "http://localhost:9200/_sql/translate?pretty" -H 'Content-Type: application/json' -d"$JSON" ) - - # start external timer - START=`date +%s.%N` - - # Run DSL directly through search API - ES_RSP=$(curl -s -X GET "http://localhost:9200/hits/_search" -H 'Content-Type: application/json' -d"$DSL" ) - - # run query through SQL API (choosing not to use SQL API directly, because it stalls some queries w/o feedback or cancelling, e.g. 6, 13-15, 17, 31-36) - # curl -X POST 'http://localhost:9200/_sql?format=txt&pretty' -H 'Content-Type: application/json' -d"$JSON" #&>/dev/null - - # calculate timing outside of Elasticsearch (needed for runs through SQL API which does not return the time it took to run) - END=`date +%s.%N` - RES=$( echo "$END - $START" | bc -l ) - - # retrieve timing from Elastic Search API "took" parameter and convert to seconds - ES_TIME=$(echo $ES_RSP | jq -r '.took') - ES_TIME=$(echo "scale=4; $ES_TIME / 1000" | bc) - - # output ES_TIME to console (it's more accurate), and if ES returned an error, print null - [[ "$( jq 'has("error")' <<< $ES_RSP )" == "true" ]] && echo -n "null" || echo -n "$ES_TIME" - [[ "$i" != $TRIES ]] && echo -n ", " - - done - - echo "]," - -done; diff --git a/elasticsearch/start b/elasticsearch/start new file mode 100755 index 0000000000..8fa3183341 --- /dev/null +++ b/elasticsearch/start @@ -0,0 +1,7 @@ +#!/bin/bash +set -eu + +if curl -sSf 'http://localhost:9200' >/dev/null 2>&1; then + exit 0 +fi +sudo systemctl start elasticsearch.service diff --git a/elasticsearch/stop b/elasticsearch/stop new file mode 100755 index 0000000000..db18bc1953 --- /dev/null +++ b/elasticsearch/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo systemctl stop elasticsearch.service || true diff --git a/firebolt-parquet-partitioned/benchmark.sh b/firebolt-parquet-partitioned/benchmark.sh index a037d47792..2f5a3e0d82 100755 --- a/firebolt-parquet-partitioned/benchmark.sh +++ b/firebolt-parquet-partitioned/benchmark.sh @@ -3,7 +3,7 @@ # Download the partitioned hits parquet files echo "Downloading dataset..." 
rm -rf data -../download-hits-parquet-partitioned data +../lib/download-hits-parquet-partitioned data # Start the container sudo apt-get install -y docker.io jq diff --git a/firebolt-parquet/benchmark.sh b/firebolt-parquet/benchmark.sh index b94b84c73c..70055b8f28 100755 --- a/firebolt-parquet/benchmark.sh +++ b/firebolt-parquet/benchmark.sh @@ -3,7 +3,7 @@ # Download the hits.parquet file echo "Downloading dataset..." rm -rf data -../download-hits-parquet-single data +../lib/download-hits-parquet-single data # Start the container sudo apt-get install -y docker.io jq diff --git a/firebolt/benchmark.sh b/firebolt/benchmark.sh index 98ce84d905..b4e324dfa3 100755 --- a/firebolt/benchmark.sh +++ b/firebolt/benchmark.sh @@ -3,7 +3,7 @@ # Download the hits.parquet file echo "Downloading dataset..." rm -rf data -../download-hits-parquet-single data +../lib/download-hits-parquet-single data # Start the container sudo apt-get install -y docker.io jq diff --git a/gizmosql/benchmark.sh b/gizmosql/benchmark.sh index 558cb647d8..b851876173 100755 --- a/gizmosql/benchmark.sh +++ b/gizmosql/benchmark.sh @@ -1,50 +1,5 @@ #!/bin/bash - -# needed by DuckDB -export HOME=/home/ubuntu - -# Install requirements -sudo apt-get update -y -sudo apt-get install -y curl unzip netcat-openbsd - -# Install the GizmoSQL server and client (gizmosql_client is the CLI shell) -# via the official one-line installer (https://install.gizmosql.com). -curl -fsSL https://install.gizmosql.com/install.sh | sh -export PATH="$HOME/.local/bin:$PATH" - -# Source our env vars and utility functions for starting/stopping gizmosql server -. util.sh - -# Start the GizmoSQL server in the background -start_gizmosql - -# Create the table -gizmosql_client --file create.sql - -# Load the data -../download-hits-parquet-single - -echo -n "Load time: " -time gizmosql_client --file load.sql - -stop_gizmosql - -# Run the queries -./run.sh 2>&1 | tee log.txt - -# Remove carriage returns from the log -sed -i 's/\r$//' log.txt - -echo -n "Data size: " -wc -c clickbench.db - -cat log.txt | \ - grep -E 'Run Time: [0-9.]+s|Killed|Segmentation' | \ - sed -E 's/.*Run Time: ([0-9.]+)s.*/\1/; s/.*(Killed|Segmentation).*/null/' | \ - awk '{ - if (NR % 3 == 1) printf "["; - if ($1 == "null") printf "null"; - else printf $1; - if (NR % 3 == 0) printf "],\n"; - else printf ", "; - }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/gizmosql/check b/gizmosql/check new file mode 100755 index 0000000000..d166664815 --- /dev/null +++ b/gizmosql/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# A simple TCP probe is the cheapest health check; the JDBC arrow-flight URI +# is auth-gated, so we just check the listener is up. +exec nc -z localhost 31337 diff --git a/gizmosql/data-size b/gizmosql/data-size new file mode 100755 index 0000000000..536ed36d93 --- /dev/null +++ b/gizmosql/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +wc -c < clickbench.db | awk '{print $1}' diff --git a/gizmosql/install b/gizmosql/install new file mode 100755 index 0000000000..982f201fe0 --- /dev/null +++ b/gizmosql/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y curl unzip netcat-openbsd + +# Install gizmosql_server + gizmosql_client via the official one-line +# installer (https://install.gizmosql.com). It handles arch/OS detection +# and installs to ~/.local/bin. +if ! 
command -v gizmosql_server >/dev/null 2>&1; then + curl -fsSL https://install.gizmosql.com/install.sh | sh + if [ -d "$HOME/.local/bin" ]; then + sudo install -m 0755 "$HOME/.local/bin/gizmosql_server" "$HOME/.local/bin/gizmosql_client" /usr/local/bin/ + fi +fi diff --git a/gizmosql/load b/gizmosql/load new file mode 100755 index 0000000000..7e06b0d725 --- /dev/null +++ b/gizmosql/load @@ -0,0 +1,16 @@ +#!/bin/bash +set -eu + +. ./util.sh + +# Idempotent: blow away any prior database; the server has it open, so +# stop it first to release the file lock. +stop_gizmosql || true +rm -f clickbench.db +start_gizmosql + +gizmosql_client --file create.sql +gizmosql_client --file load.sql + +rm -f hits.parquet +sync diff --git a/gizmosql/query b/gizmosql/query new file mode 100755 index 0000000000..937e567bb0 --- /dev/null +++ b/gizmosql/query @@ -0,0 +1,37 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via gizmosql_client. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# gizmosql_client's ".timer on" "Run Time: s" output). +# Exit non-zero on error. +set -e + +. ./util.sh + +query=$(cat) + +# .timer on: emit "Run Time: s" per statement. +# .mode trash: discard result rows so timing parsing isn't polluted. +script=$(printf '.timer on\n.mode trash\n%s\n' "$query") + +raw=$(printf '%s' "$script" | gizmosql_client 2>&1) && exit_code=0 || exit_code=$? + +clean=$(printf '%s\n' "$raw" | tr -d '\r') + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$clean" | grep -qiE 'error|exception|Killed|Segmentation'; then + printf '%s\n' "$clean" >&2 + exit 1 +fi + +printf '%s\n' "$clean" + +secs=$(printf '%s\n' "$clean" \ + | grep -oP 'Run Time:\s*\K[0-9.]+' \ + | tail -n1) + +if [ -z "$secs" ]; then + echo "no timing in gizmosql_client output" >&2 + exit 1 +fi + +awk -v s="$secs" 'BEGIN { printf "%.3f\n", s }' >&2 diff --git a/gizmosql/run.sh b/gizmosql/run.sh deleted file mode 100755 index 5364db9507..0000000000 --- a/gizmosql/run.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# Source our env vars -. util.sh - -TRIES=3 -TEMP_SQL_FILE="/tmp/benchmark_queries_$$.sql" - -# Ensure server is stopped on script exit -trap stop_gizmosql EXIT - -# Read queries from file -mapfile -t queries < queries.sql - -echo "Running benchmark with ${#queries[@]} queries, ${TRIES} tries each..." - -for query in "${queries[@]}"; do - > "${TEMP_SQL_FILE}" - - # Clear Linux memory caches to ensure fair benchmark comparisons - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - # Start the GizmoSQL server - start_gizmosql - - # Enable timer and discard result rows (we only care about Run Time) - echo ".timer on" >> "${TEMP_SQL_FILE}" - echo ".mode trash" >> "${TEMP_SQL_FILE}" - - # Add a comment to identify the query in the output - echo "-- Query: ${query}" >> "${TEMP_SQL_FILE}" - - # Repeat each query TRIES times - for i in $(seq 1 ${TRIES}); do - echo "${query}" >> "${TEMP_SQL_FILE}" - done - - # Execute the query script (timer output goes to stderr; merge to stdout) - gizmosql_client --file "${TEMP_SQL_FILE}" 2>&1 - - # Stop the server before next query - stop_gizmosql -done - -# Clean up -rm -f "${TEMP_SQL_FILE}" diff --git a/gizmosql/start b/gizmosql/start new file mode 100755 index 0000000000..29d3910ce2 --- /dev/null +++ b/gizmosql/start @@ -0,0 +1,11 @@ +#!/bin/bash +set -eu + +. ./util.sh + +# Idempotent: if port 31337 is already open, do nothing. 
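+# (Same TCP probe as ./check; 31337 is GIZMOSQL_PORT exported by util.sh.)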
+if nc -z localhost 31337 2>/dev/null; then + exit 0 +fi + +start_gizmosql diff --git a/gizmosql/stop b/gizmosql/stop new file mode 100755 index 0000000000..5a4f08d1cc --- /dev/null +++ b/gizmosql/stop @@ -0,0 +1,7 @@ +#!/bin/bash + +. ./util.sh +stop_gizmosql || true + +# Belt-and-braces: kill any leftover gizmosql_server process. +pkill -x gizmosql_server 2>/dev/null || true diff --git a/gizmosql/util.sh b/gizmosql/util.sh index da6b3e2363..327ac9af7f 100755 --- a/gizmosql/util.sh +++ b/gizmosql/util.sh @@ -5,7 +5,17 @@ export GIZMOSQL_HOST=localhost export GIZMOSQL_PORT=31337 export GIZMOSQL_USER=clickbench export GIZMOSQL_PASSWORD=clickbench -PID_FILE="/tmp/gizmosql_server_$$.pid" +# Fixed PID-file path so start/stop/load/query all resolve to the same file +# even though each one sources util.sh in its own subshell. +PID_FILE="${PWD}/gizmosql_server.pid" + +# Wait for the server to become reachable. Used by stop after kill, and by +# load before reusing the database. +wait_for_gizmosql() { + while ! nc -z "${GIZMOSQL_HOST}" "${GIZMOSQL_PORT}" 2>/dev/null; do + sleep 1 + done +} # Function to start the GizmoSQL server start_gizmosql() { diff --git a/glaredb-partitioned/benchmark.sh b/glaredb-partitioned/benchmark.sh index b33b578e26..3b63e772a6 100755 --- a/glaredb-partitioned/benchmark.sh +++ b/glaredb-partitioned/benchmark.sh @@ -1,31 +1,5 @@ -#!/usr/bin/env bash - -set -e - -repo_root=$(git rev-parse --show-toplevel) -script_dir=$(dirname "$0") - -if [[ "$(basename "$repo_root")" == "glaredb" ]]; then - # Inside glaredb repo, build from source. - cargo build --release --bin glaredb - cp "${repo_root}/target/release/glaredb" "${script_dir}/glaredb" -else - # Not in glaredb repo, use prebuilt binary. - export GLAREDB_INSTALL_DIR="${script_dir}" - export GLAREDB_VERSION="v25.5.11" - curl -fsSL https://glaredb.com/install.sh | sh -fi - -# Get the data. -"${script_dir}/../download-hits-parquet-partitioned" "${script_dir}/data" -pushd "${script_dir}/data" -echo "Data size: $(du -bcs hits*.parquet | grep total)" -echo "Load time: 0" -popd - -# Ensure working directory is the script dir. The view that gets created uses a -# relative path. -pushd "${script_dir}" - -./run.sh partitioned -cat results.json +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/glaredb-partitioned/check b/glaredb-partitioned/check new file mode 100755 index 0000000000..bf7e530008 --- /dev/null +++ b/glaredb-partitioned/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./glaredb -c "SELECT 1" >/dev/null diff --git a/glaredb-partitioned/data-size b/glaredb-partitioned/data-size new file mode 100755 index 0000000000..400c518435 --- /dev/null +++ b/glaredb-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs data | awk '/total$/ { print $1 }' diff --git a/glaredb-partitioned/install b/glaredb-partitioned/install new file mode 100755 index 0000000000..5891568bc2 --- /dev/null +++ b/glaredb-partitioned/install @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +if [ ! 
-x ./glaredb ]; then + export GLAREDB_INSTALL_DIR="$(pwd)" + export GLAREDB_VERSION="v25.5.11" + curl -fsSL https://glaredb.com/install.sh | sh +fi diff --git a/glaredb-partitioned/load b/glaredb-partitioned/load new file mode 100755 index 0000000000..c2afca99a4 --- /dev/null +++ b/glaredb-partitioned/load @@ -0,0 +1,8 @@ +#!/bin/bash +# glaredb-partitioned's create.sql references ./data/hits_*.parquet, so move +# the partitioned files into the expected subdir. +set -e + +mkdir -p data +mv hits_*.parquet data/ 2>/dev/null || true +sync diff --git a/glaredb-partitioned/query b/glaredb-partitioned/query new file mode 100755 index 0000000000..09c4727a9f --- /dev/null +++ b/glaredb-partitioned/query @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +query=$(cat) + +out=$(./glaredb --init create.sql -c ".timer on" -c "$query" 2>&1) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + printf '%s\n' "$out" >&2 + exit "$status" +fi + +printf '%s\n' "$out" | grep -v '^Execution duration' || true + +printf '%s\n' "$out" | awk -F': ' '/^Execution duration/ { printf "%.3f\n", $2 }' | tail -n1 >&2 diff --git a/glaredb-partitioned/results/20250525/c6a.4xlarge.json b/glaredb-partitioned/results/20250525/c6a.4xlarge.json index a3d329dac1..9ef19ca722 100644 --- a/glaredb-partitioned/results/20250525/c6a.4xlarge.json +++ b/glaredb-partitioned/results/20250525/c6a.4xlarge.json @@ -7,8 +7,7 @@ "hardware": "cpu", "tuned": "no", "tags": [ - "Rust", - "serverless" + "Rust" ], "load_time": 0, "data_size": 14779976446, diff --git a/glaredb-partitioned/results/20250710/c6a.4xlarge.json b/glaredb-partitioned/results/20250710/c6a.4xlarge.json index b23c2db754..376f8b8c44 100644 --- a/glaredb-partitioned/results/20250710/c6a.4xlarge.json +++ b/glaredb-partitioned/results/20250710/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","serverless"], + "tags": ["Rust"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/glaredb-partitioned/results/20250712/c8g.4xlarge.json b/glaredb-partitioned/results/20250712/c8g.4xlarge.json index d2e40ad9d7..b5c151ce7c 100644 --- a/glaredb-partitioned/results/20250712/c8g.4xlarge.json +++ b/glaredb-partitioned/results/20250712/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","serverless"], + "tags": ["Rust"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/glaredb-partitioned/results/20260509/c6a.4xlarge.json b/glaredb-partitioned/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..fd8630adb4 --- /dev/null +++ b/glaredb-partitioned/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "GlareDB (Parquet, partitioned)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust"], + "load_time": 21, + "data_size": 14737666736, + "result": [ + [0.035, 0.02, 0.021], + [0.121, 0.106, 0.098], + [0.247, 0.213, 0.201], + [0.526, 0.145, 0.145], + [1.644, 0.937, 0.946], + [1.389, 1.153, 1.14], + [0.122, 0.098, 0.108], + [0.146, 0.11, 0.14], + [1.519, 1.462, 1.481], + [1.904, 1.767, 1.745], + [0.717, 0.605, 0.603], + [0.823, 0.701, 0.706], + [1.497, 1.279, 1.302], + [3.56, 2.385, 2.332], + [1.643, 1.429, 1.407], + [1.35, 1.297, 1.267], + [3.577, 2.549, 2.515], + [3.282, 2.179, 2.198], + [6.696, 4.668, 4.615], + [0.258, 0.153, 0.164], + [10.992, 1.909, 2.019], + [12.067, 1.59, 1.582], + [23.397, 3.276, 3.127], + [59.383, 16.249, 
16.165], + [2.826, 1.191, 1.157], + [1.337, 1.239, 1.28], + [2.868, 1.401, 1.388], + [9.664, 2.836, 2.751], + [11.64, 11.804, 11.656], + [4.828, 4.791, 4.816], + [2.386, 1.461, 1.377], + [6.278, 1.717, 1.746], + [8.024, null, null], + [11.086, 4.464, 4.413], + [11.085, 4.639, 4.652], + [1.442, 1.384, 1.379], + [0.259, 0.177, 0.176], + [0.195, 0.147, 0.148], + [0.171, 0.087, 0.094], + [0.432, 0.318, 0.318], + [0.103, 0.057, 0.058], + [0.099, 0.053, 0.051], + [0.094, 0.059, 0.078] +] +} + diff --git a/glaredb-partitioned/run.sh b/glaredb-partitioned/run.sh deleted file mode 100755 index 05d513367b..0000000000 --- a/glaredb-partitioned/run.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash - -set -eu -set -o pipefail - -TRIES=3 -QUERY_NUM=0 - -echo "[" > results.json -echo "query_num,iteration,duration" > results.csv - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "${QUERY_NUM}: ${query}" - - [ "${QUERY_NUM}" != 0 ] && echo "," >> results.json - echo -n " [" >> results.json - - for i in $(seq 1 $TRIES); do - output=$(./glaredb --init create.sql -c ".timer on" -c "${query}") - duration=$(awk -F': ' '/^Execution duration/ { printf "%.3f\n", $2 }' <<< "$output") - - echo "$output" - - if [ -z "${duration}" ]; then - echo "Query failed" - exit 1 - fi - - # JSON results - echo -n "${duration}" >> results.json - [ "${i}" != "${TRIES}" ] && echo -n "," >> results.json - - # CSV results - echo "${QUERY_NUM},${i},${duration}" >> results.csv - done - - echo -n "]" >> results.json - - QUERY_NUM=$((QUERY_NUM + 1)) -done - -echo "" >> results.csv -echo "" >> results.json -echo "]" >> results.json diff --git a/glaredb-partitioned/start b/glaredb-partitioned/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/glaredb-partitioned/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/glaredb-partitioned/stop b/glaredb-partitioned/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/glaredb-partitioned/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/glaredb-partitioned/template.json b/glaredb-partitioned/template.json index eb916baa49..55d47c71f5 100644 --- a/glaredb-partitioned/template.json +++ b/glaredb-partitioned/template.json @@ -4,7 +4,6 @@ "hardware": "cpu", "tuned": "no", "tags": [ - "Rust", - "serverless" + "Rust" ] } diff --git a/glaredb/benchmark.sh b/glaredb/benchmark.sh index 89ec7011ec..fc4bacc8f3 100755 --- a/glaredb/benchmark.sh +++ b/glaredb/benchmark.sh @@ -1,32 +1,5 @@ -#!/usr/bin/env bash - -set -e - -repo_root=$(git rev-parse --show-toplevel) -script_dir=$(dirname "$0") - -if [[ "$(basename "$repo_root")" == "glaredb" ]]; then - # Inside glaredb repo, build from source. - cargo build --release --bin glaredb - cp "${repo_root}/target/release/glaredb" "${script_dir}/glaredb" -else - # Not in glaredb repo, use prebuilt binary. - export GLAREDB_INSTALL_DIR="${script_dir}" - export GLAREDB_VERSION="v25.5.11" - curl -fsSL https://glaredb.com/install.sh | sh -fi - -# Get the data. -"${script_dir}/../download-hits-parquet-single" "${script_dir}/data" -pushd "${script_dir}/data" -echo "Data size: $(du -bcs hits*.parquet | grep total)" -popd - -# Ensure working directory is the script dir. The view that gets created uses a -# relative path. -pushd "${script_dir}" - -./run.sh single -cat results.json - -echo "Load time: 0" +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. 
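+# BENCH_DOWNLOAD_SCRIPT names one of the shared fetchers (now under ../lib/);
+# BENCH_RESTARTABLE=no matches glaredb's no-op start/stop scripts.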
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/glaredb/check b/glaredb/check new file mode 100755 index 0000000000..bf7e530008 --- /dev/null +++ b/glaredb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./glaredb -c "SELECT 1" >/dev/null diff --git a/glaredb/data-size b/glaredb/data-size new file mode 100755 index 0000000000..400c518435 --- /dev/null +++ b/glaredb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs data | awk '/total$/ { print $1 }' diff --git a/glaredb/install b/glaredb/install new file mode 100755 index 0000000000..b60f252546 --- /dev/null +++ b/glaredb/install @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Idempotent: only download glaredb if not already present. +if [ ! -x ./glaredb ]; then + export GLAREDB_INSTALL_DIR="$(pwd)" + export GLAREDB_VERSION="v25.5.11" + curl -fsSL https://glaredb.com/install.sh | sh +fi diff --git a/glaredb/load b/glaredb/load new file mode 100755 index 0000000000..06c148945b --- /dev/null +++ b/glaredb/load @@ -0,0 +1,8 @@ +#!/bin/bash +# glaredb's create.sql references ./data/hits.parquet, so move the parquet +# file into the expected subdir. +set -e + +mkdir -p data +mv hits.parquet data/ 2>/dev/null || true +sync diff --git a/glaredb/query b/glaredb/query new file mode 100755 index 0000000000..4e88d17e02 --- /dev/null +++ b/glaredb/query @@ -0,0 +1,19 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via glaredb with create.sql as init. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +set -e + +query=$(cat) + +out=$(./glaredb --init create.sql -c ".timer on" -c "$query" 2>&1) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + printf '%s\n' "$out" >&2 + exit "$status" +fi + +# glaredb prints "Execution duration (s): X.YYY"; everything else is result. 
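+# First pass strips that footer from stdout (the trailing || true keeps set -e
+# happy when grep -v emits nothing); second pass converts it to seconds on stderr.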
+printf '%s\n' "$out" | grep -v '^Execution duration' || true + +printf '%s\n' "$out" | awk -F': ' '/^Execution duration/ { printf "%.3f\n", $2 }' | tail -n1 >&2 diff --git a/glaredb/results/20240202/c6a.4xlarge.json b/glaredb/results/20240202/c6a.4xlarge.json index 18998180e4..cf8e921e7d 100644 --- a/glaredb/results/20240202/c6a.4xlarge.json +++ b/glaredb/results/20240202/c6a.4xlarge.json @@ -7,8 +7,7 @@ "hardware": "cpu", "tuned": "no", "tags": [ - "Rust", - "serverless" + "Rust" ], "load_time": 0, "data_size": 14779976446, diff --git a/glaredb/results/20240202/c6a.metal.json b/glaredb/results/20240202/c6a.metal.json index 559912d8aa..1662fcc99a 100644 --- a/glaredb/results/20240202/c6a.metal.json +++ b/glaredb/results/20240202/c6a.metal.json @@ -5,8 +5,7 @@ "cluster_size": 1, "comment": "", "tags": [ - "Rust", - "serverless" + "Rust" ], "load_time": 0, "data_size": 14779976446, diff --git a/glaredb/results/20250506/c6a.4xlarge.json b/glaredb/results/20250506/c6a.4xlarge.json index 25945a9e41..1d484b824d 100644 --- a/glaredb/results/20250506/c6a.4xlarge.json +++ b/glaredb/results/20250506/c6a.4xlarge.json @@ -7,8 +7,7 @@ "hardware": "cpu", "tuned": "no", "tags": [ - "Rust", - "serverless" + "Rust" ], "load_time": 0, "data_size": 14779976446, diff --git a/glaredb/results/20250525/c6a.4xlarge.json b/glaredb/results/20250525/c6a.4xlarge.json index 4501f9ab4b..77c23296ae 100644 --- a/glaredb/results/20250525/c6a.4xlarge.json +++ b/glaredb/results/20250525/c6a.4xlarge.json @@ -7,8 +7,7 @@ "hardware": "cpu", "tuned": "no", "tags": [ - "Rust", - "serverless" + "Rust" ], "load_time": 0, "data_size": 14779976446, diff --git a/glaredb/results/20250710/c6a.4xlarge.json b/glaredb/results/20250710/c6a.4xlarge.json index adac2c95d8..228d9c3cfe 100644 --- a/glaredb/results/20250710/c6a.4xlarge.json +++ b/glaredb/results/20250710/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","serverless"], + "tags": ["Rust"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/glaredb/results/20250712/c8g.4xlarge.json b/glaredb/results/20250712/c8g.4xlarge.json index e5f3ed8537..9c6dcb5b31 100644 --- a/glaredb/results/20250712/c8g.4xlarge.json +++ b/glaredb/results/20250712/c8g.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["Rust","serverless"], + "tags": ["Rust"], "load_time": 0, "data_size": 14779976446, "result": [ diff --git a/glaredb/results/20260509/c6a.4xlarge.json b/glaredb/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..b5edbddf08 --- /dev/null +++ b/glaredb/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "GlareDB (Parquet, single)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust"], + "load_time": 0, + "data_size": 14779976446, + "result": [ + [0.048, 0.04, 0.037], + [0.135, 0.106, 0.111], + [0.239, 0.218, 0.214], + [0.376, 0.153, 0.155], + [1.032, 0.98, 0.957], + [1.067, 0.98, 0.99], + [0.128, 0.121, 0.107], + [0.154, 0.12, 0.12], + [1.504, 1.439, 1.436], + [1.862, 1.739, 1.745], + [0.569, 0.478, 0.478], + [0.654, 0.578, 0.554], + [1.065, 0.992, 0.966], + [3.029, 2.024, 2.034], + [1.177, 1.084, 1.072], + [1.397, 1.304, 1.315], + [3.058, 2.39, 2.407], + [2.809, 2.026, 2.039], + [5.983, 4.46, 4.449], + [0.245, 0.176, 0.174], + [9.676, 1.533, 1.704], + [11.338, 1.536, 1.469], + [22.16, 3.567, 3.611], + [55.942, 14.356, 14.394], + [2.659, 0.816, 
0.819], + [1.012, 0.912, 0.919], + [2.696, 1.062, 1.053], + [9.61, 1.567, 1.521], + [12.315, 10.787, 11.983], + [4.75, 4.699, 4.713], + [2.306, 1.233, 1.224], + [6.061, 1.538, 1.545], + [7.539, 6.149, 6.175], + [11.066, 3.814, 3.826], + [11.011, 3.958, 3.971], + [1.427, 1.363, 1.393], + [0.265, 0.203, 0.19], + [0.204, 0.155, 0.154], + [0.203, 0.124, 0.123], + [0.459, 0.356, 0.344], + [0.118, 0.075, 0.074], + [0.114, 0.071, 0.072], + [0.116, 0.077, 0.078] +] +} + diff --git a/glaredb/results/20260509/c6a.metal.json b/glaredb/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..4673817ed3 --- /dev/null +++ b/glaredb/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "GlareDB (Parquet, single)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust"], + "load_time": 1, + "data_size": 14779976446, + "result": [ + [0.053, 0.06, 0.058], + [0.068, 0.061, 0.058], + [0.103, 0.105, 0.113], + [0.244, 0.111, 0.101], + [1.101, 1.13, 1.079], + [0.948, 0.928, 0.948], + [0.062, 0.073, 0.058], + [0.322, 0.33, 0.325], + [1.88, 1.864, 1.818], + [2.172, 2.248, 2.186], + [0.966, 0.977, 0.937], + [1.065, 1.016, 1.032], + [1.07, 1.062, 1.06], + [2.507, 2.255, 2.197], + [1.197, 1.164, 1.195], + [1.551, 1.471, 1.539], + [2.343, 2.092, 2.013], + [2.136, 1.69, 1.721], + [4.426, 2.645, 2.616], + [0.174, 0.126, 0.114], + [9.608, 3.757, 3.6], + [11.105, 1.56, 1.571], + [21.753, 3.148, 3.113], + [56.564, 6.104, 6.017], + [2.439, 0.645, 0.707], + [1.049, 1.06, 1.053], + [2.474, 0.764, 0.754], + [9.514, 1.834, 1.948], + [27.155, 20.104, 23.7], + [1.185, 1.18, 1.144], + [2.171, 1.199, 1.201], + [5.581, 1.528, 1.601], + [4.871, 2.984, 2.973], + [10.024, 2.381, 2.403], + [10.183, 2.565, 2.522], + [1.319, 1.351, 1.325], + [0.614, 0.563, 0.567], + [0.485, 0.471, 0.491], + [0.458, 0.404, 0.394], + [0.979, 0.888, 0.896], + [0.385, 0.361, 0.386], + [0.369, 0.361, 0.358], + [0.411, 0.379, 0.365] +] +} + diff --git a/glaredb/run.sh b/glaredb/run.sh deleted file mode 100755 index 05d513367b..0000000000 --- a/glaredb/run.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash - -set -eu -set -o pipefail - -TRIES=3 -QUERY_NUM=0 - -echo "[" > results.json -echo "query_num,iteration,duration" > results.csv - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "${QUERY_NUM}: ${query}" - - [ "${QUERY_NUM}" != 0 ] && echo "," >> results.json - echo -n " [" >> results.json - - for i in $(seq 1 $TRIES); do - output=$(./glaredb --init create.sql -c ".timer on" -c "${query}") - duration=$(awk -F': ' '/^Execution duration/ { printf "%.3f\n", $2 }' <<< "$output") - - echo "$output" - - if [ -z "${duration}" ]; then - echo "Query failed" - exit 1 - fi - - # JSON results - echo -n "${duration}" >> results.json - [ "${i}" != "${TRIES}" ] && echo -n "," >> results.json - - # CSV results - echo "${QUERY_NUM},${i},${duration}" >> results.csv - done - - echo -n "]" >> results.json - - QUERY_NUM=$((QUERY_NUM + 1)) -done - -echo "" >> results.csv -echo "" >> results.json -echo "]" >> results.json diff --git a/glaredb/start b/glaredb/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/glaredb/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/glaredb/stop b/glaredb/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/glaredb/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/glaredb/template.json 
b/glaredb/template.json index 4b6effcb2e..5039ecf5f7 100644 --- a/glaredb/template.json +++ b/glaredb/template.json @@ -4,7 +4,6 @@ "hardware": "cpu", "tuned": "no", "tags": [ - "Rust", - "serverless" + "Rust" ] } diff --git a/greenplum/benchmark.sh b/greenplum/benchmark.sh index 79606afb7e..531bd65038 100755 --- a/greenplum/benchmark.sh +++ b/greenplum/benchmark.sh @@ -1,80 +1,5 @@ #!/bin/bash - -# NOTE: it requires Ubuntu 18.04 -# Greenplum does not install on any newer system. - -echo "This script must be run from gpadmin user. Press enter to continue." -read -sudo apt-get update -y -sudo apt-get install -y software-properties-common -sudo add-apt-repository ppa:greenplum/db -sudo apt-get update -y -sudo apt-get install -y greenplum-db-6 -sudo rm -rf /gpmaster /gpdata* -ssh-keygen -t rsa -b 4096 -touch /home/gpadmin/.ssh/authorized_keys -chmod 600 ~/.ssh/authorized_keys -cat /home/gpadmin/.ssh/id_rsa.pub >> /home/gpadmin/.ssh/authorized_keys -sudo echo "# kernel.shmall = _PHYS_PAGES / 2 # See Shared Memory Pages -kernel.shmall = 197951838 -# kernel.shmmax = kernel.shmall * PAGE_SIZE -kernel.shmmax = 810810728448 -kernel.shmmni = 4096 -vm.overcommit_memory = 2 # See Segment Host Memory -vm.overcommit_ratio = 95 # See Segment Host Memory - -net.ipv4.ip_local_port_range = 10000 65535 # See Port Settings -kernel.sem = 500 2048000 200 4096 -kernel.sysrq = 1 -kernel.core_uses_pid = 1 -kernel.msgmnb = 65536 -kernel.msgmax = 65536 -kernel.msgmni = 2048 -net.ipv4.tcp_syncookies = 1 -net.ipv4.conf.default.accept_source_route = 0 -net.ipv4.tcp_max_syn_backlog = 4096 -net.ipv4.conf.all.arp_filter = 1 -net.core.netdev_max_backlog = 10000 -net.core.rmem_max = 2097152 -net.core.wmem_max = 2097152 -vm.swappiness = 10 -vm.zone_reclaim_mode = 0 -vm.dirty_expire_centisecs = 500 -vm.dirty_writeback_centisecs = 100 -vm.dirty_background_ratio = 0 # See System Memory -vm.dirty_ratio = 0 -vm.dirty_background_bytes = 1610612736 -vm.dirty_bytes = 4294967296" |sudo tee -a /etc/sysctl.conf -sudo sysctl -p - -echo "* soft nofile 524288 -* hard nofile 524288 -* soft nproc 131072 -* hard nproc 131072" |sudo tee -a /etc/security/limits.conf -echo "RemoveIPC=no" |sudo tee -a /etc/systemd/logind.conf -echo "Now you need to reboot the machine. Press Enter if you already rebooted, or reboot now and run the script once again" -read -source /opt/greenplum-db-*/greenplum_path.sh -cp $GPHOME/docs/cli_help/gpconfigs/gpinitsystem_singlenode . 
-echo localhost > ./hostlist_singlenode -sed -i "s/MASTER_HOSTNAME=[a-z_]*/MASTER_HOSTNAME=$(hostname)/" gpinitsystem_singlenode -sed -i "s@declare -a DATA_DIRECTORY=(/gpdata1 /gpdata2)@declare -a DATA_DIRECTORY=(/gpdata1 /gpdata2 /gpdata3 /gpdata4 /gpdata5 /gpdata6 /gpdata7 /gpdata8 /gpdata9 /gpdata10 /gpdata11 /gpdata12 /gpdata13 /gpdata14)@" gpinitsystem_singlenode -sudo mkdir /gpmaster /gpdata1 /gpdata2 /gpdata3 /gpdata4 /gpdata5 /gpdata6 /gpdata7 /gpdata8 /gpdata9 /gpdata10 /gpdata11 /gpdata12 /gpdata13 /gpdata14 -sudo chmod 777 /gpmaster /gpdata1 /gpdata2 /gpdata3 /gpdata4 /gpdata5 /gpdata6 /gpdata7 /gpdata8 /gpdata9 /gpdata10 /gpdata11 /gpdata12 /gpdata13 /gpdata14 -gpinitsystem -ac gpinitsystem_singlenode -export MASTER_DATA_DIRECTORY=/gpmaster/gpsne-1/ -../download-hits-tsv -chmod 777 ~ hits.tsv -psql -d postgres -f create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi -nohup gpfdist & -echo -n "Load time: " -command time -f '%e' psql -d postgres -t -c "insert into hits select * from hits_ext;" -echo -n "Load time: " -command time -f '%e' psql -d postgres -t -c "ANALYZE hits;" -du -sh /gpdata* -./run.sh 2>&1 | tee log.txt -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' |awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/greenplum/check b/greenplum/check new file mode 100755 index 0000000000..b8dabbf607 --- /dev/null +++ b/greenplum/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +export PGPASSWORD='gparray' +psql -h localhost -p 5432 -U gpadmin -d demo -c 'SELECT 1' >/dev/null diff --git a/greenplum/create.sql b/greenplum/create.sql index a2ae8a7eea..7070a26a92 100644 --- a/greenplum/create.sql +++ b/greenplum/create.sql @@ -109,8 +109,4 @@ CREATE TABLE hits ) with (appendoptimized=true,orientation=column,compresstype=zstd) DISTRIBUTED BY (UserID); -CREATE INDEX hits_idx on hits using btree (CounterID, EventDate, UserID, EventTime, WatchID); -drop external table if exists hits_ext; -CREATE EXTERNAL TABLE hits_ext (like hits) -LOCATION ('gpfdist://localhost:8080/hits.tsv') -FORMAT 'TEXT'; +CREATE INDEX hits_idx on hits using btree (CounterID, EventDate, UserID, EventTime, WatchID); diff --git a/greenplum/data-size b/greenplum/data-size new file mode 100755 index 0000000000..37aae7e85f --- /dev/null +++ b/greenplum/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +export PGPASSWORD='gparray' +psql -h localhost -p 5432 -U gpadmin -d demo -t -A -c "SELECT pg_database_size('demo')" diff --git a/greenplum/install b/greenplum/install new file mode 100755 index 0000000000..d2734fd55c --- /dev/null +++ b/greenplum/install @@ -0,0 +1,25 @@ +#!/bin/bash +# Open-source Greenplum's apt PPA was removed and the upstream +# greenplum-db/gpdb repo was archived in 2024, so the previous native +# install (add-apt-repository ppa:greenplum/db) no longer works on any +# host. Use the community-maintained woblerr/greenplum Docker image +# instead — it bundles a working single-node Greenplum 7 cluster. 
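+# GREENPLUM_PASSWORD below is the same 'gparray' that check/load/query export as PGPASSWORD.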
+set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-greenplum} +GREENPLUM_VERSION=${GREENPLUM_VERSION:-7.1.0-ubuntu22.04} + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull "woblerr/greenplum:$GREENPLUM_VERSION" + +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +sudo docker run -d \ + --name "$CONTAINER_NAME" \ + -e GREENPLUM_PASSWORD=gparray \ + -p 5432:5432 \ + "woblerr/greenplum:$GREENPLUM_VERSION" diff --git a/greenplum/load b/greenplum/load new file mode 100755 index 0000000000..a5cd4b3db7 --- /dev/null +++ b/greenplum/load @@ -0,0 +1,21 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-greenplum} +export PGPASSWORD='gparray' + +# Schema is created from the host (psql via the mapped port). +psql -h localhost -p 5432 -U gpadmin -d demo -v ON_ERROR_STOP=1 -f create.sql + +# Move the source TSV into the container and load via COPY (the previous +# gpfdist-based load relied on a native install layout that no longer +# exists in the containerized cluster). +sudo docker cp hits.tsv "$CONTAINER_NAME":/tmp/hits.tsv +psql -h localhost -p 5432 -U gpadmin -d demo -v ON_ERROR_STOP=1 -t \ + -c "COPY hits FROM '/tmp/hits.tsv' WITH (FORMAT text);" +psql -h localhost -p 5432 -U gpadmin -d demo -v ON_ERROR_STOP=1 -t \ + -c "ANALYZE hits;" + +sudo docker exec "$CONTAINER_NAME" rm -f /tmp/hits.tsv +rm -f hits.tsv +sync diff --git a/greenplum/query b/greenplum/query new file mode 100755 index 0000000000..0e53bd00c8 --- /dev/null +++ b/greenplum/query @@ -0,0 +1,30 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the Greenplum +# container's `demo` database. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +export PGPASSWORD='gparray' + +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | psql -h localhost -p 5432 -U gpadmin -d demo -t 2>&1) +status=$? 
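+# psql exits 0 even when the SQL itself errors (no ON_ERROR_STOP here), so
+# failures are caught by grepping the captured output; a hard connection
+# failure already aborts the script via set -e.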
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/greenplum/results/20260509/c6a.4xlarge.json b/greenplum/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..77536efcb1 --- /dev/null +++ b/greenplum/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Greenplum", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C","column-oriented","PostgreSQL compatible","lukewarm-cold-run"], + "load_time": 1234, + "data_size": 19342459105, + "result": [ + [3.11, 2.683, 2.723], + [1.993, 1.873, 1.858], + [4.171, 4.089, 4.036], + [3.06, 3.041, 2.961], + [11.707, 11.581, 11.613], + [28.562, 28.701, 28.27], + [3.312, 3.38, 3.285], + [1.963, 1.878, 1.884], + [18.019, 17.742, 17.721], + [23.2, 22.952, 23.232], + [3.68, 3.541, 3.528], + [3.934, 3.815, 3.801], + [8.259, 8.479, 8.133], + [11.114, 10.707, 10.574], + [9.253, 9.279, 9.099], + [18.956, 19.01, 18.967], + [19.797, 19.573, 19.517], + [11.391, 10.828, 10.877], + [33.484, 32.527, 32.748], + [13.278, 1.209, 1.2], + [12.479, 11.87, 11.916], + [13.363, 12.842, 12.854], + [18.755, 17.513, 17.514], + [61.595, 60.546, 60.573], + [4.296, 4.129, 4.117], + [3.353, 3.256, 3.283], + [4.297, 4.145, 4.124], + [20.175, 21.811, 19.862], + [208.391, 206.364, 205.921], + [52.505, 52.142, 54.065], + [8.251, 8.159, 8.174], + [10.354, 9.932, 10.126], + [51.195, 48.563, 48.085], + [35.022, 33.218, 33.922], + [37.263, 36.419, 35.967], + [19.181, 18.676, 18.683], + [0.635, 0.416, 0.417], + [0.425, 0.209, 0.207], + [0.385, 0.17, 0.169], + [1.162, 0.671, 0.672], + [0.385, 0.162, 0.167], + [0.394, 0.172, 0.172], + [0.44, 0.236, 0.235] +] +} + diff --git a/greenplum/run.sh b/greenplum/run.sh deleted file mode 100755 index 23a2756b7f..0000000000 --- a/greenplum/run.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - echo '\timing' > /tmp/query_temp.sql - echo "$query" >> /tmp/query_temp.sql - psql -d postgres -t -f /tmp/query_temp.sql 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/greenplum/start b/greenplum/start new file mode 100755 index 0000000000..74b8d10171 --- /dev/null +++ b/greenplum/start @@ -0,0 +1,9 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-greenplum} + +# Idempotent: re-running while the container is up is fine. 
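+# docker inspect prints "true"/"false"; the || echo false fallback also
+# covers the container not existing yet.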
+if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" >/dev/null +fi diff --git a/greenplum/stop b/greenplum/stop new file mode 100755 index 0000000000..ea86f0a71e --- /dev/null +++ b/greenplum/stop @@ -0,0 +1,3 @@ +#!/bin/bash +CONTAINER_NAME=${CONTAINER_NAME:-greenplum} +sudo docker stop "$CONTAINER_NAME" >/dev/null 2>&1 || true diff --git a/heavyai/README.md b/heavyai/README.md new file mode 100644 index 0000000000..da53c89e38 --- /dev/null +++ b/heavyai/README.md @@ -0,0 +1,15 @@ +# HeavyDB / Heavy.AI + +## Sourcing the binary (May 2026) + +HEAVY.AI's apt repo and tarball CDN both started returning S3 `AccessDenied`: + + https://releases.heavy.ai/GPG-KEY-heavyai -> 403 + https://releases.heavy.ai/os/apt/dists/... -> 403 + https://releases.heavy.ai/os/tar/... -> 403 + +The source repo at `github.com/heavyai/heavydb` is alive (v9.0.0 released 2025-10-20, not archived) but its GitHub releases ship no compiled artifacts, and a full C++ build is too heavy to run inside cloud-init. + +`omnisci/core-os-cpu:v5.10.2` is the last public Docker image (Feb 2022) — it is OmniSciDB, the immediate predecessor of HeavyDB before the v6.0.0 rename. The benchmark schema and queries are vanilla enough to run unchanged against it. `install` now pulls that image, bind-mounts a `heavyai-storage/` directory, and the rest of the scripts (start / check / load / query / data-size) drive the container via `omnisql` instead of the systemd-managed native install. + +Override `HEAVYAI_VERSION` if you want a different OmniSci tag; the available ones are listed at . diff --git a/heavyai/benchmark.sh b/heavyai/benchmark.sh index 9390733098..6f7582ce15 100755 --- a/heavyai/benchmark.sh +++ b/heavyai/benchmark.sh @@ -1,56 +1,9 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y default-jre-headless apt-transport-https - -sudo apt-get install -y docker.io -sudo docker run -it --rm -v $(pwd):/host ubuntu:18.04 cp /lib/x86_64-linux-gnu/libtinfo.so.5 /host/ -sudo cp libtinfo.so.5 /usr/lib/x86_64-linux-gnu/ - -sudo useradd -U -m heavyai -sudo curl https://releases.heavy.ai/GPG-KEY-heavyai | sudo apt-key add - -echo "deb https://releases.heavy.ai/os/apt/ stable cpu" | sudo tee /etc/apt/sources.list.d/heavyai.list -sudo apt-get update -y -sudo apt-get install heavyai -y - -export HEAVYAI_USER=heavyai -export HEAVYAI_GROUP=heavyai -export HEAVYAI_STORAGE=/var/lib/heavyai -export HEAVYAI_PATH=/opt/heavyai -export HEAVYAI_LOG=/var/lib/heavyai/data/mapd_log - -pushd $HEAVYAI_PATH/systemd -./install_heavy_systemd.sh -popd - -# Press Enter multiple times. 
- -sudo systemctl start heavydb -sudo systemctl enable heavydb - -# Load the data - -../download-hits-csv -chmod 777 ~ hits.csv - -sudo bash -c "echo 'allowed-import-paths = [\"$(pwd)\"]' > /var/lib/heavyai/heavy.conf_" -sudo bash -c "cat /var/lib/heavyai/heavy.conf >> /var/lib/heavyai/heavy.conf_" -sudo bash -c "mv /var/lib/heavyai/heavy.conf_ /var/lib/heavyai/heavy.conf && chown heavyai /var/lib/heavyai/heavy.conf" -sudo systemctl restart heavydb - -/opt/heavyai/bin/heavysql -t -p HyperInteractive < create.sql -echo -n "Load time: " -command time -f '%e' /opt/heavyai/bin/heavysql -q -t -p HyperInteractive <<< "COPY hits FROM '$(pwd)/hits.csv' WITH (HEADER = 'false');" - -# Loaded: 99997497 recs, Rejected: 0 recs in 572.633 secs - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -du -bcs /var/lib/heavyai/ | grep total - -cat log.txt | grep -P 'Total time|null' | sed -r -e 's/^.*Total time: ([0-9]+) ms$/\1/' | - awk '{ if ($1 == "null") { print } else { print $1 / 1000 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=yes +# omnisci/core-os-cpu's first cold start runs schema migrations, opens +# its catalog, and binds Thrift ports; 600 s wasn't enough on the first +# Docker rewrite run, so allow up to 15 minutes. +export BENCH_CHECK_TIMEOUT=900 +exec ../lib/benchmark-common.sh diff --git a/heavyai/check b/heavyai/check new file mode 100755 index 0000000000..6f05c4695c --- /dev/null +++ b/heavyai/check @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Match the pre-refactor heavysql incantation: no positional db arg, the +# default `omnisci` is implicit. Some 5.10.2 builds of omnisql treat the +# trailing positional arg as a script path rather than a db name. +CONTAINER_NAME=${CONTAINER_NAME:-heavyai} +sudo docker exec -i "$CONTAINER_NAME" /omnisci/bin/omnisql \ + -p HyperInteractive -q -t <<< 'SELECT 1' >/dev/null 2>&1 diff --git a/heavyai/data-size b/heavyai/data-size new file mode 100755 index 0000000000..fac2d5383b --- /dev/null +++ b/heavyai/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +# heavyai's storage is bind-mounted from heavyai-storage/. +sudo du -bcs heavyai-storage/ | grep total | awk '{print $1}' diff --git a/heavyai/install b/heavyai/install new file mode 100755 index 0000000000..740e8db9ac --- /dev/null +++ b/heavyai/install @@ -0,0 +1,31 @@ +#!/bin/bash +# HEAVY.AI's apt repo and tarball CDN (releases.heavy.ai/...) both +# started returning S3 AccessDenied. The source repo at +# github.com/heavyai/heavydb is alive but its GitHub releases ship no +# binaries, and a full C++ build is too heavy to run inside cloud-init. +# +# omnisci/core-os-cpu:v5.10.2 (Feb 2022) is the last public Docker +# image — OmniSciDB, the immediate predecessor of HeavyDB before the +# v6.0.0 rename. The schema and queries the benchmark uses are vanilla +# enough to run unchanged. 
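+#
+# CONTAINER_NAME and HEAVYAI_VERSION are environment-overridable; see the
+# README for notes on picking a different OmniSci tag.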
+set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-heavyai} +HEAVYAI_VERSION=${HEAVYAI_VERSION:-v5.10.2} + +sudo apt-get update -y +sudo apt-get install -y docker.io + +sudo docker pull "omnisci/core-os-cpu:$HEAVYAI_VERSION" + +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +mkdir -p heavyai-storage +sudo docker run -d \ + --name "$CONTAINER_NAME" \ + -p 6274:6274 \ + -p 6273:6273 \ + -v "$(pwd)/heavyai-storage:/omnisci-storage" \ + "omnisci/core-os-cpu:$HEAVYAI_VERSION" diff --git a/heavyai/load b/heavyai/load new file mode 100755 index 0000000000..1b3ceff39e --- /dev/null +++ b/heavyai/load @@ -0,0 +1,19 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-heavyai} + +# Schema is fed from the host over stdin. +sudo docker exec -i "$CONTAINER_NAME" /omnisci/bin/omnisql \ + -t -p HyperInteractive < create.sql + +# Move the CSV into the container so server-side COPY can read it without +# round-tripping over Thrift. +sudo docker cp hits.csv "$CONTAINER_NAME":/tmp/hits.csv +sudo docker exec -i "$CONTAINER_NAME" /omnisci/bin/omnisql \ + -q -t -p HyperInteractive \ + <<< "COPY hits FROM '/tmp/hits.csv' WITH (HEADER = 'false');" + +sudo docker exec "$CONTAINER_NAME" rm -f /tmp/hits.csv +rm -f hits.csv +sync diff --git a/heavyai/query b/heavyai/query new file mode 100755 index 0000000000..dbf895a69b --- /dev/null +++ b/heavyai/query @@ -0,0 +1,30 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via omnisql in the heavyai +# container. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed +# from omnisql's "Total time: ms" footer). +# Exit non-zero on error. +set -e + +CONTAINER_NAME=${CONTAINER_NAME:-heavyai} +query=$(cat) + +raw=$(sudo docker exec -i "$CONTAINER_NAME" /omnisci/bin/omnisql \ + -t -p HyperInteractive <<< "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qiE '^Exception|^Error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" + +ms=$(printf '%s\n' "$raw" | grep -oP 'Total time:\s*\K[0-9]+(?=\s*ms)' | tail -n1) + +if [ -z "$ms" ]; then + echo "no timing in omnisql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/heavyai/run.sh b/heavyai/run.sh deleted file mode 100755 index 516ad08bac..0000000000 --- a/heavyai/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - /opt/heavyai/bin/heavysql -t -p HyperInteractive <<< "${query}" | grep 'Total time' || echo 'null' - done; -done; diff --git a/heavyai/start b/heavyai/start new file mode 100755 index 0000000000..bfabc3f96d --- /dev/null +++ b/heavyai/start @@ -0,0 +1,9 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-heavyai} + +# Idempotent: if already running, leave it alone. 
+if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" >/dev/null +fi diff --git a/heavyai/stop b/heavyai/stop new file mode 100755 index 0000000000..b9fb21b20c --- /dev/null +++ b/heavyai/stop @@ -0,0 +1,3 @@ +#!/bin/bash +CONTAINER_NAME=${CONTAINER_NAME:-heavyai} +sudo docker stop "$CONTAINER_NAME" >/dev/null 2>&1 || true diff --git a/hologres/.gitignore b/hologres/.gitignore new file mode 100644 index 0000000000..853e630186 --- /dev/null +++ b/hologres/.gitignore @@ -0,0 +1,6 @@ +bin/ +hits_part_* +hits.tsv +load_out.txt +log_queries_*.txt +result_queries_*.txt diff --git a/hologres/benchmark.sh b/hologres/benchmark.sh index ef83844f89..754ae40bae 100755 --- a/hologres/benchmark.sh +++ b/hologres/benchmark.sh @@ -8,10 +8,38 @@ PORT=$4 DATABASE="hits" -# Install dependencies -sudo yum update -y -sudo yum install postgresql-server -y -sudo yum install postgresql-contrib -y +# Install dependencies. Hologres is a managed cloud service, so all this +# host needs is a psql client. Pull a postgres docker image once and run +# psql out of it — works on Ubuntu/Debian/Amazon Linux/RHEL alike. +PSQL_IMAGE="postgres:17-alpine" +if ! command -v docker >/dev/null 2>&1; then + if command -v apt-get >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io + elif command -v yum >/dev/null 2>&1; then + sudo yum install -y docker + else + echo "hologres: install docker manually first" >&2 + exit 1 + fi +fi +sudo systemctl start docker 2>/dev/null || sudo service docker start || true +sudo docker pull "$PSQL_IMAGE" + +# Drop a `psql` shim into ./bin/ that wraps `docker run`. Adding the dir to +# PATH lets the rest of this script and run.sh call `psql ...` normally — +# including `command time -f '%e' psql ...`, which would skip a bash +# function but does pick up shims found on PATH. 
+mkdir -p bin +cat > bin/psql < "$query_file" + +python3 - "$query_file" <<'PY' +import sys +import timeit +from tableauhyperapi import HyperProcess, Telemetry, Connection + +with open(sys.argv[1]) as f: + query = f.read() + +with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: + with Connection(hyper.endpoint) as connection: + connection.execute_command(open("create.sql").read()) + start = timeit.default_timer() + rows = connection.execute_list_query(query) + end = timeit.default_timer() + +for r in rows: + print(r) + +print(f"{end - start:.3f}", file=sys.stderr) +PY diff --git a/hyper-parquet/query.py b/hyper-parquet/query.py deleted file mode 100755 index 2df4fb3b3e..0000000000 --- a/hyper-parquet/query.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 -import timeit -import sys -import subprocess - -from tableauhyperapi import HyperProcess, Telemetry, Connection, CreateMode, HyperException - -query = sys.stdin.read() - -with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: - with Connection(hyper.endpoint) as connection: - # Hyper only supports temporary external tables, so we need to create them on every query - connection.execute_command(open("create.sql").read()) - for try_num in range(3): - if try_num == 0: - # Flush OS page cache before first run of each query - subprocess.run(['sync'], check=True) - subprocess.run(['sudo', 'tee', '/proc/sys/vm/drop_caches'], input=b'3', check=True, stdout=subprocess.DEVNULL) - - start = timeit.default_timer() - try: - connection.execute_list_query(query) - print(round(timeit.default_timer() - start, 3)) - except HyperException: - print("null") diff --git a/hyper-parquet/results/20260509/c6a.4xlarge.json b/hyper-parquet/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..c65db4e9a4 --- /dev/null +++ b/hyper-parquet/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Salesforce Hyper (Parquet)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented"], + "load_time": 21, + "data_size": 14737666736, + "result": [ + [0.984, 0.319, 0.318], + [0.879, 0.338, 0.337], + [1.297, 0.455, 0.473], + [2.185, 0.41, 0.378], + [2.688, 1.052, 1.019], + [2.848, 0.993, 0.931], + [1.046, 0.397, 0.362], + [0.931, 0.349, 0.31], + [2.956, 1.202, 1.221], + [3.717, 1.465, 1.461], + [2.313, 0.407, 0.411], + [2.34, 0.42, 0.452], + [2.866, 0.977, 0.939], + [4.666, 1.442, 1.424], + [2.377, 0.993, 1.026], + [2.093, 1.07, 1.043], + [3.869, 1.629, 1.656], + [3.751, 1.532, 1.536], + [6.212, 3.083, 3.077], + [1.362, 0.351, 0.334], + [10.559, 1.268, 1.277], + [12.146, 1.312, 1.307], + [20.523, 3.273, 3.306], + [48.187, 4.347, 4.282], + [3.767, 0.719, 0.709], + [1.983, 0.699, 0.673], + [3.784, 0.696, 0.738], + [10.704, 1.483, 1.466], + [9.622, 7.909, 7.883], + [5.319, 4.838, 4.712], + [3.847, 1.013, 1.033], + [7.413, 1.275, 1.248], + [7.903, 4.499, 4.53], + [11.226, 2.495, 2.537], + [11.231, 2.539, 2.51], + [1.767, 1.001, 1.001], + [0.977, 0.361, 0.379], + [0.928, 0.351, 0.399], + [0.948, 0.335, 0.3], + [1.189, 0.466, 0.456], + [0.905, 0.313, 0.317], + [1.105, 0.319, 0.306], + [0.877, 0.308, 0.314] +] +} + diff --git a/hyper-parquet/results/20260509/c6a.metal.json b/hyper-parquet/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..29ecea35e0 --- /dev/null +++ b/hyper-parquet/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Salesforce 
Hyper (Parquet)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented"], + "load_time": 64, + "data_size": 14737666736, + "result": [ + [0.651, 0.264, 0.266], + [1.019, 0.271, 0.275], + [1.544, 0.294, 0.304], + [2.099, 0.307, 0.301], + [2.103, 0.428, 0.435], + [2.37, 0.482, 0.472], + [0.751, 0.302, 0.283], + [1.299, 0.279, 0.291], + [2.609, 0.512, 0.485], + [3.046, 0.54, 0.542], + [2.06, 0.325, 0.322], + [2.209, 0.316, 0.325], + [2.438, 0.488, 0.504], + [3.772, 0.642, 0.647], + [2.418, 0.479, 0.481], + [2.01, 0.44, 0.445], + [3.736, 0.634, 0.625], + [3.708, 0.54, 0.571], + [5.808, 0.862, 0.89], + [1.646, 0.296, 0.302], + [10.501, 0.536, 0.535], + [11.985, 0.578, 0.55], + [20.369, 0.816, 0.837], + [48.203, 1.702, 1.481], + [3.746, 0.41, 0.409], + [1.958, 0.387, 0.379], + [3.755, 0.444, 0.423], + [10.666, 0.561, 0.575], + [9.218, 2.745, 2.914], + [1.306, 0.918, 0.994], + [3.527, 0.506, 0.502], + [7.064, 0.607, 0.565], + [6.058, 1.17, 1.189], + [10.893, 0.972, 1.051], + [10.861, 0.949, 0.985], + [1.342, 0.459, 0.471], + [0.738, 0.392, 0.345], + [0.729, 0.339, 0.36], + [0.723, 0.313, 0.306], + [0.937, 0.465, 0.501], + [0.676, 0.287, 0.282], + [0.832, 0.306, 0.287], + [0.677, 0.327, 0.306] +] +} + diff --git a/hyper-parquet/run.sh b/hyper-parquet/run.sh deleted file mode 100755 index 64df8c6082..0000000000 --- a/hyper-parquet/run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/hyper-parquet/start b/hyper-parquet/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/hyper-parquet/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/hyper-parquet/stop b/hyper-parquet/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/hyper-parquet/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/hyper/benchmark.sh b/hyper/benchmark.sh index 0f6968613e..b0b9f4775a 100755 --- a/hyper/benchmark.sh +++ b/hyper/benchmark.sh @@ -1,20 +1,5 @@ #!/bin/bash - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install tableauhyperapi - -../download-hits-csv - -echo -n "Load time: " -command time -f '%e' ./load.py - -./run.sh | tee log.txt - -cat log.txt | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo -n "Data size: " -du -b hits.hyper +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/hyper/check b/hyper/check new file mode 100755 index 0000000000..23ad27458a --- /dev/null +++ b/hyper/check @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +python3 - <<'PY' +from tableauhyperapi import HyperProcess, Telemetry, Connection +with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: + with Connection(hyper.endpoint) as connection: + connection.execute_list_query("SELECT 1") +PY diff --git a/hyper/data-size b/hyper/data-size new file mode 100755 index 0000000000..4dce0916ee --- /dev/null +++ b/hyper/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < hits.hyper diff --git a/hyper/install b/hyper/install new file mode 100755 index 0000000000..537a36ca4a --- /dev/null +++ b/hyper/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate + +pip install --upgrade pip +pip install tableauhyperapi diff --git a/hyper/load.py b/hyper/load similarity index 70% rename from hyper/load.py rename to hyper/load index 5380f84bd6..a4a1d58cb0 100755 --- a/hyper/load.py +++ b/hyper/load @@ -1,8 +1,20 @@ -#!/usr/bin/env python3 +#!/bin/bash +set -e +# shellcheck disable=SC1091 +source myenv/bin/activate + +# Idempotent: blow away any prior DB. +rm -f hits.hyper + +python3 - <<'PY' from tableauhyperapi import HyperProcess, Telemetry, Connection, CreateMode with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: with Connection(hyper.endpoint, 'hits.hyper', CreateMode.CREATE_AND_REPLACE) as connection: connection.execute_command(open("create.sql").read()) connection.execute_command("copy hits from 'hits.csv' with (format csv)") +PY + +rm -f hits.csv +sync diff --git a/hyper/query b/hyper/query new file mode 100755 index 0000000000..d0f59d1a0e --- /dev/null +++ b/hyper/query @@ -0,0 +1,35 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via tableau hyperapi against +# hits.hyper. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +# Stage stdin into a temp file: `python3 - <<'PY'` already consumes stdin to +# read the program, so sys.stdin.read() inside the heredoc returns "". 
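+# The query is instead passed as a file path in argv[1] and re-read inside
+# the Python program; the trap removes the temp file on any exit.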
+query_file=$(mktemp) +trap 'rm -f "$query_file"' EXIT +cat > "$query_file" + +python3 - "$query_file" <<'PY' +import sys +import timeit +from tableauhyperapi import HyperProcess, Telemetry, Connection, CreateMode + +with open(sys.argv[1]) as f: + query = f.read() + +with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: + with Connection(hyper.endpoint, 'hits.hyper', CreateMode.NONE) as connection: + start = timeit.default_timer() + rows = connection.execute_list_query(query) + end = timeit.default_timer() + +for r in rows: + print(r) + +print(f"{end - start:.3f}", file=sys.stderr) +PY diff --git a/hyper/query.py b/hyper/query.py deleted file mode 100755 index e1833c0e4c..0000000000 --- a/hyper/query.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python3 -import timeit -import sys - -from tableauhyperapi import HyperProcess, Telemetry, Connection, CreateMode, HyperException - -query = sys.stdin.read() - -with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: - with Connection(hyper.endpoint, 'hits.hyper', CreateMode.NONE) as connection: - for _ in range(3): - start = timeit.default_timer() - try: - connection.execute_list_query(query) - print(round(timeit.default_timer() - start, 3)) - except HyperException: - print("null") diff --git a/hyper/results/20260509/c6a.4xlarge.json b/hyper/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..a65c7cd5ea --- /dev/null +++ b/hyper/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Salesforce Hyper", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented"], + "load_time": 660, + "data_size": 18959040512, + "result": [ + [0.081, 0.022, 0.022], + [0.133, 0.03, 0.023], + [0.399, 0.072, 0.067], + [1.075, 0.049, 0.049], + [0.815, 0.606, 0.613], + [1.94, 0.225, 0.225], + [0.112, 0.017, 0.017], + [0.14, 0.035, 0.021], + [2.071, 0.805, 0.791], + [3.42, 0.92, 0.934], + [1.192, 0.072, 0.071], + [1.201, 0.099, 0.099], + [2.084, 0.354, 0.363], + [4.521, 0.836, 0.843], + [2.088, 0.388, 0.387], + [1.475, 0.716, 0.733], + [4.39, 1.081, 1.067], + [4.279, 0.952, 0.956], + [8.359, 2.142, 2.162], + [0.164, 0.013, 0.014], + [14.711, 0.533, 0.533], + [16.084, 0.568, 0.558], + [16.945, 0.507, 0.483], + [7.601, 0.595, 0.593], + [1.134, 0.087, 0.067], + [1.895, 0.105, 0.106], + [1.153, 0.085, 0.083], + [15.13, 0.671, 0.681], + [12.665, 7.623, 7.627], + [0.678, 0.578, 0.577], + [4.797, 0.367, 0.361], + [8.325, 0.542, 0.546], + [7.776, 3.65, 3.712], + [15.252, 1.564, 1.577], + [15.228, 1.572, 1.568], + [0.981, 0.635, 0.639], + [0.169, 0.036, 0.034], + [0.155, 0.021, 0.021], + [0.19, 0.034, 0.036], + [0.216, 0.046, 0.045], + [0.205, 0.019, 0.016], + [0.211, 0.017, 0.017], + [0.158, 0.019, 0.019] +] +} + diff --git a/hyper/results/20260509/c6a.metal.json b/hyper/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..6cd523e846 --- /dev/null +++ b/hyper/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Salesforce Hyper", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented"], + "load_time": 424, + "data_size": 18959040512, + "result": [ + [0.074, 0.017, 0.014], + [0.129, 0.029, 0.029], + [0.394, 0.104, 0.094], + [1.094, 0.032, 0.037], + [0.572, 0.177, 0.164], + [1.859, 0.196, 0.183], + [0.107, 0.023, 0.022], + [0.122, 0.025, 
0.025], + [1.779, 0.244, 0.234], + [3.17, 0.329, 0.325], + [1.197, 0.089, 0.089], + [1.205, 0.149, 0.154], + [1.977, 0.209, 0.197], + [4.082, 0.325, 0.324], + [2.003, 0.281, 0.274], + [1.248, 0.17, 0.179], + [4.128, 0.344, 0.355], + [4.032, 0.284, 0.283], + [7.999, 0.574, 0.569], + [0.185, 0.021, 0.022], + [14.693, 0.473, 0.449], + [16.074, 0.643, 0.652], + [16.925, 0.841, 0.839], + [13.522, 1.742, 1.633], + [1.171, 0.155, 0.156], + [1.905, 0.145, 0.143], + [1.158, 0.158, 0.139], + [15.118, 0.487, 0.535], + [12.723, 0.878, 0.846], + [0.306, 0.26, 0.24], + [4.695, 0.369, 0.396], + [8.12, 0.35, 0.326], + [6.243, 0.834, 0.833], + [14.914, 0.71, 0.699], + [14.937, 0.68, 0.741], + [0.816, 0.183, 0.181], + [0.194, 0.05, 0.056], + [0.166, 0.034, 0.032], + [0.197, 0.047, 0.054], + [0.253, 0.068, 0.06], + [0.206, 0.02, 0.021], + [0.284, 0.024, 0.025], + [0.147, 0.029, 0.035] +] +} + diff --git a/hyper/run.sh b/hyper/run.sh deleted file mode 100755 index 64df8c6082..0000000000 --- a/hyper/run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/hyper/start b/hyper/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/hyper/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/hyper/stop b/hyper/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/hyper/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/infobright/README.md b/infobright/README.md new file mode 100644 index 0000000000..bdc1d25a2c --- /dev/null +++ b/infobright/README.md @@ -0,0 +1,11 @@ +# Infobright + +## Dead (May 2026) + +Infobright Inc. has been defunct since 2017. The benchmark uses the community Docker image + + docker pull flolas/infobright + +The image still pulls but is unmaintained: the latest run hangs silently part-way through `LOAD DATA LOCAL INFILE` (schema commands succeed, then no further output until the run times out). The existing workaround that truncates the dataset to the first 90M rows (to dodge a row-93557187 parse error) doesn't help with the silent hang. + +There is no maintained image, no upstream successor, and no working binary path. The directory and historical results are kept; nothing here runs anymore. diff --git a/infobright/benchmark.sh b/infobright/benchmark.sh index 67b2f2f3b4..531bd65038 100755 --- a/infobright/benchmark.sh +++ b/infobright/benchmark.sh @@ -1,39 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y docker.io - -mkdir infobright -sudo docker run --name mysql_ib -e MYSQL_ROOT_PASSWORD=mypass -v $(pwd)/infobright:/mnt/mysql_data -p 5029:5029 -p 5555 -d flolas/infobright - -sudo docker run -i --rm --network host mysql:5 mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass -e "CREATE DATABASE test" -sudo docker run -i --rm --network host mysql:5 mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass --database=test -e "$(cat create.sql)" - -# Load the data - -../download-hits-tsv - -# ERROR 2 (HY000) at line 1: Wrong data or column definition. Row: 93557187, field: 100. 
-head -n 90000000 hits.tsv > hits90m.tsv - -echo -n "Load time: " -command time -f '%e' sudo docker run -i --rm --volume $(pwd):/workdir --network host mysql:5 mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass --database=test -e "SET sql_log_bin = 0; - LOAD DATA LOCAL INFILE '/workdir/hits90m.tsv' INTO TABLE test.hits - FIELDS TERMINATED BY '\\t' ENCLOSED BY '' ESCAPED BY '\\\\' LINES TERMINATED BY '\\n' STARTING BY ''" - -# 38m37.466s - -echo -n "Data size: " -sudo docker exec mysql_ib du -bcs /mnt/mysql_data/ /usr/local/infobright-4.0.7-x86_64/cache | grep total - -# 13 760 341 294 - -./run.sh 2>&1 | tee log.txt - -cat log.txt | - grep -P 'rows? in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) days? )?(([0-9.]+) hours? )?(([0-9.]+) min )?([0-9.]+) sec\).*?$/\2,\4,\6,\7/' | - awk -F, '{ if ($1 == "null") { print } else { print $1 * 86400 + $2 * 3600 + $3 * 60 + $4 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/infobright/check b/infobright/check new file mode 100755 index 0000000000..65222793e7 --- /dev/null +++ b/infobright/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +sudo docker run --rm --network host mysql:5 \ + mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass \ + -e "SELECT 1" >/dev/null 2>&1 diff --git a/infobright/data-size b/infobright/data-size new file mode 100755 index 0000000000..49f4702e08 --- /dev/null +++ b/infobright/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +sudo docker exec mysql_ib du -bcs /mnt/mysql_data/ /usr/local/infobright-4.0.7-x86_64/cache \ + | grep total | awk '{print $1}' diff --git a/infobright/install b/infobright/install new file mode 100755 index 0000000000..bb091ed988 --- /dev/null +++ b/infobright/install @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io + +# Pull images up-front so install captures all setup work. +sudo docker pull flolas/infobright +sudo docker pull mysql:5 + +mkdir -p infobright + +# (Re)create the container only if missing. +if ! sudo docker inspect mysql_ib >/dev/null 2>&1; then + sudo docker run --name mysql_ib \ + -e MYSQL_ROOT_PASSWORD=mypass \ + -v "$(pwd)/infobright:/mnt/mysql_data" \ + -p 5029:5029 -p 5555 \ + -d flolas/infobright +fi diff --git a/infobright/load b/infobright/load new file mode 100755 index 0000000000..4d5492cf1a --- /dev/null +++ b/infobright/load @@ -0,0 +1,21 @@ +#!/bin/bash +set -eu + +MYSQL_RUN="sudo docker run -i --rm --network host mysql:5 \ + mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass" + +$MYSQL_RUN -e "DROP DATABASE IF EXISTS test" +$MYSQL_RUN -e "CREATE DATABASE test" +$MYSQL_RUN --database=test -e "$(cat create.sql)" + +# Infobright errors out on row 93557187 in the full dataset; truncate. 
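+# (hits.tsv has ~100M rows in total, so roughly the first 90% of the data is loaded.)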
+head -n 90000000 hits.tsv > hits90m.tsv + +sudo docker run -i --rm --volume "$(pwd):/workdir" --network host mysql:5 \ + mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass --database=test \ + -e "SET sql_log_bin = 0; + LOAD DATA LOCAL INFILE '/workdir/hits90m.tsv' INTO TABLE test.hits + FIELDS TERMINATED BY '\t' ENCLOSED BY '' ESCAPED BY '\\\\' LINES TERMINATED BY '\n' STARTING BY ''" + +rm -f hits.tsv hits90m.tsv +sync diff --git a/infobright/query b/infobright/query new file mode 100755 index 0000000000..b505db1a88 --- /dev/null +++ b/infobright/query @@ -0,0 +1,39 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via mysql:5 client (Docker) against +# the Infobright container's MySQL protocol on :5029. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(sudo docker run -i --rm --network host mysql:5 \ + mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass --database=test \ + -vvv -e "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$out" | grep -q '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" + +parsed=$(printf '%s\n' "$out" \ + | grep -P 'rows? in set|Empty set|Query OK' \ + | tail -n1 \ + | sed -r 's/^.*?\((([0-9.]+) days? )?(([0-9.]+) hours? )?(([0-9.]+) min )?([0-9.]+) sec\).*?$/\2,\4,\6,\7/') + +if [ -z "$parsed" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi + +awk -F, -v p="$parsed" 'BEGIN { + n = split(p, a, ",") + d = (a[1] == "") ? 0 : a[1] + h = (a[2] == "") ? 0 : a[2] + m = (a[3] == "") ? 0 : a[3] + s = (a[4] == "") ? 0 : a[4] + printf "%.3f\n", d * 86400 + h * 3600 + m * 60 + s +}' >&2 diff --git a/infobright/run.sh b/infobright/run.sh deleted file mode 100755 index b6f176b339..0000000000 --- a/infobright/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - sudo docker run --rm --network host mysql:5 mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass --database=test -vvv -e "${query}" - done; -done; diff --git a/infobright/start b/infobright/start new file mode 100755 index 0000000000..007dcf1544 --- /dev/null +++ b/infobright/start @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +# Idempotent: if the container is running and responsive, do nothing. +if sudo docker inspect -f '{{.State.Running}}' mysql_ib 2>/dev/null | grep -q true; then + if sudo docker run --rm --network host mysql:5 \ + mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass \ + -e "SELECT 1" >/dev/null 2>&1; then + exit 0 + fi +fi + +sudo docker start mysql_ib diff --git a/infobright/stop b/infobright/stop new file mode 100755 index 0000000000..8b630229bc --- /dev/null +++ b/infobright/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo docker stop mysql_ib >/dev/null 2>&1 || true diff --git a/kinetica/README.md b/kinetica/README.md index 9ee125133f..a39ed7c469 100644 --- a/kinetica/README.md +++ b/kinetica/README.md @@ -17,3 +17,15 @@ All the queries will be executed on behalf of the user `admin` with the password > WARNING: Invalid_Argument: quoted field must end with quote (ColumnIndex:100)(ColumnName:UTMTerm)(ColumnType:char256)(Value:"tatuirovarki_redmond 70 0 -296158784638538920 -8631670417943857411 0) > WARNING: Skipped: 1, inserted : 99997496 records, updated : 0 records. 
+## Sourcing kisql (May 2026) + +The previously-pinned `kisql` binary at + + https://github.com/kineticadb/kisql/releases/download/v7.1.7.2/kisql + +is gone — that GitHub release was deleted upstream and newer release pages ship no compiled artifacts. The same binary is committed directly to the repo root as a self-extracting bash+jar launcher, so `install` now fetches it from + + https://raw.githubusercontent.com/kineticadb/kisql//kisql + +(default `KISQL_TAG=v7.2.3.17`, matches the 7.2.x server we install). + diff --git a/kinetica/benchmark.sh b/kinetica/benchmark.sh index 046fa0b309..b6523b5835 100755 --- a/kinetica/benchmark.sh +++ b/kinetica/benchmark.sh @@ -1,35 +1,7 @@ -#!/usr/bin/bash - -# Run setup.sh (assume we are running on ubuntu) -./setup-dev-ubuntu.sh - -# download the db -export KINETICA_ADMIN_PASSWORD=admin -curl https://files.kinetica.com/install/kinetica.sh -o kinetica && chmod u+x kinetica && sudo -E ./kinetica start - -# set up the cli -wget --continue --progress=dot:giga https://github.com/kineticadb/kisql/releases/download/v7.1.7.2/kisql - -chmod u+x ./kisql - -export KI_PWD="admin" -CLI="./kisql --host localhost --user admin" - -# download the ds -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' -sudo mv hits.tsv.gz ./kinetica-persist/ - -$CLI --file create.sql -$CLI --sql "ALTER TIER ram WITH OPTIONS ('capacity' = '27000000000');" - -START=$(date +%s) - -$CLI --sql "load into hits from file paths 'hits.tsv.gz' format delimited text (INCLUDES HEADER=false, DELIMITER = '\t') WITH OPTIONS (NUM_TASKS_PER_RANK=16, ON ERROR=SKIP);" - -END=$(date +%s) -LOADTIME=$(echo "$END - $START" | bc) -echo "Load time: $LOADTIME" -echo "Data size: $(du -bcs ./kinetica-persist/gpudb | grep total)" - -# run the queries -./run.sh +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +# kinetica downloads hits.tsv.gz directly inside ./load (Kinetica wants the +# gzipped form), so no central download script is used. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/kinetica/check b/kinetica/check new file mode 100755 index 0000000000..1e578951ed --- /dev/null +++ b/kinetica/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +export KI_PWD=admin +./kisql --host localhost --user admin --sql 'SELECT 1' >/dev/null 2>&1 diff --git a/kinetica/data-size b/kinetica/data-size new file mode 100755 index 0000000000..57891b0e81 --- /dev/null +++ b/kinetica/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +du -bcs ./kinetica-persist/gpudb | grep total | awk '{print $1}' diff --git a/kinetica/install b/kinetica/install new file mode 100755 index 0000000000..212d2228cc --- /dev/null +++ b/kinetica/install @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +# setup-dev-ubuntu.sh installs docker, java (for kisql), and ripgrep. +./setup-dev-ubuntu.sh + +if [ ! -x ./kinetica ]; then + curl https://files.kinetica.com/install/kinetica.sh -o kinetica + chmod u+x kinetica +fi + +# Bring Kinetica up via the install/start script (idempotent inside). +export KINETICA_ADMIN_PASSWORD=admin +sudo -E ./kinetica start + +# Fetch the SQL CLI. The v7.1.7.2 GitHub release was deleted upstream +# and newer source tags ship no compiled artifacts, but the kisql binary +# is committed directly to the repo root as a self-extracting bash+jar +# launcher — fetch it via raw.githubusercontent.com instead. +KISQL_TAG=${KISQL_TAG:-v7.2.3.17} +if [ ! 
-x ./kisql ]; then + wget --continue --progress=dot:giga -O kisql \ + "https://raw.githubusercontent.com/kineticadb/kisql/$KISQL_TAG/kisql" + chmod u+x ./kisql +fi diff --git a/kinetica/load b/kinetica/load new file mode 100755 index 0000000000..75630adb1c --- /dev/null +++ b/kinetica/load @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +export KI_PWD=admin +CLI="./kisql --host localhost --user admin" + +# Kinetica's `load into ... format delimited text` reads the gzipped TSV +# directly from its persist directory, so we fetch the gzip rather than the +# decompressed TSV. +wget --continue --progress=dot:giga \ + 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' +sudo mv hits.tsv.gz ./kinetica-persist/ + +$CLI --file create.sql +$CLI --sql "ALTER TIER ram WITH OPTIONS ('capacity' = '27000000000');" + +$CLI --sql "load into hits from file paths 'hits.tsv.gz' format delimited text (INCLUDES HEADER=false, DELIMITER = '\t') WITH OPTIONS (NUM_TASKS_PER_RANK=16, ON ERROR=SKIP);" + +sudo rm -f ./kinetica-persist/hits.tsv.gz +sync diff --git a/kinetica/query b/kinetica/query new file mode 100755 index 0000000000..0402f739f0 --- /dev/null +++ b/kinetica/query @@ -0,0 +1,39 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via kisql. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# +# kisql 7.2+ prints "Timing (seconds): Connection=X, Query=Y" — we use Y. +# Older kisql emitted "Query Execution Time: sec"; we still accept that. +# Exit non-zero on error. +set -e + +export KI_PWD=admin +query=$(cat) + +raw=$(./kisql --host localhost --user admin --sql "$query" 2>&1) && exit_code=0 || exit_code=$? + +# kisql prints errors to stdout; sniff for them. Anchor to start-of-line +# so the WARNING lines that the load step emits ("Skipped: 1, inserted +# 99997496 records") aren't treated as fatal. +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qiE '^(error|exception)'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" + +# Prefer the kisql 7.2+ format; fall back to the legacy one. +secs=$(printf '%s\n' "$raw" \ + | grep -oE 'Query=[0-9]+(\.[0-9]+)?' | tail -n1 | cut -d= -f2) +if [ -z "$secs" ]; then + secs=$(printf '%s\n' "$raw" | grep -E 'Query Execution Time' | tail -n1 \ + | awk '{print $(NF-1)}') +fi + +if [ -z "$secs" ]; then + echo "no timing in kisql output" >&2 + exit 1 +fi + +awk -v s="$secs" 'BEGIN { printf "%.3f\n", s }' >&2 diff --git a/kinetica/run.sh b/kinetica/run.sh deleted file mode 100755 index 13e03d14c5..0000000000 --- a/kinetica/run.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -export KI_PWD=admin - -TRIES=3 -QUERY_NUM=1 -cat queries.sql | while read -r query; do - [ -z "$FQDN" ] && sync - [ -z "$FQDN" ] && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(./kisql --host localhost --user admin --sql "$query" 2>&1 | rg 'Query Execution Time' | awk '{print $(NF-1)}' ||:) - - [[ "$?" == "0" && "$RES" != "" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/kinetica/start b/kinetica/start new file mode 100755 index 0000000000..0f831bf404 --- /dev/null +++ b/kinetica/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +export KINETICA_ADMIN_PASSWORD=admin +export KI_PWD=admin + +# Idempotent: if kisql can already speak SELECT 1, do nothing. 
+if ./kisql --host localhost --user admin --sql 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +sudo -E ./kinetica start diff --git a/kinetica/stop b/kinetica/stop new file mode 100755 index 0000000000..0dd5f40d84 --- /dev/null +++ b/kinetica/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +export KINETICA_ADMIN_PASSWORD=admin +sudo -E ./kinetica stop || true diff --git a/lib/benchmark-common.sh b/lib/benchmark-common.sh new file mode 100755 index 0000000000..57cf4769c6 --- /dev/null +++ b/lib/benchmark-common.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# Shared ClickBench driver. +# +# A per-system benchmark.sh sets a few env vars and then exec's this script. +# This script is designed to be invoked from a system directory (e.g. +# clickhouse/), so all script paths below are relative to the system dir. +# +# Required env: +# BENCH_DOWNLOAD_SCRIPT Name of a top-level download-hits-* script to fetch +# the dataset (e.g. "download-hits-parquet-single"). +# Set to empty string for systems that read directly +# from a remote source (S3 datalake, remote services). +# +# Optional env: +# BENCH_RESTARTABLE "yes" (default) or "no". If "yes", the system is +# stopped+started between every query to neutralize +# warm-process effects. Set "no" for in-process / +# single-binary tools where restart would dominate +# query time (duckdb CLI, sqlite, dataframe wrappers). +# BENCH_TRIES Number of times each query is run. Default 3. +# BENCH_QUERIES_FILE Path to a queries file, one query per line. +# Default "queries.sql" (in the system dir). +# BENCH_CHECK_TIMEOUT Seconds to wait for ./check to succeed. Default 300. + +set -e + +# Defensive HOME export: cloud-init.sh.in stamps it too, but if an +# operator's local checkout predates that fix, the install/load/query +# scripts inherit an empty HOME and tools that follow XDG conventions +# (vcpkg, duckdb extension cache, go mod cache, gizmosql installer) +# fail in confusing ways. Pin to /root so every per-system step has a +# real home directory regardless. +export HOME="${HOME:-/root}" + +# BENCH_DOWNLOAD_SCRIPT must be set (possibly to empty for "no download"). +: "${BENCH_DOWNLOAD_SCRIPT?BENCH_DOWNLOAD_SCRIPT is required (set empty to skip)}" +: "${BENCH_RESTARTABLE:=yes}" +: "${BENCH_TRIES:=3}" +: "${BENCH_QUERIES_FILE:=queries.sql}" +: "${BENCH_CHECK_TIMEOUT:=300}" + +# Resolve the directory containing this script so we can find sibling +# helpers (download-hits-*). +LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +bench_check_loop() { + local i last_err + for i in $(seq 1 "$BENCH_CHECK_TIMEOUT"); do + if last_err=$(./check 2>&1 >/dev/null); then + return 0 + fi + sleep 1 + done + echo "bench: ./check did not succeed within ${BENCH_CHECK_TIMEOUT}s" >&2 + if [ -n "$last_err" ]; then + echo "bench: last ./check stderr was:" >&2 + printf '%s\n' "$last_err" | sed 's/^/ /' >&2 + fi + return 1 +} + +# Wait for ./check to start failing — i.e. the system is actually down, +# not merely told to stop. Engines that mmap their data files (Umbra, +# Hyper, etc.) keep the OS pagecache pinned until the process is gone, +# so we have to wait before drop_caches has any effect. Times out after +# 60s and proceeds anyway. +bench_wait_stopped() { + local i + for i in $(seq 1 60); do + if ! 
./check >/dev/null 2>&1; then + return 0 + fi + sleep 1 + done + echo "bench: system did not stop within 60s; proceeding anyway" >&2 + return 0 +} + +bench_flush_caches() { + sync + echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null +} + +bench_install() { + ./install +} + +bench_start() { + # Tolerate non-zero exit from ./start: many engines' start commands return + # non-zero when the server is already up but leave the system in the + # desired state. The check loop is the authoritative readiness signal. + # + # Silence ./start: many daemons (clickhouse-server, postgres, ...) print + # progress lines to stdout/stderr that would otherwise interleave with + # the parseable [t1,t2,t3]/Load time/Data size lines in the benchmark log. + ./start >/dev/null 2>&1 || true + bench_check_loop +} + +bench_stop() { + # Silence ./stop for the same reason as ./start. + ./stop >/dev/null 2>&1 +} + +bench_download() { + if [ -z "$BENCH_DOWNLOAD_SCRIPT" ]; then + return 0 + fi + "$LIB_DIR/$BENCH_DOWNLOAD_SCRIPT" +} + +bench_load() { + local start_t end_t + start_t=$(date +%s.%N) + ./load + end_t=$(date +%s.%N) + # Print "Load time: " matching the existing log shape that + # play.clickhouse.com expects. + awk -v s="$start_t" -v e="$end_t" 'BEGIN { printf "Load time: %.3f\n", e - s }' + + # Defense against silent partial loads. Several DBs (umbra, mysql, + # postgres, mongodb, cratedb) survive an earlyoom / kernel-OOM kill + # mid-COPY by restarting and exposing a half-empty table; queries + # then run in microseconds against a near-empty result set and the + # run looks green. Catch this here, before we fire 43 meaningless + # query iterations: + # + # 1. Re-run ./check — confirms the server is still up. + # 2. Run ./data-size — confirms enough bytes actually landed on + # disk. ClickBench's hits dataset compresses to >5 GB on every + # system in the catalog, so anything smaller is partial. + # + # bench_main calls ./data-size again later to log the value; we + # don't cache the number here because some systems (those that + # accumulate background compaction or merge files post-load) report + # a meaningfully different size by the time queries finish. + if ! ./check >/dev/null 2>&1; then + echo "bench: ./check failed after ./load — server crashed mid-load?" >&2 + return 1 + fi + + local size + size=$(./data-size 2>/dev/null || echo 0) + if ! [[ "$size" =~ ^[0-9]+$ ]] || [ "$size" -lt 5000000000 ]; then + echo "bench: data-size after load is '${size}' (<5 GB)" >&2 + echo "bench: ClickBench's hits dataset doesn't fit in <5 GB on any" >&2 + echo "bench: system in the catalog; treating this as a partial load" >&2 + echo "bench: (likely an OOM kill mid-COPY)." >&2 + return 1 + fi +} + +# Run a single query script and emit a single JSON-array `[t1,t2,t3],` line. +# Per-try timing is also appended to result.csv as `,,`. +bench_run_query() { + local query="$1" + local query_num="$2" + local i raw_stderr exit_code timing + local results=() + + if [ "$BENCH_RESTARTABLE" = "yes" ]; then + # Order matters: stop, wait until really stopped, then flush + # caches, then start. The naive order (flush, then stop) leaves + # mmap-backed engines (Umbra, DuckDB, Hyper, CedarDB) with their + # data files pinned by the still-running process, so drop_caches + # can't evict the pages — the new instance then re-mmaps those + # same files and the "cold" run reads from a warm page cache. 
+ # Waiting for ./check to fail before flushing makes the cold run + # actually cold even when ./stop returns before the process is + # fully gone. + ./stop >/dev/null 2>&1 || true + bench_wait_stopped + bench_flush_caches + ./start >/dev/null 2>&1 || true + bench_check_loop + else + bench_flush_caches + fi + + for i in $(seq 1 "$BENCH_TRIES"); do + # The query script's contract: stdout = result, stderr's last line = + # fractional seconds, exit 0 on success. + raw_stderr=$(printf '%s\n' "$query" | ./query 2>&1 >/dev/null) && exit_code=0 || exit_code=$? + + if [ "$exit_code" -eq 0 ]; then + # The query script's contract is "fractional seconds on the + # last line", but several systems (pyspark, JVM-based ones, + # anything that prints SparkSession shutdown lines after the + # measurement) emit additional log noise after the timing, + # so plain `tail -n1` was reading "Stopping SparkContext" or + # similar and producing all-null result rows. Pull the LAST + # numeric-looking line instead. + timing=$(printf '%s\n' "$raw_stderr" | grep -E '^[0-9]+(\.[0-9]+)?$' | tail -n1) + [ -z "$timing" ] && timing="null" + else + timing="null" + printf '%s\n' "$raw_stderr" >&2 + fi + results+=("$timing") + echo "${query_num},${i},${timing}" >> result.csv + done + + # Emit "[t1,t2,t3]," for compatibility with the existing log format. + local out="[" + local j + for j in "${!results[@]}"; do + out+="${results[$j]}" + if [ "$j" -lt $((${#results[@]} - 1)) ]; then + out+="," + fi + done + out+="]," + echo "$out" +} + +bench_main() { + bench_install + bench_start + + bench_download + bench_load + + : > result.csv + local query_num=1 + while IFS= read -r query; do + # Skip empty lines. + [ -z "$query" ] && continue + bench_run_query "$query" "$query_num" + query_num=$((query_num + 1)) + done < "$BENCH_QUERIES_FILE" + + # data-size may need the server up (e.g. ClickHouse queries system.tables, + # pandas hits the HTTP server), so report it before stopping. + echo -n "Data size: " + ./data-size + + bench_stop || true +} + +# Only run the full flow when executed directly (or via `exec`). Sourcing the +# file (e.g. for testing individual functions) won't trigger bench_main. 
+if [ "${BASH_SOURCE[0]}" = "$0" ]; then + bench_main +fi diff --git a/download-hits-csv b/lib/download-hits-csv similarity index 100% rename from download-hits-csv rename to lib/download-hits-csv diff --git a/download-hits-parquet-partitioned b/lib/download-hits-parquet-partitioned similarity index 100% rename from download-hits-parquet-partitioned rename to lib/download-hits-parquet-partitioned diff --git a/download-hits-parquet-single b/lib/download-hits-parquet-single similarity index 100% rename from download-hits-parquet-single rename to lib/download-hits-parquet-single diff --git a/download-hits-tsv b/lib/download-hits-tsv similarity index 100% rename from download-hits-tsv rename to lib/download-hits-tsv diff --git a/locustdb/benchmark.sh b/locustdb/benchmark.sh index ab9f4697c3..93c30798d9 100755 --- a/locustdb/benchmark.sh +++ b/locustdb/benchmark.sh @@ -15,7 +15,7 @@ sudo apt-get install -y g++ capnproto libclang-14-dev cargo build --features "enable_rocksdb" --features "enable_lz4" --release -../../download-hits-csv +../../lib/download-hits-csv target/release/repl --load hits.csv --db-path db diff --git a/mariadb-columnstore/benchmark.sh b/mariadb-columnstore/benchmark.sh index 1fdda45110..531bd65038 100755 --- a/mariadb-columnstore/benchmark.sh +++ b/mariadb-columnstore/benchmark.sh @@ -1,42 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y docker.io -docker run -d -p 3306:3306 --shm-size=512m -e PM1=mcs1 --hostname=mcs1 --name mcs1 mariadb/columnstore -docker exec -i mcs1 provision mcs1 - -export PASSWORD="tsFgm457%3cj" -for _ in {1..300} -do - sudo docker exec mcs1 mariadb -e "GRANT ALL PRIVILEGES ON *.* TO '$(whoami)'@'%' IDENTIFIED BY '${PASSWORD}';" | grep -F 'ERROR' || break - sleep 1 -done - -sudo apt-get install -y mariadb-client - -mysql --password="${PASSWORD}" --host 127.0.0.1 -e "CREATE DATABASE clickbench" -mysql --password="${PASSWORD}" --host 127.0.0.1 clickbench < create.sql - -# Load the data - -../download-hits-tsv - -echo -n "Load time: " -command time -f '%e' mysql --password="${PASSWORD}" --host 127.0.0.1 clickbench -e "SET sql_log_bin = 0; - LOAD DATA LOCAL INFILE 'hits.tsv' INTO TABLE hits - FIELDS TERMINATED BY '\\t' ENCLOSED BY '' ESCAPED BY '\\\\' LINES TERMINATED BY '\\n' STARTING BY ''" - -# 41m47.856s - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo docker exec mcs1 du -bcs /var/lib/columnstore | grep total - -cat log.txt | - grep -P 'rows? in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/mariadb-columnstore/check b/mariadb-columnstore/check new file mode 100755 index 0000000000..ffc02755ce --- /dev/null +++ b/mariadb-columnstore/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +PASSWORD=${MCS_PASSWORD:-tsFgm457%3cj} +mariadb --skip-ssl --password="$PASSWORD" --host 127.0.0.1 -e 'SELECT 1' >/dev/null diff --git a/mariadb-columnstore/data-size b/mariadb-columnstore/data-size new file mode 100755 index 0000000000..68582c28d9 --- /dev/null +++ b/mariadb-columnstore/data-size @@ -0,0 +1,3 @@ +#!/bin/bash +set -eu +sudo docker exec mcs1 du -bcs /var/lib/columnstore | grep total | awk '{print $1}' diff --git a/mariadb-columnstore/install b/mariadb-columnstore/install new file mode 100755 index 0000000000..9445fd6cdf --- /dev/null +++ b/mariadb-columnstore/install @@ -0,0 +1,7 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io mariadb-client + +sudo docker pull mariadb/columnstore diff --git a/mariadb-columnstore/load b/mariadb-columnstore/load new file mode 100755 index 0000000000..9e2705ca1f --- /dev/null +++ b/mariadb-columnstore/load @@ -0,0 +1,22 @@ +#!/bin/bash +set -eu + +PASSWORD=${MCS_PASSWORD:-tsFgm457%3cj} +MARIADB="mariadb --skip-ssl --password=$PASSWORD --host 127.0.0.1" + +# Recreate the database + table from a known-empty state. +$MARIADB -e "DROP DATABASE IF EXISTS clickbench" +$MARIADB -e "CREATE DATABASE clickbench" +$MARIADB clickbench < create.sql + +# ColumnStore's recommended bulk loader is `cpimport`, not LOAD DATA +# INFILE — the latter went through the SQL layer in 64KB chunks, +# couldn't keep up with a 75 GB dataset, and would die mid-stream +# with the cryptic "Internal error < 0 (Not system error) from +# storage engine ColumnStore" documented in this entry's README. +# cpimport bulk-imports column-stripe-natively and accepts STDIN, so +# we can pipe the host-side hits.tsv straight in without docker cp. +sudo docker exec -i mcs1 cpimport clickbench hits -s '\t' < hits.tsv + +rm -f hits.tsv +sync diff --git a/mariadb-columnstore/query b/mariadb-columnstore/query new file mode 100755 index 0000000000..e6e17ce6e4 --- /dev/null +++ b/mariadb-columnstore/query @@ -0,0 +1,41 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via mariadb client. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed +# from the "N rows in set (X.YYY sec)" footer). +# Exit non-zero on error. +set -e + +PASSWORD=${MCS_PASSWORD:-tsFgm457%3cj} +query=$(cat) + +# -vvv makes mariadb echo "N rows in set (T sec)" for every statement. +raw=$(mariadb --skip-ssl --password="$PASSWORD" --host 127.0.0.1 -vvv \ + clickbench -e "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR\b'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +# Stdout: drop the timing/footer lines, keep the actual result body. +printf '%s\n' "$raw" | grep -vE '^[0-9]+ rows? in set|^Empty set|^Bye$|^Reading table information' + +# "N rows in set (M min S sec)" — convert to seconds. 
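+# e.g. "(1 min 2.34 sec)" -> 62.340, "(0.05 sec)" -> 0.050.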
+secs=$(printf '%s\n' "$raw" \ + | grep -oP '\(\K(?:[0-9.]+\s+min\s+)?[0-9.]+\s+sec\)' \ + | tail -n1 | sed 's/)$//') +if [ -z "$secs" ]; then + secs=$(printf '%s\n' "$raw" | grep -oP '^Empty set \(\K[0-9.]+\s+sec\)' | tail -n1 | sed 's/)$//') +fi +if [ -z "$secs" ]; then + echo "no timing in mariadb output" >&2 + exit 1 +fi + +awk -v s="$secs" ' +BEGIN { + n = split(s, a, /\s+/) + if (n >= 4 && a[2] == "min") { printf "%.3f\n", a[1] * 60 + a[3] } + else if (n >= 2) { printf "%.3f\n", a[1] } +}' >&2 diff --git a/mariadb-columnstore/results/20260510/c6a.4xlarge.json b/mariadb-columnstore/results/20260510/c6a.4xlarge.json new file mode 100644 index 0000000000..73fc79d4c8 --- /dev/null +++ b/mariadb-columnstore/results/20260510/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "MariaDB ColumnStore", + "date": "2026-05-10", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","MySQL compatible","lukewarm-cold-run"], + "load_time": 410, + "data_size": 19724847775, + "result": [ + [1.967, 0.531, 1.545], + [0.729, 0.277, 0.284], + [7.897, 7.252, 7.114], + [5.144, 3.738, 3.731], + [7.086, 7.828, 6.751], + [12.725, 10.89, 10.954], + [3.418, 2.799, 4.838], + [0.774, 0.31, 0.328], + [9.59, 8.894, 8.962], + [16.539, 15.208, 15.311], + [4.284, 2.612, 2.711], + [4.42, 2.699, 2.702], + [9.434, 8.6, 7.61], + [16.559, 15.056, 14.316], + [10.931, 7.912, 8.972], + [8.089, 8.704, 7.659], + [19.686, 18.668, 18.687], + [16.638, 16.348, 16.471], + [null, null, null], + [1.12, 0.183, 0.2], + [30.246, 19.283, 19.319], + [19.617, 6.997, 8.001], + [30.377, 27.467, 27.487], + [72.009, 66.32, 66.685], + [7.859, 5.087, 5.092], + [6.806, 5.054, 5.045], + [7.779, 6.108, 5.054], + [32.865, 21.952, 22.025], + [null, null, null], + [376.443, 381.958, 380.017], + [9.148, 6.541, 6.536], + [11.042, 6.912, 6.918], + [null, null, null], + [33.873, 28.439, 29.178], + [35.122, 28.872, 28.652], + [9.281, 9.92, 8.813], + [3.573, 0.889, 0.906], + [3.892, 1.828, 1.839], + [0.552, 0.149, 0.134], + [1.122, 0.611, 0.614], + [0.407, 1.103, 0.1], + [0.408, 0.122, 0.101], + [0.586, 0.303, 0.299] +] +} + diff --git a/mariadb-columnstore/run.sh b/mariadb-columnstore/run.sh deleted file mode 100755 index bf3cd952ef..0000000000 --- a/mariadb-columnstore/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - mysql --password="${PASSWORD}" --host 127.0.0.1 -vvv clickbench -e "${query}" - done; -done; diff --git a/mariadb-columnstore/start b/mariadb-columnstore/start new file mode 100755 index 0000000000..805e55c139 --- /dev/null +++ b/mariadb-columnstore/start @@ -0,0 +1,42 @@ +#!/bin/bash +set -eu + +PASSWORD=${MCS_PASSWORD:-tsFgm457%3cj} + +# Idempotent: if mariadb on :3306 is already responsive, nothing to do. +if mariadb --skip-ssl --password="$PASSWORD" --host 127.0.0.1 -e 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +if ! sudo docker ps -a --format '{{.Names}}' | grep -qx mcs1; then + sudo docker run -d -p 3306:3306 \ + --shm-size=512m \ + -e PM1=mcs1 \ + --hostname=mcs1 \ + --name mcs1 \ + mariadb/columnstore >/dev/null + # Wait for mariadb-internal to come up enough for the provisioner. 
+ for _ in $(seq 1 60); do + sudo docker exec mcs1 mariadb -e 'SELECT 1' >/dev/null 2>&1 && break + sleep 1 + done + sudo docker exec mcs1 provision mcs1 +else + sudo docker start mcs1 >/dev/null +fi + +# Wait for the server, then grant the running user a password so the rest +# of the per-system scripts can talk to it without sudo'ing into the +# container. GRANT must hit a working server, so retry briefly. +for _ in $(seq 1 60); do + sudo docker exec mcs1 mariadb -e \ + "GRANT ALL PRIVILEGES ON *.* TO '$(whoami)'@'%' IDENTIFIED BY '${PASSWORD}'; FLUSH PRIVILEGES;" \ + >/dev/null 2>&1 && break + sleep 1 +done + +# Make sure we can actually log in. +for _ in $(seq 1 60); do + mariadb --skip-ssl --password="$PASSWORD" --host 127.0.0.1 -e 'SELECT 1' >/dev/null 2>&1 && exit 0 + sleep 1 +done diff --git a/mariadb-columnstore/stop b/mariadb-columnstore/stop new file mode 100755 index 0000000000..4898ac158a --- /dev/null +++ b/mariadb-columnstore/stop @@ -0,0 +1,2 @@ +#!/bin/bash +sudo docker stop mcs1 >/dev/null 2>&1 || true diff --git a/mariadb/benchmark.sh b/mariadb/benchmark.sh index f95cc108e5..531bd65038 100755 --- a/mariadb/benchmark.sh +++ b/mariadb/benchmark.sh @@ -1,34 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y mariadb-server -sudo bash -c "echo -e '[mysql]\nlocal-infile=1\n\n[mysqld]\nlocal-infile=1\n' > /etc/mysql/conf.d/local_infile.cnf" - -# size innodb buffer based on available RAM -# use 75% of total -sudo bash -c "awk '/MemTotal/ { printf \"innodb_buffer_pool_size=%.0fG \n\", \$2*0.75/1024/1024 }' /proc/meminfo > /etc/mysql/buffer.conf" - -sudo service mariadb restart - -# Load the data - -../download-hits-tsv - -sudo mariadb -e "CREATE DATABASE test" -sudo mariadb test < create.sql - -echo -n "Load time: " -command time -f '%e' split -l 10000 --filter="sudo mariadb test -e \"SET sql_log_bin = 0; LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE hits;\"" hits.tsv - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo mariadb test -e "SELECT data_length + index_length FROM information_schema.TABLES WHERE table_schema = 'test' AND table_name = 'hits';" | tail -n1 - -cat log.txt | - grep -P 'rows? in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) days? )?(([0-9.]+) hours? )?(([0-9.]+) min )?([0-9.]+) sec\).*?$/\2,\4,\6,\7/' | - awk -F, '{ if ($1 == "null") { print } else { print $1 * 86400 + $2 * 3600 + $3 * 60 + $4 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/mariadb/check b/mariadb/check new file mode 100755 index 0000000000..27dd1cebb4 --- /dev/null +++ b/mariadb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo mariadb -e "SELECT 1" >/dev/null diff --git a/mariadb/data-size b/mariadb/data-size new file mode 100755 index 0000000000..c9319de574 --- /dev/null +++ b/mariadb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +sudo mariadb test -N -e "SELECT data_length + index_length FROM information_schema.TABLES WHERE table_schema = 'test' AND table_name = 'hits';" diff --git a/mariadb/install b/mariadb/install new file mode 100755 index 0000000000..3f080f7947 --- /dev/null +++ b/mariadb/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y mariadb-server + +sudo bash -c "echo -e '[mysql]\nlocal-infile=1\n\n[mysqld]\nlocal-infile=1\n' > /etc/mysql/conf.d/local_infile.cnf" + +# Size innodb buffer based on available RAM — use 75% of total. +sudo bash -c "awk '/MemTotal/ { printf \"innodb_buffer_pool_size=%.0fG \n\", \$2*0.75/1024/1024 }' /proc/meminfo > /etc/mysql/buffer.conf" + +sudo service mariadb restart diff --git a/mariadb/load b/mariadb/load new file mode 100755 index 0000000000..0f23f43c14 --- /dev/null +++ b/mariadb/load @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +sudo mariadb -e "DROP DATABASE IF EXISTS test" +sudo mariadb -e "CREATE DATABASE test" +sudo mariadb test < create.sql + +# Stream-load in chunks of 10000 lines (the original benchmark approach). +split -l 10000 --filter="sudo mariadb test -e \"SET sql_log_bin = 0; LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE hits;\"" hits.tsv + +rm -f hits.tsv +sync diff --git a/mariadb/query b/mariadb/query new file mode 100755 index 0000000000..eeb841fd17 --- /dev/null +++ b/mariadb/query @@ -0,0 +1,38 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via `mariadb -vvv` against the `test` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# mariadb's "N rows in set (X.XX sec)" footer; days/hours/min/sec all handled). +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(sudo mariadb test -vvv -e "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$out" | grep -q '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" + +# mariadb may print "(2 days 3 hours 4 min 5.6 sec)" or any subset. +parsed=$(printf '%s\n' "$out" \ + | grep -P 'rows? in set|Empty set|Query OK' \ + | tail -n1 \ + | sed -r 's/^.*?\((([0-9.]+) days? )?(([0-9.]+) hours? )?(([0-9.]+) min )?([0-9.]+) sec\).*?$/\2,\4,\6,\7/') + +if [ -z "$parsed" ]; then + echo "no timing in mariadb output" >&2 + exit 1 +fi + +awk -F, -v p="$parsed" 'BEGIN { + n = split(p, a, ",") + d = (a[1] == "") ? 0 : a[1] + h = (a[2] == "") ? 0 : a[2] + m = (a[3] == "") ? 0 : a[3] + s = (a[4] == "") ? 
0 : a[4] + printf "%.3f\n", d * 86400 + h * 3600 + m * 60 + s +}' >&2 diff --git a/mariadb/run.sh b/mariadb/run.sh deleted file mode 100755 index 7294b21585..0000000000 --- a/mariadb/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - sudo mariadb test -vvv -e "${query}" - done; -done; diff --git a/mariadb/start b/mariadb/start new file mode 100755 index 0000000000..7c7acd41c9 --- /dev/null +++ b/mariadb/start @@ -0,0 +1,7 @@ +#!/bin/bash +set -eu + +if sudo mariadb -e "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi +sudo service mariadb start diff --git a/mariadb/stop b/mariadb/stop new file mode 100755 index 0000000000..bfaaeb9f86 --- /dev/null +++ b/mariadb/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo service mariadb stop || true diff --git a/monetdb/benchmark.sh b/monetdb/benchmark.sh index 5074349510..531bd65038 100755 --- a/monetdb/benchmark.sh +++ b/monetdb/benchmark.sh @@ -1,41 +1,5 @@ #!/bin/bash - -# Install - -echo "deb https://dev.monetdb.org/downloads/deb/ $(lsb_release -cs) monetdb" | sudo tee /etc/apt/sources.list.d/monetdb.list - -sudo wget --output-document=/etc/apt/trusted.gpg.d/monetdb.gpg https://www.monetdb.org/downloads/MonetDB-GPG-KEY.gpg -sudo apt-get update -y -sudo apt-get install -y monetdb5-sql monetdb-client dos2unix net-tools - -sudo monetdbd create /var/lib/monetdb -sudo usermod -a -G monetdb $USER - -for _ in {1..300} -do - sudo monetdb create test && break - sleep 1 -done -sudo monetdb release test - -sudo apt-get install -y expect - -./query.expect "$(cat create.sql)" - -../download-hits-tsv -chmod 777 ~ hits.tsv - -echo -n "Load time: " -command time -f '%e' ./query.expect "COPY INTO hits FROM '$(pwd)/hits.tsv' USING DELIMITERS '\t'" - -# 99997497 affected rows -# clk: 15:39 min - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo du -bcs /var/monetdb5/ | grep total - -cat log.txt | dos2unix -f | grep -P 'clk|tuple' | - awk '/tuple/ { ok = 1 } /clk/ { if (ok) { if ($3 == "ms") { print $2 / 1000 } else { print $2 } } else { print "null" }; ok = 0 }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/monetdb/check b/monetdb/check new file mode 100755 index 0000000000..402d268684 --- /dev/null +++ b/monetdb/check @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# `-P monetdb` rather than relying on ~/.monetdb so the check works even +# when HOME isn't set (cloud-init runs without one) and mclient can't +# find the dotfile. The default password for the bundled monetdb user +# is "monetdb"; install/load/start use the same. +mclient -u monetdb -P monetdb -d test -s 'SELECT 1' >/dev/null 2>&1 diff --git a/monetdb/data-size b/monetdb/data-size new file mode 100755 index 0000000000..1ac01632c3 --- /dev/null +++ b/monetdb/data-size @@ -0,0 +1,7 @@ +#!/bin/bash +set -eu + +# install creates the dbfarm under /var/lib/monetdb (the package's +# default), not the older /var/monetdb5/ path the original benchmark +# assumed. 
+sudo du -bcs /var/lib/monetdb/ | grep total | awk '{print $1}' diff --git a/monetdb/install b/monetdb/install new file mode 100755 index 0000000000..c4180e7e2a --- /dev/null +++ b/monetdb/install @@ -0,0 +1,38 @@ +#!/bin/bash +set -eu + +echo "deb https://dev.monetdb.org/downloads/deb/ $(lsb_release -cs) monetdb" \ + | sudo tee /etc/apt/sources.list.d/monetdb.list + +sudo wget --output-document=/etc/apt/trusted.gpg.d/monetdb.gpg \ + https://www.monetdb.org/downloads/MonetDB-GPG-KEY.gpg + +sudo apt-get update -y +sudo apt-get install -y monetdb5-sql monetdb-client dos2unix net-tools expect + +# `monetdb5-sql` already creates /var/lib/monetdb (as the monetdb user's +# home dir), so the previous dir-exists guard used to skip `monetdbd +# create` and leave the dbfarm uninitialized. The dbfarm marker file +# isn't reliable across MonetDB versions either, so just try create +# and start unconditionally — both error harmlessly when already done. +sudo monetdbd create /var/lib/monetdb 2>/dev/null || true +sudo monetdbd start /var/lib/monetdb 2>/dev/null || true +sudo usermod -a -G monetdb "$USER" + +# `mclient` looks for credentials in ~/.monetdb (user= / password=). +# Without it the default `monetdb` user gets an interactive password +# prompt, which times out the check loop. Stamp the config now so +# every later mclient call (check, data-size if it uses mclient) just +# works. +sudo install -m 0600 /dev/null /root/.monetdb +sudo tee /root/.monetdb >/dev/null <<'EOF' +user=monetdb +password=monetdb +EOF + +# monetdbd takes a moment to come up; retry creating the test DB. +for _ in {1..300}; do + if sudo monetdb create test 2>/dev/null; then break; fi + sleep 1 +done +sudo monetdb release test || true diff --git a/monetdb/load b/monetdb/load new file mode 100755 index 0000000000..547d3d0f85 --- /dev/null +++ b/monetdb/load @@ -0,0 +1,16 @@ +#!/bin/bash +set -eu + +# Drop and recreate to make idempotent. +sudo monetdb stop test 2>/dev/null || true +sudo monetdb destroy -f test 2>/dev/null || true +sudo monetdb create test +sudo monetdb release test + +chmod 777 ~ hits.tsv + +./query.expect "$(cat create.sql)" +./query.expect "COPY INTO hits FROM '$(pwd)/hits.tsv' USING DELIMITERS '\t'" + +rm -f hits.tsv +sync diff --git a/monetdb/query b/monetdb/query new file mode 100755 index 0000000000..ba61abb712 --- /dev/null +++ b/monetdb/query @@ -0,0 +1,36 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via the expect wrapper around mclient. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# mclient `\t clock` output: "clk: ms" or "clk: s"). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(./query.expect "$query" 2>&1) && exit_code=0 || exit_code=$? + +# mclient may print errors but exit 0 via expect; sniff for them. +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^!|sql:.*error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +# Strip CR (mclient is in expect/PTY mode), pass result to stdout. +clean=$(printf '%s\n' "$raw" | dos2unix -f 2>/dev/null || printf '%s\n' "$raw") +printf '%s\n' "$clean" + +# Parse the LAST `clk:` line into seconds. 
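+# e.g. "clk: 481.337 ms" -> 0.481, "clk: 2.5 sec" -> 2.500.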
+timing=$(printf '%s\n' "$clean" | grep -E '^clk:' | tail -n1) +if [ -z "$timing" ]; then + echo "no clk timing in monetdb output" >&2 + exit 1 +fi + +awk -v s="$timing" 'BEGIN { + n = split(s, a, " ") + val = a[2]; unit = a[3] + if (unit ~ /ms/) { printf "%.3f\n", val / 1000 } + else if (unit ~ /s/) { printf "%.3f\n", val } + else { printf "%.3f\n", val } +}' >&2 diff --git a/monetdb/run.sh b/monetdb/run.sh deleted file mode 100755 index 57a1a5cbff..0000000000 --- a/monetdb/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - ./query.expect "$query" 2>&1 - done; -done; diff --git a/monetdb/start b/monetdb/start new file mode 100755 index 0000000000..310153dd7d --- /dev/null +++ b/monetdb/start @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +# Idempotent: if mserver is already serving the test DB, do nothing. +# Pass `-P monetdb` so this works regardless of $HOME / .monetdb. +if mclient -u monetdb -P monetdb -d test -s 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +sudo monetdbd start /var/lib/monetdb || true + +# Make sure the database is released (online). +sudo monetdb release test || true diff --git a/monetdb/stop b/monetdb/stop new file mode 100755 index 0000000000..61f019fc21 --- /dev/null +++ b/monetdb/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo monetdbd stop /var/lib/monetdb 2>/dev/null || true diff --git a/mongodb/benchmark.sh b/mongodb/benchmark.sh index 11db23dcc5..ef00681cb7 100755 --- a/mongodb/benchmark.sh +++ b/mongodb/benchmark.sh @@ -1,88 +1,7 @@ -#!/bin/bash -e - -# https://www.mongodb.com/docs/manual/tutorial/install-mongodb-on-ubuntu/ - -sudo apt-get update -y -sudo apt-get install -y gnupg curl - -curl -fsSL https://www.mongodb.org/static/pgp/server-8.0.asc | \ - sudo gpg -o /usr/share/keyrings/mongodb-server-8.0.gpg \ - --dearmor - -source /etc/lsb-release -echo "deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-8.0.gpg ] https://repo.mongodb.org/apt/ubuntu ${DISTRIB_CODENAME}/mongodb-org/8.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-8.0.list - -sudo apt-get update -y -sudo apt-get install -y mongodb-org -sudo systemctl start mongod -sudo systemctl status mongod - -for _ in {1..300} -do - mongosh --quiet --eval "db.runCommand('ping').ok" && break - sleep 1 -done - -################################# -# set params `internalQueryPlannerGenerateCoveredWholeIndexScans` to true because we know that collscan is -# always bad. Decision about enabling should be made if collection data couldn't fit to RAM. -# NOTE: This option is reset to default on restart until it saved in mongo config file. -# Don't forget to set again if mongo restart needed or crashes happened while queries run and -# you want to continue theirs execution. 
-time mongosh --quiet --eval 'db.adminCommand({setParameter: 1,"internalQueryPlannerGenerateCoveredWholeIndexScans": true});' - - -################################# -# Create the indexes before import data because each index is reread all data -# Q6 -time mongosh --quiet --eval 'db.hits.createIndex({"EventDate": 1});' -# Q10, Q11 -time mongosh --quiet --eval 'db.hits.createIndex({"MobilePhoneModel": 1});' -# Q28 -time mongosh --quiet --eval 'db.hits.createIndex({"Referer": 1});' -# Q40 -time mongosh --quiet --eval 'db.hits.createIndex({"RefererHash": 1});' -# Q41 -time mongosh --quiet --eval 'db.hits.createIndex({"URLHash": 1});' -# Q3, Q4, Q15, Q19 -time mongosh --quiet --eval 'db.hits.createIndex({"UserID": 1});' -# Q1, Q2, Q7, Q9 -time mongosh --quiet --eval 'db.hits.createIndex({"AdvEngineID": 1, "ResolutionWidth": 1, "RegionID": 1});' -# Q8 -time mongosh --quiet --eval 'db.hits.createIndex({"RegionID": 1, "UserID": 1});' -# Q5, Q12, Q14, Q24, Q30, Q31 -time mongosh --quiet --eval 'db.hits.createIndex({"SearchPhrase": 1, "SearchEngineID": 1});' -# Q13, Q16, Q17, Q18, Q26 -time mongosh --quiet --eval 'db.hits.createIndex({"SearchPhrase": 1, "UserID": 1, "EventTime": 1});' -# Q21, Q22 -time mongosh --quiet --eval 'db.hits.createIndex({"SearchPhrase": 1, "URL": 1, "Title": 1});' -# Q38, Q39 -time mongosh --quiet --eval 'db.hits.createIndex({"CounterID": 1, "EventDate": 1, "URL": 1});' -# Q36, Q37, Q42 -time mongosh --quiet --eval 'db.hits.createIndex({"CounterID": 1, "IsRefresh": 1, "EventDate": 1});' -# Q20, Q23, Q27, Q33, Q34 -time mongosh --quiet --eval 'db.hits.createIndex({"URL": 1, "CounterID": 1 });' -# Q29, Q32, Q35 -time mongosh --quiet --eval 'db.hits.createIndex({"ClientIP": 1, "WatchID": 1, "ResolutionWidth": 1, "IsRefresh": 1});' - - -################################# -# Load data and import -../download-hits-tsv - -# Use mongo import to load data into mongo. By default numInsertionWorkers is 1 so change to half of VM where it would be run -#time mongoimport --collection hits --type tsv hits.tsv --fieldFile=create.txt --columnsHaveTypes --numInsertionWorkers=8 - -# But on the AWS c6a.4xlarge machines, parallel import is slower than single-threaded, so we choose the single-threaded import. -echo -n "Load time: " -command time -f '%e' mongoimport --collection hits --type tsv hits.tsv --fieldFile=create.txt --columnsHaveTypes - -echo -n "Data size: " -sudo du -bcs /var/lib/mongodb/ | grep total -# total size: 82937405440 (77.2 Gb) -# indexes size: 38326390784 (35.6 Gb) // heh, so much but indexes should be -# storage size: 44610863104 (41.5 Gb) - -# MongoDB does not support SQL in self-hosted option. Only with MongoDB Atlas service. - -time mongosh --quiet ./run.js > result.json +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. MongoDB uses +# aggregation pipelines (queries.txt, EJSON one-per-line) rather than SQL. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +export BENCH_QUERIES_FILE="queries.txt" +exec ../lib/benchmark-common.sh diff --git a/mongodb/check b/mongodb/check new file mode 100755 index 0000000000..dae52ba30e --- /dev/null +++ b/mongodb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +mongosh --quiet --eval "db.runCommand('ping').ok" >/dev/null diff --git a/mongodb/data-size b/mongodb/data-size new file mode 100755 index 0000000000..59e468b760 --- /dev/null +++ b/mongodb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo du -bcs /var/lib/mongodb/ | grep total | awk '{print $1}' diff --git a/mongodb/formatResult.js b/mongodb/formatResult.js deleted file mode 100644 index f1f647ea0b..0000000000 --- a/mongodb/formatResult.js +++ /dev/null @@ -1,27 +0,0 @@ -// runs with node -const fs = require("fs"); -const inputFile = process.argv[2]; -const inputContent = fs.readFileSync(inputFile, "utf-8"); -const res = {}; -inputContent.split(/\r?\n/).forEach((line) => { - if (line.length == 0) { - return; - } - parsed = JSON.parse(line); - res[parsed.q + "_" + parsed.it] = parsed.ok == 1 ? parsed.t / 1000.0 : null; -}); -console.log("["); -for (let i = 0; i < 43; ++i) { - delim = i == 42 ? "" : ","; - line = - "[" + - res[i + "_0"] + - "," + - res[i + "_1"] + - "," + - res[i + "_2"] + - "]" + - delim; - console.log(line); -} -console.log("]"); diff --git a/mongodb/install b/mongodb/install new file mode 100755 index 0000000000..bbe3bc7ba2 --- /dev/null +++ b/mongodb/install @@ -0,0 +1,21 @@ +#!/bin/bash +# Install MongoDB and the mongosh shell. +# https://www.mongodb.com/docs/manual/tutorial/install-mongodb-on-ubuntu/ +set -e + +if command -v mongod >/dev/null 2>&1 && command -v mongosh >/dev/null 2>&1; then + exit 0 +fi + +sudo apt-get update -y +sudo apt-get install -y gnupg curl + +curl -fsSL https://www.mongodb.org/static/pgp/server-8.0.asc | \ + sudo gpg -o /usr/share/keyrings/mongodb-server-8.0.gpg --dearmor + +# shellcheck disable=SC1091 +source /etc/lsb-release +echo "deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-8.0.gpg ] https://repo.mongodb.org/apt/ubuntu ${DISTRIB_CODENAME}/mongodb-org/8.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-8.0.list + +sudo apt-get update -y +sudo apt-get install -y mongodb-org diff --git a/mongodb/load b/mongodb/load new file mode 100755 index 0000000000..f63714e2c4 --- /dev/null +++ b/mongodb/load @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +# Indexes are created BEFORE import so each document is indexed once during +# load (re-indexing after the fact re-reads everything). The set of indexes +# was selected per-query in the original benchmark.sh. 
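+# (e.g. EventDate serves Q6 and CounterID+EventDate+URL serves Q38/Q39 — see the
+# per-query annotations in the original benchmark.sh for the full mapping.)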
+mongosh --quiet test <<'EOF' +db.hits.createIndex({"EventDate": 1}); +db.hits.createIndex({"MobilePhoneModel": 1}); +db.hits.createIndex({"Referer": 1}); +db.hits.createIndex({"RefererHash": 1}); +db.hits.createIndex({"URLHash": 1}); +db.hits.createIndex({"UserID": 1}); +db.hits.createIndex({"AdvEngineID": 1, "ResolutionWidth": 1, "RegionID": 1}); +db.hits.createIndex({"RegionID": 1, "UserID": 1}); +db.hits.createIndex({"SearchPhrase": 1, "SearchEngineID": 1}); +db.hits.createIndex({"SearchPhrase": 1, "UserID": 1, "EventTime": 1}); +db.hits.createIndex({"SearchPhrase": 1, "URL": 1, "Title": 1}); +db.hits.createIndex({"CounterID": 1, "EventDate": 1, "URL": 1}); +db.hits.createIndex({"CounterID": 1, "IsRefresh": 1, "EventDate": 1}); +db.hits.createIndex({"URL": 1, "CounterID": 1}); +db.hits.createIndex({"ClientIP": 1, "WatchID": 1, "ResolutionWidth": 1, "IsRefresh": 1}); +EOF + +# Single-threaded import is faster on c6a.4xlarge per the original benchmark.sh. +mongoimport --db test --collection hits --type tsv hits.tsv \ + --fieldFile=create.txt --columnsHaveTypes + +rm -f hits.tsv +sync diff --git a/mongodb/queries.txt b/mongodb/queries.txt new file mode 100644 index 0000000000..915f13421d --- /dev/null +++ b/mongodb/queries.txt @@ -0,0 +1,43 @@ +[{"$project":{"_id":1}},{"$count":"c"}] +[{"$match":{"AdvEngineID":{"$ne":0}}},{"$count":"c"}] +[{"$group":{"_id":null,"sum_AdvEngineID":{"$sum":"$AdvEngineID"},"c":{"$sum":1},"avg_ResolutionWidth":{"$avg":"$ResolutionWidth"}}}] +[{"$group":{"_id":null,"a":{"$avg":{"$toDecimal":"$UserID"}}}}] +[{"$group":{"_id":"$UserID"}},{"$count":"c"}] +[{"$group":{"_id":"$SearchPhrase"}},{"$count":"c"}] +[{"$sort":{"EventDate":1}},{"$limit":1},{"$unionWith":{"coll":"hits","pipeline":[{"$sort":{"EventDate":-1}},{"$limit":1}]}},{"$group":{"_id":null,"tmpArray":{"$push":"$EventDate"}}},{"$project":{"min":{"$arrayElemAt":["$tmpArray",0]},"max":{"$arrayElemAt":["$tmpArray",1]}}}] +[{"$match":{"AdvEngineID":{"$ne":0}}},{"$group":{"_id":"$AdvEngineID","c":{"$sum":1}}},{"$sort":{"c":-1}}] +[{"$group":{"_id":{"RegionID":"$RegionID","UserID":"$UserID"}}},{"$group":{"_id":"$_id.RegionID","u":{"$sum":1}}},{"$sort":{"u":-1}},{"$limit":10}] +[{"$group":{"_id":"$RegionID","sum_AdvEngineID":{"$sum":"$AdvEngineID"},"avg_ResolutionWidth":{"$avg":"$ResolutionWidth"},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10},{"$lookup":{"from":"hits","let":{"regionIdVar":"$_id"},"pipeline":[{"$match":{"$expr":{"$eq":["$RegionID","$$regionIdVar"]}}},{"$group":{"_id":"$UserID"}},{"$count":"c"}],"as":"count_distinct_UserID"}},{"$set":{"count_distinct_UserID":{"$arrayElemAt":["$count_distinct_UserID.c",0]}}}] +[{"$match":{"MobilePhoneModel":{"$ne":""}}},{"$group":{"_id":{"MobilePhoneModel":"$MobilePhoneModel","UserID":"$UserID"}}},{"$group":{"_id":"$_id.MobilePhoneModel","u":{"$sum":1}}},{"$sort":{"u":-1}},{"$limit":10}] +[{"$match":{"MobilePhoneModel":{"$ne":""}}},{"$group":{"_id":{"MobilePhone":"$MobilePhone","MobilePhoneModel":"$MobilePhoneModel","UserID":"$UserID"}}},{"$group":{"_id":{"MobilePhone":"$_id.MobilePhone","MobilePhoneModel":"$_id.MobilePhoneModel"},"u":{"$sum":1}}},{"$sort":{"u":-1}},{"$limit":10}] +[{"$match":{"SearchPhrase":{"$ne":""}}},{"$group":{"_id":"$SearchPhrase","c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$match":{"SearchPhrase":{"$ne":""}}},{"$group":{"_id":{"SearchPhrase":"$SearchPhrase","UserID":"$UserID"}}},{"$group":{"_id":"$_id.SearchPhrase","u":{"$sum":1}}},{"$sort":{"u":-1}},{"$limit":10}] 
+[{"$match":{"SearchPhrase":{"$ne":""}}},{"$group":{"_id":{"$concat":["$SearchPhrase","|",{"$toString":"$SearchEngineID"}]},"SearchPhrase":{"$first":"$SearchPhrase"},"SearchEngineID":{"$first":"$SearchEngineID"},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$group":{"_id":"$UserID","c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$group":{"_id":{"$concat":["$SearchPhrase","|",{"$toString":"$UserID"}]},"SearchPhrase":{"$first":"$SearchPhrase"},"UserID":{"$first":"$UserID"},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$group":{"_id":{"$concat":["$SearchPhrase","|",{"$toString":"$UserID"}]},"SearchPhrase":{"$first":"$SearchPhrase"},"UserID":{"$first":"$UserID"},"c":{"$sum":1}}},{"$limit":10}] +[{"$group":{"_id":{"UserID":"$UserID","SearchPhrase":"$SearchPhrase","m":{"$minute":"$EventTime"}},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$match":{"UserID":{"$numberLong":"435090932899640449"}}},{"$project":{"UserID":1}}] +[{"$match":{"URL":{}}},{"$count":"c"}] +[{"$match":{"URL":{},"SearchPhrase":{"$ne":""}}},{"$group":{"_id":"$SearchPhrase","min_URL":{"$min":"$URL"},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$match":{"Title":{},"URL":{"$not":{}},"SearchPhrase":{"$ne":""}}},{"$group":{"_id":"$SearchPhrase","count_distinct_UserID":{"$addToSet":"$UserID"},"min_Title":{"$min":"$Title"},"min_URL":{"$min":"$URL"},"c":{"$sum":1}}},{"$set":{"count_distinct_UserID":{"$size":"$count_distinct_UserID"}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$match":{"URL":{}}},{"$sort":{"EventTime":1}},{"$limit":10}] +[{"$match":{"SearchPhrase":{"$ne":""}}},{"$project":{"_id":0,"SearchPhrase":1}},{"$sort":{"EventTime":1}},{"$limit":10}] +[{"$match":{"SearchPhrase":{"$ne":""}}},{"$sort":{"SearchPhrase":1}},{"$project":{"SearchPhrase":1}},{"$limit":10}] +[{"$match":{"SearchPhrase":{"$ne":""}}},{"$project":{"_id":0,"EventTime":1,"SearchPhrase":1}},{"$sort":{"EventTime":1,"SearchPhrase":1}},{"$limit":10}] +[{"$match":{"URL":{"$ne":""}}},{"$group":{"_id":"$CounterID","l":{"$avg":{"$strLenBytes":"$URL"}},"c":{"$sum":1}}},{"$match":{"c":{"$gt":100000}}},{"$sort":{"l":-1}},{"$limit":25}] +[{"$match":{"Referer":{"$ne":""}}},{"$project":{"_id":0,"Referer":1}},{"$set":{"k":{"$regexFind":{"input":"$Referer","regex":"^https?://(?:www.)?([^/]+)/.*$"}}}},{"$group":{"_id":{"$ifNull":[{"$first":"$k.captures"},"$Referer"]},"l":{"$avg":{"$strLenBytes":"$Referer"}},"c":{"$sum":1}}},{"$match":{"c":{"$gt":100000}}},{"$sort":{"l":-1}},{"$limit":25}] 
+[{"$project":{"_id":0,"ResolutionWidth":{"$toLong":"$ResolutionWidth"}}},{"$group":{"_id":null,"srw_plus_0":{"$sum":{"$add":["$ResolutionWidth",0]}},"srw_plus_1":{"$sum":{"$add":["$ResolutionWidth",1]}},"srw_plus_2":{"$sum":{"$add":["$ResolutionWidth",2]}},"srw_plus_3":{"$sum":{"$add":["$ResolutionWidth",3]}},"srw_plus_4":{"$sum":{"$add":["$ResolutionWidth",4]}},"srw_plus_5":{"$sum":{"$add":["$ResolutionWidth",5]}},"srw_plus_6":{"$sum":{"$add":["$ResolutionWidth",6]}},"srw_plus_7":{"$sum":{"$add":["$ResolutionWidth",7]}},"srw_plus_8":{"$sum":{"$add":["$ResolutionWidth",8]}},"srw_plus_9":{"$sum":{"$add":["$ResolutionWidth",9]}},"srw_plus_10":{"$sum":{"$add":["$ResolutionWidth",10]}},"srw_plus_11":{"$sum":{"$add":["$ResolutionWidth",11]}},"srw_plus_12":{"$sum":{"$add":["$ResolutionWidth",12]}},"srw_plus_13":{"$sum":{"$add":["$ResolutionWidth",13]}},"srw_plus_14":{"$sum":{"$add":["$ResolutionWidth",14]}},"srw_plus_15":{"$sum":{"$add":["$ResolutionWidth",15]}},"srw_plus_16":{"$sum":{"$add":["$ResolutionWidth",16]}},"srw_plus_17":{"$sum":{"$add":["$ResolutionWidth",17]}},"srw_plus_18":{"$sum":{"$add":["$ResolutionWidth",18]}},"srw_plus_19":{"$sum":{"$add":["$ResolutionWidth",19]}},"srw_plus_20":{"$sum":{"$add":["$ResolutionWidth",20]}},"srw_plus_21":{"$sum":{"$add":["$ResolutionWidth",21]}},"srw_plus_22":{"$sum":{"$add":["$ResolutionWidth",22]}},"srw_plus_23":{"$sum":{"$add":["$ResolutionWidth",23]}},"srw_plus_24":{"$sum":{"$add":["$ResolutionWidth",24]}},"srw_plus_25":{"$sum":{"$add":["$ResolutionWidth",25]}},"srw_plus_26":{"$sum":{"$add":["$ResolutionWidth",26]}},"srw_plus_27":{"$sum":{"$add":["$ResolutionWidth",27]}},"srw_plus_28":{"$sum":{"$add":["$ResolutionWidth",28]}},"srw_plus_29":{"$sum":{"$add":["$ResolutionWidth",29]}},"srw_plus_30":{"$sum":{"$add":["$ResolutionWidth",30]}},"srw_plus_31":{"$sum":{"$add":["$ResolutionWidth",31]}},"srw_plus_32":{"$sum":{"$add":["$ResolutionWidth",32]}},"srw_plus_33":{"$sum":{"$add":["$ResolutionWidth",33]}},"srw_plus_34":{"$sum":{"$add":["$ResolutionWidth",34]}},"srw_plus_35":{"$sum":{"$add":["$ResolutionWidth",35]}},"srw_plus_36":{"$sum":{"$add":["$ResolutionWidth",36]}},"srw_plus_37":{"$sum":{"$add":["$ResolutionWidth",37]}},"srw_plus_38":{"$sum":{"$add":["$ResolutionWidth",38]}},"srw_plus_39":{"$sum":{"$add":["$ResolutionWidth",39]}},"srw_plus_40":{"$sum":{"$add":["$ResolutionWidth",40]}},"srw_plus_41":{"$sum":{"$add":["$ResolutionWidth",41]}},"srw_plus_42":{"$sum":{"$add":["$ResolutionWidth",42]}},"srw_plus_43":{"$sum":{"$add":["$ResolutionWidth",43]}},"srw_plus_44":{"$sum":{"$add":["$ResolutionWidth",44]}},"srw_plus_45":{"$sum":{"$add":["$ResolutionWidth",45]}},"srw_plus_46":{"$sum":{"$add":["$ResolutionWidth",46]}},"srw_plus_47":{"$sum":{"$add":["$ResolutionWidth",47]}},"srw_plus_48":{"$sum":{"$add":["$ResolutionWidth",48]}},"srw_plus_49":{"$sum":{"$add":["$ResolutionWidth",49]}},"srw_plus_50":{"$sum":{"$add":["$ResolutionWidth",50]}},"srw_plus_51":{"$sum":{"$add":["$ResolutionWidth",51]}},"srw_plus_52":{"$sum":{"$add":["$ResolutionWidth",52]}},"srw_plus_53":{"$sum":{"$add":["$ResolutionWidth",53]}},"srw_plus_54":{"$sum":{"$add":["$ResolutionWidth",54]}},"srw_plus_55":{"$sum":{"$add":["$ResolutionWidth",55]}},"srw_plus_56":{"$sum":{"$add":["$ResolutionWidth",56]}},"srw_plus_57":{"$sum":{"$add":["$ResolutionWidth",57]}},"srw_plus_58":{"$sum":{"$add":["$ResolutionWidth",58]}},"srw_plus_59":{"$sum":{"$add":["$ResolutionWidth",59]}},"srw_plus_60":{"$sum":{"$add":["$ResolutionWidth",60]}},"srw_plus_61":{"$sum":{"$add":["$ResolutionWidth",61]}},"srw_pl
us_62":{"$sum":{"$add":["$ResolutionWidth",62]}},"srw_plus_63":{"$sum":{"$add":["$ResolutionWidth",63]}},"srw_plus_64":{"$sum":{"$add":["$ResolutionWidth",64]}},"srw_plus_65":{"$sum":{"$add":["$ResolutionWidth",65]}},"srw_plus_66":{"$sum":{"$add":["$ResolutionWidth",66]}},"srw_plus_67":{"$sum":{"$add":["$ResolutionWidth",67]}},"srw_plus_68":{"$sum":{"$add":["$ResolutionWidth",68]}},"srw_plus_69":{"$sum":{"$add":["$ResolutionWidth",69]}},"srw_plus_70":{"$sum":{"$add":["$ResolutionWidth",70]}},"srw_plus_71":{"$sum":{"$add":["$ResolutionWidth",71]}},"srw_plus_72":{"$sum":{"$add":["$ResolutionWidth",72]}},"srw_plus_73":{"$sum":{"$add":["$ResolutionWidth",73]}},"srw_plus_74":{"$sum":{"$add":["$ResolutionWidth",74]}},"srw_plus_75":{"$sum":{"$add":["$ResolutionWidth",75]}},"srw_plus_76":{"$sum":{"$add":["$ResolutionWidth",76]}},"srw_plus_77":{"$sum":{"$add":["$ResolutionWidth",77]}},"srw_plus_78":{"$sum":{"$add":["$ResolutionWidth",78]}},"srw_plus_79":{"$sum":{"$add":["$ResolutionWidth",79]}},"srw_plus_80":{"$sum":{"$add":["$ResolutionWidth",80]}},"srw_plus_81":{"$sum":{"$add":["$ResolutionWidth",81]}},"srw_plus_82":{"$sum":{"$add":["$ResolutionWidth",82]}},"srw_plus_83":{"$sum":{"$add":["$ResolutionWidth",83]}},"srw_plus_84":{"$sum":{"$add":["$ResolutionWidth",84]}},"srw_plus_85":{"$sum":{"$add":["$ResolutionWidth",85]}},"srw_plus_86":{"$sum":{"$add":["$ResolutionWidth",86]}},"srw_plus_87":{"$sum":{"$add":["$ResolutionWidth",87]}},"srw_plus_88":{"$sum":{"$add":["$ResolutionWidth",88]}},"srw_plus_89":{"$sum":{"$add":["$ResolutionWidth",89]}}}}] +[{"$match":{"SearchPhrase":{"$ne":""}}},{"$group":{"_id":{"$concat":[{"$toString":"$SearchEngineID"},"|",{"$toString":"$ClientIP"}]},"SearchEngineID":{"$first":"$SearchEngineID"},"ClientIP":{"$first":"$ClientIP"},"avg_ResolutionWidth":{"$avg":"$ResolutionWidth"},"sum_IsRefresh":{"$sum":"$IsRefresh"},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$match":{"SearchPhrase":{"$ne":""}}},{"$group":{"_id":{"$concat":[{"$toString":"$WatchID"},"|",{"$toString":"$ClientIP"}]},"WatchID":{"$first":"$WatchID"},"ClientIP":{"$first":"$ClientIP"},"avg_ResolutionWidth":{"$avg":"$ResolutionWidth"},"sum_IsRefresh":{"$sum":"$IsRefresh"},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$group":{"_id":{"$concat":[{"$toString":"$ClientIP"},"|",{"$toString":"$WatchID"}]},"WatchID":{"$first":"$WatchID"},"ClientIP":{"$first":"$ClientIP"},"avg_ResolutionWidth":{"$avg":"$ResolutionWidth"},"sum_IsRefresh":{"$sum":"$IsRefresh"},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$group":{"_id":"$URL","c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$group":{"_id":"$URL","c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10},{"$set":{"one":1}}] +[{"$group":{"_id":"$ClientIP","c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10},{"$set":{"ClientIP_0":"$_id","ClientIP_1":{"$add":["$_id",-1]},"ClientIP_2":{"$add":["$_id",-2]},"ClientIP_3":{"$add":["$_id",-3]}}}] +[{"$match":{"CounterID":62,"EventDate":{"$gte":{"$date":"2013-07-01"},"$lte":{"$date":"2013-07-31"}},"DontCountHits":0,"IsRefresh":0,"URL":{"$ne":""}}},{"$group":{"_id":"$URL","pageViews":{"$sum":1}}},{"$sort":{"pageViews":-1}},{"$limit":10}] +[{"$match":{"CounterID":62,"EventDate":{"$gte":{"$date":"2013-07-01"},"$lte":{"$date":"2013-07-31"}},"DontCountHits":0,"IsRefresh":0,"URL":{"$ne":""}}},{"$group":{"_id":"$Title","pageViews":{"$sum":1}}},{"$sort":{"pageViews":-1}},{"$limit":10}] 
+[{"$match":{"CounterID":62,"EventDate":{"$gte":{"$date":"2013-07-01"},"$lte":{"$date":"2013-07-31"}},"IsRefresh":0,"IsLink":{"$ne":0},"IsDownload":0,"URL":{"$ne":""}}},{"$group":{"_id":"$Title","pageViews":{"$sum":1}}},{"$sort":{"pageViews":-1}},{"$skip":1000},{"$limit":10}] +[{"$match":{"CounterID":62,"EventDate":{"$gte":{"$date":"2013-07-01"},"$lte":{"$date":"2013-07-31"}},"IsRefresh":0}},{"$set":{"Src":{"$cond":{"if":{"$and":[{"$eq":["$SearchEngineID",0]},{"$eq":["$AdvEngineID",0]}]},"then":"$Referer","else":""}},"Dst":"$URL"}},{"$group":{"_id":{"TraficSourceID":"$TraficSourceID","SearchEngineID":"$SearchEngineID","AdvEngineID":"$AdvEngineID","Src":"$Src","Dst":"$Dst"},"pageViews":{"$sum":1}}},{"$sort":{"pageViews":-1}},{"$skip":1000},{"$limit":10}] +[{"$match":{"CounterID":62,"EventDate":{"$gte":{"$date":"2013-07-01"},"$lte":{"$date":"2013-07-31"}},"IsRefresh":0,"TraficSourceID":{"$in":[-1,6]},"RefererHash":{"$numberLong":"3594120000172545465"}}},{"$group":{"_id":{"URLHash":"$URLHash","EventDate":"$EventDate"},"pageViews":{"$sum":1}}},{"$sort":{"pageViews":-1}},{"$skip":100},{"$limit":10}] +[{"$match":{"CounterID":62,"EventDate":{"$gte":{"$date":"2013-07-01"},"$lte":{"$date":"2013-07-31"}},"IsRefresh":0,"DontCountHits":0,"URLHash":{"$numberLong":"2868770270353813622"}}},{"$group":{"_id":{"WindowClientWidth":"$WindowClientWidth","WindowClientHeight":"$WindowClientHeight"},"pageViews":{"$sum":1}}},{"$sort":{"pageViews":-1}},{"$skip":10000},{"$limit":10}] +[{"$match":{"CounterID":62,"EventDate":{"$gte":{"$date":"2013-07-14"},"$lte":{"$date":"2013-07-15"}},"IsRefresh":0,"DontCountHits":0}},{"$group":{"_id":{"$dateTrunc":{"date":"$EventTime","unit":"minute"}},"pageViews":{"$sum":1}}},{"$sort":{"_id":1}},{"$skip":1000},{"$limit":10}] diff --git a/mongodb/query b/mongodb/query new file mode 100755 index 0000000000..4c3f7e6946 --- /dev/null +++ b/mongodb/query @@ -0,0 +1,22 @@ +#!/bin/bash +# Reads a MongoDB aggregation pipeline (Extended JSON, single line) from +# stdin and runs it against the `hits` collection in the `test` DB. +# Stdout: query result (as printed by mongosh). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +# +# This system uses MongoDB aggregation pipelines instead of SQL. The +# canonical pipelines (one per line) live in queries.txt; the shared driver +# is configured to read that file via BENCH_QUERIES_FILE. 
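(For orientation: a minimal sketch of the per-query loop this interface is written for, mirroring the per-system run.sh scripts deleted elsewhere in this change. TRIES and the exact invocation are assumptions; the real loop lives in lib/benchmark-common.sh, which is not shown here.)

    TRIES=3
    while read -r pipeline; do
        sync
        echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null
        for _ in $(seq 1 "$TRIES"); do
            # result on stdout; elapsed seconds on the last stderr line
            echo "$pipeline" | ./query
        done
    done < queries.txt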
+set -e + +pipeline=$(cat) + +PIPELINE_JSON="$pipeline" mongosh --quiet test --eval ' +const start = new Date(); +const pipeline = EJSON.parse(process.env.PIPELINE_JSON); +const result = db.hits.aggregate(pipeline, {allowDiskUse: true}).toArray(); +const elapsed = (new Date() - start) / 1000; +print(EJSON.stringify(result)); +console.error(elapsed.toFixed(3)); +' diff --git a/mongodb/run.js b/mongodb/run.js deleted file mode 100644 index 1bcd802fcd..0000000000 --- a/mongodb/run.js +++ /dev/null @@ -1,38 +0,0 @@ -const iterations = 3; - -// `col` need in queries to make lookups so define before load -let collectionName = "hits" -let col = db.getCollection(collectionName); - -load("./queries.js"); - -// If someone knows how to clear the OS page cache from javascript, -// please do (this is technically required by the benchmark rules) - -for (let i = 0; i < queries.length; i++) { - for (let j = 0; j < iterations; ++j) { - start = new Date(); - try { - res = col.aggregate(queries[i], { allowDiskUse: true }).toArray(); - print( - EJSON.stringify({ - q: i, - it: j, - ok: 1, - t: new Date().getTime() - start.getTime(), - res: res, - }) - ); - } catch (e) { - print( - EJSON.stringify({ - q: i, - it: j, - ok: 0, - t: new Date().getTime() - start.getTime(), - res: e, - }) - ); - } - } -} diff --git a/mongodb/start b/mongodb/start new file mode 100755 index 0000000000..9e8bafc100 --- /dev/null +++ b/mongodb/start @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +sudo systemctl start mongod + +# Enable the planner option used by ClickBench (covered whole-index scans). +# This is a runtime parameter that resets on restart, so we re-apply on every +# start. Wait briefly for the server to accept connections first. +for _ in $(seq 1 60); do + if mongosh --quiet --eval "db.runCommand('ping').ok" >/dev/null 2>&1; then + break + fi + sleep 1 +done +mongosh --quiet --eval 'db.adminCommand({setParameter: 1, internalQueryPlannerGenerateCoveredWholeIndexScans: true});' >/dev/null diff --git a/mongodb/stop b/mongodb/stop new file mode 100755 index 0000000000..0c408822ee --- /dev/null +++ b/mongodb/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo systemctl stop mongod || true diff --git a/mysql-myisam/benchmark.sh b/mysql-myisam/benchmark.sh index bdb34a4c50..531bd65038 100755 --- a/mysql-myisam/benchmark.sh +++ b/mysql-myisam/benchmark.sh @@ -1,30 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y mysql-server-8.0 -sudo bash -c "echo -e '[mysql]\nlocal-infile=1\n\n[mysqld]\nlocal-infile=1\n' > /etc/mysql/conf.d/local_infile.cnf" -sudo service mysql restart - -# Load the data - -../download-hits-tsv - -sudo mysql -e "CREATE DATABASE test" -sudo mysql test < create.sql -echo -n "Load time: " -command time -f '%e' sudo mysql test -e "SET sql_log_bin = 0; LOAD DATA LOCAL INFILE 'hits.tsv' INTO TABLE hits;" - -# 41m8.979s - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo mysql test -e "SELECT data_length + index_length FROM information_schema.TABLES WHERE table_schema = 'test' AND table_name = 'hits';" | tail -n1 - -cat log.txt | - grep -P 'rows? in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/mysql-myisam/check b/mysql-myisam/check new file mode 100755 index 0000000000..b1e36dced8 --- /dev/null +++ b/mysql-myisam/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo mysql -e "SELECT 1" >/dev/null diff --git a/mysql-myisam/data-size b/mysql-myisam/data-size new file mode 100755 index 0000000000..5015ae8667 --- /dev/null +++ b/mysql-myisam/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +sudo mysql test -N -e "SELECT data_length + index_length FROM information_schema.TABLES WHERE table_schema = 'test' AND table_name = 'hits';" diff --git a/mysql-myisam/install b/mysql-myisam/install new file mode 100755 index 0000000000..dcace5ba46 --- /dev/null +++ b/mysql-myisam/install @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y mysql-server-8.0 + +sudo bash -c "echo -e '[mysql]\nlocal-infile=1\n\n[mysqld]\nlocal-infile=1\n' > /etc/mysql/conf.d/local_infile.cnf" +sudo service mysql restart diff --git a/mysql-myisam/load b/mysql-myisam/load new file mode 100755 index 0000000000..b39827cdb0 --- /dev/null +++ b/mysql-myisam/load @@ -0,0 +1,11 @@ +#!/bin/bash +set -eu + +sudo mysql -e "DROP DATABASE IF EXISTS test" +sudo mysql -e "CREATE DATABASE test" +sudo mysql test < create.sql + +sudo mysql test -e "SET sql_log_bin = 0; LOAD DATA LOCAL INFILE 'hits.tsv' INTO TABLE hits;" + +rm -f hits.tsv +sync diff --git a/mysql-myisam/query b/mysql-myisam/query new file mode 100755 index 0000000000..9d9168268f --- /dev/null +++ b/mysql-myisam/query @@ -0,0 +1,34 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via `mysql -vvv` against the `test` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# mysql's "N rows in set (X.XX sec)" footer). +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(sudo mysql test -vvv -e "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$out" | grep -q '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" + +timing=$(printf '%s\n' "$out" \ + | grep -P 'rows? 
in set|Empty set|Query OK' \ + | tail -n1 \ + | sed -r 's/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/') + +if [ -z "$timing" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi + +awk -v t="$timing" 'BEGIN { + n = split(t, a, " ") + if (n == 2 && a[1] != "") { printf "%.3f\n", a[1] * 60 + a[2] } + else { printf "%.3f\n", a[n] } +}' >&2 diff --git a/mysql-myisam/run.sh b/mysql-myisam/run.sh deleted file mode 100755 index faf06250ef..0000000000 --- a/mysql-myisam/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - sudo mysql test -vvv -e "${query}" - done; -done; diff --git a/mysql-myisam/start b/mysql-myisam/start new file mode 100755 index 0000000000..1eda91080c --- /dev/null +++ b/mysql-myisam/start @@ -0,0 +1,7 @@ +#!/bin/bash +set -eu + +if sudo mysql -e "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi +sudo service mysql start diff --git a/mysql-myisam/stop b/mysql-myisam/stop new file mode 100755 index 0000000000..f887aafbff --- /dev/null +++ b/mysql-myisam/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo service mysql stop || true diff --git a/mysql/benchmark.sh b/mysql/benchmark.sh index 465f959aed..531bd65038 100755 --- a/mysql/benchmark.sh +++ b/mysql/benchmark.sh @@ -1,30 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y mysql-server-8.0 -sudo bash -c "echo -e '[mysql]\nlocal-infile=1\n\n[mysqld]\nlocal-infile=1\n' > /etc/mysql/conf.d/local_infile.cnf" -sudo service mysql restart - -# Load the data - -../download-hits-tsv - -sudo mysql -e "CREATE DATABASE test" -sudo mysql test < create.sql -echo -n "Load time: " -command time -f '%e' sudo mysql test -e "SET sql_log_bin = 0; LOAD DATA LOCAL INFILE 'hits.tsv' INTO TABLE hits" - -# 2:37:52 elapsed - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo mysql test -e "SELECT data_length + index_length FROM information_schema.TABLES WHERE table_schema = 'test' AND table_name = 'hits';" | tail -n1 - -cat log.txt | - grep -P 'rows? in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
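(Aside on the timing footer parsed by the query scripts for mysql-myisam above and mysql below: mysql prints either "(X.XX sec)" or "(N min Y.YY sec)", and the sed/awk pair normalizes both to fractional seconds. A small worked example on sample footers, not real output:)

    sed_expr='s/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/'
    echo '10 rows in set (1 min 2.34 sec)' | sed -r "$sed_expr"   # -> "1 2.34", which the awk step turns into 62.340
    echo 'Empty set (0.05 sec)' | sed -r "$sed_expr"              # -> " 0.05", which becomes 0.050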
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/mysql/check b/mysql/check new file mode 100755 index 0000000000..b1e36dced8 --- /dev/null +++ b/mysql/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo mysql -e "SELECT 1" >/dev/null diff --git a/mysql/data-size b/mysql/data-size new file mode 100755 index 0000000000..5015ae8667 --- /dev/null +++ b/mysql/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +sudo mysql test -N -e "SELECT data_length + index_length FROM information_schema.TABLES WHERE table_schema = 'test' AND table_name = 'hits';" diff --git a/mysql/install b/mysql/install new file mode 100755 index 0000000000..dcace5ba46 --- /dev/null +++ b/mysql/install @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y mysql-server-8.0 + +sudo bash -c "echo -e '[mysql]\nlocal-infile=1\n\n[mysqld]\nlocal-infile=1\n' > /etc/mysql/conf.d/local_infile.cnf" +sudo service mysql restart diff --git a/mysql/load b/mysql/load new file mode 100755 index 0000000000..69e75e085e --- /dev/null +++ b/mysql/load @@ -0,0 +1,11 @@ +#!/bin/bash +set -eu + +sudo mysql -e "DROP DATABASE IF EXISTS test" +sudo mysql -e "CREATE DATABASE test" +sudo mysql test < create.sql + +sudo mysql test -e "SET sql_log_bin = 0; LOAD DATA LOCAL INFILE 'hits.tsv' INTO TABLE hits" + +rm -f hits.tsv +sync diff --git a/mysql/query b/mysql/query new file mode 100755 index 0000000000..14887e9980 --- /dev/null +++ b/mysql/query @@ -0,0 +1,35 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via `mysql -vvv` against the `test` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# mysql's "N rows in set (X.XX sec)" footer). +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(sudo mysql test -vvv -e "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$out" | grep -q '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" + +# Parse "(X.XX sec)" or "(N min Y.YY sec)" from the footer line. +timing=$(printf '%s\n' "$out" \ + | grep -P 'rows? in set|Empty set|Query OK' \ + | tail -n1 \ + | sed -r 's/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/') + +if [ -z "$timing" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi + +awk -v t="$timing" 'BEGIN { + n = split(t, a, " ") + if (n == 2 && a[1] != "") { printf "%.3f\n", a[1] * 60 + a[2] } + else { printf "%.3f\n", a[n] } +}' >&2 diff --git a/mysql/run.sh b/mysql/run.sh deleted file mode 100755 index faf06250ef..0000000000 --- a/mysql/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - sudo mysql test -vvv -e "${query}" - done; -done; diff --git a/mysql/start b/mysql/start new file mode 100755 index 0000000000..d6763dbd2a --- /dev/null +++ b/mysql/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +# Idempotent: if already up, do nothing. 
+if sudo mysql -e "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi +sudo service mysql start diff --git a/mysql/stop b/mysql/stop new file mode 100755 index 0000000000..f887aafbff --- /dev/null +++ b/mysql/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo service mysql stop || true diff --git a/octosql/benchmark.sh b/octosql/benchmark.sh index c20a09d465..fc4bacc8f3 100755 --- a/octosql/benchmark.sh +++ b/octosql/benchmark.sh @@ -1,17 +1,5 @@ #!/bin/bash - -wget --continue --progress=dot:giga https://github.com/cube2222/octosql/releases/download/v0.13.0/octosql_0.13.0_linux_amd64.tar.gz -tar xf octosql_0.13.0_linux_amd64.tar.gz - -../download-hits-parquet-single - -./run.sh 2>&1 | tee log.txt - -cat log.txt | - grep -P '^real|^Error|^Killed|^fatal error|^panic' | - sed -r -e 's/^(Error|Killed|fatal|panic).*$/null/; s/^real\s*([0-9.]+)m([0-9.]+)s$/\1 \2/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if ($1 == "null") { skip = 1 } else { if (i % 3 == 0) { printf "[" }; printf skip ? "null" : $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; skip = 0; } }' - -echo "Data size: $(du -b hits.parquet)" -echo "Load time: 0" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/octosql/check b/octosql/check new file mode 100755 index 0000000000..2b362179eb --- /dev/null +++ b/octosql/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./octosql --help >/dev/null diff --git a/octosql/data-size b/octosql/data-size new file mode 100755 index 0000000000..708c0b72e7 --- /dev/null +++ b/octosql/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < hits.parquet diff --git a/octosql/install b/octosql/install new file mode 100755 index 0000000000..d257162094 --- /dev/null +++ b/octosql/install @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +if [ ! -x ./octosql ]; then + wget --continue --progress=dot:giga \ + https://github.com/cube2222/octosql/releases/download/v0.13.0/octosql_0.13.0_linux_amd64.tar.gz + tar xf octosql_0.13.0_linux_amd64.tar.gz octosql +fi diff --git a/octosql/load b/octosql/load new file mode 100755 index 0000000000..1b395b9dd0 --- /dev/null +++ b/octosql/load @@ -0,0 +1,4 @@ +#!/bin/bash +# octosql queries hits.parquet directly. No persistent DB to load. +set -e +sync diff --git a/octosql/query b/octosql/query new file mode 100755 index 0000000000..984c1a66da --- /dev/null +++ b/octosql/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via octosql against hits.parquet. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (from `time`). +set -e + +query=$(cat) +# octosql wants the file path inline, not a table name. +query=${query//hits/hits.parquet} + +# Cap RSS to ~90% of host memory like the original benchmark. +max_rss=$(( $(grep MemTotal /proc/meminfo | grep -o -P '\d+') * 900 )) + +TIMEFORMAT='%R' +{ time prlimit --data="${max_rss}" ./octosql "$query" 1>/tmp/octosql.out.$$ 2>/tmp/octosql.err.$$; } 2>/tmp/octosql.time.$$ || status=$? 
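(The brace-group timing above uses the standard bash idiom for capturing the `time` keyword's report: with TIMEFORMAT='%R' the report is just the wall-clock seconds, and redirecting the group's stderr isolates that number from the command's own stderr. A standalone illustration; the sleep and file name are only for demonstration:)

    TIMEFORMAT='%R'
    { time sleep 0.2; } 2> elapsed.txt
    cat elapsed.txt   # prints something like 0.203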
+status=${status:-0} + +cat /tmp/octosql.out.$$ +if [ "$status" -ne 0 ]; then + cat /tmp/octosql.err.$$ >&2 + rm -f /tmp/octosql.out.$$ /tmp/octosql.err.$$ /tmp/octosql.time.$$ + exit "$status" +fi + +cat /tmp/octosql.err.$$ >&2 +cat /tmp/octosql.time.$$ >&2 + +rm -f /tmp/octosql.out.$$ /tmp/octosql.err.$$ /tmp/octosql.time.$$ diff --git a/octosql/results/20260509/c6a.4xlarge.json b/octosql/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..113d26800d --- /dev/null +++ b/octosql/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "OctoSQL", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Go","stateless"], + "load_time": 0, + "data_size": 14779976446, + "result": [ + [6.576, 6.456, 6.428], + [41.986, 41.608, 41.225], + [49.56, 48.601, 48.948], + [null, null, null], + [null, null, null], + [null, null, null], + [61.55, 60.943, 60.994], + [41.606, 41.067, 41.777], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [60.737, 60.962, 61.526], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null] +] +} + diff --git a/octosql/run.sh b/octosql/run.sh deleted file mode 100755 index 61a34ec780..0000000000 --- a/octosql/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -max_rss=$(( $(cat /proc/meminfo | grep MemTotal | grep -o -P '\d+') * 900 )) - -cat queries.sql | sed -r -e 's@hits@hits.parquet@' | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - for _ in {1..3} - do - time prlimit --data="${max_rss}" ./octosql "${query}" - done -done diff --git a/octosql/start b/octosql/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/octosql/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/octosql/stop b/octosql/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/octosql/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/opteryx/benchmark.sh b/opteryx/benchmark.sh index 65fe910e8c..3b63e772a6 100755 --- a/opteryx/benchmark.sh +++ b/opteryx/benchmark.sh @@ -1,41 +1,5 @@ #!/bin/bash - -# Update package lists -sudo apt-get update -y -sudo apt-get install -y software-properties-common -sudo add-apt-repository -y ppa:deadsnakes/ppa -sudo apt-get update -y - -# Install required packages -sudo apt-get install -y python3.11 python3.11-venv git wget build-essential python3.11-dev - -# Create and activate a virtual environment using Python 3.11 -python3.11 -m venv ~/opteryx_venv -source ~/opteryx_venv/bin/activate - -# Upgrade pip in the virtual environment -~/opteryx_venv/bin/python -m pip install --upgrade pip -~/opteryx_venv/bin/python -m pip install --upgrade opteryx==0.26.1 - -# Download benchmark target data, partitioned -../download-hits-parquet-partitioned hits - -# Run a simple query to check the installation -~/opteryx_venv/bin/python -m opteryx 
"SELECT version()" 2>&1 - -# Run benchmarks for partitioned data using queries from queries.sql -if [[ -f ./queries.sql ]]; then - while read -r query; do - sudo sync && echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - (~/opteryx_venv/bin/python -m opteryx "$query" --cycles 3 2>&1 | grep -v -P '^3$') || echo '[null,null,null]' - done < ./queries.sql -else - echo "queries.sql not found." -fi - -# Deactivate the virtual environment -deactivate - -echo "Data size: $(du -bcs hits | grep total)" -echo "Load time: 0" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/opteryx/check b/opteryx/check new file mode 100755 index 0000000000..4d4c12fd75 --- /dev/null +++ b/opteryx/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +"$HOME/opteryx_venv/bin/python" -m opteryx "SELECT version()" >/dev/null diff --git a/opteryx/data-size b/opteryx/data-size new file mode 100755 index 0000000000..8e65ea4b35 --- /dev/null +++ b/opteryx/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits | awk '/total$/ { print $1 }' diff --git a/opteryx/install b/opteryx/install new file mode 100755 index 0000000000..ea31f110b4 --- /dev/null +++ b/opteryx/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y software-properties-common +sudo add-apt-repository -y ppa:deadsnakes/ppa +sudo apt-get update -y +sudo apt-get install -y python3.11 python3.11-venv git wget build-essential python3.11-dev + +if [ ! -d "$HOME/opteryx_venv" ]; then + python3.11 -m venv "$HOME/opteryx_venv" +fi + +"$HOME/opteryx_venv/bin/python" -m pip install --upgrade pip +"$HOME/opteryx_venv/bin/python" -m pip install --upgrade opteryx==0.26.1 diff --git a/opteryx/load b/opteryx/load new file mode 100755 index 0000000000..fafa76868e --- /dev/null +++ b/opteryx/load @@ -0,0 +1,8 @@ +#!/bin/bash +# opteryx queries `FROM hits` and resolves it to ./hits/*.parquet, so move +# the partitioned files into the expected subdir. +set -e + +mkdir -p hits +mv hits_*.parquet hits/ 2>/dev/null || true +sync diff --git a/opteryx/query b/opteryx/query new file mode 100755 index 0000000000..39ed0b69cb --- /dev/null +++ b/opteryx/query @@ -0,0 +1,29 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via opteryx (Python in-process) +# against the partitioned parquet under ./hits/. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. 
+set -e + +query=$(cat) + +"$HOME/opteryx_venv/bin/python" - "$query" <<'PY' +import sys +import timeit +import opteryx + +query = sys.argv[1] + +start = timeit.default_timer() +try: + res = opteryx.query(query) + rows = list(res) + end = timeit.default_timer() +finally: + pass + +for r in rows: + print(r) + +print(f"{end - start:.3f}", file=sys.stderr) +PY diff --git a/opteryx/results/20250103/c6a.4xlarge.json b/opteryx/results/20250103/c6a.4xlarge.json index f348eaf057..067c0a684f 100644 --- a/opteryx/results/20250103/c6a.4xlarge.json +++ b/opteryx/results/20250103/c6a.4xlarge.json @@ -9,7 +9,6 @@ "tags": [ "stateless", "column-oriented", - "serverless", "embedded" ], "load_time": null, diff --git a/opteryx/results/20250112/c6a.4xlarge.json b/opteryx/results/20250112/c6a.4xlarge.json index 6c3a512e9c..18509d38a5 100644 --- a/opteryx/results/20250112/c6a.4xlarge.json +++ b/opteryx/results/20250112/c6a.4xlarge.json @@ -9,7 +9,6 @@ "tags": [ "stateless", "column-oriented", - "serverless", "embedded" ], "load_time": 0, diff --git a/opteryx/results/20250330/c6a.4xlarge.json b/opteryx/results/20250330/c6a.4xlarge.json index 116d14c022..d6a423d081 100644 --- a/opteryx/results/20250330/c6a.4xlarge.json +++ b/opteryx/results/20250330/c6a.4xlarge.json @@ -9,7 +9,6 @@ "tags": [ "stateless", "column-oriented", - "serverless", "embedded" ], "load_time": 0, diff --git a/opteryx/results/20250710/c6a.2xlarge.json b/opteryx/results/20250710/c6a.2xlarge.json index 5250601e9d..948fd5511f 100644 --- a/opteryx/results/20250710/c6a.2xlarge.json +++ b/opteryx/results/20250710/c6a.2xlarge.json @@ -9,7 +9,6 @@ "tags": [ "stateless", "column-oriented", - "serverless", "embedded" ], "load_time": 0, diff --git a/opteryx/results/20250710/c6a.4xlarge.json b/opteryx/results/20250710/c6a.4xlarge.json index ba5a30734d..8f2e404b4c 100644 --- a/opteryx/results/20250710/c6a.4xlarge.json +++ b/opteryx/results/20250710/c6a.4xlarge.json @@ -9,7 +9,6 @@ "tags": [ "stateless", "column-oriented", - "serverless", "embedded" ], "load_time": 0, diff --git a/opteryx/results/20250711/c6a.xlarge.json b/opteryx/results/20250711/c6a.xlarge.json index 83fdefbe7c..b6907a0777 100644 --- a/opteryx/results/20250711/c6a.xlarge.json +++ b/opteryx/results/20250711/c6a.xlarge.json @@ -9,7 +9,6 @@ "tags": [ "stateless", "column-oriented", - "serverless", "embedded" ], "load_time": 0, diff --git a/opteryx/results/20250712/t3a.small.json b/opteryx/results/20250712/t3a.small.json index 53ec6e5210..7c2848f06b 100644 --- a/opteryx/results/20250712/t3a.small.json +++ b/opteryx/results/20250712/t3a.small.json @@ -9,7 +9,6 @@ "tags": [ "stateless", "column-oriented", - "serverless", "embedded" ], "load_time": 0, diff --git a/opteryx/results/20250730/c6a.4xlarge.json b/opteryx/results/20250730/c6a.4xlarge.json index 8150b28526..f8404b868f 100644 --- a/opteryx/results/20250730/c6a.4xlarge.json +++ b/opteryx/results/20250730/c6a.4xlarge.json @@ -9,7 +9,6 @@ "tags": [ "stateless", "column-oriented", - "serverless", "embedded" ], "load_time": 0, diff --git a/opteryx/results/20250731/c6a.2xlarge.json b/opteryx/results/20250731/c6a.2xlarge.json index d9d87b5491..3d5bc69308 100644 --- a/opteryx/results/20250731/c6a.2xlarge.json +++ b/opteryx/results/20250731/c6a.2xlarge.json @@ -9,7 +9,6 @@ "tags": [ "stateless", "column-oriented", - "serverless", "embedded" ], "load_time": 0, diff --git a/opteryx/results/20250731/c6a.xlarge.json b/opteryx/results/20250731/c6a.xlarge.json index ed0ad82e7a..53adf9f390 100644 --- a/opteryx/results/20250731/c6a.xlarge.json +++ 
b/opteryx/results/20250731/c6a.xlarge.json @@ -9,7 +9,6 @@ "tags": [ "stateless", "column-oriented", - "serverless", "embedded" ], "load_time": 0, diff --git a/opteryx/results/20250731/t3a.small.json b/opteryx/results/20250731/t3a.small.json index ee3af666bb..ef75eba53e 100644 --- a/opteryx/results/20250731/t3a.small.json +++ b/opteryx/results/20250731/t3a.small.json @@ -9,7 +9,6 @@ "tags": [ "stateless", "column-oriented", - "serverless", "embedded" ], "load_time": 0, diff --git a/opteryx/results/20251105/c6a.2xlarge.json b/opteryx/results/20251105/c6a.2xlarge.json index 1cad0d36ea..90129677a6 100644 --- a/opteryx/results/20251105/c6a.2xlarge.json +++ b/opteryx/results/20251105/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["stateless","column-oriented","serverless","embedded"], + "tags": ["stateless","column-oriented","embedded"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/opteryx/results/20251105/c6a.4xlarge.json b/opteryx/results/20251105/c6a.4xlarge.json index a2df251d7d..e197398072 100644 --- a/opteryx/results/20251105/c6a.4xlarge.json +++ b/opteryx/results/20251105/c6a.4xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["stateless","column-oriented","serverless","embedded"], + "tags": ["stateless","column-oriented","embedded"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/opteryx/results/20251105/c6a.xlarge.json b/opteryx/results/20251105/c6a.xlarge.json index 81d2ae1847..c131095234 100644 --- a/opteryx/results/20251105/c6a.xlarge.json +++ b/opteryx/results/20251105/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["stateless","column-oriented","serverless","embedded"], + "tags": ["stateless","column-oriented","embedded"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/opteryx/results/20251105/t3a.small.json b/opteryx/results/20251105/t3a.small.json index 880e83f028..02bcd745d3 100644 --- a/opteryx/results/20251105/t3a.small.json +++ b/opteryx/results/20251105/t3a.small.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["stateless","column-oriented","serverless","embedded"], + "tags": ["stateless","column-oriented","embedded"], "load_time": 0, "data_size": 14737666736, "result": [ diff --git a/opteryx/results/20260509/c6a.4xlarge.json b/opteryx/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..fa2fa2eee4 --- /dev/null +++ b/opteryx/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Opteryx", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["stateless","column-oriented","embedded"], + "load_time": 21, + "data_size": 14737666736, + "result": [ + [2.258, 0.571, 0.572], + [2.863, 0.772, 0.775], + [3.706, 0.979, 1.011], + [4.522, 0.862, 0.854], + [7.621, 3.966, 4.003], + [7.187, 2.864, 2.872], + [2.9, 0.836, 0.831], + [2.814, 0.774, 0.781], + [8.535, 4.765, 4.84], + [10.112, 5.702, 5.652], + [5.16, 1.132, 1.138], + [5.197, 1.17, 1.158], + [7.45, 3.576, 3.463], + [11.189, 5.41, 5.279], + [8.305, 4.542, 4.538], + [8.378, 4.586, 4.588], + [15.055, 8.913, 9.483], + [14.678, 8.655, 8.793], + [35.208, 26.847, 27.229], + [4.206, 0.839, 0.833], + [20.431, 6.636, 6.761], + [18.916, 3.336, 3.352], + [33.079, 5.75, 5.633], + [88.395, 27.123, 30.59], + [7.844, 1.436, 1.449], + [5.207, 1.361, 1.338], + [7.717, 1.444, 1.423], + 
[38.477, 23.614, 23.513], + [99.104, 85.766, 85.684], + [3.481, 0.97, 0.994], + [9.454, 3.26, 3.28], + [16.598, 5.207, 5.254], + [39.44, 30.914, 31.066], + [null, null, null], + [null, null, null], + [8.215, 4.946, 5.098], + [3.14, 0.972, 0.973], + [2.708, 0.824, 0.812], + [3.008, 0.824, 0.824], + [3.728, 1.523, 1.499], + [2.723, 0.791, 0.793], + [2.701, 0.787, 0.779], + [2.805, 0.91, 0.909] +] +} + diff --git a/opteryx/start b/opteryx/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/opteryx/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/opteryx/stop b/opteryx/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/opteryx/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/opteryx/template.json b/opteryx/template.json index 3fae831363..548d083dd8 100644 --- a/opteryx/template.json +++ b/opteryx/template.json @@ -6,7 +6,6 @@ "tags": [ "stateless", "column-oriented", - "serverless", "embedded" ] } diff --git a/oxla/README.md b/oxla/README.md new file mode 100644 index 0000000000..e916e55587 --- /dev/null +++ b/oxla/README.md @@ -0,0 +1,13 @@ +# Oxla + +## Dead (May 2026) + +Oxla was acquired by Redpanda in October 2025. The image previously pinned by `install` + + public.ecr.aws/oxla/release:1.53.0-beta + +now resolves to + + Error response from daemon: failed to resolve reference "public.ecr.aws/oxla/release:1.53.0-beta": public.ecr.aws/oxla/release:1.53.0-beta: not found + +The public ECR namespace stopped serving Oxla images and no replacement registry exists. The directory and historical results are kept; nothing here runs anymore. diff --git a/oxla/benchmark.sh b/oxla/benchmark.sh index 0a9be8cca0..1aa9264b91 100755 --- a/oxla/benchmark.sh +++ b/oxla/benchmark.sh @@ -1,39 +1,5 @@ -#!/bin/bash -e - -# docker -sudo apt-get install -y docker.io - -# base -sudo apt-get install -y postgresql-client curl wget apt-transport-https ca-certificates software-properties-common gnupg2 parallel -sudo DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential - -# download dataset -../download-hits-csv -sudo mkdir data -sudo mv hits.csv data - -# get and configure Oxla image -echo "Install and run Oxla." - -docker run --rm -p 5432:5432 -v "$(pwd)/data:/data" --name oxlacontainer public.ecr.aws/oxla/release:1.53.0-beta > /dev/null 2>&1 & - -# create table and ingest data -export PGCLIENTENCODING=UTF8 - -for _ in {1..600} -do - PGPASSWORD=oxla psql -h localhost -U oxla -t < create.sql && break - sleep 1 -done - -echo "Insert data." -echo -n "Load time: " -PGPASSWORD=oxla command time -f '%e' psql -h localhost -U oxla -q -t -c "COPY hits FROM '/data/hits.csv';" - -# get ingested data size -echo -n "Data size: " -PGPASSWORD=oxla psql -h localhost -U oxla -q -t -c "SELECT pg_total_relation_size('hits');" - -# run benchmark -echo "running benchmark..." -./run.sh +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. 
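(BENCH_RESTARTABLE=yes, exported just below, is taken here to mean the shared driver may stop and restart the server between queries; the old run.sh removed further down did exactly that because Oxla caches large parts of the dataset with no documented way to drop that cache. A rough sketch of such a reset step, built only from the scripts added in this change; the exact sequencing inside lib/benchmark-common.sh is an assumption:)

    # hypothetical per-query reset when BENCH_RESTARTABLE=yes
    ./stop
    sync
    echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null
    ./start
    ./check   # exits 0 once the server answers SELECT 1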
+export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/oxla/check b/oxla/check new file mode 100755 index 0000000000..93848a1fb1 --- /dev/null +++ b/oxla/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +PGPASSWORD=oxla psql -h localhost -U oxla -t -c 'SELECT 1' >/dev/null diff --git a/oxla/data-size b/oxla/data-size new file mode 100755 index 0000000000..4a98fca07b --- /dev/null +++ b/oxla/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +PGPASSWORD=oxla psql -h localhost -U oxla -q -t -A -c "SELECT pg_total_relation_size('hits');" diff --git a/oxla/install b/oxla/install new file mode 100755 index 0000000000..ac8fc1dcb8 --- /dev/null +++ b/oxla/install @@ -0,0 +1,9 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client curl wget \ + apt-transport-https ca-certificates software-properties-common gnupg2 parallel +sudo DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential + +sudo docker pull public.ecr.aws/oxla/release:1.53.0-beta diff --git a/oxla/load b/oxla/load new file mode 100755 index 0000000000..e1f99c03b8 --- /dev/null +++ b/oxla/load @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +export PGCLIENTENCODING=UTF8 + +mkdir -p data +sudo mv hits.csv data/ + +PGPASSWORD=oxla psql -h localhost -U oxla -q -t < create.sql +PGPASSWORD=oxla psql -h localhost -U oxla -q -t -c "COPY hits FROM '/data/hits.csv';" + +sudo rm -f data/hits.csv +sync diff --git a/oxla/query b/oxla/query new file mode 100755 index 0000000000..a9551059b4 --- /dev/null +++ b/oxla/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against Oxla's pg-protocol port. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# psql's `\timing` "Time: ms" output). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(PGPASSWORD=oxla psql -h localhost -U oxla -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +# Pass-through result, strip Time: lines from stdout. +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/oxla/run.sh b/oxla/run.sh deleted file mode 100755 index 7d20f12153..0000000000 --- a/oxla/run.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -TRIES=3 -rm result.txt 2>/dev/null -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - # Oxla seems to cache major parts of the dataset without a documented way to clear the cache between the runs. - # It seems fairer to restart the database between the runs. 
- docker restart oxlacontainer - sleep 30 - - echo "$query"; - results="" - if [[ "$query" == "SELECT NULL;" ]]; then - results+="[null,null,null]" - else - results+="[" - for i in $(seq 1 $TRIES); do - time=$(PGPASSWORD=oxla psql -h localhost -U oxla -t -c '\timing' -c "$query" | grep 'Time' | perl -nle 'm/Time: ([^ ]*) ms/; print $1 / 1000') - echo "$time s" - results+="$time," - done - results=${results::-1} - results+="]" - fi - echo "$results," >> result.txt -done -result=$(cat result.txt) -result=${result::-1} -echo "$result" -rm result.txt 2>/dev/null diff --git a/oxla/start b/oxla/start new file mode 100755 index 0000000000..e2f0185f57 --- /dev/null +++ b/oxla/start @@ -0,0 +1,23 @@ +#!/bin/bash +set -eu + +# Idempotent: if already serving, do nothing. +if PGPASSWORD=oxla psql -h localhost -U oxla -t -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +# Start (or restart) the container in the background. +sudo docker stop oxlacontainer >/dev/null 2>&1 || true +sudo docker rm oxlacontainer >/dev/null 2>&1 || true + +mkdir -p data +sudo docker run -d --rm -p 5432:5432 \ + -v "$(pwd)/data:/data" \ + --name oxlacontainer \ + public.ecr.aws/oxla/release:1.53.0-beta >/dev/null + +# Wait briefly for protocol port (the lib's check loop will keep waiting). +for _ in $(seq 1 60); do + PGPASSWORD=oxla psql -h localhost -U oxla -t -c 'SELECT 1' >/dev/null 2>&1 && exit 0 + sleep 1 +done diff --git a/oxla/stop b/oxla/stop new file mode 100755 index 0000000000..b673a6d161 --- /dev/null +++ b/oxla/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo docker stop oxlacontainer >/dev/null 2>&1 || true diff --git a/pandas/benchmark.sh b/pandas/benchmark.sh index fee224a324..fc4bacc8f3 100755 --- a/pandas/benchmark.sh +++ b/pandas/benchmark.sh @@ -1,19 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install pandas pyarrow - -# Download the data -../download-hits-parquet-single - -# Run the queries - -/usr/bin/time -f "Memory usage: %M KB" ./query.py 2>&1 | tee log.txt - -echo -n "Data size: " -grep -F "Memory usage" log.txt | grep -o -P '\d+ KB' | sed 's/KB/*1024/' | bc -l +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/pandas/check b/pandas/check new file mode 100755 index 0000000000..0c4b301a2d --- /dev/null +++ b/pandas/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/pandas/data-size b/pandas/data-size new file mode 100755 index 0000000000..365ad4ecc8 --- /dev/null +++ b/pandas/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/pandas/install b/pandas/install new file mode 100755 index 0000000000..9605452767 --- /dev/null +++ b/pandas/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet pandas pyarrow fastapi uvicorn diff --git a/pandas/load b/pandas/load new file mode 100755 index 0000000000..ceba6becac --- /dev/null +++ b/pandas/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Server reads hits.parquet from CWD into memory. 
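(Taken together, the pandas check/load/query/data-size scripts drive server.py over a small HTTP interface. A hedged walk-through of the expected exchange; the response values are made up, only the endpoints and field names come from the scripts in this change:)

    curl -sf http://127.0.0.1:8000/health        # 200 once the server is up (./check)
    curl -sS -X POST http://127.0.0.1:8000/load  # -> {"elapsed": 27.4}        (./load)
    echo 'SELECT COUNT(*) FROM hits;' |
      curl -sS -X POST --data-binary @- http://127.0.0.1:8000/query
                                                 # -> {"elapsed": 0.18, ...}   (./query)
    curl -sS http://127.0.0.1:8000/data-size     # -> {"bytes": 84000000000}   (./data-size)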
+elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +rm -f hits.parquet +sync diff --git a/pandas/queries.sql b/pandas/queries.sql new file mode 100644 index 0000000000..b4115ee3aa --- /dev/null +++ b/pandas/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), 
SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY 
PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/pandas/query b/pandas/query new file mode 100755 index 0000000000..0bc448d9c8 --- /dev/null +++ b/pandas/query @@ -0,0 +1,24 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running pandas server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Capture HTTP status and body separately to detect errors cleanly. 
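(The capture that follows relies on curl writing the response body to the -o file while -w '%{http_code}' leaves only the status code on stdout. A minimal illustration of the pattern; example.com is just a placeholder URL:)

    tmp=$(mktemp)
    status=$(curl -sS -o "$tmp" -w '%{http_code}' https://example.com/)
    echo "status=$status body_bytes=$(wc -c < "$tmp")"
    rm -f "$tmp"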
+tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/pandas/results/20240906/c6a.metal.json b/pandas/results/20240906/c6a.metal.json index 2f6bdb1da5..18e887de5a 100644 --- a/pandas/results/20240906/c6a.metal.json +++ b/pandas/results/20240906/c6a.metal.json @@ -12,7 +12,6 @@ "column-oriented", "embedded", "stateless", - "serverless", "dataframe", "in-memory", "lukewarm-cold-run" diff --git a/pandas/results/20240909/c6a.metal.json b/pandas/results/20240909/c6a.metal.json index dafabd18a9..0ff4fdf33f 100644 --- a/pandas/results/20240909/c6a.metal.json +++ b/pandas/results/20240909/c6a.metal.json @@ -12,7 +12,6 @@ "column-oriented", "embedded", "stateless", - "serverless", "dataframe", "in-memory", "lukewarm-cold-run" diff --git a/pandas/results/20260218/c6a.metal.json b/pandas/results/20260218/c6a.metal.json index d6eff0492c..ead3f84a38 100644 --- a/pandas/results/20260218/c6a.metal.json +++ b/pandas/results/20260218/c6a.metal.json @@ -7,7 +7,7 @@ "hardware": "cpu", "tuned": "no", "comment": "", - "tags": ["C++", "column-oriented", "embedded", "stateless", "serverless", "dataframe", "in-memory", "lukewarm-cold-run"], + "tags": ["C++", "column-oriented", "embedded", "stateless", "dataframe", "in-memory", "lukewarm-cold-run"], "load_time": 28, "data_size": 321067241472, "result": [ diff --git a/pandas/query.py b/pandas/server.py similarity index 73% rename from pandas/query.py rename to pandas/server.py index 9d6ba7136d..5bc676173b 100755 --- a/pandas/query.py +++ b/pandas/server.py @@ -1,65 +1,57 @@ #!/usr/bin/env python3 +"""FastAPI wrapper around pandas so it conforms to the ClickBench +install/start/check/stop/load/query interface. -import pandas as pd -import timeit -import datetime -import json -import subprocess +Routes: + GET /health -> 200 OK once the server is up + POST /load -> reads hits.parquet from the working directory, fixes + column types, holds the DataFrame in memory, and + returns {"elapsed": } + POST /query -> body: SQL text. Looks it up in QUERIES, runs the + matching lambda against the loaded DataFrame, and + returns {"elapsed": }. + GET /data-size -> bytes the DataFrame currently occupies (memory_usage) -start = timeit.default_timer() -hits = pd.read_parquet("hits.parquet") -end = timeit.default_timer() -load_time = round(end - start, 3) -print(f"Load time: {load_time}") +The (sql, lambda) list is the same as the previous standalone query.py — just +exposed over HTTP. queries.sql in this directory holds the SQL strings in the +same order. +""" -dataframe_size = hits.memory_usage().sum() +import os +import timeit -# print("Dataframe(numpy) size:", dataframe_size, "bytes") +import pandas as pd +import uvicorn +from fastapi import FastAPI, HTTPException, Request -# fix some types -hits["EventTime"] = pd.to_datetime(hits["EventTime"], unit="s") -hits["EventDate"] = pd.to_datetime(hits["EventDate"], unit="D") +app = FastAPI() +hits: pd.DataFrame | None = None -# fix all object columns to string -for col in hits.columns: - if hits[col].dtype == "O": - hits[col] = hits[col].astype(str) -# 0: No., 1: SQL, 2: Pandas -queries = [ - ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.count()), +# 43 ClickBench queries. Each is (sql, callable). 
sql strings must match the +# corresponding line in queries.sql. The lambdas come straight from the prior +# pandas/query.py and have not been modified. +QUERIES: list[tuple[str, callable]] = [ + ("SELECT COUNT(*) FROM hits;", lambda x: x.count()), ( - "Q1", "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", lambda x: x[x["AdvEngineID"] != 0].count(), ), ( - "Q2", "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", lambda x: (x["AdvEngineID"].sum(), x.shape[0], x["ResolutionWidth"].mean()), ), + ("SELECT AVG(UserID) FROM hits;", lambda x: x["UserID"].mean()), + ("SELECT COUNT(DISTINCT UserID) FROM hits;", lambda x: x["UserID"].nunique()), ( - "Q3", - "SELECT AVG(UserID) FROM hits;", - lambda x: x["UserID"].mean(), - ), - ( - "Q4", - "SELECT COUNT(DISTINCT UserID) FROM hits;", - lambda x: x["UserID"].nunique(), - ), - ( - "Q5", "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", lambda x: x["SearchPhrase"].nunique(), ), ( - "Q6", "SELECT MIN(EventDate), MAX(EventDate) FROM hits;", lambda x: (x["EventDate"].min(), x["EventDate"].max()), ), ( - "Q7", "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", lambda x: x[x["AdvEngineID"] != 0] .groupby("AdvEngineID") @@ -67,19 +59,16 @@ .sort_values(ascending=False), ), ( - "Q8", "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;", lambda x: x.groupby("RegionID")["UserID"].nunique().nlargest(10), ), ( - "Q9", "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;", lambda x: x.groupby("RegionID") .agg({"AdvEngineID": "sum", "ResolutionWidth": "mean", "UserID": "nunique"}) .nlargest(10, "AdvEngineID"), ), ( - "Q10", "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", lambda x: x[x["MobilePhoneModel"] != ""] .groupby("MobilePhoneModel")["UserID"] @@ -87,7 +76,6 @@ .nlargest(10), ), ( - "Q11", "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", lambda x: x[x["MobilePhoneModel"] != ""] .groupby(["MobilePhone", "MobilePhoneModel"])["UserID"] @@ -95,7 +83,6 @@ .nlargest(10), ), ( - "Q12", "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .groupby("SearchPhrase") @@ -103,7 +90,6 @@ .nlargest(10), ), ( - "Q13", "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .groupby("SearchPhrase")["UserID"] @@ -111,7 +97,6 @@ .nlargest(10), ), ( - "Q14", "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .groupby(["SearchEngineID", "SearchPhrase"]) @@ -119,39 +104,32 @@ .nlargest(10), ), ( - "Q15", "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.groupby("UserID").size().nlargest(10), ), ( - "Q16", "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.groupby(["UserID", "SearchPhrase"]).size().nlargest(10), ), ( - "Q17", "SELECT UserID, SearchPhrase, COUNT(*) 
FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", lambda x: x.groupby(["UserID", "SearchPhrase"]).size().head(10), ), ( - "Q18", "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.groupby([x["UserID"], x["EventTime"].dt.minute, "SearchPhrase"]) .size() .nlargest(10), ), ( - "Q19", "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", lambda x: x[x["UserID"] == 435090932899640449], ), ( - "Q20", "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", lambda x: x[x["URL"].str.contains("google")].shape[0], ), ( - "Q21", "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x[(x["URL"].str.contains("google")) & (x["SearchPhrase"] != "")] .groupby("SearchPhrase") @@ -159,7 +137,6 @@ .nlargest(10, "SearchPhrase"), ), ( - "Q22", "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x[ (x["Title"].str.contains("Google")) @@ -173,35 +150,30 @@ .nlargest(10, "SearchPhrase"), ), ( - "Q23", "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", lambda x: x[x["URL"].str.contains("google")] .sort_values(by="EventTime") .head(10), ), ( - "Q24", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .sort_values(by="EventTime")[["SearchPhrase"]] .head(10), ), ( - "Q25", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .sort_values(by="SearchPhrase")[["SearchPhrase"]] .head(10), ), ( - "Q26", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .sort_values(by=["EventTime", "SearchPhrase"])[["SearchPhrase"]] .head(10), ), ( - "Q27", "SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", lambda x: x[x["URL"] != ""] .groupby("CounterID") @@ -211,7 +183,6 @@ .head(25), ), ( - "Q28", "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", lambda x: ( x[x["Referer"] != ""] @@ -226,101 +197,10 @@ ), ), ( - "Q29", "SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), 
SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;", - lambda x: x["ResolutionWidth"].sum() - + x["ResolutionWidth"].shift(1).sum() - + x["ResolutionWidth"].shift(2).sum() - + x["ResolutionWidth"].shift(3).sum() - + x["ResolutionWidth"].shift(4).sum() - + x["ResolutionWidth"].shift(5).sum() - + x["ResolutionWidth"].shift(6).sum() - + x["ResolutionWidth"].shift(7).sum() - + x["ResolutionWidth"].shift(8).sum() - + x["ResolutionWidth"].shift(9).sum() - + x["ResolutionWidth"].shift(10).sum() - + x["ResolutionWidth"].shift(11).sum() - + x["ResolutionWidth"].shift(12).sum() - + x["ResolutionWidth"].shift(13).sum() - + x["ResolutionWidth"].shift(14).sum() - + x["ResolutionWidth"].shift(15).sum() - + x["ResolutionWidth"].shift(16).sum() - + x["ResolutionWidth"].shift(17).sum() - + x["ResolutionWidth"].shift(18).sum() - + x["ResolutionWidth"].shift(19).sum() - + x["ResolutionWidth"].shift(20).sum() - + x["ResolutionWidth"].shift(21).sum() - + x["ResolutionWidth"].shift(22).sum() - + x["ResolutionWidth"].shift(23).sum() - + x["ResolutionWidth"].shift(24).sum() - + x["ResolutionWidth"].shift(25).sum() - + x["ResolutionWidth"].shift(26).sum() - + x["ResolutionWidth"].shift(27).sum() - + x["ResolutionWidth"].shift(28).sum() - + x["ResolutionWidth"].shift(29).sum() - + x["ResolutionWidth"].shift(30).sum() - + x["ResolutionWidth"].shift(31).sum() - + x["ResolutionWidth"].shift(32).sum() - + x["ResolutionWidth"].shift(33).sum() - + x["ResolutionWidth"].shift(34).sum() - + x["ResolutionWidth"].shift(35).sum() - + x["ResolutionWidth"].shift(36).sum() - + x["ResolutionWidth"].shift(37).sum() - + x["ResolutionWidth"].shift(38).sum() - + x["ResolutionWidth"].shift(39).sum() - + x["ResolutionWidth"].shift(40).sum() - + x["ResolutionWidth"].shift(41).sum() - + x["ResolutionWidth"].shift(42).sum() - + x["ResolutionWidth"].shift(43).sum() - + x["ResolutionWidth"].shift(44).sum() - + x["ResolutionWidth"].shift(45).sum() - + x["ResolutionWidth"].shift(46).sum() - + x["ResolutionWidth"].shift(47).sum() - + x["ResolutionWidth"].shift(48).sum() - + 
x["ResolutionWidth"].shift(49).sum() - + x["ResolutionWidth"].shift(50).sum() - + x["ResolutionWidth"].shift(51).sum() - + x["ResolutionWidth"].shift(52).sum() - + x["ResolutionWidth"].shift(53).sum() - + x["ResolutionWidth"].shift(54).sum() - + x["ResolutionWidth"].shift(55).sum() - + x["ResolutionWidth"].shift(56).sum() - + x["ResolutionWidth"].shift(57).sum() - + x["ResolutionWidth"].shift(58).sum() - + x["ResolutionWidth"].shift(59).sum() - + x["ResolutionWidth"].shift(60).sum() - + x["ResolutionWidth"].shift(61).sum() - + x["ResolutionWidth"].shift(62).sum() - + x["ResolutionWidth"].shift(63).sum() - + x["ResolutionWidth"].shift(64).sum() - + x["ResolutionWidth"].shift(65).sum() - + x["ResolutionWidth"].shift(66).sum() - + x["ResolutionWidth"].shift(67).sum() - + x["ResolutionWidth"].shift(68).sum() - + x["ResolutionWidth"].shift(69).sum() - + x["ResolutionWidth"].shift(70).sum() - + x["ResolutionWidth"].shift(71).sum() - + x["ResolutionWidth"].shift(72).sum() - + x["ResolutionWidth"].shift(73).sum() - + x["ResolutionWidth"].shift(74).sum() - + x["ResolutionWidth"].shift(75).sum() - + x["ResolutionWidth"].shift(76).sum() - + x["ResolutionWidth"].shift(77).sum() - + x["ResolutionWidth"].shift(78).sum() - + x["ResolutionWidth"].shift(79).sum() - + x["ResolutionWidth"].shift(80).sum() - + x["ResolutionWidth"].shift(81).sum() - + x["ResolutionWidth"].shift(82).sum() - + x["ResolutionWidth"].shift(83).sum() - + x["ResolutionWidth"].shift(84).sum() - + x["ResolutionWidth"].shift(85).sum() - + x["ResolutionWidth"].shift(86).sum() - + x["ResolutionWidth"].shift(87).sum() - + x["ResolutionWidth"].shift(88).sum() - + x["ResolutionWidth"].shift(89).sum(), - ), - ( - "Q30", + lambda x: sum(x["ResolutionWidth"].shift(i).sum() for i in range(90)), + ), + ( "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .groupby(["SearchEngineID", "ClientIP"]) @@ -332,7 +212,6 @@ .nlargest(10, "c"), ), ( - "Q31", "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .groupby(["WatchID", "ClientIP"]) @@ -344,7 +223,6 @@ .nlargest(10, "c"), ), ( - "Q32", "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x.groupby(["WatchID", "ClientIP"]) .agg( @@ -355,17 +233,14 @@ .nlargest(10, "c"), ), ( - "Q33", "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;", lambda x: x.groupby("URL").size().nlargest(10).reset_index(name="c"), ), ( - "Q34", "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", lambda x: x.groupby(["URL"]).size().nlargest(10).reset_index(name="c"), ), ( - "Q35", "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", lambda x: x.assign( **{f"ClientIP_minus_{i}": x["ClientIP"] - i for i in range(1, 4)} @@ -378,7 +253,6 @@ .reset_index(name="c"), ), ( - "Q36", "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", lambda x: x[ (x["CounterID"] == 62) @@ 
-393,7 +267,6 @@ .nlargest(10), ), ( - "Q37", "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", lambda x: x[ (x["CounterID"] == 62) @@ -408,7 +281,6 @@ .nlargest(10), ), ( - "Q38", "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", lambda x: x[ (x["CounterID"] == 62) @@ -425,7 +297,6 @@ .iloc[1000:1010], ), ( - "Q39", "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", lambda x: x[ (x["CounterID"] == 62) @@ -440,7 +311,6 @@ .iloc[1000:1010], ), ( - "Q40", "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", lambda x: x[ (x["CounterID"] == 62) @@ -457,7 +327,6 @@ .iloc[100:110], ), ( - "Q41", "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", lambda x: x[ (x["CounterID"] == 62) @@ -474,7 +343,6 @@ .iloc[10000:10010], ), ( - "Q42", "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", lambda x: x[ (x["CounterID"] == 62) @@ -490,15 +358,51 @@ ), ] -for q in queries: - # Flush OS page cache before first run of each query - subprocess.run(['sync'], check=True) - subprocess.run(['sudo', 'tee', '/proc/sys/vm/drop_caches'], input=b'3', check=True, stdout=subprocess.DEVNULL) +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +@app.get("/health") +def health(): + return {"ok": True} + + +@app.post("/load") +def load(): + global hits + start = timeit.default_timer() + df = pd.read_parquet("hits.parquet") + df["EventTime"] = pd.to_datetime(df["EventTime"], unit="s") + df["EventDate"] = pd.to_datetime(df["EventDate"], unit="D") + for col in df.columns: + if df[col].dtype == "O": + df[col] = df[col].astype(str) + hits = df + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + + +@app.post("/query") +async def query(request: Request): + if hits is None: + raise HTTPException(status_code=409, detail="DataFrame not loaded; POST /load first") + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + fn = QUERIES[idx][1] + start = timeit.default_timer() + fn(hits) + elapsed = 
round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed, "index": idx} + + +@app.get("/data-size") +def data_size(): + if hits is None: + return {"bytes": 0} + return {"bytes": int(hits.memory_usage().sum())} + - times = [] - for _ in range(3): - start = timeit.default_timer() - result = q[2](hits) - end = timeit.default_timer() - times.append(round(end - start, 3)) - print(times) +if __name__ == "__main__": + port = int(os.environ.get("BENCH_PANDAS_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/pandas/start b/pandas/start new file mode 100755 index 0000000000..e3fab72731 --- /dev/null +++ b/pandas/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! > server.pid diff --git a/pandas/stop b/pandas/stop new file mode 100755 index 0000000000..787b35abcc --- /dev/null +++ b/pandas/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/paradedb-partitioned/benchmark.sh b/paradedb-partitioned/benchmark.sh index 85e7e94fbd..6a7f45d3a1 100755 --- a/paradedb-partitioned/benchmark.sh +++ b/paradedb-partitioned/benchmark.sh @@ -1,62 +1,5 @@ #!/bin/bash - -PARADEDB_VERSION=latest - -cleanup() { - echo "Done, goodbye!" -} - -trap cleanup EXIT - -echo "" -echo "Installing dependencies..." -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client - -echo "" -echo "Pulling ParadeDB image..." -sudo docker run \ - --name paradedb \ - -e POSTGRESQL_USERNAME=myuser \ - -e POSTGRESQL_PASSWORD=mypassword \ - -e POSTGRESQL_DATABASE=mydb \ - -e POSTGRES_PASSWORD=postgres \ - -p 5432:5432 \ - -d \ - paradedb/paradedb:$PARADEDB_VERSION - -echo "" -echo "Downloading ClickBench dataset..." -if [ ! -e /tmp/partitioned/ ]; then - ../download-hits-parquet-partitioned /tmp/partitioned -fi -if ! sudo docker exec paradedb sh -c '[ -f /tmp/partitioned ]'; then - sudo docker cp /tmp/partitioned paradedb:tmp -fi - -echo "" -echo "Creating database..." -export PGPASSWORD='postgres' -psql -h localhost -U postgres -p 5432 -t < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -# load_time is zero, since the data is directly read from the Parquet file(s) -# Time: 0000000.000 ms (00:00.000) - -echo "" -echo "Running queries..." -./run.sh 2>&1 | tee log.txt - -# data_size is the Parquet file(s) total size -# 14779976446 - -echo "Data size: $(du -b /tmp/hits*.parquet)" -echo "Load time: 0" - -echo "" -echo "Parsing results..." -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
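+# BENCH_DOWNLOAD_SCRIPT names the shared dataset fetcher under ../ that the
+# common flow is expected to invoke before ./load; systems whose ./load fetches
+# the data itself (for example parseable/) leave it empty.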
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/paradedb-partitioned/check b/paradedb-partitioned/check new file mode 100755 index 0000000000..07fe18256c --- /dev/null +++ b/paradedb-partitioned/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +export PGPASSWORD='postgres' +psql -h localhost -U postgres -p 5432 -t -c 'SELECT 1' >/dev/null diff --git a/paradedb-partitioned/data-size b/paradedb-partitioned/data-size new file mode 100755 index 0000000000..e2e7183e55 --- /dev/null +++ b/paradedb-partitioned/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} +sudo docker exec -i "$CONTAINER_NAME" sh -c "du -bcs /tmp/partitioned 2>/dev/null | tail -n1 | awk '{print \$1}'" diff --git a/paradedb-partitioned/install b/paradedb-partitioned/install new file mode 100755 index 0000000000..df2628c8a7 --- /dev/null +++ b/paradedb-partitioned/install @@ -0,0 +1,14 @@ +#!/bin/bash +# ParadeDB's pg_lakehouse extension — which gave the partitioned-parquet +# variant its meaning — was discontinued upstream after the 0.10.x line. +# No current ParadeDB image ships parquet_fdw_handler, so this benchmark +# can no longer reproduce. The single-table paradedb/ directory was +# reworked to use pg_search; the "partitioned" variant has no analogous +# replacement, so it's preserved as a historical entry only. +# +# Refuse to run rather than silently produce broken or non-comparable +# numbers under the same system name as past entries. +echo "paradedb-partitioned: this benchmark is historical (pg_lakehouse" >&2 +echo "was removed from ParadeDB after 0.10.x). See paradedb/ for the" >&2 +echo "current pg_search-based variant." >&2 +exit 1 diff --git a/paradedb-partitioned/load b/paradedb-partitioned/load new file mode 100755 index 0000000000..867f146429 --- /dev/null +++ b/paradedb-partitioned/load @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} +export PGPASSWORD='postgres' + +# Move all hits_*.parquet files into the container at /tmp/partitioned/. +sudo docker exec -i "$CONTAINER_NAME" mkdir -p /tmp/partitioned +for f in hits_*.parquet; do + sudo docker cp "$f" "$CONTAINER_NAME":/tmp/partitioned/ +done + +psql -h localhost -U postgres -p 5432 -t -c "DROP FOREIGN TABLE IF EXISTS hits;" || true +psql -h localhost -U postgres -p 5432 -t -c "DROP SERVER IF EXISTS parquet_server CASCADE;" || true +psql -h localhost -U postgres -p 5432 -t -c "DROP FOREIGN DATA WRAPPER IF EXISTS parquet_wrapper CASCADE;" || true + +psql -h localhost -U postgres -p 5432 -v ON_ERROR_STOP=1 -t < create.sql + +rm -f hits_*.parquet +sync diff --git a/paradedb-partitioned/query b/paradedb-partitioned/query new file mode 100755 index 0000000000..ba9577725d --- /dev/null +++ b/paradedb-partitioned/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the default DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +export PGPASSWORD='postgres' +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | psql -h localhost -U postgres -p 5432 -t 2>&1) +status=$? 
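+# Note: psql exits 0 even when a statement fails (ON_ERROR_STOP is not set in
+# this invocation), so SQL errors are caught by grepping the output below;
+# connection failures already abort the script via set -e.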
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/paradedb-partitioned/results/20240713/c6a.4xlarge.json b/paradedb-partitioned/results/20240713/c6a.4xlarge.json index a7fe4d272f..789698ed01 100644 --- a/paradedb-partitioned/results/20240713/c6a.4xlarge.json +++ b/paradedb-partitioned/results/20240713/c6a.4xlarge.json @@ -6,8 +6,8 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "comment": "", - "tags": ["Rust", "column-oriented", "search", "PostgreSQL compatible", "lukewarm-cold-run"], + "comment": "Historical result. ParadeDB's pg_lakehouse / parquet_fdw extension (which made the partitioned-parquet variant possible) was discontinued by upstream after the 0.10.x line, and no current ParadeDB release can reproduce this benchmark. Kept for reference only.", + "tags": ["Rust", "column-oriented", "search", "PostgreSQL compatible", "lukewarm-cold-run", "historical"], "load_time": 0, "data_size": 14779976446, diff --git a/paradedb-partitioned/run.sh b/paradedb-partitioned/run.sh deleted file mode 100755 index 5f276eb74b..0000000000 --- a/paradedb-partitioned/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 -export PGPASSWORD='postgres' - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - psql -h localhost -U postgres -p 5432 -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/paradedb-partitioned/start b/paradedb-partitioned/start new file mode 100755 index 0000000000..5db7dd44cd --- /dev/null +++ b/paradedb-partitioned/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} + +if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" +fi diff --git a/paradedb-partitioned/stop b/paradedb-partitioned/stop new file mode 100755 index 0000000000..209823b903 --- /dev/null +++ b/paradedb-partitioned/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} +sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true diff --git a/paradedb/benchmark.sh b/paradedb/benchmark.sh index 0119121d04..531bd65038 100755 --- a/paradedb/benchmark.sh +++ b/paradedb/benchmark.sh @@ -1,50 +1,5 @@ #!/bin/bash - -PARADEDB_VERSION=0.10.0 - -echo "Installing dependencies..." -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client - -echo "Pulling ParadeDB image..." -sudo docker run \ - --name paradedb \ - -e POSTGRESQL_USERNAME=myuser \ - -e POSTGRESQL_PASSWORD=mypassword \ - -e POSTGRESQL_DATABASE=mydb \ - -e POSTGRES_PASSWORD=postgres \ - -p 5432:5432 \ - -d \ - paradedb/paradedb:$PARADEDB_VERSION - -echo "Downloading ClickBench dataset..." -if [ ! -e /tmp/hits.parquet ]; then - ../download-hits-parquet-single /tmp -fi -if ! sudo docker exec paradedb sh -c '[ -f /tmp/hits.parquet ]'; then - sudo docker cp /tmp/hits.parquet paradedb:/tmp/hits.parquet -fi - -echo "Creating database..." 
-export PGPASSWORD='postgres' -psql -h localhost -U postgres -p 5432 -t < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -# load_time is zero, since the data is directly read from the Parquet file(s) -# Time: 0000000.000 ms (00:00.000) -echo "Load time: 0" - -echo "Running queries..." -./run.sh 2>&1 | tee log.txt - -# data_size is the Parquet file(s) total size -# 14779976446 - -echo "Data size: $(du -b /tmp/hits.parquet)" - -echo "Parsing results..." -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/paradedb/check b/paradedb/check new file mode 100755 index 0000000000..07fe18256c --- /dev/null +++ b/paradedb/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +export PGPASSWORD='postgres' +psql -h localhost -U postgres -p 5432 -t -c 'SELECT 1' >/dev/null diff --git a/paradedb/create.sql b/paradedb/create.sql index 70ff561b25..19c90f34ca 100644 --- a/paradedb/create.sql +++ b/paradedb/create.sql @@ -1,10 +1,115 @@ -CREATE FOREIGN DATA WRAPPER parquet_wrapper - HANDLER parquet_fdw_handler - VALIDATOR parquet_fdw_validator; +-- ParadeDB ships pg_search (Tantivy-backed BM25) on top of Postgres. +-- The previous benchmark used pg_lakehouse for parquet access, but +-- that extension was removed upstream after 0.10.x. Use a regular +-- Postgres table loaded via COPY, plus a BM25 index on the text-heavy +-- columns to actually exercise pg_search. 
+CREATE EXTENSION IF NOT EXISTS pg_search; -CREATE SERVER parquet_server - FOREIGN DATA WRAPPER parquet_wrapper; - -CREATE FOREIGN TABLE IF NOT EXISTS hits () -SERVER parquet_server -OPTIONS (files '/tmp/hits.parquet'); +CREATE TABLE hits +( + WatchID BIGINT NOT NULL, + JavaEnable SMALLINT NOT NULL, + Title TEXT NOT NULL, + GoodEvent SMALLINT NOT NULL, + EventTime TIMESTAMP NOT NULL, + EventDate Date NOT NULL, + CounterID INTEGER NOT NULL, + ClientIP INTEGER NOT NULL, + RegionID INTEGER NOT NULL, + UserID BIGINT NOT NULL, + CounterClass SMALLINT NOT NULL, + OS SMALLINT NOT NULL, + UserAgent SMALLINT NOT NULL, + URL TEXT NOT NULL, + Referer TEXT NOT NULL, + IsRefresh SMALLINT NOT NULL, + RefererCategoryID SMALLINT NOT NULL, + RefererRegionID INTEGER NOT NULL, + URLCategoryID SMALLINT NOT NULL, + URLRegionID INTEGER NOT NULL, + ResolutionWidth SMALLINT NOT NULL, + ResolutionHeight SMALLINT NOT NULL, + ResolutionDepth SMALLINT NOT NULL, + FlashMajor SMALLINT NOT NULL, + FlashMinor SMALLINT NOT NULL, + FlashMinor2 TEXT NOT NULL, + NetMajor SMALLINT NOT NULL, + NetMinor SMALLINT NOT NULL, + UserAgentMajor SMALLINT NOT NULL, + UserAgentMinor VARCHAR(255) NOT NULL, + CookieEnable SMALLINT NOT NULL, + JavascriptEnable SMALLINT NOT NULL, + IsMobile SMALLINT NOT NULL, + MobilePhone SMALLINT NOT NULL, + MobilePhoneModel TEXT NOT NULL, + Params TEXT NOT NULL, + IPNetworkID INTEGER NOT NULL, + TraficSourceID SMALLINT NOT NULL, + SearchEngineID SMALLINT NOT NULL, + SearchPhrase TEXT NOT NULL, + AdvEngineID SMALLINT NOT NULL, + IsArtifical SMALLINT NOT NULL, + WindowClientWidth SMALLINT NOT NULL, + WindowClientHeight SMALLINT NOT NULL, + ClientTimeZone SMALLINT NOT NULL, + ClientEventTime TIMESTAMP NOT NULL, + SilverlightVersion1 SMALLINT NOT NULL, + SilverlightVersion2 SMALLINT NOT NULL, + SilverlightVersion3 INTEGER NOT NULL, + SilverlightVersion4 SMALLINT NOT NULL, + PageCharset TEXT NOT NULL, + CodeVersion INTEGER NOT NULL, + IsLink SMALLINT NOT NULL, + IsDownload SMALLINT NOT NULL, + IsNotBounce SMALLINT NOT NULL, + FUniqID BIGINT NOT NULL, + OriginalURL TEXT NOT NULL, + HID INTEGER NOT NULL, + IsOldCounter SMALLINT NOT NULL, + IsEvent SMALLINT NOT NULL, + IsParameter SMALLINT NOT NULL, + DontCountHits SMALLINT NOT NULL, + WithHash SMALLINT NOT NULL, + HitColor CHAR NOT NULL, + LocalEventTime TIMESTAMP NOT NULL, + Age SMALLINT NOT NULL, + Sex SMALLINT NOT NULL, + Income SMALLINT NOT NULL, + Interests SMALLINT NOT NULL, + Robotness SMALLINT NOT NULL, + RemoteIP INTEGER NOT NULL, + WindowName INTEGER NOT NULL, + OpenerName INTEGER NOT NULL, + HistoryLength SMALLINT NOT NULL, + BrowserLanguage TEXT NOT NULL, + BrowserCountry TEXT NOT NULL, + SocialNetwork TEXT NOT NULL, + SocialAction TEXT NOT NULL, + HTTPError SMALLINT NOT NULL, + SendTiming INTEGER NOT NULL, + DNSTiming INTEGER NOT NULL, + ConnectTiming INTEGER NOT NULL, + ResponseStartTiming INTEGER NOT NULL, + ResponseEndTiming INTEGER NOT NULL, + FetchTiming INTEGER NOT NULL, + SocialSourceNetworkID SMALLINT NOT NULL, + SocialSourcePage TEXT NOT NULL, + ParamPrice BIGINT NOT NULL, + ParamOrderID TEXT NOT NULL, + ParamCurrency TEXT NOT NULL, + ParamCurrencyID SMALLINT NOT NULL, + OpenstatServiceName TEXT NOT NULL, + OpenstatCampaignID TEXT NOT NULL, + OpenstatAdID TEXT NOT NULL, + OpenstatSourceID TEXT NOT NULL, + UTMSource TEXT NOT NULL, + UTMMedium TEXT NOT NULL, + UTMCampaign TEXT NOT NULL, + UTMContent TEXT NOT NULL, + UTMTerm TEXT NOT NULL, + FromTag TEXT NOT NULL, + HasGCLID SMALLINT NOT NULL, + RefererHash BIGINT NOT NULL, + URLHash BIGINT 
NOT NULL, + CLID INTEGER NOT NULL +); diff --git a/paradedb/data-size b/paradedb/data-size new file mode 100755 index 0000000000..52831ab812 --- /dev/null +++ b/paradedb/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +export PGPASSWORD='postgres' +psql -h localhost -U postgres -p 5432 -t -A \ + -c "SELECT pg_total_relation_size('hits')" diff --git a/paradedb/install b/paradedb/install new file mode 100755 index 0000000000..2dd046b964 --- /dev/null +++ b/paradedb/install @@ -0,0 +1,28 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} +# Track ParadeDB's current Postgres-17 build. The 0.10.0 pin (when this +# benchmark exercised pg_lakehouse / parquet_fdw) was rotated out of +# Docker Hub, and pg_lakehouse itself was dropped from ParadeDB after +# the 0.10.x line. The benchmark now uses pg_search instead — see +# create.sql. +PARADEDB_VERSION=${PARADEDB_VERSION:-latest-pg17} + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull "paradedb/paradedb:$PARADEDB_VERSION" + +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +sudo docker run -d \ + --name "$CONTAINER_NAME" \ + -e POSTGRESQL_USERNAME=myuser \ + -e POSTGRESQL_PASSWORD=mypassword \ + -e POSTGRESQL_DATABASE=mydb \ + -e POSTGRES_PASSWORD=postgres \ + -p 5432:5432 \ + "paradedb/paradedb:$PARADEDB_VERSION" diff --git a/paradedb/load b/paradedb/load new file mode 100755 index 0000000000..d6ef82eba1 --- /dev/null +++ b/paradedb/load @@ -0,0 +1,31 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} +export PGPASSWORD='postgres' + +# Drop+recreate so re-runs are idempotent. +psql -h localhost -U postgres -p 5432 -t -c "DROP INDEX IF EXISTS hits_bm25_idx;" || true +psql -h localhost -U postgres -p 5432 -t -c "DROP TABLE IF EXISTS hits;" || true + +psql -h localhost -U postgres -p 5432 -v ON_ERROR_STOP=1 -t < create.sql + +# Move the TSV into the container so server-side COPY can read it +# without round-tripping over the wire. +sudo docker cp hits.tsv "$CONTAINER_NAME":/tmp/hits.tsv +psql -h localhost -U postgres -p 5432 -v ON_ERROR_STOP=1 -t \ + -c "COPY hits FROM '/tmp/hits.tsv' WITH (FORMAT text);" + +# BM25 index on the text-heavy columns. Built after bulk load (cheaper +# than maintaining the index during COPY) and after VACUUM ANALYZE so +# planner stats are fresh. 
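+# (key_field is the column pg_search uses as the unique row identifier for the
+# Tantivy-backed bm25 index; WatchID serves that purpose here.)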
+psql -h localhost -U postgres -p 5432 -v ON_ERROR_STOP=1 -t \ + -c "VACUUM ANALYZE hits;" +psql -h localhost -U postgres -p 5432 -v ON_ERROR_STOP=1 -t \ + -c "CREATE INDEX hits_bm25_idx ON hits + USING bm25 (WatchID, URL, Title, SearchPhrase, Referer) + WITH (key_field='WatchID');" + +sudo docker exec "$CONTAINER_NAME" rm -f /tmp/hits.tsv +rm -f hits.tsv +sync diff --git a/paradedb/queries.sql b/paradedb/queries.sql index 8d394088e8..31f65fc898 100644 --- a/paradedb/queries.sql +++ b/paradedb/queries.sql @@ -4,7 +4,7 @@ SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; SELECT AVG(UserID) FROM hits; SELECT COUNT(DISTINCT UserID) FROM hits; SELECT COUNT(DISTINCT SearchPhrase) FROM hits; -SELECT DATE '1970-01-01' + MIN(EventDate) * INTERVAL '1 day' AS min, DATE '1970-01-01' + MAX(EventDate) * INTERVAL '1 day' AS max FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; @@ -16,17 +16,17 @@ SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; -SELECT UserID, extract(minute FROM CAST(to_timestamp(EventTime) AS TIMESTAMP)) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; SELECT UserID FROM hits WHERE UserID = 435090932899640449; -SELECT COUNT(*) FROM hits WHERE URL::VARCHAR LIKE '%google%'; -SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL::VARCHAR LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; -SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title::VARCHAR LIKE '%Google%' AND URL::VARCHAR NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; -SELECT * FROM hits WHERE URL::VARCHAR LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; -SELECT CounterID, AVG(length(URL::VARCHAR)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; -SELECT 
REGEXP_REPLACE(Referer::VARCHAR, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer::VARCHAR)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; @@ -34,10 +34,10 @@ SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FR SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c 
DESC LIMIT 10; SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; -SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND (DATE '1970-01-01' + EventDate * INTERVAL '1 day') >= '2013-07-01' AND (DATE '1970-01-01' + EventDate * INTERVAL '1 day') <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; -SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND (DATE '1970-01-01' + EventDate * INTERVAL '1 day') >= '2013-07-01' AND (DATE '1970-01-01' + EventDate * INTERVAL '1 day') <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; -SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND (DATE '1970-01-01' + EventDate * INTERVAL '1 day') >= '2013-07-01' AND (DATE '1970-01-01' + EventDate * INTERVAL '1 day') <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; -SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND (DATE '1970-01-01' + EventDate * INTERVAL '1 day') >= '2013-07-01' AND (DATE '1970-01-01' + EventDate * INTERVAL '1 day') <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; -SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND (DATE '1970-01-01' + EventDate * INTERVAL '1 day') >= '2013-07-01' AND (DATE '1970-01-01' + EventDate * INTERVAL '1 day') <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; -SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND (DATE '1970-01-01' + EventDate * INTERVAL '1 day') >= '2013-07-01' AND (DATE '1970-01-01' + EventDate * INTERVAL '1 day') <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; -SELECT DATE_TRUNC('minute', CAST(to_timestamp(EventTime) AS TIMESTAMP)) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND (DATE '1970-01-01' + EventDate * INTERVAL '1 day') >= '2013-07-14' AND (DATE '1970-01-01' + EventDate * INTERVAL '1 day') <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', CAST(to_timestamp(EventTime) AS TIMESTAMP)) ORDER BY DATE_TRUNC('minute', CAST(to_timestamp(EventTime) AS TIMESTAMP)) LIMIT 10 OFFSET 1000; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND 
IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/paradedb/query b/paradedb/query new file mode 100755 index 0000000000..ba9577725d --- /dev/null +++ b/paradedb/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the default DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +export PGPASSWORD='postgres' +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | psql -h localhost -U postgres -p 5432 -t 2>&1) +status=$? 
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/paradedb/run.sh b/paradedb/run.sh deleted file mode 100755 index 5f276eb74b..0000000000 --- a/paradedb/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 -export PGPASSWORD='postgres' - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - psql -h localhost -U postgres -p 5432 -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/paradedb/start b/paradedb/start new file mode 100755 index 0000000000..5db7dd44cd --- /dev/null +++ b/paradedb/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} + +if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" +fi diff --git a/paradedb/stop b/paradedb/stop new file mode 100755 index 0000000000..209823b903 --- /dev/null +++ b/paradedb/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} +sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true diff --git a/paradedb/template.json b/paradedb/template.json index 439f218afb..0862418ea9 100644 --- a/paradedb/template.json +++ b/paradedb/template.json @@ -1,11 +1,11 @@ { - "system": "ParadeDB (Parquet, single)", + "system": "ParadeDB", "proprietary": "no", "hardware": "cpu", "tuned": "no", "tags": [ "Rust", - "column-oriented", + "row-oriented", "search", "PostgreSQL compatible", "lukewarm-cold-run" diff --git a/parseable/benchmark.sh b/parseable/benchmark.sh index aee27a28d6..bbfe34d76a 100755 --- a/parseable/benchmark.sh +++ b/parseable/benchmark.sh @@ -1,53 +1,6 @@ -# Install Dependencies -sudo apt-get update -y -sudo apt-get install -y parallel -# Determine instance type based on CPU and memory -CPU_COUNT=$(nproc) - -if [ $CPU_COUNT -ge 190 ]; then - export P_EXECUTION_BATCH_SIZE=75000 - echo "Setting P_EXECUTION_BATCH_SIZE=75000 (detected c6a.metal equivalent)" -elif [ $CPU_COUNT -ge 15 ]; then - export P_EXECUTION_BATCH_SIZE=40000 - echo "Setting P_EXECUTION_BATCH_SIZE=40000 (detected c6a.4xlarge equivalent)" -else - # Default for other configurations - export P_EXECUTION_BATCH_SIZE=1000000 - echo "Using default P_EXECUTION_BATCH_SIZE=1000000 for default configuration" -fi - -# Download Parseable v2.5.12 binary -wget --continue --progress=dot:giga https://github.com/parseablehq/parseable/releases/download/v2.5.12/Parseable_OSS_x86_64-unknown-linux-gnu -mv Parseable_OSS_x86_64-unknown-linux-gnu parseable -chmod +x parseable - -# Run Parseable -export RUST_LOG=warn - -./parseable local-store > parseable.log 2>&1 & PARSEABLE_PID=$! -# Verify Parseable is running -if ps -p $PARSEABLE_PID > /dev/null; then - echo "Parseable is running with PID: $PARSEABLE_PID" -else - echo "Error: Parseable failed to start. Check parseable.log for details." 
- exit 1 -fi - -chmod +x ingestion.sh -chmod +x run_query.sh - -#run ingestion script -echo -n "Load time: " -command time -f '%e' ./ingestion.sh - -#run query script -./run_query.sh - -#view results -cat result.csv | sed -r -e 's/^([0-9\.]+) ([0-9\.]+) ([0-9\.]+)$/[\1, \2, \3]/' - -echo -n "Data size: " -du -bcs local-store | grep total - -#kill parseable -kill $PARSEABLE_PID +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +# parseable ingests gzipped NDJSON; ./load fetches it directly. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/parseable/check b/parseable/check new file mode 100755 index 0000000000..8de868e04a --- /dev/null +++ b/parseable/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sSf -u admin:admin 'http://localhost:8000/api/v1/about' >/dev/null diff --git a/parseable/data-size b/parseable/data-size new file mode 100755 index 0000000000..559b25d677 --- /dev/null +++ b/parseable/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +du -bcs local-store | grep total | awk '{print $1}' diff --git a/parseable/ingestion.sh b/parseable/ingestion.sh deleted file mode 100755 index ca782477fb..0000000000 --- a/parseable/ingestion.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash - -echo "Installing Parallel pigz pv..." -sudo apt-get update -y && sudo apt-get install -y parallel pigz pv - -# Set number of cores for parallel processing -NUM_CORES=$(nproc) - -echo "Downloading dataset..." -wget --progress=bar:force --show-progress https://datasets.clickhouse.com/hits_compatible/hits.json.gz - -echo "Decompressing dataset..." -# Get file size for progress reporting -FILE_SIZE=$(stat -c %s hits.json.gz) -pv -s $FILE_SIZE hits.json.gz | pigz -d > hits.json - -# Split file into chunks of 2500 lines and process them -echo "Splitting file and processing chunks in parallel..." - -# Create partitioned directory if it doesn't exist -mkdir -p partitioned - -# Define processing function that will be applied immediately after splitting -split_and_process() { - local chunk_num=$1 - local content=$(cat) - local output_file="./partitioned/hits_${chunk_num}.json" - - # Format with brackets and commas in one step - ( - echo "[" - echo "$content" | sed '$!s/$/,/' - echo "]" - ) > "$output_file" -} -export -f split_and_process - -LINES_PER_CHUNK=2500 - -pv hits.json | parallel --pipe -N$LINES_PER_CHUNK --block 10M \ - --jobs $NUM_CORES split_and_process {#} - -echo "Split and process complete" - -# Remove original file -rm hits.json - -# Create stream -echo "Creating stream..." -SCHEMA_FILE="static_schema.json" -curl --silent --location --request PUT 'http://localhost:8000/api/v1/logstream/hits' \ - -H 'X-P-Static-Schema-Flag: true' \ - -H 'Content-Type: application/json' \ - -u "admin:admin" \ - --data-binary @"${SCHEMA_FILE}" - -# Ingest files in parallel with progress monitoring -echo "Ingesting files..." - -INGEST_JOBS=6 -start_time=$(date +%s) -find . 
-name "hits_*" -type f | parallel --progress --jobs $INGEST_JOBS \ - 'curl --silent -H "Content-Type: application/json" -H "X-P-Stream: hits" -k -XPOST -u "admin:admin" "http://localhost:8000/api/v1/ingest" --data-binary @"{}"' - -#sleep for 3 minutes to allow sync to complete -sleep 180 - -end_time=$(date +%s) -total_time=$((end_time - start_time)) - -echo "Total load (ingestion) time: ${total_time} seconds" diff --git a/parseable/install b/parseable/install new file mode 100755 index 0000000000..9fcb8ffa1d --- /dev/null +++ b/parseable/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y parallel pigz pv + +if [ ! -x ./parseable ]; then + wget --continue --progress=dot:giga \ + https://github.com/parseablehq/parseable/releases/download/v2.5.12/Parseable_OSS_x86_64-unknown-linux-gnu + mv Parseable_OSS_x86_64-unknown-linux-gnu parseable + chmod +x parseable +fi diff --git a/parseable/load b/parseable/load new file mode 100755 index 0000000000..3f74150940 --- /dev/null +++ b/parseable/load @@ -0,0 +1,51 @@ +#!/bin/bash +set -eu + +NUM_CORES=$(nproc) + +wget --continue --progress=dot:giga \ + 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' + +# Decompress with progress. +FILE_SIZE=$(stat -c %s hits.json.gz) +pv -s "$FILE_SIZE" hits.json.gz | pigz -d > hits.json + +# Split into chunks wrapped in [ ... , ... ] arrays for parseable's ingest API. +mkdir -p partitioned +rm -f partitioned/hits_*.json + +split_and_process() { + local chunk_num=$1 + local content + content=$(cat) + { + echo "[" + echo "$content" | sed '$!s/$/,/' + echo "]" + } > "./partitioned/hits_${chunk_num}.json" +} +export -f split_and_process + +LINES_PER_CHUNK=2500 +pv hits.json | parallel --pipe -N$LINES_PER_CHUNK --block 10M \ + --jobs "$NUM_CORES" split_and_process {#} + +rm -f hits.json hits.json.gz + +# Create the stream. +curl --silent --location --request PUT 'http://localhost:8000/api/v1/logstream/hits' \ + -H 'X-P-Static-Schema-Flag: true' \ + -H 'Content-Type: application/json' \ + -u "admin:admin" \ + --data-binary @static_schema.json >/dev/null + +# Parallel ingest of chunks. +INGEST_JOBS=6 +find partitioned -name "hits_*" -type f | parallel --jobs $INGEST_JOBS \ + 'curl --silent -H "Content-Type: application/json" -H "X-P-Stream: hits" -k -XPOST -u "admin:admin" "http://localhost:8000/api/v1/ingest" --data-binary @"{}"' + +# Allow sync to complete. +sleep 180 + +rm -rf partitioned +sync diff --git a/parseable/query b/parseable/query new file mode 100755 index 0000000000..7603f63f0d --- /dev/null +++ b/parseable/query @@ -0,0 +1,31 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via parseable's HTTP /api/v1/query. +# Stdout: query result (JSON). +# Stderr: query runtime in fractional seconds on the last line (wall-clock). +# Exit non-zero on error. +set -e + +query=$(cat) + +CURRENT_DATE=$(date +%Y-%m-%d) +START_TIME="${CURRENT_DATE}T00:00:00.000Z" +END_TIME="${CURRENT_DATE}T23:59:00.000Z" + +# JSON-escape quotes inside the query. +escaped=$(printf '%s' "$query" | sed 's/"/\\"/g') +JSON=$(printf '{"query":"%s","startTime":"%s","endTime":"%s"}' "$escaped" "$START_TIME" "$END_TIME") + +t1=$(date +%s.%N) +out=$(curl -sS -H "Content-Type: application/json" -k -XPOST \ + -u "admin:admin" 'http://localhost:8000/api/v1/query' \ + --data "$JSON") && exit_code=0 || exit_code=$? 
+t2=$(date +%s.%N) + +if [ "$exit_code" -ne 0 ]; then + printf '%s\n' "$out" >&2 + exit "$exit_code" +fi + +printf '%s\n' "$out" + +awk -v a="$t1" -v b="$t2" 'BEGIN { printf "%.6f\n", b - a }' >&2 diff --git a/parseable/run_query.sh b/parseable/run_query.sh deleted file mode 100755 index 3e7c162ffc..0000000000 --- a/parseable/run_query.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash - -echo "Running queries..." -TRIES=3 -QUERY_NUM=1 -rm -f result.csv - -# Get current date in YYYY-MM-DD format -CURRENT_DATE=$(date +%Y-%m-%d) -START_TIME="${CURRENT_DATE}T00:00:00.000Z" -END_TIME="${CURRENT_DATE}T23:59:00.000Z" - -cat 'queries.sql' | while read -r QUERY; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - echo "$QUERY" > /tmp/query.sql - echo "Query $QUERY_NUM: $QUERY" - QUERY=$(echo "$QUERY" | sed 's/"/\\"/g') - # Create array to store results for this query - RESULTS=() - - for i in $(seq 1 $TRIES); do - echo "Iteration $i:" -JSON=$(printf '{"query":"%s","startTime":"%s","endTime":"%s"}' "$QUERY" "$START_TIME" "$END_TIME") - - start_time=$(date +%s.%N) - - # Execute the query and print the response to terminal - curl -s -H "Content-Type: application/json" -k -XPOST -u "admin:admin" "http://localhost:8000/api/v1/query" --data "${JSON}" > /dev/null - end_time=$(date +%s.%N) - - # Calculate elapsed time in seconds with millisecond precision - elapsed_time=$(echo "$end_time - $start_time" | bc) - # Convert to desired format - RES=$(printf "%.9f" $elapsed_time) - - # Store result in array - RESULTS+=("$RES") - - echo "Time: $RES seconds" - echo "----------------------------------------" - done - - # Output results to CSV with tab separation - echo -e "${RESULTS[0]} ${RESULTS[1]} ${RESULTS[2]}" >> result.csv - - echo "Query $QUERY_NUM completed. [${RESULTS[0]}, ${RESULTS[1]}, ${RESULTS[2]}]" - echo "========================================" - QUERY_NUM=$((QUERY_NUM + 1)) -done - -echo "Benchmark completed. Results saved to result.csv" \ No newline at end of file diff --git a/parseable/start b/parseable/start new file mode 100755 index 0000000000..d7badddb62 --- /dev/null +++ b/parseable/start @@ -0,0 +1,21 @@ +#!/bin/bash +set -eu + +# Idempotent: if already up, do nothing. +if curl -sSf -u admin:admin 'http://localhost:8000/api/v1/about' >/dev/null 2>&1; then + exit 0 +fi + +# Tune batch size by hardware (matches original). 
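+# For reference, with the tiers below a 16-vCPU c6a.4xlarge resolves to
+# P_EXECUTION_BATCH_SIZE=40000, a 192-vCPU c6a.metal to 75000, and anything
+# under 15 cores keeps the large 1000000 batch.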
+CPU_COUNT=$(nproc) +if [ "$CPU_COUNT" -ge 190 ]; then + export P_EXECUTION_BATCH_SIZE=75000 +elif [ "$CPU_COUNT" -ge 15 ]; then + export P_EXECUTION_BATCH_SIZE=40000 +else + export P_EXECUTION_BATCH_SIZE=1000000 +fi +export RUST_LOG=warn + +nohup ./parseable local-store > parseable.log 2>&1 & +disown diff --git a/parseable/stop b/parseable/stop new file mode 100755 index 0000000000..004cb12f0e --- /dev/null +++ b/parseable/stop @@ -0,0 +1,11 @@ +#!/bin/bash + +pid=$(pidof parseable 2>/dev/null || true) +if [ -n "$pid" ]; then + kill $pid 2>/dev/null || true + for _ in $(seq 1 30); do + pidof parseable >/dev/null 2>&1 || exit 0 + sleep 1 + done + sudo killall -9 parseable 2>/dev/null || true +fi diff --git a/pg_clickhouse/benchmark.sh b/pg_clickhouse/benchmark.sh index 249734de88..6a7f45d3a1 100755 --- a/pg_clickhouse/benchmark.sh +++ b/pg_clickhouse/benchmark.sh @@ -1,18 +1,5 @@ #!/bin/bash - -# apt-get update -y -# env DEBIAN_FRONTEND=noninteractive apt-get install -y wget curl sudo -# env TOTAL_PARTITIONS=1 EXPLAIN=1 ./benchmark.sh - -# Install and start ClickHouse and Postgres -./clickhouse.sh "$@" -./postgres.sh - -# Run the queries -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" - -cat log.txt | grep -oP '^Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/pg_clickhouse/check b/pg_clickhouse/check new file mode 100755 index 0000000000..9988f4f586 --- /dev/null +++ b/pg_clickhouse/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# Both backends must respond. +clickhouse-client --query "SELECT 1" >/dev/null +sudo -u postgres psql -t -c 'SELECT 1' >/dev/null diff --git a/pg_clickhouse/clickhouse.sh b/pg_clickhouse/clickhouse.sh deleted file mode 100755 index 1e8ae363c3..0000000000 --- a/pg_clickhouse/clickhouse.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Install - -if [ ! -x /usr/bin/clickhouse ] -then - cd /tmp || exit - curl https://clickhouse.com/ | sh - sudo ./clickhouse install --noninteractive - rm clickhouse - cd - || exit -fi - -# Optional: if you want to use higher compression: -if (( 0 )); then - echo " -compression: - case: - method: zstd - " | sudo tee /etc/clickhouse-server/config.d/compression.yaml -fi; - -sudo clickhouse start - -for _ in {1..300} -do - clickhouse-client --query "SELECT 1" && break - sleep 1 -done - -# Determine which set of files to use depending on the type of run -if [ "$1" != "" ] && [ "$1" != "tuned" ] && [ "$1" != "tuned-memory" ]; then - echo "Error: command line argument must be one of {'', 'tuned', 'tuned-memory'}" - exit 1 -elif [ ! 
-z "$1" ]; then - SUFFIX="-$1" -fi - -# Load the data - -clickhouse-client < create"$SUFFIX".sql - -TOTAL_PARTITIONS=${TOTAL_PARTITIONS:-100} - -seq 0 "$((TOTAL_PARTITIONS-1))" | xargs -P100 -I{} bash -c 'wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet' -mkdir -p /var/lib/clickhouse/user_files -sudo mv hits_*.parquet /var/lib/clickhouse/user_files/ -sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet - -sync - -start=$(date +%s.%N) - -clickhouse-client --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads $(( $(nproc) / 4 )) -sync - -end=$(date +%s.%N) -elapsed=$(echo "$end - $start" | bc) - -echo "Load time: $elapsed s" diff --git a/pg_clickhouse/data-size b/pg_clickhouse/data-size new file mode 100755 index 0000000000..1ac73ce98e --- /dev/null +++ b/pg_clickhouse/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +# Data is in ClickHouse, not Postgres. Report the hits table on-disk size. +clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" diff --git a/pg_clickhouse/install b/pg_clickhouse/install new file mode 100755 index 0000000000..a5d67fcd7b --- /dev/null +++ b/pg_clickhouse/install @@ -0,0 +1,55 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} + +export DEBIAN_FRONTEND=noninteractive + +# --- ClickHouse --- +if [ ! -x /usr/bin/clickhouse ]; then + cd /tmp + curl https://clickhouse.com/ | sh + sudo ./clickhouse install --noninteractive + rm -f clickhouse + cd - +fi + +# --- PostgreSQL + pg_clickhouse --- +sudo apt-get update -y +sudo apt-get install -y postgresql-common +sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y + +sudo apt-get update -y +sudo apt-get install -y \ + postgresql-$PGVERSION \ + postgresql-server-dev-$PGVERSION \ + libcurl4-openssl-dev \ + uuid-dev \ + libssl-dev \ + make \ + cmake \ + g++ \ + pgxnclient + +memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo) +threads=$(nproc) +cpus=$(($threads / 2)) +shared_buffers=$(($memory / 4)) +effective_cache_size=$(($memory - ($memory / 4))) +max_worker_processes=$(($threads + 15)) + +sudo tee /etc/postgresql/$PGVERSION/main/conf.d/clickbench.conf < /dev/null 2>&1 ; then - sudo systemctl restart "postgresql@$PGVERSION-main" -else - sudo /etc/init.d/postgresql start -fi - -sudo env DEBIAN_FRONTEND=noninteractive apt-get install -y \ - libcurl4-openssl-dev \ - uuid-dev \ - libssl-dev \ - make \ - cmake \ - g++ \ - pgxnclient - -# Setup the database. -pgxn install pg_clickhouse - -sudo -u postgres psql -t -c 'CREATE DATABASE test' -sudo -u postgres psql test -f create-postgres.sql 2>&1 | tee pg_load_out.txt -if grep 'ERROR' pg_load_out.txt -then - exit 1 -fi diff --git a/pg_clickhouse/query b/pg_clickhouse/query new file mode 100755 index 0000000000..5bbe3cff5c --- /dev/null +++ b/pg_clickhouse/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the `test` DB +# (which proxies to ClickHouse via the pg_clickhouse FDW). +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | sudo -u postgres psql --no-psqlrc --tuples-only test 2>&1) +status=$? 
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP '^Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/pg_clickhouse/results/20260509/c6a.4xlarge.json b/pg_clickhouse/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..a1db093654 --- /dev/null +++ b/pg_clickhouse/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "pg_clickhouse", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C","column-oriented","PostgreSQL compatible","lukewarm-cold-run"], + "load_time": 275, + "data_size": 15306021868, + "result": [ + [0.04, 0.005, 0.005], + [0.164, 0.011, 0.011], + [0.121, 0.024, 0.025], + [0.162, 0.032, 0.031], + [0.424, 0.269, 0.26], + [0.796, 0.587, 0.581], + [0.089, 0.014, 0.014], + [0.181, 0.015, 0.014], + [0.61, 0.427, 0.424], + [0.671, 0.486, 0.486], + [0.3, 0.152, 0.15], + [0.299, 0.153, 0.151], + [0.783, 0.54, 0.528], + [1.69, 0.773, 0.773], + [0.964, 0.57, 0.545], + [0.561, 0.385, 0.387], + [2.173, 1.723, 1.621], + [1.517, 0.959, 1.004], + [4.568, 2.945, 3.047], + [0.221, 0.006, 0.006], + [9.615, 0.315, 0.307], + [10.629, 0.086, 0.088], + [13.679, 0.658, 0.659], + [1.457, 0.093, 0.106], + [1.008, 0.034, 0.035], + [0.957, 0.146, 0.145], + [0.835, 0.069, 0.035], + [0.576, 0.084, 0.084], + [10.092, 9.628, 9.63], + [0.125, 0.035, 0.035], + [0.49, 0.312, 0.292], + [3.553, 0.562, 0.529], + [5.907, 4.23, 4.226], + [10.836, 3.083, 3.117], + [10.83, 3.158, 3.197], + [1.039, 1.032, 0.828], + [0.251, 0.053, 0.055], + [0.207, 0.029, 0.026], + [0.215, 0.023, 0.023], + [0.295, 0.087, 0.087], + [0.202, 0.018, 0.021], + [0.191, 0.015, 0.016], + [0.193, 0.014, 0.014] +] +} + diff --git a/pg_clickhouse/results/20260509/c6a.metal.json b/pg_clickhouse/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..7a2beb410c --- /dev/null +++ b/pg_clickhouse/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "pg_clickhouse", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C","column-oriented","PostgreSQL compatible","lukewarm-cold-run"], + "load_time": 229, + "data_size": 15289155267, + "result": [ + [0.198, 0.006, 0.005], + [0.442, 0.034, 0.015], + [0.681, 0.04, 0.034], + [0.624, 0.021, 0.022], + [0.748, 0.098, 0.086], + [1.244, 0.145, 0.133], + [0.135, 0.017, 0.017], + [0.791, 0.022, 0.021], + [1.949, 0.336, 0.328], + [1.473, 0.277, 0.314], + [0.897, 0.226, 0.222], + [0.981, 0.126, 0.128], + [2.384, 0.183, 0.204], + [3.252, 0.238, 0.247], + [2.58, 0.182, 0.2], + [1.438, 0.16, 0.145], + [3.238, 0.368, 0.331], + [2.889, 0.265, 0.291], + [4.096, 0.645, 0.617], + [1.432, 0.007, 0.007], + [9.864, 0.102, 0.131], + [10.889, 0.153, 0.177], + [14.056, 0.235, 0.272], + [3.986, 0.1, 0.092], + [2.202, 0.036, 0.033], + [2.02, 0.054, 0.045], + [2.219, 0.228, 0.04], + [2.352, 0.048, 0.095], + [9.337, 1.474, 1.556], + [0.922, 0.065, 0.046], + [2.633, 0.11, 0.135], + [4.569, 0.166, 0.177], + [5.165, 1.166, 1.282], + [10.296, 0.786, 0.761], + [10.695, 0.83, 0.792], + [1.212, 0.199, 0.166], + [1.086, 0.076, 0.076], + [1.673, 0.041, 0.038], + [1.716, 0.039, 0.028], + [2.093, 0.137, 
0.14], + [1.118, 0.024, 0.022], + [1.471, 0.091, 0.016], + [1.021, 0.034, 0.015] +] +} + diff --git a/pg_clickhouse/run.sh b/pg_clickhouse/run.sh deleted file mode 100755 index 93f6fc8d1a..0000000000 --- a/pg_clickhouse/run.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -TRIES=3 -prefix="" -if [ -n "$EXPLAIN" ]; then - prefix="EXPLAIN (ANALYZE, VERBOSE) " -fi - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - ( - echo '\timing' - yes "$prefix$query" | head -n $TRIES - ) | sudo -u postgres psql -e --no-psqlrc --tuples-only test 2>&1 # | grep -P 'Time|psql: error' -done diff --git a/pg_clickhouse/start b/pg_clickhouse/start new file mode 100755 index 0000000000..7a21cd5755 --- /dev/null +++ b/pg_clickhouse/start @@ -0,0 +1,10 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} + +# Start ClickHouse server. +sudo clickhouse start || true + +# Start PostgreSQL. +sudo systemctl start postgresql@$PGVERSION-main diff --git a/pg_clickhouse/stop b/pg_clickhouse/stop new file mode 100755 index 0000000000..6bc1a37dbf --- /dev/null +++ b/pg_clickhouse/stop @@ -0,0 +1,5 @@ +#!/bin/bash + +PGVERSION=${PGVERSION:-17} +sudo systemctl stop postgresql@$PGVERSION-main || true +sudo clickhouse stop || true diff --git a/pg_duckdb-indexed/benchmark.sh b/pg_duckdb-indexed/benchmark.sh index 4a8fee7777..531bd65038 100755 --- a/pg_duckdb-indexed/benchmark.sh +++ b/pg_duckdb-indexed/benchmark.sh @@ -1,82 +1,5 @@ #!/bin/bash - -set -eu - -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client - -../download-hits-tsv - -memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo) -threads=$(nproc) -cpus=$(($threads / 2)) -# Shared buffers is set to 25% of memory in AWS RDS by default. We do the same. -# https://docs.aws.amazon.com/prescriptive-guidance/latest/tuning-postgresql-parameters/shared-buffers.html -shared_buffers=$(($memory / 4)) -# Effective cache size does not need to be perfect, but it should be somewhat -# close to the total memory minus what is expected to be used for queries. -# https://www.cybertec-postgresql.com/en/effective_cache_size-what-it-means-in-postgresql/ -effective_cache_size=$(($memory - ($memory / 4))) -# By default, max_worker_processes is set to in postgres. We want to be able to -# use all the threads for parallel workers so we increase it. We also add a -# small buffer of 15 for any other background workers that might be created. -max_worker_processes=$(($threads + 15)) -# We also give DuckDB 25% of the memory to work with. -duckdb_memory=$(($memory / 4)) -# Below we make sure to configure the rest of the parallel worker settings to -# match the number of cpu cores: -# https://www.crunchydata.com/blog/postgres-tuning-and-performance-for-analytics-data -# -# We also increase work_mem because we are doing an analytics workload to allow -# some more memory for sorting, aggregations, etc. -# -# It's necessary to increase max_wal_size to make the dataload not take very -# long. With the default value it's constantly checkpointing, and the PG logs -# warn you about that and tell you to increase max_wal_size. 
- -sudo docker run -d --name pgduck -p 5432:5432 -e POSTGRES_PASSWORD=duckdb pgduckdb/pgduckdb:17-v1.0.0 - -sleep 2 - -sudo docker exec -i pgduck bash -c " -cat >> /var/lib/postgresql/data/postgresql.conf <<'EOF' -shared_buffers=${shared_buffers}kB -max_worker_processes=${max_worker_processes} -max_parallel_workers=${threads} -max_parallel_maintenance_workers=${cpus} -max_parallel_workers_per_gather=${cpus} -duckdb.max_workers_per_postgres_scan=${cpus} -max_wal_size=32GB -work_mem=64MB -effective_cache_size=${effective_cache_size}kB -duckdb.max_memory=${duckdb_memory}kB -EOF -" - -docker restart pgduck - -export PGUSER=postgres -export PGPASSWORD=duckdb -export CONNECTION=postgres://postgres:duckdb@localhost:5432/postgres - -for _ in {1..300} -do - psql $CONNECTION -t < create.sql 2>&1 | tee load_out.txt && break - sleep 1 -done -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -echo -n "Load time: " -command time -f '%e' ./load.sh - -psql $CONNECTION -c "ALTER DATABASE postgres SET duckdb.force_execution = true;" -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -docker exec -i pgduck du -bcs /var/lib/postgresql/data | grep total - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/pg_duckdb-indexed/check b/pg_duckdb-indexed/check new file mode 100755 index 0000000000..917d67d78e --- /dev/null +++ b/pg_duckdb-indexed/check @@ -0,0 +1,3 @@ +#!/bin/bash +set -e +PGPASSWORD=duckdb psql -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null diff --git a/pg_duckdb-indexed/data-size b/pg_duckdb-indexed/data-size new file mode 100755 index 0000000000..c5ef7ac005 --- /dev/null +++ b/pg_duckdb-indexed/data-size @@ -0,0 +1,3 @@ +#!/bin/bash +set -eu +sudo docker exec pgduck du -bcs /var/lib/postgresql/data | grep total | awk '{print $1}' diff --git a/pg_duckdb-indexed/install b/pg_duckdb-indexed/install new file mode 100755 index 0000000000..c17febb8dd --- /dev/null +++ b/pg_duckdb-indexed/install @@ -0,0 +1,7 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull pgduckdb/pgduckdb:17-v1.0.0 diff --git a/pg_duckdb-indexed/load b/pg_duckdb-indexed/load new file mode 100755 index 0000000000..ff11a5b956 --- /dev/null +++ b/pg_duckdb-indexed/load @@ -0,0 +1,28 @@ +#!/bin/bash +set -eu + +export PGUSER=postgres +export PGPASSWORD=duckdb +CONNECTION=postgres://postgres:duckdb@127.0.0.1:5432/postgres + +threads=$(nproc) +cpus=$(( threads / 2 )) + +psql "$CONNECTION" -v ON_ERROR_STOP=1 -t < create.sql + +# Parallel COPY in $cpus shards via `split -n r/N --filter=...` so the load +# isn't single-threaded. Explicit column list keeps the COPY ordering +# stable across shards (the file is not header-prefixed). 
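+#
+# Illustrative expansion of the split/--filter pipeline below (a hypothetical
+# 4-way split, not what literally runs):
+#   split hits.tsv -n r/4 --filter='psql "$CONNECTION" -c "\copy hits (<columns>) FROM STDIN"'
+# split deals input lines round-robin to the filter processes, so four psql
+# \copy sessions ingest concurrently, each reading its share from stdin.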
+COLUMNS='WatchID, JavaEnable, Title, GoodEvent, EventTime, EventDate, CounterID, ClientIP, RegionID, UserID, CounterClass, OS, UserAgent, URL, Referer, IsRefresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, ClientEventTime, SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, FUniqID, OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, LocalEventTime, Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, RefererHash, URLHash, CLID' + +split hits.tsv -n r/$cpus --filter='psql '"$CONNECTION"' -v ON_ERROR_STOP=1 -t -c "\\copy hits ('"$COLUMNS"') FROM STDIN"' + +psql "$CONNECTION" -v ON_ERROR_STOP=1 -q -t -c 'CREATE EXTENSION IF NOT EXISTS pg_trgm;' +psql "$CONNECTION" -v ON_ERROR_STOP=1 -q -t < index.sql +psql "$CONNECTION" -v ON_ERROR_STOP=1 -q -t -c 'VACUUM ANALYZE hits' + +# Indexed variant explicitly does NOT force the duckdb engine — the +# point of this entry is to measure pg_duckdb's index-aware planner +# fallback to vanilla Postgres execution where indexes help. +rm -f hits.tsv +sync diff --git a/pg_duckdb-indexed/load.sh b/pg_duckdb-indexed/load.sh deleted file mode 100755 index 5d9e9c870b..0000000000 --- a/pg_duckdb-indexed/load.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -set -eu - -threads=$(nproc) -cpus=$(($threads / 2)) - -# Using COPY with explicit column mapping to ensure correct alignment. 
-split hits.tsv -n r/$cpus --filter='psql '$CONNECTION' -t -c "\\copy hits (WatchID, JavaEnable, Title, GoodEvent, EventTime, EventDate, CounterID, ClientIP, RegionID, UserID, CounterClass, OS, UserAgent, URL, Referer, IsRefresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, ClientEventTime, SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, FUniqID, OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, LocalEventTime, Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, RefererHash, URLHash, CLID) FROM STDIN"' - -psql $CONNECTION -q -t -c 'CREATE EXTENSION pg_trgm;' -psql $CONNECTION -q -t < index.sql -psql $CONNECTION -q -t -c 'VACUUM ANALYZE hits' diff --git a/pg_duckdb-indexed/query b/pg_duckdb-indexed/query new file mode 100755 index 0000000000..33b871529d --- /dev/null +++ b/pg_duckdb-indexed/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against pg_duckdb. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +export PGUSER=postgres +export PGPASSWORD=duckdb +query=$(cat) + +raw=$(psql -h 127.0.0.1 -U postgres -v ON_ERROR_STOP=1 -t \ + -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^(ERROR|FATAL):|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/pg_duckdb-indexed/run.sh b/pg_duckdb-indexed/run.sh deleted file mode 100755 index 8561c10265..0000000000 --- a/pg_duckdb-indexed/run.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -TRIES=3 - -export PGUSER=postgres -export PGPASSWORD=duckdb - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | psql $CONNECTION --no-psqlrc --tuples-only 2>&1 | grep -P 'Time|psql: error' | tail -n1 -done diff --git a/pg_duckdb-indexed/start b/pg_duckdb-indexed/start new file mode 100755 index 0000000000..c569063f61 --- /dev/null +++ b/pg_duckdb-indexed/start @@ -0,0 +1,43 @@ +#!/bin/bash +set -eu + +# Idempotent: postgres responding on :5432 means we're done. +if PGPASSWORD=duckdb psql -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +# pgduckdb's image is plain postgres + the duckdb extension. 
Pass tuning +# parameters via `postgres -c k=v` so the running cluster picks them up +# without a second restart. +memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo) +threads=$(nproc) +cpus=$(( threads / 2 )) +shared_buffers=$(( memory / 4 )) +effective_cache_size=$(( memory - memory / 4 )) +max_worker_processes=$(( threads + 15 )) +duckdb_memory=$(( memory / 4 )) + +if ! sudo docker ps -a --format '{{.Names}}' | grep -qx pgduck; then + sudo docker run -d --name pgduck -p 5432:5432 \ + -e POSTGRES_PASSWORD=duckdb \ + pgduckdb/pgduckdb:17-v1.0.0 \ + -c shared_buffers="${shared_buffers}kB" \ + -c max_worker_processes="$max_worker_processes" \ + -c max_parallel_workers="$threads" \ + -c max_parallel_maintenance_workers="$cpus" \ + -c max_parallel_workers_per_gather="$cpus" \ + -c duckdb.max_workers_per_postgres_scan="$cpus" \ + -c max_wal_size=32GB \ + -c work_mem=64MB \ + -c effective_cache_size="${effective_cache_size}kB" \ + -c duckdb.max_memory="${duckdb_memory}kB" >/dev/null +else + sudo docker start pgduck >/dev/null +fi + +# Wait for postgres protocol to come up (the lib's check loop polls +# ./check too, but bench_start expects a quick ready signal). +for _ in $(seq 1 60); do + PGPASSWORD=duckdb psql -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1 && exit 0 + sleep 1 +done diff --git a/pg_duckdb-indexed/stop b/pg_duckdb-indexed/stop new file mode 100755 index 0000000000..518e5460d1 --- /dev/null +++ b/pg_duckdb-indexed/stop @@ -0,0 +1,2 @@ +#!/bin/bash +sudo docker stop pgduck >/dev/null 2>&1 || true diff --git a/pg_duckdb-motherduck/benchmark.sh b/pg_duckdb-motherduck/benchmark.sh index b2ba6958c1..0b26b6fc3e 100755 --- a/pg_duckdb-motherduck/benchmark.sh +++ b/pg_duckdb-motherduck/benchmark.sh @@ -1,39 +1,11 @@ #!/bin/bash - -set -e - -# Note: To get equivalent performance you should be running from -# AWS US-EAST-1 region or as close to there as possible. Otherwise -# you'll see additional latency. - -# Sign up for MotherDuck. -# Go to the web ui and obtain a token -# https://motherduck.com/docs/key-tasks/authenticating-and-connecting-to-motherduck/authenticating-to-motherduck/ -# Save the token as the MOTHERDUCK_TOKEN environment variable: -# export MOTHERDUCK_TOKEN=... -# create a database called pgclick in the motherduck UI or duckdb cli -# `CREATE DATABASE pgclick` - -if [ -z "${MOTHERDUCK_TOKEN}" ]; then - echo "Error: MOTHERDUCK_TOKEN is not set." - exit 1 -fi - -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client -sudo docker run -d --name pgduck --network=host -e POSTGRES_PASSWORD=duckdb -e MOTHERDUCK_TOKEN=${MOTHERDUCK_TOKEN} pgduckdb/pgduckdb:17-v1.0.0 -c duckdb.motherduck_enabled=true - -# Give postgres time to start running -sleep 10 - -echo -n "Load time: " -command time -f '%e' ./load.sh - -./run.sh 2>&1 | tee log.txt - -# Go to https://app.motherduck.com and execute: -# `SELECT database_size FROM pragma_database_size() WHERE database_name = 'pgclick'` -# 25 GB - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Empty BENCH_DOWNLOAD_SCRIPT: the data lives in MotherDuck cloud (the +# load script CTAS'es directly from S3 inside MotherDuck), nothing to +# fetch locally. 
+export BENCH_DOWNLOAD_SCRIPT="" +# BENCH_RESTARTABLE=yes still gives us cold/warm tries (the local +# pg_duckdb container is what we restart; the MotherDuck side caches +# its own way). +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/pg_duckdb-motherduck/check b/pg_duckdb-motherduck/check new file mode 100755 index 0000000000..917d67d78e --- /dev/null +++ b/pg_duckdb-motherduck/check @@ -0,0 +1,3 @@ +#!/bin/bash +set -e +PGPASSWORD=duckdb psql -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null diff --git a/pg_duckdb-motherduck/data-size b/pg_duckdb-motherduck/data-size new file mode 100755 index 0000000000..422bbd8d36 --- /dev/null +++ b/pg_duckdb-motherduck/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +# The data lives in MotherDuck's cloud, not in the local container, so +# `du` on the postgres data dir would just report bookkeeping. Report +# the source parquet size (≈ 14.8 GB) so the lib's post-load size +# sanity check (data must be > 5 GB) doesn't false-positive. +echo 14779976446 diff --git a/pg_duckdb-motherduck/install b/pg_duckdb-motherduck/install new file mode 100755 index 0000000000..58754dc41d --- /dev/null +++ b/pg_duckdb-motherduck/install @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +# pg_duckdb-motherduck: pgduckdb container talks to MotherDuck cloud. +# Requires MOTHERDUCK_TOKEN to be exported on the operator side +# (forwarded into cloud-init.sh by run-benchmark.sh's runtime_env loop +# if it's set in the operator's shell). +: "${MOTHERDUCK_TOKEN:?MOTHERDUCK_TOKEN is required (sign up + create database 'pgclick' first)}" + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull pgduckdb/pgduckdb:17-v1.0.0 diff --git a/pg_duckdb-motherduck/load b/pg_duckdb-motherduck/load new file mode 100755 index 0000000000..3cd928ddde --- /dev/null +++ b/pg_duckdb-motherduck/load @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +# create.sql has REPLACE_SCHEMA / REPLACE_PARQUET_FILE placeholders; +# substitute the MotherDuck database name and the canonical ClickBench +# parquet URL so the CTAS reads directly from S3. +DATABASE='ddb$pgclick' +PARQUET_FILE='https://datasets.clickhouse.com/hits_compatible/hits.parquet' + +sed -e "s=REPLACE_SCHEMA=$DATABASE=g" -e "s=REPLACE_PARQUET_FILE=$PARQUET_FILE=g" create.sql \ + | PGPASSWORD=duckdb psql -h 127.0.0.1 -U postgres -v ON_ERROR_STOP=1 --no-psqlrc --tuples-only + +sync diff --git a/pg_duckdb-motherduck/load.sh b/pg_duckdb-motherduck/load.sh deleted file mode 100755 index eb130082a0..0000000000 --- a/pg_duckdb-motherduck/load.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -set -e - -CONNECTION=postgres://postgres:duckdb@localhost:5432/postgres - -DATABASE='ddb$pgclick' -PARQUET_FILE='https://datasets.clickhouse.com/hits_compatible/hits.parquet' - -echo "Loading data" -( - cat create.sql | sed -e "s=REPLACE_SCHEMA=$DATABASE=g" -e "s=REPLACE_PARQUET_FILE=$PARQUET_FILE=g" -) | psql --no-psqlrc --tuples-only $CONNECTION diff --git a/pg_duckdb-motherduck/query b/pg_duckdb-motherduck/query new file mode 100755 index 0000000000..e12df9a7a3 --- /dev/null +++ b/pg_duckdb-motherduck/query @@ -0,0 +1,31 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against pg_duckdb in +# MotherDuck mode. The hits table lives in the ddb$pgclick MotherDuck +# database, so we set search_path before each query. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. 
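+#
+# Hypothetical invocation by the harness (the real caller lives in
+# lib/benchmark-common.sh):
+#   echo "SELECT COUNT(*) FROM hits;" | ./query >result.txt 2>time.txt
+#   tail -n1 time.txt   # e.g. 0.312 — seconds for this single run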
+set -e + +export PGUSER=postgres +export PGPASSWORD=duckdb +DATABASE='ddb$pgclick' +query=$(cat) + +raw=$(psql -h 127.0.0.1 -U postgres -v ON_ERROR_STOP=1 -t \ + -c "set search_path=$DATABASE;" -c '\timing' -c "$query" 2>&1) \ + && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^(ERROR|FATAL):|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/pg_duckdb-motherduck/run.sh b/pg_duckdb-motherduck/run.sh deleted file mode 100755 index f4ebc2f493..0000000000 --- a/pg_duckdb-motherduck/run.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -TRIES=3 -CONNECTION=postgres://postgres:duckdb@localhost:5432/postgres - -DATABASE='ddb$pgclick' - -cat queries.sql | while read -r query; do - echo "$query" - ( - echo "set search_path=$DATABASE;" - echo '\timing' - yes "$query" | head -n $TRIES - ) | psql --no-psqlrc --tuples-only $CONNECTION 2>&1 | grep -P 'Time|psql: error' | tail -n1 -done diff --git a/pg_duckdb-motherduck/start b/pg_duckdb-motherduck/start new file mode 100755 index 0000000000..065fa5cf04 --- /dev/null +++ b/pg_duckdb-motherduck/start @@ -0,0 +1,23 @@ +#!/bin/bash +set -eu + +: "${MOTHERDUCK_TOKEN:?MOTHERDUCK_TOKEN is required}" + +if PGPASSWORD=duckdb psql -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +if ! sudo docker ps -a --format '{{.Names}}' | grep -qx pgduck; then + sudo docker run -d --name pgduck --network=host \ + -e POSTGRES_PASSWORD=duckdb \ + -e MOTHERDUCK_TOKEN="$MOTHERDUCK_TOKEN" \ + pgduckdb/pgduckdb:17-v1.0.0 \ + -c duckdb.motherduck_enabled=true >/dev/null +else + sudo docker start pgduck >/dev/null +fi + +for _ in $(seq 1 60); do + PGPASSWORD=duckdb psql -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1 && exit 0 + sleep 1 +done diff --git a/pg_duckdb-motherduck/stop b/pg_duckdb-motherduck/stop new file mode 100755 index 0000000000..518e5460d1 --- /dev/null +++ b/pg_duckdb-motherduck/stop @@ -0,0 +1,2 @@ +#!/bin/bash +sudo docker stop pgduck >/dev/null 2>&1 || true diff --git a/pg_duckdb-parquet/benchmark.sh b/pg_duckdb-parquet/benchmark.sh index 59995ceac9..3c0bc0449e 100755 --- a/pg_duckdb-parquet/benchmark.sh +++ b/pg_duckdb-parquet/benchmark.sh @@ -1,28 +1,8 @@ #!/bin/bash - -set -e - -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client - -../download-hits-parquet-single -sudo docker run -d --name pgduck -p 5432:5432 -e POSTGRES_PASSWORD=duckdb -v ./hits.parquet:/tmp/hits.parquet pgduckdb/pgduckdb:17-v1.0.0 -c duckdb.max_memory=10GB - -for _ in {1..300} -do - psql postgres://postgres:duckdb@localhost:5432/postgres -f create.sql 2>&1 | tee load_out.txt && break - sleep 1 -done -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo docker exec -i pgduck du -bcs /var/lib/postgresql/data /tmp/hits.parquet | grep total -echo "Load time: 0" - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+# Empty BENCH_DOWNLOAD_SCRIPT: install fetches hits.parquet itself, +# because the container needs the file bind-mounted at start time +# (before lib's bench_download step runs). +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/pg_duckdb-parquet/check b/pg_duckdb-parquet/check new file mode 100755 index 0000000000..917d67d78e --- /dev/null +++ b/pg_duckdb-parquet/check @@ -0,0 +1,3 @@ +#!/bin/bash +set -e +PGPASSWORD=duckdb psql -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null diff --git a/pg_duckdb-parquet/data-size b/pg_duckdb-parquet/data-size new file mode 100755 index 0000000000..4b5d34e7e1 --- /dev/null +++ b/pg_duckdb-parquet/data-size @@ -0,0 +1,3 @@ +#!/bin/bash +set -eu +sudo docker exec pgduck du -bcs /var/lib/postgresql/data /tmp/hits.parquet | grep total | awk '{print $1}' diff --git a/pg_duckdb-parquet/install b/pg_duckdb-parquet/install new file mode 100755 index 0000000000..12ae9dd393 --- /dev/null +++ b/pg_duckdb-parquet/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull pgduckdb/pgduckdb:17-v1.0.0 + +# pg_duckdb-parquet bind-mounts hits.parquet into the container at start +# time (the create.sql view reads from /tmp/hits.parquet directly), so +# the file has to exist before ./start runs. The lib's bench_download +# step happens after start, so download here in install instead. +if [ ! -f hits.parquet ]; then + ../lib/download-hits-parquet-single +fi diff --git a/pg_duckdb-parquet/load b/pg_duckdb-parquet/load new file mode 100755 index 0000000000..87ee517fc1 --- /dev/null +++ b/pg_duckdb-parquet/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -eu + +# No data ingest — the view in create.sql just defines a +# read_parquet('/tmp/hits.parquet', ...) shape over the bind-mounted +# parquet file. +PGPASSWORD=duckdb psql -h 127.0.0.1 -U postgres -v ON_ERROR_STOP=1 -f create.sql + +sync diff --git a/pg_duckdb-parquet/query b/pg_duckdb-parquet/query new file mode 100755 index 0000000000..33b871529d --- /dev/null +++ b/pg_duckdb-parquet/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against pg_duckdb. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +export PGUSER=postgres +export PGPASSWORD=duckdb +query=$(cat) + +raw=$(psql -h 127.0.0.1 -U postgres -v ON_ERROR_STOP=1 -t \ + -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? 
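+# The `&& exit_code=0 || exit_code=$?` on the capture above is deliberate:
+# with `set -e`, a bare failing command substitution would terminate the
+# script before psql's error output could be forwarded to stderr below.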
+ +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^(ERROR|FATAL):|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/pg_duckdb-parquet/results/20260509/c6a.4xlarge.json b/pg_duckdb-parquet/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..b64055f314 --- /dev/null +++ b/pg_duckdb-parquet/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "pg_duckdb (Parquet)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["column-oriented","PostgreSQL compatible","lukewarm-cold-run"], + "load_time": 0, + "data_size": 14820640783, + "result": [ + [0.423, 0.149, 0.149], + [0.47, 0.181, 0.182], + [0.533, 0.212, 0.212], + [0.733, 0.206, 0.205], + [0.816, 0.568, 0.562], + [1.217, 0.685, 0.695], + [0.932, 0.638, 0.638], + [0.483, 0.188, 0.185], + [1.134, 0.679, 0.676], + [1.424, 0.796, 0.79], + [0.823, 0.28, 0.281], + [0.834, 0.313, 0.309], + [1.241, 0.688, 0.691], + [2.736, 1.01, 1.001], + [1.285, 0.739, 0.746], + [0.977, 0.623, 0.626], + [2.711, 1.256, 1.259], + [2.501, 1.045, 1.043], + [4.805, 2.383, 2.402], + [0.544, 0.196, 0.181], + [9.824, 4.629, 4.699], + [11.425, 1.908, 1.898], + [20.338, 2.865, 2.905], + [12.558, 4.933, 4.984], + [2.933, 0.598, 0.605], + [1.13, 0.429, 0.426], + [2.879, 0.555, 0.547], + [9.856, 1.22, 1.221], + [11.174, 10.738, 10.729], + [0.53, 0.213, 0.212], + [2.554, 0.754, 0.739], + [6.125, 0.842, 0.832], + [5.603, 2.308, 2.293], + [10.326, 2.502, 2.544], + [10.33, 2.578, 2.573], + [1.073, 0.722, 0.742], + [0.638, 0.297, 0.295], + [0.58, 0.279, 0.274], + [0.566, 0.22, 0.22], + [0.838, 0.411, 0.411], + [0.524, 0.194, 0.193], + [0.5, 0.192, 0.189], + [0.537, 0.233, 0.232] +] +} + diff --git a/pg_duckdb-parquet/run.sh b/pg_duckdb-parquet/run.sh deleted file mode 100755 index 37b7cece28..0000000000 --- a/pg_duckdb-parquet/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | psql --no-psqlrc --tuples-only postgres://postgres:duckdb@localhost:5432/postgres 2>&1 | grep -P 'Time|psql: error' | tail -n1 -done diff --git a/pg_duckdb-parquet/start b/pg_duckdb-parquet/start new file mode 100755 index 0000000000..f4650608a3 --- /dev/null +++ b/pg_duckdb-parquet/start @@ -0,0 +1,21 @@ +#!/bin/bash +set -eu + +if PGPASSWORD=duckdb psql -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +if ! 
sudo docker ps -a --format '{{.Names}}' | grep -qx pgduck; then + sudo docker run -d --name pgduck -p 5432:5432 \ + -e POSTGRES_PASSWORD=duckdb \ + -v "$(pwd)/hits.parquet:/tmp/hits.parquet:ro" \ + pgduckdb/pgduckdb:17-v1.0.0 \ + -c duckdb.max_memory=10GB >/dev/null +else + sudo docker start pgduck >/dev/null +fi + +for _ in $(seq 1 60); do + PGPASSWORD=duckdb psql -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1 && exit 0 + sleep 1 +done diff --git a/pg_duckdb-parquet/stop b/pg_duckdb-parquet/stop new file mode 100755 index 0000000000..518e5460d1 --- /dev/null +++ b/pg_duckdb-parquet/stop @@ -0,0 +1,2 @@ +#!/bin/bash +sudo docker stop pgduck >/dev/null 2>&1 || true diff --git a/pg_duckdb/benchmark.sh b/pg_duckdb/benchmark.sh index 77a2b15f2d..531bd65038 100755 --- a/pg_duckdb/benchmark.sh +++ b/pg_duckdb/benchmark.sh @@ -1,82 +1,5 @@ #!/bin/bash - -set -eu - -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client - -../download-hits-tsv - -memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo) -threads=$(nproc) -cpus=$(($threads / 2)) -# Shared buffers is set to 25% of memory in AWS RDS by default. We do the same. -# https://docs.aws.amazon.com/prescriptive-guidance/latest/tuning-postgresql-parameters/shared-buffers.html -shared_buffers=$(($memory / 4)) -# Effective cache size does not need to be perfect, but it should be somewhat -# close to the total memory minus what is expected to be used for queries. -# https://www.cybertec-postgresql.com/en/effective_cache_size-what-it-means-in-postgresql/ -effective_cache_size=$(($memory - ($memory / 4))) -# By default, max_worker_processes is set to in postgres. We want to be able to -# use all the threads for parallel workers so we increase it. We also add a -# small buffer of 15 for any other background workers that might be created. -max_worker_processes=$(($threads + 15)) -# We also give DuckDB 25% of the memory to work with. -duckdb_memory=$(($memory / 4)) -# Below we make sure to configure the rest of the parallel worker settings to -# match the number of cpu cores: -# https://www.crunchydata.com/blog/postgres-tuning-and-performance-for-analytics-data -# -# We also increase work_mem because we are doing an analytics workload to allow -# some more memory for sorting, aggregations, etc. -# -# It's necessary to increase max_wal_size to make the dataload not take very -# long. With the default value it's constantly checkpointing, and the PG logs -# warn you about that and tell you to increase max_wal_size. 
- -sudo docker run -d --name pgduck -p 5432:5432 -e POSTGRES_PASSWORD=duckdb pgduckdb/pgduckdb:17-v1.0.0 - -sleep 2 - -sudo docker exec -i pgduck bash -c " -cat >> /var/lib/postgresql/data/postgresql.conf <<'EOF' -shared_buffers=${shared_buffers}kB -max_worker_processes=${max_worker_processes} -max_parallel_workers=${threads} -max_parallel_maintenance_workers=${cpus} -max_parallel_workers_per_gather=${cpus} -duckdb.max_workers_per_postgres_scan=${cpus} -max_wal_size=32GB -work_mem=64MB -effective_cache_size=${effective_cache_size}kB -duckdb.max_memory='${duckdb_memory}kB' -EOF -" - -docker restart pgduck - -export PGUSER=postgres -export PGPASSWORD=duckdb -export CONNECTION=postgres://postgres:duckdb@localhost:5432/postgres - -for _ in {1..300} -do - psql $CONNECTION -t < create.sql 2>&1 | tee load_out.txt && break - sleep 1 -done -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -echo -n "Load time: " -command time -f '%e' ./load.sh - -psql $CONNECTION -c "ALTER DATABASE postgres SET duckdb.force_execution = true;" -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -docker exec -i pgduck du -bcs /var/lib/postgresql/data | grep total - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/pg_duckdb/check b/pg_duckdb/check new file mode 100755 index 0000000000..917d67d78e --- /dev/null +++ b/pg_duckdb/check @@ -0,0 +1,3 @@ +#!/bin/bash +set -e +PGPASSWORD=duckdb psql -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null diff --git a/pg_duckdb/data-size b/pg_duckdb/data-size new file mode 100755 index 0000000000..c5ef7ac005 --- /dev/null +++ b/pg_duckdb/data-size @@ -0,0 +1,3 @@ +#!/bin/bash +set -eu +sudo docker exec pgduck du -bcs /var/lib/postgresql/data | grep total | awk '{print $1}' diff --git a/pg_duckdb/install b/pg_duckdb/install new file mode 100755 index 0000000000..c17febb8dd --- /dev/null +++ b/pg_duckdb/install @@ -0,0 +1,7 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull pgduckdb/pgduckdb:17-v1.0.0 diff --git a/pg_duckdb/load b/pg_duckdb/load new file mode 100755 index 0000000000..a431dc0d23 --- /dev/null +++ b/pg_duckdb/load @@ -0,0 +1,26 @@ +#!/bin/bash +set -eu + +export PGUSER=postgres +export PGPASSWORD=duckdb +CONNECTION=postgres://postgres:duckdb@127.0.0.1:5432/postgres + +psql "$CONNECTION" -v ON_ERROR_STOP=1 -t < create.sql + +# COPY FREEZE requires the table to be created or truncated in the same +# subtransaction; wrap the truncate + copy in a single transaction. +psql "$CONNECTION" -v ON_ERROR_STOP=1 -q <<'EOF' +BEGIN; +TRUNCATE TABLE hits; +\copy hits FROM 'hits.tsv' WITH FREEZE; +COMMIT; +EOF + +psql "$CONNECTION" -v ON_ERROR_STOP=1 -q -t -c 'VACUUM ANALYZE hits' + +# Make every query in this database execute via the duckdb engine. 
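+# (Expected effect, not verified here: with force_execution on, EXPLAIN on
+# the hits table should show pg_duckdb's custom scan node instead of a plain
+# Postgres plan.)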
+psql "$CONNECTION" -v ON_ERROR_STOP=1 -c \ + "ALTER DATABASE postgres SET duckdb.force_execution = true;" + +rm -f hits.tsv +sync diff --git a/pg_duckdb/load.sh b/pg_duckdb/load.sh deleted file mode 100755 index 5c3b4b6f2a..0000000000 --- a/pg_duckdb/load.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -set -eu - -# Load data: wrap TRUNCATE and \copy FREEZE in a single transaction -# If we dont' do this, Postgres will throw an error: -# "ERROR: cannot perform COPY FREEZE because the table was not created or truncated in the current subtransaction" -# (i.e. Postgres requires that the table be either created or truncated in the current subtransaction) -psql $CONNECTION -q <<'EOF' -BEGIN; -TRUNCATE TABLE hits; -\copy hits FROM 'hits.tsv' with freeze; -COMMIT; -EOF - -psql $CONNECTION -q -t -c 'VACUUM ANALYZE hits' diff --git a/pg_duckdb/query b/pg_duckdb/query new file mode 100755 index 0000000000..33b871529d --- /dev/null +++ b/pg_duckdb/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against pg_duckdb. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +export PGUSER=postgres +export PGPASSWORD=duckdb +query=$(cat) + +raw=$(psql -h 127.0.0.1 -U postgres -v ON_ERROR_STOP=1 -t \ + -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^(ERROR|FATAL):|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/pg_duckdb/run.sh b/pg_duckdb/run.sh deleted file mode 100755 index 8561c10265..0000000000 --- a/pg_duckdb/run.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -TRIES=3 - -export PGUSER=postgres -export PGPASSWORD=duckdb - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | psql $CONNECTION --no-psqlrc --tuples-only 2>&1 | grep -P 'Time|psql: error' | tail -n1 -done diff --git a/pg_duckdb/start b/pg_duckdb/start new file mode 100755 index 0000000000..c569063f61 --- /dev/null +++ b/pg_duckdb/start @@ -0,0 +1,43 @@ +#!/bin/bash +set -eu + +# Idempotent: postgres responding on :5432 means we're done. +if PGPASSWORD=duckdb psql -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +# pgduckdb's image is plain postgres + the duckdb extension. Pass tuning +# parameters via `postgres -c k=v` so the running cluster picks them up +# without a second restart. +memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo) +threads=$(nproc) +cpus=$(( threads / 2 )) +shared_buffers=$(( memory / 4 )) +effective_cache_size=$(( memory - memory / 4 )) +max_worker_processes=$(( threads + 15 )) +duckdb_memory=$(( memory / 4 )) + +if ! 
sudo docker ps -a --format '{{.Names}}' | grep -qx pgduck; then + sudo docker run -d --name pgduck -p 5432:5432 \ + -e POSTGRES_PASSWORD=duckdb \ + pgduckdb/pgduckdb:17-v1.0.0 \ + -c shared_buffers="${shared_buffers}kB" \ + -c max_worker_processes="$max_worker_processes" \ + -c max_parallel_workers="$threads" \ + -c max_parallel_maintenance_workers="$cpus" \ + -c max_parallel_workers_per_gather="$cpus" \ + -c duckdb.max_workers_per_postgres_scan="$cpus" \ + -c max_wal_size=32GB \ + -c work_mem=64MB \ + -c effective_cache_size="${effective_cache_size}kB" \ + -c duckdb.max_memory="${duckdb_memory}kB" >/dev/null +else + sudo docker start pgduck >/dev/null +fi + +# Wait for postgres protocol to come up (the lib's check loop polls +# ./check too, but bench_start expects a quick ready signal). +for _ in $(seq 1 60); do + PGPASSWORD=duckdb psql -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1 && exit 0 + sleep 1 +done diff --git a/pg_duckdb/stop b/pg_duckdb/stop new file mode 100755 index 0000000000..518e5460d1 --- /dev/null +++ b/pg_duckdb/stop @@ -0,0 +1,2 @@ +#!/bin/bash +sudo docker stop pgduck >/dev/null 2>&1 || true diff --git a/pg_ducklake/benchmark.sh b/pg_ducklake/benchmark.sh index cd3d5855ec..b851876173 100755 --- a/pg_ducklake/benchmark.sh +++ b/pg_ducklake/benchmark.sh @@ -1,22 +1,5 @@ #!/bin/bash - -set -e - -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client - -../download-hits-parquet-single -docker run -d --name pgduck -p 5432:5432 -e POSTGRES_PASSWORD=duckdb -v ./hits.parquet:/tmp/hits.parquet pgducklake/pgducklake:18-main - -sleep 5 # wait for pgducklake start up - -echo -n "Load time: " -command time -f '%e' psql postgres://postgres:duckdb@localhost:5432/postgres -f create.sql 2>&1 - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -docker exec -i pgduck du -bcs /var/lib/postgresql/ | grep total - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
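+# The lib is expected to drive the per-system scripts in this directory
+# (install, start, check, load, data-size, query, stop); nothing here runs
+# queries directly.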
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/pg_ducklake/check b/pg_ducklake/check new file mode 100755 index 0000000000..dfe3a7c57d --- /dev/null +++ b/pg_ducklake/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +psql postgres://postgres:duckdb@localhost:5432/postgres -t -c 'SELECT 1' >/dev/null diff --git a/pg_ducklake/data-size b/pg_ducklake/data-size new file mode 100755 index 0000000000..e15dcd55c6 --- /dev/null +++ b/pg_ducklake/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgduck} +sudo docker exec -i "$CONTAINER_NAME" du -bcs /var/lib/postgresql/ | grep total | awk '{print $1}' diff --git a/pg_ducklake/install b/pg_ducklake/install new file mode 100755 index 0000000000..2838782769 --- /dev/null +++ b/pg_ducklake/install @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgduck} +PGDUCK_IMAGE=${PGDUCK_IMAGE:-pgducklake/pgducklake:18-main} + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull "$PGDUCK_IMAGE" + +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +sudo docker run -d \ + --name "$CONTAINER_NAME" \ + -p 5432:5432 \ + -e POSTGRES_PASSWORD=duckdb \ + "$PGDUCK_IMAGE" diff --git a/pg_ducklake/load b/pg_ducklake/load new file mode 100755 index 0000000000..5636d2eabb --- /dev/null +++ b/pg_ducklake/load @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgduck} + +# Move parquet file into the container at /tmp/hits.parquet (path used by create.sql). +sudo docker cp hits.parquet "$CONTAINER_NAME":/tmp/hits.parquet + +psql postgres://postgres:duckdb@localhost:5432/postgres -v ON_ERROR_STOP=1 -t -c "DROP TABLE IF EXISTS hits;" || true +psql postgres://postgres:duckdb@localhost:5432/postgres -v ON_ERROR_STOP=1 -f create.sql + +rm -f hits.parquet +sync diff --git a/pg_ducklake/query b/pg_ducklake/query new file mode 100755 index 0000000000..54d362e652 --- /dev/null +++ b/pg_ducklake/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the postgres DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | psql --no-psqlrc --tuples-only postgres://postgres:duckdb@localhost:5432/postgres 2>&1) +status=$? 
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/pg_ducklake/results/20260509/c6a.4xlarge.json b/pg_ducklake/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..580f64ce09 --- /dev/null +++ b/pg_ducklake/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "pg_ducklake", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["column-oriented","PostgreSQL compatible"], + "load_time": 973, + "data_size": 14369015692, + "result": [ + [0.351, 0.023, 0.022], + [0.487, 0.087, 0.084], + [0.589, 0.102, 0.101], + [1.197, 0.103, 0.102], + [1.343, 0.356, 0.361], + [1.738, 0.508, 0.541], + [0.194, 0.088, 0.086], + [0.498, 0.092, 0.084], + [1.453, 0.506, 0.505], + [2.289, 0.62, 0.617], + [1.314, 0.185, 0.195], + [1.355, 0.201, 0.205], + [1.78, 0.588, 0.592], + [3.844, 0.966, 0.98], + [1.576, 0.645, 0.655], + [1.373, 0.423, 0.42], + [3.63, 1.152, 1.153], + [3.196, 0.906, 0.88], + [7.109, 2.553, 2.537], + [0.23, 0.082, 0.087], + [10.036, 3.809, 3.789], + [12.268, 1.977, 1.978], + [20.024, 2.632, 2.609], + [3.545, 0.937, 0.915], + [0.294, 0.173, 0.165], + [1.67, 0.281, 0.284], + [0.789, 0.127, 0.13], + [9.934, 1.207, 1.218], + [9.282, 8.162, 8.194], + [0.271, 0.104, 0.111], + [3.625, 0.569, 0.556], + [7.59, 0.678, 0.678], + [6.868, 3, 2.991], + [10.941, 3.335, 3.377], + [11.044, 3.75, 3.742], + [0.733, 0.558, 0.555], + [0.501, 0.104, 0.104], + [0.462, 0.063, 0.069], + [0.481, 0.088, 0.08], + [0.56, 0.139, 0.15], + [0.482, 0.061, 0.07], + [0.457, 0.067, 0.07], + [0.459, 0.071, 0.065] +] +} + diff --git a/pg_ducklake/run.sh b/pg_ducklake/run.sh deleted file mode 100755 index 7451228b99..0000000000 --- a/pg_ducklake/run.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - docker restart pgduck - sleep 5 # wait for restart - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | psql --no-psqlrc --tuples-only postgres://postgres:duckdb@localhost:5432/postgres 2>&1 -done diff --git a/pg_ducklake/start b/pg_ducklake/start new file mode 100755 index 0000000000..5ab27c2879 --- /dev/null +++ b/pg_ducklake/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgduck} + +if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" +fi diff --git a/pg_ducklake/stop b/pg_ducklake/stop new file mode 100755 index 0000000000..838edfdbef --- /dev/null +++ b/pg_ducklake/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +CONTAINER_NAME=${CONTAINER_NAME:-pgduck} +sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true diff --git a/pg_mooncake/benchmark.sh b/pg_mooncake/benchmark.sh index a49b373a68..b851876173 100755 --- a/pg_mooncake/benchmark.sh +++ b/pg_mooncake/benchmark.sh @@ -1,33 +1,5 @@ #!/bin/bash - - -#install docker if needed. 
- -sudo apt-get update -y -sudo apt-get install -y docker.io -sudo usermod -aG docker $USER -newgrp docker - -sudo apt-get install -y postgresql-client - -../download-hits-parquet-single -docker run -d --name pg_mooncake -p 5432:5432 -e POSTGRES_HOST_AUTH_METHOD=trust -v ./hits.parquet:/tmp/hits.parquet mooncakelabs/pg_mooncake:17-v0.1.0 - -sleep 5 -echo -n "Load time: " -command time -f '%e' psql postgres://postgres:pg_mooncake@localhost:5432/postgres -q -t -f create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -# COPY 99997497 -# Time: 576219.151 ms (09:36.219) - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -docker exec -i pg_mooncake du -bcs /var/lib/postgresql/data | grep total - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/pg_mooncake/check b/pg_mooncake/check new file mode 100755 index 0000000000..4d9368d263 --- /dev/null +++ b/pg_mooncake/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +psql postgres://postgres@localhost:5432/postgres -t -c 'SELECT 1' >/dev/null diff --git a/pg_mooncake/data-size b/pg_mooncake/data-size new file mode 100755 index 0000000000..c0e4b3d680 --- /dev/null +++ b/pg_mooncake/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pg_mooncake} +sudo docker exec -i "$CONTAINER_NAME" du -bcs /var/lib/postgresql/data | grep total | awk '{print $1}' diff --git a/pg_mooncake/install b/pg_mooncake/install new file mode 100755 index 0000000000..992c34eb00 --- /dev/null +++ b/pg_mooncake/install @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pg_mooncake} +PG_MOONCAKE_IMAGE=${PG_MOONCAKE_IMAGE:-mooncakelabs/pg_mooncake:17-v0.1.0} + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull "$PG_MOONCAKE_IMAGE" + +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +sudo docker run -d \ + --name "$CONTAINER_NAME" \ + -p 5432:5432 \ + -e POSTGRES_HOST_AUTH_METHOD=trust \ + "$PG_MOONCAKE_IMAGE" diff --git a/pg_mooncake/load b/pg_mooncake/load new file mode 100755 index 0000000000..5026725e24 --- /dev/null +++ b/pg_mooncake/load @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pg_mooncake} + +# Move parquet file into the container at /tmp/hits.parquet (path used by create.sql). +sudo docker cp hits.parquet "$CONTAINER_NAME":/tmp/hits.parquet + +psql postgres://postgres@localhost:5432/postgres -v ON_ERROR_STOP=1 -t -c "DROP TABLE IF EXISTS hits;" || true +psql postgres://postgres@localhost:5432/postgres -v ON_ERROR_STOP=1 -q -t -f create.sql + +rm -f hits.parquet +sync diff --git a/pg_mooncake/query b/pg_mooncake/query new file mode 100755 index 0000000000..1b5078ac0e --- /dev/null +++ b/pg_mooncake/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the postgres DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. 
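+# Timing is taken from psql's \timing output ("Time: NNN.NNN ms"); the last
+# such line is converted to fractional seconds below.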
+set -e + +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | psql postgres://postgres@localhost:5432/postgres 2>&1) +status=$? + +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/pg_mooncake/results/20260509/c6a.4xlarge.json b/pg_mooncake/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..85055457da --- /dev/null +++ b/pg_mooncake/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "pg_mooncake", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C","column-oriented","PostgreSQL compatible","lukewarm-cold-run"], + "load_time": 583, + "data_size": 14623017634, + "result": [ + [0.804, 0.329, 0.335], + [0.836, 0.347, 0.351], + [0.935, 0.372, 0.367], + [1.324, 0.363, 0.361], + [1.494, 0.646, 0.639], + [1.537, 0.823, 0.814], + [0.441, 0.349, 0.347], + [0.837, 0.352, 0.352], + [1.889, 0.741, 0.746], + [2.225, 0.964, 0.963], + [1.44, 0.42, 0.422], + [1.774, 0.446, 0.451], + [1.944, 0.728, 0.731], + [3.466, 1.052, 1.059], + [1.726, 0.784, 0.802], + [1.131, 0.697, 0.696], + [3.558, 1.437, 1.431], + [3.482, 1.348, 1.347], + [7.023, 2.437, 2.443], + [0.47, 0.345, 0.343], + [8.8, 1.398, 1.402], + [10.558, 1.323, 1.328], + [18.465, 2.068, 2.074], + [54.901, 9.044, 9.059], + [5.134, 0.582, 0.579], + [1.715, 0.479, 0.48], + [5.228, 0.595, 0.595], + [9.028, 1.475, 1.466], + [10.291, 9.985, 10.029], + [4.162, 4.062, 4.057], + [4.023, 0.831, 0.83], + [7.546, 0.927, 0.93], + [7.999, 3.927, 3.797], + [9.617, 3.642, 3.57], + [9.625, 3.7, 3.686], + [1.005, 0.867, 0.866], + [0.372, 0.246, 0.243], + [0.738, 0.221, 0.219], + [0.751, 0.23, 0.229], + [0.828, 0.296, 0.296], + [0.743, 0.211, 0.211], + [0.727, 0.209, 0.212], + [0.734, 0.214, 0.207] +] +} + diff --git a/pg_mooncake/run.sh b/pg_mooncake/run.sh deleted file mode 100755 index 6d6c25192e..0000000000 --- a/pg_mooncake/run.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -TRIES=3 -CONNECTION=postgres://postgres:pg_mooncake@localhost:5432/postgres - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches 1>/dev/null - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | psql $CONNECTION 2>&1 | grep -P 'Time|psql: error' | tail -n1 -done \ No newline at end of file diff --git a/pg_mooncake/start b/pg_mooncake/start new file mode 100755 index 0000000000..6109c96589 --- /dev/null +++ b/pg_mooncake/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pg_mooncake} + +if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" +fi diff --git a/pg_mooncake/stop b/pg_mooncake/stop new file mode 100755 index 0000000000..ac0c768620 --- /dev/null +++ b/pg_mooncake/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +CONTAINER_NAME=${CONTAINER_NAME:-pg_mooncake} +sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true diff --git a/pgpro_tam/benchmark.sh b/pgpro_tam/benchmark.sh index 07fa462f74..531bd65038 100755 --- a/pgpro_tam/benchmark.sh +++ b/pgpro_tam/benchmark.sh @@ -1,71 +1,5 @@ #!/bin/bash - -#Usage: -#./benchmark.sh 
parquet_fd -#./benchmark.sh parquet_mem_fd -#./benchmark.sh parquet_fd_parall -#./benchmark.sh feather_mem_fd - -#install docker -sudo apt-get update -y -sudo apt-get install -y ca-certificates curl -sudo install -m 0755 -d /etc/apt/keyrings -sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc -sudo chmod a+r /etc/apt/keyrings/docker.asc -echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ - $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" | \ - sudo tee /etc/apt/sources.list.d/docker.list > /dev/null -sudo apt-get update -y -sudo apt-get install -y docker.io - -#install postgres client; postgres server is inside docker container -sudo apt-get install -y postgresql-client - -#calculate target shm size (in mb) as a half of available memory and run postgres container -MEM_SIZE=$(grep MemTotal /proc/meminfo | awk '{print $2}') -SHM_SIZE=$(echo "$MEM_SIZE/2/1024" | bc) -sudo docker run --shm-size="$SHM_SIZE"m -p5432:5432 --name pgpro_tam -e POSTGRES_HOST_AUTH_METHOD=trust -d innerlife/pgpro_tam:0.0.1 - -#wait for postgres startup and create extension -sleep 10 -psql -h 127.0.0.1 -U postgres -t -c "create extension ppg_tam" - -#create table -if [ "$1" != "parquet_fd" ] && [ "$1" != "parquet_mem_fd" ] && [ "$1" != "parquet_fd_parall" ] && \ - [ "$1" != "feather_mem_fd" ] && [ "$1" != "" ]; then - echo "Error: command line argument must be one of {'parquet_fd', 'parquet_mem_fd', 'parquet_fd_parall', 'feather_mem_fd'}" - exit 1 -fi -if [ ! -z "$1" ]; then - CREATE_FILE="$1" -else - CREATE_FILE="parquet_fd" -fi -psql -h 127.0.0.1 -U postgres -t < create/"$CREATE_FILE".sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -#get and unpack hits.tsv -sudo docker exec pgpro_tam bash -c "cd /tmp && wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' && gzip -d -f hits.tsv.gz" - -#insert data to table -if [ "$1" == "parquet_fd_parall" ] ; then - #insert data in parallel; not ordered insert is much faster, but breaks query performance - sudo docker exec pgpro_tam bash -c "time cat /tmp/hits.tsv | parallel -l 2000000 -j 50 -N1 -k --spreadstdin 'psql -U postgres -t -c \"copy hits FROM STDIN\"'" -else - echo -n "Load time: " - command time -f '%e' psql -h 127.0.0.1 -U postgres -t -c "COPY hits FROM '/tmp/hits.tsv'" -fi - -#run benchmark -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo docker exec pgpro_tam du -bcs /var/lib/postgresql/data/base | grep total - -#parse logfile for query execution time -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
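+# BENCH_DOWNLOAD_SCRIPT names the dataset fetcher in the parent directory
+# (../download-hits-tsv here). BENCH_RESTARTABLE=yes is taken to mean the
+# loaded data survives a server restart; see lib/benchmark-common.sh for the
+# exact contract.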
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/pgpro_tam/check b/pgpro_tam/check new file mode 100755 index 0000000000..2a82d4699b --- /dev/null +++ b/pgpro_tam/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +psql -h 127.0.0.1 -U postgres -t -c 'SELECT 1' >/dev/null diff --git a/pgpro_tam/data-size b/pgpro_tam/data-size new file mode 100755 index 0000000000..138af564f9 --- /dev/null +++ b/pgpro_tam/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgpro_tam} +sudo docker exec -i "$CONTAINER_NAME" du -bcs /var/lib/postgresql/data/base | grep total | awk '{print $1}' diff --git a/pgpro_tam/install b/pgpro_tam/install new file mode 100755 index 0000000000..1f647b6876 --- /dev/null +++ b/pgpro_tam/install @@ -0,0 +1,34 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgpro_tam} +PGPRO_TAM_IMAGE=${PGPRO_TAM_IMAGE:-innerlife/pgpro_tam:0.0.1} + +# Install Docker (official repo) + postgres client. +sudo apt-get update -y +sudo apt-get install -y ca-certificates curl +sudo install -m 0755 -d /etc/apt/keyrings +sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +sudo chmod a+r /etc/apt/keyrings/docker.asc +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull "$PGPRO_TAM_IMAGE" + +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +MEM_SIZE=$(grep MemTotal /proc/meminfo | awk '{print $2}') +SHM_SIZE=$(echo "$MEM_SIZE/2/1024" | bc) + +sudo docker run -d \ + --name "$CONTAINER_NAME" \ + --shm-size="${SHM_SIZE}m" \ + -p 5432:5432 \ + -e POSTGRES_HOST_AUTH_METHOD=trust \ + "$PGPRO_TAM_IMAGE" diff --git a/pgpro_tam/load b/pgpro_tam/load new file mode 100755 index 0000000000..89d255ff77 --- /dev/null +++ b/pgpro_tam/load @@ -0,0 +1,28 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgpro_tam} +# Variant of create.sql to use; see create/*.sql. +CREATE_FILE=${PGPRO_TAM_VARIANT:-parquet_fd} + +# Move hits.tsv into the container at /tmp/hits.tsv (the path used by the +# original benchmark.sh's COPY command). +sudo docker cp hits.tsv "$CONTAINER_NAME":/tmp/hits.tsv + +# Ensure the table-access-method extension is loaded. +psql -h 127.0.0.1 -U postgres -t -c "CREATE EXTENSION IF NOT EXISTS ppg_tam" + +# Drop+create table per the chosen variant. +psql -h 127.0.0.1 -U postgres -v ON_ERROR_STOP=1 -t < "create/${CREATE_FILE}.sql" + +if [ "$CREATE_FILE" = "parquet_fd_parall" ]; then + sudo docker exec "$CONTAINER_NAME" bash -c \ + "cat /tmp/hits.tsv | parallel -l 2000000 -j 50 -N1 -k --spreadstdin 'psql -U postgres -t -c \"copy hits FROM STDIN\"'" +else + psql -h 127.0.0.1 -U postgres -v ON_ERROR_STOP=1 -t -c "COPY hits FROM '/tmp/hits.tsv'" +fi + +# Cleanup source data both inside the container and outside. +sudo docker exec "$CONTAINER_NAME" rm -f /tmp/hits.tsv || true +rm -f hits.tsv +sync diff --git a/pgpro_tam/query b/pgpro_tam/query new file mode 100755 index 0000000000..ceb833618d --- /dev/null +++ b/pgpro_tam/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the postgres DB. 
+# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | psql -h 127.0.0.1 -U postgres -t 2>&1) +status=$? + +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/pgpro_tam/results/20260509/c6a.4xlarge.json b/pgpro_tam/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..7e740e774c --- /dev/null +++ b/pgpro_tam/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "pgpro_tam", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": ["C","column-oriented","PostgreSQL compatible","lukewarm-cold-run"], + "load_time": 1495, + "data_size": 9690524005, + "result": [ + [0.341, 0.07, 0.066], + [0.365, 0.079, 0.08], + [0.415, 0.12, 0.119], + [0.424, 0.11, 0.111], + [0.832, 0.506, 0.509], + [1.128, 0.794, 0.796], + [0.358, 0.069, 0.07], + [0.415, 0.085, 0.084], + [0.973, 0.624, 0.627], + [1.173, 0.797, 0.799], + [0.581, 0.233, 0.236], + [0.632, 0.275, 0.274], + [1.138, 0.796, 0.801], + [1.671, 1.183, 1.231], + [1.211, 0.86, 0.864], + [0.908, 0.574, 0.577], + [1.87, 1.503, 1.493], + [1.637, 1.291, 1.278], + [3.642, 2.609, 2.593], + [0.44, 0.118, 0.115], + [4.879, 2.053, 1.994], + [5.719, 1.243, 1.248], + [10.83, 2.167, 2.167], + [31.008, 9.931, 8.632], + [0.524, 0.27, 0.268], + [0.761, 0.438, 0.441], + [0.614, 0.284, 0.279], + [4.915, 1.979, 2.043], + [10.992, 10.535, 10.585], + [0.41, 0.101, 0.102], + [1.367, 0.907, 0.904], + [4.67, 1.028, 1.037], + [5.373, 2.75, 2.788], + [5.845, 3.723, 3.672], + [5.851, 3.729, 3.763], + [1.157, 0.839, 0.832], + [0.616, 0.242, 0.243], + [0.646, 0.31, 0.316], + [0.458, 0.117, 0.115], + [0.792, 0.364, 0.366], + [0.41, 0.066, 0.067], + [0.397, 0.064, 0.064], + [0.394, 0.069, 0.071] +] +} + diff --git a/pgpro_tam/run.sh b/pgpro_tam/run.sh deleted file mode 100755 index 6104a64d13..0000000000 --- a/pgpro_tam/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | psql -h 127.0.0.1 -U postgres -t 2>&1 | grep -P 'Time|psql: error' | tail -n1 -done; diff --git a/pgpro_tam/start b/pgpro_tam/start new file mode 100755 index 0000000000..b71b022dc7 --- /dev/null +++ b/pgpro_tam/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgpro_tam} + +if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" +fi diff --git a/pgpro_tam/stop b/pgpro_tam/stop new file mode 100755 index 0000000000..fa6533efb9 --- /dev/null +++ b/pgpro_tam/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +CONTAINER_NAME=${CONTAINER_NAME:-pgpro_tam} +sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true diff --git a/pinot/benchmark.sh b/pinot/benchmark.sh index 525dd572d4..78b538124c 100755 --- a/pinot/benchmark.sh +++ b/pinot/benchmark.sh @@ -1,47 +1,9 @@ #!/bin/bash - -sudo apt-get update -y -sudo apt-get install -y 
openjdk-11-jdk jq -sudo update-alternatives --config java - -# Install - -PINOT_VERSION=1.3.0 - -wget --continue --progress=dot:giga https://downloads.apache.org/pinot/apache-pinot-$PINOT_VERSION/apache-pinot-$PINOT_VERSION-bin.tar.gz -tar -zxvf apache-pinot-$PINOT_VERSION-bin.tar.gz - -./apache-pinot-$PINOT_VERSION-bin/bin/pinot-admin.sh QuickStart -type batch & -sleep 30 -./apache-pinot-$PINOT_VERSION-bin/bin/pinot-admin.sh AddTable -tableConfigFile offline_table.json -schemaFile schema.json -exec - -# Load the data - -../download-hits-tsv - -# Pinot was unable to load data as a single file wihout any errors returned. We have to split the data -echo -n "Load time: " -command time -f '%e' split -d --additional-suffix .tsv -n l/100 hits.tsv parts - -# Pinot can't load value '"tatuirovarki_redmond' so we need to fix this row to make it work -echo -n "Load time: " -command time -f '%e' sed parts93.tsv -e 's/"tatuirovarki_redmond/tatuirovarki_redmond/g' -i - -# Fix path to local directory -sed splitted.yaml 's/PWD_DIR_PLACEHOLDER/'$PWD'/g' -i -sed local.yaml 's/PWD_DIR_PLACEHOLDER/'$PWD'/g' -i - -# Load data -echo -n "Load time: " -command time -f '%e' ./apache-pinot-$PINOT_VERSION-bin/bin/pinot-admin.sh LaunchDataIngestionJob -jobSpecFile splitted.yaml - -# After upload it shows 94465149 rows instead of 99997497 in the dataset - -# Run the queries -./run.sh - -# stop Pinot services -kill %1 - -echo -n "Data size: " -du -bcs ./batch | grep total +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +# Pinot's quickstart starts a controller, broker, server and a Zookeeper +# inside one JVM and takes longer than the lib's 300 s default to be +# query-ready on a cold instance. 900 s clears the observed cold start. +export BENCH_CHECK_TIMEOUT=900 +exec ../lib/benchmark-common.sh diff --git a/pinot/check b/pinot/check new file mode 100755 index 0000000000..3bfe104c3f --- /dev/null +++ b/pinot/check @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +# Pinot is responsive once both the controller and the broker accept queries. +RES=$(curl -sf -o /dev/null -w '%{http_code}' \ + -XPOST -H'Content-Type: application/json' \ + http://localhost:8000/query/sql/ \ + -d '{"sql":"SELECT 1"}') + +[ "$RES" = "200" ] diff --git a/pinot/data-size b/pinot/data-size new file mode 100755 index 0000000000..80fba7748b --- /dev/null +++ b/pinot/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs ./batch | awk '/total$/ {print $1}' diff --git a/pinot/install b/pinot/install new file mode 100755 index 0000000000..a7a9feb5eb --- /dev/null +++ b/pinot/install @@ -0,0 +1,18 @@ +#!/bin/bash +set -e + +# 1.3.0 was retired from the Apache mirror; bump to a currently-published +# Pinot release. +PINOT_VERSION=1.5.0 +PINOT_DIR="apache-pinot-$PINOT_VERSION-bin" + +if [ ! -d "$PINOT_DIR" ]; then + sudo apt-get update -y + sudo apt-get install -y openjdk-11-jdk jq + + if [ ! 
-f "$PINOT_DIR.tar.gz" ]; then + wget --continue --progress=dot:giga \ + "https://downloads.apache.org/pinot/apache-pinot-$PINOT_VERSION/$PINOT_DIR.tar.gz" + fi + tar -zxf "$PINOT_DIR.tar.gz" +fi diff --git a/pinot/load b/pinot/load new file mode 100755 index 0000000000..585964d98b --- /dev/null +++ b/pinot/load @@ -0,0 +1,24 @@ +#!/bin/bash +set -e + +PINOT_VERSION=1.3.0 +PINOT_DIR="apache-pinot-$PINOT_VERSION-bin" + +"./${PINOT_DIR}/bin/pinot-admin.sh" AddTable \ + -tableConfigFile offline_table.json \ + -schemaFile schema.json -exec || true + +# Pinot was unable to load data as a single file without errors. Split. +split -d --additional-suffix .tsv -n l/100 hits.tsv parts + +# Pinot can't load value '"tatuirovarki_redmond' so we need to fix this row. +sed parts93.tsv -e 's/"tatuirovarki_redmond/tatuirovarki_redmond/g' -i + +# Fix path in YAML to local directory (idempotent — only replace placeholder). +sed splitted.yaml -e "s|PWD_DIR_PLACEHOLDER|$PWD|g" -i +sed local.yaml -e "s|PWD_DIR_PLACEHOLDER|$PWD|g" -i + +"./${PINOT_DIR}/bin/pinot-admin.sh" LaunchDataIngestionJob -jobSpecFile splitted.yaml + +rm -f hits.tsv parts*.tsv +sync diff --git a/pinot/query b/pinot/query new file mode 100755 index 0000000000..337792437a --- /dev/null +++ b/pinot/query @@ -0,0 +1,37 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via Pinot broker HTTP API. +# Stdout: query result JSON. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) +# Pinot doesn't accept trailing semicolons. +query=$(printf '%s' "$query" | tr -d ';') + +req=$(printf '%s' "$query" | python3 -c ' +import json, sys +q = sys.stdin.read() +print(json.dumps({"sql": q + " option(timeoutMs=300000)"})) +') + +resp=$(curl -sS -XPOST -H'Content-Type: application/json' \ + http://localhost:8000/query/sql/ \ + --data-binary "$req") + +echo "$resp" + +# Detect failure: Pinot returns a JSON object always; non-empty exceptions +# array means failure. +if echo "$resp" | jq -e '.exceptions | length > 0' >/dev/null 2>&1; then + echo "pinot query failed" >&2 + exit 1 +fi + +# timeUsedMs in JSON; convert to seconds. +secs=$(echo "$resp" | jq -r '.timeUsedMs / 1000') +if [ -z "$secs" ] || [ "$secs" = "null" ]; then + echo "no timing in pinot response" >&2 + exit 1 +fi +printf '%s\n' "$secs" >&2 diff --git a/pinot/run.sh b/pinot/run.sh deleted file mode 100755 index 5f5ea49765..0000000000 --- a/pinot/run.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -TRIES=3 -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - echo -n "[" - for i in $(seq 1 $TRIES); do - echo "{\"sql\":\"$query option(timeoutMs=300000)\"}"| tr -d ';' > query.json - RES=$(curl -s -XPOST -H'Content-Type: application/json' http://localhost:8000/query/sql/ -d @query.json | jq 'if .exceptions == [] then .timeUsedMs/1000 else "-" end' ) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - done - echo "]," -done diff --git a/pinot/start b/pinot/start new file mode 100755 index 0000000000..c51ea1e885 --- /dev/null +++ b/pinot/start @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +PINOT_VERSION=1.3.0 +PINOT_DIR="apache-pinot-$PINOT_VERSION-bin" + +# Idempotent: if broker query endpoint is up, do nothing. 
+if curl -sf -o /dev/null -w '%{http_code}' \ + -XPOST -H'Content-Type: application/json' \ + http://localhost:8000/query/sql/ \ + -d '{"sql":"SELECT 1"}' 2>/dev/null | grep -q '^200'; then + exit 0 +fi + +nohup "./${PINOT_DIR}/bin/pinot-admin.sh" QuickStart -type batch \ + >> pinot.log 2>&1 < /dev/null & +disown diff --git a/pinot/stop b/pinot/stop new file mode 100755 index 0000000000..140a589693 --- /dev/null +++ b/pinot/stop @@ -0,0 +1,7 @@ +#!/bin/bash + +pkill -f 'pinot-admin' 2>/dev/null || true +pkill -f 'pinot.tools.admin' 2>/dev/null || true +pkill -f 'org.apache.pinot' 2>/dev/null || true +sleep 2 +exit 0 diff --git a/polars-dataframe/benchmark.sh b/polars-dataframe/benchmark.sh index b7cf32a63a..fc4bacc8f3 100755 --- a/polars-dataframe/benchmark.sh +++ b/polars-dataframe/benchmark.sh @@ -1,19 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install polars - -# Download the data -../download-hits-parquet-single - -# Run the queries - -/usr/bin/time -f "Memory usage: %M KB" ./query.py 2>&1 | tee log.txt - -echo -n "Data size: " -grep -F "Memory usage" log.txt | grep -o -P '\d+ KB' | sed 's/KB/*1024/' | bc -l +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/polars-dataframe/check b/polars-dataframe/check new file mode 100755 index 0000000000..0c4b301a2d --- /dev/null +++ b/polars-dataframe/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/polars-dataframe/data-size b/polars-dataframe/data-size new file mode 100755 index 0000000000..365ad4ecc8 --- /dev/null +++ b/polars-dataframe/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/polars-dataframe/install b/polars-dataframe/install new file mode 100755 index 0000000000..e8eaaea9fe --- /dev/null +++ b/polars-dataframe/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet polars pyarrow fastapi uvicorn diff --git a/polars-dataframe/load b/polars-dataframe/load new file mode 100755 index 0000000000..ceba6becac --- /dev/null +++ b/polars-dataframe/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Server reads hits.parquet from CWD into memory. 
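+# server.py's /load collects the parquet into a DataFrame, fixes the
+# EventTime/EventDate types and keeps it resident, so the source file can be
+# deleted afterwards; ./data-size then reports the in-memory estimated_size.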
+elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +rm -f hits.parquet +sync diff --git a/polars-dataframe/queries.sql b/polars-dataframe/queries.sql new file mode 100644 index 0000000000..717ebd9262 --- /dev/null +++ b/polars-dataframe/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '(?-u)^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), 
SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' 
GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/polars-dataframe/query b/polars-dataframe/query new file mode 100755 index 0000000000..8f1c38e8c4 --- /dev/null +++ b/polars-dataframe/query @@ -0,0 +1,24 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running polars server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Capture HTTP status and body separately to detect errors cleanly. 
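+# (curl -f would suppress the response body on HTTP errors; keeping the body
+# in a temp file lets us surface the server's error message.)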
+tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/polars-dataframe/results/20241129/c6a.metal.json b/polars-dataframe/results/20241129/c6a.metal.json index 8a3e3444e9..d5da1b9862 100644 --- a/polars-dataframe/results/20241129/c6a.metal.json +++ b/polars-dataframe/results/20241129/c6a.metal.json @@ -6,7 +6,8 @@ "tuned": "no", "comment": "", "tags": [ - "column-oriented" + "column-oriented", + "in-memory" ], "load_time": 2, "data_size": 14779976446, diff --git a/polars-dataframe/results/20250711/c6a.2xlarge.json b/polars-dataframe/results/20250711/c6a.2xlarge.json index 88c4acaaa6..15f191dc87 100644 --- a/polars-dataframe/results/20250711/c6a.2xlarge.json +++ b/polars-dataframe/results/20250711/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["column-oriented", "dataframe", "lukewarm-cold-run", "historical"], + "tags": ["column-oriented", "dataframe", "lukewarm-cold-run", "historical", "in-memory"], "load_time": 381, "data_size": 15558373376, "result": [ diff --git a/polars-dataframe/results/20250712/c6a.xlarge.json b/polars-dataframe/results/20250712/c6a.xlarge.json index 18344090af..16f0e7ae11 100644 --- a/polars-dataframe/results/20250712/c6a.xlarge.json +++ b/polars-dataframe/results/20250712/c6a.xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["column-oriented", "dataframe", "lukewarm-cold-run", "historical"], + "tags": ["column-oriented", "dataframe", "lukewarm-cold-run", "historical", "in-memory"], "load_time": 410, "data_size": 7457083392, "result": [ diff --git a/polars-dataframe/results/20251214/c6a.2xlarge.json b/polars-dataframe/results/20251214/c6a.2xlarge.json index 4b1dca43b0..feddde92e3 100644 --- a/polars-dataframe/results/20251214/c6a.2xlarge.json +++ b/polars-dataframe/results/20251214/c6a.2xlarge.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["column-oriented", "dataframe", "lukewarm-cold-run", "historical"], + "tags": ["column-oriented", "dataframe", "lukewarm-cold-run", "historical", "in-memory"], "load_time": 325, "data_size": 15650611200, "result": [ diff --git a/polars-dataframe/results/20251215/c8g.metal-48xl.json b/polars-dataframe/results/20251215/c8g.metal-48xl.json index e5d6b3902b..9a477c2ea0 100644 --- a/polars-dataframe/results/20251215/c8g.metal-48xl.json +++ b/polars-dataframe/results/20251215/c8g.metal-48xl.json @@ -6,7 +6,7 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "tags": ["column-oriented", "dataframe", "lukewarm-cold-run", "historical"], + "tags": ["column-oriented", "dataframe", "lukewarm-cold-run", "historical", "in-memory"], "load_time": 3, "data_size": 182137651200, "result": [ diff --git a/polars-dataframe/results/20260509/c6a.metal.json b/polars-dataframe/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..555dd43041 --- /dev/null +++ b/polars-dataframe/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Polars (DataFrame)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["column-oriented","dataframe","in-memory","lukewarm-cold-run"], + 
"load_time": 6, + "data_size": 58970980436, + "result": [ + [0.021, 0.003, 0.003], + [0.014, 0.011, 0.013], + [0.017, 0.013, 0.012], + [0.015, 0.015, 0.016], + [0.114, 0.103, 0.099], + [0.161, 0.162, 0.171], + [0.014, 0.011, 0.016], + [0.032, 0.031, 0.028], + [0.336, 0.315, 0.294], + [0.373, 0.35, 0.364], + [0.096, 0.095, 0.093], + [0.101, 0.099, 0.1], + [0.164, 0.163, 0.15], + [0.34, 0.346, 0.34], + [0.198, 0.188, 0.194], + [0.14, 0.142, 0.137], + [0.495, 0.5, 0.503], + [0.456, 0.455, 0.451], + [0.841, 0.777, 0.856], + [0.016, 0.015, 0.017], + [0.165, 0.073, 0.077], + [0.121, 0.12, 0.114], + [0.254, 0.218, 0.221], + [0.202, 0.131, 0.178], + [0.042, 0.042, 0.043], + [0.048, 0.046, 0.054], + [0.071, 0.07, 0.069], + [0.158, 0.145, 0.131], + [0.733, 0.747, 0.71], + [0.051, 0.037, 0.035], + [0.169, 0.172, 0.167], + [0.178, 0.16, 0.161], + [0.937, 0.838, 0.918], + [0.754, 0.591, 0.584], + [0.584, 0.572, 0.567], + [0.132, 0.134, 0.135], + [0.073, 0.076, 0.07], + [0.069, 0.072, 0.067], + [0.04, 0.039, 0.04], + [0.08, 0.077, 0.077], + [0.07, 0.061, 0.061], + [0.039, 0.04, 0.04], + [0.035, 0.039, 0.038] +] +} + diff --git a/polars-dataframe/query.py b/polars-dataframe/server.py old mode 100755 new mode 100644 similarity index 86% rename from polars-dataframe/query.py rename to polars-dataframe/server.py index cdda28df22..773324dd0e --- a/polars-dataframe/query.py +++ b/polars-dataframe/server.py @@ -1,51 +1,66 @@ #!/usr/bin/env python3 +"""FastAPI wrapper around polars so it conforms to the ClickBench +install/start/check/stop/load/query interface. -import polars as pl +Routes: + GET /health -> 200 OK once the server is up + POST /load -> reads hits.parquet from the working directory, fixes + column types, holds the LazyFrame in memory, and + returns {"elapsed": } + POST /query -> body: SQL text. Looks it up in QUERIES, runs the + matching lambda against the loaded LazyFrame, and + returns {"elapsed": }. + GET /data-size -> bytes the DataFrame currently occupies (estimated_size) + +The (sql, lambda) list is the same as the previous standalone query.py. +""" + +import os import timeit from datetime import date -import json -import subprocess -import os -# The streaming engine will be the default soon -# https://pola.rs/posts/polars-in-aggregate-dec25/ +import polars as pl +import uvicorn +from fastapi import FastAPI, HTTPException, Request + +# Streaming engine will be the default soon. pl.Config.set_engine_affinity("streaming") -# 0: No., 1: SQL, 2: Polars -queries = [ - ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.select(pl.len()).collect().item()), +app = FastAPI() +hits_df: pl.DataFrame | None = None +hits: pl.LazyFrame | None = None + + +# 43 ClickBench queries. Each is (sql, callable). sql strings must match the +# corresponding line in queries.sql. The lambdas come straight from the prior +# polars-dataframe/query.py and have not been modified. 
+QUERIES: list[tuple[str, callable]] = [ + ("SELECT COUNT(*) FROM hits;", lambda x: x.select(pl.len()).collect().item()), ( - "Q1", "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", lambda x: x.filter(pl.col("AdvEngineID") != 0).select(pl.len()).collect().item(), ), ( - "Q2", "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", lambda x: x.select(a_sum=pl.col("AdvEngineID").sum(), count=pl.len(), a_mean=pl.col("ResolutionWidth").mean()).collect().rows()[0], ), ( - "Q3", "SELECT AVG(UserID) FROM hits;", lambda x: x.select(pl.col("UserID").mean()).collect().item(), ), ( - "Q4", "SELECT COUNT(DISTINCT UserID) FROM hits;", lambda x: x.select(pl.col("UserID").n_unique()).collect().item(), ), ( - "Q5", "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", lambda x: x.select(pl.col("SearchPhrase").n_unique()).collect().item(), ), ( - "Q6", "SELECT MIN(EventDate), MAX(EventDate) FROM hits;", - lambda x: x.select(e_min=pl.col("EventDate").min(), e_max=pl.col("EventDate").max()).collect().rows()[0] + lambda x: x.select(e_min=pl.col("EventDate").min(), e_max=pl.col("EventDate").max()).collect().rows()[0], ), ( - "Q7", "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", lambda x: x.filter(pl.col("AdvEngineID") != 0) .group_by("AdvEngineID") @@ -53,7 +68,6 @@ .sort("count", descending=True).collect(), ), ( - "Q8", "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;", lambda x: x.group_by("RegionID") .agg(pl.col("UserID").n_unique().alias("u")) @@ -61,7 +75,6 @@ .head(10).collect(), ), ( - "Q9", "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("RegionID") .agg( @@ -76,7 +89,6 @@ .head(10).collect(), ), ( - "Q10", "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", lambda x: x.filter(pl.col("MobilePhoneModel") != "") .group_by("MobilePhoneModel") @@ -85,7 +97,6 @@ .head(10).collect(), ), ( - "Q11", "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", lambda x: x.filter(pl.col("MobilePhoneModel") != "") .group_by(["MobilePhone", "MobilePhoneModel"]) @@ -94,7 +105,6 @@ .head(10).collect(), ), ( - "Q12", "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by("SearchPhrase") @@ -103,7 +113,6 @@ .head(10).collect(), ), ( - "Q13", "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by("SearchPhrase") @@ -112,7 +121,6 @@ .head(10).collect(), ), ( - "Q14", "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["SearchEngineID", "SearchPhrase"]) @@ -121,7 +129,6 @@ .head(10).collect(), ), ( - "Q15", "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.group_by("UserID") .agg(pl.len().alias("count")) @@ -129,7 +136,6 @@ .head(10).collect(), ), ( - "Q16", "SELECT UserID, SearchPhrase, COUNT(*) FROM hits 
GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.group_by(["UserID", "SearchPhrase"]) .agg(pl.len().alias("count")) @@ -137,12 +143,10 @@ .head(10).collect(), ), ( - "Q17", "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", lambda x: x.group_by(["UserID", "SearchPhrase"]).agg(pl.len()).head(10).collect(), ), ( - "Q18", "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.group_by( [pl.col("UserID"), pl.col("EventTime").dt.minute(), "SearchPhrase"] @@ -152,17 +156,14 @@ .head(10).collect(), ), ( - "Q19", "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", lambda x: x.select("UserID").filter(pl.col("UserID") == 435090932899640449).collect(), ), ( - "Q20", "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", lambda x: x.filter(pl.col("URL").str.contains("google")).select(pl.len()).collect().item(), ), ( - "Q21", "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter( (pl.col("URL").str.contains("google")) & (pl.col("SearchPhrase") != "") @@ -173,7 +174,6 @@ .head(10).collect(), ), ( - "Q22", "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter( (pl.col("Title").str.contains("Google")) @@ -193,14 +193,12 @@ .head(10).collect(), ), ( - "Q23", "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", lambda x: x.filter(pl.col("URL").str.contains("google")) .sort("EventTime") .head(10).collect(), ), ( - "Q24", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .sort("EventTime") @@ -208,7 +206,6 @@ .head(10).collect(), ), ( - "Q25", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .sort("SearchPhrase") @@ -216,7 +213,6 @@ .head(10).collect(), ), ( - "Q26", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .sort(["EventTime", "SearchPhrase"]) @@ -224,22 +220,20 @@ .head(10).collect(), ), ( - "Q27", "SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", - lambda x: x.filter(pl.col("URL") != "") # WHERE URL <> '' - .group_by("CounterID") # GROUP BY CounterID + lambda x: x.filter(pl.col("URL") != "") + .group_by("CounterID") .agg( [ - pl.col("URL").str.len_chars().mean().alias("l"), # AVG(STRLEN(URL)) - pl.len().alias("c"), # COUNT(*) + pl.col("URL").str.len_chars().mean().alias("l"), + pl.len().alias("c"), ] ) - .filter(pl.col("c") > 100000) # HAVING COUNT(*) > 100000 - .sort("l", descending=True) # ORDER BY l DESC - .limit(25).collect(), # LIMIT 25, + .filter(pl.col("c") > 100000) + .sort("l", descending=True) + .limit(25).collect(), ), ( - "Q28", "SELECT REGEXP_REPLACE(Referer, '(?-u)^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", lambda x: ( x.filter(pl.col("Referer") != 
"") @@ -251,23 +245,21 @@ .group_by("k") .agg( [ - pl.col("Referer").str.len_chars().mean().alias("l"), # AVG(STRLEN(Referer)) - pl.col("Referer").min().alias("min_referer"), # MIN(Referer) - pl.len().alias("c"), # COUNT(*) + pl.col("Referer").str.len_chars().mean().alias("l"), + pl.col("Referer").min().alias("min_referer"), + pl.len().alias("c"), ] ) - .filter(pl.col("c") > 100000) # HAVING COUNT(*) > 100000 - .sort("l", descending=True) # ORDER BY l DESC - .limit(25).collect() # LIMIT 25 + .filter(pl.col("c") > 100000) + .sort("l", descending=True) + .limit(25).collect() ), ), ( - "Q29", "SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;", lambda x: x.select([(pl.col("ResolutionWidth") + i).sum().alias(f"c_{i}") for i in range(90)]).collect(), ), ( - "Q30", "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["SearchEngineID", "ClientIP"]) @@ -282,7 +274,6 @@ .head(10).collect(), ), ( - "Q31", "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), 
AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["WatchID", "ClientIP"]) @@ -297,7 +288,6 @@ .head(10).collect(), ), ( - "Q32", "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x.group_by(["WatchID", "ClientIP"]) .agg( @@ -311,7 +301,6 @@ .head(10).collect(), ), ( - "Q33", "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("URL") .agg(pl.len().alias("c")) @@ -319,7 +308,6 @@ .head(10).collect(), ), ( - "Q34", "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("URL") .agg(pl.len().alias("c")) @@ -328,7 +316,6 @@ .head(10).collect(), ), ( - "Q35", "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("ClientIP") .agg(pl.len().alias("c")) @@ -338,10 +325,9 @@ (pl.col("ClientIP") - 3).alias("ClientIP_minus_3") ]) .sort("c", descending=True) - .head(10).collect() + .head(10).collect(), ), ( - "Q36", "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -357,7 +343,6 @@ .head(10).collect(), ), ( - "Q37", "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -373,7 +358,6 @@ .head(10).collect(), ), ( - "Q38", "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -389,7 +373,6 @@ .slice(1000, 10).collect(), ), ( - "Q39", "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -417,7 +400,6 @@ .slice(1000, 10).collect(), ), ( - "Q40", "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -433,7 +415,6 @@ .slice(100, 10).collect(), ), ( - "Q41", "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", lambda x: 
x.filter( (pl.col("CounterID") == 62) @@ -449,7 +430,6 @@ .slice(10000, 10).collect(), ), ( - "Q42", "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -465,41 +445,52 @@ ), ] +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +@app.get("/health") +def health(): + return {"ok": True} + -def run_timings(lf: pl.LazyFrame) -> None: - for q in queries: - # Flush OS page cache before first run of each query - subprocess.run(['sync'], check=True) - subprocess.run(['sudo', 'tee', '/proc/sys/vm/drop_caches'], input=b'3', check=True, stdout=subprocess.DEVNULL) +@app.post("/load") +def load(): + global hits, hits_df + start = timeit.default_timer() + df = pl.scan_parquet("hits.parquet").collect() + df = df.with_columns( + (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")), + pl.col("EventDate").cast(pl.Date), + ) + df = df.rechunk() + hits_df = df + hits = df.lazy() + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} - times = [] - for _ in range(3): - start = timeit.default_timer() - result = q[2](lf) - end = timeit.default_timer() - if result is None: - times.append(None) - else: - times.append(round(end - start, 3)) - print(f"{times},") +@app.post("/query") +async def query(request: Request): + if hits is None: + raise HTTPException(status_code=409, detail="DataFrame not loaded; POST /load first") + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + fn = QUERIES[idx][1] + start = timeit.default_timer() + fn(hits) + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed, "index": idx} -data_size = os.path.getsize("hits.parquet") -print("run DataFrame (in-memory) queries, this loads all data in memory!") -start = timeit.default_timer() -df = pl.scan_parquet("hits.parquet").collect() -end = timeit.default_timer() -load_time = round(end - start, 3) -print(f"Load time: {load_time}") +@app.get("/data-size") +def data_size(): + if hits_df is None: + return {"bytes": 0} + return {"bytes": int(hits_df.estimated_size())} -# fix some types -df = df.with_columns( - (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")), - pl.col("EventDate").cast(pl.Date), -) -assert df["EventTime"][0].year == 2013 -df = df.rechunk() -lf = df.lazy() -run_timings(lf) +if __name__ == "__main__": + port = int(os.environ.get("BENCH_POLARS_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/polars-dataframe/start b/polars-dataframe/start new file mode 100755 index 0000000000..e3fab72731 --- /dev/null +++ b/polars-dataframe/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! 
> server.pid diff --git a/polars-dataframe/stop b/polars-dataframe/stop new file mode 100755 index 0000000000..787b35abcc --- /dev/null +++ b/polars-dataframe/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/polars/benchmark.sh b/polars/benchmark.sh index bf81cf5f3f..fc4bacc8f3 100755 --- a/polars/benchmark.sh +++ b/polars/benchmark.sh @@ -1,18 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install polars - -# Download the data -../download-hits-parquet-single - -# Run the queries - -./query.py 2>&1 | tee log.txt - -echo "Data size: $(du -bcs hits.parquet)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/polars/check b/polars/check new file mode 100755 index 0000000000..0c4b301a2d --- /dev/null +++ b/polars/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/polars/data-size b/polars/data-size new file mode 100755 index 0000000000..365ad4ecc8 --- /dev/null +++ b/polars/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/polars/install b/polars/install new file mode 100755 index 0000000000..e8eaaea9fe --- /dev/null +++ b/polars/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet polars pyarrow fastapi uvicorn diff --git a/polars/load b/polars/load new file mode 100755 index 0000000000..4c98a2da1a --- /dev/null +++ b/polars/load @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +# Polars uses LazyFrame (scan_parquet), so the parquet file must remain +# available for queries — we only build the plan here and DO NOT delete the +# input. /load is essentially "register the source". 
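Note on the load semantics above: the polars-dataframe variant materializes the table at /load (scan_parquet().collect() into RAM), while this variant only registers a lazy scan, so every query re-reads the Parquet file. A minimal sketch of the two styles, assuming hits.parquet sits in the working directory and using only the documented polars API:

    import polars as pl

    # polars/ (lazy): returns immediately, nothing is read until .collect()
    lazy = pl.scan_parquet("hits.parquet")

    # polars-dataframe/ (eager): the whole table is read into memory up front
    eager = pl.read_parquet("hits.parquet")

    # Same logical query; only the lazy version touches the file at this point.
    print(lazy.select(pl.len()).collect().item())
    print(eager.select(pl.len()).item())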
+elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +sync diff --git a/polars/queries.sql b/polars/queries.sql new file mode 100644 index 0000000000..717ebd9262 --- /dev/null +++ b/polars/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '(?-u)^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), 
SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; 
+SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/polars/query b/polars/query new file mode 100755 index 0000000000..9129884cf7 --- /dev/null +++ b/polars/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running polars server. +# Stdout: server response JSON. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. 
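The header comments above spell out the whole per-system contract: one SQL statement on stdin, the raw server response on stdout, the elapsed seconds as the last stderr line, non-zero exit on failure. Note that the SQL text has to byte-match a line of queries.sql, because the server dispatches by exact string lookup (QUERY_INDEX) and returns 404 for anything else. lib/benchmark-common.sh itself is not part of this hunk; the Python loop below is only a hypothetical illustration of how a driver could consume this contract, not the real implementation:

    import json
    import subprocess

    TRIES = 3  # matches the three timings per query in the results files

    with open("queries.sql") as f:
        queries = [line.strip() for line in f if line.strip()]

    results = []
    for sql in queries:
        timings = []
        for _ in range(TRIES):
            proc = subprocess.run(["./query"], input=sql,
                                  capture_output=True, text=True)
            if proc.returncode == 0 and proc.stderr.strip():
                # Elapsed seconds are the last line printed on stderr.
                timings.append(float(proc.stderr.strip().splitlines()[-1]))
            else:
                timings.append(None)  # reported as null, like the old run.sh output
        results.append(timings)

    print(json.dumps(results))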
+set -e + +query=$(cat) + +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/polars/results/20260509/c6a.4xlarge.json b/polars/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..48d21b91d9 --- /dev/null +++ b/polars/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Polars (Parquet)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["column-oriented","lukewarm-cold-run"], + "load_time": 0, + "data_size": 14779976446, + "result": [ + [2.641, 0.094, 0.095], + [2.411, 0.04, 0.04], + [0.059, 0.031, 0.031], + [0.541, 0.082, 0.081], + [1.069, 0.546, 0.539], + [0.808, 0.65, 0.633], + [0.039, 0.026, 0.025], + [0.024, 0.01, 0.01], + [1.246, 1.186, 1.176], + [1.412, 1.229, 1.251], + [0.3, 0.098, 0.097], + [0.607, 0.108, 0.11], + [1.215, 0.498, 0.503], + [2.282, 1.012, 1.028], + [0.862, 0.627, 0.631], + [0.698, 0.616, 0.611], + [2.561, 1.866, 1.855], + [2.491, 1.798, 1.841], + [5.12, 3.058, 3.014], + [0.179, 0.024, 0.024], + [12.574, 0.68, 0.681], + [10.797, 0.729, 0.712], + [21.188, 1.723, 1.725], + [45.202, 1.503, 1.502], + [2.17, 0.272, 0.271], + [0.653, 0.246, 0.239], + [2.378, 0.378, 0.368], + [9.255, 1.332, 1.359], + [8.215, 6.427, 6.383], + [0.203, 0.147, 0.143], + [1.872, 0.546, 0.55], + [5.238, 0.616, 0.608], + [5.945, 3.231, 3.092], + [10.47, 2.751, 2.713], + [10.483, 2.701, 2.79], + [0.583, 0.505, 0.51], + [0.169, 0.066, 0.07], + [0.09, 0.039, 0.039], + [0.099, 0.033, 0.033], + [0.204, 0.11, 0.114], + [0.058, 0.013, 0.012], + [0.031, 0.01, 0.011], + [0.024, 0.009, 0.009] +] +} + diff --git a/polars/results/20260509/c6a.metal.json b/polars/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..d7f3049fd6 --- /dev/null +++ b/polars/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Polars (Parquet)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["column-oriented","lukewarm-cold-run"], + "load_time": 0, + "data_size": 14779976446, + "result": [ + [2.647, 0.028, 0.027], + [2.581, 0.016, 0.015], + [0.04, 0.021, 0.021], + [0.751, 0.027, 0.029], + [0.948, 0.142, 0.127], + [1.027, 0.202, 0.209], + [0.024, 0.015, 0.015], + [0.034, 0.028, 0.028], + [0.727, 0.346, 0.346], + [0.961, 0.392, 0.431], + [0.236, 0.082, 0.081], + [0.773, 0.089, 0.085], + [1.184, 0.196, 0.217], + [2.221, 0.403, 0.395], + [0.786, 0.239, 0.223], + [0.352, 0.163, 0.163], + [2.374, 0.5, 0.53], + [1.995, 0.484, 1.109], + [4.032, 0.816, 0.784], + [0.103, 0.009, 0.01], + [12.719, 0.147, 0.129], + [10.994, 0.203, 0.18], + [21.413, 0.466, 0.463], + [45.295, 0.335, 0.333], + [2.176, 0.07, 0.068], + [1.103, 0.06, 0.062], + [2.82, 0.11, 0.111], + [9.711, 0.371, 0.408], + [7.911, 0.924, 0.939], + [0.067, 0.045, 0.041], + [1.816, 0.187, 0.181], + [5.348, 0.175, 0.185], + [4.527, 1.016, 1.038], + [9.729, 0.816, 0.778], + [9.686, 0.753, 0.751], + [0.206, 0.146, 0.147], + [0.165, 0.102, 0.1], + [0.107, 0.079, 0.075], + [0.103, 0.057, 0.059], + [0.177, 0.132, 0.117], + [0.094, 0.037, 0.036], + [0.065, 0.042, 0.034], + [0.056, 0.034, 0.033] +] +} + diff --git 
a/polars/query.py b/polars/server.py similarity index 86% rename from polars/query.py rename to polars/server.py index 1fbd34f2bb..f788df8a95 100755 --- a/polars/query.py +++ b/polars/server.py @@ -1,51 +1,66 @@ #!/usr/bin/env python3 +"""FastAPI wrapper around polars so it conforms to the ClickBench +install/start/check/stop/load/query interface. -import polars as pl +Routes: + GET /health -> 200 OK once the server is up + POST /load -> reads hits.parquet from the working directory, fixes + column types, holds the LazyFrame in memory, and + returns {"elapsed": } + POST /query -> body: SQL text. Looks it up in QUERIES, runs the + matching lambda against the loaded LazyFrame, and + returns {"elapsed": }. + GET /data-size -> bytes the DataFrame currently occupies (estimated_size) + +The (sql, lambda) list is the same as the previous standalone query.py. +""" + +import os import timeit from datetime import date -import json -import subprocess -import os -# The streaming engine will be the default soon -# https://pola.rs/posts/polars-in-aggregate-dec25/ +import polars as pl +import uvicorn +from fastapi import FastAPI, HTTPException, Request + +# Streaming engine will be the default soon. pl.Config.set_engine_affinity("streaming") -# 0: No., 1: SQL, 2: Polars -queries = [ - ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.select(pl.len()).collect().item()), +app = FastAPI() +hits: pl.LazyFrame | None = None +parquet_path: str = "hits.parquet" + + +# 43 ClickBench queries. Each is (sql, callable). sql strings must match the +# corresponding line in queries.sql. The lambdas come straight from the prior +# polars/query.py and have not been modified. +QUERIES: list[tuple[str, callable]] = [ + ("SELECT COUNT(*) FROM hits;", lambda x: x.select(pl.len()).collect().item()), ( - "Q1", "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", lambda x: x.filter(pl.col("AdvEngineID") != 0).select(pl.len()).collect().item(), ), ( - "Q2", "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", lambda x: x.select(a_sum=pl.col("AdvEngineID").sum(), count=pl.len(), a_mean=pl.col("ResolutionWidth").mean()).collect().rows()[0], ), ( - "Q3", "SELECT AVG(UserID) FROM hits;", lambda x: x.select(pl.col("UserID").mean()).collect().item(), ), ( - "Q4", "SELECT COUNT(DISTINCT UserID) FROM hits;", lambda x: x.select(pl.col("UserID").n_unique()).collect().item(), ), ( - "Q5", "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", lambda x: x.select(pl.col("SearchPhrase").n_unique()).collect().item(), ), ( - "Q6", "SELECT MIN(EventDate), MAX(EventDate) FROM hits;", - lambda x: x.select(e_min=pl.col("EventDate").min(), e_max=pl.col("EventDate").max()).collect().rows()[0] + lambda x: x.select(e_min=pl.col("EventDate").min(), e_max=pl.col("EventDate").max()).collect().rows()[0], ), ( - "Q7", "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", lambda x: x.filter(pl.col("AdvEngineID") != 0) .group_by("AdvEngineID") @@ -53,7 +68,6 @@ .sort("count", descending=True).collect(), ), ( - "Q8", "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;", lambda x: x.group_by("RegionID") .agg(pl.col("UserID").n_unique().alias("u")) @@ -61,7 +75,6 @@ .head(10).collect(), ), ( - "Q9", "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("RegionID") .agg( @@ -76,7 +89,6 @@ .head(10).collect(), ), ( - "Q10", "SELECT 
MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", lambda x: x.filter(pl.col("MobilePhoneModel") != "") .group_by("MobilePhoneModel") @@ -85,7 +97,6 @@ .head(10).collect(), ), ( - "Q11", "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", lambda x: x.filter(pl.col("MobilePhoneModel") != "") .group_by(["MobilePhone", "MobilePhoneModel"]) @@ -94,7 +105,6 @@ .head(10).collect(), ), ( - "Q12", "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by("SearchPhrase") @@ -103,7 +113,6 @@ .head(10).collect(), ), ( - "Q13", "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by("SearchPhrase") @@ -112,7 +121,6 @@ .head(10).collect(), ), ( - "Q14", "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["SearchEngineID", "SearchPhrase"]) @@ -121,7 +129,6 @@ .head(10).collect(), ), ( - "Q15", "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.group_by("UserID") .agg(pl.len().alias("count")) @@ -129,7 +136,6 @@ .head(10).collect(), ), ( - "Q16", "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.group_by(["UserID", "SearchPhrase"]) .agg(pl.len().alias("count")) @@ -137,12 +143,10 @@ .head(10).collect(), ), ( - "Q17", "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", lambda x: x.group_by(["UserID", "SearchPhrase"]).agg(pl.len()).head(10).collect(), ), ( - "Q18", "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.group_by( [pl.col("UserID"), pl.col("EventTime").dt.minute(), "SearchPhrase"] @@ -152,17 +156,14 @@ .head(10).collect(), ), ( - "Q19", "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", lambda x: x.select("UserID").filter(pl.col("UserID") == 435090932899640449).collect(), ), ( - "Q20", "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", lambda x: x.filter(pl.col("URL").str.contains("google")).select(pl.len()).collect().item(), ), ( - "Q21", "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter( (pl.col("URL").str.contains("google")) & (pl.col("SearchPhrase") != "") @@ -173,7 +174,6 @@ .head(10).collect(), ), ( - "Q22", "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter( (pl.col("Title").str.contains("Google")) @@ -193,14 +193,12 @@ .head(10).collect(), ), ( - "Q23", "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", lambda x: x.filter(pl.col("URL").str.contains("google")) .sort("EventTime") .head(10).collect(), ), ( - "Q24", "SELECT SearchPhrase FROM hits WHERE 
SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .sort("EventTime") @@ -208,7 +206,6 @@ .head(10).collect(), ), ( - "Q25", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .sort("SearchPhrase") @@ -216,7 +213,6 @@ .head(10).collect(), ), ( - "Q26", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .sort(["EventTime", "SearchPhrase"]) @@ -224,22 +220,20 @@ .head(10).collect(), ), ( - "Q27", "SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", - lambda x: x.filter(pl.col("URL") != "") # WHERE URL <> '' - .group_by("CounterID") # GROUP BY CounterID + lambda x: x.filter(pl.col("URL") != "") + .group_by("CounterID") .agg( [ - pl.col("URL").str.len_chars().mean().alias("l"), # AVG(STRLEN(URL)) - pl.len().alias("c"), # COUNT(*) + pl.col("URL").str.len_chars().mean().alias("l"), + pl.len().alias("c"), ] ) - .filter(pl.col("c") > 100000) # HAVING COUNT(*) > 100000 - .sort("l", descending=True) # ORDER BY l DESC - .limit(25).collect(), # LIMIT 25, + .filter(pl.col("c") > 100000) + .sort("l", descending=True) + .limit(25).collect(), ), ( - "Q28", "SELECT REGEXP_REPLACE(Referer, '(?-u)^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", lambda x: ( x.filter(pl.col("Referer") != "") @@ -251,23 +245,21 @@ .group_by("k") .agg( [ - pl.col("Referer").str.len_chars().mean().alias("l"), # AVG(STRLEN(Referer)) - pl.col("Referer").min().alias("min_referer"), # MIN(Referer) - pl.len().alias("c"), # COUNT(*) + pl.col("Referer").str.len_chars().mean().alias("l"), + pl.col("Referer").min().alias("min_referer"), + pl.len().alias("c"), ] ) - .filter(pl.col("c") > 100000) # HAVING COUNT(*) > 100000 - .sort("l", descending=True) # ORDER BY l DESC - .limit(25).collect() # LIMIT 25 + .filter(pl.col("c") > 100000) + .sort("l", descending=True) + .limit(25).collect() ), ), ( - "Q29", "SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 
47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;", lambda x: x.select([(pl.col("ResolutionWidth") + i).sum().alias(f"c_{i}") for i in range(90)]).collect(), ), ( - "Q30", "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["SearchEngineID", "ClientIP"]) @@ -282,7 +274,6 @@ .head(10).collect(), ), ( - "Q31", "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["WatchID", "ClientIP"]) @@ -297,7 +288,6 @@ .head(10).collect(), ), ( - "Q32", "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x.group_by(["WatchID", "ClientIP"]) .agg( @@ -311,7 +301,6 @@ .head(10).collect(), ), ( - "Q33", "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("URL") .agg(pl.len().alias("c")) @@ -319,7 +308,6 @@ .head(10).collect(), ), ( - "Q34", "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("URL") .agg(pl.len().alias("c")) @@ -328,7 +316,6 @@ .head(10).collect(), ), ( - "Q35", "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("ClientIP") .agg(pl.len().alias("c")) @@ -338,10 +325,9 @@ (pl.col("ClientIP") - 3).alias("ClientIP_minus_3") ]) .sort("c", descending=True) - .head(10).collect() + .head(10).collect(), ), ( - "Q36", "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -357,7 +343,6 @@ .head(10).collect(), ), ( - "Q37", "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ 
-373,7 +358,6 @@ .head(10).collect(), ), ( - "Q38", "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -389,7 +373,6 @@ .slice(1000, 10).collect(), ), ( - "Q39", "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -417,7 +400,6 @@ .slice(1000, 10).collect(), ), ( - "Q40", "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -433,7 +415,6 @@ .slice(100, 10).collect(), ), ( - "Q41", "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -449,7 +430,6 @@ .slice(10000, 10).collect(), ), ( - "Q42", "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -465,35 +445,51 @@ ), ] +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +@app.get("/health") +def health(): + return {"ok": True} + + +@app.post("/load") +def load(): + global hits + start = timeit.default_timer() + # Lazy: just builds the plan. Data is read on each query collect(). 
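An aside on the cast that follows: EventTime is multiplied by 1e6 and cast to a microsecond Datetime, which only makes sense if the column holds Unix epoch seconds (the removed in-memory script asserted that the first converted value falls in 2013). A one-line check of that conversion, under the same assumption:

    import polars as pl

    # 1372636800 s since the epoch is 2013-07-01 00:00:00 UTC
    s = pl.Series("EventTime", [1372636800])
    print((s * 1_000_000).cast(pl.Datetime(time_unit="us")))
    # -> [2013-07-01 00:00:00]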
+ hits = pl.scan_parquet(parquet_path).with_columns( + (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")), + pl.col("EventDate").cast(pl.Date), + ) + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + -def run_timings(lf: pl.LazyFrame) -> None: - for q in queries: - # Flush OS page cache before first run of each query - subprocess.run(['sync'], check=True) - subprocess.run(['sudo', 'tee', '/proc/sys/vm/drop_caches'], input=b'3', check=True, stdout=subprocess.DEVNULL) +@app.post("/query") +async def query(request: Request): + if hits is None: + raise HTTPException(status_code=409, detail="DataFrame not loaded; POST /load first") + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + fn = QUERIES[idx][1] + start = timeit.default_timer() + fn(hits) + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed, "index": idx} - times = [] - for _ in range(3): - start = timeit.default_timer() - result = q[2](lf) - end = timeit.default_timer() - if result is None: - times.append(None) - else: - times.append(round(end - start, 3)) - print(f"{times},") -data_size = os.path.getsize("hits.parquet") +@app.get("/data-size") +def data_size(): + # LazyFrame doesn't materialize, so report the on-disk parquet size. + try: + return {"bytes": os.path.getsize(parquet_path)} + except OSError: + return {"bytes": 0} -# Run from Parquet -start = timeit.default_timer() -lf = pl.scan_parquet("hits.parquet").with_columns( - (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")), - pl.col("EventDate").cast(pl.Date), -) -end = timeit.default_timer() -load_time = round(end - start, 3) -print(f"Load time: {load_time}") -print("run parquet queries") -run_timings(lf) +if __name__ == "__main__": + port = int(os.environ.get("BENCH_POLARS_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/polars/start b/polars/start new file mode 100755 index 0000000000..7fee34fc14 --- /dev/null +++ b/polars/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, do nothing. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! > server.pid diff --git a/polars/stop b/polars/stop new file mode 100755 index 0000000000..00a85c15e6 --- /dev/null +++ b/polars/stop @@ -0,0 +1,16 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/postgresql-indexed/benchmark.sh b/postgresql-indexed/benchmark.sh index c6f06df301..531bd65038 100755 --- a/postgresql-indexed/benchmark.sh +++ b/postgresql-indexed/benchmark.sh @@ -1,74 +1,5 @@ #!/bin/bash - -set -eu - -PGVERSION=17 - -# Source: https://wiki.postgresql.org/wiki/Apt -sudo apt-get update -y -sudo apt-get install -y postgresql-common -sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y - -sudo apt-get update -y -sudo apt-get install -y postgresql-common postgresql-$PGVERSION - -memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo) -threads=$(nproc) -cpus=$(($threads / 2)) -# Shared buffers is set to 25% of memory in AWS RDS by default. 
We do the same. -# https://docs.aws.amazon.com/prescriptive-guidance/latest/tuning-postgresql-parameters/shared-buffers.html -shared_buffers=$(($memory / 4)) -# Effective cache size does not need to be perfect, but it should be somewhat -# close to the total memory minus what is expected to be used for queries. -# https://www.cybertec-postgresql.com/en/effective_cache_size-what-it-means-in-postgresql/ -effective_cache_size=$(($memory - ($memory / 4))) -# By default, max_worker_processes is set to in postgres. We want to be able to -# use all the threads for parallel workers so we increase it. We also add a -# small buffer of 15 for any other background workers that might be created. -max_worker_processes=$(($threads + 15)) -# Below we make sure to configure the rest of the parallel worker settings to -# match the number of cpu cores: -# https://www.crunchydata.com/blog/postgres-tuning-and-performance-for-analytics-data -# -# We also increase work_mem because we are doing an analytics workload to allow -# some more memory for sorting, aggregations, etc. -# -# It's necessary to increase max_wal_size to make the dataload not take very -# long. With the default value it's constantly checkpointing, and the PG logs -# warn you about that and tell you to increase max_wal_size. - -sudo tee /etc/postgresql/$PGVERSION/main/conf.d/clickbench.conf <&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -echo -n "Load time: " -command time -f '%e' ./load.sh - -# COPY 99997497 -# Time: 2341543.463 ms (39:01.543) - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo du -bcs /var/lib/postgresql/$PGVERSION/main/ | grep total - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/postgresql-indexed/check b/postgresql-indexed/check new file mode 100755 index 0000000000..5c6f711234 --- /dev/null +++ b/postgresql-indexed/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo -u postgres psql -t -c 'SELECT 1' >/dev/null diff --git a/postgresql-indexed/data-size b/postgresql-indexed/data-size new file mode 100755 index 0000000000..14b724ff18 --- /dev/null +++ b/postgresql-indexed/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} +sudo du -bcs /var/lib/postgresql/$PGVERSION/main/ | grep total | awk '{print $1}' diff --git a/postgresql-indexed/install b/postgresql-indexed/install new file mode 100755 index 0000000000..05b3082aea --- /dev/null +++ b/postgresql-indexed/install @@ -0,0 +1,32 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} + +# Source: https://wiki.postgresql.org/wiki/Apt +sudo apt-get update -y +sudo apt-get install -y postgresql-common +sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y + +sudo apt-get update -y +sudo apt-get install -y postgresql-$PGVERSION + +memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo) +threads=$(nproc) +cpus=$(($threads / 2)) +shared_buffers=$(($memory / 4)) +effective_cache_size=$(($memory - ($memory / 4))) +max_worker_processes=$(($threads + 15)) + +sudo tee /etc/postgresql/$PGVERSION/main/conf.d/clickbench.conf <&1) +status=$? 
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/postgresql-indexed/run.sh b/postgresql-indexed/run.sh deleted file mode 100755 index 5fa550e405..0000000000 --- a/postgresql-indexed/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | sudo -u postgres psql test -t 2>&1 | grep -P 'Time|psql: error' | tail -n1 -done diff --git a/postgresql-indexed/start b/postgresql-indexed/start new file mode 100755 index 0000000000..941f213c51 --- /dev/null +++ b/postgresql-indexed/start @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} +sudo systemctl start postgresql@$PGVERSION-main diff --git a/postgresql-indexed/stop b/postgresql-indexed/stop new file mode 100755 index 0000000000..47969378d7 --- /dev/null +++ b/postgresql-indexed/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +PGVERSION=${PGVERSION:-17} +sudo systemctl stop postgresql@$PGVERSION-main || true diff --git a/postgresql-orioledb/benchmark.sh b/postgresql-orioledb/benchmark.sh index e78865660d..531bd65038 100755 --- a/postgresql-orioledb/benchmark.sh +++ b/postgresql-orioledb/benchmark.sh @@ -1,75 +1,5 @@ #!/bin/bash - -# digest: sha256:3304142dbe8de8d5bbaa0e398cca58683ed603add4524c3582debf9c119994f1 -VERSION=beta12-pg17 -CONTAINER_NAME=orioledb-clickbench - -echo "Installing dependencies..." -sudo apt-get update -y -sudo apt-get install -y docker.io pigz postgresql-client - -# Using Docker due to pending patches in upstream PostgreSQL, see https://web.archive.org/web/20250722125912/https://www.orioledb.com/docs/usage/getting-started#start-postgresql -echo "Starting OrioleDB Docker container with name $CONTAINER_NAME. Using tag $VERSION..." -# Increase shared memory size, because Docker default will hit the limit ("ERROR: could not resize shared memory segment") -MEM_SIZE=$(grep MemTotal /proc/meminfo | awk '{print $2}') -SHM_SIZE=$(echo "$MEM_SIZE/2/1024" | bc) -mkdir -p /tmp/data -sudo docker run --name $CONTAINER_NAME -v /tmp/data:/tmp/data --shm-size="$SHM_SIZE"m -p 5432:5432 -e POSTGRES_HOST_AUTH_METHOD=trust -d orioledb/orioledb:$VERSION - -# Similar (but not identical) to PostgreSQL configuration -echo "Updating configuration" -THREADS=$(nproc) -CPUS=$(($THREADS / 2)) - -# Since we are only using OrioleDB tables, set to 1/4 of RAM and keep default value for shared buffers -# See https://www.orioledb.com/docs/usage/configuration#orioledbmain_buffers -MAIN_BUFFERS=$(($MEM_SIZE / 4)) -EFFECTIVE_CACHE_SIZE=$(($MEM_SIZE - ($MEM_SIZE / 4))) -MAX_WORKER_PROCESSES=$(($THREADS + 15)) - -envsubst < "$CONTAINER_NAME.log" & -while ! tail -n 1 "$CONTAINER_NAME.log" | grep -q 'database system is ready to accept connections'; do - echo "OrioleDB is not running yet. Checking again in 1 second..." - sleep 1 -done - -echo "Downloading dataset..." -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' -O /tmp/data/hits.tsv.gz -pigz -d -f /tmp/data/hits.tsv.gz - -echo "Creating database and table..." 
-psql -h localhost -p 5432 -U postgres -c "CREATE DATABASE test;" -psql -h localhost -p 5432 -U postgres -c "CREATE EXTENSION IF NOT EXISTS orioledb;" -psql -h localhost -p 5432 -U postgres -d test < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -# Expected: 'Access method: orioledb' -psql -h localhost -p 5432 -U postgres -d test -c "\d+ hits" | grep 'Access method:' - -echo "Loading data..." -command time -f '%e' ./load.sh - -echo "Running queries..." -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo docker exec -i $CONTAINER_NAME du -bcs /var/lib/postgresql/data/orioledb_data | grep total - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/postgresql-orioledb/check b/postgresql-orioledb/check new file mode 100755 index 0000000000..a9019ec652 --- /dev/null +++ b/postgresql-orioledb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +psql -h localhost -p 5432 -U postgres -t -c 'SELECT 1' >/dev/null diff --git a/postgresql-orioledb/data-size b/postgresql-orioledb/data-size new file mode 100755 index 0000000000..c65100637c --- /dev/null +++ b/postgresql-orioledb/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-orioledb-clickbench} +sudo docker exec -i "$CONTAINER_NAME" du -bcs /var/lib/postgresql/data/orioledb_data | grep total | awk '{print $1}' diff --git a/postgresql-orioledb/install b/postgresql-orioledb/install new file mode 100755 index 0000000000..88d2cca689 --- /dev/null +++ b/postgresql-orioledb/install @@ -0,0 +1,47 @@ +#!/bin/bash +set -eu + +VERSION=${VERSION:-beta12-pg17} +CONTAINER_NAME=${CONTAINER_NAME:-orioledb-clickbench} + +sudo apt-get update -y +sudo apt-get install -y docker.io pigz postgresql-client + +# Pull image up front so subsequent ./start is fast and idempotent. +sudo docker pull "orioledb/orioledb:$VERSION" + +mkdir -p /tmp/data + +# (Re)create container with our config. Remove any existing one first. +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +MEM_SIZE=$(grep MemTotal /proc/meminfo | awk '{print $2}') +SHM_SIZE=$(echo "$MEM_SIZE/2/1024" | bc) + +sudo docker run --name "$CONTAINER_NAME" \ + -v /tmp/data:/tmp/data \ + --shm-size="${SHM_SIZE}m" \ + -p 5432:5432 \ + -e POSTGRES_HOST_AUTH_METHOD=trust \ + -d "orioledb/orioledb:$VERSION" + +THREADS=$(nproc) +CPUS=$(($THREADS / 2)) +MAIN_BUFFERS=$(($MEM_SIZE / 4)) +EFFECTIVE_CACHE_SIZE=$(($MEM_SIZE - ($MEM_SIZE / 4))) +MAX_WORKER_PROCESSES=$(($THREADS + 15)) + +cat <&1) +status=$? 
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/postgresql-orioledb/run.sh b/postgresql-orioledb/run.sh deleted file mode 100755 index 71a0d68670..0000000000 --- a/postgresql-orioledb/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | psql -h localhost -p 5432 -U postgres -d test -t 2>&1 | grep -P 'Time|psql: error' | tail -n1 -done diff --git a/postgresql-orioledb/start b/postgresql-orioledb/start new file mode 100755 index 0000000000..f45f281e52 --- /dev/null +++ b/postgresql-orioledb/start @@ -0,0 +1,9 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-orioledb-clickbench} + +# Idempotent: start if not already running. +if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" +fi diff --git a/postgresql-orioledb/stop b/postgresql-orioledb/stop new file mode 100755 index 0000000000..4c10d953f3 --- /dev/null +++ b/postgresql-orioledb/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +CONTAINER_NAME=${CONTAINER_NAME:-orioledb-clickbench} +sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true diff --git a/postgresql/benchmark.sh b/postgresql/benchmark.sh index 4c1a3a4e6f..531bd65038 100755 --- a/postgresql/benchmark.sh +++ b/postgresql/benchmark.sh @@ -1,74 +1,5 @@ #!/bin/bash - -set -eu - -PGVERSION=17 - -# Source: https://wiki.postgresql.org/wiki/Apt -sudo apt-get update -y -sudo apt-get install -y postgresql-common -y -sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y - -sudo apt-get update -y -sudo apt-get install -y postgresql-$PGVERSION - -memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo) -threads=$(nproc) -cpus=$(($threads / 2)) -# Shared buffers is set to 25% of memory in AWS RDS by default. We do the same. -# https://docs.aws.amazon.com/prescriptive-guidance/latest/tuning-postgresql-parameters/shared-buffers.html -shared_buffers=$(($memory / 4)) -# Effective cache size does not need to be perfect, but it should be somewhat -# close to the total memory minus what is expected to be used for queries. -# https://www.cybertec-postgresql.com/en/effective_cache_size-what-it-means-in-postgresql/ -effective_cache_size=$(($memory - ($memory / 4))) -# By default, max_worker_processes is set to in postgres. We want to be able to -# use all the threads for parallel workers so we increase it. We also add a -# small buffer of 15 for any other background workers that might be created. -max_worker_processes=$(($threads + 15)) -# Below we make sure to configure the rest of the parallel worker settings to -# match the number of cpu cores: -# https://www.crunchydata.com/blog/postgres-tuning-and-performance-for-analytics-data -# -# We also increase work_mem because we are doing an analytics workload to allow -# some more memory for sorting, aggregations, etc. -# -# It's necessary to increase max_wal_size to make the dataload not take very -# long. 
With the default value it's constantly checkpointing, and the PG logs -# warn you about that and tell you to increase max_wal_size. - -sudo tee /etc/postgresql/$PGVERSION/main/conf.d/clickbench.conf <&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -echo -n "Load time: " -command time -f '%e' ./load.sh - -# COPY 99997497 -# Time: 2341543.463 ms (39:01.543) - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo du -bcs /var/lib/postgresql/$PGVERSION/main/ | grep total - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/postgresql/check b/postgresql/check new file mode 100755 index 0000000000..5c6f711234 --- /dev/null +++ b/postgresql/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo -u postgres psql -t -c 'SELECT 1' >/dev/null diff --git a/postgresql/data-size b/postgresql/data-size new file mode 100755 index 0000000000..14b724ff18 --- /dev/null +++ b/postgresql/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} +sudo du -bcs /var/lib/postgresql/$PGVERSION/main/ | grep total | awk '{print $1}' diff --git a/postgresql/install b/postgresql/install new file mode 100755 index 0000000000..05b3082aea --- /dev/null +++ b/postgresql/install @@ -0,0 +1,32 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} + +# Source: https://wiki.postgresql.org/wiki/Apt +sudo apt-get update -y +sudo apt-get install -y postgresql-common +sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y + +sudo apt-get update -y +sudo apt-get install -y postgresql-$PGVERSION + +memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo) +threads=$(nproc) +cpus=$(($threads / 2)) +shared_buffers=$(($memory / 4)) +effective_cache_size=$(($memory - ($memory / 4))) +max_worker_processes=$(($threads + 15)) + +sudo tee /etc/postgresql/$PGVERSION/main/conf.d/clickbench.conf <&1) +status=$? + +# psql may print "ERROR:" on a failed query but exit 0 with -t. Detect. +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +# Print everything except the Time: line to stdout. +printf '%s\n' "$out" | grep -v '^Time:' + +# Extract last "Time: NNN.NNN ms" line and emit seconds on stderr. 
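The grep/awk pair below turns psql's \timing footer into fractional seconds on stderr. Restated as a standalone Python sketch, purely for illustration (the harness uses the shell version; the sample text here is made up to show the expected shape):

    import re

    sample = " 99997497\n\nTime: 1234.567 ms\n"  # illustrative psql -t output with \timing on

    matches = re.findall(r"Time:\s+([0-9]+\.[0-9]+)", sample)
    if not matches:
        raise SystemExit("no timing in psql output")
    # Last match wins, converted from milliseconds to seconds.
    print(f"{float(matches[-1]) / 1000:.3f}")  # -> 1.235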
+time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/postgresql/run.sh b/postgresql/run.sh deleted file mode 100755 index 96a8161ec1..0000000000 --- a/postgresql/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | sudo -u postgres psql test -t 2>&1 | grep -P 'Time|psql: error' | tail -n1 -done diff --git a/postgresql/start b/postgresql/start new file mode 100755 index 0000000000..941f213c51 --- /dev/null +++ b/postgresql/start @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} +sudo systemctl start postgresql@$PGVERSION-main diff --git a/postgresql/stop b/postgresql/stop new file mode 100755 index 0000000000..47969378d7 --- /dev/null +++ b/postgresql/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +PGVERSION=${PGVERSION:-17} +sudo systemctl stop postgresql@$PGVERSION-main || true diff --git a/presto-datalake-partitioned/benchmark.sh b/presto-datalake-partitioned/benchmark.sh index 979092e3b9..7215fbe151 100755 --- a/presto-datalake-partitioned/benchmark.sh +++ b/presto-datalake-partitioned/benchmark.sh @@ -1,144 +1,6 @@ #!/bin/bash - -set -e - -PRESTO_VERSION=0.297 - -# Presto's S3 client uses the AWS default credentials chain, which fails on -# anonymous public buckets. To read the public bucket we drop a tiny shim -# that returns AnonymousAWSCredentials into the hive-hadoop2 plugin and -# point presto.s3.credentials-provider at it. - -sudo apt-get update -y -sudo apt-get install -y docker.io openjdk-21-jre-headless bc wget - -wget --continue --quiet -O presto-cli.jar \ - "https://github.com/prestodb/presto/releases/download/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar" -chmod +x presto-cli.jar - -mkdir -p data/meta etc/catalog shim -# The trino container used to compile the shim runs as uid 1000. Make -# sure that uid can write here even when benchmark.sh runs as root -# (cloud-init). -sudo chown 1000:1000 shim - -cat > shim/S3AnonymousProvider.java <<'EOF' -import com.amazonaws.auth.AWSCredentials; -import com.amazonaws.auth.AWSCredentialsProvider; -import com.amazonaws.auth.AnonymousAWSCredentials; -import org.apache.hadoop.conf.Configuration; -import java.net.URI; - -public class S3AnonymousProvider implements AWSCredentialsProvider { - public S3AnonymousProvider(URI uri, Configuration conf) {} - public AWSCredentials getCredentials() { return new AnonymousAWSCredentials(); } - public void refresh() {} -} -EOF - -# Compile the shim against AWS SDK + Hadoop jars bundled in the presto -# image. The presto image ships with a JRE only, so use the trino image -# for the JDK and target Java 11 bytecode for compatibility. 
-sudo docker run --rm \ - -v "$PWD/shim:/shim" \ - --entrypoint sh trinodb/trino:latest -c ' - set -e - cd /shim - CP="/usr/lib/trino/plugin/hive/hdfs/com.amazonaws_aws-java-sdk-core-1.12.797.jar:/usr/lib/trino/plugin/hive/hdfs/io.trino.hadoop_hadoop-apache-3.3.5-3.jar" - javac --release 11 -cp "$CP" S3AnonymousProvider.java - jar cf S3AnonymousProvider.jar S3AnonymousProvider.class - ' - -cat > etc/catalog/hive.properties <<'EOF' -connector.name=hive-hadoop2 -hive.metastore=file -hive.metastore.catalog.dir=file:///data/meta -hive.config.resources=/etc/presto/core-site.xml -hive.non-managed-table-writes-enabled=true -EOF - -cat > etc/core-site.xml <<'EOF' - - - - presto.s3.credentials-provider - S3AnonymousProvider - - - presto.s3.endpoint - https://s3.eu-central-1.amazonaws.com - - -EOF - -cat > etc/jvm.config <<'EOF' --server --Xmx48G --XX:+UseG1GC --XX:G1HeapRegionSize=32M --XX:+UseGCOverheadLimit --XX:+ExplicitGCInvokesConcurrent --XX:+HeapDumpOnOutOfMemoryError --XX:+ExitOnOutOfMemoryError --Djdk.attach.allowAttachSelf=true ---add-opens=java.base/java.io=ALL-UNNAMED ---add-opens=java.base/java.lang=ALL-UNNAMED ---add-opens=java.base/java.lang.ref=ALL-UNNAMED ---add-opens=java.base/java.lang.reflect=ALL-UNNAMED ---add-opens=java.base/java.net=ALL-UNNAMED ---add-opens=java.base/java.nio=ALL-UNNAMED ---add-opens=java.base/java.security=ALL-UNNAMED ---add-opens=java.base/javax.security.auth=ALL-UNNAMED ---add-opens=java.base/javax.security.auth.login=ALL-UNNAMED ---add-opens=java.base/java.text=ALL-UNNAMED ---add-opens=java.base/java.util=ALL-UNNAMED ---add-opens=java.base/java.util.concurrent=ALL-UNNAMED ---add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED ---add-opens=java.base/java.util.concurrent.locks=ALL-UNNAMED ---add-opens=java.base/java.util.regex=ALL-UNNAMED ---add-opens=java.base/sun.nio.cs=ALL-UNNAMED ---add-opens=java.base/sun.security.action=ALL-UNNAMED ---add-opens=java.base/sun.security.util=ALL-UNNAMED ---add-opens=java.base/sun.util.calendar=ALL-UNNAMED ---add-opens=java.management/javax.management=ALL-UNNAMED ---add-opens=java.management/javax.management.openmbean=ALL-UNNAMED ---add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED ---add-opens=java.sql/java.sql=ALL-UNNAMED -EOF - -cat > etc/config.properties <<'EOF' -coordinator=true -node-scheduler.include-coordinator=true -http-server.http.port=8080 -discovery-server.enabled=true -discovery.uri=http://localhost:8080 -query.max-memory=24GB -query.max-memory-per-node=16GB -query.max-total-memory-per-node=24GB -memory.heap-headroom-per-node=8GB -EOF - -sudo docker rm -f presto 2>/dev/null || true -sudo docker run -d --name presto \ - -p 8081:8080 \ - -v "$PWD/etc/catalog/hive.properties:/opt/presto-server/etc/catalog/hive.properties:ro" \ - -v "$PWD/etc/jvm.config:/opt/presto-server/etc/jvm.config:ro" \ - -v "$PWD/etc/config.properties:/opt/presto-server/etc/config.properties:ro" \ - -v "$PWD/etc/core-site.xml:/etc/presto/core-site.xml:ro" \ - -v "$PWD/data/meta:/data/meta" \ - -v "$PWD/shim/S3AnonymousProvider.jar:/opt/presto-server/plugin/hive-hadoop2/S3AnonymousProvider.jar:ro" \ - prestodb/presto:${PRESTO_VERSION} - -until sudo docker logs presto 2>&1 | grep -q "SERVER STARTED"; do - sleep 3 -done -sleep 3 - -LOAD_START=$(date +%s) -java -jar presto-cli.jar --server http://localhost:8081 --file create.sql -LOAD_END=$(date +%s) - -./run.sh 2>&1 | tee log.txt - -echo "Load time: $((LOAD_END - LOAD_START))" -echo "Data size: 14737666736" +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+# Datalake variant: Parquet is read directly from public S3, no download. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/presto-datalake-partitioned/check b/presto-datalake-partitioned/check new file mode 100755 index 0000000000..fd74785f0c --- /dev/null +++ b/presto-datalake-partitioned/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Presto's coordinator emits "SERVER STARTED" when ready. +sudo docker logs presto 2>&1 | grep -q "SERVER STARTED" diff --git a/presto-datalake-partitioned/data-size b/presto-datalake-partitioned/data-size new file mode 100755 index 0000000000..03827db0ee --- /dev/null +++ b/presto-datalake-partitioned/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Data is read from S3 on demand; published partitioned size is ~14.7 GB. +echo 14737666736 diff --git a/presto-datalake-partitioned/install b/presto-datalake-partitioned/install new file mode 100755 index 0000000000..46bb615d52 --- /dev/null +++ b/presto-datalake-partitioned/install @@ -0,0 +1,134 @@ +#!/bin/bash +set -e + +PRESTO_VERSION=0.297 + +# Presto's S3 client uses the AWS default credentials chain, which fails on +# anonymous public buckets. To read the public bucket we drop a tiny shim +# that returns AnonymousAWSCredentials into the hive-hadoop2 plugin and +# point presto.s3.credentials-provider at it. + +if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io +fi +sudo apt-get install -y openjdk-21-jre-headless bc wget + +if [ ! -f presto-cli.jar ]; then + wget --continue --quiet -O presto-cli.jar \ + "https://github.com/prestodb/presto/releases/download/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar" + chmod +x presto-cli.jar +fi + +sudo docker pull prestodb/presto:${PRESTO_VERSION} +sudo docker pull trinodb/trino:latest + +mkdir -p data/meta etc/catalog shim +# The trino container used to compile the shim runs as uid 1000. Make +# sure that uid can write here even when scripts run as root (cloud-init). +sudo chown 1000:1000 shim + +cat > shim/S3AnonymousProvider.java <<'EOF' +import com.amazonaws.auth.AWSCredentials; +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.AnonymousAWSCredentials; +import org.apache.hadoop.conf.Configuration; +import java.net.URI; + +public class S3AnonymousProvider implements AWSCredentialsProvider { + public S3AnonymousProvider(URI uri, Configuration conf) {} + public AWSCredentials getCredentials() { return new AnonymousAWSCredentials(); } + public void refresh() {} +} +EOF + +# Compile the shim against AWS SDK + Hadoop jars bundled in the presto +# image. The presto image ships with a JRE only, so use the trino image +# for the JDK and target Java 11 bytecode for compatibility. Skip if +# already built. +if [ ! 
-f shim/S3AnonymousProvider.jar ]; then + sudo docker run --rm \ + -v "$PWD/shim:/shim" \ + --entrypoint sh trinodb/trino:latest -c ' + set -e + cd /shim + CP="/usr/lib/trino/plugin/hive/hdfs/com.amazonaws_aws-java-sdk-core-1.12.797.jar:/usr/lib/trino/plugin/hive/hdfs/io.trino.hadoop_hadoop-apache-3.3.5-3.jar" + javac --release 11 -cp "$CP" S3AnonymousProvider.java + jar cf S3AnonymousProvider.jar S3AnonymousProvider.class + ' +fi + +cat > etc/catalog/hive.properties <<'EOF' +connector.name=hive-hadoop2 +hive.metastore=file +hive.metastore.catalog.dir=file:///data/meta +hive.config.resources=/etc/presto/core-site.xml +hive.non-managed-table-writes-enabled=true +EOF + +cat > etc/core-site.xml <<'EOF' + + + + presto.s3.credentials-provider + S3AnonymousProvider + + + presto.s3.endpoint + https://s3.eu-central-1.amazonaws.com + + +EOF + +RAM_GB=$(awk '/MemTotal/{ printf "%d", $2 / 1024 / 1024 }' /proc/meminfo) +HEAP_GB=$(( RAM_GB * 70 / 100 )) +[ "$HEAP_GB" -lt 4 ] && HEAP_GB=4 +QUERY_GB=$(( HEAP_GB / 2 )) +HEADROOM_GB=$(( HEAP_GB / 8 + 1 )) + +cat > etc/jvm.config < etc/config.properties <&2 diff --git a/presto-datalake-partitioned/run.sh b/presto-datalake-partitioned/run.sh deleted file mode 100755 index 8697089faa..0000000000 --- a/presto-datalake-partitioned/run.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 - -mapfile -t QUERIES < queries.sql - -for query in "${QUERIES[@]}"; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - START=$(date +%s.%N) - java -jar presto-cli.jar --server http://localhost:8081 \ - --catalog hive --schema clickbench \ - --session offset_clause_enabled=true \ - --output-format=NULL --execute "${query}" >/dev/null 2>&1 - EXIT=$? - END=$(date +%s.%N) - if [ "$EXIT" = "0" ]; then - ELAPSED=$(echo "$END - $START" | bc) - printf "%.3f" "$ELAPSED" - else - printf "null" - fi - [[ "$i" != "$TRIES" ]] && echo -n ", " - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/presto-datalake-partitioned/start b/presto-datalake-partitioned/start new file mode 100755 index 0000000000..388bceb7c1 --- /dev/null +++ b/presto-datalake-partitioned/start @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +PRESTO_VERSION=0.297 + +if sudo docker ps --format '{{.Names}}' | grep -qx presto; then + exit 0 +fi +if sudo docker ps -a --format '{{.Names}}' | grep -qx presto; then + sudo docker start presto + exit 0 +fi + +sudo docker run -d --name presto \ + -p 8081:8080 \ + -v "$PWD/etc/catalog/hive.properties:/opt/presto-server/etc/catalog/hive.properties:ro" \ + -v "$PWD/etc/jvm.config:/opt/presto-server/etc/jvm.config:ro" \ + -v "$PWD/etc/config.properties:/opt/presto-server/etc/config.properties:ro" \ + -v "$PWD/etc/core-site.xml:/etc/presto/core-site.xml:ro" \ + -v "$PWD/data/meta:/data/meta" \ + -v "$PWD/shim/S3AnonymousProvider.jar:/opt/presto-server/plugin/hive-hadoop2/S3AnonymousProvider.jar:ro" \ + prestodb/presto:${PRESTO_VERSION} diff --git a/presto-datalake-partitioned/stop b/presto-datalake-partitioned/stop new file mode 100755 index 0000000000..2211543c33 --- /dev/null +++ b/presto-datalake-partitioned/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +sudo docker stop presto 2>/dev/null || true +exit 0 diff --git a/presto-datalake/benchmark.sh b/presto-datalake/benchmark.sh index 618870db0a..7215fbe151 100755 --- a/presto-datalake/benchmark.sh +++ b/presto-datalake/benchmark.sh @@ -1,144 +1,6 @@ #!/bin/bash - -set -e - -PRESTO_VERSION=0.297 - -# Presto's S3 client uses the AWS default credentials 
chain, which fails on -# anonymous public buckets. To read the public bucket we drop a tiny shim -# that returns AnonymousAWSCredentials into the hive-hadoop2 plugin and -# point presto.s3.credentials-provider at it. - -sudo apt-get update -y -sudo apt-get install -y docker.io openjdk-21-jre-headless bc wget - -wget --continue --quiet -O presto-cli.jar \ - "https://github.com/prestodb/presto/releases/download/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar" -chmod +x presto-cli.jar - -mkdir -p data/meta etc/catalog shim -# The trino container used to compile the shim runs as uid 1000. Make -# sure that uid can write here even when benchmark.sh runs as root -# (cloud-init). -sudo chown 1000:1000 shim - -cat > shim/S3AnonymousProvider.java <<'EOF' -import com.amazonaws.auth.AWSCredentials; -import com.amazonaws.auth.AWSCredentialsProvider; -import com.amazonaws.auth.AnonymousAWSCredentials; -import org.apache.hadoop.conf.Configuration; -import java.net.URI; - -public class S3AnonymousProvider implements AWSCredentialsProvider { - public S3AnonymousProvider(URI uri, Configuration conf) {} - public AWSCredentials getCredentials() { return new AnonymousAWSCredentials(); } - public void refresh() {} -} -EOF - -# Compile the shim against AWS SDK + Hadoop jars bundled in the presto -# image. The presto image ships with a JRE only, so use the trino image -# for the JDK and target Java 11 bytecode for compatibility. -sudo docker run --rm \ - -v "$PWD/shim:/shim" \ - --entrypoint sh trinodb/trino:latest -c ' - set -e - cd /shim - CP="/usr/lib/trino/plugin/hive/hdfs/com.amazonaws_aws-java-sdk-core-1.12.797.jar:/usr/lib/trino/plugin/hive/hdfs/io.trino.hadoop_hadoop-apache-3.3.5-3.jar" - javac --release 11 -cp "$CP" S3AnonymousProvider.java - jar cf S3AnonymousProvider.jar S3AnonymousProvider.class - ' - -cat > etc/catalog/hive.properties <<'EOF' -connector.name=hive-hadoop2 -hive.metastore=file -hive.metastore.catalog.dir=file:///data/meta -hive.config.resources=/etc/presto/core-site.xml -hive.non-managed-table-writes-enabled=true -EOF - -cat > etc/core-site.xml <<'EOF' - - - - presto.s3.credentials-provider - S3AnonymousProvider - - - presto.s3.endpoint - https://s3.eu-central-1.amazonaws.com - - -EOF - -cat > etc/jvm.config <<'EOF' --server --Xmx48G --XX:+UseG1GC --XX:G1HeapRegionSize=32M --XX:+UseGCOverheadLimit --XX:+ExplicitGCInvokesConcurrent --XX:+HeapDumpOnOutOfMemoryError --XX:+ExitOnOutOfMemoryError --Djdk.attach.allowAttachSelf=true ---add-opens=java.base/java.io=ALL-UNNAMED ---add-opens=java.base/java.lang=ALL-UNNAMED ---add-opens=java.base/java.lang.ref=ALL-UNNAMED ---add-opens=java.base/java.lang.reflect=ALL-UNNAMED ---add-opens=java.base/java.net=ALL-UNNAMED ---add-opens=java.base/java.nio=ALL-UNNAMED ---add-opens=java.base/java.security=ALL-UNNAMED ---add-opens=java.base/javax.security.auth=ALL-UNNAMED ---add-opens=java.base/javax.security.auth.login=ALL-UNNAMED ---add-opens=java.base/java.text=ALL-UNNAMED ---add-opens=java.base/java.util=ALL-UNNAMED ---add-opens=java.base/java.util.concurrent=ALL-UNNAMED ---add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED ---add-opens=java.base/java.util.concurrent.locks=ALL-UNNAMED ---add-opens=java.base/java.util.regex=ALL-UNNAMED ---add-opens=java.base/sun.nio.cs=ALL-UNNAMED ---add-opens=java.base/sun.security.action=ALL-UNNAMED ---add-opens=java.base/sun.security.util=ALL-UNNAMED ---add-opens=java.base/sun.util.calendar=ALL-UNNAMED ---add-opens=java.management/javax.management=ALL-UNNAMED 
---add-opens=java.management/javax.management.openmbean=ALL-UNNAMED ---add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED ---add-opens=java.sql/java.sql=ALL-UNNAMED -EOF - -cat > etc/config.properties <<'EOF' -coordinator=true -node-scheduler.include-coordinator=true -http-server.http.port=8080 -discovery-server.enabled=true -discovery.uri=http://localhost:8080 -query.max-memory=24GB -query.max-memory-per-node=16GB -query.max-total-memory-per-node=24GB -memory.heap-headroom-per-node=8GB -EOF - -sudo docker rm -f presto 2>/dev/null || true -sudo docker run -d --name presto \ - -p 8081:8080 \ - -v "$PWD/etc/catalog/hive.properties:/opt/presto-server/etc/catalog/hive.properties:ro" \ - -v "$PWD/etc/jvm.config:/opt/presto-server/etc/jvm.config:ro" \ - -v "$PWD/etc/config.properties:/opt/presto-server/etc/config.properties:ro" \ - -v "$PWD/etc/core-site.xml:/etc/presto/core-site.xml:ro" \ - -v "$PWD/data/meta:/data/meta" \ - -v "$PWD/shim/S3AnonymousProvider.jar:/opt/presto-server/plugin/hive-hadoop2/S3AnonymousProvider.jar:ro" \ - prestodb/presto:${PRESTO_VERSION} - -until sudo docker logs presto 2>&1 | grep -q "SERVER STARTED"; do - sleep 3 -done -sleep 3 - -LOAD_START=$(date +%s) -java -jar presto-cli.jar --server http://localhost:8081 --file create.sql -LOAD_END=$(date +%s) - -./run.sh 2>&1 | tee log.txt - -echo "Load time: $((LOAD_END - LOAD_START))" -echo "Data size: 14779976446" +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Datalake variant: Parquet is read directly from public S3, no download. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/presto-datalake/check b/presto-datalake/check new file mode 100755 index 0000000000..fd74785f0c --- /dev/null +++ b/presto-datalake/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Presto's coordinator emits "SERVER STARTED" when ready. +sudo docker logs presto 2>&1 | grep -q "SERVER STARTED" diff --git a/presto-datalake/data-size b/presto-datalake/data-size new file mode 100755 index 0000000000..8a280d60f8 --- /dev/null +++ b/presto-datalake/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Data is read from S3 on demand; report the published single-file size. +echo 14779976446 diff --git a/presto-datalake/install b/presto-datalake/install new file mode 100755 index 0000000000..46bb615d52 --- /dev/null +++ b/presto-datalake/install @@ -0,0 +1,134 @@ +#!/bin/bash +set -e + +PRESTO_VERSION=0.297 + +# Presto's S3 client uses the AWS default credentials chain, which fails on +# anonymous public buckets. To read the public bucket we drop a tiny shim +# that returns AnonymousAWSCredentials into the hive-hadoop2 plugin and +# point presto.s3.credentials-provider at it. + +if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io +fi +sudo apt-get install -y openjdk-21-jre-headless bc wget + +if [ ! -f presto-cli.jar ]; then + wget --continue --quiet -O presto-cli.jar \ + "https://github.com/prestodb/presto/releases/download/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar" + chmod +x presto-cli.jar +fi + +sudo docker pull prestodb/presto:${PRESTO_VERSION} +sudo docker pull trinodb/trino:latest + +mkdir -p data/meta etc/catalog shim +# The trino container used to compile the shim runs as uid 1000. Make +# sure that uid can write here even when scripts run as root (cloud-init). 
+sudo chown 1000:1000 shim + +cat > shim/S3AnonymousProvider.java <<'EOF' +import com.amazonaws.auth.AWSCredentials; +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.AnonymousAWSCredentials; +import org.apache.hadoop.conf.Configuration; +import java.net.URI; + +public class S3AnonymousProvider implements AWSCredentialsProvider { + public S3AnonymousProvider(URI uri, Configuration conf) {} + public AWSCredentials getCredentials() { return new AnonymousAWSCredentials(); } + public void refresh() {} +} +EOF + +# Compile the shim against AWS SDK + Hadoop jars bundled in the presto +# image. The presto image ships with a JRE only, so use the trino image +# for the JDK and target Java 11 bytecode for compatibility. Skip if +# already built. +if [ ! -f shim/S3AnonymousProvider.jar ]; then + sudo docker run --rm \ + -v "$PWD/shim:/shim" \ + --entrypoint sh trinodb/trino:latest -c ' + set -e + cd /shim + CP="/usr/lib/trino/plugin/hive/hdfs/com.amazonaws_aws-java-sdk-core-1.12.797.jar:/usr/lib/trino/plugin/hive/hdfs/io.trino.hadoop_hadoop-apache-3.3.5-3.jar" + javac --release 11 -cp "$CP" S3AnonymousProvider.java + jar cf S3AnonymousProvider.jar S3AnonymousProvider.class + ' +fi + +cat > etc/catalog/hive.properties <<'EOF' +connector.name=hive-hadoop2 +hive.metastore=file +hive.metastore.catalog.dir=file:///data/meta +hive.config.resources=/etc/presto/core-site.xml +hive.non-managed-table-writes-enabled=true +EOF + +cat > etc/core-site.xml <<'EOF' + + + + presto.s3.credentials-provider + S3AnonymousProvider + + + presto.s3.endpoint + https://s3.eu-central-1.amazonaws.com + + +EOF + +RAM_GB=$(awk '/MemTotal/{ printf "%d", $2 / 1024 / 1024 }' /proc/meminfo) +HEAP_GB=$(( RAM_GB * 70 / 100 )) +[ "$HEAP_GB" -lt 4 ] && HEAP_GB=4 +QUERY_GB=$(( HEAP_GB / 2 )) +HEADROOM_GB=$(( HEAP_GB / 8 + 1 )) + +cat > etc/jvm.config < etc/config.properties <&2 diff --git a/presto-datalake/run.sh b/presto-datalake/run.sh deleted file mode 100755 index 8697089faa..0000000000 --- a/presto-datalake/run.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 - -mapfile -t QUERIES < queries.sql - -for query in "${QUERIES[@]}"; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - START=$(date +%s.%N) - java -jar presto-cli.jar --server http://localhost:8081 \ - --catalog hive --schema clickbench \ - --session offset_clause_enabled=true \ - --output-format=NULL --execute "${query}" >/dev/null 2>&1 - EXIT=$? 
- END=$(date +%s.%N) - if [ "$EXIT" = "0" ]; then - ELAPSED=$(echo "$END - $START" | bc) - printf "%.3f" "$ELAPSED" - else - printf "null" - fi - [[ "$i" != "$TRIES" ]] && echo -n ", " - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/presto-datalake/start b/presto-datalake/start new file mode 100755 index 0000000000..388bceb7c1 --- /dev/null +++ b/presto-datalake/start @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +PRESTO_VERSION=0.297 + +if sudo docker ps --format '{{.Names}}' | grep -qx presto; then + exit 0 +fi +if sudo docker ps -a --format '{{.Names}}' | grep -qx presto; then + sudo docker start presto + exit 0 +fi + +sudo docker run -d --name presto \ + -p 8081:8080 \ + -v "$PWD/etc/catalog/hive.properties:/opt/presto-server/etc/catalog/hive.properties:ro" \ + -v "$PWD/etc/jvm.config:/opt/presto-server/etc/jvm.config:ro" \ + -v "$PWD/etc/config.properties:/opt/presto-server/etc/config.properties:ro" \ + -v "$PWD/etc/core-site.xml:/etc/presto/core-site.xml:ro" \ + -v "$PWD/data/meta:/data/meta" \ + -v "$PWD/shim/S3AnonymousProvider.jar:/opt/presto-server/plugin/hive-hadoop2/S3AnonymousProvider.jar:ro" \ + prestodb/presto:${PRESTO_VERSION} diff --git a/presto-datalake/stop b/presto-datalake/stop new file mode 100755 index 0000000000..2211543c33 --- /dev/null +++ b/presto-datalake/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +sudo docker stop presto 2>/dev/null || true +exit 0 diff --git a/presto-partitioned/benchmark.sh b/presto-partitioned/benchmark.sh index f86e87c427..6a7f45d3a1 100755 --- a/presto-partitioned/benchmark.sh +++ b/presto-partitioned/benchmark.sh @@ -1,100 +1,5 @@ #!/bin/bash - -set -e - -PRESTO_VERSION=0.297 - -# Install Docker (Presto's official image bundles its own JRE) and the Presto CLI. -sudo apt-get update -y -sudo apt-get install -y docker.io openjdk-21-jre-headless bc wget - -wget --continue --quiet -O presto-cli.jar \ - "https://github.com/prestodb/presto/releases/download/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar" -chmod +x presto-cli.jar - -# Download the partitioned dataset (100 parquet files). -mkdir -p data/hits -cd data/hits -seq 0 99 | xargs -P16 -I{} wget --continue --quiet \ - "https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet" -cd ../.. - -# Presto Hive catalog: file metastore on the local filesystem. 
-mkdir -p etc/catalog -cat > etc/catalog/hive.properties <<'EOF' -connector.name=hive-hadoop2 -hive.metastore=file -hive.metastore.catalog.dir=file:///clickbench/metastore -hive.allow-drop-table=true -hive.non-managed-table-writes-enabled=true -EOF - -cat > etc/jvm.config <<'EOF' --server --Xmx48G --XX:+UseG1GC --XX:G1HeapRegionSize=32M --XX:+UseGCOverheadLimit --XX:+ExplicitGCInvokesConcurrent --XX:+HeapDumpOnOutOfMemoryError --XX:+ExitOnOutOfMemoryError --Djdk.attach.allowAttachSelf=true ---add-opens=java.base/java.io=ALL-UNNAMED ---add-opens=java.base/java.lang=ALL-UNNAMED ---add-opens=java.base/java.lang.ref=ALL-UNNAMED ---add-opens=java.base/java.lang.reflect=ALL-UNNAMED ---add-opens=java.base/java.net=ALL-UNNAMED ---add-opens=java.base/java.nio=ALL-UNNAMED ---add-opens=java.base/java.security=ALL-UNNAMED ---add-opens=java.base/javax.security.auth=ALL-UNNAMED ---add-opens=java.base/javax.security.auth.login=ALL-UNNAMED ---add-opens=java.base/java.text=ALL-UNNAMED ---add-opens=java.base/java.util=ALL-UNNAMED ---add-opens=java.base/java.util.concurrent=ALL-UNNAMED ---add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED ---add-opens=java.base/java.util.concurrent.locks=ALL-UNNAMED ---add-opens=java.base/java.util.regex=ALL-UNNAMED ---add-opens=java.base/sun.nio.cs=ALL-UNNAMED ---add-opens=java.base/sun.security.action=ALL-UNNAMED ---add-opens=java.base/sun.security.util=ALL-UNNAMED ---add-opens=java.base/sun.util.calendar=ALL-UNNAMED ---add-opens=java.management/javax.management=ALL-UNNAMED ---add-opens=java.management/javax.management.openmbean=ALL-UNNAMED ---add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED ---add-opens=java.sql/java.sql=ALL-UNNAMED -EOF - -cat > etc/config.properties <<'EOF' -coordinator=true -node-scheduler.include-coordinator=true -http-server.http.port=8080 -discovery-server.enabled=true -discovery.uri=http://localhost:8080 -query.max-memory=24GB -query.max-memory-per-node=16GB -query.max-total-memory-per-node=24GB -memory.heap-headroom-per-node=8GB -EOF - -sudo docker rm -f presto 2>/dev/null || true -sudo docker run -d --name presto \ - -p 8081:8080 \ - -v "$PWD/etc/catalog/hive.properties:/opt/presto-server/etc/catalog/hive.properties:ro" \ - -v "$PWD/etc/jvm.config:/opt/presto-server/etc/jvm.config:ro" \ - -v "$PWD/etc/config.properties:/opt/presto-server/etc/config.properties:ro" \ - -v "$PWD/data:/clickbench" \ - prestodb/presto:${PRESTO_VERSION} - -until sudo docker logs presto 2>&1 | grep -q "SERVER STARTED"; do - sleep 3 -done -sleep 3 - -LOAD_START=$(date +%s) -java -jar presto-cli.jar --server http://localhost:8081 --file create.sql -LOAD_END=$(date +%s) - -./run.sh 2>&1 | tee log.txt - -echo "Load time: $((LOAD_END - LOAD_START))" -echo "Data size: $(du -bcs data/hits/*.parquet | tail -n1 | cut -f1)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/presto-partitioned/check b/presto-partitioned/check new file mode 100755 index 0000000000..fd74785f0c --- /dev/null +++ b/presto-partitioned/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Presto's coordinator emits "SERVER STARTED" when ready. 
+sudo docker logs presto 2>&1 | grep -q "SERVER STARTED" diff --git a/presto-partitioned/data-size b/presto-partitioned/data-size new file mode 100755 index 0000000000..a464ea0c70 --- /dev/null +++ b/presto-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs data/hits/hits_*.parquet | tail -n1 | cut -f1 diff --git a/presto-partitioned/install b/presto-partitioned/install new file mode 100755 index 0000000000..c1c25ea431 --- /dev/null +++ b/presto-partitioned/install @@ -0,0 +1,88 @@ +#!/bin/bash +set -e + +PRESTO_VERSION=0.297 + +# Install Docker (Presto's official image bundles its own JRE) and the +# Presto CLI (a Java fat jar, runs against any JRE). +if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io +fi +sudo apt-get install -y openjdk-21-jre-headless bc wget + +if [ ! -f presto-cli.jar ]; then + wget --continue --quiet -O presto-cli.jar \ + "https://github.com/prestodb/presto/releases/download/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar" + chmod +x presto-cli.jar +fi + +sudo docker pull prestodb/presto:${PRESTO_VERSION} + +# Presto Hive catalog: file metastore on the local filesystem, no external +# Hive Metastore Service or Hadoop required. +mkdir -p etc/catalog +cat > etc/catalog/hive.properties <<'EOF' +connector.name=hive-hadoop2 +hive.metastore=file +hive.metastore.catalog.dir=file:///clickbench/metastore +hive.allow-drop-table=true +hive.non-managed-table-writes-enabled=true +EOF + +# Presto's default 1 GB heap is too small for ClickBench. Bump it together +# with the matching query-memory configuration. +RAM_GB=$(awk '/MemTotal/{ printf "%d", $2 / 1024 / 1024 }' /proc/meminfo) +HEAP_GB=$(( RAM_GB * 70 / 100 )) +[ "$HEAP_GB" -lt 4 ] && HEAP_GB=4 +QUERY_GB=$(( HEAP_GB / 2 )) +HEADROOM_GB=$(( HEAP_GB / 8 + 1 )) + +cat > etc/jvm.config < etc/config.properties <&2 diff --git a/presto-partitioned/run.sh b/presto-partitioned/run.sh deleted file mode 100755 index 8697089faa..0000000000 --- a/presto-partitioned/run.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 - -mapfile -t QUERIES < queries.sql - -for query in "${QUERIES[@]}"; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - START=$(date +%s.%N) - java -jar presto-cli.jar --server http://localhost:8081 \ - --catalog hive --schema clickbench \ - --session offset_clause_enabled=true \ - --output-format=NULL --execute "${query}" >/dev/null 2>&1 - EXIT=$? 
- END=$(date +%s.%N) - if [ "$EXIT" = "0" ]; then - ELAPSED=$(echo "$END - $START" | bc) - printf "%.3f" "$ELAPSED" - else - printf "null" - fi - [[ "$i" != "$TRIES" ]] && echo -n ", " - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/presto-partitioned/start b/presto-partitioned/start new file mode 100755 index 0000000000..92bbe10997 --- /dev/null +++ b/presto-partitioned/start @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +PRESTO_VERSION=0.297 + +if sudo docker ps --format '{{.Names}}' | grep -qx presto; then + exit 0 +fi +if sudo docker ps -a --format '{{.Names}}' | grep -qx presto; then + sudo docker start presto + exit 0 +fi + +sudo docker run -d --name presto \ + -p 8081:8080 \ + -v "$PWD/etc/catalog/hive.properties:/opt/presto-server/etc/catalog/hive.properties:ro" \ + -v "$PWD/etc/jvm.config:/opt/presto-server/etc/jvm.config:ro" \ + -v "$PWD/etc/config.properties:/opt/presto-server/etc/config.properties:ro" \ + -v "$PWD/data:/clickbench" \ + prestodb/presto:${PRESTO_VERSION} diff --git a/presto-partitioned/stop b/presto-partitioned/stop new file mode 100755 index 0000000000..2211543c33 --- /dev/null +++ b/presto-partitioned/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +sudo docker stop presto 2>/dev/null || true +exit 0 diff --git a/presto/benchmark.sh b/presto/benchmark.sh index b3491059d0..b851876173 100755 --- a/presto/benchmark.sh +++ b/presto/benchmark.sh @@ -1,111 +1,5 @@ #!/bin/bash - -set -e - -PRESTO_VERSION=0.297 - -# Install Docker (Presto's official image bundles its own JRE) and the Presto CLI. -sudo apt-get update -y -sudo apt-get install -y docker.io openjdk-21-jre-headless bc wget - -wget --continue --quiet -O presto-cli.jar \ - "https://github.com/prestodb/presto/releases/download/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar" -chmod +x presto-cli.jar - -# Download the dataset. -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.parquet' - -# Place the parquet file inside its own directory; the Hive connector -# reads every file in the table's external_location. -mkdir -p data/hits -ln -f hits.parquet data/hits/hits.parquet - -# Presto Hive catalog: file metastore on the local filesystem, no external -# Hive Metastore Service or Hadoop required. -mkdir -p etc/catalog -cat > etc/catalog/hive.properties <<'EOF' -connector.name=hive-hadoop2 -hive.metastore=file -hive.metastore.catalog.dir=file:///clickbench/metastore -hive.allow-drop-table=true -hive.non-managed-table-writes-enabled=true -EOF - -# Presto's default 1 GB heap is too small for ClickBench. Bump it together -# with the matching query-memory configuration. 
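# (Editorial worked example, not part of either version of the script: the new
# install scripts in this diff replace the fixed -Xmx48G that follows here with
# sizes derived from host RAM. On the c6a.4xlarge with 32 GiB mentioned in
# presto/install below, /proc/meminfo reports a little under the nominal size,
# so the integer arithmetic lands roughly at:)
RAM_GB=31                            # awk truncates MemTotal / 1024 / 1024
HEAP_GB=$(( RAM_GB * 70 / 100 ))     # 21 -> -Xmx21G instead of -Xmx48G
QUERY_GB=$(( HEAP_GB / 2 ))          # 10 -> presumably the query.max-memory caps
HEADROOM_GB=$(( HEAP_GB / 8 + 1 ))   # 3  -> presumably memory.heap-headroom-per-node
echo "heap=${HEAP_GB}G query=${QUERY_GB}G headroom=${HEADROOM_GB}G"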
-cat > etc/jvm.config <<'EOF' --server --Xmx48G --XX:+UseG1GC --XX:G1HeapRegionSize=32M --XX:+UseGCOverheadLimit --XX:+ExplicitGCInvokesConcurrent --XX:+HeapDumpOnOutOfMemoryError --XX:+ExitOnOutOfMemoryError --Djdk.attach.allowAttachSelf=true ---add-opens=java.base/java.io=ALL-UNNAMED ---add-opens=java.base/java.lang=ALL-UNNAMED ---add-opens=java.base/java.lang.ref=ALL-UNNAMED ---add-opens=java.base/java.lang.reflect=ALL-UNNAMED ---add-opens=java.base/java.net=ALL-UNNAMED ---add-opens=java.base/java.nio=ALL-UNNAMED ---add-opens=java.base/java.security=ALL-UNNAMED ---add-opens=java.base/javax.security.auth=ALL-UNNAMED ---add-opens=java.base/javax.security.auth.login=ALL-UNNAMED ---add-opens=java.base/java.text=ALL-UNNAMED ---add-opens=java.base/java.util=ALL-UNNAMED ---add-opens=java.base/java.util.concurrent=ALL-UNNAMED ---add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED ---add-opens=java.base/java.util.concurrent.locks=ALL-UNNAMED ---add-opens=java.base/java.util.regex=ALL-UNNAMED ---add-opens=java.base/sun.nio.cs=ALL-UNNAMED ---add-opens=java.base/sun.security.action=ALL-UNNAMED ---add-opens=java.base/sun.security.util=ALL-UNNAMED ---add-opens=java.base/sun.util.calendar=ALL-UNNAMED ---add-opens=java.management/javax.management=ALL-UNNAMED ---add-opens=java.management/javax.management.openmbean=ALL-UNNAMED ---add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED ---add-opens=java.sql/java.sql=ALL-UNNAMED -EOF - -cat > etc/config.properties <<'EOF' -coordinator=true -node-scheduler.include-coordinator=true -http-server.http.port=8080 -discovery-server.enabled=true -discovery.uri=http://localhost:8080 -query.max-memory=24GB -query.max-memory-per-node=16GB -query.max-total-memory-per-node=24GB -memory.heap-headroom-per-node=8GB -EOF - -# Start the Presto server. The data dir is exposed at /clickbench so the -# external_location URI in create.sql resolves correctly inside the -# container. -sudo docker rm -f presto 2>/dev/null || true -sudo docker run -d --name presto \ - -p 8081:8080 \ - -v "$PWD/etc/catalog/hive.properties:/opt/presto-server/etc/catalog/hive.properties:ro" \ - -v "$PWD/etc/jvm.config:/opt/presto-server/etc/jvm.config:ro" \ - -v "$PWD/etc/config.properties:/opt/presto-server/etc/config.properties:ro" \ - -v "$PWD/data:/clickbench" \ - prestodb/presto:${PRESTO_VERSION} - -# Wait for Presto to finish starting up. -until sudo docker logs presto 2>&1 | grep -q "SERVER STARTED"; do - sleep 3 -done -sleep 3 - -# Create the schema, the external table over the parquet file and a view -# that exposes the standard ClickBench column types. -LOAD_START=$(date +%s) -java -jar presto-cli.jar --server http://localhost:8081 --file create.sql -LOAD_END=$(date +%s) - -# Run the benchmark queries. -./run.sh 2>&1 | tee log.txt - -echo "Load time: $((LOAD_END - LOAD_START))" -echo "Data size: $(stat -c %s hits.parquet)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/presto/check b/presto/check new file mode 100755 index 0000000000..fd74785f0c --- /dev/null +++ b/presto/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Presto's coordinator emits "SERVER STARTED" when ready. 
+sudo docker logs presto 2>&1 | grep -q "SERVER STARTED" diff --git a/presto/data-size b/presto/data-size new file mode 100755 index 0000000000..11a50607bc --- /dev/null +++ b/presto/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +stat -c %s data/hits/hits.parquet diff --git a/presto/install b/presto/install new file mode 100755 index 0000000000..da9ebe34b7 --- /dev/null +++ b/presto/install @@ -0,0 +1,91 @@ +#!/bin/bash +set -e + +PRESTO_VERSION=0.297 + +# Install Docker (Presto's official image bundles its own JRE) and the +# Presto CLI (a Java fat jar, runs against any JRE). +if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io +fi +sudo apt-get install -y openjdk-21-jre-headless bc wget + +if [ ! -f presto-cli.jar ]; then + wget --continue --quiet -O presto-cli.jar \ + "https://github.com/prestodb/presto/releases/download/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar" + chmod +x presto-cli.jar +fi + +sudo docker pull prestodb/presto:${PRESTO_VERSION} + +# Presto Hive catalog: file metastore on the local filesystem, no external +# Hive Metastore Service or Hadoop required. +mkdir -p etc/catalog +cat > etc/catalog/hive.properties <<'EOF' +connector.name=hive-hadoop2 +hive.metastore=file +hive.metastore.catalog.dir=file:///clickbench/metastore +hive.allow-drop-table=true +hive.non-managed-table-writes-enabled=true +EOF + +# Presto's default 1 GB heap is too small for ClickBench. Scale the heap +# and matching query-memory caps to host RAM — the previous fixed +# -Xmx48G + 24 GB query.max-memory crashed mid-query on c6a.4xlarge +# (32 GiB RAM) with `unexpected end of stream` once the JVM tried to +# allocate over physical RAM and earlyoom killed it. +RAM_GB=$(awk '/MemTotal/{ printf "%d", $2 / 1024 / 1024 }' /proc/meminfo) +HEAP_GB=$(( RAM_GB * 70 / 100 )) +[ "$HEAP_GB" -lt 4 ] && HEAP_GB=4 +QUERY_GB=$(( HEAP_GB / 2 )) +HEADROOM_GB=$(( HEAP_GB / 8 + 1 )) + +cat > etc/jvm.config < etc/config.properties <&2 diff --git a/presto/run.sh b/presto/run.sh deleted file mode 100755 index 8697089faa..0000000000 --- a/presto/run.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 - -mapfile -t QUERIES < queries.sql - -for query in "${QUERIES[@]}"; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - START=$(date +%s.%N) - java -jar presto-cli.jar --server http://localhost:8081 \ - --catalog hive --schema clickbench \ - --session offset_clause_enabled=true \ - --output-format=NULL --execute "${query}" >/dev/null 2>&1 - EXIT=$? 
- END=$(date +%s.%N) - if [ "$EXIT" = "0" ]; then - ELAPSED=$(echo "$END - $START" | bc) - printf "%.3f" "$ELAPSED" - else - printf "null" - fi - [[ "$i" != "$TRIES" ]] && echo -n ", " - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/presto/start b/presto/start new file mode 100755 index 0000000000..92bbe10997 --- /dev/null +++ b/presto/start @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +PRESTO_VERSION=0.297 + +if sudo docker ps --format '{{.Names}}' | grep -qx presto; then + exit 0 +fi +if sudo docker ps -a --format '{{.Names}}' | grep -qx presto; then + sudo docker start presto + exit 0 +fi + +sudo docker run -d --name presto \ + -p 8081:8080 \ + -v "$PWD/etc/catalog/hive.properties:/opt/presto-server/etc/catalog/hive.properties:ro" \ + -v "$PWD/etc/jvm.config:/opt/presto-server/etc/jvm.config:ro" \ + -v "$PWD/etc/config.properties:/opt/presto-server/etc/config.properties:ro" \ + -v "$PWD/data:/clickbench" \ + prestodb/presto:${PRESTO_VERSION} diff --git a/presto/stop b/presto/stop new file mode 100755 index 0000000000..2211543c33 --- /dev/null +++ b/presto/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +sudo docker stop presto 2>/dev/null || true +exit 0 diff --git a/questdb/benchmark.sh b/questdb/benchmark.sh index b33a8728af..1aa9264b91 100755 --- a/questdb/benchmark.sh +++ b/questdb/benchmark.sh @@ -1,82 +1,5 @@ #!/bin/bash - -# Install - -qdb_version="9.3.1" -if [[ $(arch) == "aarch64" ]] || [[ $(arch) == "arm"* ]]; then - # ARM uses no-JRE binary, so we need to install JDK - wget --continue --progress=dot:giga https://github.com/graalvm/graalvm-ce-builds/releases/download/jdk-17.0.9/graalvm-community-jdk-17.0.9_linux-aarch64_bin.tar.gz - tar xf graalvm-community-*.tar.gz --one-top-level=graalvm --strip-components 1 - export JAVA_HOME=$PWD/graalvm - - wget --continue --progress=dot:giga https://github.com/questdb/questdb/releases/download/${qdb_version}/questdb-${qdb_version}-no-jre-bin.tar.gz - tar xf questdb*.tar.gz --one-top-level=questdb --strip-components 1 - mkdir questdb/bin - mv questdb/* questdb/bin -else - wget --continue --progress=dot:giga https://github.com/questdb/questdb/releases/download/${qdb_version}/questdb-${qdb_version}-rt-linux-x86-64.tar.gz - tar xf questdb*.tar.gz --one-top-level=questdb --strip-components 1 -fi - -questdb/bin/questdb.sh start - -while ! nc -z localhost 9000; do - sleep 0.1 -done - -sed -i 's/query.timeout.sec=60/query.timeout.sec=500/' ~/.questdb/conf/server.conf -sed -i "s|cairo.sql.copy.root=import|cairo.sql.copy.root=$PWD|" ~/.questdb/conf/server.conf -questdb/bin/questdb.sh stop -questdb/bin/questdb.sh start - -# Import the data - -../download-hits-csv - -curl -G --data-urlencode "query=$(cat create.sql)" 'http://localhost:9000/exec' - -if [[ "$(nproc)" -ge 96 ]] -then - # SQL COPY works best on metal instances: - start=$(date +%s) - - curl -G --data-urlencode "query=copy hits from 'hits.csv' with timestamp 'EventTime' format 'yyyy-MM-dd HH:mm:ss';" 'http://localhost:9000/exec' - - echo 'waiting for import to finish...' - until [ "$(curl -s -G --data-urlencode "query=select * from sys.text_import_log where phase is null and status='finished';" 'http://localhost:9000/exec' | grep -c '"count":1')" -ge 1 ]; do - echo '.' - sleep 5 - done - - end=$(date +%s) - echo "Load time: $((end - start))" -else - # On smaller instances use this: - start=$(date +%s) - - curl -F data=@hits.csv 'http://localhost:9000/imp?name=hits&maxUncommittedRows=5000000' - - echo 'waiting for rows to become readable...' 
- until [ "$(curl -s -G --data-urlencode "query=select 1 from (select count() c from hits) where c = 99997497;" 'http://localhost:9000/exec' | grep -c '"count":1')" -ge 1 ]; do - echo '.' - sleep 5 - done - - end=$(date +%s) - echo "Load time: $((end - start))" -fi - -# Run queries - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -du -bcs ~/.questdb/db/hits* | grep total - -cat log.txt | \ - grep -P '"timings"|"error"|null' | \ - sed -r -e 's/^.*"error".*$/null/; s/^.*"execute":([0-9]*),.*$/\1/' | \ - awk '{ print ($1) / 1000000000 }' | \ - awk '{ printf "%.3f\n", $1 }' | \ - sed -r -e 's/^0$/null/' | \ - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/questdb/check b/questdb/check new file mode 100755 index 0000000000..3f929fb25b --- /dev/null +++ b/questdb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sSf -G --data-urlencode 'query=SELECT 1' 'http://localhost:9000/exec' >/dev/null diff --git a/questdb/data-size b/questdb/data-size new file mode 100755 index 0000000000..937ec07810 --- /dev/null +++ b/questdb/data-size @@ -0,0 +1,11 @@ +#!/bin/bash +set -eu + +# QuestDB stores per-table data under ~/.questdb/db/. The previous +# `~/.questdb/db/hits*` glob silently broke on v9.x layouts that don't +# suffix the directory name; the run logged +# du: cannot access '/root/.questdb/db/hits*': No such file or directory +# followed by "Data size: 0", and the materialized view's parser rejected +# the output for missing data_size. Measure the whole db tree instead +# (it only contains the bench's `hits` table). +sudo du -bcs ~/.questdb/db 2>/dev/null | tail -1 | awk '{print $1}' diff --git a/questdb/install b/questdb/install new file mode 100755 index 0000000000..9bb3cbe070 --- /dev/null +++ b/questdb/install @@ -0,0 +1,26 @@ +#!/bin/bash +set -eu + +qdb_version="${QDB_VERSION:-9.3.1}" + +if [ -d questdb/bin ]; then + exit 0 +fi + +if [[ $(arch) == "aarch64" ]] || [[ $(arch) == arm* ]]; then + # ARM uses no-JRE binary, so we install GraalVM JDK alongside. + wget --continue --progress=dot:giga \ + https://github.com/graalvm/graalvm-ce-builds/releases/download/jdk-17.0.9/graalvm-community-jdk-17.0.9_linux-aarch64_bin.tar.gz + tar xf graalvm-community-*.tar.gz --one-top-level=graalvm --strip-components 1 + export JAVA_HOME=$PWD/graalvm + + wget --continue --progress=dot:giga \ + "https://github.com/questdb/questdb/releases/download/${qdb_version}/questdb-${qdb_version}-no-jre-bin.tar.gz" + tar xf questdb*.tar.gz --one-top-level=questdb --strip-components 1 + mkdir questdb/bin + mv questdb/* questdb/bin +else + wget --continue --progress=dot:giga \ + "https://github.com/questdb/questdb/releases/download/${qdb_version}/questdb-${qdb_version}-rt-linux-x86-64.tar.gz" + tar xf questdb*.tar.gz --one-top-level=questdb --strip-components 1 +fi diff --git a/questdb/load b/questdb/load new file mode 100755 index 0000000000..aba840e141 --- /dev/null +++ b/questdb/load @@ -0,0 +1,26 @@ +#!/bin/bash +set -eu + +curl -sS -G --data-urlencode "query=$(cat create.sql)" 'http://localhost:9000/exec' >/dev/null + +if [[ "$(nproc)" -ge 96 ]]; then + # SQL COPY works best on metal instances. 
+ curl -sS -G --data-urlencode "query=copy hits from 'hits.csv' with timestamp 'EventTime' format 'yyyy-MM-dd HH:mm:ss';" \ + 'http://localhost:9000/exec' >/dev/null + + until [ "$(curl -sS -G --data-urlencode "query=select * from sys.text_import_log where phase is null and status='finished';" \ + 'http://localhost:9000/exec' | grep -c '"count":1')" -ge 1 ]; do + sleep 5 + done +else + # Smaller instances: HTTP /imp endpoint. + curl -sS -F data=@hits.csv 'http://localhost:9000/imp?name=hits&maxUncommittedRows=5000000' >/dev/null + + until [ "$(curl -sS -G --data-urlencode "query=select 1 from (select count() c from hits) where c = 99997497;" \ + 'http://localhost:9000/exec' | grep -c '"count":1')" -ge 1 ]; do + sleep 5 + done +fi + +rm -f hits.csv +sync diff --git a/questdb/query b/questdb/query new file mode 100755 index 0000000000..1081c2d04b --- /dev/null +++ b/questdb/query @@ -0,0 +1,29 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via QuestDB HTTP /exec?timings=true. +# Stdout: query result (JSON). +# Stderr: query runtime in fractional seconds on the last line (parsed from +# the "timings.execute" field, in nanoseconds). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(curl -sS --max-time 600 -G --data-urlencode "query=${query}" \ + 'http://localhost:9000/exec?timings=true' 2>&1) + +if printf '%s\n' "$raw" | grep -q '"error"'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" + +# Parse "execute": from the timings JSON object. +ns=$(printf '%s\n' "$raw" | grep -oP '"execute":\s*\K[0-9]+' | tail -n1) + +if [ -z "$ns" ]; then + echo "no timings.execute in questdb response" >&2 + exit 1 +fi + +awk -v n="$ns" 'BEGIN { printf "%.3f\n", n / 1000000000 }' >&2 diff --git a/questdb/results/20260509/c6a.4xlarge.json b/questdb/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..d72489ba1f --- /dev/null +++ b/questdb/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "QuestDB", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","time-series","lukewarm-cold-run"], + "load_time": 5371, + "data_size": 72881747968, + "result": [ + [0.018, 0.001, 0.001], + [0.338, 0.008, 0.008], + [0.208, 0.013, 0.013], + [2.141, 0.027, 0.022], + [1.294, 0.633, 0.642], + [8.393, 0.36, 0.354], + [0.146, 0.004, 0.002], + [0.408, 0.082, 0.074], + [4.019, 0.967, 0.96], + [1.873, 1.292, 1.323], + [0.815, 0.197, 0.169], + [0.762, 0.171, 0.167], + [8.515, 0.392, 0.399], + [2.675, 0.6, 0.572], + [1.233, 0.47, 0.444], + [2.584, 0.75, 0.745], + [9.069, 1.799, 1.784], + [2.557, 1.729, 1.692], + [3.618, 2.276, 2.287], + [0.145, 0.046, 0.041], + [39.971, 0.487, 0.471], + [3.084, 0.429, 0.42], + [48.21, 0.381, 0.381], + [1.845, 0.212, 0.039], + [0.157, 0.005, 0.01], + [7.807, 0.11, 0.117], + [0.115, 0.005, 0.004], + [null, null, null], + [null, null, null], + [0.361, 0.01, 0.009], + [8.196, 0.348, 0.372], + [2.491, 0.417, 0.403], + [5.202, 3.127, 3.058], + [40.694, 2.123, 2.148], + [3.116, 2.133, 2.201], + [1.654, 0.73, 0.697], + [4.817, 0.12, 0.069], + [4.686, 0.144, 0.059], + [3.131, 0.109, 0.055], + [4.504, 0.19, 0.129], + [2.957, 0.103, 0.055], + [2.029, 0.1, 0.075], + [0.214, 0.027, 0.02] +] +} + diff --git a/questdb/results/20260509/c6a.metal.json b/questdb/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..a73f104c4b --- /dev/null +++ b/questdb/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "QuestDB", + "date": 
"2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","time-series","lukewarm-cold-run"], + "load_time": 356, + "data_size": 72886999444, + "result": [ + [0.061, 0.002, 0.001], + [0.682, 0.041, 0.019], + [0.881, 0.009, 0.009], + [2.458, 0.013, 0.011], + [3.491, 0.408, 0.142], + [8.996, 0.32, 0.148], + [0.38, 0.028, 0.007], + [0.418, 0.021, 0.017], + [4.806, 0.402, 0.396], + [6.319, 0.437, 0.374], + [4.074, 0.547, 0.164], + [4.83, 0.548, 0.154], + [10.754, 0.155, 0.126], + [13.237, 0.246, 0.385], + [11.293, 0.171, 0.169], + [6.494, 0.141, 0.134], + [13.311, 0.276, 0.315], + [13.314, 0.281, 0.232], + [16.849, 0.387, 0.315], + [2.434, 0.015, 0.018], + [40.702, 0.088, 0.079], + [45.078, 0.073, 0.075], + [null, 0.662, 0.09], + [8.209, 0.25, 0.048], + [1.023, 0.003, 0.003], + [8.662, 0.296, 0.032], + [2.033, 0.004, 0.004], + [null, null, null], + [null, null, null], + [0.533, 0.043, 0.01], + [10.162, 0.127, 0.136], + [13.023, 0.147, 0.109], + [8.028, 0.508, 0.437], + [42.368, 0.634, 0.366], + [42.489, 1.139, 0.42], + [7.614, 0.351, 0.817], + [5.319, 0.34, 0.126], + [7.427, 0.304, 0.089], + [5.642, 0.185, 0.095], + [10.17, 0.539, 0.16], + [3.87, 0.213, 0.101], + [3.043, 0.397, 0.127], + [0.396, 0.115, 0.009] +] +} + diff --git a/questdb/run.sh b/questdb/run.sh deleted file mode 100755 index 0159343fd4..0000000000 --- a/questdb/run.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -TRIES=3 - -questdb/bin/questdb.sh stop -questdb/bin/questdb.sh start -sleep 5 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches - - echo "$query"; - for i in $(seq 1 $TRIES); do - curl -sS --max-time 600 -G --data-urlencode "query=${query}" 'http://localhost:9000/exec?timings=true' 2>&1 | grep '"timings"' - echo - done; -done; - -questdb/bin/questdb.sh stop diff --git a/questdb/start b/questdb/start new file mode 100755 index 0000000000..5958a3005e --- /dev/null +++ b/questdb/start @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +# Idempotent: if HTTP API on :9000 already responds, do nothing. +if curl -sSf -G --data-urlencode 'query=SELECT 1' 'http://localhost:9000/exec' >/dev/null 2>&1; then + exit 0 +fi + +questdb/bin/questdb.sh start + +# Wait for HTTP port. +for _ in $(seq 1 60); do + if nc -z localhost 9000 2>/dev/null; then break; fi + sleep 1 +done + +# Tweak config (idempotent — sed -i with same value is safe). +mkdir -p ~/.questdb/conf +if [ -f ~/.questdb/conf/server.conf ]; then + sed -i 's/query.timeout.sec=60/query.timeout.sec=500/' ~/.questdb/conf/server.conf + sed -i "s|cairo.sql.copy.root=import|cairo.sql.copy.root=$PWD|" ~/.questdb/conf/server.conf + questdb/bin/questdb.sh stop + sleep 2 + questdb/bin/questdb.sh start +fi diff --git a/questdb/stop b/questdb/stop new file mode 100755 index 0000000000..6f914fe753 --- /dev/null +++ b/questdb/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +questdb/bin/questdb.sh stop 2>/dev/null || true diff --git a/quickwit/benchmark.sh b/quickwit/benchmark.sh index 8d438a2652..a6c6639bc8 100755 --- a/quickwit/benchmark.sh +++ b/quickwit/benchmark.sh @@ -1,90 +1,8 @@ #!/bin/bash -set -eo pipefail - -export DEBIAN_FRONTEND=noninteractive - -# Install prerequisites quietly -sudo apt-get update -qq >/dev/null -sudo apt-get install -y -qq wget curl jq bc docker.io >/dev/null -sudo systemctl start docker - -# We use the Quickwit v0.9 release candidate. 
Stable v0.8.2 is missing -# `cardinality`, `wildcard`, and several other features the benchmark relies -# on; only the v0.9 line (still unreleased as of writing) provides them. -QW_IMAGE="quickwit/quickwit:v0.9.0-rc" -sudo docker pull -q "$QW_IMAGE" >/dev/null - -# Quickwit's data directory (shared between the server and the local-ingest -# container). -QW_DATA="$(pwd)/qwdata" -sudo rm -rf "$QW_DATA" -mkdir -p "$QW_DATA" - -# Start the server in the background. Quickwit defaults: REST on 7280, gRPC on 7281. -# Mount node-config.yaml on top of the image's default config to bump the -# searcher timeouts (defaults are 30s, which is too low for some of the -# nested high-cardinality aggregations on the full 100M-row dataset). -sudo docker run -d --name qw --network host \ - -v "$QW_DATA":/quickwit/qwdata \ - -v "$(pwd)/node-config.yaml":/quickwit/config/quickwit.yaml \ - "$QW_IMAGE" run >/dev/null -echo "Quickwit container started" - -# Wait for the server to come up. -for i in $(seq 1 60); do - if curl -sS -f http://localhost:7280/api/v1/version >/dev/null 2>&1; then - echo "Quickwit is ready" - break - fi - sleep 1 -done - -# Create the index from the YAML config. -curl -sS -X POST http://localhost:7280/api/v1/indexes \ - -H 'Content-Type: application/yaml' \ - --data-binary @index_config.yaml | jq -r '.index_uid // .message' - -# Download the data quietly (the dataset is ~14 GB; full progress would -# dominate the captured benchmark log). -wget --continue -q 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' - -START=$(date +%s) - -# Use `quickwit tool local-ingest` instead of the Elasticsearch-compatible -# bulk endpoint. v0.9's sharded ingest-v2 API caps single-node throughput -# to a few MB/s and gets stuck waiting for shards to scale, while -# `local-ingest` builds splits directly and writes them to the index -# storage. The running server picks up new splits on its next metastore -# poll (default 30s). -# -# local-ingest emits a "Num docs ... Thrghput ... Time" progress line -# roughly once per second; we throttle that to once per ~30 seconds so -# the captured log stays compact, and pass the surrounding lines through -# unchanged. -zcat hits.json.gz | sudo docker run --rm -i --network host \ - -v "$QW_DATA":/quickwit/qwdata \ - "$QW_IMAGE" tool local-ingest --index hits -y 2>&1 \ - | awk '/Num docs/ { n = systime(); if (n - last >= 30) { print; fflush(); last = n } next } - { print; fflush() }' - -# Wait long enough for the server to refresh its metastore view. -sleep 35 - -# Show stats. -curl -sS "http://localhost:7280/api/v1/indexes/hits/describe" \ - | jq '{num_published_docs, num_published_splits, size_published_splits}' \ - | tee stats.json - -END=$(date +%s) -echo "Load time: $((END - START))" - -# Data size on disk. -echo -n "Data size: " -sudo du -sb "$QW_DATA" | awk '{print $1}' - -# Run queries -chmod +x run.sh -./run.sh - -sudo docker stop qw 2>/dev/null || true -sudo docker rm qw 2>/dev/null || true +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Quickwit takes Elasticsearch-format JSON queries; the load script fetches +# hits.json.gz directly so no shared download-hits-* script applies. 
+export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +export BENCH_QUERIES_FILE="queries.json" +exec ../lib/benchmark-common.sh diff --git a/quickwit/check b/quickwit/check new file mode 100755 index 0000000000..82598ad0e5 --- /dev/null +++ b/quickwit/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS -f http://localhost:7280/api/v1/version >/dev/null diff --git a/quickwit/data-size b/quickwit/data-size new file mode 100755 index 0000000000..bbbdd965b5 --- /dev/null +++ b/quickwit/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo du -sb qwdata | awk '{print $1}' diff --git a/quickwit/install b/quickwit/install new file mode 100755 index 0000000000..cfd392a31e --- /dev/null +++ b/quickwit/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +QW_IMAGE="quickwit/quickwit:v0.9.0-rc" + +if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io +fi +sudo apt-get install -y wget curl jq bc + +sudo systemctl start docker 2>/dev/null || true +sudo docker pull "$QW_IMAGE" + +mkdir -p qwdata diff --git a/quickwit/load b/quickwit/load new file mode 100755 index 0000000000..f1b27d11c4 --- /dev/null +++ b/quickwit/load @@ -0,0 +1,35 @@ +#!/bin/bash +set -e + +QW_IMAGE="quickwit/quickwit:v0.9.0-rc" + +# Create the index from the YAML config. +curl -sS -X POST http://localhost:7280/api/v1/indexes \ + -H 'Content-Type: application/yaml' \ + --data-binary @index_config.yaml | jq -r '.index_uid // .message' + +# No download-hits-json shared script; fetch directly. ~14 GB compressed. +wget --continue -q 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' + +# Use `quickwit tool local-ingest` instead of the Elasticsearch-compatible +# bulk endpoint. v0.9's sharded ingest-v2 API caps single-node throughput +# to a few MB/s; local-ingest builds splits directly and writes them to +# the index storage. The running server picks up new splits on its next +# metastore poll (default 30s). +# +# Throttle the per-second "Num docs ... Thrghput ... Time" progress lines +# to once per ~30s so the captured log stays compact. +zcat hits.json.gz | sudo docker run --rm -i --network host \ + -v "$PWD/qwdata":/quickwit/qwdata \ + "$QW_IMAGE" tool local-ingest --index hits -y 2>&1 \ + | awk '/Num docs/ { n = systime(); if (n - last >= 30) { print; fflush(); last = n } next } + { print; fflush() }' + +# Wait long enough for the server to refresh its metastore view. +sleep 35 + +curl -sS "http://localhost:7280/api/v1/indexes/hits/describe" \ + | jq '{num_published_docs, num_published_splits, size_published_splits}' + +rm -f hits.json.gz +sync diff --git a/quickwit/query b/quickwit/query new file mode 100755 index 0000000000..0a7eb7d05e --- /dev/null +++ b/quickwit/query @@ -0,0 +1,30 @@ +#!/bin/bash +# Reads one query line from stdin (an Elasticsearch-format JSON object, or +# the literal "null" for queries not expressible in Quickwit). +# Stdout: raw JSON response from /_elastic/hits/_search. +# Stderr: query runtime in fractional seconds on the last line, parsed +# from .took (engine-internal latency, milliseconds). +# Exit non-zero on error or when the query is "null". 
+set -e + +query=$(cat) + +if [ "$query" = "null" ] || [ -z "$query" ]; then + echo "query not expressible in Quickwit" >&2 + exit 1 +fi + +resp=$(curl -sS -X POST \ + -H 'Content-Type: application/json' \ + -d "$query" \ + http://localhost:7280/api/v1/_elastic/hits/_search) + +took=$(printf '%s' "$resp" | jq -r 'if has("error") or has("status") then empty else (.took | tostring) end') +if [ -z "$took" ]; then + printf '%s\n' "$resp" >&2 + exit 1 +fi + +printf '%s\n' "$resp" + +awk -v ms="$took" 'BEGIN { printf "%.4f\n", ms / 1000 }' >&2 diff --git a/quickwit/results/20260509/c6a.4xlarge.json b/quickwit/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..4bcaac5ed3 --- /dev/null +++ b/quickwit/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Quickwit", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","search"], + "load_time": 2962, + "data_size": 46308733491, + "result": [ + [0.038, 0.003, 0.003], + [0.11, 0.053, 0.053], + [0.218, 0.145, 0.152], + [1.911, 0.079, 0.076], + [2.243, 0.542, 0.541], + [1.298, 0.579, 0.604], + [0.166, 0.16, 0.161], + [0.08, 0.055, 0.052], + [3.764, 1.253, 1.198], + [4.305, 1.042, 1.04], + [2.343, 0.118, 0.111], + [2.56, 0.184, 0.167], + [0.948, 0.177, 0.167], + [54.965, 53.304, 53.505], + [1.394, 0.388, 0.386], + [2.314, 0.541, 0.503], + [null, null, null], + [null, null, null], + [null, null, null], + [0.089, 0.007, 0.007], + [3.573, 1.882, 1.852], + [5.155, 1.914, 1.954], + [11.166, 3.288, 3.415], + [4.015, 1.5, 1.439], + [0.355, 0.087, 0.087], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [2.548, 0.83, 0.804], + [null, null, null], + [null, null, null], + [2.448, 0.284, 0.316], + [2.444, 0.331, 0.259], + [null, null, null], + [0.457, 0.028, 0.023], + [0.401, 0.022, 0.021], + [0.411, 0.054, 0.047], + [null, null, null], + [1.271, 0.849, 0.881], + [0.102, 0.036, 0.034], + [0.145, 0.032, 0.032] +] +} + diff --git a/quickwit/results/20260509/c6a.metal.json b/quickwit/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..1e94a4ad5b --- /dev/null +++ b/quickwit/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Quickwit", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","search"], + "load_time": 2458, + "data_size": 74442146552, + "result": [ + [0.047, 0.004, 0.003], + [0.146, 0.052, 0.032], + [0.281, 0.132, 0.134], + [1.94, 0.071, 0.072], + [2.252, 0.383, 0.384], + [1.208, 0.459, 0.475], + [0.214, 0.132, 0.134], + [0.14, 0.033, 0.032], + [3.586, 0.993, 1.017], + [4.108, 0.867, 0.879], + [2.353, 0.091, 0.091], + [2.568, 0.144, 0.144], + [0.964, 0.12, 0.118], + [53.313, 51.635, 50.494], + [1.37, 0.283, 0.289], + [2.209, 0.379, 0.401], + [690.145, null, null], + [null, null, null], + [null, null, null], + [0.082, 0.007, 0.007], + [3.092, 1.446, 1.468], + [4.782, 1.454, 1.473], + [10.744, 2.253, 2.293], + [3.848, 1.248, 1.22], + [0.391, 0.06, 0.06], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [2.293, 0.663, 0.658], + [672.157, 669.012, 663.457], + [null, null, null], + [2.44, 0.198, 0.196], + [2.445, 0.201, 0.212], + [null, null, null], + [0.494, 0.029, 0.027], + [0.498, 0.022, 0.021], + [0.563, 0.046, 0.049], + [null, null, null], + [1.255, 0.806, 0.803], + [0.248, 
0.035, 0.032], + [0.212, 0.035, 0.035] +] +} + diff --git a/quickwit/run.sh b/quickwit/run.sh deleted file mode 100755 index bfbf5f7c2c..0000000000 --- a/quickwit/run.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -TRIES=3 -SEARCH_URL="http://localhost:7280/api/v1/_elastic/hits/_search" - -while IFS= read -r QUERY; do - if [ "$QUERY" != "null" ]; then - # Restart Quickwit before each query to clear all in-process caches - # (fast_field_cache, split_footer_cache). Result-style caches - # (partial_request_cache, predicate_cache) are already disabled in - # node-config.yaml. Then drop the OS page cache. This makes the first - # run cold; runs 2 and 3 may benefit from caches re-warmed by run 1. - sudo docker restart qw >/dev/null - until curl -sS -f http://localhost:7280/api/v1/version >/dev/null 2>&1; do sleep 1; done - sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - fi - - echo -n "[" - - for i in $(seq 1 $TRIES); do - if [ "$QUERY" = "null" ]; then - # Query is not expressible in Quickwit (e.g. text-field sort, - # scripts, REGEXP_REPLACE). - echo -n "null" - else - START=$(date +%s.%N) - QW_RSP=$(curl -s -X POST "$SEARCH_URL" -H 'Content-Type: application/json' -d "$QUERY") - END=$(date +%s.%N) - - # Quickwit returns "took" in milliseconds (engine-internal latency). - QW_TIME=$(echo "$QW_RSP" | jq -r 'if has("error") or has("status") then "null" else (.took | tostring) end') - - if [ "$QW_TIME" = "null" ] || [ -z "$QW_TIME" ]; then - echo -n "null" - else - printf "%.4f" "$(echo "scale=4; $QW_TIME / 1000" | bc)" - fi - fi - - [ "$i" != "$TRIES" ] && echo -n ", " - done - - echo "]," -done < queries.json diff --git a/quickwit/start b/quickwit/start new file mode 100755 index 0000000000..a5d74b5190 --- /dev/null +++ b/quickwit/start @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +QW_IMAGE="quickwit/quickwit:v0.9.0-rc" + +if sudo docker ps --format '{{.Names}}' | grep -qx qw; then + exit 0 +fi +if sudo docker ps -a --format '{{.Names}}' | grep -qx qw; then + sudo docker start qw + exit 0 +fi + +# Quickwit defaults: REST on 7280, gRPC on 7281. node-config.yaml bumps +# searcher timeouts (defaults are 30s, too low for the high-cardinality +# aggregations on the full 100M-row dataset). +sudo docker run -d --name qw --network host \ + -v "$PWD/qwdata":/quickwit/qwdata \ + -v "$PWD/node-config.yaml":/quickwit/config/quickwit.yaml \ + "$QW_IMAGE" run diff --git a/quickwit/stop b/quickwit/stop new file mode 100755 index 0000000000..8ecd1d4bf5 --- /dev/null +++ b/quickwit/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +sudo docker stop qw 2>/dev/null || true +exit 0 diff --git a/run-benchmark.sh b/run-benchmark.sh index f706c8d5ac..48738551f6 100755 --- a/run-benchmark.sh +++ b/run-benchmark.sh @@ -8,7 +8,18 @@ branch="${4:-main}" arch=$(aws ec2 describe-instance-types --instance-types $machine --query 'InstanceTypes[0].ProcessorInfo.SupportedArchitectures' --output text) ami=$(aws ec2 describe-images --owners amazon --filters "Name=name,Values=ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04*" "Name=architecture,Values=${arch}" "Name=state,Values=available" --query 'sort_by(Images, &CreationDate) | [-1].[ImageId]' --output text) -sed "s^@system@^${system}^; s^@repo@^${repo}^; s^@branch@^${branch}^;" < cloud-init.sh.in > cloud-init.sh +# Global per-system benchmark timeout — substituted at render time. +# Default keeps the 10h cap that worked for the slowest OLTP systems. 
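# (Editorial usage sketch, not part of this hunk: 36000 s is the 10 h cap named
# above. Because the value is read from the environment with a default, a
# one-off run can shorten it without editing the script, e.g.:)
timeout=7200 ./run-benchmark.sh ...   # 2 h cap; positional arguments as usual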
+timeout="${timeout:-36000}" + +awk -v sys="$system" -v repo="$repo" -v branch="$branch" -v t="$timeout" ' +{ + gsub(/@system@/, sys) + gsub(/@repo@/, repo) + gsub(/@branch@/, branch) + gsub(/@timeout@/, t) + print +}' cloud-init.sh.in > cloud-init.sh AWS_PAGER='' aws ec2 run-instances --image-id $ami --instance-type $machine \ --block-device-mappings 'DeviceName=/dev/sda1,Ebs={DeleteOnTermination=true,VolumeSize=500,VolumeType=gp2}' \ diff --git a/sail-partitioned/benchmark.sh b/sail-partitioned/benchmark.sh index 3909a58f18..3b63e772a6 100755 --- a/sail-partitioned/benchmark.sh +++ b/sail-partitioned/benchmark.sh @@ -1,66 +1,5 @@ #!/bin/bash - -# https://github.com/rust-lang/rust/issues/97234#issuecomment-1133564556 -ulimit -n 65536 - -# Install - -export DEBIAN_FRONTEND=noninteractive - -# When you run Sail on Amazon Linux, you may encounter the following error: -# failed to get system time zone: No such file or directory (os error 2) -# The reason is that /etc/localtime is supposed to be a symlink when retrieving the system time zone, but on Amazon Linux it is a regular file. -# There is a GitHub issue for this problem, but it has not been resolved yet: https://github.com/amazonlinux/amazon-linux-2023/issues/526 -echo "Set Timezone" -export TZ=Etc/UTC -sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone - -echo "Install Rust" -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh -bash rust-init.sh -y -export HOME=${HOME:=~} -source ~/.cargo/env - -echo "Install Dependencies" -sudo apt-get update -y -sudo apt-get install -y software-properties-common -sudo add-apt-repository ppa:deadsnakes/ppa -y -sudo apt-get update -y -sudo apt-get install -y \ - gcc protobuf-compiler \ - libprotobuf-dev \ - pkg-config \ - libssl-dev \ - python3.11 \ - python3.11-dev \ - python3.11-venv \ - python3.11-distutils - -echo "Set Python alternatives" -sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \ - sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \ - curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 - -echo "Install Python packages" -python3 -m venv myenv -source myenv/bin/activate -pip install --upgrade setuptools wheel -pip install --no-cache-dir "pysail==0.5.2" -pip install "pyspark-client==4.1.1" \ - pandas \ - psutil - -# Load the data - -echo "Download benchmark target data, partitioned" -../download-hits-parquet-partitioned partitioned - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo "Data size: $(du -bcs partitioned/hits*.parquet | grep total)" -echo "Load time: 0" +# Thin shim — actual flow is in lib/benchmark-common.sh. 
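+# The common driver reads the BENCH_* overrides below and is expected to drive
+# the per-system hooks shipped next to this shim (install, load, start, check,
+# query, data-size, stop).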
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/sail-partitioned/check b/sail-partitioned/check new file mode 100755 index 0000000000..140fda4c10 --- /dev/null +++ b/sail-partitioned/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c "import pysail" >/dev/null diff --git a/sail-partitioned/data-size b/sail-partitioned/data-size new file mode 100755 index 0000000000..503090478c --- /dev/null +++ b/sail-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs partitioned/hits_*.parquet | awk '/total$/ { print $1 }' diff --git a/sail-partitioned/install b/sail-partitioned/install new file mode 100755 index 0000000000..3249b4e982 --- /dev/null +++ b/sail-partitioned/install @@ -0,0 +1,53 @@ +#!/bin/bash +set -e + +ulimit -n 65536 + +export DEBIAN_FRONTEND=noninteractive + +export TZ=Etc/UTC +sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime +echo $TZ | sudo tee /etc/timezone >/dev/null + +if ! command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh + bash rust-init.sh -y +fi +export HOME=${HOME:=~} +# shellcheck disable=SC1091 +source "$HOME/.cargo/env" + +sudo apt-get update -y +sudo apt-get install -y software-properties-common +sudo add-apt-repository ppa:deadsnakes/ppa -y +sudo apt-get update -y +sudo apt-get install -y \ + gcc protobuf-compiler \ + libprotobuf-dev \ + pkg-config \ + libssl-dev \ + python3.11 \ + python3.11-dev \ + python3.11-venv \ + python3.11-distutils + +sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 || true +sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 || true +if ! python3.11 -m pip --version >/dev/null 2>&1; then + # See sail/install — get-pip.py needs --ignore-installed to avoid + # tripping over Ubuntu 24.04's RECORD-less `packaging 24.0`. + curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 - --ignore-installed +fi + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate + +# See sail/install — Ubuntu 24.04's apt-installed `packaging` has no +# RECORD file, so `--upgrade` fails to uninstall it. `--ignore-installed` +# avoids the uninstall and just lays a fresh copy in the venv. +pip install --upgrade --ignore-installed setuptools wheel +pip install --no-cache-dir "pysail==0.5.2" +pip install "pyspark-client==4.1.1" pandas psutil diff --git a/sail-partitioned/load b/sail-partitioned/load new file mode 100755 index 0000000000..c110e43728 --- /dev/null +++ b/sail-partitioned/load @@ -0,0 +1,8 @@ +#!/bin/bash +# sail-partitioned reads partitioned/*.parquet via Spark. Move the +# downloaded files into the expected subdir. +set -e + +mkdir -p partitioned +mv hits_*.parquet partitioned/ 2>/dev/null || true +sync diff --git a/sail-partitioned/query b/sail-partitioned/query new file mode 100755 index 0000000000..ae1546f7ab --- /dev/null +++ b/sail-partitioned/query @@ -0,0 +1,53 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via pysail (Spark Connect server) +# against ./partitioned/*.parquet. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. 
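+# Example (illustrative invocation and timing):
+#   ./query <<< 'SELECT COUNT(*) FROM hits'
+# prints the result frame on stdout and e.g. "0.135" on stderr.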
+set -e + +ulimit -n 65536 + +# shellcheck disable=SC1091 +source myenv/bin/activate + +# Stage stdin into a temp file: `python3 - <<'PY'` already consumes stdin to +# read the program, so sys.stdin.read() inside the heredoc returns "". +query_file=$(mktemp) +trap 'rm -f "$query_file"' EXIT +cat > "$query_file" + +python3 - "$query_file" <<'PY' +import os +import re +import sys +import timeit + +os.environ["SAIL_PARQUET__BINARY_AS_STRING"] = "true" +os.environ["SAIL_PARQUET__REORDER_FILTERS"] = "true" +os.environ["SAIL_OPTIMIZER__ENABLE_JOIN_REORDER"] = "true" + +from pysail.spark import SparkConnectServer +from pyspark.sql import SparkSession + +with open(sys.argv[1]) as f: + query = f.read() +query = re.sub(r"""(REGEXP_REPLACE\(.*?,\s*('[^']*')\s*,\s*)('1')""", r"\1'$1'", query) + +server = SparkConnectServer() +server.start() +_, port = server.listening_address +spark = SparkSession.builder.remote(f"sc://localhost:{port}").getOrCreate() + +df = spark.read.parquet("partitioned") +df.createOrReplaceTempView("hits") + +start = timeit.default_timer() +res = spark.sql(query).toPandas() +end = timeit.default_timer() + +print(res) + +spark.stop() + +print(f"{end - start:.3f}", file=sys.stderr) +PY diff --git a/sail-partitioned/query.py b/sail-partitioned/query.py deleted file mode 100755 index 705550130e..0000000000 --- a/sail-partitioned/query.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 - -from pysail.spark import SparkConnectServer -from pyspark.sql import SparkSession -import pyspark.sql.functions as F - -import timeit -import psutil -import sys -import re - -query = sys.stdin.read() -# Replace \1 to $1 because spark recognizes only this pattern style (in query 28) -query = re.sub(r"""(REGEXP_REPLACE\(.*?,\s*('[^']*')\s*,\s*)('1')""", r"\1'$1'", query) -print(query) - -import os -os.environ["SAIL_PARQUET__BINARY_AS_STRING"] = "true" -os.environ["SAIL_PARQUET__REORDER_FILTERS"] = "true" -os.environ["SAIL_OPTIMIZER__ENABLE_JOIN_REORDER"] = "true" - -server = SparkConnectServer() -server.start() -_, port = server.listening_address - -spark = SparkSession.builder.remote(f"sc://localhost:{port}").getOrCreate() - -df = spark.read.parquet("partitioned") -df.createOrReplaceTempView("hits") - -for try_num in range(3): - try: - start = timeit.default_timer() - result = spark.sql(query) - res = result.toPandas() - end = timeit.default_timer() - if try_num == 0: - print(res) - print("Time: ", round(end - start, 3)) - except Exception as e: - print(e) - print("Failure!") - -spark.stop() diff --git a/sail-partitioned/results/20260509/c6a.4xlarge.json b/sail-partitioned/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..c0799f4f9a --- /dev/null +++ b/sail-partitioned/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Sail (Parquet, partitioned)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["column-oriented"], + "load_time": 21, + "data_size": 14737666736, + "result": [ + [0.236, 0.08, 0.08], + [0.354, 0.106, 0.102], + [0.444, 0.137, 0.139], + [0.939, 0.144, 0.14], + [1.192, 0.849, 0.837], + [1.586, 0.898, 0.902], + [0.274, 0.081, 0.083], + [0.333, 0.107, 0.107], + [1.284, 0.975, 0.954], + [1.631, 1.063, 1.081], + [0.82, 0.301, 0.303], + [0.911, 0.33, 0.329], + [1.379, 0.954, 0.957], + [2.826, 1.481, 1.335], + [1.446, 0.933, 0.927], + [1.248, 1.028, 0.99], + [3.017, 1.884, 1.903], + [2.993, 1.846, 1.855], + [5.706, 3.555, 3.691], + [0.446, 0.137, 0.137], + 
[9.946, 1.407, 1.389], + [11.605, 1.576, 1.581], + [22.296, 3.286, 3.329], + [55.626, 9.78, 9.708], + [3.043, 0.446, 0.45], + [1.084, 0.347, 0.346], + [3.04, 0.459, 0.459], + [9.995, 1.824, 1.816], + [8.767, 3.356, 3.331], + [0.935, 0.737, 0.733], + [2.627, 0.809, 0.801], + [6.308, 0.963, 0.982], + [5.086, 3.575, 3.65], + [11.239, 4.949, 4.913], + [11.21, 4.958, 4.935], + [1.341, 1.137, 1.07], + [0.413, 0.144, 0.146], + [0.381, 0.13, 0.128], + [0.423, 0.147, 0.146], + [0.501, 0.187, 0.192], + [0.35, 0.11, 0.107], + [0.348, 0.102, 0.101], + [0.356, 0.098, 0.098] +] +} + diff --git a/sail-partitioned/results/20260509/c6a.metal.json b/sail-partitioned/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..e3012d7f06 --- /dev/null +++ b/sail-partitioned/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Sail (Parquet, partitioned)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["column-oriented"], + "load_time": 65, + "data_size": 14737666736, + "result": [ + [0.247, 0.081, 0.082], + [0.301, 0.122, 0.122], + [0.337, 0.12, 0.117], + [0.721, 0.149, 0.157], + [0.903, 0.541, 0.564], + [1.258, 0.523, 0.521], + [0.218, 0.081, 0.083], + [0.342, 0.162, 0.149], + [1.048, 0.766, 0.807], + [1.368, 0.617, 0.614], + [0.726, 0.371, 0.362], + [0.781, 0.391, 0.38], + [1.11, 0.576, 0.575], + [2.635, 0.964, 0.953], + [1.223, 0.599, 0.612], + [0.941, 0.815, 0.821], + [2.469, 1.093, 1.055], + [2.469, 0.99, 0.997], + [4.653, 1.941, 2.025], + [0.374, 0.161, 0.162], + [9.68, 0.601, 0.615], + [11.403, 0.721, 0.732], + [21.896, 1.191, 1.081], + [55.825, 4.853, 4.763], + [2.833, 0.323, 0.342], + [0.996, 0.234, 0.237], + [2.793, 0.324, 0.331], + [9.768, 0.723, 0.752], + [8.425, 0.999, 1.026], + [0.493, 0.275, 0.289], + [2.505, 0.678, 0.661], + [6.209, 1.078, 1.064], + [5.285, 3.706, 3.781], + [10.334, 2.074, 2.026], + [10.249, 1.969, 1.998], + [0.979, 0.892, 0.889], + [0.396, 0.178, 0.186], + [0.383, 0.168, 0.167], + [0.475, 0.177, 0.182], + [0.547, 0.225, 0.234], + [0.375, 0.151, 0.148], + [0.346, 0.138, 0.136], + [0.366, 0.136, 0.151] +] +} + diff --git a/sail-partitioned/run.sh b/sail-partitioned/run.sh deleted file mode 100755 index 64df8c6082..0000000000 --- a/sail-partitioned/run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/sail-partitioned/start b/sail-partitioned/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/sail-partitioned/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/sail-partitioned/stop b/sail-partitioned/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/sail-partitioned/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/sail/benchmark.sh b/sail/benchmark.sh index aef392d60d..fc4bacc8f3 100755 --- a/sail/benchmark.sh +++ b/sail/benchmark.sh @@ -1,66 +1,5 @@ #!/bin/bash - -# https://github.com/rust-lang/rust/issues/97234#issuecomment-1133564556 -ulimit -n 65536 - -# Install - -export DEBIAN_FRONTEND=noninteractive - -# When you run Sail on Amazon Linux, you may encounter the following error: -# failed to get system time zone: No such file or directory (os error 2) -# The reason is that /etc/localtime is supposed to be a symlink when retrieving the system time zone, but on Amazon Linux it is a regular file. 
-# There is a GitHub issue for this problem, but it has not been resolved yet: https://github.com/amazonlinux/amazon-linux-2023/issues/526 -echo "Set Timezone" -export TZ=Etc/UTC -sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone - -echo "Install Rust" -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh -bash rust-init.sh -y -export HOME=${HOME:=~} -source ~/.cargo/env - -echo "Install Dependencies" -sudo apt-get update -y -sudo apt-get install -y software-properties-common -sudo add-apt-repository ppa:deadsnakes/ppa -y -sudo apt-get update -y -sudo apt-get install -y \ - gcc protobuf-compiler \ - libprotobuf-dev \ - pkg-config \ - libssl-dev \ - python3.11 \ - python3.11-dev \ - python3.11-venv \ - python3.11-distutils - -echo "Set Python alternatives" -sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \ - sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \ - curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 - -echo "Install Python packages" -python3 -m venv myenv -source myenv/bin/activate -pip install --upgrade setuptools wheel -pip install --no-cache-dir "pysail==0.5.2" -pip install "pyspark-client==4.1.1" \ - pandas \ - psutil - -# Load the data - -echo "Download benchmark target data, single file" -../download-hits-parquet-single - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo "Data size: $(du -b hits.parquet)" -echo "Load time: 0" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/sail/check b/sail/check new file mode 100755 index 0000000000..140fda4c10 --- /dev/null +++ b/sail/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c "import pysail" >/dev/null diff --git a/sail/data-size b/sail/data-size new file mode 100755 index 0000000000..708c0b72e7 --- /dev/null +++ b/sail/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < hits.parquet diff --git a/sail/install b/sail/install new file mode 100755 index 0000000000..4aa2a2411a --- /dev/null +++ b/sail/install @@ -0,0 +1,59 @@ +#!/bin/bash +set -e + +ulimit -n 65536 + +export DEBIAN_FRONTEND=noninteractive + +# Sail needs a real /etc/localtime symlink; make sure it exists. +export TZ=Etc/UTC +sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime +echo $TZ | sudo tee /etc/timezone >/dev/null + +# Rust toolchain (used by pysail's native build, if needed). +if ! command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh + bash rust-init.sh -y +fi +export HOME=${HOME:=~} +# shellcheck disable=SC1091 +source "$HOME/.cargo/env" + +sudo apt-get update -y +sudo apt-get install -y software-properties-common +sudo add-apt-repository ppa:deadsnakes/ppa -y +sudo apt-get update -y +sudo apt-get install -y \ + gcc protobuf-compiler \ + libprotobuf-dev \ + pkg-config \ + libssl-dev \ + python3.11 \ + python3.11-dev \ + python3.11-venv \ + python3.11-distutils + +sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 || true +sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 || true +if ! 
python3.11 -m pip --version >/dev/null 2>&1; then + # get-pip.py installs pip + setuptools + wheel; wheel pulls in + # packaging>=24.0, but Ubuntu 24.04's apt-shipped `packaging 24.0` + # has no RECORD file so the uninstall step blows up. `--ignore- + # installed` skips the uninstall and lays the new copy on top. + curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 - --ignore-installed +fi + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate + +# Ubuntu 24.04's apt-installed `packaging` lacks a RECORD file, so +# `pip install --upgrade setuptools wheel` blows up trying to uninstall +# it ("Cannot uninstall packaging 24.0 ... no RECORD file was found"). +# `--ignore-installed` skips the uninstall and just lays the new copy on +# top, which is what we want inside this venv anyway. +pip install --upgrade --ignore-installed setuptools wheel +pip install --no-cache-dir "pysail==0.5.2" +pip install "pyspark-client==4.1.1" pandas psutil diff --git a/sail/load b/sail/load new file mode 100755 index 0000000000..0618f93f66 --- /dev/null +++ b/sail/load @@ -0,0 +1,5 @@ +#!/bin/bash +# sail reads hits.parquet directly via Spark's parquet reader. No persistent +# DB to load. +set -e +sync diff --git a/sail/query b/sail/query new file mode 100755 index 0000000000..51e3704097 --- /dev/null +++ b/sail/query @@ -0,0 +1,54 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via pysail (Spark Connect server) +# against hits.parquet. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +set -e + +ulimit -n 65536 + +# shellcheck disable=SC1091 +source myenv/bin/activate + +# Stage stdin into a temp file: `python3 - <<'PY'` already consumes stdin to +# read the program, so sys.stdin.read() inside the heredoc returns "". +query_file=$(mktemp) +trap 'rm -f "$query_file"' EXIT +cat > "$query_file" + +python3 - "$query_file" <<'PY' +import os +import re +import sys +import timeit + +os.environ["SAIL_PARQUET__BINARY_AS_STRING"] = "true" +os.environ["SAIL_PARQUET__REORDER_FILTERS"] = "true" +os.environ["SAIL_OPTIMIZER__ENABLE_JOIN_REORDER"] = "true" + +from pysail.spark import SparkConnectServer +from pyspark.sql import SparkSession + +with open(sys.argv[1]) as f: + query = f.read() +# Spark expects $1 instead of \1 in REGEXP_REPLACE. 
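+# (Spark's regexp_replace follows Java replacement syntax, so capture groups
+# are referenced as $1, $2, ... rather than \1.)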
+query = re.sub(r"""(REGEXP_REPLACE\(.*?,\s*('[^']*')\s*,\s*)('1')""", r"\1'$1'", query) + +server = SparkConnectServer() +server.start() +_, port = server.listening_address +spark = SparkSession.builder.remote(f"sc://localhost:{port}").getOrCreate() + +df = spark.read.parquet("hits.parquet") +df.createOrReplaceTempView("hits") + +start = timeit.default_timer() +res = spark.sql(query).toPandas() +end = timeit.default_timer() + +print(res) + +spark.stop() + +print(f"{end - start:.3f}", file=sys.stderr) +PY diff --git a/sail/query.py b/sail/query.py deleted file mode 100755 index 78ce8cf001..0000000000 --- a/sail/query.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 - -from pysail.spark import SparkConnectServer -from pyspark.sql import SparkSession -import pyspark.sql.functions as F - -import timeit -import psutil -import sys -import re - -query = sys.stdin.read() -# Replace \1 to $1 because spark recognizes only this pattern style (in query 28) -query = re.sub(r"""(REGEXP_REPLACE\(.*?,\s*('[^']*')\s*,\s*)('1')""", r"\1'$1'", query) -print(query) - -import os -os.environ["SAIL_PARQUET__BINARY_AS_STRING"] = "true" -os.environ["SAIL_PARQUET__REORDER_FILTERS"] = "true" -os.environ["SAIL_OPTIMIZER__ENABLE_JOIN_REORDER"] = "true" - -server = SparkConnectServer() -server.start() -_, port = server.listening_address - -spark = SparkSession.builder.remote(f"sc://localhost:{port}").getOrCreate() - -df = spark.read.parquet("hits.parquet") -df.createOrReplaceTempView("hits") - -for try_num in range(3): - try: - start = timeit.default_timer() - result = spark.sql(query) - res = result.toPandas() - end = timeit.default_timer() - if try_num == 0: - print(res) - print("Time: ", round(end - start, 3)) - except Exception as e: - print(e) - print("Failure!") - -spark.stop() diff --git a/sail/results/20260509/c6a.4xlarge.json b/sail/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..bd5bed6b4e --- /dev/null +++ b/sail/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Sail (Parquet)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["column-oriented"], + "load_time": 7, + "data_size": 14779976446, + "result": [ + [0.209, 0.013, 0.014], + [0.291, 0.049, 0.049], + [0.345, 0.071, 0.074], + [0.549, 0.077, 0.078], + [0.97, 0.772, 0.794], + [1.22, 0.881, 0.885], + [0.221, 0.014, 0.014], + [0.316, 0.052, 0.054], + [1.219, 0.927, 0.894], + [1.593, 1.035, 1.01], + [0.767, 0.248, 0.257], + [0.736, 0.278, 0.269], + [1.299, 0.941, 0.947], + [2.835, 1.317, 1.338], + [1.385, 0.923, 0.916], + [1.199, 0.902, 0.898], + [3.068, 1.813, 1.838], + [3.061, 1.838, 1.836], + [5.644, 3.57, 3.528], + [0.365, 0.084, 0.085], + [9.739, 1.372, 1.378], + [11.449, 1.622, 1.586], + [22.403, 3.627, 3.627], + [56.023, 10.097, 10.11], + [2.775, 0.476, 0.468], + [0.902, 0.396, 0.401], + [2.773, 0.475, 0.465], + [9.79, 1.64, 1.685], + [8.853, 3.368, 3.318], + [0.935, 0.671, 0.657], + [2.411, 0.839, 0.824], + [5.914, 0.957, 0.962], + [4.727, 3.52, 3.58], + [10.964, 4.986, 4.947], + [10.911, 5.008, 4.973], + [1.296, 0.992, 1.003], + [0.518, 0.16, 0.15], + [0.419, 0.141, 0.129], + [0.517, 0.152, 0.151], + [0.689, 0.237, 0.234], + [0.378, 0.068, 0.066], + [0.355, 0.064, 0.064], + [0.355, 0.057, 0.058] +] +} + diff --git a/sail/results/20260509/c6a.metal.json b/sail/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..b549c4e91e --- /dev/null +++ b/sail/results/20260509/c6a.metal.json @@ -0,0 
+1,58 @@ +{ + "system": "Sail (Parquet)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["column-oriented"], + "load_time": 6, + "data_size": 14779976446, + "result": [ + [0.198, 0.014, 0.018], + [0.712, 0.502, 0.461], + [0.268, 0.069, 0.069], + [0.36, 0.083, 0.084], + [0.617, 0.443, 0.501], + [0.921, 0.456, 0.457], + [0.146, 0.014, 0.015], + [0.712, 0.463, 0.469], + [0.907, 0.745, 0.687], + [1.154, 0.532, 0.536], + [0.942, 0.672, 0.67], + [1.03, 0.727, 0.694], + [1.399, 0.834, 0.954], + [2.751, 1.213, 1.221], + [1.403, 0.872, 0.909], + [0.848, 0.65, 0.669], + [2.274, 1.038, 0.96], + [2.197, 0.952, 0.895], + [4.214, 1.812, 1.75], + [0.748, 0.544, 0.527], + [9.588, 0.561, 0.594], + [11.726, 1.033, 1.057], + [22.418, 1.529, 1.484], + [56.132, 5.124, 4.839], + [2.96, 0.558, 0.6], + [1.156, 0.5, 0.566], + [2.863, 0.622, 0.625], + [10.034, 1.004, 1.023], + [8.639, 1.304, 1.305], + [0.369, 0.212, 0.218], + [2.699, 0.999, 1.004], + [6.116, 1.281, 1.254], + [6.369, 3.835, 3.712], + [10.274, 2.001, 1.925], + [10.23, 1.931, 2.022], + [0.959, 0.789, 0.78], + [0.869, 0.511, 0.46], + [0.85, 0.446, 0.495], + [0.877, 0.455, 0.529], + [0.853, 0.619, 0.593], + [0.757, 0.418, 0.436], + [0.735, 0.416, 0.43], + [0.681, 0.451, 0.481] +] +} + diff --git a/sail/run.sh b/sail/run.sh deleted file mode 100755 index 64df8c6082..0000000000 --- a/sail/run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/sail/start b/sail/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/sail/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/sail/stop b/sail/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/sail/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/selectdb/README.md b/selectdb/README.md deleted file mode 100644 index a67af26ffe..0000000000 --- a/selectdb/README.md +++ /dev/null @@ -1 +0,0 @@ -[SelectDB](https://selectdb.com/), the enterprise-grade cloud-native distribution for [Apache Doris](https://github.com/apache/doris). \ No newline at end of file diff --git a/selectdb/benchmark.sh b/selectdb/benchmark.sh deleted file mode 100755 index e92f667aac..0000000000 --- a/selectdb/benchmark.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash -set -e - -# This benchmark should run on Ubuntu 20.04 - -# Install -ROOT=$(pwd) - -if [[ -n "$1" ]]; then - url="$1" -else - url='https://qa-build.oss-cn-beijing.aliyuncs.com/enterprise-doris-release-output/selectdb-doris-2.1.7-rc01-bin-x64.tar.gz' -fi -# Download -file_name="$(basename ${url})" -if [[ "$url" == "http"* ]]; then - if [[ ! -f $file_name ]]; then - wget --continue --progress=dot:giga ${url} - else - echo "$file_name already exists, no need to download." 
- fi -fi -dir_name="${file_name/.tar.gz/}" - -# Try to stop SelectDB and remove it first if execute this script multiple times -set +e -"$dir_name"/selectdb-doris-2.1.7-rc01-bin-x64/fe/bin/stop_fe.sh -"$dir_name"/selectdb-doris-2.1.7-rc01-bin-x64/be/bin/stop_be.sh -rm -rf "$dir_name" -set -e - -# Uncompress -mkdir "$dir_name" -tar zxf "$file_name" -C "$dir_name" -DORIS_HOME="$ROOT/$dir_name/selectdb-doris-2.1.7-rc01-bin-x64" -export DORIS_HOME - -# Install dependencies -sudo apt-get update -y -sudo apt-get install -y openjdk-17-jdk -sudo apt-get install -y mysql-client -export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" -export PATH=$JAVA_HOME/bin:$PATH - -set +e -sudo systemctl disable unattended-upgrades -sudo systemctl stop unattended-upgrades -sudo systemctl stop mysql-server -set -e - -"$DORIS_HOME"/fe/bin/start_fe.sh --daemon - -# Start Backend -sudo sysctl -w vm.max_map_count=2000000 -ulimit -n 65535 -"$DORIS_HOME"/be/bin/start_be.sh --daemon - -# Wait for Frontend ready -for _ in {1..300} -do - fe_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show frontends' | cut -f16 | sed -n '2,$p') - if [[ -n "${fe_version}" ]] && [[ "${fe_version}" != "NULL" ]]; then - echo "Frontend version: ${fe_version}" - break - else - echo 'Wait for Frontend ready ...' - sleep 2 - fi -done - -# Setup cluster, add Backend to cluster -mysql -h 127.0.0.1 -P9030 -uroot -e "ALTER SYSTEM ADD BACKEND '127.0.0.1:9050' " - -# Wait for Backend ready -for _ in {1..300} -do - be_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show backends' | cut -f22 | sed -n '2,$p') - if [[ -n "${be_version}" ]]; then - echo "Backend version: ${be_version}" - break - else - echo 'Wait for Backend ready ...' - sleep 2 - fi -done - -echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - -# Create Database and table -mysql -h 127.0.0.1 -P9030 -uroot -e "CREATE DATABASE hits" -sleep 5 -mysql -h 127.0.0.1 -P9030 -uroot hits <"$ROOT"/create.sql - -# Download data -if [[ ! -f hits.tsv.gz ]] && [[ ! -f hits.tsv ]]; then - ../download-hits-tsv -fi - -# Load data -echo "start loading hits.tsv, estimated to take about 9 minutes ..." 
-date -START=$(date +%s) -curl --location-trusted \ - -u root: \ - -T "hits.tsv" \ - -H "label:hits" \ - -H "columns: WatchID,JavaEnable,Title,GoodEvent,EventTime,EventDate,CounterID,ClientIP,RegionID,UserID,CounterClass,OS,UserAgent,URL,Referer,IsRefresh,RefererCategoryID,RefererRegionID,URLCategoryID,URLRegionID,ResolutionWidth,ResolutionHeight,ResolutionDepth,FlashMajor,FlashMinor,FlashMinor2,NetMajor,NetMinor,UserAgentMajor,UserAgentMinor,CookieEnable,JavascriptEnable,IsMobile,MobilePhone,MobilePhoneModel,Params,IPNetworkID,TraficSourceID,SearchEngineID,SearchPhrase,AdvEngineID,IsArtifical,WindowClientWidth,WindowClientHeight,ClientTimeZone,ClientEventTime,SilverlightVersion1,SilverlightVersion2,SilverlightVersion3,SilverlightVersion4,PageCharset,CodeVersion,IsLink,IsDownload,IsNotBounce,FUniqID,OriginalURL,HID,IsOldCounter,IsEvent,IsParameter,DontCountHits,WithHash,HitColor,LocalEventTime,Age,Sex,Income,Interests,Robotness,RemoteIP,WindowName,OpenerName,HistoryLength,BrowserLanguage,BrowserCountry,SocialNetwork,SocialAction,HTTPError,SendTiming,DNSTiming,ConnectTiming,ResponseStartTiming,ResponseEndTiming,FetchTiming,SocialSourceNetworkID,SocialSourcePage,ParamPrice,ParamOrderID,ParamCurrency,ParamCurrencyID,OpenstatServiceName,OpenstatCampaignID,OpenstatAdID,OpenstatSourceID,UTMSource,UTMMedium,UTMCampaign,UTMContent,UTMTerm,FromTag,HasGCLID,RefererHash,URLHash,CLID" \ - http://localhost:8030/api/hits/hits/_stream_load -END=$(date +%s) -LOADTIME=$(echo "$END - $START" | bc) -echo "Load time: $LOADTIME" -echo "$LOADTIME" > loadtime - -# Dataset contains 99997497 rows, storage size is about 17319588503 bytes -mysql -h 127.0.0.1 -P9030 -uroot hits -e "SELECT count(*) FROM hits" -du -bs "$DORIS_HOME"/be/storage/ | cut -f1 | tee storage_size - -echo "Data size: $(cat storage_size)" - -# Run queries -./run.sh 2>&1 | tee -a log.txt - -cat log.txt | - grep -P 'rows? 
in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' diff --git a/selectdb/results/20240919/c6a.metal.json b/selectdb/results/20240919/c6a.metal.json deleted file mode 100644 index 457d7e7777..0000000000 --- a/selectdb/results/20240919/c6a.metal.json +++ /dev/null @@ -1,58 +0,0 @@ -{ - "system": "SelectDB", - "date": "2024-09-19", - "machine": "c6a.metal", - "cluster_size": 1, - "proprietary": "no", - "hardware": "cpu", - "tuned": "no", - "comment": "", - "tags": ["C++", "column-oriented", "MySQL compatible", "ClickHouse derivative", "lukewarm-cold-run"], - "load_time": 459, - "data_size": 17365253189, - "result": [ -[0.09,0.03,0.02], -[0.13,0.02,0.03], -[1.12,0.04,0.04], -[1.70,0.04,0.04], -[1.68,0.17,0.16], -[1.40,0.21,0.20], -[0.05,0.02,0.02], -[0.18,0.03,0.03], -[2.68,0.22,0.21], -[3.57,0.25,0.25], -[2.34,0.08,0.08], -[3.10,0.08,0.08], -[2.22,0.23,0.20], -[3.76,0.29,0.26], -[2.17,0.25,0.23], -[1.48,0.12,0.12], -[3.86,0.31,0.27], -[2.82,0.36,0.09], -[4.89,0.48,0.49], -[0.04,0.01,0.02], -[11.32,0.24,0.11], -[13.35,0.12,0.06], -[25.55,0.17,0.09], -[7.56,0.06,0.07], -[2.64,0.09,0.07], -[2.26,0.05,0.06], -[3.00,0.10,0.07], -[11.58,0.24,0.22], -[9.47,0.97,0.93], -[0.50,0.05,0.03], -[5.03,0.14,0.12], -[6.98,0.17,0.15], -[4.97,0.91,0.83], -[11.65,1.06,1.01], -[11.62,1.03,0.99], -[0.89,0.18,0.16], -[1.74,0.04,0.03], -[1.97,0.03,0.03], -[2.13,0.03,0.02], -[2.25,0.08,0.08], -[1.42,0.03,0.03], -[1.82,0.02,0.03], -[1.43,0.02,0.03] - ] -} diff --git a/selectdb/results/20250710/c6a.2xlarge.json b/selectdb/results/20250710/c6a.2xlarge.json deleted file mode 100644 index bc331ac53d..0000000000 --- a/selectdb/results/20250710/c6a.2xlarge.json +++ /dev/null @@ -1,57 +0,0 @@ -{ - "system": "SelectDB", - "date": "2025-07-10", - "machine": "c6a.2xlarge", - "cluster_size": 1, - "proprietary": "no", - "hardware": "cpu", - "tuned": "no", - "tags": ["C++","column-oriented","MySQL compatible","ClickHouse derivative", "lukewarm-cold-run"], - "load_time": 716, - "data_size": 16402424021, - "result": [ - [2.16, 0.25, 0.24], - [5.12, 0.49, 0.38], - [5.18, 0.53, 0.47], - [6.91, 0.47, 0.52], - [7.06, 1.51, 1.58], - [6.38, 3.03, 2.26], - [2.18, 0.22, 0.26], - [5.13, 0.44, 0.48], - [8.99, 1.36, 1.39], - [11.61, 1.5, 1.47], - [10.13, 0.56, 0.57], - [10.87, 0.57, 0.54], - [8.5, 1.39, 1.44], - [13.08, 3.21, 2.4], - [9.9, 1.82, 1.83], - [6.93, 1.15, 1.25], - [10.61, 4.68, null], - [9.49, 1.18, 1.06], - [null, null, null], - [0.15, 0.01, 0.01], - [17.28, 16.23, 15.97], - [20.12, 19.12, 18.7], - [31.4, 30.41, 29.79], - [9.53, 0.75, 0.7], - [3.24, 0.39, 0.39], - [8.46, 0.78, 0.76], - [3.16, 0.4, 0.38], - [17.78, 16.73, 15.63], - [20.1, null, null], - [3.73, 0.46, 0.37], - [15.16, 1.17, 1.17], - [18.43, 3.18, 1.57], - [null, null, null], - [null, null, null], - [null, null, null], - [5.44, 1.27, 1.28], - [8.83, 0.68, 0.57], - [11.02, 0.72, 0.72], - [3.55, 0.45, 0.46], - [4.25, 0.68, 0.66], - [3.25, 0.39, 0.43], - [3.37, 0.42, 0.39], - [3.11, 0.43, 0.42] -] -} diff --git a/selectdb/results/20250710/c6a.4xlarge.json b/selectdb/results/20250710/c6a.4xlarge.json deleted file mode 100644 index ab103861de..0000000000 --- a/selectdb/results/20250710/c6a.4xlarge.json +++ /dev/null @@ -1,57 +0,0 @@ -{ - "system": "SelectDB", - "date": "2025-07-10", - "machine": "c6a.4xlarge", - "cluster_size": 1, - 
"proprietary": "no", - "hardware": "cpu", - "tuned": "no", - "tags": ["C++","column-oriented","MySQL compatible","ClickHouse derivative", "lukewarm-cold-run"], - "load_time": 487, - "data_size": 17103182575, - "result": [ - [0.1, 0.04, 0.04], - [1.36, 0.04, 0.04], - [2.06, 0.06, 0.07], - [2.33, 0.09, 0.1], - [2.29, 0.72, 0.68], - [2.19, 0.82, 0.81], - [0.07, 0.01, 0.01], - [1.39, 0.04, 0.04], - [4, 0.62, 0.63], - [5.76, 0.7, 0.71], - [3.71, 0.1, 0.11], - [4.3, 0.13, 0.12], - [2.98, 0.62, 0.63], - [5.22, 1.02, 0.94], - [4.05, 1.03, 1.03], - [2.21, 0.59, 0.49], - [4.67, 1.41, 1.46], - [4.46, 0.39, 0.39], - [6.28, 2.61, 2.58], - [0.05, 0.01, 0.01], - [11.96, 0.94, 0.91], - [14.51, 0.79, 0.74], - [26.81, 1.56, 1.55], - [6.1, 0.37, 0.16], - [1.85, 0.11, 0.13], - [3.26, 0.23, 0.21], - [1.74, 0.16, 0.21], - [12.19, 1.5, 1.51], - [11.1, 8.8, 8.76], - [1.31, 0.06, 0.07], - [7.01, 0.41, 0.39], - [9.01, 0.51, 0.52], - [6.53, 3.41, 3.42], - [14.29, 13.98, 12.62], - [14.27, null, 12.58], - [2.14, 0.63, 0.62], - [2.46, 0.08, 0.08], - [2.44, 0.05, 0.05], - [1.89, 0.04, 0.04], - [2.53, 0.26, 0.23], - [1.43, 0.03, 0.03], - [1.84, 0.03, 0.03], - [1.45, 0.03, 0.04] -] -} diff --git a/selectdb/results/20250830/c7a.metal-48xl.json b/selectdb/results/20250830/c7a.metal-48xl.json deleted file mode 100644 index 7e4a35b549..0000000000 --- a/selectdb/results/20250830/c7a.metal-48xl.json +++ /dev/null @@ -1,57 +0,0 @@ -{ - "system": "SelectDB", - "date": "2025-08-30", - "machine": "c7a.metal-48xl", - "cluster_size": 1, - "proprietary": "no", - "hardware": "cpu", - "tuned": "no", - "tags": ["C++","column-oriented","MySQL compatible","ClickHouse derivative", "lukewarm-cold-run"], - "load_time": 364, - "data_size": 17361427624, - "result": [ - [0.04, 0.02, 0.01], - [0.36, 0.02, 0.02], - [1.27, 0.02, 0.02], - [1.7, 0.03, 0.03], - [1.63, 0.09, 0.12], - [1.47, 0.15, 0.13], - [0.03, 0.02, 0.01], - [0.4, 0.03, 0.02], - [2.76, 0.19, 0.18], - [3.81, 0.21, 0.21], - [2.42, 0.07, 0.07], - [3.07, 0.07, 0.07], - [1.92, 0.14, 0.14], - [3.6, 0.2, 0.19], - [2.3, 0.18, 0.17], - [1.48, 0.1, 0.1], - [3.52, 0.23, 0.22], - [2.74, 0.12, 0.05], - [4.72, 0.36, 0.37], - [0.06, 0.01, 0.01], - [11.14, 0.09, 0.05], - [12.93, 0.06, 0.05], - [24.79, 0.26, 0.08], - [7.33, 0.05, 0.72], - [2.68, 0.22, 0.14], - [2.03, 0.05, 0.05], - [2.8, 0.31, 0.16], - [11.37, 0.25, 0.12], - [9.24, 0.69, 0.67], - [0.51, 0.03, 0.03], - [5.14, 0.09, 0.08], - [6.77, 0.11, 0.11], - [4.88, 0.69, 0.61], - [11.38, 0.69, 0.64], - [11.38, 0.67, 0.64], - [0.87, 0.14, 0.14], - [1.85, 0.02, 0.02], - [2.11, 0.02, 0.02], - [2.01, 0.03, 0.02], - [2.61, 0.06, 0.06], - [1.47, 0.02, 0.03], - [1.9, 0.03, 0.03], - [1.37, 0.03, 0.02] -] -} diff --git a/selectdb/run.sh b/selectdb/run.sh deleted file mode 100755 index 57408f9ddf..0000000000 --- a/selectdb/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -while read -r query; do - curl -sS http://127.0.0.1:8040/api/clear_cache/all - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - for i in $(seq 1 $TRIES); do - mysql -vvv -h127.1 -P9030 -uroot hits -e "${query}" - done - -done < queries.sql diff --git a/siglens/benchmark.sh b/siglens/benchmark.sh index eeff99bfa8..13a9c1c84d 100755 --- a/siglens/benchmark.sh +++ b/siglens/benchmark.sh @@ -1,26 +1,8 @@ #!/bin/bash - -# Requires at least 300GB of free disk space on the main partition for the dataset, intermediate files, and SigLens data. 
- -echo "Install prerequisites" -sudo apt-get install -y git golang - -echo "Get and build SigLens" -git clone https://github.com/siglens/siglens.git --branch 1.0.54 -cd siglens -go mod tidy -go build -o siglens cmd/siglens/main.go -./siglens &> siglens.out & -cd .. - -echo "Download and unzip dataset" -sudo apt-get install -y pigz -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' -pigz -d -f hits.json.gz - -echo "Load data into SigLens, this can take a few hours" -echo -n "Load time: " -command time -f '%e' python3 send_datawithactionline.py - -echo "Run queries" -./run.sh +# Thin shim — actual flow is in lib/benchmark-common.sh. +# siglens ingests its own gzipped NDJSON; ./load fetches it directly. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +# queries are SPL/Splunk QL, not SQL. +export BENCH_QUERIES_FILE="queries.spl" +exec ../lib/benchmark-common.sh diff --git a/siglens/check b/siglens/check new file mode 100755 index 0000000000..242bd86673 --- /dev/null +++ b/siglens/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# A search-API hit is sufficient — `/api/health` may not exist on this version. +curl -sSf -o /dev/null --max-time 5 'http://localhost:5122/' \ + || curl -sSf -o /dev/null --max-time 5 'http://localhost:8081/' diff --git a/siglens/data-size b/siglens/data-size new file mode 100755 index 0000000000..457b6d5b7f --- /dev/null +++ b/siglens/data-size @@ -0,0 +1,11 @@ +#!/bin/bash +set -eu + +# siglens stores ingested data under the data/ directory inside the repo. +if [ -d siglens/data ]; then + du -bcs siglens/data | grep total | awk '{print $1}' +elif [ -d siglens/ingestnodes ]; then + du -bcs siglens/ingestnodes siglens/querynodes 2>/dev/null | grep total | awk '{print $1}' +else + du -bcs siglens | grep total | awk '{print $1}' +fi diff --git a/siglens/install b/siglens/install new file mode 100755 index 0000000000..19ffd675a5 --- /dev/null +++ b/siglens/install @@ -0,0 +1,26 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y git golang pigz python3 python3-pip jq bc + +# go mod / go build require GOPATH (and writable GOMODCACHE). With cloud-init +# running as root and HOME unset, `go mod tidy` errors with +# `module cache not found: neither GOMODCACHE nor GOPATH is set`. Set both +# defensively even if cloud-init.sh.in already exports HOME. +export HOME=${HOME:-/root} +export GOPATH=${GOPATH:-$HOME/go} +mkdir -p "$GOPATH" + +if [ ! -d siglens ]; then + git clone https://github.com/siglens/siglens.git --branch 1.0.54 +fi + +cd siglens +go mod tidy +if [ ! -x ./siglens ]; then + go build -o siglens cmd/siglens/main.go +fi + +# load script uses requests. +pip3 install --quiet --break-system-packages requests || pip3 install --quiet requests diff --git a/siglens/load b/siglens/load new file mode 100755 index 0000000000..dde5982cf0 --- /dev/null +++ b/siglens/load @@ -0,0 +1,11 @@ +#!/bin/bash +set -eu + +wget --continue --progress=dot:giga \ + 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' +pigz -d -f hits.json.gz + +python3 send_datawithactionline.py + +rm -f hits.json +sync diff --git a/siglens/query b/siglens/query new file mode 100755 index 0000000000..1ca8b5a918 --- /dev/null +++ b/siglens/query @@ -0,0 +1,39 @@ +#!/bin/bash +# Reads a SigLens SPL/Splunk QL query from stdin, runs it via the search API. +# Stdout: query response (JSON). +# Stderr: query runtime in fractional seconds on the last line (wall-clock). +# Exit non-zero on error. 
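+# Example (hypothetical SPL): ./query <<< '* | stats count'
+# prints the JSON response on stdout and wall-clock seconds on stderr.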
+set -e + +querytxt=$(cat) + +# A "null" query in queries.spl means "not supported"; emit null timing. +if [ "$querytxt" = "null" ]; then + echo "{}" + echo "null" >&2 + exit 0 +fi + +JSON=$(jq -nc --arg q "$querytxt" '{ + state: "query", + searchText: $q, + startEpoch: "now-9000d", + endEpoch: "now", + indexName: "hits", + from: 0, + queryLanguage: "Splunk QL" +}') + +t1=$(date +%s.%N) +resp=$(curl -sS -k -X POST 'http://localhost:5122/api/search' \ + -H 'Content-Type: application/json' -d "$JSON") +t2=$(date +%s.%N) + +if [ "$(jq 'has("error")' <<<"$resp")" = "true" ]; then + printf '%s\n' "$resp" >&2 + exit 1 +fi + +printf '%s\n' "$resp" + +awk -v a="$t1" -v b="$t2" 'BEGIN { printf "%.3f\n", b - a }' >&2 diff --git a/siglens/results/20260509/c6a.4xlarge.json b/siglens/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..87add1f2a4 --- /dev/null +++ b/siglens/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "SigLens", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": ["Go","search","lukewarm-cold-run"], + "load_time": 6313, + "data_size": 28908878543, + "result": [ + [0.121, 0.079, 0.078], + [1.628, 0.206, 0.2], + [0.123, 0.076, 0.078], + [0.119, 0.084, 0.078], + [0.12, 0.075, 0.079], + [0.116, 0.079, 0.074], + [0.12, 0.077, 0.077], + [0.523, 0.97, 0.162], + [4.096, 3.604, 3.58], + [6.161, 4.926, 4.899], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [0.218, 0.164, 0.156], + [0.281, 0.194, 0.191], + [0.276, 0.205, 0.196], + [null, null, null], + [0.838, 1.162, 0.176], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [0.142, 0.115, 0.106], + [null, null, null], + [null, null, null], + [0.469, 0.35, 0.331], + [0.387, 0.27, 0.255], + [0.379, 0.279, 0.255], + [0.225, 0.163, 0.147], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null] +] +} + diff --git a/siglens/run.sh b/siglens/run.sh deleted file mode 100755 index bf378db728..0000000000 --- a/siglens/run.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -TRIES=3 - -QUERY_NUM=0 - -cat 'queries.spl' | while read -r QUERYTXT; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo -n "[" - - for i in $(seq 1 $TRIES); do - if [[ $QUERYTXT != "null" ]]; then - JSON="{ - \"state\": \"query\", - \"searchText\": \"$QUERYTXT\", - \"startEpoch\": \"now-9000d\", - \"endEpoch\": \"now\", - \"indexName\": \"hits\", - \"from\": 0, - \"queryLanguage\": \"Splunk QL\" - }" - - # start external timer - START=$(date +%s.%N) - - # Run Query directly through search API - SIG_RSP=$(curl -s -k -X POST "http://localhost:5122/api/search" -H 'Content-Type: application/json' -d"$JSON") - - # calculate timing outside of SigLens - END=$(date +%s.%N) - RES=$(echo "$END - $START" | bc -l | xargs printf "%.3f") - - # if SigLens returned an error, print null - [[ "$(jq 'has("error")' <<<$SIG_RSP)" == "true" ]] && echo -n "null" || echo -n "$RES" - [[ "$i" != $TRIES ]] && echo -n ", " - - if [[ "$(jq 'has("error")' <<<"$SIG_RSP")" == "true" ]]; then - echo -e "\n\nSigLens got error for query: $QUERYTXT" - echo $SIG_RSP - echo -e "\n" - FINAL_TIME="null" - else - FINAL_TIME="$RES" - fi - # output to result file - 
echo "${QUERY_NUM},${i},${FINAL_TIME}" >>result.csv - else - # Queries that are not supported write null for them - echo -n "null, " - echo "${QUERY_NUM},${i},null" >>result.csv - fi - done - - echo "]," - QUERY_NUM=$((QUERY_NUM + 1)) - -done diff --git a/siglens/start b/siglens/start new file mode 100755 index 0000000000..30ccbdfc56 --- /dev/null +++ b/siglens/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +# Idempotent: ports 5122 (search API) and 8081 (ingest API) are siglens'. +if curl -sSf 'http://localhost:5122/api/health' >/dev/null 2>&1 \ + || curl -sSf 'http://localhost:8081' >/dev/null 2>&1; then + exit 0 +fi + +cd siglens +nohup ./siglens > siglens.out 2>&1 & +disown diff --git a/siglens/stop b/siglens/stop new file mode 100755 index 0000000000..e2a18feaa4 --- /dev/null +++ b/siglens/stop @@ -0,0 +1,8 @@ +#!/bin/bash + +pkill -x siglens 2>/dev/null || true +for _ in $(seq 1 15); do + pgrep -x siglens >/dev/null 2>&1 || exit 0 + sleep 1 +done +pkill -9 -x siglens 2>/dev/null || true diff --git a/singlestore/benchmark.sh b/singlestore/benchmark.sh index 663da391a2..fe936c8e4f 100755 --- a/singlestore/benchmark.sh +++ b/singlestore/benchmark.sh @@ -21,7 +21,7 @@ sudo docker exec -i memsql-ciab memsql -p"${ROOT_PASSWORD}" # Load the data -../download-hits-tsv +../lib/download-hits-tsv sudo docker cp hits.tsv memsql-ciab:/ sudo docker exec -i memsql-ciab memsql -p"${ROOT_PASSWORD}" -e "CREATE DATABASE test" diff --git a/sirius/benchmark.sh b/sirius/benchmark.sh index d1d1304e7f..361da7f6e7 100755 --- a/sirius/benchmark.sh +++ b/sirius/benchmark.sh @@ -1,63 +1,9 @@ #!/bin/bash -# ClickBench benchmark for Sirius (GPU-accelerated DuckDB extension) -# -# Usage: ./benchmark.sh -# Prerequisites: NVIDIA GPU with CUDA driver, internet access - -source dependencies.sh - -# Verify pixi is available -if ! command -v pixi &> /dev/null; then - echo "Error: pixi not found. Check dependencies.sh output." - exit 1 -fi - -# --------------------------------------------------------------------------- -# 1. Build Sirius -# --------------------------------------------------------------------------- -rm -rf sirius -git clone --recurse-submodules https://github.com/sirius-db/sirius.git -cd sirius - -set -e - -pixi install -export LIBCUDF_ENV_PREFIX="$(pwd)/.pixi/envs/default" -pixi run make -j"$(nproc)" - -# Make the build artifacts available -eval "$(pixi shell-hook)" -export PATH="$(pwd)/build/release:$PATH" -cd .. - -set +e - -# --------------------------------------------------------------------------- -# 2. Load data -# --------------------------------------------------------------------------- -../download-hits-parquet-single - -echo -n "Load time: " -command time -f '%e' duckdb hits.db -f create.sql -f load.sql - -# --------------------------------------------------------------------------- -# 3. Run benchmark -# --------------------------------------------------------------------------- -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -wc -c hits.db - -# --------------------------------------------------------------------------- -# 4. 
Format results -# --------------------------------------------------------------------------- -cat log.txt | \ - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | \ - sed -r -e 's/^.(Killed|Segmentation).$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | \ - awk '{ - buf[i++] = $1 - if (i == 4) { - printf "[%s,%s,%s],\n", buf[1], buf[2], buf[3] - i = 0 - } - }' \ No newline at end of file +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +# sirius's server.py initializes CUDA / cuDF on startup which can take +# several minutes on a cold instance — 900 s wasn't enough on the +# c6a.4xlarge runs we've seen. Bump again. +export BENCH_CHECK_TIMEOUT=1800 +exec ../lib/benchmark-common.sh diff --git a/sirius/check b/sirius/check new file mode 100755 index 0000000000..0c4b301a2d --- /dev/null +++ b/sirius/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/sirius/data-size b/sirius/data-size new file mode 100755 index 0000000000..365ad4ecc8 --- /dev/null +++ b/sirius/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/sirius/dependencies.sh b/sirius/dependencies.sh deleted file mode 100755 index bf8c225131..0000000000 --- a/sirius/dependencies.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -#!/bin/bash -# Install system dependencies required to build Sirius and run ClickBench. - -echo "Installing system dependencies..." -sudo apt-get update -y -sudo apt-get install -y git wget curl build-essential time - -# Install Pixi package manager (used to manage CUDA/cuDF toolchain) -if ! command -v pixi &> /dev/null; then - echo "Pixi not found. Installing..." - curl -fsSL https://pixi.sh/install.sh | sudo PIXI_BIN_DIR=/usr/local/bin PIXI_NO_PATH_UPDATE=1 bash -fi - -echo "All dependencies installed." diff --git a/sirius/install b/sirius/install new file mode 100755 index 0000000000..7a0206f653 --- /dev/null +++ b/sirius/install @@ -0,0 +1,41 @@ +#!/bin/bash +# Install Sirius (GPU-accelerated DuckDB extension) and the Python deps for +# this benchmark wrapper. +set -e + +# 1. System deps + pixi (CUDA/cuDF toolchain manager). +sudo apt-get update -y +sudo apt-get install -y git wget curl build-essential time python3-pip python3-venv + +if ! command -v pixi &>/dev/null; then + curl -fsSL https://pixi.sh/install.sh | sudo PIXI_BIN_DIR=/usr/local/bin PIXI_NO_PATH_UPDATE=1 bash +fi + +# 2. Build Sirius. +if [ ! -d sirius ]; then + git clone --recurse-submodules https://github.com/sirius-db/sirius.git +fi + +( + cd sirius + pixi install + export LIBCUDF_ENV_PREFIX="$(pwd)/.pixi/envs/default" + pixi run make -j"$(nproc)" +) + +# 3. Python venv for the FastAPI wrapper. +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet fastapi uvicorn + +# 4. Pre-baked env file used by start to put the duckdb+sirius binary on PATH +# and pull in the pixi shell-hook variables. 
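+# (Both ./start and ./load source .sirius_env, so the Sirius-built duckdb
+# binary and the CUDA/cuDF libraries resolve the same way in every hook.)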
+cat > .sirius_env <<'EOF' +SIRIUS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd)/sirius" +eval "$(cd "$SIRIUS_DIR" && pixi shell-hook)" +export PATH="$SIRIUS_DIR/build/release:$PATH" +export LIBCUDF_ENV_PREFIX="$SIRIUS_DIR/.pixi/envs/default" +EOF diff --git a/sirius/load b/sirius/load new file mode 100755 index 0000000000..0fe7440546 --- /dev/null +++ b/sirius/load @@ -0,0 +1,29 @@ +#!/bin/bash +# Build hits.db from hits.parquet using the duckdb CLI (create.sql + load.sql), +# then ask the running server to initialise GPU buffers. +set -e + +if [ -f .sirius_env ]; then + # shellcheck disable=SC1091 + source .sirius_env +fi + +# Ingest into a local duckdb file. Done via CLI (not the server) because the +# server already holds the file open; we want this to be a one-shot job. +./stop || true +duckdb hits.db -f create.sql -f load.sql + +./start +# Wait for the freshly-restarted server to be healthy. +for _ in $(seq 1 60); do + if ./check >/dev/null 2>&1; then + break + fi + sleep 1 +done + +elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported, gpu_buffer_init): ${elapsed}s" + +rm -f hits.parquet +sync diff --git a/sirius/query b/sirius/query new file mode 100755 index 0000000000..7fbdef441b --- /dev/null +++ b/sirius/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running sirius server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/sirius/results/20251022/lambda-GH200.json b/sirius/results/20251022/lambda-GH200.json index d0ed251efb..7332bf7a00 100644 --- a/sirius/results/20251022/lambda-GH200.json +++ b/sirius/results/20251022/lambda-GH200.json @@ -9,7 +9,8 @@ "tags": [ "C++", "column-oriented", - "embedded" + "embedded", + "in-memory" ], "load_time": 26, "data_size": 26903326720, diff --git a/sirius/results/20251029/lambda-GH200.json b/sirius/results/20251029/lambda-GH200.json index 1e8573c289..ac16bf688a 100644 --- a/sirius/results/20251029/lambda-GH200.json +++ b/sirius/results/20251029/lambda-GH200.json @@ -9,7 +9,8 @@ "tags": [ "C++", "column-oriented", - "embedded" + "embedded", + "in-memory" ], "load_time": 26, "data_size": 26903326720, diff --git a/sirius/results/20251107/lambda-GH200.json b/sirius/results/20251107/lambda-GH200.json index df28974fe1..39a0fe4dff 100644 --- a/sirius/results/20251107/lambda-GH200.json +++ b/sirius/results/20251107/lambda-GH200.json @@ -9,7 +9,8 @@ "tags": [ "C++", "column-oriented", - "embedded" + "embedded", + "in-memory" ], "load_time": 26, "data_size": 26903326720, diff --git a/sirius/results/20251207/p5.4xlarge.json b/sirius/results/20251207/p5.4xlarge.json index 215f6bc6a8..9cb88f1057 100644 --- a/sirius/results/20251207/p5.4xlarge.json +++ b/sirius/results/20251207/p5.4xlarge.json @@ -9,7 +9,8 @@ "tags": [ "C++", "column-oriented", - "embedded" + "embedded", + "in-memory" ], "load_time": 237, "data_size": 26921938944, diff --git a/sirius/results/20260306/lambda-GH200.json b/sirius/results/20260306/lambda-GH200.json index 
5480972c1d..332cfa7163 100644 --- a/sirius/results/20260306/lambda-GH200.json +++ b/sirius/results/20260306/lambda-GH200.json @@ -9,7 +9,8 @@ "tags": [ "C++", "column-oriented", - "embedded" + "embedded", + "in-memory" ], "load_time": 22, "data_size": 26887598080, diff --git a/sirius/results/20260309/lambda-GH200.json b/sirius/results/20260309/lambda-GH200.json index 0a29f992f3..a3216ab96e 100644 --- a/sirius/results/20260309/lambda-GH200.json +++ b/sirius/results/20260309/lambda-GH200.json @@ -9,7 +9,8 @@ "tags": [ "C++", "column-oriented", - "embedded" + "embedded", + "in-memory" ], "load_time": 25, "data_size": 26887598080, diff --git a/sirius/results/20260309/p5.4xlarge.json b/sirius/results/20260309/p5.4xlarge.json index 7cafaf6560..8267bd8c76 100644 --- a/sirius/results/20260309/p5.4xlarge.json +++ b/sirius/results/20260309/p5.4xlarge.json @@ -9,7 +9,8 @@ "tags": [ "C++", "column-oriented", - "embedded" + "embedded", + "in-memory" ], "load_time": 237, "data_size": 26921938944, diff --git a/sirius/run.sh b/sirius/run.sh deleted file mode 100755 index ac5dd8243f..0000000000 --- a/sirius/run.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -TRIES=3 -GPU_CACHING_SIZE='80 GB' -GPU_PROCESSING_SIZE='40 GB' -CPU_PROCESSING_SIZE="100 GB" - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=(".timer on") - cli_params+=("-c") - cli_params+=("call gpu_buffer_init(\"${GPU_CACHING_SIZE}\", \"${GPU_PROCESSING_SIZE}\", pinned_memory_size = \"${CPU_PROCESSING_SIZE}\");") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("call gpu_processing(\"${query}\");") - done; - echo "${cli_params[@]}" - duckdb hits.db "${cli_params[@]}" -done; diff --git a/sirius/server.py b/sirius/server.py new file mode 100644 index 0000000000..d2cc4734c2 --- /dev/null +++ b/sirius/server.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +"""FastAPI wrapper around Sirius (GPU-accelerated DuckDB extension) so it +conforms to the ClickBench install/start/check/stop/load/query interface. + +Sirius is a DuckDB extension built from source; queries run on the GPU via +``call gpu_processing("");``. This server manages a long-lived ``duckdb`` +CLI subprocess so the GPU buffers initialised on /load remain hot across +queries. + +Routes: + GET /health -> 200 OK once the CLI subprocess is ready + POST /load -> opens hits.db, calls gpu_buffer_init, returns + {"elapsed": }. (Schema/data are loaded by + ./load before this runs.) + POST /query -> body: SQL text. Looks it up in QUERIES, runs it via + gpu_processing, returns {"elapsed": }. + GET /data-size -> bytes of hits.db on disk. +""" + +import os +import re +import subprocess +import threading +import timeit + +import uvicorn +from fastapi import FastAPI, HTTPException, Request + +GPU_CACHING_SIZE = os.environ.get("SIRIUS_GPU_CACHING_SIZE", "80 GB") +GPU_PROCESSING_SIZE = os.environ.get("SIRIUS_GPU_PROCESSING_SIZE", "40 GB") +CPU_PROCESSING_SIZE = os.environ.get("SIRIUS_CPU_PROCESSING_SIZE", "100 GB") + +DB_PATH = os.environ.get("SIRIUS_DB", "hits.db") + +app = FastAPI() +proc: subprocess.Popen | None = None +proc_lock = threading.Lock() +buffers_initialized = False + +# Sentinel sent after each command to detect completion in stdout. +SENTINEL = "__SIRIUS_DONE__" + + +# Read query strings from queries.sql (canonical) on import. We expose the +# same shape as the pandas pilot — (sql, callable). 
The callable runs the +# SQL via gpu_processing on the persistent duckdb session. +def _load_query_strings() -> list[str]: + here = os.path.dirname(os.path.abspath(__file__)) + qpath = os.path.join(here, "queries.sql") + with open(qpath) as f: + return [line.rstrip("\n") for line in f if line.strip()] + + +_SQL_LIST = _load_query_strings() + + +def _make_runner(sql: str): + return lambda: _run_gpu(sql) + + +QUERIES: list[tuple[str, callable]] = [(sql, _make_runner(sql)) for sql in _SQL_LIST] +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +def _spawn_duckdb() -> subprocess.Popen: + # Open a persistent duckdb CLI session against hits.db. The Sirius build + # places the duckdb binary on PATH (see install). + return subprocess.Popen( + ["duckdb", DB_PATH], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + + +def _send(cmd: str) -> str: + """Send a SQL/CLI command to duckdb and read until the sentinel. Returns + the raw output (excluding the sentinel line).""" + assert proc is not None and proc.stdin is not None and proc.stdout is not None + with proc_lock: + proc.stdin.write(cmd.rstrip(";") + ";\n") + proc.stdin.write(f"select '{SENTINEL}';\n") + proc.stdin.flush() + out_lines: list[str] = [] + while True: + line = proc.stdout.readline() + if not line: + raise RuntimeError("duckdb subprocess closed unexpectedly") + if SENTINEL in line: + # Drain the trailing border row from the boxed select output. + # DuckDB emits the table for `select '...'` as several lines; + # readline on the SENTINEL line is enough — subsequent lines + # belong to the next command. + break + out_lines.append(line) + return "".join(out_lines) + + +def _run_gpu(sql: str) -> str: + # Wrap user SQL inside gpu_processing("..."); escape embedded double quotes. + escaped = sql.replace('"', '\\"') + return _send(f'call gpu_processing("{escaped}")') + + +@app.get("/health") +def health(): + if proc is None or proc.poll() is not None: + raise HTTPException(status_code=503, detail="duckdb subprocess not running") + return {"ok": True} + + +@app.on_event("startup") +def _startup(): + global proc + proc = _spawn_duckdb() + # Quiet down the CLI a bit. + _send(".mode list") + + +@app.on_event("shutdown") +def _shutdown(): + global proc + if proc is not None: + try: + proc.stdin.write(".quit\n") + proc.stdin.flush() + except Exception: + pass + try: + proc.wait(timeout=5) + except Exception: + proc.kill() + proc = None + + +@app.post("/load") +def load(): + """For Sirius the on-disk DuckDB database is created by the ``./load`` + script (which runs create.sql + load.sql). Here we just initialise the + GPU buffers on the persistent connection so subsequent queries are warm. + """ + global buffers_initialized + start = timeit.default_timer() + if not buffers_initialized: + _send( + f'call gpu_buffer_init("{GPU_CACHING_SIZE}", "{GPU_PROCESSING_SIZE}", ' + f'pinned_memory_size = "{CPU_PROCESSING_SIZE}")' + ) + buffers_initialized = True + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + + +@app.post("/query") +async def query(request: Request): + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + sql = QUERIES[idx][0] + start = timeit.default_timer() + out = _run_gpu(sql) + elapsed = round(timeit.default_timer() - start, 3) + # If duckdb reports an error, surface it. 
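The `_send` helper above is the heart of the persistent-session approach: every command is followed by a sentinel query, and stdout is read until the sentinel comes back, so one long-lived CLI process can serve many HTTP requests. A self-contained sketch of that framing, with `cat` standing in for the Sirius duckdb binary so the snippet runs anywhere (the real server sends the marker as `select '__SIRIUS_DONE__';` so the CLI echoes it back as a result row):

```python
# Sentinel framing over a long-lived subprocess, as in server.py's _send().
# `cat` is only a stand-in for the duckdb CLI here.
import subprocess

SENTINEL = "__DONE__"

proc = subprocess.Popen(
    ["cat"],                                   # server.py spawns ["duckdb", DB_PATH]
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    text=True,
    bufsize=1,
)

def send(payload: str) -> str:
    proc.stdin.write(payload + "\n")
    proc.stdin.write(SENTINEL + "\n")             # marker written right after the payload
    proc.stdin.flush()
    lines = []
    while True:
        line = proc.stdout.readline()
        if not line or line.strip() == SENTINEL:  # EOF or marker: this command is done
            break
        lines.append(line)
    return "".join(lines)

print(send("hello"), end="")   # "hello" — the subprocess stays alive for the next call
```

The real implementation additionally serialises callers with a lock and raises if the subprocess exits mid-read.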
+ if re.search(r"\bError\b", out): + raise HTTPException(status_code=500, detail=out.strip()[:500]) + return {"elapsed": elapsed, "index": idx} + + +@app.get("/data-size") +def data_size(): + try: + return {"bytes": int(os.path.getsize(DB_PATH))} + except OSError: + return {"bytes": 0} + + +if __name__ == "__main__": + port = int(os.environ.get("BENCH_SIRIUS_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/sirius/start b/sirius/start new file mode 100755 index 0000000000..9b24312b54 --- /dev/null +++ b/sirius/start @@ -0,0 +1,19 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# Source the env populated by ./install so the sirius-built duckdb binary is +# on PATH and CUDA/cuDF libs resolve correctly. +if [ -f .sirius_env ]; then + # shellcheck disable=SC1091 + source .sirius_env +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! > server.pid diff --git a/sirius/stop b/sirius/stop new file mode 100755 index 0000000000..787b35abcc --- /dev/null +++ b/sirius/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/sirius/template.json b/sirius/template.json index 3b60099c62..cf07e7c70c 100644 --- a/sirius/template.json +++ b/sirius/template.json @@ -6,6 +6,7 @@ "tags": [ "C++", "column-oriented", - "embedded" + "embedded", + "in-memory" ] } diff --git a/spark-auron/benchmark.sh b/spark-auron/benchmark.sh index b7bd867118..fc4bacc8f3 100755 --- a/spark-auron/benchmark.sh +++ b/spark-auron/benchmark.sh @@ -1,93 +1,5 @@ #!/bin/bash - -# Note: Keep in sync with spark-*/benchmark.sh (see README-accelerators.md for details) -# -# Highlights: -# - pyspark==3.5.6 version is used (latest stable for Auron 5.0.0) -# - Auron installation is added -# - auto-save results - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk - -export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" -export PATH=$JAVA_HOME/bin:$PATH - -python3 -m venv myenv -source myenv/bin/activate -pip install pyspark==3.5.5 psutil - -# Load the data - -../download-hits-parquet-single - -# Install Auron - -AURON_JAR_URL='https://github.com/apache/auron/releases/download/v5.0.0/blaze-engine-spark-3.5-release-5.0.0-SNAPSHOT.jar' - -wget --continue --progress=dot:giga $AURON_JAR_URL -O auron.jar - -# Run the queries - -./run.sh >log.txt 2>&1 - -# Print results to stdout as required -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' 
| sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -DATA_SIZE=$(du -b hits.parquet | cut -f1) - -echo "Data size: $DATA_SIZE" -echo "Load time: 0" - -# Save results as JSON - -MACHINE="${1:-c6a.4xlarge}" # Use first argument as machine name, default to c6a.4xlarge -AURON_VERSION=$(echo $AURON_JAR_URL | grep -Po "\d.\d.\d" | head -n 1) -SPARK_VERSION=$(pip freeze | grep '^pyspark==' | cut -d '=' -f3) - -DATE=$(date -u +%Y-%m-%d) -YYYYMMDD=${DATE//-/} -mkdir -p "results/${YYYYMMDD}" - -( -cat << EOF -{ - "system": "Spark (Auron)", - "date": "${DATE}", - "machine": "${MACHINE}", - "cluster_size": 1, - "proprietary": "no", - "tuned": "no", - "comment": "Using Auron ${AURON_VERSION} with Spark ${SPARK_VERSION}", - "tags": ["Java", "Rust", "column-oriented", "Spark derivative", "DataFusion", "Parquet"], - "load_time": 0, - "data_size": ${DATA_SIZE}, - "result": [ -EOF - -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk -v total=$(grep -cP '^Time:\s+[\d\.]+|Failure!' log.txt) ' - { - if (i % 3 == 0) printf "\t\t["; - if ($1 == "null") printf "null"; - else printf "%.3f", $1; - if (i % 3 != 2) printf ", "; - else { - if (i < total - 1) printf "],\n"; - else printf "]"; - } - i++; - }' - -cat << EOF - - ] -} -EOF -) > "results/${YYYYMMDD}/${MACHINE}.json" - -echo "Results have been saved to results/${YYYYMMDD}/${MACHINE}.json" - +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/spark-auron/check b/spark-auron/check new file mode 100755 index 0000000000..492bdfc9f7 --- /dev/null +++ b/spark-auron/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c 'import pyspark' >/dev/null 2>&1 +[ -f auron.jar ] diff --git a/spark-auron/data-size b/spark-auron/data-size new file mode 100755 index 0000000000..1a34600a86 --- /dev/null +++ b/spark-auron/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -b hits.parquet | cut -f1 diff --git a/spark-auron/install b/spark-auron/install new file mode 100755 index 0000000000..5e38d04eb2 --- /dev/null +++ b/spark-auron/install @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk wget + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install -q pyspark==3.5.5 psutil + +AURON_JAR_URL='https://github.com/apache/auron/releases/download/v5.0.0/blaze-engine-spark-3.5-release-5.0.0-SNAPSHOT.jar' +if [ ! -f auron.jar ]; then + wget --continue --progress=dot:giga "$AURON_JAR_URL" -O auron.jar +fi diff --git a/spark-auron/load b/spark-auron/load new file mode 100755 index 0000000000..f093c1086d --- /dev/null +++ b/spark-auron/load @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Parquet file is read directly by Spark — nothing to load. 
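These per-system scripts (install, check, load, data-size, query, plus the no-op start/stop pair) all target the shared driver that the thin benchmark.sh shims exec into. That driver, lib/benchmark-common.sh, is not part of this excerpt, so the following is only a guess at the loop its contract implies: each query is piped to ./query three times, the last stderr line is taken as the runtime in fractional seconds, and a non-zero exit becomes a null in the results JSON.

```python
# Hypothetical driver loop — lib/benchmark-common.sh is not shown in this diff,
# so this only illustrates the contract the query wrappers document.
import subprocess

TRIES = 3

def run_once(sql: str):
    p = subprocess.run(["./query"], input=sql, capture_output=True, text=True)
    if p.returncode != 0:
        return None                                   # rendered as null in the JSON
    return float(p.stderr.strip().splitlines()[-1])   # last stderr line = seconds

with open("queries.sql") as f:
    queries = [line.strip() for line in f if line.strip()]

results = [[run_once(q) for _ in range(TRIES)] for q in queries]
print(results)   # one [cold, warm, warm] triple per query, 43 in total
```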
+sync diff --git a/spark-auron/query b/spark-auron/query new file mode 100755 index 0000000000..16c9fab1bb --- /dev/null +++ b/spark-auron/query @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +query=$(cat) +printf '%s' "$query" | python3 query.py diff --git a/spark-auron/query.py b/spark-auron/query.py index b07b29e6bb..859d76bab8 100755 --- a/spark-auron/query.py +++ b/spark-auron/query.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 """ -Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details) +Reads SQL on stdin, runs it once via PySpark+Auron, prints result on stdout +and runtime in fractional seconds as the LAST line on stderr. -Highlights: -- memory is split between heap (for Spark) and memoryOverhead (for Auron) -- Auron configuration is added to `SparkSession` +Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details) """ from pyspark.sql import SparkSession @@ -51,13 +50,15 @@ df = df.withColumn("EventDate", F.date_add(F.lit("1970-01-01"), F.col("EventDate"))) df.createOrReplaceTempView("hits") -for try_num in range(3): - try: - start = timeit.default_timer() - result = spark.sql(query) - result.show(100) # some queries should return more than 20 rows which is the default show limit - end = timeit.default_timer() - print("Time: ", end - start) - except Exception as e: - print(e) - print("Failure!") +try: + start = timeit.default_timer() + result = spark.sql(query) + result.show(100) # some queries should return more than 20 rows which is the default show limit + end = timeit.default_timer() + elapsed = end - start + print(f"Time: {elapsed}") + print(f"{elapsed:.6f}", file=sys.stderr) +except Exception as e: + print(e, file=sys.stderr) + print("Failure!", file=sys.stderr) + sys.exit(1) diff --git a/spark-auron/run.sh b/spark-auron/run.sh deleted file mode 100755 index 8c9ca12890..0000000000 --- a/spark-auron/run.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Note: Keep in sync with spark-*/run.sh (see README-accelerators.md for details) - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/spark-auron/start b/spark-auron/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/spark-auron/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/spark-auron/stop b/spark-auron/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/spark-auron/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/spark-comet/benchmark.sh b/spark-comet/benchmark.sh index 9a7eba1c0e..fc4bacc8f3 100755 --- a/spark-comet/benchmark.sh +++ b/spark-comet/benchmark.sh @@ -1,92 +1,5 @@ #!/bin/bash - -# Note: Keep in sync with spark-*/benchmark.sh (see README-accelerators.md for details) -# -# Highlights: -# - pyspark==3.5.6 version is used (latest stable for Comet 0.9.0) -# - Comet installation is added -# - auto-save results - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk - -export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" -export PATH=$JAVA_HOME/bin:$PATH - -python3 -m venv myenv -source myenv/bin/activate -pip install pyspark==3.5.6 psutil - -# Load the data - -../download-hits-parquet-single - -# Install Comet - -COMET_JAR_URL='https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.12/0.9.0/comet-spark-spark3.5_2.12-0.9.0.jar' - -wget --continue --progress=dot:giga 
$COMET_JAR_URL -O comet.jar - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -# Print results to stdout as required -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -DATA_SIZE=$(du -b hits.parquet | cut -f1) - -echo "Data size: $DATA_SIZE" -echo "Load time: 0" - -# Save results as JSON - -MACHINE="${1:-c6a.4xlarge}" # Use first argument as machine name, default to c6a.4xlarge -COMET_VERSION=$(echo $COMET_JAR_URL | grep -Po ".{5}(?=.jar)") -SPARK_VERSION=$(pip freeze | grep '^pyspark==' | cut -d '=' -f3) - -DATE=$(date -u +%Y-%m-%d) -YYYYMMDD=${DATE//-/} -mkdir -p "results/${YYYYMMDD}" - -( -cat << EOF -{ - "system": "Spark (Comet)", - "date": "${DATE}", - "machine": "${MACHINE}", - "cluster_size": 1, - "proprietary": "no", - "tuned": "no", - "comment": "Using Comet ${COMET_VERSION} with Spark ${SPARK_VERSION}", - "tags": ["Java", "Rust", "column-oriented", "Spark derivative", "DataFusion", "Parquet"], - "load_time": 0, - "data_size": ${DATA_SIZE}, - "result": [ -EOF - -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk -v total=$(grep -cP '^Time:\s+[\d\.]+|Failure!' log.txt) ' - { - if (i % 3 == 0) printf "\t\t["; - if ($1 == "null") printf "null"; - else printf "%.3f", $1; - if (i % 3 != 2) printf ", "; - else { - if (i < total - 1) printf "],\n"; - else printf "]"; - } - i++; - }' - -cat << EOF - - ] -} -EOF -) > "results/${YYYYMMDD}/${MACHINE}.json" - -echo "Results have been saved to results/${YYYYMMDD}/${MACHINE}.json" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/spark-comet/check b/spark-comet/check new file mode 100755 index 0000000000..8c9998fc9e --- /dev/null +++ b/spark-comet/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c 'import pyspark' >/dev/null 2>&1 +[ -f comet.jar ] diff --git a/spark-comet/data-size b/spark-comet/data-size new file mode 100755 index 0000000000..1a34600a86 --- /dev/null +++ b/spark-comet/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -b hits.parquet | cut -f1 diff --git a/spark-comet/install b/spark-comet/install new file mode 100755 index 0000000000..108f7c5121 --- /dev/null +++ b/spark-comet/install @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk wget + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install -q pyspark==3.5.6 psutil + +COMET_JAR_URL='https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.12/0.9.0/comet-spark-spark3.5_2.12-0.9.0.jar' +if [ ! 
-f comet.jar ]; then + wget --continue --progress=dot:giga "$COMET_JAR_URL" -O comet.jar +fi diff --git a/spark-comet/load b/spark-comet/load new file mode 100755 index 0000000000..1c31caf315 --- /dev/null +++ b/spark-comet/load @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sync diff --git a/spark-comet/query b/spark-comet/query new file mode 100755 index 0000000000..16c9fab1bb --- /dev/null +++ b/spark-comet/query @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +query=$(cat) +printf '%s' "$query" | python3 query.py diff --git a/spark-comet/query.py b/spark-comet/query.py index 50358069a0..59e2f4615a 100755 --- a/spark-comet/query.py +++ b/spark-comet/query.py @@ -1,12 +1,10 @@ #!/usr/bin/env python3 """ -Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details) +Reads SQL on stdin, runs it once via PySpark+Comet, prints result on stdout +and runtime in fractional seconds as the LAST line on stderr. -Highlights: -- memory is split between heap (for Spark) and off-heap (for Comet) -- Comet configuration is added to `SparkSession` -- debug mode is added +Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details) """ from pyspark.sql import SparkSession @@ -46,7 +44,6 @@ .config("spark.comet.scan.allowIncompatible", True) ) -# Even more Comet configuration if os.getenv("DEBUG") == "1": builder.config("spark.comet.explainFallback.enabled", "true") builder.config("spark.sql.debug.maxToStringFields", "10000") @@ -54,18 +51,19 @@ spark = builder.getOrCreate() df = spark.read.parquet("hits.parquet") -# Do casting before creating the view so no need to change to unreadable integer dates in SQL df = df.withColumn("EventTime", F.col("EventTime").cast("timestamp")) df = df.withColumn("EventDate", F.date_add(F.lit("1970-01-01"), F.col("EventDate"))) df.createOrReplaceTempView("hits") -for try_num in range(3): - try: - start = timeit.default_timer() - result = spark.sql(query) - result.show(100) # some queries should return more than 20 rows which is the default show limit - end = timeit.default_timer() - print("Time: ", end - start) - except Exception as e: - print(e) - print("Failure!") +try: + start = timeit.default_timer() + result = spark.sql(query) + result.show(100) + end = timeit.default_timer() + elapsed = end - start + print(f"Time: {elapsed}") + print(f"{elapsed:.6f}", file=sys.stderr) +except Exception as e: + print(e, file=sys.stderr) + print("Failure!", file=sys.stderr) + sys.exit(1) diff --git a/spark-comet/results/20260509/c6a.4xlarge.json b/spark-comet/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..95a04a1f87 --- /dev/null +++ b/spark-comet/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Spark (Comet)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","Rust","column-oriented","Spark derivative"], + "load_time": 10, + "data_size": 14779976446, + "result": [ + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], 
+ [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [12.436, 6.53, 6.464], + [22.028, 8.69, 8.666], + [5.844, 5.537, 5.558], + [5.871, 5.457, 5.47], + [null, null, null] +] +} + diff --git a/spark-comet/results/20260510/c6a.metal.json b/spark-comet/results/20260510/c6a.metal.json new file mode 100644 index 0000000000..e3f5ad4d5b --- /dev/null +++ b/spark-comet/results/20260510/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Spark (Comet)", + "date": "2026-05-10", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","Rust","column-oriented","Spark derivative"], + "load_time": 1, + "data_size": 14779976446, + "result": [ + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [14.93, 7.416, 7.219], + [24.907, 9.28, 9.347], + [7.239, 6.538, 6.447], + [6.209, 6.22, 6.103], + [null, null, null] +] +} + diff --git a/spark-comet/run.sh b/spark-comet/run.sh deleted file mode 100755 index 8c9ca12890..0000000000 --- a/spark-comet/run.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Note: Keep in sync with spark-*/run.sh (see README-accelerators.md for details) - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/spark-comet/start b/spark-comet/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/spark-comet/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/spark-comet/stop b/spark-comet/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/spark-comet/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/spark-gluten/benchmark.sh b/spark-gluten/benchmark.sh index 0334675547..fc4bacc8f3 100755 --- a/spark-gluten/benchmark.sh +++ b/spark-gluten/benchmark.sh @@ -1,94 +1,5 @@ #!/bin/bash - -# Note: Keep in sync with spark-*/benchmark.sh (see README-accelerators.md for details) -# -# Highlights: -# - pyspark==3.5.2 version is used (latest stable for Gluten 1.4.0) -# - Gluten installation is added -# - auto-save results - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk - -export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" -export PATH=$JAVA_HOME/bin:$PATH - -python3 -m venv myenv -source myenv/bin/activate -pip install pyspark==3.5.2 psutil - -# Load the data - -../download-hits-parquet-single - -# Install Gluten - 
-GLUTEN_JAR_URL='https://github.com/apache/incubator-gluten/releases/download/v1.4.0/apache-gluten-1.4.0-incubating-bin-spark35.tar.gz' - -wget --continue --progress=dot:giga $GLUTEN_JAR_URL -O gluten.gz -tar -xzf gluten.gz -mv gluten-velox-bundle-spark3.5_2.12-linux_amd64-1.4.0.jar gluten.jar - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -# Print results to stdout as required -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -DATA_SIZE=$(du -b hits.parquet | cut -f1) - -echo "Data size: $DATA_SIZE" -echo "Load time: 0" - -# Save results as JSON - -MACHINE="${1:-c6a.4xlarge}" # Use first argument as machine name, default to c6a.4xlarge -GLUTEN_VERSION=$(echo $GLUTEN_JAR_URL | grep -Po "\d.\d.\d" | head -n 1) -SPARK_VERSION=$(pip freeze | grep '^pyspark==' | cut -d '=' -f3) - -DATE=$(date -u +%Y-%m-%d) -YYYYMMDD=${DATE//-/} -mkdir -p "results/${YYYYMMDD}" - -( -cat << EOF -{ - "system": "Spark (Gluten-on-Velox)", - "date": "${DATE}", - "machine": "${MACHINE}", - "cluster_size": 1, - "proprietary": "no", - "tuned": "no", - "comment": "Using Gluten ${GLUTEN_VERSION} with Spark ${SPARK_VERSION}", - "tags": ["Java", "C++", "column-oriented", "Spark derivative", "Velox", "Parquet"], - "load_time": 0, - "data_size": ${DATA_SIZE}, - "result": [ -EOF - -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk -v total=$(grep -cP '^Time:\s+[\d\.]+|Failure!' log.txt) ' - { - if (i % 3 == 0) printf "\t\t["; - if ($1 == "null") printf "null"; - else printf "%.3f", $1; - if (i % 3 != 2) printf ", "; - else { - if (i < total - 1) printf "],\n"; - else printf "]"; - } - i++; - }' - -cat << EOF - - ] -} -EOF -) > "results/${YYYYMMDD}/${MACHINE}.json" - -echo "Results have been saved to results/${YYYYMMDD}/${MACHINE}.json" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/spark-gluten/check b/spark-gluten/check new file mode 100755 index 0000000000..f2375edcd4 --- /dev/null +++ b/spark-gluten/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c 'import pyspark' >/dev/null 2>&1 +[ -f gluten.jar ] diff --git a/spark-gluten/data-size b/spark-gluten/data-size new file mode 100755 index 0000000000..1a34600a86 --- /dev/null +++ b/spark-gluten/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -b hits.parquet | cut -f1 diff --git a/spark-gluten/install b/spark-gluten/install new file mode 100755 index 0000000000..681fef2b4c --- /dev/null +++ b/spark-gluten/install @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk wget + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install -q pyspark==3.5.2 psutil + +GLUTEN_JAR_URL='https://github.com/apache/incubator-gluten/releases/download/v1.4.0/apache-gluten-1.4.0-incubating-bin-spark35.tar.gz' +if [ ! 
-f gluten.jar ]; then + wget --continue --progress=dot:giga "$GLUTEN_JAR_URL" -O gluten.gz + tar -xzf gluten.gz + mv gluten-velox-bundle-spark3.5_2.12-linux_amd64-1.4.0.jar gluten.jar + rm -f gluten.gz +fi diff --git a/spark-gluten/load b/spark-gluten/load new file mode 100755 index 0000000000..1c31caf315 --- /dev/null +++ b/spark-gluten/load @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sync diff --git a/spark-gluten/query b/spark-gluten/query new file mode 100755 index 0000000000..16c9fab1bb --- /dev/null +++ b/spark-gluten/query @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +query=$(cat) +printf '%s' "$query" | python3 query.py diff --git a/spark-gluten/query.py b/spark-gluten/query.py index 4d2c15d34f..a16a0034c0 100755 --- a/spark-gluten/query.py +++ b/spark-gluten/query.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 """ -Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details) +Reads SQL on stdin, runs it once via PySpark+Gluten, prints result on stdout +and runtime in fractional seconds as the LAST line on stderr. -Highlights: -- memory is split between heap (for Spark) and off-heap (for Gluten) -- Gluten configuration is added to `SparkSession` +Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details) """ from pyspark.sql import SparkSession @@ -19,7 +18,6 @@ query = sys.stdin.read() print(query) -# Calculate available memory to configurate SparkSession (in MB) ram = int(round(psutil.virtual_memory().available / (1024 ** 2) * 0.7)) heap = ram // 2 off_heap = ram - heap @@ -46,18 +44,19 @@ spark = builder.getOrCreate() df = spark.read.parquet("hits.parquet") -# Do casting before creating the view so no need to change to unreadable integer dates in SQL df = df.withColumn("EventTime", F.col("EventTime").cast("timestamp")) df = df.withColumn("EventDate", F.date_add(F.lit("1970-01-01"), F.col("EventDate"))) df.createOrReplaceTempView("hits") -for try_num in range(3): - try: - start = timeit.default_timer() - result = spark.sql(query) - result.show(100) # some queries should return more than 20 rows which is the default show limit - end = timeit.default_timer() - print("Time: ", end - start) - except Exception as e: - print(e) - print("Failure!") +try: + start = timeit.default_timer() + result = spark.sql(query) + result.show(100) + end = timeit.default_timer() + elapsed = end - start + print(f"Time: {elapsed}") + print(f"{elapsed:.6f}", file=sys.stderr) +except Exception as e: + print(e, file=sys.stderr) + print("Failure!", file=sys.stderr) + sys.exit(1) diff --git a/spark-gluten/results/20260509/c6a.4xlarge.json b/spark-gluten/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..7253d96e4a --- /dev/null +++ b/spark-gluten/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Spark (Gluten-on-Velox)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","C++","column-oriented","Spark derivative"], + "load_time": 10, + "data_size": 14779976446, + "result": [ + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], 
+ [null, null, 2.605], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [4.155, 3.721, 3.648], + [5.376, 4.851, 4.691], + [4.313, 3.889, 3.772], + [4.241, 3.576, 3.628], + [4.434, 3.883, 3.854] +] +} + diff --git a/spark-gluten/run.sh b/spark-gluten/run.sh deleted file mode 100755 index 8c9ca12890..0000000000 --- a/spark-gluten/run.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Note: Keep in sync with spark-*/run.sh (see README-accelerators.md for details) - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/spark-gluten/start b/spark-gluten/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/spark-gluten/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/spark-gluten/stop b/spark-gluten/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/spark-gluten/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/spark-velox/README.md b/spark-velox/README.md new file mode 100644 index 0000000000..12298bfd5d --- /dev/null +++ b/spark-velox/README.md @@ -0,0 +1,17 @@ +Spark + [Velox](https://velox-lib.io/) via [Apache Gluten](https://gluten.apache.org/). Velox is a C++ vectorised execution engine; the Gluten plugin offloads Spark Catalyst's physical plan to Velox. + +This entry is functionally close to [`spark-gluten/`](../spark-gluten/) — the difference is that the Gluten backend is pinned to `velox` explicitly via `spark.gluten.sql.columnar.backend.lib`, so the benchmark name reflects the engine actually doing the work (Gluten can in principle also use the ClickHouse backend). + +### Run + +``` +./benchmark.sh +``` + +Optionally pass a machine spec to tag the saved results: `./benchmark.sh c6a.8xlarge`. + +### Notes + +- Apache Gluten ships pre-built Velox bundles only for `linux_amd64`. ARM hosts have to build the bundle from source — see [Gluten's build guide](https://gluten.apache.org/docs/getting-started/build-guide/). +- Velox runs off-heap; the script splits available memory 50/50 between Spark's JVM heap and Gluten's native off-heap pool, matching [official guidance](https://apache.github.io/incubator-gluten/get-started/Velox.html#submit-the-spark-sql-job). +- See [spark-gluten/README.md](../spark-gluten/README.md) and [spark/README-accelerators.md](../spark/README-accelerators.md) for additional context. diff --git a/spark-velox/benchmark.sh b/spark-velox/benchmark.sh new file mode 100755 index 0000000000..fc4bacc8f3 --- /dev/null +++ b/spark-velox/benchmark.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. 
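The README above notes that Velox runs off-heap, with available memory split between the JVM heap and Gluten's native pool. The arithmetic is the same as in spark-velox/query.py; a standalone version of just that calculation (the printed figures obviously depend on the machine):

```python
# Same heap / off-heap split as spark-velox/query.py: ~70% of currently
# available RAM, half to the JVM heap, the rest to Gluten's off-heap pool.
import psutil

ram = int(round(psutil.virtual_memory().available / (1024 ** 2) * 0.7))  # MB
heap = ram // 2          # -> spark.driver.memory
off_heap = ram - heap    # -> spark.memory.offHeap.size

print(f"spark.driver.memory={heap}m  spark.memory.offHeap.size={off_heap}m")
```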
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/spark-velox/check b/spark-velox/check new file mode 100755 index 0000000000..f2375edcd4 --- /dev/null +++ b/spark-velox/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c 'import pyspark' >/dev/null 2>&1 +[ -f gluten.jar ] diff --git a/spark-velox/data-size b/spark-velox/data-size new file mode 100755 index 0000000000..1a34600a86 --- /dev/null +++ b/spark-velox/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -b hits.parquet | cut -f1 diff --git a/spark-velox/install b/spark-velox/install new file mode 100755 index 0000000000..681fef2b4c --- /dev/null +++ b/spark-velox/install @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk wget + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install -q pyspark==3.5.2 psutil + +GLUTEN_JAR_URL='https://github.com/apache/incubator-gluten/releases/download/v1.4.0/apache-gluten-1.4.0-incubating-bin-spark35.tar.gz' +if [ ! -f gluten.jar ]; then + wget --continue --progress=dot:giga "$GLUTEN_JAR_URL" -O gluten.gz + tar -xzf gluten.gz + mv gluten-velox-bundle-spark3.5_2.12-linux_amd64-1.4.0.jar gluten.jar + rm -f gluten.gz +fi diff --git a/spark-velox/load b/spark-velox/load new file mode 100755 index 0000000000..1c31caf315 --- /dev/null +++ b/spark-velox/load @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sync diff --git a/spark-velox/queries.sql b/spark-velox/queries.sql new file mode 100644 index 0000000000..8fafcbcf98 --- /dev/null +++ b/spark-velox/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY 
UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '$1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 
84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/spark-velox/query b/spark-velox/query new file mode 100755 index 0000000000..16c9fab1bb --- /dev/null +++ b/spark-velox/query @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +query=$(cat) +printf '%s' "$query" | python3 query.py diff --git a/spark-velox/query.py b/spark-velox/query.py new file mode 100755 index 0000000000..5a65b267f3 --- /dev/null +++ b/spark-velox/query.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 + +""" +Spark + Velox via Apache Gluten. 
Reads SQL on stdin, runs it once via +PySpark, prints the result on stdout, and the runtime in fractional +seconds as the LAST line on stderr. + +The Gluten plugin offloads the physical plan from Spark Catalyst to +Velox (a C++ vectorised execution engine). spark-gluten/query.py uses +Gluten with whichever backend the bundle was built with; this file +pins the backend to velox explicitly via +spark.gluten.sql.columnar.backend.lib. + +Note: Keep in sync with spark-*/query.py (see spark/README-accelerators.md). +""" + +from pyspark.sql import SparkSession +import pyspark.sql.functions as F + +import psutil +import sys +import timeit + + +query = sys.stdin.read() +print(query) + +# Velox runs off-heap, so split available memory between Spark's JVM +# heap and Gluten's native off-heap pool. +ram = int(round(psutil.virtual_memory().available / (1024 ** 2) * 0.7)) +heap = ram // 2 +off_heap = ram - heap +print(f"SparkSession will use {heap} MB of heap and {off_heap} MB of off-heap memory (total {ram} MB)") + +builder = ( + SparkSession + .builder + .appName("ClickBench") + .config("spark.driver", "local[*]") # Use all cores + .config("spark.driver.memory", f"{heap}m") + .config("spark.sql.parquet.binaryAsString", True) # Correct length / text result + + # Gluten + Velox configuration + .config("spark.jars", "gluten.jar") + .config("spark.driver.extraClassPath", "gluten.jar") + .config("spark.plugins", "org.apache.gluten.GlutenPlugin") + .config("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager") + .config("spark.gluten.sql.columnar.backend.lib", "velox") + .config("spark.memory.offHeap.enabled", "true") + .config("spark.memory.offHeap.size", f"{off_heap}m") + .config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true") +) + +spark = builder.getOrCreate() + +df = spark.read.parquet("hits.parquet") +df = df.withColumn("EventTime", F.col("EventTime").cast("timestamp")) +df = df.withColumn("EventDate", F.date_add(F.lit("1970-01-01"), F.col("EventDate"))) +df.createOrReplaceTempView("hits") + +try: + start = timeit.default_timer() + result = spark.sql(query) + result.show(100) + end = timeit.default_timer() + elapsed = end - start + print(f"Time: {elapsed}") + print(f"{elapsed:.6f}", file=sys.stderr) +except Exception as e: + print(e, file=sys.stderr) + print("Failure!", file=sys.stderr) + sys.exit(1) diff --git a/spark-velox/results/20260509/c6a.4xlarge.json b/spark-velox/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..218bf89281 --- /dev/null +++ b/spark-velox/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Spark (Velox)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","C++","column-oriented","Spark derivative"], + "load_time": 8, + "data_size": 14779976446, + "result": [ + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], 
+ [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [4.305, 3.602, 3.687], + [5.362, 4.777, 4.738], + [4.348, 3.773, 3.709], + [4.121, 3.548, 3.57], + [4.417, 3.94, 3.872] +] +} + diff --git a/spark-velox/start b/spark-velox/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/spark-velox/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/spark-velox/stop b/spark-velox/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/spark-velox/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/spark-velox/template.json b/spark-velox/template.json new file mode 100644 index 0000000000..11157b7a2f --- /dev/null +++ b/spark-velox/template.json @@ -0,0 +1,12 @@ +{ + "system": "Spark (Velox)", + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "Java", + "C++", + "column-oriented", + "Spark derivative" + ] +} diff --git a/spark/benchmark.sh b/spark/benchmark.sh index 573e403f72..5a4cc33c37 100755 --- a/spark/benchmark.sh +++ b/spark/benchmark.sh @@ -1,29 +1,7 @@ #!/bin/bash - -# Note: Keep in sync with spark-*/benchmark.sh (see README-accelerators.md for details) - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk - -export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" -export PATH=$JAVA_HOME/bin:$PATH - -python3 -m venv myenv -source myenv/bin/activate -pip install pyspark==4.0.0 psutil - -# Load the data - -../download-hits-parquet-single - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo "Data size: $(du -b hits.parquet)" -echo "Load time: 0" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +# Spark runs in-process per query — restart between queries is meaningless +# (and would re-download nothing). Skip restart. +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/spark/check b/spark/check new file mode 100755 index 0000000000..e61d3c5d60 --- /dev/null +++ b/spark/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# Trivial: ensure the venv has pyspark importable. +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c 'import pyspark' >/dev/null 2>&1 diff --git a/spark/data-size b/spark/data-size new file mode 100755 index 0000000000..1a34600a86 --- /dev/null +++ b/spark/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -b hits.parquet | cut -f1 diff --git a/spark/install b/spark/install new file mode 100755 index 0000000000..c007f3084c --- /dev/null +++ b/spark/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install -q pyspark==3.5.5 psutil diff --git a/spark/load b/spark/load new file mode 100755 index 0000000000..8d341815c1 --- /dev/null +++ b/spark/load @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# Spark reads hits.parquet directly — nothing to "load". The parquet was +# downloaded by the shared driver. Just sync. 
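Since nothing is materialised at load time, each spark-*/query.py invocation reads hits.parquet and fixes up the two date columns before registering the hits view: EventTime is cast to a timestamp, and EventDate, stored as an integer number of days since the Unix epoch, is turned into a real date with date_add. The plain-Python equivalent of that EventDate conversion, for one value (15888 is a made-up sample, not taken from the dataset):

```python
# Equivalent of F.date_add(F.lit("1970-01-01"), F.col("EventDate")) for a
# single value; 15888 is an arbitrary illustrative day count.
from datetime import date, timedelta

event_date = 15888                                    # days since 1970-01-01
print(date(1970, 1, 1) + timedelta(days=event_date))  # 2013-07-02
```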
+sync diff --git a/spark/query b/spark/query new file mode 100755 index 0000000000..72bb834544 --- /dev/null +++ b/spark/query @@ -0,0 +1,12 @@ +#!/bin/bash +# Reads SQL on stdin, runs it via PySpark. +# Stdout: query output. +# Stderr: runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +query=$(cat) +printf '%s' "$query" | python3 query.py diff --git a/spark/query.py b/spark/query.py index 268b60e71a..55aa42d010 100755 --- a/spark/query.py +++ b/spark/query.py @@ -1,7 +1,10 @@ #!/usr/bin/env python3 """ -Note: Keep in sync with spark-*/query.sh (see README-accelerators.md for details) +Reads SQL on stdin, runs it once via PySpark, prints the result on stdout +and the runtime in fractional seconds as the LAST line on stderr. + +Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details) """ from pyspark.sql import SparkSession @@ -34,13 +37,16 @@ df = df.withColumn("EventDate", F.date_add(F.lit("1970-01-01"), F.col("EventDate"))) df.createOrReplaceTempView("hits") -for try_num in range(3): - try: - start = timeit.default_timer() - result = spark.sql(query) - result.show(100) # some queries should return more than 20 rows which is the default show limit - end = timeit.default_timer() - print("Time: ", end - start) - except Exception as e: - print(e); - print("Failure!") +try: + start = timeit.default_timer() + result = spark.sql(query) + result.show(100) # some queries should return more than 20 rows which is the default show limit + end = timeit.default_timer() + elapsed = end - start + print(f"Time: {elapsed}") + # Last stderr line: fractional seconds (driver-required contract). + print(f"{elapsed:.6f}", file=sys.stderr) +except Exception as e: + print(e, file=sys.stderr) + print("Failure!", file=sys.stderr) + sys.exit(1) diff --git a/spark/results/20260509/c6a.metal.json b/spark/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..a025583c3d --- /dev/null +++ b/spark/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Spark", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","column-oriented","stateless","Spark derivative"], + "load_time": 6, + "data_size": 14779976446, + "result": [ + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [2.555, 2.534, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null] +] +} + diff --git a/spark/run.sh b/spark/run.sh deleted file mode 100755 index 8c9ca12890..0000000000 --- a/spark/run.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Note: Keep in sync with spark-*/run.sh (see README-accelerators.md 
for details) - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/spark/start b/spark/start new file mode 100755 index 0000000000..8c8eb0c9ff --- /dev/null +++ b/spark/start @@ -0,0 +1,3 @@ +#!/bin/bash +# PySpark runs in-process per query — nothing to start. +exit 0 diff --git a/spark/stop b/spark/stop new file mode 100755 index 0000000000..42fca7c6a7 --- /dev/null +++ b/spark/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# PySpark is in-process — nothing to stop. +exit 0 diff --git a/sqlite/benchmark.sh b/sqlite/benchmark.sh index 705e9739ee..b0b9f4775a 100755 --- a/sqlite/benchmark.sh +++ b/sqlite/benchmark.sh @@ -1,21 +1,5 @@ #!/bin/bash - -sudo apt-get update -y -sudo apt-get install -y sqlite3 - -sqlite3 mydb < create.sql - -../download-hits-csv - -echo -n "Load time: " -command time -f '%e' sqlite3 mydb '.import --csv hits.csv hits' -echo -n "Data size: " -wc -c mydb - -./run.sh 2>&1 | tee log.txt - -cat log.txt | - grep -P '^real|^Error|Parse error' | - sed -r -e 's/^(Error|Parse error).*$/null/; s/^real\s*([0-9.]+)m([0-9.]+)s$/\1 \2/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if ($1 == "null") { skip = 1 } else { if (i % 3 == 0) { printf "[" }; printf skip ? "null" : $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; skip = 0; } }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/sqlite/check b/sqlite/check new file mode 100755 index 0000000000..3cca70d917 --- /dev/null +++ b/sqlite/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sqlite3 :memory: 'SELECT 1' >/dev/null diff --git a/sqlite/data-size b/sqlite/data-size new file mode 100755 index 0000000000..f94c4eccf8 --- /dev/null +++ b/sqlite/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < mydb diff --git a/sqlite/install b/sqlite/install new file mode 100755 index 0000000000..ff8710145c --- /dev/null +++ b/sqlite/install @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +if ! command -v sqlite3 >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y sqlite3 +fi diff --git a/sqlite/load b/sqlite/load new file mode 100755 index 0000000000..896f445abc --- /dev/null +++ b/sqlite/load @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +# Idempotent: blow away any prior DB. +rm -f mydb + +sqlite3 mydb < create.sql +sqlite3 mydb '.import --csv hits.csv hits' + +rm -f hits.csv +sync diff --git a/sqlite/query b/sqlite/query new file mode 100755 index 0000000000..e2ee624fd4 --- /dev/null +++ b/sqlite/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via sqlite3 against mydb. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (from `time`). +# Exit non-zero on error. +set -e + +query=$(cat) + +# Use bash builtin `time` with TIMEFORMAT to get a single fractional-seconds +# value on stderr. Capture sqlite3 output separately. +TIMEFORMAT='%R' +{ time sqlite3 mydb "$query" 1>/tmp/sqlite.out.$$ 2>/tmp/sqlite.err.$$; } 2>/tmp/sqlite.time.$$ || status=$? +status=${status:-0} + +cat /tmp/sqlite.out.$$ +if [ "$status" -ne 0 ]; then + cat /tmp/sqlite.err.$$ >&2 + rm -f /tmp/sqlite.out.$$ /tmp/sqlite.err.$$ /tmp/sqlite.time.$$ + exit "$status" +fi + +# Even on success sqlite3 may have warnings on stderr; pass them through but +# end with the timing as the last line. 
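The wrapper leans on bash's TIMEFORMAT to get a single fractional-seconds figure, and is careful to keep that figure as the last stderr line even when sqlite3 prints warnings first. Not how the repo does it, but the same ordering guarantees expressed as a small Python sketch (assumes the sqlite3 CLI and the mydb file created by ./load):

```python
# Sketch of sqlite/query's behaviour: result on stdout, any sqlite3 warnings
# passed through on stderr, wall time as the final stderr line, and the
# original exit status preserved (no timing is printed on failure).
import subprocess, sys, time

def run(sql: str, db: str = "mydb") -> int:
    start = time.perf_counter()
    p = subprocess.run(["sqlite3", db, sql], capture_output=True, text=True)
    elapsed = time.perf_counter() - start
    sys.stdout.write(p.stdout)
    sys.stderr.write(p.stderr)                    # warnings, if any, come first
    if p.returncode == 0:
        print(f"{elapsed:.3f}", file=sys.stderr)  # timing is the last stderr line
    return p.returncode

if __name__ == "__main__":
    sys.exit(run(sys.stdin.read()))
```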
+cat /tmp/sqlite.err.$$ >&2 +cat /tmp/sqlite.time.$$ >&2 + +rm -f /tmp/sqlite.out.$$ /tmp/sqlite.err.$$ /tmp/sqlite.time.$$ diff --git a/sqlite/run.sh b/sqlite/run.sh deleted file mode 100755 index 5693ddd8e0..0000000000 --- a/sqlite/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - time sqlite3 mydb <<< "${query}" - done; -done; diff --git a/sqlite/start b/sqlite/start new file mode 100755 index 0000000000..6976d11cd1 --- /dev/null +++ b/sqlite/start @@ -0,0 +1,3 @@ +#!/bin/bash +# sqlite3 is an embedded CLI tool — no daemon to start. +exit 0 diff --git a/sqlite/stop b/sqlite/stop new file mode 100755 index 0000000000..541aa5672b --- /dev/null +++ b/sqlite/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# sqlite3 is an embedded CLI tool — no daemon to stop. +exit 0 diff --git a/starrocks/benchmark.sh b/starrocks/benchmark.sh index 2abac95915..531bd65038 100755 --- a/starrocks/benchmark.sh +++ b/starrocks/benchmark.sh @@ -1,84 +1,5 @@ #!/bin/bash - -# This benchmark should run on Amazon Linux - -set -e - -VERSION=4.0.2-ubuntu-$(dpkg --print-architecture) -# Install -wget --continue --progress=dot:giga https://releases.starrocks.io/starrocks/StarRocks-$VERSION.tar.gz -O StarRocks-$VERSION.tar.gz -tar zxvf StarRocks-${VERSION}.tar.gz - -cd StarRocks-${VERSION}/ - -# Install dependencies -sudo apt-get update -y -sudo apt-get install -y openjdk-17-jre mariadb-client -export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture) -export PATH=$JAVA_HOME/bin:$PATH - -# Create directory for FE and BE -IPADDR=`hostname -i` -export STARROCKS_HOME=`pwd` -mkdir -p meta storage - -# Start Frontend -printf "\nmeta_dir = ${STARROCKS_HOME}/meta \n" >> fe/conf/fe.conf -fe/bin/start_fe.sh --daemon - -# Start Backend -printf "\nstorage_root_path = ${STARROCKS_HOME}/storage\n" >> be/conf/be.conf -# Disable internal caches so that the cold run (1st of 3 tries) is actually cold. -# Without this, the BE process keeps decoded data in its own in-memory page cache -# (`storage_page_cache`, default ~20% of RAM) which `drop_caches` does not clear, -# so first-run timings reflect a warm cache and underreport cold-run latency. -# `datacache_enable=false` covers the unified Data Cache (page + block) path in v3.3+. -printf "\ndisable_storage_page_cache = true\n" >> be/conf/be.conf -printf "\ndatacache_enable = false\n" >> be/conf/be.conf -be/bin/start_be.sh --daemon - -# Setup cluster -# wait some seconds util fe can serve -sleep 30 -mysql -h 127.0.0.1 -P9030 -uroot -e "ALTER SYSTEM ADD BACKEND '${IPADDR}:9050' " -# wait some seconds util be joins -sleep 30 - -# Prepare Data -cd ../ -../download-hits-tsv - -# Create Table -mysql -h 127.0.0.1 -P9030 -uroot -e "CREATE DATABASE hits" -mysql -h 127.0.0.1 -P9030 -uroot hits < create.sql - -# Load Data -START=$(date +%s) -echo "Start to load data..." 
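Every run.sh deleted in this patch had the same shape (sync, drop the OS page cache, run each query three times), so that loop presumably now lives once in lib/benchmark-common.sh, which is outside this diff. A minimal sketch of such a runner under that assumption: it feeds each line of queries.sql to the per-system ./query wrapper, takes the last stderr line as the timing, and prints null on failure; helper names and output format here are illustrative, not the driver's actual code.

#!/bin/bash
# Hypothetical per-query loop for the shared driver (details assumed).
TRIES=3

while read -r query; do
    sync
    echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null

    echo -n "["
    for i in $(seq 1 "$TRIES"); do
        # ./query: result on stdout, runtime in fractional seconds as the
        # last line of stderr, non-zero exit on error.
        if ./query <<< "$query" >/dev/null 2>stderr.txt; then
            tail -n1 stderr.txt | tr -d '\n'
        else
            printf 'null'
        fi
        [ "$i" -lt "$TRIES" ] && echo -n ","
    done
    echo "],"
done < queries.sql
rm -f stderr.txt

A single manual invocation against any of these systems looks like `echo 'SELECT COUNT(*) FROM hits' | ./query`: the result appears on stdout and a timing such as 0.812 (illustrative) as the last line of stderr.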
-# `timeout:1000` header: see https://github.com/ClickHouse/ClickBench/pull/740 -curl --location-trusted \ - -u root: \ - -T "hits.tsv" \ - -H "label:hits_tsv_${START}" \ - -H "timeout:1000" \ - -H "columns: WatchID,JavaEnable,Title,GoodEvent,EventTime,EventDate,CounterID,ClientIP,RegionID,UserID,CounterClass,OS,UserAgent,URL,Referer,IsRefresh,RefererCategoryID,RefererRegionID,URLCategoryID,URLRegionID,ResolutionWidth,ResolutionHeight,ResolutionDepth,FlashMajor,FlashMinor,FlashMinor2,NetMajor,NetMinor,UserAgentMajor,UserAgentMinor,CookieEnable,JavascriptEnable,IsMobile,MobilePhone,MobilePhoneModel,Params,IPNetworkID,TraficSourceID,SearchEngineID,SearchPhrase,AdvEngineID,IsArtifical,WindowClientWidth,WindowClientHeight,ClientTimeZone,ClientEventTime,SilverlightVersion1,SilverlightVersion2,SilverlightVersion3,SilverlightVersion4,PageCharset,CodeVersion,IsLink,IsDownload,IsNotBounce,FUniqID,OriginalURL,HID,IsOldCounter,IsEvent,IsParameter,DontCountHits,WithHash,HitColor,LocalEventTime,Age,Sex,Income,Interests,Robotness,RemoteIP,WindowName,OpenerName,HistoryLength,BrowserLanguage,BrowserCountry,SocialNetwork,SocialAction,HTTPError,SendTiming,DNSTiming,ConnectTiming,ResponseStartTiming,ResponseEndTiming,FetchTiming,SocialSourceNetworkID,SocialSourcePage,ParamPrice,ParamOrderID,ParamCurrency,ParamCurrencyID,OpenstatServiceName,OpenstatCampaignID,OpenstatAdID,OpenstatSourceID,UTMSource,UTMMedium,UTMCampaign,UTMContent,UTMTerm,FromTag,HasGCLID,RefererHash,URLHash,CLID" \ - http://localhost:8030/api/hits/hits/_stream_load -END=$(date +%s) -LOADTIME=$(echo "$END - $START" | bc) -echo "Load time: $LOADTIME" - -# Dataset contains about 40GB of data when the import is just completed. -# This is because the trashed data generated during the compaction process. -# After about tens of minutes, when the gc is completed, the system includes about 16.5GB of data. -echo -n "Data size: " -du -bcs StarRocks-${VERSION}/storage/ | grep total -# Dataset contains 99997497 rows -mysql -h 127.0.0.1 -P9030 -uroot hits -e "SELECT count(*) FROM hits" - -./run.sh 2>&1 | tee -a log.txt - -cat log.txt | - grep -P 'rows? in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/starrocks/check b/starrocks/check new file mode 100755 index 0000000000..c6e836c8c1 --- /dev/null +++ b/starrocks/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null diff --git a/starrocks/data-size b/starrocks/data-size new file mode 100755 index 0000000000..c1c21f9f9b --- /dev/null +++ b/starrocks/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +SR_DIR=$(cat .sr_dir) +du -bcs "$SR_DIR/storage/" | awk '/total$/ {print $1}' diff --git a/starrocks/install b/starrocks/install new file mode 100755 index 0000000000..a197358dcb --- /dev/null +++ b/starrocks/install @@ -0,0 +1,27 @@ +#!/bin/bash +set -e + +VERSION="4.0.2-ubuntu-$(dpkg --print-architecture)" +SR_DIR="StarRocks-$VERSION" + +if [ ! -d "$SR_DIR" ]; then + if [ ! 
-f "$SR_DIR.tar.gz" ]; then + wget --continue --progress=dot:giga \ + "https://releases.starrocks.io/starrocks/$SR_DIR.tar.gz" \ + -O "$SR_DIR.tar.gz" + fi + tar zxf "$SR_DIR.tar.gz" + + # Configure FE/BE. + mkdir -p "$SR_DIR/meta" "$SR_DIR/storage" + printf "\nmeta_dir = $PWD/$SR_DIR/meta \n" >> "$SR_DIR/fe/conf/fe.conf" + printf "\nstorage_root_path = $PWD/$SR_DIR/storage\n" >> "$SR_DIR/be/conf/be.conf" + # Disable internal caches so the cold run is actually cold. + printf "\ndisable_storage_page_cache = true\n" >> "$SR_DIR/be/conf/be.conf" + printf "\ndatacache_enable = false\n" >> "$SR_DIR/be/conf/be.conf" +fi + +sudo apt-get update -y +sudo apt-get install -y openjdk-17-jre mariadb-client bc + +echo "$SR_DIR" > .sr_dir diff --git a/starrocks/load b/starrocks/load new file mode 100755 index 0000000000..2fad538eb7 --- /dev/null +++ b/starrocks/load @@ -0,0 +1,19 @@ +#!/bin/bash +set -e + +# Idempotent: drop+create database. +mysql -h127.0.0.1 -P9030 -uroot -e "DROP DATABASE IF EXISTS hits" +mysql -h127.0.0.1 -P9030 -uroot -e "CREATE DATABASE hits" +mysql -h127.0.0.1 -P9030 -uroot hits < create.sql + +START=$(date +%s) +curl --location-trusted \ + -u root: \ + -T "hits.tsv" \ + -H "label:hits_tsv_${START}" \ + -H "timeout:1000" \ + -H "columns: WatchID,JavaEnable,Title,GoodEvent,EventTime,EventDate,CounterID,ClientIP,RegionID,UserID,CounterClass,OS,UserAgent,URL,Referer,IsRefresh,RefererCategoryID,RefererRegionID,URLCategoryID,URLRegionID,ResolutionWidth,ResolutionHeight,ResolutionDepth,FlashMajor,FlashMinor,FlashMinor2,NetMajor,NetMinor,UserAgentMajor,UserAgentMinor,CookieEnable,JavascriptEnable,IsMobile,MobilePhone,MobilePhoneModel,Params,IPNetworkID,TraficSourceID,SearchEngineID,SearchPhrase,AdvEngineID,IsArtifical,WindowClientWidth,WindowClientHeight,ClientTimeZone,ClientEventTime,SilverlightVersion1,SilverlightVersion2,SilverlightVersion3,SilverlightVersion4,PageCharset,CodeVersion,IsLink,IsDownload,IsNotBounce,FUniqID,OriginalURL,HID,IsOldCounter,IsEvent,IsParameter,DontCountHits,WithHash,HitColor,LocalEventTime,Age,Sex,Income,Interests,Robotness,RemoteIP,WindowName,OpenerName,HistoryLength,BrowserLanguage,BrowserCountry,SocialNetwork,SocialAction,HTTPError,SendTiming,DNSTiming,ConnectTiming,ResponseStartTiming,ResponseEndTiming,FetchTiming,SocialSourceNetworkID,SocialSourcePage,ParamPrice,ParamOrderID,ParamCurrency,ParamCurrencyID,OpenstatServiceName,OpenstatCampaignID,OpenstatAdID,OpenstatSourceID,UTMSource,UTMMedium,UTMCampaign,UTMContent,UTMTerm,FromTag,HasGCLID,RefererHash,URLHash,CLID" \ + http://localhost:8030/api/hits/hits/_stream_load + +rm -f hits.tsv +sync diff --git a/starrocks/query b/starrocks/query new file mode 100755 index 0000000000..025bb4f1e1 --- /dev/null +++ b/starrocks/query @@ -0,0 +1,30 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via mysql client against StarRocks. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(mysql -vvv -h127.0.0.1 -P9030 -uroot hits -e "$query" 2>&1) || status=$? +status=${status:-0} + +printf '%s\n' "$out" | grep -vP '^\([0-9.]+\s+sec\)$|rows? 
in set|Empty set' + +if [ "$status" -ne 0 ] || printf '%s\n' "$out" | grep -qE '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +secs=$(printf '%s\n' "$out" \ + | grep -oP '\((?:([0-9.]+)\s+min\s+)?([0-9.]+)\s+sec\)' \ + | tail -n1 \ + | sed -r 's/\((([0-9.]+) min )?([0-9.]+) sec\)/\2 \3/' \ + | awk '{ if ($2 != "") print $1*60 + $2; else print $1 }') + +if [ -z "$secs" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi +printf '%s\n' "$secs" >&2 diff --git a/starrocks/results/20260509/c6a.4xlarge.json b/starrocks/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..5a36dc0783 --- /dev/null +++ b/starrocks/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "StarRocks", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","MySQL compatible","lukewarm-cold-run"], + "load_time": 596, + "data_size": 21106452922, + "result": [ + [0.029, 0.033, 0.025], + [0.744, 0.055, 0.046], + [2.043, 0.093, 0.093], + [2.751, 0.125, 0.114], + [2.74, 0.356, 0.335], + [2.696, 0.877, 0.966], + [2.319, 0.035, 0.033], + [0.736, 0.053, 0.047], + [4.933, 0.363, 0.355], + [7.542, 0.715, 0.673], + [3.679, 0.278, 0.27], + [4.277, 0.324, 0.307], + [2.135, 0.682, 0.648], + [4.966, 0.942, 1.216], + [2.889, 0.836, 0.825], + [2.697, 0.348, 0.345], + [5.237, 1.536, 1.374], + [0.376, 0.083, 0.12], + [7.764, 2.393, 2.436], + [0.104, 0.023, 0.025], + [11.938, 0.799, 0.797], + [14.258, 0.595, 0.591], + [27.192, 1.784, 1.82], + [48.922, 2.802, 1.043], + [2.405, 0.085, 0.083], + [1.951, 0.149, 0.142], + [2.466, 0.088, 0.08], + [12.012, 1.059, 1.032], + [10.748, 9.488, 9.562], + [1.371, 0.139, 0.118], + [7.407, 0.483, 0.495], + [10.245, 0.681, 0.658], + [6.403, 3.017, 2.913], + [13.918, 5.21, 5.24], + [13.931, 5.249, 5.273], + [2.184, 0.617, 0.587], + [1.372, 0.1, 0.094], + [1.311, 0.089, 0.085], + [1.275, 0.074, 0.07], + [2.324, 0.203, 0.2], + [0.954, 0.059, 0.055], + [1.076, 0.067, 0.06], + [0.84, 0.049, 0.044] +] +} + diff --git a/starrocks/results/20260509/c6a.metal.json b/starrocks/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..ab4bb18fc2 --- /dev/null +++ b/starrocks/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "StarRocks", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","MySQL compatible","lukewarm-cold-run"], + "load_time": 424, + "data_size": 20515317781, + "result": [ + [0.943, 0.056, 0.049], + [0.056, 0.036, 0.035], + [0.764, 0.067, 0.067], + [1.57, 0.085, 0.07], + [1.6, 0.101, 0.089], + [1.285, 0.252, 0.239], + [0.995, 0.048, 0.047], + [0.142, 0.051, 0.048], + [3.182, 0.124, 0.115], + [4.559, 0.363, 0.344], + [2.204, 0.11, 0.095], + [2.757, 0.118, 0.1], + [1.141, 0.209, 0.204], + [3.676, 0.289, 0.288], + [1.458, 0.208, 0.201], + [1.609, 0.098, 0.088], + [3.474, 0.273, 0.246], + [0.446, 0.106, 0.317], + [5.63, 0.439, 0.374], + [0.388, 0.018, 0.016], + [11.536, 0.253, 0.229], + [13.258, 0.252, 0.226], + [24.967, 0.432, 0.409], + [37.335, 0.929, 0.278], + [1.308, 0.062, 0.047], + [1.116, 0.091, 0.07], + [1.315, 0.07, 0.049], + [11.433, 0.278, 0.25], + [9.401, 1.12, 1.15], + [0.708, 0.104, 0.083], + [4.935, 0.197, 0.169], + [7.427, 0.227, 0.195], + [4.525, 0.594, 0.617], + [11.759, 1.046, 1.066], + [11.794, 1.132, 1.128], + [1.048, 0.172, 0.156], + [1.405, 0.073, 0.061], + [1.39, 0.07, 0.066], + 
[1.36, 0.094, 0.064], + [2.59, 0.114, 0.102], + [0.986, 0.061, 0.049], + [1.17, 0.056, 0.056], + [0.782, 0.068, 0.057] +] +} + diff --git a/starrocks/run.sh b/starrocks/run.sh deleted file mode 100755 index 6b9200c118..0000000000 --- a/starrocks/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - mysql -vvv -h127.1 -P9030 -uroot hits -e "${query}" - done -done; diff --git a/starrocks/start b/starrocks/start new file mode 100755 index 0000000000..e279fa9c63 --- /dev/null +++ b/starrocks/start @@ -0,0 +1,21 @@ +#!/bin/bash +set -e + +SR_DIR=$(cat .sr_dir) +export STARROCKS_HOME="$PWD/$SR_DIR" +export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture) +export PATH=$JAVA_HOME/bin:$PATH + +if mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +IPADDR=$(hostname -i) + +"$STARROCKS_HOME/fe/bin/start_fe.sh" --daemon +"$STARROCKS_HOME/be/bin/start_be.sh" --daemon + +sleep 30 +mysql -h127.0.0.1 -P9030 -uroot \ + -e "ALTER SYSTEM ADD BACKEND '${IPADDR}:9050'" 2>/dev/null || true +sleep 30 diff --git a/starrocks/stop b/starrocks/stop new file mode 100755 index 0000000000..d34da09e35 --- /dev/null +++ b/starrocks/stop @@ -0,0 +1,6 @@ +#!/bin/bash + +SR_DIR=$(cat .sr_dir 2>/dev/null) || exit 0 +"$SR_DIR/fe/bin/stop_fe.sh" 2>/dev/null || true +"$SR_DIR/be/bin/stop_be.sh" 2>/dev/null || true +exit 0 diff --git a/supabase/benchmark.sh b/supabase/benchmark.sh index daeb9c2238..04ce4ca87d 100755 --- a/supabase/benchmark.sh +++ b/supabase/benchmark.sh @@ -12,7 +12,7 @@ sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y sudo apt-get update -y sudo apt-get install -y postgresql-$PGVERSION -../download-hits-tsv +../lib/download-hits-tsv psql ${SUPABASE_CONNECTION_STRING} -c 'CREATE DATABASE test' psql ${SUPABASE_CONNECTION_STRING} -t &1 | tee load_out.txt diff --git a/tablespace/benchmark.sh b/tablespace/benchmark.sh index c1a4dcc711..abdeeab267 100755 --- a/tablespace/benchmark.sh +++ b/tablespace/benchmark.sh @@ -6,7 +6,7 @@ PASSWORD="" sudo apt-get update -y sudo apt-get install -y postgresql-client -../download-hits-tsv +../lib/download-hits-tsv chmod 777 ~ hits.tsv psql "host=$HOSTNAME port=5432 dbname=csdb user=csuser password=$PASSWORD sslmode=require" < create.sql 2>&1 | tee load_out.txt diff --git a/tablespace/queries.sql b/tablespace/queries.sql index a5f4eccb25..31f65fc898 100644 --- a/tablespace/queries.sql +++ b/tablespace/queries.sql @@ -40,4 +40,4 @@ SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate > SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND 
DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; -SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; \ No newline at end of file +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/tembo-olap/README.md b/tembo-olap/README.md index 871cd64043..3359f283aa 100644 --- a/tembo-olap/README.md +++ b/tembo-olap/README.md @@ -1,3 +1,10 @@ +## Status + +Tembo's OLAP cloud stack has been discontinued — `cloud.tembo.io` no +longer resolves. The result under `results/20240209/` is preserved as a +historical data point and tagged `"historical"`; this benchmark cannot be +re-run as written. The instructions below are kept for reference. + # Instantiation 1) Create Account and Login to https://cloud.tembo.io diff --git a/tembo-olap/benchmark.sh b/tembo-olap/benchmark.sh index 212ee9c21d..98444c437f 100755 --- a/tembo-olap/benchmark.sh +++ b/tembo-olap/benchmark.sh @@ -6,7 +6,7 @@ PASSWORD="" sudo apt-get update -y sudo apt-get install -y postgresql-client -../download-hits-tsv +../lib/download-hits-tsv chmod 777 ~ hits.tsv psql postgresql://postgres:$PASSWORD@$HOSTNAME:5432 -t -c 'CREATE DATABASE test' diff --git a/tembo-olap/queries.sql b/tembo-olap/queries.sql index a5f4eccb25..31f65fc898 100644 --- a/tembo-olap/queries.sql +++ b/tembo-olap/queries.sql @@ -40,4 +40,4 @@ SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate > SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; -SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; \ No newline at end of file +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) 
ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/tembo-olap/results/20240209/tembo-olap-col-c6a.json b/tembo-olap/results/20240209/tembo-olap-col-c6a.json index 601aea5054..9484da8690 100644 --- a/tembo-olap/results/20240209/tembo-olap-col-c6a.json +++ b/tembo-olap/results/20240209/tembo-olap-col-c6a.json @@ -6,9 +6,9 @@ "proprietary": "no", "hardware": "cpu", "tuned": "no", - "comment": "", + "comment": "Historical: cloud.tembo.io is no longer operational; the OLAP stack on Tembo Cloud was discontinued.", - "tags": ["C", "PostgreSQL compatible", "column-oriented", "lukewarm-cold-run"], + "tags": ["C", "PostgreSQL compatible", "column-oriented", "lukewarm-cold-run", "historical"], "load_time": 4903, "data_size": 33864704000, diff --git a/tidb/benchmark.sh b/tidb/benchmark.sh index 0bcb3c1065..09d595b93d 100755 --- a/tidb/benchmark.sh +++ b/tidb/benchmark.sh @@ -1,126 +1,6 @@ #!/bin/bash - -shopt -s expand_aliases - -MODE="$1" -if [[ -z "$MODE" ]] -then - MODE=tiflash -fi - -TIDBVERSION=8.5.1 - -TIUP_HOME=$(pwd) -export TIUP_HOME -DB_NAME=test -TABLE_NAME=hits -DATA_DIR=/tmp/data - -if [[ ! $MODE =~ ^(tikv|tikv-tiflash|tiflash)$ ]]; then - echo "Unknown mode: '$MODE'. Expected one of 'tikv', 'tikv-tiflash', 'tiflash'" - exit 1 -fi - -sudo apt-get update -y -# TiUp installer depends on curl -sudo DEBIAN_FRONTEND=noninteractive apt-get install -y curl mysql-client -# Needs to be installed and setup for TiFlash; 2-107 corresponds to America/New_York -printf "2\n107\n" | sudo DEBIAN_FRONTEND=noninteractive apt-get install --reinstall tzdata - -wget --https-only --secure-protocol=TLSv1_2 --quiet --continue --progress=dot:giga https://tiup-mirrors.pingcap.com/install.sh -sudo chmod +x ./install.sh -./install.sh -PATH="$TIUP_HOME/bin/:$PATH" -export PATH - -tiup update --self && tiup update cluster - -if [[ $MODE == "tikv" ]]; then - echo "Running benchmark on TiKV only" - DB_CONFIG_FILE=./config/tidb-tikv.toml - NUM_TIFLASH_INSTANCES=0 -elif [[ $MODE == "tiflash" ]]; then - echo "Running benchmark on TiFlash only" - DB_CONFIG_FILE=./config/tidb-tiflash.toml - NUM_TIFLASH_INSTANCES=1 -fi; - -echo "Using configuration file $DB_CONFIG_FILE" -echo "Using $NUM_TIFLASH_INSTANCES TiFlash instances" - -nohup tiup playground $TIDBVERSION --db 1 --pd 1 --kv 1 --tiflash $NUM_TIFLASH_INSTANCES --db.config $DB_CONFIG_FILE --without-monitor > tiup-cluster.out 2>&1 & -while [ ! -f tiup-cluster.out ]; do sleep 1; done -# Might take a while because dependencies need to be downloaded -while ! grep -q 'TiDB Playground Cluster is started' tiup-cluster.out; do - echo "Cluster is not running yet. Checking again in 10 seconds..." - sleep 10 -done - -echo "Cluster is running!" -tiup playground display - -alias mysql="mysql --host 127.0.0.1 --port 4000 --connect-timeout 10800 -u root" - -# Deactivate query plan cache -# For details see https://docs.pingcap.com/tidb/v8.5/sql-non-prepared-plan-cache/ -mysql -e "SET GLOBAL tidb_enable_non_prepared_plan_cache = OFF;" - -rm -rf $DATA_DIR -mkdir $DATA_DIR -# File name must correspond to .. 
-wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.csv.gz' -O "$DATA_DIR/$DB_NAME.$TABLE_NAME.csv.gz" -gzip -d -f "$DATA_DIR/$DB_NAME.$TABLE_NAME.csv.gz" -chmod 444 "$DATA_DIR/$DB_NAME.$TABLE_NAME.csv" - -START=$(date +%s) - -mysql -e "DROP DATABASE IF EXISTS $DB_NAME;" -mysql -e "CREATE DATABASE $DB_NAME;" -mysql test < create.sql - -if [[ $MODE == "tiflash" || $MODE == "tikv-tiflash" ]]; then - echo "Enabling TiFlash" - mysql test -e "ALTER TABLE $TABLE_NAME SET TIFLASH REPLICA 1;" -fi; - -rm -rf /tmp/sorted-kv-dir -mkdir /tmp/sorted-kv-dir -nohup tiup tidb-lightning -config ./config/tidb-lightning.toml > tiup-tidb-lightning.out 2>&1 & -while [ ! -f tidb-lightning.log ]; do sleep 1; done -echo "Starting to check for completion on $(date +"%T")" -while ! grep -q 'the whole procedure completed' tidb-lightning.log; do - if grep -q 'tidb lightning exit.*finished=false' tidb-lightning.log || grep -q 'ERROR' tidb-lightning.log; then - echo "An error occurred during the import. Check the log file for details." - cat tiup-tidb-lightning.out - cat tidb-lightning.log - exit 1 - fi; - grep 'progress.*total' tidb-lightning.log | tail -n 1 - echo "Data loading is not done yet. Checking again in 10 seconds..." - sleep 10 -done - -echo "Data loading is done! Checking log file for time taken to load the data." -grep 'the whole procedure completed' tidb-lightning.log | sed -r -e 's/^.+\[takeTime=([0-9\.hms])+\].+?$/\1/' -command time -f '%e' mysql test -e "ANALYZE TABLE $TABLE_NAME;" - -END=$(date +%s) -echo "Load time: $(echo "$END - $START" | bc)" - -./run.sh 2>&1 | tee log.txt - -# Take storage size of TiKV for ALL modes into account, because directly loading data into TiFlash only is currently not supported -echo "Calculating storage size of TiKV in bytes..." -echo "Data size: " -mysql test -e "SELECT (DATA_LENGTH + INDEX_LENGTH) AS TIKV_STORAGE_SIZE_BYTES FROM information_schema.tables WHERE table_schema = '$DB_NAME' AND table_name = '$TABLE_NAME';" | tail -n1 - -if [[ $MODE == "tiflash" || $MODE == "tikv-tiflash" ]]; then - echo "Calculating additional storage size of TiFlash in bytes..." - echo "Data size: " - mysql test -e "SELECT TOTAL_SIZE AS TIFLASH_STORAGE_SIZE_BYTES FROM information_schema.tiflash_tables WHERE TIDB_DATABASE = '$DB_NAME' AND TIDB_TABLE = '$TABLE_NAME';" | tail -n1 -fi; - -grep -P 'rows? in set|Empty set|^ERROR' log.txt | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +# TiDB Lightning loads from .
.csv files; we use the CSV download. +export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/tidb/check b/tidb/check new file mode 100755 index 0000000000..3c3a15187f --- /dev/null +++ b/tidb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +mysql --host 127.0.0.1 --port 4000 -u root -e "SELECT 1" >/dev/null diff --git a/tidb/data-size b/tidb/data-size new file mode 100755 index 0000000000..d6f137bd52 --- /dev/null +++ b/tidb/data-size @@ -0,0 +1,19 @@ +#!/bin/bash +set -eu + +DB_NAME=test +TABLE_NAME=hits + +MYSQL="mysql --host 127.0.0.1 --port 4000 -u root --silent --skip-column-names" + +# TiKV storage (always present). +tikv_size=$($MYSQL test -e \ + "SELECT (DATA_LENGTH + INDEX_LENGTH) FROM information_schema.tables \ + WHERE table_schema = '$DB_NAME' AND table_name = '$TABLE_NAME';") + +# Optional TiFlash storage (may not exist if mode is tikv-only). +tiflash_size=$($MYSQL test -e \ + "SELECT IFNULL(SUM(TOTAL_SIZE), 0) FROM information_schema.tiflash_tables \ + WHERE TIDB_DATABASE = '$DB_NAME' AND TIDB_TABLE = '$TABLE_NAME';" 2>/dev/null || echo 0) + +awk -v a="$tikv_size" -v b="$tiflash_size" 'BEGIN { printf "%d\n", a + b }' diff --git a/tidb/install b/tidb/install new file mode 100755 index 0000000000..feb614280c --- /dev/null +++ b/tidb/install @@ -0,0 +1,32 @@ +#!/bin/bash +set -eu + +# Defaults match the original benchmark.sh; override via env if needed. +TIDB_MODE=${TIDB_MODE:-tiflash} +TIDBVERSION=${TIDBVERSION:-8.5.1} + +if [[ ! $TIDB_MODE =~ ^(tikv|tikv-tiflash|tiflash)$ ]]; then + echo "Unknown TIDB_MODE: '$TIDB_MODE'. Expected 'tikv', 'tikv-tiflash', or 'tiflash'." >&2 + exit 1 +fi + +sudo apt-get update -y +sudo DEBIAN_FRONTEND=noninteractive apt-get install -y curl mysql-client wget +# tzdata install (2-107 = America/New_York) is required for TiFlash. +printf "2\n107\n" | sudo DEBIAN_FRONTEND=noninteractive apt-get install --reinstall tzdata + +TIUP_HOME=$(pwd) +export TIUP_HOME + +if [ ! -x "$TIUP_HOME/bin/tiup" ]; then + wget --https-only --secure-protocol=TLSv1_2 --quiet --continue --progress=dot:giga \ + https://tiup-mirrors.pingcap.com/install.sh + chmod +x ./install.sh + ./install.sh +fi + +PATH="$TIUP_HOME/bin/:$PATH" +export PATH + +tiup update --self +tiup update cluster diff --git a/tidb/load b/tidb/load new file mode 100755 index 0000000000..f64d316237 --- /dev/null +++ b/tidb/load @@ -0,0 +1,50 @@ +#!/bin/bash +set -eu + +TIDB_MODE=${TIDB_MODE:-tiflash} +DB_NAME=test +TABLE_NAME=hits +DATA_DIR=/tmp/data + +TIUP_HOME=$(pwd) +export TIUP_HOME +PATH="$TIUP_HOME/bin/:$PATH" +export PATH + +MYSQL="mysql --host 127.0.0.1 --port 4000 --connect-timeout 10800 -u root" + +# Stage data file where TiDB Lightning expects it: .
.csv +rm -rf $DATA_DIR +mkdir $DATA_DIR +mv hits.csv "$DATA_DIR/$DB_NAME.$TABLE_NAME.csv" +chmod 444 "$DATA_DIR/$DB_NAME.$TABLE_NAME.csv" + +$MYSQL -e "DROP DATABASE IF EXISTS $DB_NAME;" +$MYSQL -e "CREATE DATABASE $DB_NAME;" +$MYSQL test < create.sql + +if [[ $TIDB_MODE == "tiflash" || $TIDB_MODE == "tikv-tiflash" ]]; then + $MYSQL test -e "ALTER TABLE $TABLE_NAME SET TIFLASH REPLICA 1;" +fi + +rm -rf /tmp/sorted-kv-dir +mkdir /tmp/sorted-kv-dir +rm -f tidb-lightning.log +nohup tiup tidb-lightning -config ./config/tidb-lightning.toml > tiup-tidb-lightning.out 2>&1 & +while [ ! -f tidb-lightning.log ]; do sleep 1; done + +while ! grep -q 'the whole procedure completed' tidb-lightning.log; do + if grep -q 'tidb lightning exit.*finished=false' tidb-lightning.log || grep -q 'ERROR' tidb-lightning.log; then + echo "Error during import:" >&2 + cat tiup-tidb-lightning.out >&2 + cat tidb-lightning.log >&2 + exit 1 + fi + grep 'progress.*total' tidb-lightning.log | tail -n 1 || true + sleep 10 +done + +$MYSQL test -e "ANALYZE TABLE $TABLE_NAME;" + +rm -rf $DATA_DIR +sync diff --git a/tidb/query b/tidb/query new file mode 100755 index 0000000000..915ad7f4b9 --- /dev/null +++ b/tidb/query @@ -0,0 +1,33 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via mysql client against TiDB on :4000. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(mysql --host 127.0.0.1 --port 4000 -u root test -vvv -e "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$out" | grep -q '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" + +timing=$(printf '%s\n' "$out" \ + | grep -P 'rows? in set|Empty set|Query OK' \ + | tail -n1 \ + | sed -r 's/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/') + +if [ -z "$timing" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi + +awk -v t="$timing" 'BEGIN { + n = split(t, a, " ") + if (n == 2 && a[1] != "") { printf "%.3f\n", a[1] * 60 + a[2] } + else { printf "%.3f\n", a[n] } +}' >&2 diff --git a/tidb/run.sh b/tidb/run.sh deleted file mode 100755 index 9b6d56803a..0000000000 --- a/tidb/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches - - for i in $(seq 1 $TRIES); do - mysql --host 127.0.0.1 --port 4000 -u root test -vvv -e "${query}" - done; -done; diff --git a/tidb/start b/tidb/start new file mode 100755 index 0000000000..ba73e94e31 --- /dev/null +++ b/tidb/start @@ -0,0 +1,56 @@ +#!/bin/bash +set -eu + +TIDB_MODE=${TIDB_MODE:-tiflash} +TIDBVERSION=${TIDBVERSION:-8.5.1} + +TIUP_HOME=$(pwd) +export TIUP_HOME +PATH="$TIUP_HOME/bin/:$PATH" +export PATH + +# Idempotent: if MySQL protocol on :4000 already responds, do nothing. +if mysql --host 127.0.0.1 --port 4000 -u root -e "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi + +if [[ $TIDB_MODE == "tikv" ]]; then + DB_CONFIG_FILE=./config/tidb-tikv.toml + NUM_TIFLASH_INSTANCES=0 +else + DB_CONFIG_FILE=./config/tidb-tiflash.toml + NUM_TIFLASH_INSTANCES=1 +fi + +nohup tiup playground "$TIDBVERSION" --db 1 --pd 1 --kv 1 \ + --tiflash $NUM_TIFLASH_INSTANCES \ + --db.config "$DB_CONFIG_FILE" \ + --without-monitor > tiup-cluster.out 2>&1 & + +while [ ! -f tiup-cluster.out ]; do sleep 1; done +while ! grep -q 'TiDB Playground Cluster is started' tiup-cluster.out; do + echo "Cluster is not running yet. Checking again in 10 seconds..." 
+ sleep 10 +done + +# Disable non-prepared plan cache (matches original benchmark behavior). +mysql --host 127.0.0.1 --port 4000 -u root \ + -e "SET GLOBAL tidb_enable_non_prepared_plan_cache = OFF;" + +# `TiDB Playground Cluster is started` fires when tidb itself is up, but +# tiflash registers later. ./load then runs `ALTER TABLE ... SET TIFLASH +# REPLICA 1` immediately, and that fails with +# the tiflash replica count: 1 should be less than the total tiflash server count: 0 +# if no tiflash store has joined yet. Poll information_schema.cluster_info +# (the canonical "what components does PD see" view) until at least one +# tiflash store shows up. 5 minutes is enough for a fresh tiup playground +# in practice. +if [[ "$TIDB_MODE" != "tikv" ]]; then + for _ in $(seq 1 60); do + cnt=$(mysql --host 127.0.0.1 --port 4000 -u root -B -N -e \ + "SELECT COUNT(*) FROM information_schema.cluster_info WHERE type = 'tiflash';" \ + 2>/dev/null || echo 0) + [ "$cnt" -ge 1 ] && break + sleep 5 + done +fi diff --git a/tidb/stop b/tidb/stop new file mode 100755 index 0000000000..512a56b588 --- /dev/null +++ b/tidb/stop @@ -0,0 +1,19 @@ +#!/bin/bash + +TIUP_HOME=$(pwd) +export TIUP_HOME +PATH="$TIUP_HOME/bin/:$PATH" +export PATH + +tiup playground display >/dev/null 2>&1 || exit 0 + +# tiup playground exposes no clean stop; kill the playground process group. +pids=$(pgrep -f 'tiup playground' || true) +if [ -n "$pids" ]; then + kill $pids 2>/dev/null || true + sleep 5 + pids=$(pgrep -f 'tiup playground' || true) + if [ -n "$pids" ]; then + kill -9 $pids 2>/dev/null || true + fi +fi diff --git a/timescaledb-no-columnstore/benchmark.sh b/timescaledb-no-columnstore/benchmark.sh index 4db746a843..531bd65038 100755 --- a/timescaledb-no-columnstore/benchmark.sh +++ b/timescaledb-no-columnstore/benchmark.sh @@ -1,45 +1,5 @@ #!/bin/bash - -# Install - -export DEBIAN_FRONTEND=noninteractive -sudo apt-get update -y -sudo apt-get install -y gnupg postgresql-common apt-transport-https lsb-release wget -sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y -sudo bash -c 'echo "deb https://packagecloud.io/timescale/timescaledb/ubuntu/ $(lsb_release -c -s) main" > /etc/apt/sources.list.d/timescaledb.list' -wget --quiet -O - https://packagecloud.io/timescale/timescaledb/gpgkey | sudo apt-key add - -sudo apt-get update -y -sudo apt-get install -y timescaledb-2-postgresql-17 postgresql-client-17 -sudo timescaledb-tune -yes -sudo systemctl restart postgresql - -sudo -u postgres psql -c "CREATE DATABASE nocolumnstore" -sudo -u postgres psql nocolumnstore -c "CREATE EXTENSION timescaledb WITH VERSION '2.17.2';" - -../download-hits-tsv -sudo chmod og+rX ~ -chmod 777 hits.tsv - -#import -sudo -u postgres psql nocolumnstore < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi -sudo -u postgres psql nocolumnstore -q -c "SELECT create_hypertable('hits', 'eventtime', chunk_time_interval => interval '3 day')" -sudo -u postgres psql nocolumnstore -q -c "CREATE INDEX ix_counterid ON hits (counterid)" -sudo -u postgres psql -c "ALTER DATABASE nocolumnstore SET work_mem TO '1GB';" -sudo -u postgres psql -c "ALTER DATABASE nocolumnstore SET min_parallel_table_scan_size TO '0';" - -echo -n "Load time: " -command time -f '%e' sudo -u postgres psql nocolumnstore -q -t -c "\\copy hits FROM 'hits.tsv'" -echo -n "Load time: " -command time -f '%e' sudo -u postgres psql nocolumnstore -q -t -c "vacuum freeze analyze hits;" - -echo -n "Data size: " -sudo -u postgres psql nocolumnstore -q -t -c "SELECT 
hypertable_size('hits');" - -./run.sh 2>&1 | tee log.txt - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/timescaledb-no-columnstore/check b/timescaledb-no-columnstore/check new file mode 100755 index 0000000000..5c6f711234 --- /dev/null +++ b/timescaledb-no-columnstore/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo -u postgres psql -t -c 'SELECT 1' >/dev/null diff --git a/timescaledb-no-columnstore/data-size b/timescaledb-no-columnstore/data-size new file mode 100755 index 0000000000..33aa229a0c --- /dev/null +++ b/timescaledb-no-columnstore/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +sudo -u postgres psql nocolumnstore -A -t -c "SELECT hypertable_size('hits');" diff --git a/timescaledb-no-columnstore/install b/timescaledb-no-columnstore/install new file mode 100755 index 0000000000..128c0820c4 --- /dev/null +++ b/timescaledb-no-columnstore/install @@ -0,0 +1,18 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} + +export DEBIAN_FRONTEND=noninteractive + +sudo apt-get update -y +sudo apt-get install -y gnupg postgresql-common apt-transport-https lsb-release wget +sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y +sudo bash -c 'echo "deb https://packagecloud.io/timescale/timescaledb/ubuntu/ $(lsb_release -c -s) main" > /etc/apt/sources.list.d/timescaledb.list' +wget --quiet -O - https://packagecloud.io/timescale/timescaledb/gpgkey | sudo apt-key add - + +sudo apt-get update -y +sudo apt-get install -y timescaledb-2-postgresql-$PGVERSION postgresql-client-$PGVERSION +sudo timescaledb-tune -yes + +sudo systemctl restart postgresql@$PGVERSION-main diff --git a/timescaledb-no-columnstore/load b/timescaledb-no-columnstore/load new file mode 100755 index 0000000000..a5c87ec6eb --- /dev/null +++ b/timescaledb-no-columnstore/load @@ -0,0 +1,22 @@ +#!/bin/bash +set -eu + +sudo chmod og+rX ~ +chmod 777 hits.tsv + +sudo -u postgres psql -t -c "DROP DATABASE IF EXISTS nocolumnstore" +sudo -u postgres psql -t -c "CREATE DATABASE nocolumnstore" +sudo -u postgres psql nocolumnstore -c "CREATE EXTENSION IF NOT EXISTS timescaledb;" + +sudo -u postgres psql -v ON_ERROR_STOP=1 nocolumnstore < create.sql + +sudo -u postgres psql -v ON_ERROR_STOP=1 nocolumnstore -q -c "SELECT create_hypertable('hits', 'eventtime', chunk_time_interval => interval '3 day')" +sudo -u postgres psql -v ON_ERROR_STOP=1 nocolumnstore -q -c "CREATE INDEX ix_counterid ON hits (counterid)" +sudo -u postgres psql -v ON_ERROR_STOP=1 -c "ALTER DATABASE nocolumnstore SET work_mem TO '1GB';" +sudo -u postgres psql -v ON_ERROR_STOP=1 -c "ALTER DATABASE nocolumnstore SET min_parallel_table_scan_size TO '0';" + +sudo -u postgres psql -v ON_ERROR_STOP=1 nocolumnstore -q -t -c "\\copy hits FROM 'hits.tsv'" +sudo -u postgres psql -v ON_ERROR_STOP=1 nocolumnstore -q -t -c "vacuum freeze analyze hits;" + +rm -f hits.tsv +sync diff --git a/timescaledb-no-columnstore/query b/timescaledb-no-columnstore/query new file mode 100755 index 0000000000..9be35a6364 --- /dev/null +++ b/timescaledb-no-columnstore/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the 
`nocolumnstore` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | sudo -u postgres psql nocolumnstore -t 2>&1) +status=$? + +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/timescaledb-no-columnstore/run.sh b/timescaledb-no-columnstore/run.sh deleted file mode 100755 index e87c0ae261..0000000000 --- a/timescaledb-no-columnstore/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - sudo -u postgres psql nocolumnstore -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/timescaledb-no-columnstore/start b/timescaledb-no-columnstore/start new file mode 100755 index 0000000000..941f213c51 --- /dev/null +++ b/timescaledb-no-columnstore/start @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} +sudo systemctl start postgresql@$PGVERSION-main diff --git a/timescaledb-no-columnstore/stop b/timescaledb-no-columnstore/stop new file mode 100755 index 0000000000..47969378d7 --- /dev/null +++ b/timescaledb-no-columnstore/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +PGVERSION=${PGVERSION:-17} +sudo systemctl stop postgresql@$PGVERSION-main || true diff --git a/timescaledb/benchmark.sh b/timescaledb/benchmark.sh index 266782bda5..531bd65038 100755 --- a/timescaledb/benchmark.sh +++ b/timescaledb/benchmark.sh @@ -1,54 +1,5 @@ #!/bin/bash - -# Install -export DEBIAN_FRONTEND=noninteractive -sudo apt-get update -y -sudo apt-get install -y gnupg postgresql-common apt-transport-https lsb-release wget -sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y -sudo bash -c 'echo "deb https://packagecloud.io/timescale/timescaledb/ubuntu/ $(lsb_release -c -s) main" > /etc/apt/sources.list.d/timescaledb.list' -wget --quiet -O - https://packagecloud.io/timescale/timescaledb/gpgkey | sudo apt-key add - -sudo apt-get update -y -sudo apt-get install -y timescaledb-2-postgresql-17 postgresql-client-17 -sudo timescaledb-tune -yes - -sudo systemctl restart postgresql - -sudo -u postgres psql -c "CREATE DATABASE test" -sudo -u postgres psql test -c "CREATE EXTENSION timescaledb WITH VERSION '2.17.2';" - -# Import the data -../download-hits-tsv -sudo chmod og+rX ~ -chmod 777 hits.tsv - -sudo -u postgres psql test < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi -sudo -u postgres psql test -c "SELECT create_hypertable('hits', 'eventtime', chunk_time_interval => interval '3 day', create_default_indexes => false)" -sudo -u postgres psql test -c "ALTER TABLE hits SET (timescaledb.compress, timescaledb.compress_segmentby = '', timescaledb.compress_orderby = 'counterid, userid, eventtime')" -sudo -u postgres psql test -c "ALTER DATABASE test SET timescaledb.enable_chunk_skipping to ON;" -sudo -u postgres psql -c "ALTER DATABASE test SET work_mem TO '1GB';" -sudo -u postgres psql -c "ALTER DATABASE test SET min_parallel_table_scan_size TO '0';" -sudo -u postgres psql test -c 
"SELECT enable_chunk_skipping('hits', 'counterid');" - -echo -n "Load time: " -command time -f '%e' sudo -u postgres psql test -t -c "\\copy hits FROM 'hits.tsv'" - -# See https://github.com/timescale/timescaledb/issues/4473#issuecomment-1167095245 -# https://docs.timescale.com/timescaledb/latest/how-to-guides/compression/manually-compress-chunks/#compress-chunks-manually -# TimescaleDB benchmark wihout compression is available in timescaledb no columnstore directory - -echo -n "Load time: " -command time -f '%e' sudo -u postgres psql test -q -c "SELECT compress_chunk(i, if_not_compressed => true) FROM show_chunks('hits') i" -echo -n "Load time: " -command time -f '%e' sudo -u postgres psql test -q -t -c "vacuum freeze analyze hits;" - -echo -n "Data size: " -sudo -u postgres psql test -q -c "\t" -c "SELECT hypertable_size('hits');" - -./run.sh 2>&1 | tee log.txt - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/timescaledb/check b/timescaledb/check new file mode 100755 index 0000000000..5c6f711234 --- /dev/null +++ b/timescaledb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo -u postgres psql -t -c 'SELECT 1' >/dev/null diff --git a/timescaledb/data-size b/timescaledb/data-size new file mode 100755 index 0000000000..5bf6f670e5 --- /dev/null +++ b/timescaledb/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +# Report the timescaledb hypertable's logical size in bytes. +sudo -u postgres psql test -A -t -c "SELECT hypertable_size('hits');" diff --git a/timescaledb/install b/timescaledb/install new file mode 100755 index 0000000000..84dda3f196 --- /dev/null +++ b/timescaledb/install @@ -0,0 +1,19 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} + +export DEBIAN_FRONTEND=noninteractive + +# PGDG repo for matching PG version + timescale repo for the extension. 
+sudo apt-get update -y +sudo apt-get install -y gnupg postgresql-common apt-transport-https lsb-release wget +sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y +sudo bash -c 'echo "deb https://packagecloud.io/timescale/timescaledb/ubuntu/ $(lsb_release -c -s) main" > /etc/apt/sources.list.d/timescaledb.list' +wget --quiet -O - https://packagecloud.io/timescale/timescaledb/gpgkey | sudo apt-key add - + +sudo apt-get update -y +sudo apt-get install -y timescaledb-2-postgresql-$PGVERSION postgresql-client-$PGVERSION +sudo timescaledb-tune -yes + +sudo systemctl restart postgresql@$PGVERSION-main diff --git a/timescaledb/load b/timescaledb/load new file mode 100755 index 0000000000..03917af0c5 --- /dev/null +++ b/timescaledb/load @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +sudo chmod og+rX ~ +chmod 777 hits.tsv + +sudo -u postgres psql -t -c "DROP DATABASE IF EXISTS test" +sudo -u postgres psql -t -c "CREATE DATABASE test" +sudo -u postgres psql test -c "CREATE EXTENSION IF NOT EXISTS timescaledb;" + +sudo -u postgres psql -v ON_ERROR_STOP=1 test < create.sql + +sudo -u postgres psql -v ON_ERROR_STOP=1 test -c "SELECT create_hypertable('hits', 'eventtime', chunk_time_interval => interval '3 day', create_default_indexes => false)" +sudo -u postgres psql -v ON_ERROR_STOP=1 test -c "ALTER TABLE hits SET (timescaledb.compress, timescaledb.compress_segmentby = '', timescaledb.compress_orderby = 'counterid, userid, eventtime')" +sudo -u postgres psql -v ON_ERROR_STOP=1 test -c "ALTER DATABASE test SET timescaledb.enable_chunk_skipping to ON;" +sudo -u postgres psql -v ON_ERROR_STOP=1 -c "ALTER DATABASE test SET work_mem TO '1GB';" +sudo -u postgres psql -v ON_ERROR_STOP=1 -c "ALTER DATABASE test SET min_parallel_table_scan_size TO '0';" +sudo -u postgres psql -v ON_ERROR_STOP=1 test -c "SELECT enable_chunk_skipping('hits', 'counterid');" + +sudo -u postgres psql -v ON_ERROR_STOP=1 test -t -c "\\copy hits FROM 'hits.tsv'" +sudo -u postgres psql -v ON_ERROR_STOP=1 test -q -c "SELECT compress_chunk(i, if_not_compressed => true) FROM show_chunks('hits') i" +sudo -u postgres psql -v ON_ERROR_STOP=1 test -q -t -c "vacuum freeze analyze hits;" + +rm -f hits.tsv +sync diff --git a/timescaledb/query b/timescaledb/query new file mode 100755 index 0000000000..cafe324f89 --- /dev/null +++ b/timescaledb/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the `test` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | sudo -u postgres psql test -t 2>&1) +status=$? 
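The columnstore load above compresses every chunk via compress_chunk(...) before the final vacuum/analyze. A quick manual spot-check after loading, using only calls that already appear in this patch (chunk count and the hypertable's post-compression size):

#!/bin/bash
# Manual sanity check after ./load; not part of the harness.
sudo -u postgres psql test -A -t -c "SELECT count(*) FROM show_chunks('hits');"
sudo -u postgres psql test -A -t -c "SELECT hypertable_size('hits');"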
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/timescaledb/results/20260509/c6a.4xlarge.json b/timescaledb/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..0a37747393 --- /dev/null +++ b/timescaledb/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "TimescaleDB", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C","PostgreSQL compatible","column-oriented","time-series","lukewarm-cold-run"], + "load_time": 2811, + "data_size": 19310886912, + "result": [ + [0.357, 0.032, 0.032], + [33.573, 0.159, 0.159], + [65.491, 0.259, 0.264], + [38.821, 0.195, 0.194], + [48.476, 10.282, 10.258], + [58.671, 15.16, 15.23], + [33.76, 0.161, 0.161], + [33.663, 0.163, 0.163], + [88.659, 19.034, 18.97], + [137.328, 22.926, 22.919], + [70.417, 1.812, 1.78], + [80.779, 2.349, 2.273], + [46.558, 6.069, 5.934], + [85.354, 8.014, 8], + [75.922, 6.89, 6.966], + [43.576, 13.175, 13.033], + [91.327, 25.226, 25.21], + [80.162, 12.822, 13.034], + [437.849, 108.446, 108.958], + [10.052, 0.078, 0.078], + [81.607, 1.402, 1.414], + [81.966, 1.418, 1.437], + [79.273, 1.545, 1.548], + [1.873, 0.072, 0.072], + [1.999, 0.039, 0.038], + [44.574, 0.39, 0.39], + [2.007, 0.038, 0.038], + [97.78, 6.353, 6.357], + [98.615, 62.102, 61.35], + [33.832, 11.642, 11.757], + [135.245, 7.516, 7.536], + [163.156, 13.618, 13.615], + [233.212, 131.648, 130.497], + [96.449, 28.097, 28.055], + [99.624, 32.986, 33.544], + [41.97, 12.488, 12.55], + [2.182, 0.436, 0.432], + [1.622, 0.111, 0.111], + [2.007, 0.052, 0.052], + [3.844, 0.64, 0.641], + [4.577, 0.057, 0.057], + [3.918, 0.044, 0.044], + [1.728, 0.082, 0.081] +] +} + diff --git a/timescaledb/run.sh b/timescaledb/run.sh deleted file mode 100755 index be1c9b661f..0000000000 --- a/timescaledb/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - sudo -u postgres psql test -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/timescaledb/start b/timescaledb/start new file mode 100755 index 0000000000..941f213c51 --- /dev/null +++ b/timescaledb/start @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} +sudo systemctl start postgresql@$PGVERSION-main diff --git a/timescaledb/stop b/timescaledb/stop new file mode 100755 index 0000000000..47969378d7 --- /dev/null +++ b/timescaledb/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +PGVERSION=${PGVERSION:-17} +sudo systemctl stop postgresql@$PGVERSION-main || true diff --git a/trino-datalake-partitioned/benchmark.sh b/trino-datalake-partitioned/benchmark.sh index 9152548630..7215fbe151 100755 --- a/trino-datalake-partitioned/benchmark.sh +++ b/trino-datalake-partitioned/benchmark.sh @@ -1,100 +1,6 @@ #!/bin/bash - -set -e - -# Trino's S3 client uses the AWS default credentials chain, which fails on -# anonymous public buckets, and Trino has no built-in flag for unsigned -# requests (see trinodb/trino#27512, PR #27758 closed unmerged). 
To read -# the public bucket we drop a tiny shim that returns AnonymousAWSCredentials -# into the legacy HDFS S3 plugin and point trino.s3.credentials-provider -# at it. - -sudo apt-get update -y -sudo apt-get install -y docker.io bc - -mkdir -p data/meta etc/catalog shim -# The Trino container runs as uid 1000 ("trino") and writes the file -# metastore into data/meta and the compiled credentials shim into shim. -# Make sure that uid can write here even when benchmark.sh runs as root -# (cloud-init). -sudo chown 1000:1000 data/meta shim - -cat > shim/S3AnonymousProvider.java <<'EOF' -import com.amazonaws.auth.AWSCredentials; -import com.amazonaws.auth.AWSCredentialsProvider; -import com.amazonaws.auth.AnonymousAWSCredentials; -import org.apache.hadoop.conf.Configuration; -import java.net.URI; - -public class S3AnonymousProvider implements AWSCredentialsProvider { - public S3AnonymousProvider(URI uri, Configuration conf) {} - public AWSCredentials getCredentials() { return new AnonymousAWSCredentials(); } - public void refresh() {} -} -EOF - -# Compile the shim against the AWS SDK + Hadoop jars bundled in the trino -# image. Target Java 11 bytecode for portability. -sudo docker run --rm \ - -v "$PWD/shim:/shim" \ - --entrypoint sh trinodb/trino:latest -c ' - set -e - cd /shim - CP="/usr/lib/trino/plugin/hive/hdfs/com.amazonaws_aws-java-sdk-core-1.12.797.jar:/usr/lib/trino/plugin/hive/hdfs/io.trino.hadoop_hadoop-apache-3.3.5-3.jar" - javac --release 11 -cp "$CP" S3AnonymousProvider.java - jar cf S3AnonymousProvider.jar S3AnonymousProvider.class - ' - -cat > etc/catalog/hive.properties <<'EOF' -connector.name=hive -hive.metastore=file -hive.metastore.catalog.dir=local:///meta -local.location=/data -fs.native-local.enabled=true -fs.hadoop.enabled=true -hive.config.resources=/etc/trino/core-site.xml -hive.non-managed-table-writes-enabled=true -EOF - -cat > etc/core-site.xml <<'EOF' - - - - trino.s3.credentials-provider - S3AnonymousProvider - - - trino.s3.endpoint - https://s3.eu-central-1.amazonaws.com - - - trino.s3.region - eu-central-1 - - -EOF - -sudo docker rm -f trino 2>/dev/null || true -sudo docker run -d --name trino \ - -p 8080:8080 \ - -v "$PWD/etc/catalog/hive.properties:/etc/trino/catalog/hive.properties:ro" \ - -v "$PWD/etc/core-site.xml:/etc/trino/core-site.xml:ro" \ - -v "$PWD/data/meta:/data/meta" \ - -v "$PWD/shim/S3AnonymousProvider.jar:/usr/lib/trino/plugin/hive/hdfs/S3AnonymousProvider.jar:ro" \ - trinodb/trino:latest - -until sudo docker logs trino 2>&1 | grep -q "SERVER STARTED"; do - sleep 3 -done -sleep 3 - -LOAD_START=$(date +%s) -sudo docker cp create.sql trino:/tmp/create.sql -sudo docker exec -i trino trino --file /tmp/create.sql -LOAD_END=$(date +%s) - -./run.sh 2>&1 | tee log.txt - -echo "Load time: $((LOAD_END - LOAD_START))" -# Data is read from S3 on demand; published partitioned size is ~14.7 GB. -echo "Data size: 14737666736" +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Datalake variant: Parquet is read directly from public S3, no download. 
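Every rewritten benchmark.sh is now this same shim: it exports BENCH_DOWNLOAD_SCRIPT (empty for the data-lake variants, which read Parquet straight from S3) plus BENCH_RESTARTABLE and execs the shared driver. lib/benchmark-common.sh itself is not part of this diff, so the following is only a guess at its flow around the per-system install / start / check / load / data-size / query scripts; every detail, including how BENCH_RESTARTABLE is consumed, is assumed.

#!/bin/bash
# Hypothetical outline of lib/benchmark-common.sh (structure assumed).
# Exec'd from the per-system directory, so ./install etc. resolve there.
set -e

./install

# An empty BENCH_DOWNLOAD_SCRIPT means "no local dataset needed".
if [ -n "${BENCH_DOWNLOAD_SCRIPT:-}" ]; then
    "../lib/$BENCH_DOWNLOAD_SCRIPT"
fi

./start
./check                         # in practice retried until it succeeds; see the usage sketch further down

echo -n "Load time: "
command time -f '%e' ./load

# One plausible reading of BENCH_RESTARTABLE: restart after load so the
# first try of each query is a genuinely cold run.
if [ "${BENCH_RESTARTABLE:-no}" = "yes" ]; then
    ./stop
    ./start
fi

echo -n "Data size: "
./data-size

# ...then run each line of queries.sql through ./query (see the loop
# sketched earlier) and collect the timing triples.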
+export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/trino-datalake-partitioned/check b/trino-datalake-partitioned/check new file mode 100755 index 0000000000..693af38ac4 --- /dev/null +++ b/trino-datalake-partitioned/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +out=$(curl -sf http://localhost:8080/v1/info) +echo "$out" | grep -q '"starting":false' diff --git a/trino-datalake-partitioned/data-size b/trino-datalake-partitioned/data-size new file mode 100755 index 0000000000..03827db0ee --- /dev/null +++ b/trino-datalake-partitioned/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Data is read from S3 on demand; published partitioned size is ~14.7 GB. +echo 14737666736 diff --git a/trino-datalake-partitioned/install b/trino-datalake-partitioned/install new file mode 100755 index 0000000000..cd6fd35403 --- /dev/null +++ b/trino-datalake-partitioned/install @@ -0,0 +1,77 @@ +#!/bin/bash +set -e + +# Trino's S3 client uses the AWS default credentials chain, which fails on +# anonymous public buckets, and Trino has no built-in flag for unsigned +# requests (see trinodb/trino#27512, PR #27758 closed unmerged). To read +# the public bucket we drop a tiny shim that returns AnonymousAWSCredentials +# into the legacy HDFS S3 plugin and point trino.s3.credentials-provider +# at it. + +if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io +fi +sudo apt-get install -y bc + +sudo docker pull trinodb/trino:latest + +mkdir -p data/meta etc/catalog shim +sudo chown 1000:1000 data/meta shim + +cat > shim/S3AnonymousProvider.java <<'EOF' +import com.amazonaws.auth.AWSCredentials; +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.AnonymousAWSCredentials; +import org.apache.hadoop.conf.Configuration; +import java.net.URI; + +public class S3AnonymousProvider implements AWSCredentialsProvider { + public S3AnonymousProvider(URI uri, Configuration conf) {} + public AWSCredentials getCredentials() { return new AnonymousAWSCredentials(); } + public void refresh() {} +} +EOF + +# Compile the shim against the AWS SDK + Hadoop jars bundled in the trino +# image. Target Java 11 bytecode for portability. Skip if already built. +if [ ! -f shim/S3AnonymousProvider.jar ]; then + sudo docker run --rm \ + -v "$PWD/shim:/shim" \ + --entrypoint sh trinodb/trino:latest -c ' + set -e + cd /shim + CP="/usr/lib/trino/plugin/hive/hdfs/com.amazonaws_aws-java-sdk-core-1.12.797.jar:/usr/lib/trino/plugin/hive/hdfs/io.trino.hadoop_hadoop-apache-3.3.5-3.jar" + javac --release 11 -cp "$CP" S3AnonymousProvider.java + jar cf S3AnonymousProvider.jar S3AnonymousProvider.class + ' +fi + +cat > etc/catalog/hive.properties <<'EOF' +connector.name=hive +hive.metastore=file +hive.metastore.catalog.dir=local:///meta +local.location=/data +fs.native-local.enabled=true +fs.hadoop.enabled=true +hive.config.resources=/etc/trino/core-site.xml +hive.non-managed-table-writes-enabled=true +EOF + +cat > etc/core-site.xml <<'EOF' + + + + trino.s3.credentials-provider + S3AnonymousProvider + + + trino.s3.endpoint + https://s3.eu-central-1.amazonaws.com + + + trino.s3.region + eu-central-1 + + +EOF diff --git a/trino-datalake-partitioned/load b/trino-datalake-partitioned/load new file mode 100755 index 0000000000..e44ca0191f --- /dev/null +++ b/trino-datalake-partitioned/load @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# Schema-only load: Parquet data is read directly from S3 on demand. 
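The check script above replaces the old wait for "SERVER STARTED" in the container logs: it asks Trino's /v1/info endpoint and succeeds only once the coordinator reports "starting":false. Waiting for readiness then reduces to retrying it, for example:

#!/bin/bash
# Poll ./check until Trino answers; the interval and the limit are arbitrary.
for _ in $(seq 1 60); do
    if ./check >/dev/null 2>&1; then
        echo "Trino is ready"
        exit 0
    fi
    sleep 3
done
echo "Trino did not become ready in time" >&2
exit 1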
+sudo docker cp create.sql trino:/tmp/create.sql +sudo docker exec -i trino trino --file /tmp/create.sql + +sync diff --git a/trino-datalake-partitioned/query b/trino-datalake-partitioned/query new file mode 100755 index 0000000000..0d7e70cc23 --- /dev/null +++ b/trino-datalake-partitioned/query @@ -0,0 +1,16 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via the trino CLI in the running +# container. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +start=$(date +%s.%N) +sudo docker exec -i trino trino --catalog hive --schema clickbench \ + --output-format=NULL --execute "$query" +end=$(date +%s.%N) + +awk -v s="$start" -v e="$end" 'BEGIN { printf "%.3f\n", e - s }' >&2 diff --git a/trino-datalake-partitioned/results/20260509/c6a.4xlarge.json b/trino-datalake-partitioned/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..77af8b3587 --- /dev/null +++ b/trino-datalake-partitioned/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Trino (data lake, partitioned)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","column-oriented","stateless","lukewarm-cold-run"], + "load_time": 2, + "data_size": 14737666736, + "result": [ + [4.422, 1.552, 1.464], + [5.146, 2.064, 1.987], + [5.516, 2.143, 2.048], + [5.383, 1.956, 1.808], + [6.44, 2.384, 2.258], + [7.718, 3.521, 3.273], + [5.073, 1.926, 1.846], + [5.251, 2.03, 2.056], + [7.387, 3.187, 2.845], + [11.433, 5.757, 5.695], + [6.388, 2.58, 2.504], + [6.499, 2.614, 2.492], + [8.628, 3.9, 3.701], + [11.345, 5.533, 5.225], + [9.103, 3.81, 3.874], + [7.474, 2.883, 2.723], + [11.142, 6.342, 5.848], + [10.61, 5.842, 5.605], + [15.529, 9.929, 9.957], + [5.475, 2.041, 1.929], + [8.986, 4.817, 4.588], + [8.854, 4.997, 4.74], + [12.812, 8.095, 7.742], + [20.232, 15.337, 15.35], + [6.594, 2.612, 2.551], + [6.126, 2.257, 2.206], + [6.687, 2.745, 2.633], + [9.408, 5.088, 4.69], + [21.269, 15.943, 15.295], + [13.14, 7.878, 7.524], + [9.151, 3.555, 3.247], + [10.535, 4.871, 3.828], + [20.827, 14.825, 14.407], + [19.967, 14.683, 14.677], + [20.512, 14.698, 14.246], + [8.992, 4.036, 3.938], + [5.356, 2.043, 1.938], + [5.026, 1.866, 1.783], + [5.16, 1.829, 1.73], + [5.931, 2.425, 2.318], + [5.09, 1.703, 1.672], + [4.971, 1.694, 1.767], + [4.964, 1.637, 1.617] +] +} + diff --git a/trino-datalake-partitioned/results/20260510/c6a.metal.json b/trino-datalake-partitioned/results/20260510/c6a.metal.json new file mode 100644 index 0000000000..e332fbf7d7 --- /dev/null +++ b/trino-datalake-partitioned/results/20260510/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Trino (data lake, partitioned)", + "date": "2026-05-10", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","column-oriented","stateless","lukewarm-cold-run"], + "load_time": 2, + "data_size": 14737666736, + "result": [ + [5.224, 1.354, 1.301], + [6.032, 1.488, 1.402], + [6.039, 1.847, 1.473], + [5.972, 1.762, 1.52], + [8.093, 1.818, 1.704], + [8.841, 2.623, 2.392], + [5.736, 1.457, 1.632], + [5.954, 1.572, 1.418], + [8.159, 2.409, 1.939], + [11.549, 3.347, 3.256], + [6.728, 1.715, 1.621], + [6.919, 2.274, 1.558], + [9.462, 2.535, 2.434], + [11.99, 4.737, 4.514], + [9.599, 2.713, 2.357], + [8.853, 2.674, 2.217], + [9.958, 3.945, 3.592], + [10.701, 3.653, 3.454], + [14.022, 5.825, 5.907], + [6.859, 
1.816, 1.412], + [7.808, 2.616, 2.842], + [8.666, 2.996, 2.7], + [9.066, 3.825, 3.688], + [11.062, 6.392, 5.988], + [6.911, 1.658, 1.652], + [6.557, 1.585, 1.478], + [6.948, 1.863, 1.923], + [7.805, 2.705, 2.731], + [12.575, 6.436, 5.861], + [12.128, 3.627, 2.845], + [8.842, 2.707, 2.391], + [10.298, 2.936, 2.781], + [19.794, 10.704, 12.283], + [18.302, 10.621, 11.14], + [19.197, 10.017, 11.623], + [8.769, 2.447, 2.279], + [6.621, 2.17, 2.035], + [6.018, 1.9, 1.699], + [6.295, 1.858, 1.821], + [6.532, 2.417, 2.296], + [5.866, 1.733, 1.579], + [6.379, 1.769, 1.63], + [5.837, 1.593, 1.52] +] +} + diff --git a/trino-datalake-partitioned/run.sh b/trino-datalake-partitioned/run.sh deleted file mode 100755 index f02cae3cfd..0000000000 --- a/trino-datalake-partitioned/run.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 - -mapfile -t QUERIES < queries.sql - -for query in "${QUERIES[@]}"; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - START=$(date +%s.%N) - sudo docker exec -i trino trino --catalog hive --schema clickbench \ - --output-format=NULL --execute "${query}" >/dev/null 2>&1 - EXIT=$? - END=$(date +%s.%N) - if [ "$EXIT" = "0" ]; then - ELAPSED=$(echo "$END - $START" | bc) - printf "%.3f" "$ELAPSED" - else - printf "null" - fi - [[ "$i" != "$TRIES" ]] && echo -n ", " - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/trino-datalake-partitioned/start b/trino-datalake-partitioned/start new file mode 100755 index 0000000000..78c6d033f7 --- /dev/null +++ b/trino-datalake-partitioned/start @@ -0,0 +1,18 @@ +#!/bin/bash +set -e + +if sudo docker ps --format '{{.Names}}' | grep -qx trino; then + exit 0 +fi +if sudo docker ps -a --format '{{.Names}}' | grep -qx trino; then + sudo docker start trino + exit 0 +fi + +sudo docker run -d --name trino \ + -p 8080:8080 \ + -v "$PWD/etc/catalog/hive.properties:/etc/trino/catalog/hive.properties:ro" \ + -v "$PWD/etc/core-site.xml:/etc/trino/core-site.xml:ro" \ + -v "$PWD/data/meta:/data/meta" \ + -v "$PWD/shim/S3AnonymousProvider.jar:/usr/lib/trino/plugin/hive/hdfs/S3AnonymousProvider.jar:ro" \ + trinodb/trino:latest diff --git a/trino-datalake-partitioned/stop b/trino-datalake-partitioned/stop new file mode 100755 index 0000000000..956100aa20 --- /dev/null +++ b/trino-datalake-partitioned/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +sudo docker stop trino 2>/dev/null || true +exit 0 diff --git a/trino-datalake/benchmark.sh b/trino-datalake/benchmark.sh index bb891b353c..7215fbe151 100755 --- a/trino-datalake/benchmark.sh +++ b/trino-datalake/benchmark.sh @@ -1,100 +1,6 @@ #!/bin/bash - -set -e - -# Trino's S3 client uses the AWS default credentials chain, which fails on -# anonymous public buckets, and Trino has no built-in flag for unsigned -# requests (see trinodb/trino#27512, PR #27758 closed unmerged). To read -# the public bucket we drop a tiny shim that returns AnonymousAWSCredentials -# into the legacy HDFS S3 plugin and point trino.s3.credentials-provider -# at it. - -sudo apt-get update -y -sudo apt-get install -y docker.io bc - -mkdir -p data/meta etc/catalog shim -# The Trino container runs as uid 1000 ("trino") and writes the file -# metastore into data/meta and the compiled credentials shim into shim. -# Make sure that uid can write here even when benchmark.sh runs as root -# (cloud-init). 
-sudo chown 1000:1000 data/meta shim - -cat > shim/S3AnonymousProvider.java <<'EOF' -import com.amazonaws.auth.AWSCredentials; -import com.amazonaws.auth.AWSCredentialsProvider; -import com.amazonaws.auth.AnonymousAWSCredentials; -import org.apache.hadoop.conf.Configuration; -import java.net.URI; - -public class S3AnonymousProvider implements AWSCredentialsProvider { - public S3AnonymousProvider(URI uri, Configuration conf) {} - public AWSCredentials getCredentials() { return new AnonymousAWSCredentials(); } - public void refresh() {} -} -EOF - -# Compile the shim against the AWS SDK + Hadoop jars bundled in the trino -# image. Target Java 11 bytecode for portability. -sudo docker run --rm \ - -v "$PWD/shim:/shim" \ - --entrypoint sh trinodb/trino:latest -c ' - set -e - cd /shim - CP="/usr/lib/trino/plugin/hive/hdfs/com.amazonaws_aws-java-sdk-core-1.12.797.jar:/usr/lib/trino/plugin/hive/hdfs/io.trino.hadoop_hadoop-apache-3.3.5-3.jar" - javac --release 11 -cp "$CP" S3AnonymousProvider.java - jar cf S3AnonymousProvider.jar S3AnonymousProvider.class - ' - -cat > etc/catalog/hive.properties <<'EOF' -connector.name=hive -hive.metastore=file -hive.metastore.catalog.dir=local:///meta -local.location=/data -fs.native-local.enabled=true -fs.hadoop.enabled=true -hive.config.resources=/etc/trino/core-site.xml -hive.non-managed-table-writes-enabled=true -EOF - -cat > etc/core-site.xml <<'EOF' - - - - trino.s3.credentials-provider - S3AnonymousProvider - - - trino.s3.endpoint - https://s3.eu-central-1.amazonaws.com - - - trino.s3.region - eu-central-1 - - -EOF - -sudo docker rm -f trino 2>/dev/null || true -sudo docker run -d --name trino \ - -p 8080:8080 \ - -v "$PWD/etc/catalog/hive.properties:/etc/trino/catalog/hive.properties:ro" \ - -v "$PWD/etc/core-site.xml:/etc/trino/core-site.xml:ro" \ - -v "$PWD/data/meta:/data/meta" \ - -v "$PWD/shim/S3AnonymousProvider.jar:/usr/lib/trino/plugin/hive/hdfs/S3AnonymousProvider.jar:ro" \ - trinodb/trino:latest - -until sudo docker logs trino 2>&1 | grep -q "SERVER STARTED"; do - sleep 3 -done -sleep 3 - -LOAD_START=$(date +%s) -sudo docker cp create.sql trino:/tmp/create.sql -sudo docker exec -i trino trino --file /tmp/create.sql -LOAD_END=$(date +%s) - -./run.sh 2>&1 | tee log.txt - -echo "Load time: $((LOAD_END - LOAD_START))" -# Data is read from S3 on demand; report the published single-file size. -echo "Data size: 14779976446" +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Datalake variant: Parquet is read directly from public S3, no download. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/trino-datalake/check b/trino-datalake/check new file mode 100755 index 0000000000..693af38ac4 --- /dev/null +++ b/trino-datalake/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +out=$(curl -sf http://localhost:8080/v1/info) +echo "$out" | grep -q '"starting":false' diff --git a/trino-datalake/data-size b/trino-datalake/data-size new file mode 100755 index 0000000000..8a280d60f8 --- /dev/null +++ b/trino-datalake/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Data is read from S3 on demand; report the published single-file size. 
+echo 14779976446
diff --git a/trino-datalake/install b/trino-datalake/install
new file mode 100755
index 0000000000..cd6fd35403
--- /dev/null
+++ b/trino-datalake/install
@@ -0,0 +1,77 @@
+#!/bin/bash
+set -e
+
+# Trino's S3 client uses the AWS default credentials chain, which fails on
+# anonymous public buckets, and Trino has no built-in flag for unsigned
+# requests (see trinodb/trino#27512, PR #27758 closed unmerged). To read
+# the public bucket we drop a tiny shim that returns AnonymousAWSCredentials
+# into the legacy HDFS S3 plugin and point trino.s3.credentials-provider
+# at it.
+
+if ! command -v docker >/dev/null 2>&1; then
+    sudo apt-get update -y
+    sudo apt-get install -y docker.io
+fi
+sudo apt-get install -y bc
+
+sudo docker pull trinodb/trino:latest
+
+mkdir -p data/meta etc/catalog shim
+sudo chown 1000:1000 data/meta shim
+
+cat > shim/S3AnonymousProvider.java <<'EOF'
+import com.amazonaws.auth.AWSCredentials;
+import com.amazonaws.auth.AWSCredentialsProvider;
+import com.amazonaws.auth.AnonymousAWSCredentials;
+import org.apache.hadoop.conf.Configuration;
+import java.net.URI;
+
+public class S3AnonymousProvider implements AWSCredentialsProvider {
+    public S3AnonymousProvider(URI uri, Configuration conf) {}
+    public AWSCredentials getCredentials() { return new AnonymousAWSCredentials(); }
+    public void refresh() {}
+}
+EOF
+
+# Compile the shim against the AWS SDK + Hadoop jars bundled in the trino
+# image. Target Java 11 bytecode for portability. Skip if already built.
+if [ ! -f shim/S3AnonymousProvider.jar ]; then
+    sudo docker run --rm \
+        -v "$PWD/shim:/shim" \
+        --entrypoint sh trinodb/trino:latest -c '
+        set -e
+        cd /shim
+        CP="/usr/lib/trino/plugin/hive/hdfs/com.amazonaws_aws-java-sdk-core-1.12.797.jar:/usr/lib/trino/plugin/hive/hdfs/io.trino.hadoop_hadoop-apache-3.3.5-3.jar"
+        javac --release 11 -cp "$CP" S3AnonymousProvider.java
+        jar cf S3AnonymousProvider.jar S3AnonymousProvider.class
+    '
+fi
+
+cat > etc/catalog/hive.properties <<'EOF'
+connector.name=hive
+hive.metastore=file
+hive.metastore.catalog.dir=local:///meta
+local.location=/data
+fs.native-local.enabled=true
+fs.hadoop.enabled=true
+hive.config.resources=/etc/trino/core-site.xml
+hive.non-managed-table-writes-enabled=true
+EOF
+
+cat > etc/core-site.xml <<'EOF'
+<configuration>
+    <property>
+        <name>trino.s3.credentials-provider</name>
+        <value>S3AnonymousProvider</value>
+    </property>
+    <property>
+        <name>trino.s3.endpoint</name>
+        <value>https://s3.eu-central-1.amazonaws.com</value>
+    </property>
+    <property>
+        <name>trino.s3.region</name>
+        <value>eu-central-1</value>
+    </property>
+</configuration>
+EOF
diff --git a/trino-datalake/load b/trino-datalake/load
new file mode 100755
index 0000000000..e44ca0191f
--- /dev/null
+++ b/trino-datalake/load
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -e
+
+# Schema-only load: Parquet data is read directly from S3 on demand.
+sudo docker cp create.sql trino:/tmp/create.sql
+sudo docker exec -i trino trino --file /tmp/create.sql
+
+sync
diff --git a/trino-datalake/query b/trino-datalake/query
new file mode 100755
index 0000000000..0d7e70cc23
--- /dev/null
+++ b/trino-datalake/query
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Reads a SQL query from stdin, runs it via the trino CLI in the running
+# container.
+# Stdout: query result.
+# Stderr: query runtime in fractional seconds on the last line.
+# Exit non-zero on error.
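As a usage illustration of the contract above (a hypothetical caller, not part of this diff): stdout is discarded and the last stderr line is kept as the runtime, e.g.

    elapsed=$(echo "SELECT COUNT(*) FROM hits" | ./query 2>&1 >/dev/null | tail -n 1)
    echo "query took ${elapsed}s"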
+set -e + +query=$(cat) + +start=$(date +%s.%N) +sudo docker exec -i trino trino --catalog hive --schema clickbench \ + --output-format=NULL --execute "$query" +end=$(date +%s.%N) + +awk -v s="$start" -v e="$end" 'BEGIN { printf "%.3f\n", e - s }' >&2 diff --git a/trino-datalake/results/20260509/c6a.4xlarge.json b/trino-datalake/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..6bad2c3c76 --- /dev/null +++ b/trino-datalake/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Trino (data lake, single)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","column-oriented","stateless","lukewarm-cold-run"], + "load_time": 2, + "data_size": 14779976446, + "result": [ + [6.424, 3.186, 2.948], + [7.524, 3.422, 3.263], + [8.082, 3.23, 3.2], + [7.661, 3.499, 2.978], + [8.857, 4.088, 3.656], + [10.304, 4.793, 4.582], + [7.63, 3.514, 3.246], + [7.517, 3.287, 3.031], + [9.923, 4.695, 4.561], + [12.998, 7.273, 6.888], + [8.443, 3.729, 3.319], + [8.492, 3.632, 3.347], + [10.897, 4.985, 4.472], + [13.166, 7.052, 6.701], + [10.761, 4.975, 4.649], + [9.783, 4.216, 3.896], + [13.46, 7.686, 7.611], + [12.956, 7.447, 6.902], + [17.799, 11.627, 11.341], + [7.558, 3.365, 3.188], + [11.175, 5.821, 5.747], + [11.029, 5.981, 5.64], + [14.355, 8.939, 8.693], + [22.13, 16.554, 16.58], + [8.788, 3.808, 3.583], + [8.196, 3.685, 3.273], + [8.718, 3.797, 3.593], + [10.935, 5.653, 5.519], + [22.636, 17.381, 16.709], + [14.178, 8.507, 8.274], + [10.731, 4.845, 4.175], + [11.807, 5.239, 5.072], + [23.236, 15.875, 15.808], + [21.913, 16.807, 15.905], + [21.868, 16.235, 16.058], + [10.68, 5.139, 4.913], + [7.908, 3.182, 3.425], + [7.611, 3.084, 2.932], + [7.752, 3.158, 2.972], + [8.425, 3.569, 3.533], + [7.586, 3.078, 3.097], + [7.516, 3.181, 2.893], + [7.335, 3.156, 2.929] +] +} + diff --git a/trino-datalake/results/20260510/c6a.metal.json b/trino-datalake/results/20260510/c6a.metal.json new file mode 100644 index 0000000000..d73cf7f657 --- /dev/null +++ b/trino-datalake/results/20260510/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Trino (data lake, single)", + "date": "2026-05-10", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","column-oriented","stateless","lukewarm-cold-run"], + "load_time": 2, + "data_size": 14779976446, + "result": [ + [6.349, 1.696, 1.927], + [6.766, 1.904, 1.915], + [7.648, 2.192, 1.818], + [6.991, 1.917, 1.611], + [9.291, 2.823, 1.916], + [10.167, 2.833, 2.5], + [7.105, 1.999, 1.619], + [8.002, 2.331, 1.885], + [9.566, 2.475, 2.239], + [12.537, 4.311, 3.91], + [8.119, 2.088, 1.899], + [8.379, 2.09, 1.919], + [11.732, 2.981, 2.623], + [12.841, 5.095, 4.566], + [10.54, 3.017, 2.577], + [11.157, 3.438, 2.387], + [13.048, 4.192, 3.923], + [10.815, 4.017, 3.638], + [14.259, 6.662, 6.376], + [8.287, 1.916, 1.67], + [9.19, 2.871, 2.97], + [8.197, 3.585, 2.996], + [10.058, 4.021, 3.642], + [13.003, 6.674, 5.982], + [8.238, 2.03, 2.102], + [7.975, 2.055, 2.174], + [7.718, 2.06, 2.107], + [8.369, 3.01, 3.113], + [14.257, 6.091, 7.263], + [12.059, 4.012, 3.339], + [10.541, 2.645, 2.462], + [12.766, 3.277, 3.035], + [19.446, 10.776, 10.902], + [17.465, 10.854, 11.342], + [18.999, 11.45, 12.65], + [10.748, 2.828, 2.503], + [8.36, 2.651, 2.39], + [7.408, 1.934, 1.959], + [7.8, 2.367, 2.131], + [9.375, 2.872, 2.33], + [7.49, 2.391, 1.865], + [7.314, 2.201, 1.902], + [6.925, 2.274, 1.803] +] +} + 
diff --git a/trino-datalake/run.sh b/trino-datalake/run.sh deleted file mode 100755 index f02cae3cfd..0000000000 --- a/trino-datalake/run.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 - -mapfile -t QUERIES < queries.sql - -for query in "${QUERIES[@]}"; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - START=$(date +%s.%N) - sudo docker exec -i trino trino --catalog hive --schema clickbench \ - --output-format=NULL --execute "${query}" >/dev/null 2>&1 - EXIT=$? - END=$(date +%s.%N) - if [ "$EXIT" = "0" ]; then - ELAPSED=$(echo "$END - $START" | bc) - printf "%.3f" "$ELAPSED" - else - printf "null" - fi - [[ "$i" != "$TRIES" ]] && echo -n ", " - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/trino-datalake/start b/trino-datalake/start new file mode 100755 index 0000000000..78c6d033f7 --- /dev/null +++ b/trino-datalake/start @@ -0,0 +1,18 @@ +#!/bin/bash +set -e + +if sudo docker ps --format '{{.Names}}' | grep -qx trino; then + exit 0 +fi +if sudo docker ps -a --format '{{.Names}}' | grep -qx trino; then + sudo docker start trino + exit 0 +fi + +sudo docker run -d --name trino \ + -p 8080:8080 \ + -v "$PWD/etc/catalog/hive.properties:/etc/trino/catalog/hive.properties:ro" \ + -v "$PWD/etc/core-site.xml:/etc/trino/core-site.xml:ro" \ + -v "$PWD/data/meta:/data/meta" \ + -v "$PWD/shim/S3AnonymousProvider.jar:/usr/lib/trino/plugin/hive/hdfs/S3AnonymousProvider.jar:ro" \ + trinodb/trino:latest diff --git a/trino-datalake/stop b/trino-datalake/stop new file mode 100755 index 0000000000..956100aa20 --- /dev/null +++ b/trino-datalake/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +sudo docker stop trino 2>/dev/null || true +exit 0 diff --git a/trino-partitioned/benchmark.sh b/trino-partitioned/benchmark.sh index f5d23ffc8c..6a7f45d3a1 100755 --- a/trino-partitioned/benchmark.sh +++ b/trino-partitioned/benchmark.sh @@ -1,59 +1,5 @@ #!/bin/bash - -set -e - -# Install Docker (Trino's official image bundles its own JRE). -sudo apt-get update -y -sudo apt-get install -y docker.io bc - -# Download the partitioned dataset (100 parquet files). -mkdir -p data/hits -cd data/hits -seq 0 99 | xargs -P16 -I{} wget --continue --quiet \ - "https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet" -cd ../.. - -# The Trino container runs as uid 1000 ("trino"), and writes the file -# metastore into this directory. Make sure that uid can write here even -# when benchmark.sh runs as root (cloud-init). -sudo chown -R 1000:1000 data - -# Trino catalog configuration: Hive connector backed by a file metastore -# stored on the local filesystem, no Hadoop or external metastore required. -mkdir -p etc/catalog -cat > etc/catalog/hive.properties <<'EOF' -connector.name=hive -hive.metastore=file -hive.metastore.catalog.dir=local:///metastore -local.location=/clickbench -fs.native-local.enabled=true -hive.non-managed-table-writes-enabled=true -EOF - -# Start the Trino server. The data dir is exposed at /clickbench so it -# matches local.location above. -sudo docker rm -f trino 2>/dev/null || true -sudo docker run -d --name trino \ - -p 8080:8080 \ - -v "$PWD/etc/catalog/hive.properties:/etc/trino/catalog/hive.properties:ro" \ - -v "$PWD/data:/clickbench" \ - trinodb/trino:latest - -# Wait for Trino to finish starting up. 
-until sudo docker logs trino 2>&1 | grep -q "SERVER STARTED"; do - sleep 3 -done -sleep 3 - -# Create the schema, the external table over the parquet directory and a -# view that exposes the standard ClickBench column types. -LOAD_START=$(date +%s) -sudo docker cp create.sql trino:/tmp/create.sql -sudo docker exec -i trino trino --file /tmp/create.sql -LOAD_END=$(date +%s) - -# Run the benchmark queries. -./run.sh 2>&1 | tee log.txt - -echo "Load time: $((LOAD_END - LOAD_START))" -echo "Data size: $(du -bcs data/hits/*.parquet | tail -n1 | cut -f1)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/trino-partitioned/check b/trino-partitioned/check new file mode 100755 index 0000000000..693af38ac4 --- /dev/null +++ b/trino-partitioned/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +out=$(curl -sf http://localhost:8080/v1/info) +echo "$out" | grep -q '"starting":false' diff --git a/trino-partitioned/data-size b/trino-partitioned/data-size new file mode 100755 index 0000000000..82076413bc --- /dev/null +++ b/trino-partitioned/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Sum the 100 partitioned Parquet file sizes. +du -bcs data/hits/hits_*.parquet | tail -n1 | cut -f1 diff --git a/trino-partitioned/install b/trino-partitioned/install new file mode 100755 index 0000000000..7e165866f9 --- /dev/null +++ b/trino-partitioned/install @@ -0,0 +1,24 @@ +#!/bin/bash +set -e + +# Install Docker (Trino's official image bundles its own JRE) and bc. +if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io +fi +sudo apt-get install -y bc + +sudo docker pull trinodb/trino:latest + +mkdir -p etc/catalog +cat > etc/catalog/hive.properties <<'EOF' +connector.name=hive +hive.metastore=file +hive.metastore.catalog.dir=local:///metastore +local.location=/clickbench +fs.native-local.enabled=true +hive.non-managed-table-writes-enabled=true +EOF + +mkdir -p data/hits +sudo chown -R 1000:1000 data diff --git a/trino-partitioned/load b/trino-partitioned/load new file mode 100755 index 0000000000..0a2fe55ba3 --- /dev/null +++ b/trino-partitioned/load @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +# Hardlink the 100 partitioned Parquet files into the Hive +# external_location directory; the Trino container reads /clickbench/hits. +for f in hits_*.parquet; do + ln -f "$f" "data/hits/$f" +done +sudo chown -R 1000:1000 data + +sudo docker cp create.sql trino:/tmp/create.sql +sudo docker exec -i trino trino --file /tmp/create.sql + +rm -f hits_*.parquet +sync diff --git a/trino-partitioned/query b/trino-partitioned/query new file mode 100755 index 0000000000..0d7e70cc23 --- /dev/null +++ b/trino-partitioned/query @@ -0,0 +1,16 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via the trino CLI in the running +# container. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. 
+set -e + +query=$(cat) + +start=$(date +%s.%N) +sudo docker exec -i trino trino --catalog hive --schema clickbench \ + --output-format=NULL --execute "$query" +end=$(date +%s.%N) + +awk -v s="$start" -v e="$end" 'BEGIN { printf "%.3f\n", e - s }' >&2 diff --git a/trino-partitioned/results/20260509/c6a.4xlarge.json b/trino-partitioned/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..115acb5f3f --- /dev/null +++ b/trino-partitioned/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Trino (Parquet, partitioned)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","column-oriented","stateless","lukewarm-cold-run"], + "load_time": 20, + "data_size": 14737666736, + "result": [ + [3.045, 1.338, 1.264], + [3.514, 1.395, 1.319], + [3.989, 1.619, 1.478], + [3.753, 1.448, 1.387], + [5.077, 2.269, 2.092], + [6.574, 3.236, 2.849], + [3.523, 1.599, 1.546], + [3.515, 1.427, 1.333], + [6.164, 2.996, 2.698], + [9.094, 4.859, 4.62], + [4.524, 1.755, 1.638], + [4.787, 1.911, 1.597], + [6.952, 3.124, 2.845], + [8.881, 4.684, 4.273], + [7.034, 3.347, 3.203], + [5.879, 2.589, 2.521], + [9.545, 5.286, 5.023], + [8.327, 4.923, 4.738], + [12.898, 8.506, 8.206], + [3.614, 1.415, 1.329], + [12.33, 3.715, 3.682], + [14.126, 3.742, 3.642], + [25.142, 6.257, 6.2], + [57.09, 26.934, 26.773], + [5.955, 2.073, 1.907], + [4.388, 1.937, 1.806], + [5.811, 2.033, 2.023], + [12.63, 3.903, 3.78], + [18.848, 14.816, 14.575], + [11.232, 7.511, 7.197], + [7.004, 2.955, 2.921], + [10.233, 3.503, 3.306], + [17.418, 13.165, 11.706], + [17.448, 12.72, 12.711], + [17.803, 12.966, 12.703], + [7.025, 3.524, 3.373], + [3.998, 1.651, 1.616], + [3.594, 1.497, 1.421], + [3.644, 1.491, 1.35], + [4.423, 1.997, 1.899], + [3.508, 1.403, 1.34], + [3.468, 1.421, 1.32], + [3.58, 1.424, 1.332] +] +} + diff --git a/trino-partitioned/results/20260510/c6a.metal.json b/trino-partitioned/results/20260510/c6a.metal.json new file mode 100644 index 0000000000..458d098974 --- /dev/null +++ b/trino-partitioned/results/20260510/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Trino (Parquet, partitioned)", + "date": "2026-05-10", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","column-oriented","stateless","lukewarm-cold-run"], + "load_time": 63, + "data_size": 14737666736, + "result": [ + [3.615, 1.237, 1.169], + [4.365, 1.299, 1.215], + [4.379, 1.653, 1.265], + [4.086, 1.299, 1.477], + [5.82, 1.731, 1.583], + [6.581, 2.868, 2.32], + [4.099, 1.559, 1.215], + [4.298, 1.528, 1.211], + [7.222, 2.004, 1.75], + [9.978, 3.348, 3.138], + [4.937, 1.443, 1.516], + [5.327, 1.502, 1.29], + [8.849, 2.541, 2.5], + [10.821, 4.575, 4.58], + [7.562, 2.686, 2.421], + [6.997, 2.217, 1.899], + [9.769, 3.695, 3.391], + [8.8, 3.682, 3.345], + [14.442, 6.341, 6.657], + [4.502, 1.263, 1.176], + [12.494, 2.287, 2.569], + [14.326, 2.007, 1.9], + [25.267, 2.721, 2.716], + [58.125, 4.448, 4.156], + [5.682, 1.459, 1.34], + [4.375, 1.769, 1.365], + [5.865, 1.445, 1.338], + [12.879, 2.209, 1.904], + [13.238, 5.414, 5.198], + [8.455, 3.404, 3.163], + [9.196, 2.14, 1.984], + [9.941, 2.806, 2.714], + [17.328, 10.876, 10.281], + [18.172, 9.817, 11.232], + [19.034, 10.916, 11.154], + [8.822, 2.514, 2.156], + [4.402, 1.939, 1.65], + [3.935, 1.628, 1.334], + [4.231, 1.778, 1.371], + [4.957, 1.986, 1.807], + [3.933, 1.432, 1.305], + [3.936, 1.432, 1.271], + [3.928, 1.418, 1.248] +] 
+} + diff --git a/trino-partitioned/run.sh b/trino-partitioned/run.sh deleted file mode 100755 index f02cae3cfd..0000000000 --- a/trino-partitioned/run.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 - -mapfile -t QUERIES < queries.sql - -for query in "${QUERIES[@]}"; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - START=$(date +%s.%N) - sudo docker exec -i trino trino --catalog hive --schema clickbench \ - --output-format=NULL --execute "${query}" >/dev/null 2>&1 - EXIT=$? - END=$(date +%s.%N) - if [ "$EXIT" = "0" ]; then - ELAPSED=$(echo "$END - $START" | bc) - printf "%.3f" "$ELAPSED" - else - printf "null" - fi - [[ "$i" != "$TRIES" ]] && echo -n ", " - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/trino-partitioned/start b/trino-partitioned/start new file mode 100755 index 0000000000..da87d704b4 --- /dev/null +++ b/trino-partitioned/start @@ -0,0 +1,16 @@ +#!/bin/bash +set -e + +if sudo docker ps --format '{{.Names}}' | grep -qx trino; then + exit 0 +fi +if sudo docker ps -a --format '{{.Names}}' | grep -qx trino; then + sudo docker start trino + exit 0 +fi + +sudo docker run -d --name trino \ + -p 8080:8080 \ + -v "$PWD/etc/catalog/hive.properties:/etc/trino/catalog/hive.properties:ro" \ + -v "$PWD/data:/clickbench" \ + trinodb/trino:latest diff --git a/trino-partitioned/stop b/trino-partitioned/stop new file mode 100755 index 0000000000..956100aa20 --- /dev/null +++ b/trino-partitioned/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +sudo docker stop trino 2>/dev/null || true +exit 0 diff --git a/trino/benchmark.sh b/trino/benchmark.sh index 21f7798632..b851876173 100755 --- a/trino/benchmark.sh +++ b/trino/benchmark.sh @@ -1,60 +1,5 @@ #!/bin/bash - -set -e - -# Install Docker (Trino's official image bundles its own JRE). -sudo apt-get update -y -sudo apt-get install -y docker.io bc - -# Download the dataset. -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.parquet' - -# Place the parquet file inside its own directory; the Hive connector -# reads every file in the table's external_location. -mkdir -p data/hits -ln -f hits.parquet data/hits/hits.parquet - -# The Trino container runs as uid 1000 ("trino"), and writes the file -# metastore into this directory. Make sure that uid can write here even -# when benchmark.sh runs as root (cloud-init). -sudo chown -R 1000:1000 data - -# Trino catalog configuration: Hive connector backed by a file metastore -# stored on the local filesystem, no Hadoop or external metastore required. -mkdir -p etc/catalog -cat > etc/catalog/hive.properties <<'EOF' -connector.name=hive -hive.metastore=file -hive.metastore.catalog.dir=local:///metastore -local.location=/clickbench -fs.native-local.enabled=true -hive.non-managed-table-writes-enabled=true -EOF - -# Start the Trino server. The container exposes the data dir as -# /clickbench so it matches local.location above. -sudo docker rm -f trino 2>/dev/null || true -sudo docker run -d --name trino \ - -p 8080:8080 \ - -v "$PWD/etc/catalog/hive.properties:/etc/trino/catalog/hive.properties:ro" \ - -v "$PWD/data:/clickbench" \ - trinodb/trino:latest - -# Wait for Trino to finish starting up. -until sudo docker logs trino 2>&1 | grep -q "SERVER STARTED"; do - sleep 3 -done -sleep 3 - -# Create the schema, the external table over the parquet file, and a view -# that exposes the ClickBench EventTime/EventDate column types. 
-LOAD_START=$(date +%s) -sudo docker cp create.sql trino:/tmp/create.sql -sudo docker exec -i trino trino --file /tmp/create.sql -LOAD_END=$(date +%s) - -# Run the benchmark queries. -./run.sh 2>&1 | tee log.txt - -echo "Load time: $((LOAD_END - LOAD_START))" -echo "Data size: $(stat -c %s hits.parquet)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/trino/check b/trino/check new file mode 100755 index 0000000000..1425e226a4 --- /dev/null +++ b/trino/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# Trino exposes /v1/info. The "starting" field flips to false when ready. +out=$(curl -sf http://localhost:8080/v1/info) +echo "$out" | grep -q '"starting":false' diff --git a/trino/data-size b/trino/data-size new file mode 100755 index 0000000000..265f258e5d --- /dev/null +++ b/trino/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# External Parquet table — report the source file size. +stat -c %s data/hits/hits.parquet diff --git a/trino/install b/trino/install new file mode 100755 index 0000000000..dd6dc6e97d --- /dev/null +++ b/trino/install @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +# Install Docker (Trino's official image bundles its own JRE) and bc. +if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io +fi +sudo apt-get install -y bc + +# Pull image (idempotent — Docker caches). +sudo docker pull trinodb/trino:latest + +# Trino catalog configuration: Hive connector backed by a file metastore +# stored on the local filesystem, no Hadoop or external metastore required. +mkdir -p etc/catalog +cat > etc/catalog/hive.properties <<'EOF' +connector.name=hive +hive.metastore=file +hive.metastore.catalog.dir=local:///metastore +local.location=/clickbench +fs.native-local.enabled=true +hive.non-managed-table-writes-enabled=true +EOF + +# The Trino container runs as uid 1000 ("trino") and writes the file +# metastore into this directory. Make sure that uid can write here even +# when scripts run as root (cloud-init). +mkdir -p data/hits +sudo chown -R 1000:1000 data diff --git a/trino/load b/trino/load new file mode 100755 index 0000000000..55a0047e3b --- /dev/null +++ b/trino/load @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +# Place the parquet file inside the Hive external_location directory. +# The Trino container reads /clickbench/hits (== ./data/hits on the host). +ln -f hits.parquet data/hits/hits.parquet +sudo chown -R 1000:1000 data + +# Run create.sql to register the schema, the external table, and the +# ClickBench-typed view. +sudo docker cp create.sql trino:/tmp/create.sql +sudo docker exec -i trino trino --file /tmp/create.sql + +rm -f hits.parquet +sync diff --git a/trino/query b/trino/query new file mode 100755 index 0000000000..0d7e70cc23 --- /dev/null +++ b/trino/query @@ -0,0 +1,16 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via the trino CLI in the running +# container. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. 
+set -e + +query=$(cat) + +start=$(date +%s.%N) +sudo docker exec -i trino trino --catalog hive --schema clickbench \ + --output-format=NULL --execute "$query" +end=$(date +%s.%N) + +awk -v s="$start" -v e="$end" 'BEGIN { printf "%.3f\n", e - s }' >&2 diff --git a/trino/results/20260509/c6a.4xlarge.json b/trino/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..547068e8e4 --- /dev/null +++ b/trino/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Trino (Parquet, single)", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","column-oriented","stateless","lukewarm-cold-run"], + "load_time": 2, + "data_size": 14779976446, + "result": [ + [5.427, 2.759, 2.648], + [5.781, 2.726, 2.664], + [6.09, 3.024, 2.833], + [5.413, 2.804, 2.547], + [6.792, 3.578, 3.42], + [7.89, 4.296, 3.986], + [5.597, 3.02, 2.727], + [5.97, 2.655, 2.635], + [7.477, 4.006, 3.71], + [10.701, 6.216, 5.805], + [6.658, 2.785, 2.694], + [6.797, 3.07, 2.884], + [9.005, 4.478, 3.987], + [10.939, 6.073, 5.546], + [8.589, 4.517, 4.066], + [7.614, 3.725, 3.373], + [11.031, 6.519, 6.193], + [10.479, 6.167, 5.721], + [15.215, 10.561, 10.249], + [6.333, 3.12, 2.915], + [13.155, 5.028, 4.805], + [14.967, 4.75, 4.556], + [25.742, 7.369, 7.175], + [57.811, 46.493, 46.673], + [6.795, 3.46, 3.147], + [6.435, 3.072, 3.037], + [6.887, 3.323, 3.328], + [13.362, 5.247, 4.561], + [20.821, 16.196, 15.487], + [12.414, 7.851, 7.687], + [8.913, 4.334, 3.696], + [10.896, 4.558, 4.401], + [18.899, 12.781, 12.352], + [18.872, 13.876, 13.695], + [20.138, 14.635, 14.36], + [8.985, 4.865, 4.402], + [6.555, 2.841, 2.751], + [6.209, 2.76, 2.659], + [6.505, 2.708, 2.781], + [7.457, 2.957, 2.999], + [6.343, 2.779, 2.694], + [6.216, 2.698, 2.707], + [6.233, 2.712, 2.713] +] +} + diff --git a/trino/results/20260509/c6a.metal.json b/trino/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..95cabc52cb --- /dev/null +++ b/trino/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Trino (Parquet, single)", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Java","column-oriented","stateless","lukewarm-cold-run"], + "load_time": 10, + "data_size": 14779976446, + "result": [ + [4.682, 1.895, 1.624], + [5.713, 1.707, 1.435], + [5.55, 2.09, 1.525], + [4.931, 1.89, 1.49], + [7.925, 2.111, 1.754], + [7.652, 2.805, 2.422], + [4.979, 2.109, 1.715], + [5.448, 1.892, 1.409], + [9.763, 2.488, 1.968], + [13.131, 3.704, 3.57], + [5.958, 1.794, 1.502], + [7.375, 2.101, 1.911], + [8.749, 2.838, 2.565], + [11.4, 4.765, 4.16], + [8.614, 2.989, 2.755], + [9.568, 2.714, 2.156], + [10.186, 3.966, 3.75], + [10.645, 4.086, 3.622], + [16.129, 6.418, 5.926], + [5.586, 1.833, 1.449], + [13.931, 2.418, 1.878], + [15.239, 2.547, 2.42], + [26.426, 4.087, 3.797], + [59.281, 4.588, 4.103], + [7.673, 1.714, 1.56], + [6.09, 1.641, 1.445], + [7.897, 1.765, 1.563], + [14.067, 2.135, 2.049], + [15.824, 5.592, 5.784], + [9.827, 3.638, 2.693], + [8.961, 2.386, 2.114], + [12.139, 2.988, 2.894], + [18.471, 10.839, 12.113], + [18.281, 9.631, 9.914], + [20.186, 11.064, 12.356], + [10.147, 2.724, 2.346], + [5.786, 2.27, 2.05], + [6.329, 1.727, 1.487], + [6.006, 1.85, 1.527], + [6.303, 2.287, 1.751], + [5.43, 1.757, 1.509], + [5.464, 2.344, 1.899], + [4.867, 1.875, 1.503] +] +} + diff --git a/trino/run.sh b/trino/run.sh deleted file mode 100755 
index f02cae3cfd..0000000000 --- a/trino/run.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 - -mapfile -t QUERIES < queries.sql - -for query in "${QUERIES[@]}"; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - START=$(date +%s.%N) - sudo docker exec -i trino trino --catalog hive --schema clickbench \ - --output-format=NULL --execute "${query}" >/dev/null 2>&1 - EXIT=$? - END=$(date +%s.%N) - if [ "$EXIT" = "0" ]; then - ELAPSED=$(echo "$END - $START" | bc) - printf "%.3f" "$ELAPSED" - else - printf "null" - fi - [[ "$i" != "$TRIES" ]] && echo -n ", " - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/trino/start b/trino/start new file mode 100755 index 0000000000..7a960f3dde --- /dev/null +++ b/trino/start @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +# Idempotent: if running, do nothing. If exists but stopped, start it. Else run. +if sudo docker ps --format '{{.Names}}' | grep -qx trino; then + exit 0 +fi +if sudo docker ps -a --format '{{.Names}}' | grep -qx trino; then + sudo docker start trino + exit 0 +fi + +sudo docker run -d --name trino \ + -p 8080:8080 \ + -v "$PWD/etc/catalog/hive.properties:/etc/trino/catalog/hive.properties:ro" \ + -v "$PWD/data:/clickbench" \ + trinodb/trino:latest diff --git a/trino/stop b/trino/stop new file mode 100755 index 0000000000..956100aa20 --- /dev/null +++ b/trino/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +sudo docker stop trino 2>/dev/null || true +exit 0 diff --git a/turso/benchmark.sh b/turso/benchmark.sh index 47a96005ae..b0b9f4775a 100755 --- a/turso/benchmark.sh +++ b/turso/benchmark.sh @@ -1,26 +1,5 @@ #!/bin/bash - -sudo apt-get update -y -sudo apt-get install -y curl - -# Download and install Turso -curl --proto '=https' --tlsv1.2 -LsSf https://github.com/tursodatabase/turso/releases/download/v0.1.2-pre.4/turso_cli-installer.sh | sh -export HOME=${HOME:=~} -source $HOME/.turso/env - -tursodb mydb < create.sql - -../download-hits-csv - -echo -n "Load time: " -command time -f '%e' tursodb mydb '.import --csv hits.csv hits' -echo -n "Data size: " -wc -c mydb - -./run.sh 2>&1 | tee log.txt - -cat log.txt | - grep -P '^real|^Error|Parse error' | - sed -r -e 's/^(Error|Parse error).*$/null/; s/^real\s*([0-9.]+)m([0-9.]+)s$/\1 \2/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if ($1 == "null") { skip = 1 } else { if (i % 3 == 0) { printf "[" }; printf skip ? "null" : $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; skip = 0; } }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/turso/check b/turso/check new file mode 100755 index 0000000000..836c2cf020 --- /dev/null +++ b/turso/check @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +export HOME=${HOME:=~} +# shellcheck disable=SC1091 +source "$HOME/.turso/env" + +tursodb :memory: 'SELECT 1' >/dev/null diff --git a/turso/data-size b/turso/data-size new file mode 100755 index 0000000000..f94c4eccf8 --- /dev/null +++ b/turso/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < mydb diff --git a/turso/install b/turso/install new file mode 100755 index 0000000000..f55c9a720c --- /dev/null +++ b/turso/install @@ -0,0 +1,13 @@ +#!/bin/bash +set -e + +if ! 
command -v tursodb >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y curl + curl --proto '=https' --tlsv1.2 -LsSf \ + https://github.com/tursodatabase/turso/releases/download/v0.1.2-pre.4/turso_cli-installer.sh | sh +fi + +export HOME=${HOME:=~} +# shellcheck disable=SC1091 +source "$HOME/.turso/env" diff --git a/turso/load b/turso/load new file mode 100755 index 0000000000..c7c3fb85a3 --- /dev/null +++ b/turso/load @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +export HOME=${HOME:=~} +# shellcheck disable=SC1091 +source "$HOME/.turso/env" + +# Idempotent: blow away any prior DB. +rm -f mydb + +tursodb mydb < create.sql +tursodb mydb '.import --csv hits.csv hits' + +rm -f hits.csv +sync diff --git a/turso/query b/turso/query new file mode 100755 index 0000000000..2dcdf26cf2 --- /dev/null +++ b/turso/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via tursodb against mydb. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (from `time`). +set -e + +export HOME=${HOME:=~} +# shellcheck disable=SC1091 +source "$HOME/.turso/env" + +query=$(cat) + +TIMEFORMAT='%R' +{ time tursodb mydb <<< "$query" 1>/tmp/turso.out.$$ 2>/tmp/turso.err.$$; } 2>/tmp/turso.time.$$ || status=$? +status=${status:-0} + +cat /tmp/turso.out.$$ +if [ "$status" -ne 0 ]; then + cat /tmp/turso.err.$$ >&2 + rm -f /tmp/turso.out.$$ /tmp/turso.err.$$ /tmp/turso.time.$$ + exit "$status" +fi + +cat /tmp/turso.err.$$ >&2 +cat /tmp/turso.time.$$ >&2 + +rm -f /tmp/turso.out.$$ /tmp/turso.err.$$ /tmp/turso.time.$$ diff --git a/turso/run.sh b/turso/run.sh deleted file mode 100755 index 02a54dd5ed..0000000000 --- a/turso/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - time tursodb mydb <<< "${query}" - done; -done; diff --git a/turso/start b/turso/start new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/turso/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/turso/stop b/turso/stop new file mode 100755 index 0000000000..06bd986563 --- /dev/null +++ b/turso/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/umbra/benchmark.sh b/umbra/benchmark.sh index 8d6c9c1920..531bd65038 100755 --- a/umbra/benchmark.sh +++ b/umbra/benchmark.sh @@ -1,48 +1,5 @@ #!/bin/bash - -# Ubuntu -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client gzip - -# Amazon Linux -# yum install nc postgresql15 - -# Download + uncompress hits -rm -rf data -mkdir data -../download-hits-tsv -mv hits.tsv data -chmod 777 -R data - -# I spend too much time here battling cryptic error messages only to find out that the data needs to be in some separate directory -rm -rf db -mkdir db -chmod 777 -R db - -# https://hub.docker.com/r/umbradb/umbra -docker run -d -v ./db:/var/db -v ./data:/data -p 5432:5432 --ulimit nofile=1048576:1048576 --ulimit memlock=8388608:8388608 umbradb/umbra:latest -sleep 5 # Things below fail otherwise ... 
- -start=$(date +%s%3N) -PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -f create.sql 2>&1 | tee load_out.txt -end=$(date +%s%3N) -if grep 'ERROR' load_out.txt -then - exit 1 -fi -echo "Load time: $(( (end - start) / 1000 ))" - -./run.sh 2>&1 | tee log.txt - -# Calculate persistence size -sudo chmod 777 -R db # otherwise 'du' complains about permission denied -echo -n "Data size: " -du -bcs db | grep total - -# Pretty-printing -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -# Cleanup -docker stop $(docker ps -a -q) && docker rm $(docker ps -a -q) && docker volume prune --all --force -rm -rf data db +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/umbra/check b/umbra/check new file mode 100755 index 0000000000..5336d8ba03 --- /dev/null +++ b/umbra/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null diff --git a/umbra/data-size b/umbra/data-size new file mode 100755 index 0000000000..ae38a0d596 --- /dev/null +++ b/umbra/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +sudo chmod -R 777 db 2>/dev/null || true +du -bcs db | grep total | awk '{print $1}' diff --git a/umbra/install b/umbra/install new file mode 100755 index 0000000000..d472dbbf8c --- /dev/null +++ b/umbra/install @@ -0,0 +1,10 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client gzip + +sudo docker pull umbradb/umbra:latest + +mkdir -p data db +chmod -R 777 data db diff --git a/umbra/load b/umbra/load new file mode 100755 index 0000000000..f0c1addae2 --- /dev/null +++ b/umbra/load @@ -0,0 +1,29 @@ +#!/bin/bash +set -eu + +mkdir -p data +mv hits.tsv data/ +chmod -R 777 data + +# create.sql for umbra both creates the table and ingests via COPY. Use +# ON_ERROR_STOP=1 so a mid-COPY failure (e.g. the box runs out of memory +# and Umbra survives but the COPY transaction errored) bubbles up +# instead of leaving a half-loaded table. +PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres \ + -v ON_ERROR_STOP=1 -f create.sql + +# Belt-and-braces row-count check. Umbra has been observed to leave a +# partial table on memory-constrained hosts (16 GB c6a.4xlarge can't +# hold the full mmap working set), letting the benchmark proceed and +# producing implausibly fast warm timings on the surviving subset. +# ClickBench's hits dataset is 99,997,497 rows; allow a small margin. +expected=99997497 +got=$(PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -tAq \ + -c 'SELECT count(*) FROM hits') +if [ "$got" -lt $((expected - 100)) ]; then + echo "umbra/load: hits has $got rows, expected ~$expected — partial load" >&2 + exit 1 +fi + +rm -f data/hits.tsv +sync diff --git a/umbra/query b/umbra/query new file mode 100755 index 0000000000..b72947b6f3 --- /dev/null +++ b/umbra/query @@ -0,0 +1,36 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against Umbra. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# psql's `\timing` "Time: ms" output). +# Exit non-zero on error. 
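For reference, the line this wrapper extracts from psql's \timing output has the following shape (value in milliseconds; the awk at the end converts it to seconds on stderr). This is a hedged example of the format, not captured output:

    Time: 532.771 ms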
+set -e + +query=$(cat) + +raw=$(PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -v ON_ERROR_STOP=1 -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? + +# Match more failure shapes than just `^ERROR`: PostgreSQL/Umbra also emit +# FATAL: (server-side fatal), PANIC: (Umbra's panic prefix observed in +# `unable to allocate buffer pool`-style failures), and `psql: error` +# (client-side, e.g. connection lost). +# +# Caveat: Umbra silently returns a NULL row for unimplemented functions +# (e.g. regexp_substr) without emitting any error or warning. None of the +# 43 ClickBench queries hit that path, but if a future query does, the +# caller will see a microsecond timing for a "successful" query that +# didn't actually compute anything. +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^(ERROR|FATAL|PANIC):|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/umbra/results/20260509/c6a.metal.json b/umbra/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..003b3564fa --- /dev/null +++ b/umbra/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Umbra", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","PostgreSQL compatible"], + "load_time": 601, + "data_size": 36149312970, + "result": [ + [0.058, 0.003, 0.003], + [0.131, 0.003, 0.003], + [0.325, 0.008, 0.008], + [0.813, 0.009, 0.011], + [0.911, 0.074, 0.074], + [2.243, 0.073, 0.073], + [0.115, 0.009, 0.007], + [0.147, 0.003, 0.004], + [1.774, 0.081, 0.083], + [2.869, 0.109, 0.108], + [1.22, 0.015, 0.015], + [1.601, 0.019, 0.017], + [2.176, 0.069, 0.068], + [3.906, 0.117, 0.123], + [2.566, 0.075, 0.068], + [0.974, 0.097, 0.099], + [3.901, 0.141, 0.132], + [3.888, 0.051, 0.051], + [7.154, 0.444, 0.443], + [0.802, 0.002, 0.002], + [17.618, 0.063, 0.062], + [20.566, 0.026, 0.025], + [33.808, 0.057, 0.056], + [119.487, 0.042, 0.039], + [5.146, 0.005, 0.005], + [2.133, 0.005, 0.005], + [5.145, 0.006, 0.006], + [18.018, 0.068, 0.068], + [13.192, 0.349, 0.349], + [0.183, 0.011, 0.011], + [4.815, 0.039, 0.039], + [7.495, 0.055, 0.055], + [5.139, 0.639, 0.64], + [17.835, 0.421, 0.351], + [17.864, 0.422, 0.414], + [0.453, 0.068, 0.068], + [2.233, 0.013, 0.012], + [2.586, 0.011, 0.009], + [2.306, 0.01, 0.009], + [5.316, 0.021, 0.021], + [0.499, 0.011, 0.01], + [0.417, 0.009, 0.009], + [0.434, 0.01, 0.01] +] +} + diff --git a/umbra/run.sh b/umbra/run.sh deleted file mode 100755 index 19f225684a..0000000000 --- a/umbra/run.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - docker restart $(docker ps -a -q) - - retry_count=0 - while [ $retry_count -lt 120 ]; do - if PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -c "SELECT 'Ok';"; then - break - fi - - retry_count=$((retry_count+1)) - sleep 1 - done - - echo "$query"; - for i in $(seq 1 $TRIES); do - PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done -done diff --git a/umbra/start b/umbra/start new file mode 100755 index 0000000000..a7fc4dc58e --- 
/dev/null +++ b/umbra/start @@ -0,0 +1,23 @@ +#!/bin/bash +set -eu + +if PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +sudo docker stop umbradb >/dev/null 2>&1 || true +sudo docker rm umbradb >/dev/null 2>&1 || true + +sudo docker run -d --name umbradb \ + -v "$(pwd)/db:/var/db" \ + -v "$(pwd)/data:/data" \ + -p 5432:5432 \ + --ulimit nofile=1048576:1048576 \ + --ulimit memlock=8388608:8388608 \ + umbradb/umbra:latest >/dev/null + +# Container needs a moment before psql can connect. +for _ in $(seq 1 60); do + PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1 && exit 0 + sleep 1 +done diff --git a/umbra/stop b/umbra/stop new file mode 100755 index 0000000000..890229a5b6 --- /dev/null +++ b/umbra/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo docker stop umbradb >/dev/null 2>&1 || true diff --git a/ursa/benchmark.sh b/ursa/benchmark.sh index 47aa593c2d..6a7f45d3a1 100755 --- a/ursa/benchmark.sh +++ b/ursa/benchmark.sh @@ -1,34 +1,5 @@ #!/bin/bash - -# Install - -wget --continue --progress=dot:giga "https://ursa-private-builds.s3.eu-central-1.amazonaws.com/ursa-0.0.1/ursa" -chmod +x ursa - -./ursa server > server.log 2>&1 & - -for _ in {1..300} -do - ./ursa client --query "SELECT 1" && break - sleep 1 -done - -# Load the data - -./ursa client < create.sql - -../download-hits-parquet-partitioned -sudo mv hits_*.parquet user_files/ -sudo chown clickhouse:clickhouse user_files/hits_*.parquet - -echo -n "Load time: " -./ursa client --time --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads $(( $(nproc) / 4 )) - -# Run the queries - -./run.sh "$1" - -echo -n "Data size: " -./ursa client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" - -killall ursa +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/ursa/check b/ursa/check new file mode 100755 index 0000000000..9c5d756f03 --- /dev/null +++ b/ursa/check @@ -0,0 +1,3 @@ +#!/bin/bash +set -e +./ursa client --query "SELECT 1" >/dev/null diff --git a/ursa/data-size b/ursa/data-size new file mode 100755 index 0000000000..e86fff98c0 --- /dev/null +++ b/ursa/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./ursa client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" diff --git a/ursa/install b/ursa/install new file mode 100755 index 0000000000..d5adb7af3c --- /dev/null +++ b/ursa/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +# ursa is a private build — fetch the binary if we don't have it. +URSA_URL='https://ursa-private-builds.s3.eu-central-1.amazonaws.com/ursa-0.0.1/ursa' + +if [ ! -x ./ursa ]; then + wget --continue --progress=dot:giga "$URSA_URL" + chmod +x ursa +fi + +mkdir -p user_files diff --git a/ursa/load b/ursa/load new file mode 100755 index 0000000000..2a2560368d --- /dev/null +++ b/ursa/load @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +./ursa client < create.sql + +# The download script puts hits_*.parquet in the cwd; move them to the +# server's user_files dir so the file() table function can read them. 
+sudo mv hits_*.parquet user_files/ + +./ursa client \ + --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" \ + --max-insert-threads "$(( $(nproc) / 4 ))" + +sudo rm -f user_files/hits_*.parquet +sync diff --git a/ursa/query b/ursa/query new file mode 100755 index 0000000000..1d747da013 --- /dev/null +++ b/ursa/query @@ -0,0 +1,8 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via ./ursa client. +# Stdout: query result (default format). +# Stderr: query runtime in fractional seconds on the last line (from --time). +# Exit non-zero on error. +set -e + +./ursa client --time diff --git a/ursa/results/20260509/c6a.4xlarge.json b/ursa/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..13dc368bbd --- /dev/null +++ b/ursa/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "Ursa", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","ClickHouse derivative","lukewarm-cold-run"], + "load_time": 336, + "data_size": 15431415924, + "result": [ + [0.001, 0.001, 0.001], + [0.003, 0.001, 0.001], + [0.055, 0.021, 0.02], + [0.658, 0.03, 0.029], + [0.975, 0.346, 0.347], + [1.081, 0.446, 0.442], + [0.001, 0.001, 0.001], + [0.033, 0.018, 0.018], + [0.555, 0.5, 0.499], + [0.876, 0.55, 0.544], + [1.017, 0.157, 0.155], + [0.835, 0.167, 0.168], + [1.442, 0.519, 0.509], + [1.691, 0.82, 0.816], + [1.388, 0.656, 0.55], + [0.569, 0.461, 0.517], + [2.426, 1.47, 1.456], + [2.354, 0.938, 0.923], + [5.008, 2.859, 2.808], + [0.34, 0.002, 0.003], + [11.013, 0.335, 0.341], + [12.469, 0.103, 0.109], + [15.3, 0.674, 0.669], + [11.896, 0.419, 0.416], + [2.613, 0.163, 0.165], + [1.651, 0.144, 0.139], + [3.208, 0.163, 0.162], + [1.037, 0.117, 0.122], + [9.067, 5.919, 5.505], + [0.054, 0.026, 0.024], + [0.365, 0.25, 0.25], + [3.326, 0.429, 0.426], + [4.727, 2.378, 2.331], + [9.905, 2.663, 2.641], + [9.903, 2.658, 2.656], + [0.432, 0.313, 0.318], + [0.054, 0.03, 0.03], + [0.033, 0.017, 0.017], + [0.042, 0.015, 0.015], + [0.088, 0.062, 0.062], + [0.029, 0.012, 0.012], + [0.024, 0.01, 0.01], + [0.02, 0.009, 0.009] +] +} + diff --git a/ursa/results/20260510/c6a.metal.json b/ursa/results/20260510/c6a.metal.json new file mode 100644 index 0000000000..aea18e2c84 --- /dev/null +++ b/ursa/results/20260510/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "Ursa", + "date": "2026-05-10", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","ClickHouse derivative","lukewarm-cold-run"], + "load_time": 288, + "data_size": 15451140669, + "result": [ + [0.002, 0.001, 0.001], + [0.02, 0.002, 0.001], + [0.499, 0.02, 0.018], + [1.6, 0.02, 0.02], + [1.323, 0.272, 0.271], + [2.187, 0.284, 0.277], + [0.002, 0.002, 0.002], + [0.191, 0.017, 0.023], + [1.813, 0.414, 0.408], + [2.042, 0.414, 0.426], + [1.336, 0.105, 0.103], + [1.482, 0.119, 0.121], + [2.412, 0.143, 0.134], + [3.929, 0.228, 0.216], + [2.304, 0.124, 0.116], + [1.036, 0.275, 0.265], + [3.126, 0.288, 0.277], + [2.894, 0.249, 0.237], + [4.386, 0.541, 0.526], + [0.823, 0.003, 0.002], + [10.48, 0.096, 0.091], + [12.119, 0.052, 0.05], + [14.091, 0.163, 0.162], + [17.591, 0.142, 0.152], + [3.432, 0.039, 0.04], + [1.95, 0.037, 0.034], + [3.287, 0.04, 0.037], + [1.832, 0.035, 0.034], + [8.849, 0.905, 0.903], + [0.516, 0.039, 0.039], + [1.719, 0.09, 0.09], + [4.06, 0.308, 0.112], + [4.056, 0.811, 0.804], + [11.806, 0.604, 0.597], + [11.927, 
0.578, 0.565], + [0.883, 0.248, 0.251], + [0.901, 0.06, 0.058], + [0.896, 0.03, 0.029], + [0.976, 0.025, 0.024], + [1.498, 0.109, 0.107], + [0.471, 0.021, 0.022], + [0.428, 0.016, 0.014], + [0.963, 0.014, 0.015] +] +} + diff --git a/ursa/run.sh b/ursa/run.sh deleted file mode 100755 index 79394c1e2e..0000000000 --- a/ursa/run.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -# Determine which set of files to use depending on the type of run -if [ "$1" != "" ] && [ "$1" != "tuned" ] && [ "$1" != "tuned-memory" ]; then - echo "Error: command line argument must be one of {'', 'tuned', 'tuned-memory'}" - exit 1 -else if [ ! -z "$1" ]; then - SUFFIX="-$1" -fi -fi - -TRIES=3 -QUERY_NUM=1 -cat queries"$SUFFIX".sql | while read -r query; do - [ -z "$FQDN" ] && sync - [ -z "$FQDN" ] && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(./ursa client --host "${FQDN:=localhost}" --password "${PASSWORD:=}" ${PASSWORD:+--secure} --time --format=Null --query="$query" --progress 0 2>&1 ||:) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/ursa/start b/ursa/start new file mode 100755 index 0000000000..3c75896f74 --- /dev/null +++ b/ursa/start @@ -0,0 +1,15 @@ +#!/bin/bash +set -eu + +# Idempotent: if ursa is already responsive, nothing to do. +if ./ursa client --query "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi + +# Spawn the server, redirect logs, detach. +nohup ./ursa server > server.log 2>&1 & + +for _ in $(seq 1 300); do + ./ursa client --query "SELECT 1" >/dev/null 2>&1 && exit 0 + sleep 1 +done diff --git a/ursa/stop b/ursa/stop new file mode 100755 index 0000000000..7a76a96612 --- /dev/null +++ b/ursa/stop @@ -0,0 +1,2 @@ +#!/bin/bash +killall ursa 2>/dev/null || true diff --git a/velodb/README.md b/velodb/README.md new file mode 100644 index 0000000000..96f402bc38 --- /dev/null +++ b/velodb/README.md @@ -0,0 +1,16 @@ +# VeloDB + +[VeloDB](https://www.velodb.io/) is the enterprise-grade distribution of +[Apache Doris](https://github.com/apache/doris). Distribution binaries are +hosted at `download.velodb.io`; `install` pulls the latest stable Apache +Doris release from there. + +## History + +- Originally listed in ClickBench as **SelectDB** (the previous brand). +- The hosted SaaS at `selectdb.cloud` was discontinued (the domain now + returns `404 Route Not Found` for every path). +- The company was renamed to **VeloDB** and the entry was renamed + accordingly. Existing results are kept as-is under the new system + name — the engine itself is the same Apache Doris distribution, so + past numbers are still meaningful under the VeloDB brand. diff --git a/velodb/benchmark.sh b/velodb/benchmark.sh new file mode 100755 index 0000000000..531bd65038 --- /dev/null +++ b/velodb/benchmark.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/velodb/check b/velodb/check new file mode 100755 index 0000000000..c6e836c8c1 --- /dev/null +++ b/velodb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null diff --git a/selectdb/create.sql b/velodb/create.sql similarity index 100% rename from selectdb/create.sql rename to velodb/create.sql diff --git a/velodb/data-size b/velodb/data-size new file mode 100755 index 0000000000..2b26d0f6b1 --- /dev/null +++ b/velodb/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +DORIS_HOME=$(cat .doris_home) +du -bs "$DORIS_HOME/be/storage/" | cut -f1 diff --git a/velodb/install b/velodb/install new file mode 100755 index 0000000000..58c041e019 --- /dev/null +++ b/velodb/install @@ -0,0 +1,46 @@ +#!/bin/bash +set -e + +# This benchmark runs on Ubuntu 20.04+ +ROOT=$(pwd) +# SelectDB no longer publishes free standalone tarballs; the company that +# now stewards SelectDB (VeloDB) hosts Apache Doris release binaries at +# download.velodb.io. Track the latest stable Apache Doris release — +# functionally what SelectDB ships now — so this benchmark keeps running. +URL='https://download.velodb.io/apache-doris-4.0.5-bin-x64.tar.gz' + +file_name="$(basename "$URL")" +dir_name="${file_name/.tar.gz/}" +DORIS_HOME="$ROOT/$dir_name/$dir_name" + +if [ ! -d "$DORIS_HOME" ]; then + if [ ! -f "$file_name" ]; then + wget --continue --progress=dot:giga "$URL" + fi + mkdir -p "$dir_name" + tar zxf "$file_name" -C "$dir_name" + + # Disable internal caches so the warm runs measure real query execution; + # without these, the second/third try of every query returned in 0.00-0.01 s + # because the BE's storage-page / segment cache was hot and the FE + # was returning a cached SQL result. Matches doris/install (which + # ships a different Apache Doris build with these defaults already + # off in newer releases). + printf "\ndisable_storage_page_cache = true\n" >> "$DORIS_HOME"/be/conf/be.conf + printf "\nsegment_cache_capacity = 0\n" >> "$DORIS_HOME"/be/conf/be.conf + printf "\ncache_enable_sql_mode = false\n" >> "$DORIS_HOME"/fe/conf/fe.conf + printf "\ncache_enable_partition_mode = false\n" >> "$DORIS_HOME"/fe/conf/fe.conf +fi + +sudo apt-get update -y +sudo apt-get install -y openjdk-17-jdk mysql-client bc + +set +e +sudo systemctl disable unattended-upgrades 2>/dev/null +sudo systemctl stop unattended-upgrades 2>/dev/null +sudo systemctl stop mysql-server 2>/dev/null +set -e + +sudo sysctl -w vm.max_map_count=2000000 + +echo "$DORIS_HOME" > .doris_home diff --git a/velodb/load b/velodb/load new file mode 100755 index 0000000000..57ded740da --- /dev/null +++ b/velodb/load @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +ROOT=$(pwd) + +# Idempotent: drop+create database.
+mysql -h127.0.0.1 -P9030 -uroot -e "DROP DATABASE IF EXISTS hits" +mysql -h127.0.0.1 -P9030 -uroot -e "CREATE DATABASE hits" +sleep 5 +mysql -h127.0.0.1 -P9030 -uroot hits < "$ROOT/create.sql" + +curl --location-trusted \ + -u root: \ + -T "hits.tsv" \ + -H "label:hits_$(date +%s)" \ + -H "columns: WatchID,JavaEnable,Title,GoodEvent,EventTime,EventDate,CounterID,ClientIP,RegionID,UserID,CounterClass,OS,UserAgent,URL,Referer,IsRefresh,RefererCategoryID,RefererRegionID,URLCategoryID,URLRegionID,ResolutionWidth,ResolutionHeight,ResolutionDepth,FlashMajor,FlashMinor,FlashMinor2,NetMajor,NetMinor,UserAgentMajor,UserAgentMinor,CookieEnable,JavascriptEnable,IsMobile,MobilePhone,MobilePhoneModel,Params,IPNetworkID,TraficSourceID,SearchEngineID,SearchPhrase,AdvEngineID,IsArtifical,WindowClientWidth,WindowClientHeight,ClientTimeZone,ClientEventTime,SilverlightVersion1,SilverlightVersion2,SilverlightVersion3,SilverlightVersion4,PageCharset,CodeVersion,IsLink,IsDownload,IsNotBounce,FUniqID,OriginalURL,HID,IsOldCounter,IsEvent,IsParameter,DontCountHits,WithHash,HitColor,LocalEventTime,Age,Sex,Income,Interests,Robotness,RemoteIP,WindowName,OpenerName,HistoryLength,BrowserLanguage,BrowserCountry,SocialNetwork,SocialAction,HTTPError,SendTiming,DNSTiming,ConnectTiming,ResponseStartTiming,ResponseEndTiming,FetchTiming,SocialSourceNetworkID,SocialSourcePage,ParamPrice,ParamOrderID,ParamCurrency,ParamCurrencyID,OpenstatServiceName,OpenstatCampaignID,OpenstatAdID,OpenstatSourceID,UTMSource,UTMMedium,UTMCampaign,UTMContent,UTMTerm,FromTag,HasGCLID,RefererHash,URLHash,CLID" \ + http://localhost:8030/api/hits/hits/_stream_load + +rm -f hits.tsv hits.tsv.gz +sync diff --git a/selectdb/queries.sql b/velodb/queries.sql similarity index 100% rename from selectdb/queries.sql rename to velodb/queries.sql diff --git a/velodb/query b/velodb/query new file mode 100755 index 0000000000..289dd078ef --- /dev/null +++ b/velodb/query @@ -0,0 +1,33 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via mysql client against SelectDB's `hits` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Per-query BE cache flush (matches original run.sh behavior). +curl -sS http://127.0.0.1:8040/api/clear_cache/all >/dev/null 2>&1 || true + +out=$(mysql -vvv -h127.0.0.1 -P9030 -uroot hits -e "$query" 2>&1) || status=$? +status=${status:-0} + +printf '%s\n' "$out" | grep -vP '^\([0-9.]+\s+sec\)$|rows? 
in set|Empty set' + +if [ "$status" -ne 0 ] || printf '%s\n' "$out" | grep -qE '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +secs=$(printf '%s\n' "$out" \ + | grep -oP '\((?:([0-9.]+)\s+min\s+)?([0-9.]+)\s+sec\)' \ + | tail -n1 \ + | sed -r 's/\((([0-9.]+) min )?([0-9.]+) sec\)/\2 \3/' \ + | awk '{ if ($2 != "") print $1*60 + $2; else print $1 }') + +if [ -z "$secs" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi +printf '%s\n' "$secs" >&2 diff --git a/selectdb/results/20221006/c6a.4xlarge.json b/velodb/results/20221006/c6a.4xlarge.json similarity index 98% rename from selectdb/results/20221006/c6a.4xlarge.json rename to velodb/results/20221006/c6a.4xlarge.json index 1c83d299cd..dba56d99c2 100644 --- a/selectdb/results/20221006/c6a.4xlarge.json +++ b/velodb/results/20221006/c6a.4xlarge.json @@ -1,5 +1,5 @@ { - "system": "SelectDB", + "system": "VeloDB", "date": "2022-10-06", "machine": "c6a.4xlarge", "cluster_size": 1, @@ -231,5 +231,6 @@ 0.03, 0.03 ] - ] + ], + "comment": "" } diff --git a/selectdb/results/20221008/c6a.metal.json b/velodb/results/20221008/c6a.metal.json similarity index 99% rename from selectdb/results/20221008/c6a.metal.json rename to velodb/results/20221008/c6a.metal.json index 8fc03e6e84..7f3cbb2605 100644 --- a/selectdb/results/20221008/c6a.metal.json +++ b/velodb/results/20221008/c6a.metal.json @@ -1,5 +1,5 @@ { - "system": "SelectDB", + "system": "VeloDB", "date": "2022-10-08", "machine": "c6a.metal", "cluster_size": 1, diff --git a/selectdb/results/20221018/c5.4xlarge.json b/velodb/results/20221018/c5.4xlarge.json similarity index 97% rename from selectdb/results/20221018/c5.4xlarge.json rename to velodb/results/20221018/c5.4xlarge.json index 102c85702d..3b21c85a2b 100644 --- a/selectdb/results/20221018/c5.4xlarge.json +++ b/velodb/results/20221018/c5.4xlarge.json @@ -1,5 +1,5 @@ { - "system": "SelectDB", + "system": "VeloDB", "date": "2022-10-18", "machine": "c5.4xlarge", "cluster_size": 1, @@ -7,8 +7,7 @@ "tags": [ "C++", "column-oriented", - "MySQL compatible", - "historical" + "MySQL compatible" ], "load_time": 520, "data_size": 17122903939, diff --git a/selectdb/results/20221102/c5.4xlarge.json b/velodb/results/20221102/c5.4xlarge.json similarity index 97% rename from selectdb/results/20221102/c5.4xlarge.json rename to velodb/results/20221102/c5.4xlarge.json index e7a7b6f4d2..2725914739 100644 --- a/selectdb/results/20221102/c5.4xlarge.json +++ b/velodb/results/20221102/c5.4xlarge.json @@ -1,5 +1,5 @@ { - "system": "SelectDB", + "system": "VeloDB", "date": "2022-11-02", "machine": "c5.4xlarge", "cluster_size": 1, @@ -7,8 +7,7 @@ "tags": [ "C++", "column-oriented", - "MySQL compatible", - "historical" + "MySQL compatible" ], "load_time": 526, "data_size": 17122903966, diff --git a/selectdb/results/20221102/c6a.4xlarge.json b/velodb/results/20221102/c6a.4xlarge.json similarity index 98% rename from selectdb/results/20221102/c6a.4xlarge.json rename to velodb/results/20221102/c6a.4xlarge.json index 0505f10693..938c71a741 100644 --- a/selectdb/results/20221102/c6a.4xlarge.json +++ b/velodb/results/20221102/c6a.4xlarge.json @@ -1,5 +1,5 @@ { - "system": "SelectDB", + "system": "VeloDB", "date": "2022-11-02", "machine": "c6a.4xlarge", "cluster_size": 1, @@ -231,5 +231,6 @@ 0.03, 0.02 ] - ] + ], + "comment": "" } diff --git a/selectdb/results/20240426/c6a.metal.json b/velodb/results/20240426/c6a.metal.json similarity index 99% rename from selectdb/results/20240426/c6a.metal.json rename to velodb/results/20240426/c6a.metal.json index 
b33616501b..62efa3e2d4 100644 --- a/selectdb/results/20240426/c6a.metal.json +++ b/velodb/results/20240426/c6a.metal.json @@ -1,5 +1,5 @@ { - "system": "SelectDB", + "system": "VeloDB", "date": "2024-04-26", "machine": "c6a.metal", "cluster_size": 1, diff --git a/selectdb/results/20240618/c6a.metal.json b/velodb/results/20240618/c6a.metal.json similarity index 99% rename from selectdb/results/20240618/c6a.metal.json rename to velodb/results/20240618/c6a.metal.json index 257b479d7a..3f77b940f4 100644 --- a/selectdb/results/20240618/c6a.metal.json +++ b/velodb/results/20240618/c6a.metal.json @@ -1,5 +1,5 @@ { - "system": "SelectDB", + "system": "VeloDB", "date": "2024-06-18", "machine": "c6a.metal", "cluster_size": 1, diff --git a/velodb/results/20240919/c6a.metal.json b/velodb/results/20240919/c6a.metal.json new file mode 100644 index 0000000000..122bd7c06a --- /dev/null +++ b/velodb/results/20240919/c6a.metal.json @@ -0,0 +1,236 @@ +{ + "system": "VeloDB", + "date": "2024-09-19", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "comment": "", + "tags": [ + "C++", + "column-oriented", + "MySQL compatible", + "ClickHouse derivative", + "lukewarm-cold-run" + ], + "load_time": 459, + "data_size": 17365253189, + "result": [ + [ + 0.09, + 0.03, + 0.02 + ], + [ + 0.13, + 0.02, + 0.03 + ], + [ + 1.12, + 0.04, + 0.04 + ], + [ + 1.7, + 0.04, + 0.04 + ], + [ + 1.68, + 0.17, + 0.16 + ], + [ + 1.4, + 0.21, + 0.2 + ], + [ + 0.05, + 0.02, + 0.02 + ], + [ + 0.18, + 0.03, + 0.03 + ], + [ + 2.68, + 0.22, + 0.21 + ], + [ + 3.57, + 0.25, + 0.25 + ], + [ + 2.34, + 0.08, + 0.08 + ], + [ + 3.1, + 0.08, + 0.08 + ], + [ + 2.22, + 0.23, + 0.2 + ], + [ + 3.76, + 0.29, + 0.26 + ], + [ + 2.17, + 0.25, + 0.23 + ], + [ + 1.48, + 0.12, + 0.12 + ], + [ + 3.86, + 0.31, + 0.27 + ], + [ + 2.82, + 0.36, + 0.09 + ], + [ + 4.89, + 0.48, + 0.49 + ], + [ + 0.04, + 0.01, + 0.02 + ], + [ + 11.32, + 0.24, + 0.11 + ], + [ + 13.35, + 0.12, + 0.06 + ], + [ + 25.55, + 0.17, + 0.09 + ], + [ + 7.56, + 0.06, + 0.07 + ], + [ + 2.64, + 0.09, + 0.07 + ], + [ + 2.26, + 0.05, + 0.06 + ], + [ + 3.0, + 0.1, + 0.07 + ], + [ + 11.58, + 0.24, + 0.22 + ], + [ + 9.47, + 0.97, + 0.93 + ], + [ + 0.5, + 0.05, + 0.03 + ], + [ + 5.03, + 0.14, + 0.12 + ], + [ + 6.98, + 0.17, + 0.15 + ], + [ + 4.97, + 0.91, + 0.83 + ], + [ + 11.65, + 1.06, + 1.01 + ], + [ + 11.62, + 1.03, + 0.99 + ], + [ + 0.89, + 0.18, + 0.16 + ], + [ + 1.74, + 0.04, + 0.03 + ], + [ + 1.97, + 0.03, + 0.03 + ], + [ + 2.13, + 0.03, + 0.02 + ], + [ + 2.25, + 0.08, + 0.08 + ], + [ + 1.42, + 0.03, + 0.03 + ], + [ + 1.82, + 0.02, + 0.03 + ], + [ + 1.43, + 0.02, + 0.03 + ] + ] +} diff --git a/velodb/results/20250710/c6a.2xlarge.json b/velodb/results/20250710/c6a.2xlarge.json new file mode 100644 index 0000000000..be31a4466a --- /dev/null +++ b/velodb/results/20250710/c6a.2xlarge.json @@ -0,0 +1,236 @@ +{ + "system": "VeloDB", + "date": "2025-07-10", + "machine": "c6a.2xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "MySQL compatible", + "ClickHouse derivative", + "lukewarm-cold-run" + ], + "load_time": 716, + "data_size": 16402424021, + "result": [ + [ + 2.16, + 0.25, + 0.24 + ], + [ + 5.12, + 0.49, + 0.38 + ], + [ + 5.18, + 0.53, + 0.47 + ], + [ + 6.91, + 0.47, + 0.52 + ], + [ + 7.06, + 1.51, + 1.58 + ], + [ + 6.38, + 3.03, + 2.26 + ], + [ + 2.18, + 0.22, + 0.26 + ], + [ + 5.13, + 0.44, + 0.48 + ], + [ + 8.99, + 1.36, + 1.39 + ], + [ + 11.61, + 1.5, + 
1.47 + ], + [ + 10.13, + 0.56, + 0.57 + ], + [ + 10.87, + 0.57, + 0.54 + ], + [ + 8.5, + 1.39, + 1.44 + ], + [ + 13.08, + 3.21, + 2.4 + ], + [ + 9.9, + 1.82, + 1.83 + ], + [ + 6.93, + 1.15, + 1.25 + ], + [ + 10.61, + 4.68, + null + ], + [ + 9.49, + 1.18, + 1.06 + ], + [ + null, + null, + null + ], + [ + 0.15, + 0.01, + 0.01 + ], + [ + 17.28, + 16.23, + 15.97 + ], + [ + 20.12, + 19.12, + 18.7 + ], + [ + 31.4, + 30.41, + 29.79 + ], + [ + 9.53, + 0.75, + 0.7 + ], + [ + 3.24, + 0.39, + 0.39 + ], + [ + 8.46, + 0.78, + 0.76 + ], + [ + 3.16, + 0.4, + 0.38 + ], + [ + 17.78, + 16.73, + 15.63 + ], + [ + 20.1, + null, + null + ], + [ + 3.73, + 0.46, + 0.37 + ], + [ + 15.16, + 1.17, + 1.17 + ], + [ + 18.43, + 3.18, + 1.57 + ], + [ + null, + null, + null + ], + [ + null, + null, + null + ], + [ + null, + null, + null + ], + [ + 5.44, + 1.27, + 1.28 + ], + [ + 8.83, + 0.68, + 0.57 + ], + [ + 11.02, + 0.72, + 0.72 + ], + [ + 3.55, + 0.45, + 0.46 + ], + [ + 4.25, + 0.68, + 0.66 + ], + [ + 3.25, + 0.39, + 0.43 + ], + [ + 3.37, + 0.42, + 0.39 + ], + [ + 3.11, + 0.43, + 0.42 + ] + ], + "comment": "" +} diff --git a/velodb/results/20250710/c6a.4xlarge.json b/velodb/results/20250710/c6a.4xlarge.json new file mode 100644 index 0000000000..15e40cff2a --- /dev/null +++ b/velodb/results/20250710/c6a.4xlarge.json @@ -0,0 +1,236 @@ +{ + "system": "VeloDB", + "date": "2025-07-10", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "MySQL compatible", + "ClickHouse derivative", + "lukewarm-cold-run" + ], + "load_time": 487, + "data_size": 17103182575, + "result": [ + [ + 0.1, + 0.04, + 0.04 + ], + [ + 1.36, + 0.04, + 0.04 + ], + [ + 2.06, + 0.06, + 0.07 + ], + [ + 2.33, + 0.09, + 0.1 + ], + [ + 2.29, + 0.72, + 0.68 + ], + [ + 2.19, + 0.82, + 0.81 + ], + [ + 0.07, + 0.01, + 0.01 + ], + [ + 1.39, + 0.04, + 0.04 + ], + [ + 4, + 0.62, + 0.63 + ], + [ + 5.76, + 0.7, + 0.71 + ], + [ + 3.71, + 0.1, + 0.11 + ], + [ + 4.3, + 0.13, + 0.12 + ], + [ + 2.98, + 0.62, + 0.63 + ], + [ + 5.22, + 1.02, + 0.94 + ], + [ + 4.05, + 1.03, + 1.03 + ], + [ + 2.21, + 0.59, + 0.49 + ], + [ + 4.67, + 1.41, + 1.46 + ], + [ + 4.46, + 0.39, + 0.39 + ], + [ + 6.28, + 2.61, + 2.58 + ], + [ + 0.05, + 0.01, + 0.01 + ], + [ + 11.96, + 0.94, + 0.91 + ], + [ + 14.51, + 0.79, + 0.74 + ], + [ + 26.81, + 1.56, + 1.55 + ], + [ + 6.1, + 0.37, + 0.16 + ], + [ + 1.85, + 0.11, + 0.13 + ], + [ + 3.26, + 0.23, + 0.21 + ], + [ + 1.74, + 0.16, + 0.21 + ], + [ + 12.19, + 1.5, + 1.51 + ], + [ + 11.1, + 8.8, + 8.76 + ], + [ + 1.31, + 0.06, + 0.07 + ], + [ + 7.01, + 0.41, + 0.39 + ], + [ + 9.01, + 0.51, + 0.52 + ], + [ + 6.53, + 3.41, + 3.42 + ], + [ + 14.29, + 13.98, + 12.62 + ], + [ + 14.27, + null, + 12.58 + ], + [ + 2.14, + 0.63, + 0.62 + ], + [ + 2.46, + 0.08, + 0.08 + ], + [ + 2.44, + 0.05, + 0.05 + ], + [ + 1.89, + 0.04, + 0.04 + ], + [ + 2.53, + 0.26, + 0.23 + ], + [ + 1.43, + 0.03, + 0.03 + ], + [ + 1.84, + 0.03, + 0.03 + ], + [ + 1.45, + 0.03, + 0.04 + ] + ], + "comment": "" +} diff --git a/velodb/results/20250830/c7a.metal-48xl.json b/velodb/results/20250830/c7a.metal-48xl.json new file mode 100644 index 0000000000..338613dd96 --- /dev/null +++ b/velodb/results/20250830/c7a.metal-48xl.json @@ -0,0 +1,236 @@ +{ + "system": "VeloDB", + "date": "2025-08-30", + "machine": "c7a.metal-48xl", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "MySQL compatible", + "ClickHouse derivative", + 
"lukewarm-cold-run" + ], + "load_time": 364, + "data_size": 17361427624, + "result": [ + [ + 0.04, + 0.02, + 0.01 + ], + [ + 0.36, + 0.02, + 0.02 + ], + [ + 1.27, + 0.02, + 0.02 + ], + [ + 1.7, + 0.03, + 0.03 + ], + [ + 1.63, + 0.09, + 0.12 + ], + [ + 1.47, + 0.15, + 0.13 + ], + [ + 0.03, + 0.02, + 0.01 + ], + [ + 0.4, + 0.03, + 0.02 + ], + [ + 2.76, + 0.19, + 0.18 + ], + [ + 3.81, + 0.21, + 0.21 + ], + [ + 2.42, + 0.07, + 0.07 + ], + [ + 3.07, + 0.07, + 0.07 + ], + [ + 1.92, + 0.14, + 0.14 + ], + [ + 3.6, + 0.2, + 0.19 + ], + [ + 2.3, + 0.18, + 0.17 + ], + [ + 1.48, + 0.1, + 0.1 + ], + [ + 3.52, + 0.23, + 0.22 + ], + [ + 2.74, + 0.12, + 0.05 + ], + [ + 4.72, + 0.36, + 0.37 + ], + [ + 0.06, + 0.01, + 0.01 + ], + [ + 11.14, + 0.09, + 0.05 + ], + [ + 12.93, + 0.06, + 0.05 + ], + [ + 24.79, + 0.26, + 0.08 + ], + [ + 7.33, + 0.05, + 0.72 + ], + [ + 2.68, + 0.22, + 0.14 + ], + [ + 2.03, + 0.05, + 0.05 + ], + [ + 2.8, + 0.31, + 0.16 + ], + [ + 11.37, + 0.25, + 0.12 + ], + [ + 9.24, + 0.69, + 0.67 + ], + [ + 0.51, + 0.03, + 0.03 + ], + [ + 5.14, + 0.09, + 0.08 + ], + [ + 6.77, + 0.11, + 0.11 + ], + [ + 4.88, + 0.69, + 0.61 + ], + [ + 11.38, + 0.69, + 0.64 + ], + [ + 11.38, + 0.67, + 0.64 + ], + [ + 0.87, + 0.14, + 0.14 + ], + [ + 1.85, + 0.02, + 0.02 + ], + [ + 2.11, + 0.02, + 0.02 + ], + [ + 2.01, + 0.03, + 0.02 + ], + [ + 2.61, + 0.06, + 0.06 + ], + [ + 1.47, + 0.02, + 0.03 + ], + [ + 1.9, + 0.03, + 0.03 + ], + [ + 1.37, + 0.03, + 0.02 + ] + ], + "comment": "" +} diff --git a/velodb/results/20260509/c6a.4xlarge.json b/velodb/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..0edb10050e --- /dev/null +++ b/velodb/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "VeloDB", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","MySQL compatible","ClickHouse derivative","lukewarm-cold-run"], + "load_time": 523, + "data_size": 13720529881, + "result": [ + [0.45, 0.06, 0.04], + [1.21, 0.06, 0.06], + [1.55, 0.11, 0.11], + [1.92, 0.19, 0.19], + [1.98, 0.36, 0.42], + [2.43, 0.94, 0.92], + [0.4, 0.05, 0.03], + [1.27, 0.08, 0.07], + [3.66, 0.69, 0.7], + [5.13, 0.81, 0.8], + [3.34, 0.26, 0.27], + [3.56, 0.3, 0.28], + [2.99, 0.72, 0.7], + [5.2, 1.26, 1.22], + [3.97, 1.13, 1.09], + [2.04, 0.57, 0.62], + [4.74, 1.58, 1.6], + [4.66, 0.4, 0.41], + [6.96, 2.62, 2.77], + [0.25, 0.06, 0.03], + [8.51, 1.08, 1.07], + [10.71, 1.07, 1.07], + [21.93, 2.06, 2.05], + [9.19, 1.18, 1.16], + [4.84, 0.43, 0.43], + [2.87, 0.4, 0.39], + [4.84, 0.42, 0.44], + [8.68, 1.65, 1.61], + [10, 9.48, 9.48], + [1.24, 0.16, 0.16], + [7.04, 0.64, 0.64], + [9.45, 0.79, 0.78], + [6.68, 3.78, 3.8], + [10.82, 5, 5], + [10.8, 5.05, 5.15], + [1.81, 0.63, 0.65], + [2.1, 0.13, 0.12], + [1.95, 0.1, 0.09], + [1.55, 0.09, 0.09], + [1.86, 0.25, 0.23], + [1.41, 0.08, 0.1], + [1.62, 0.11, 0.07], + [1.33, 0.08, 0.07] +] +} + diff --git a/velodb/results/20260509/c6a.metal.json b/velodb/results/20260509/c6a.metal.json new file mode 100644 index 0000000000..001f142458 --- /dev/null +++ b/velodb/results/20260509/c6a.metal.json @@ -0,0 +1,58 @@ +{ + "system": "VeloDB", + "date": "2026-05-09", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["C++","column-oriented","MySQL compatible","ClickHouse derivative","lukewarm-cold-run"], + "load_time": 463, + "data_size": 13734499390, + "result": [ + [0.31, 0.05, 0.04], + [0.63, 0.05, 0.04], + [1.03, 
0.09, 0.08], + [1.32, 0.1, 0.09], + [1.41, 0.1, 0.1], + [1.67, 0.27, 0.22], + [0.35, 0.05, 0.04], + [0.73, 0.06, 0.06], + [3.03, 0.28, 0.28], + [4.26, 0.34, 0.3], + [2.53, 0.16, 0.14], + [3.01, 0.15, 0.15], + [2.34, 0.24, 0.24], + [4.1, 0.34, 0.33], + [2.63, 0.29, 0.29], + [1.36, 0.16, 0.15], + [3.6, 0.33, 0.32], + [3.61, 0.13, 0.14], + [5.34, 0.53, 0.49], + [0.36, 0.02, 0.03], + [8.5, 0.34, 0.35], + [9.56, 0.38, 0.36], + [19.95, 0.72, 0.7], + [9.53, 0.45, 0.43], + [3.64, 0.11, 0.11], + [2.09, 0.1, 0.1], + [3.64, 0.13, 0.12], + [8.72, 0.46, 0.45], + [7.16, 1.02, 0.99], + [0.91, 0.13, 0.12], + [5.39, 0.19, 0.18], + [7.45, 0.2, 0.19], + [5.16, 0.71, 0.73], + [8.9, 1.11, 1.07], + [8.9, 1.15, 1.12], + [1.25, 0.19, 0.2], + [1.91, 0.07, 0.07], + [1.82, 0.09, 0.06], + [1.67, 0.07, 0.06], + [1.98, 0.1, 0.1], + [1.58, 0.07, 0.05], + [1.73, 0.07, 0.06], + [1.3, 0.07, 0.06] +] +} + diff --git a/velodb/start b/velodb/start new file mode 100755 index 0000000000..10054a0fec --- /dev/null +++ b/velodb/start @@ -0,0 +1,45 @@ +#!/bin/bash +set -e + +DORIS_HOME=$(cat .doris_home) +export DORIS_HOME +export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" +export PATH=$JAVA_HOME/bin:$PATH + +if mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +ulimit -n 65535 + +"$DORIS_HOME/fe/bin/start_fe.sh" --daemon +"$DORIS_HOME/be/bin/start_be.sh" --daemon + +for _ in $(seq 1 300); do + fe_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show frontends' 2>/dev/null | cut -f16 | sed -n '2,$p') + if [ -n "$fe_version" ] && [ "$fe_version" != "NULL" ]; then + break + fi + sleep 2 +done + +mysql -h127.0.0.1 -P9030 -uroot \ + -e "ALTER SYSTEM ADD BACKEND '127.0.0.1:9050'" 2>/dev/null || true + +for _ in $(seq 1 300); do + be_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show backends' 2>/dev/null | cut -f22 | sed -n '2,$p') + if [ -n "$be_version" ]; then + break + fi + sleep 2 +done + +# Belt-and-suspenders against SelectDB's session-level result/SQL cache, +# in case the FE config we wrote at install time isn't honored. Setting +# these GLOBAL flips the default for any new connection (./query opens a +# fresh one each invocation). +mysql -h127.0.0.1 -P9030 -uroot -e " + SET GLOBAL enable_sql_cache = false; + SET GLOBAL enable_partition_cache = false; + SET GLOBAL enable_query_cache = false; +" 2>/dev/null || true diff --git a/velodb/stop b/velodb/stop new file mode 100755 index 0000000000..4d724b5d34 --- /dev/null +++ b/velodb/stop @@ -0,0 +1,6 @@ +#!/bin/bash + +DORIS_HOME=$(cat .doris_home 2>/dev/null) || exit 0 +"$DORIS_HOME/fe/bin/stop_fe.sh" 2>/dev/null || true +"$DORIS_HOME/be/bin/stop_be.sh" 2>/dev/null || true +exit 0 diff --git a/selectdb/template.json b/velodb/template.json similarity index 88% rename from selectdb/template.json rename to velodb/template.json index 674d5ef5a4..21e98e5c24 100644 --- a/selectdb/template.json +++ b/velodb/template.json @@ -1,5 +1,5 @@ { - "system": "SelectDB", + "system": "VeloDB", "proprietary": "no", "hardware": "cpu", "tuned": "no", diff --git a/vertica/README.md b/vertica/README.md index 0adab4f41f..ce64b5bf5b 100644 --- a/vertica/README.md +++ b/vertica/README.md @@ -3,3 +3,11 @@ Although Vertica EULA does not prevent doing benchmarks, it restricts from discl > You may not disclose to any third-party performance information or analysis (including, without limitation, benchmarks and performance tests) from any source relating to the Software. 
https://www.vertica.com/end-user-license-agreement-ce-version/ + +## Dead (May 2026) + +`docker pull vertica/vertica-ce` returns + + Error response from daemon: pull access denied for vertica/vertica-ce, repository does not exist or may require 'docker login' + +The Community Edition image is gone and there is no public replacement. The directory and historical results are kept; nothing here runs anymore. diff --git a/vertica/benchmark.sh b/vertica/benchmark.sh index 83b54d69f7..531bd65038 100755 --- a/vertica/benchmark.sh +++ b/vertica/benchmark.sh @@ -1,27 +1,5 @@ #!/bin/bash - -sudo apt-get update -y -sudo apt-get install -y docker.io - -sudo docker run -p 5433:5433 -p 5444:5444 --volume $(pwd):/workdir --mount type=volume,source=vertica-data,target=/data --name vertica_ce vertica/vertica-ce - -sudo docker exec vertica_ce /opt/vertica/bin/vsql -U dbadmin -c "$(cat create.sql)" - -../download-hits-tsv - -echo -n "Load time: " -command time -f '%e' sudo docker exec vertica_ce /opt/vertica/bin/vsql -U dbadmin -c "COPY hits FROM LOCAL '/workdir/hits.tsv' DELIMITER E'\\t' NULL E'\\001' DIRECT" - -echo -n "Data size: " -sudo docker exec vertica_ce du -bcs /data/vertica/VMart | grep total - -./run.sh 2>&1 | tee log.txt - -# If you run the script on your own, you may get numbers like this: -# 200m00.000s -# 25000000000 - -# Note: the real numbers cannot be published. - -grep -F 'All rows formatted' logs.txt | sed -r -e 's/^.* ([0-9.]+) ms$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/vertica/check b/vertica/check new file mode 100755 index 0000000000..d94fc908f3 --- /dev/null +++ b/vertica/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo docker exec vertica_ce /opt/vertica/bin/vsql -U dbadmin -c 'SELECT 1' >/dev/null 2>&1 diff --git a/vertica/data-size b/vertica/data-size new file mode 100755 index 0000000000..39189fa470 --- /dev/null +++ b/vertica/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +sudo docker exec vertica_ce du -bcs /data/vertica/VMart | grep total | awk '{print $1}' diff --git a/vertica/install b/vertica/install new file mode 100755 index 0000000000..73be6d2ad1 --- /dev/null +++ b/vertica/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io + +sudo docker pull vertica/vertica-ce + +# Create container only if missing. +if ! sudo docker inspect vertica_ce >/dev/null 2>&1; then + sudo docker run -d -p 5433:5433 -p 5444:5444 \ + --volume "$(pwd):/workdir" \ + --mount type=volume,source=vertica-data,target=/data \ + --name vertica_ce vertica/vertica-ce +fi diff --git a/vertica/load b/vertica/load new file mode 100755 index 0000000000..4bd5c8c251 --- /dev/null +++ b/vertica/load @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +VSQL="sudo docker exec vertica_ce /opt/vertica/bin/vsql -U dbadmin" + +$VSQL -c "DROP TABLE IF EXISTS hits CASCADE;" +$VSQL -c "$(cat create.sql)" + +$VSQL -c "COPY hits FROM LOCAL '/workdir/hits.tsv' DELIMITER E'\t' NULL E'\001' DIRECT" + +rm -f hits.tsv +sync diff --git a/vertica/query b/vertica/query new file mode 100755 index 0000000000..9aa777075d --- /dev/null +++ b/vertica/query @@ -0,0 +1,36 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via vsql inside the vertica_ce container. +# Stdout: query result. 
+# Stderr: query runtime in fractional seconds on the last line (parsed from +# vsql's `\timing` "All rows formatted: ms" output). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(sudo docker exec vertica_ce /opt/vertica/bin/vsql -U dbadmin \ + -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|^ROLLBACK'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +# Pass through the result, then parse the timing footer. +printf '%s\n' "$raw" + +ms=$(printf '%s\n' "$raw" \ + | grep -oP 'All rows formatted:\s*\K[0-9.]+(?=\s*ms)' \ + | tail -n1) + +if [ -z "$ms" ]; then + # Fallback: vsql also prints "Time: ms" on \timing. + ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+(?=\s*ms)' | tail -n1) +fi + +if [ -z "$ms" ]; then + echo "no timing in vsql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/vertica/run.sh b/vertica/run.sh deleted file mode 100755 index 138e0c8d71..0000000000 --- a/vertica/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - sudo docker exec vertica_ce /opt/vertica/bin/vsql -U dbadmin -c '\timing' -c "$query" - done; -done; diff --git a/vertica/start b/vertica/start new file mode 100755 index 0000000000..72362a897c --- /dev/null +++ b/vertica/start @@ -0,0 +1,10 @@ +#!/bin/bash +set -eu + +if sudo docker inspect -f '{{.State.Running}}' vertica_ce 2>/dev/null | grep -q true; then + if sudo docker exec vertica_ce /opt/vertica/bin/vsql -U dbadmin -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 + fi +fi + +sudo docker start vertica_ce diff --git a/vertica/stop b/vertica/stop new file mode 100755 index 0000000000..4bf245e4f2 --- /dev/null +++ b/vertica/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo docker stop vertica_ce >/dev/null 2>&1 || true diff --git a/victorialogs/benchmark.sh b/victorialogs/benchmark.sh index c089752a41..d087cc65a7 100755 --- a/victorialogs/benchmark.sh +++ b/victorialogs/benchmark.sh @@ -1,42 +1,8 @@ #!/bin/bash - -# Install - -RELEASE_VERSION=v1.10.1-victorialogs - -# Stop the existing victorialogs instance if any and drop its data -for _ in {1..300} -do - pidof victoria-logs-prod && kill `pidof victoria-logs-prod` || break - sleep 1 -done -rm -rf victoria-logs-data - -# Download and start victorialogs -wget --continue --progress=dot:giga https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${RELEASE_VERSION}/victoria-logs-linux-$(dpkg --print-architecture)-${RELEASE_VERSION}.tar.gz -tar xzf victoria-logs-linux-$(dpkg --print-architecture)-${RELEASE_VERSION}.tar.gz -./victoria-logs-prod -loggerOutput=stdout -retentionPeriod=20y -search.maxQueryDuration=5m > server.log & - -for _ in {1..300} -do - curl -s http://localhost:9428/select/logsql/query -d 'query=_time:2100-01-01Z' && break - sleep 1 -done - -# Load the data - -wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/hits.json.gz -gunzip hits.json.gz -echo -n "Load time: " -command time -f '%e' cat hits.json | split -n r/8 -d --filter="curl -sS -T - -X POST 'http://localhost:9428/insert/jsonline?_time_field=EventTime&_stream_fields=AdvEngineID,CounterID'" - -# Run the queries - -./run.sh - -# Determine on-disk size of the ingested data - -echo -n "Data size: " -du -sb victoria-logs-data - -sudo killall victoria-logs-prod +# Thin shim — actual flow is in 
lib/benchmark-common.sh. +# victorialogs ingests gzipped NDJSON; ./load fetches it directly. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +# queries are LogsQL, not SQL. +export BENCH_QUERIES_FILE="queries.logsql" +exec ../lib/benchmark-common.sh diff --git a/victorialogs/check b/victorialogs/check new file mode 100755 index 0000000000..3db46b7d31 --- /dev/null +++ b/victorialogs/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +curl -sSf --get \ + --data-urlencode 'query=_time:2100-01-01Z' \ + 'http://localhost:9428/select/logsql/query' >/dev/null diff --git a/victorialogs/data-size b/victorialogs/data-size new file mode 100755 index 0000000000..d31cee6752 --- /dev/null +++ b/victorialogs/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +du -sb victoria-logs-data | awk '{print $1}' diff --git a/victorialogs/install b/victorialogs/install new file mode 100755 index 0000000000..1a9aa4cd52 --- /dev/null +++ b/victorialogs/install @@ -0,0 +1,11 @@ +#!/bin/bash +set -eu + +RELEASE_VERSION=${VICTORIALOGS_VERSION:-v1.10.1-victorialogs} + +if [ ! -x ./victoria-logs-prod ]; then + arch=$(dpkg --print-architecture) + wget --continue --progress=dot:giga \ + "https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${RELEASE_VERSION}/victoria-logs-linux-${arch}-${RELEASE_VERSION}.tar.gz" + tar xzf "victoria-logs-linux-${arch}-${RELEASE_VERSION}.tar.gz" +fi diff --git a/victorialogs/load b/victorialogs/load new file mode 100755 index 0000000000..259f72efc7 --- /dev/null +++ b/victorialogs/load @@ -0,0 +1,22 @@ +#!/bin/bash +set -eu + +# Idempotent: blow away any prior data. +./stop +rm -rf victoria-logs-data +./start +# Wait for it to come up. +for _ in {1..300}; do + ./check >/dev/null 2>&1 && break + sleep 1 +done + +wget --continue --progress=dot:giga \ + 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' +gunzip -f hits.json.gz + +# Bulk insert via 8 parallel jsonline streams. +cat hits.json | split -n r/8 -d --filter="curl -sS -T - -X POST 'http://localhost:9428/insert/jsonline?_time_field=EventTime&_stream_fields=AdvEngineID,CounterID'" + +rm -f hits.json +sync diff --git a/victorialogs/query b/victorialogs/query new file mode 100755 index 0000000000..2d2581a27d --- /dev/null +++ b/victorialogs/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a LogsQL query from stdin, runs it via victorialogs HTTP API. +# Stdout: query result (NDJSON). +# Stderr: query runtime in fractional seconds on the last line (wall-clock). +# Exit non-zero on error. +set -e + +query=$(cat) + +t1=$(date +%s%3N) +out=$(curl -sS --fail --get --data-urlencode "query=$query" \ + 'http://localhost:9428/select/logsql/query') && exit_code=0 || exit_code=$? 
+t2=$(date +%s%3N) + +if [ "$exit_code" -ne 0 ]; then + printf '%s\n' "$out" >&2 + exit "$exit_code" +fi + +printf '%s\n' "$out" + +duration=$((t2 - t1)) +awk -v d="$duration" 'BEGIN { printf "%.3f\n", d / 1000 }' >&2 diff --git a/victorialogs/results/20260509/c6a.4xlarge.json b/victorialogs/results/20260509/c6a.4xlarge.json new file mode 100644 index 0000000000..408d1e66fb --- /dev/null +++ b/victorialogs/results/20260509/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "VictoriaLogs", + "date": "2026-05-09", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Go","column-oriented","lukewarm-cold-run"], + "load_time": 2262, + "data_size": 16966890984, + "result": [ + [0.063, 0.012, 0.012], + [0.123, 0.012, 0.01], + [1.54, 0.199, 0.203], + [2.228, 0.227, 0.232], + [2.762, 2.339, 1.64], + [2.93, 1.165, 1.168], + [0.064, 0.036, 0.039], + [0.151, 0.014, 0.012], + [4.378, 2.25, 2.241], + [4.115, 3.032, 2.971], + [0.583, 0.364, 0.366], + [0.668, 0.44, 0.457], + [3.163, 1.429, 1.229], + [4.363, 2.94, 2.725], + [2.193, 1.516, 1.412], + [4.506, 2.82, 2.539], + [8.817, 5.791, 5.363], + [9.148, 6.733, 6.199], + [19.062, 13.692, 12.251], + [0.192, 0.023, 0.023], + [8.37, 0.877, 0.866], + [0.916, 0.854, 0.86], + [0.045, 0.021, 0.021], + [8.37, 0.87, 0.859], + [2.887, 0.531, 0.52], + [0.464, 0.428, 0.429], + [0.646, 0.56, 0.555], + [7.828, 1.102, 1.09], + [13.037, 12.182, 12.147], + [9.534, 9.482, 9.508], + [4.038, 2.841, 2.926], + [5.37, 4.03, 3.857], + [null, null, null], + [12.219, 7.205, 6.237], + [9.816, 7.133, 6.811], + [7.685, 5.743, 5.651], + [0.17, 0.099, 0.077], + [0.062, 0.038, 0.034], + [0.081, 0.034, 0.041], + [0.312, 0.198, 0.208], + [0.055, 0.035, 0.041], + [0.041, 0.031, 0.029], + [0.042, 0.034, 0.028] +] +} + diff --git a/victorialogs/run.sh b/victorialogs/run.sh deleted file mode 100755 index 36fafb7249..0000000000 --- a/victorialogs/run.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -TRIES=3 - -set -f -cat queries.logsql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - t1=$(date +%s%3N) - curl -s --fail http://localhost:9428/select/logsql/query --data-urlencode "query=$query" > /dev/null - exit_code=$? - t2=$(date +%s%3N) - duration=$((t2-t1)) - RES=$(awk "BEGIN {print $duration / 1000}" | tr ',' '.') - [[ "$exit_code" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - done - echo "]," -done diff --git a/victorialogs/start b/victorialogs/start new file mode 100755 index 0000000000..c76523e283 --- /dev/null +++ b/victorialogs/start @@ -0,0 +1,14 @@ +#!/bin/bash +set -eu + +# Idempotent: if already serving, do nothing. +if curl -sSf --get \ + --data-urlencode 'query=_time:2100-01-01Z' \ + 'http://localhost:9428/select/logsql/query' >/dev/null 2>&1; then + exit 0 +fi + +# Detach from this script so it doesn't keep the daemon as a child. 
+nohup ./victoria-logs-prod -loggerOutput=stdout -retentionPeriod=20y \ + -search.maxQueryDuration=5m > server.log 2>&1 & +disown diff --git a/victorialogs/stop b/victorialogs/stop new file mode 100755 index 0000000000..61a1dd8637 --- /dev/null +++ b/victorialogs/stop @@ -0,0 +1,11 @@ +#!/bin/bash + +pid=$(pidof victoria-logs-prod 2>/dev/null || true) +if [ -n "$pid" ]; then + kill $pid 2>/dev/null || true + for _ in $(seq 1 30); do + pidof victoria-logs-prod >/dev/null 2>&1 || exit 0 + sleep 1 + done + sudo killall -9 victoria-logs-prod 2>/dev/null || true +fi diff --git a/ydb/benchmark.sh b/ydb/benchmark.sh index 2d256ac338..6340357b4f 100755 --- a/ydb/benchmark.sh +++ b/ydb/benchmark.sh @@ -1,292 +1,8 @@ #!/bin/bash -set -e - -PARAMS_FILE="benchmark_variables.sh" -source $PARAMS_FILE -export YDB_PASSWORD=password -START_DIR=`pwd` - -update_file() { - local raw_input="$1" - local raw_output="$2" - local verbose="${3:-0}" - - expand_path() { - local path="$1" - path="${path/#\~/$HOME}" - - local expanded_path - expanded_path=$(eval echo "$path") - echo "$expanded_path" - } - - local input_file - local output_file - input_file=$(expand_path "$raw_input") - output_file=$(expand_path "$raw_output") - - local output_dir - output_dir=$(dirname "$output_file") - - # Making temporary file - local temp_file - temp_file=$(mktemp) || { - echo "Error while creating temporary file" >&2 - return 7 - } - - cleanup() { - rm -f "$temp_file" - } - trap cleanup EXIT - - cp "$input_file" "$temp_file" || { - echo "Error while copying input file to temporary file" >&2 - return 8 - } - - local env_vars - env_vars=$(env | cut -d= -f1) - - for var in $env_vars; do - local value - value="${!var}" - - if grep -q "\$$var" "$temp_file"; then - local escaped_value - escaped_value=$(echo "$value" | sed -e 's/[\/&]/\\&/g') - - sed -i "s/\$$var/$escaped_value/g" "$temp_file" || { - echo "Error while substituting variable \$$var." >&2 - return 9 - } - fi - done - - cp "$temp_file" "$output_file" || { - return 10 - } - - return 0 -} - -sudo apt-get update -y -sudo apt-get install -y software-properties-common -sudo add-apt-repository --yes --update ppa:ansible/ansible -sudo apt-get install -y ansible-core - -cd $START_DIR -if [ ! -d "ydb" ]; then - git clone https://github.com/ydb-platform/ydb.git -fi - -cd $START_DIR/ydb/ydb/apps/ydbd/ -git checkout stable-25-1-analytics || { echo "Error while checking branch out"; exit 1; } -$START_DIR/ydb/ya make -j8 --build=release || { echo "Build error"; exit 1; } - -cd $START_DIR/ydb/ydb/apps/ydb/ -$START_DIR/ydb/ya make -j8 --build=release || { echo "Build error"; exit 1; } - -cd $START_DIR/ydb/ydb/apps/dstool/ -$START_DIR/ydb/ya make -j8 --build=release || { echo "Build error"; exit 1; } - -cd $START_DIR -if [ ! 
-d "ydb-ansible-examples" ]; then - git clone https://github.com/ydb-platform/ydb-ansible-examples.git -fi - -cd $START_DIR/ydb-ansible-examples -ansible-galaxy install -r requirements.yaml -cd $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc - - -rm -f $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydbd -rm -f $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb -rm -f $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb-dstool - -ln -f $START_DIR/ydb/ydb/apps/ydbd/ydbd $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ -ln -f $START_DIR/ydb/ydb/apps/ydb/ydb $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ -ln -f $START_DIR/ydb/ydb/apps/dstool/ydb-dstool $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ - -cd $START_DIR - -update_file "ydb-cluster-setup/50-inventory.yaml" "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/inventory/50-inventory.yaml" -update_file "ydb-cluster-setup/config.yaml" "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/config.yaml" -update_file "ydb-cluster-setup/ydb-ca-nodes.txt" "$START_DIR/ydb-ansible-examples/TLS/ydb-ca-nodes.txt" - -hosts=( "$host1$host_suffix" "$host2$host_suffix" "$host3$host_suffix" ) -disks=( "$disk1" "$disk2" "$disk3" ) - -replace_string_in_file() { - local file_path="$1" - local search_string="$2" - local replace_string="$3" - local temp_file - - if [[ ! -f "$file_path" ]]; then - echo "Error: File $file_path does not exist" >&2 - return 1 - fi - - temp_file=$(mktemp) - - sed "s|$search_string|$replace_string|g" "$file_path" > "$temp_file" - - if [ $? -ne 0 ]; then - echo "Error: Replacement operation failed" - rm -f "$temp_file" - return 4 - fi - - mv "$temp_file" "$file_path" - - return 0 -} - -ssh_execute() { - declare -n local_hosts="$1" - local command="$2" - - for host in "${local_hosts[@]}"; do - - echo "Executing on $host: $command" >&2 - echo "$command" | ssh -l $ydb_host_user_name -o BatchMode=yes -o StrictHostKeyChecking=no "$host" "bash -s" - local exit_code=$? - - if [ $exit_code -ne 0 ]; then - echo "Command failed with exit code: $exit_code" >&2 - fi - done - - return 0 -} - -copy_file_to_multiple_hosts() { - local file_to_copy=$1 - shift - - local hosts=("$@") - local pids=() - - for host in "${hosts[@]}"; do - { - echo "Copying file '$file_to_copy' to $host" - scp "$file_to_copy" $ydb_host_user_name@$host:/home/$ydb_host_user_name - } & - pids+=($!) 
- done - - # Waiting for all background processes to complete - for pid in "${pids[@]}"; do - wait $pid - done - - echo "Сopy process is complete" -} - -# Cleaning up YDB services on remote hosts -remove_ydb_services() { - local host=$1 - - # Connecting to server - ssh -o StrictHostKeyChecking=no -l $ydb_host_user_name -o BatchMode=yes "$host" ' - services=$(sudo systemctl list-units --type=service --all| grep "ydb" | awk "{print \$1}") - - if [ -z "$services" ]; then - echo "YDB are not found" - else - for service in $services; do - sudo systemctl stop "$service" - sudo systemctl disable "$service" - - unit_path=$(systemctl show -p FragmentPath "$service" | cut -d= -f2) - - if [ -n "$unit_path" ] && [ -f "$unit_path" ]; then - sudo rm -f "$unit_path" - - service_name=$(basename "$unit_path") - if [ -f "/etc/systemd/system/$service_name" ]; then - sudo rm -f "/etc/systemd/system/$service_name" - fi - - if [ -L "/etc/systemd/system/multi-user.target.wants/$service_name" ]; then - sudo rm -f "/etc/systemd/system/multi-user.target.wants/$service_name" - fi - fi - done - - sudo systemctl daemon-reload - sudo systemctl reset-failed - fi - ' - - echo "All operation on $host are finished" -} - -echo "Beginning the process of removing YDB services on all hosts..." - -for host in "${hosts[@]}"; do - remove_ydb_services "$host" -done - -cd $START_DIR/ydb-ansible-examples/TLS -find . -maxdepth 1 -type d -not -path "." -exec rm -rf {} \; -if [ -f "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/TLS" ]; then - cd $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/TLS - rm -rf * -fi - -cd $START_DIR/ydb-ansible-examples/TLS -./ydb-ca-update.sh -cd CA/certs -newest_dir=$(find . -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" | sort -n | tail -n 1 | cut -d' ' -f2-) - -cd $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/inventory/ -replace_string_in_file "50-inventory.yaml" "" "$START_DIR/ydb-ansible-examples/TLS/CA/certs/$newest_dir" -replace_string_in_file "50-inventory.yaml" "$ydb_host_user_name" "$ydb_host_user_name" - -ssh_execute hosts "sudo mkdir -p /opt/ydb/bin && sudo chmod 755 /opt/ydb/bin" - -cd $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ -copy_file_to_multiple_hosts "ydbd" $host1$host_suffix $host2$host_suffix $host3$host_suffix - -obliterate_disks() { - declare -n local_hostsd="$1" - declare -n local_disks="$2" - - for disk in "${local_disks[@]}"; do - ssh_execute local_hostsd "sudo /home/$ydb_host_user_name/ydbd admin blobstorage disk obliterate $disk" - done -} - -obliterate_disks hosts disks - -ssh_execute hosts "rm -f /home/$ydb_host_user_name/ydbd" -ssh_execute hosts "sudo rm -rf /opt/ydb/" - -cd $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/ -ansible-playbook ydb_platform.ydb.initial_setup --skip-tags checks - -cd $START_DIR - -if [ ! -f "hits.csv.gz" ]; then - wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/hits.csv.gz -fi - -if [ ! -f "hits.csv" ]; then - echo "Unpacking hits.csv.gz" - gzip -d -f -k hits.csv.gz - echo "Done" -fi - -# if [ -f "$HOME/.config/ydb/import_progress/hits.csv" ]; then -# rm "$HOME/.config/ydb/import_progress/hits.csv" -# fi - -cert_dir=$(find $START_DIR/ydb-ansible-examples/TLS/CA/certs -maxdepth 1 -type d -not -path "." 
-printf "%T@ %p\n" | sort -n | tail -n 1 | cut -d' ' -f2-) -echo $YDB_PASSWORD|$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb -e grpcs://$host1$host_suffix:2135 -d /Root/database --ca-file $cert_dir/ca.crt --user root workload clickbench init --datetime --store column -echo -n "Load time: " -command time -f '%e' echo $YDB_PASSWORD|$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb -e grpcs://$host1$host_suffix:2135 -d /Root/database --ca-file $cert_dir/ca.crt --user root import file csv hits.csv -p clickbench/hits - -cd $START_DIR -./run.sh +# Thin shim — actual flow is in lib/benchmark-common.sh. +# YDB downloads CSV directly inside ./load (the ydb CLI imports from CSV). +export BENCH_DOWNLOAD_SCRIPT="" +# YDB has no benefit from server restart — it's a multi-node distributed +# cluster managed via ansible/systemd; stopping between queries is impractical. +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/ydb/check b/ydb/check new file mode 100755 index 0000000000..196afc0268 --- /dev/null +++ b/ydb/check @@ -0,0 +1,13 @@ +#!/bin/bash +set -e + +source benchmark_variables.sh +START_DIR=$(pwd) +export YDB_PASSWORD=password + +cert_dir=$(find "$START_DIR/ydb-ansible-examples/TLS/CA/certs" -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" \ + | sort -n | tail -n 1 | cut -d' ' -f2-) + +echo "$YDB_PASSWORD" | "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb" \ + -e "grpcs://$host1$host_suffix:2135" -d /Root/database \ + --ca-file "$cert_dir/ca.crt" --user root yql -s 'SELECT 1' >/dev/null 2>&1 diff --git a/ydb/data-size b/ydb/data-size new file mode 100755 index 0000000000..a87f3b59d8 --- /dev/null +++ b/ydb/data-size @@ -0,0 +1,20 @@ +#!/bin/bash +# YDB data is on raw block devices on the cluster nodes; there's no standard +# du-based answer. We approximate via SQL. +set -eu + +source benchmark_variables.sh +START_DIR=$(pwd) +export YDB_PASSWORD=password + +cert_dir=$(find "$START_DIR/ydb-ansible-examples/TLS/CA/certs" -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" \ + | sort -n | tail -n 1 | cut -d' ' -f2-) + +YDB_BIN="$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb" + +echo "$YDB_PASSWORD" | "$YDB_BIN" \ + -e "grpcs://$host1$host_suffix:2135" -d /Root/database \ + --ca-file "$cert_dir/ca.crt" --user root \ + yql -s "SELECT SUM(DataSize) FROM \`/Root/database/.sys/partition_stats\` WHERE Path LIKE '%clickbench/hits%';" \ + 2>/dev/null \ + | grep -oE '[0-9]+' | tail -n1 diff --git a/ydb/install b/ydb/install new file mode 100755 index 0000000000..684496b68f --- /dev/null +++ b/ydb/install @@ -0,0 +1,16 @@ +#!/bin/bash +# YDB install — builds ydbd/ydb/ydb-dstool from source and provisions a +# 3-node mirror-3-dc cluster via ansible. The original benchmark.sh contains +# all the setup logic; we delegate to it via an env flag that stops short of +# loading data and running queries. +# +# This is a best-effort port: the cluster setup is host-specific (it expects +# three reachable peers defined in benchmark_variables.sh) and is not +# idempotent in any meaningful sense. Re-running may re-bootstrap state. +set -e + +# The original script does install + load + run all in one. We only execute +# the install phases here; ./load handles importing data, ./query runs SQL. +# To avoid duplicating that long script we keep the original logic in a +# helper file. 
+exec ./install-impl.sh diff --git a/ydb/install-impl.sh b/ydb/install-impl.sh new file mode 100755 index 0000000000..537c3e82d1 --- /dev/null +++ b/ydb/install-impl.sh @@ -0,0 +1,146 @@ +#!/bin/bash +# YDB install: build binaries, provision the 3-node cluster, but do not load +# data or run queries (those are split into ./load and ./query). +set -e + +PARAMS_FILE="benchmark_variables.sh" +source "$PARAMS_FILE" +export YDB_PASSWORD=password +START_DIR=$(pwd) + +update_file() { + local raw_input="$1" + local raw_output="$2" + + expand_path() { + local path="$1" + path="${path/#\~/$HOME}" + eval echo "$path" + } + + local input_file output_file + input_file=$(expand_path "$raw_input") + output_file=$(expand_path "$raw_output") + + local temp_file + temp_file=$(mktemp) + trap 'rm -f "$temp_file"' EXIT + + cp "$input_file" "$temp_file" + + local env_vars + env_vars=$(env | cut -d= -f1) + for var in $env_vars; do + local value="${!var}" + if grep -q "\$$var" "$temp_file"; then + local escaped_value + escaped_value=$(echo "$value" | sed -e 's/[\/&]/\\&/g') + sed -i "s/\$$var/$escaped_value/g" "$temp_file" + fi + done + + cp "$temp_file" "$output_file" +} + +sudo apt-get update -y +sudo apt-get install -y software-properties-common +sudo add-apt-repository --yes --update ppa:ansible/ansible +sudo apt-get install -y ansible-core + +cd "$START_DIR" +[ -d "ydb" ] || git clone https://github.com/ydb-platform/ydb.git + +cd "$START_DIR/ydb/ydb/apps/ydbd/" +git checkout stable-25-1-analytics +"$START_DIR/ydb/ya" make -j8 --build=release + +cd "$START_DIR/ydb/ydb/apps/ydb/" +"$START_DIR/ydb/ya" make -j8 --build=release + +cd "$START_DIR/ydb/ydb/apps/dstool/" +"$START_DIR/ydb/ya" make -j8 --build=release + +cd "$START_DIR" +[ -d "ydb-ansible-examples" ] || git clone https://github.com/ydb-platform/ydb-ansible-examples.git + +cd "$START_DIR/ydb-ansible-examples" +ansible-galaxy install -r requirements.yaml + +cd "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc" +rm -f files/ydbd files/ydb files/ydb-dstool +ln -f "$START_DIR/ydb/ydb/apps/ydbd/ydbd" files/ +ln -f "$START_DIR/ydb/ydb/apps/ydb/ydb" files/ +ln -f "$START_DIR/ydb/ydb/apps/dstool/ydb-dstool" files/ + +cd "$START_DIR" +update_file "ydb-cluster-setup/50-inventory.yaml" "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/inventory/50-inventory.yaml" +update_file "ydb-cluster-setup/config.yaml" "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/config.yaml" +update_file "ydb-cluster-setup/ydb-ca-nodes.txt" "$START_DIR/ydb-ansible-examples/TLS/ydb-ca-nodes.txt" + +hosts=( "$host1$host_suffix" "$host2$host_suffix" "$host3$host_suffix" ) +disks=( "$disk1" "$disk2" "$disk3" ) + +ssh_execute() { + declare -n local_hosts="$1" + local command="$2" + for host in "${local_hosts[@]}"; do + echo "$command" | ssh -l "$ydb_host_user_name" -o BatchMode=yes -o StrictHostKeyChecking=no "$host" "bash -s" || true + done +} + +copy_file_to_multiple_hosts() { + local file_to_copy=$1; shift + local hosts=("$@") + for host in "${hosts[@]}"; do + scp "$file_to_copy" "$ydb_host_user_name@$host:/home/$ydb_host_user_name" & + done + wait +} + +remove_ydb_services() { + local host=$1 + ssh -o StrictHostKeyChecking=no -l "$ydb_host_user_name" -o BatchMode=yes "$host" ' + services=$(sudo systemctl list-units --type=service --all | grep "ydb" | awk "{print \$1}") + if [ -n "$services" ]; then + for service in $services; do + sudo systemctl stop "$service" || true + sudo systemctl disable "$service" || true + unit_path=$(systemctl show -p FragmentPath "$service" | cut 
-d= -f2) + if [ -n "$unit_path" ] && [ -f "$unit_path" ]; then + sudo rm -f "$unit_path" + fi + done + sudo systemctl daemon-reload + sudo systemctl reset-failed + fi + ' || true +} + +for host in "${hosts[@]}"; do remove_ydb_services "$host"; done + +cd "$START_DIR/ydb-ansible-examples/TLS" +find . -maxdepth 1 -type d -not -path "." -exec rm -rf {} \; +[ -d "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/TLS" ] \ + && rm -rf "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/TLS"/* + +./ydb-ca-update.sh +cd CA/certs +newest_dir=$(find . -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" | sort -n | tail -n 1 | cut -d' ' -f2-) + +cd "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/inventory/" +sed -i "s||$START_DIR/ydb-ansible-examples/TLS/CA/certs/$newest_dir|g" 50-inventory.yaml + +ssh_execute hosts "sudo mkdir -p /opt/ydb/bin && sudo chmod 755 /opt/ydb/bin" + +cd "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/" +copy_file_to_multiple_hosts "ydbd" "$host1$host_suffix" "$host2$host_suffix" "$host3$host_suffix" + +for disk in "${disks[@]}"; do + ssh_execute hosts "sudo /home/$ydb_host_user_name/ydbd admin blobstorage disk obliterate $disk" +done + +ssh_execute hosts "rm -f /home/$ydb_host_user_name/ydbd" +ssh_execute hosts "sudo rm -rf /opt/ydb/" + +cd "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/" +ansible-playbook ydb_platform.ydb.initial_setup --skip-tags checks diff --git a/ydb/load b/ydb/load new file mode 100755 index 0000000000..c220c2efa5 --- /dev/null +++ b/ydb/load @@ -0,0 +1,29 @@ +#!/bin/bash +set -eu + +source benchmark_variables.sh +START_DIR=$(pwd) +export YDB_PASSWORD=password + +cert_dir=$(find "$START_DIR/ydb-ansible-examples/TLS/CA/certs" -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" \ + | sort -n | tail -n 1 | cut -d' ' -f2-) + +YDB_BIN="$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb" +COMMON_ARGS=(-e "grpcs://$host1$host_suffix:2135" -d /Root/database --ca-file "$cert_dir/ca.crt" --user root) + +if [ ! -f "hits.csv" ]; then + if [ ! -f "hits.csv.gz" ]; then + wget --continue --progress=dot:giga \ + 'https://datasets.clickhouse.com/hits_compatible/hits.csv.gz' + fi + gzip -d -f -k hits.csv.gz +fi + +echo "$YDB_PASSWORD" | "$YDB_BIN" "${COMMON_ARGS[@]}" \ + workload clickbench init --datetime --store column + +echo "$YDB_PASSWORD" | "$YDB_BIN" "${COMMON_ARGS[@]}" \ + import file csv hits.csv -p clickbench/hits + +rm -f hits.csv hits.csv.gz +sync diff --git a/ydb/query b/ydb/query new file mode 100755 index 0000000000..c193419b01 --- /dev/null +++ b/ydb/query @@ -0,0 +1,38 @@ +#!/bin/bash +# Reads a SQL/YQL query from stdin, runs it via the ydb CLI's yql subcommand. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# `--stats basic` "total_duration_us:" output). +# Exit non-zero on error. +set -e + +source benchmark_variables.sh +START_DIR=$(pwd) +export YDB_PASSWORD=password + +cert_dir=$(find "$START_DIR/ydb-ansible-examples/TLS/CA/certs" -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" \ + | sort -n | tail -n 1 | cut -d' ' -f2-) + +YDB_BIN="$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb" + +query=$(cat) + +raw=$(echo "$YDB_PASSWORD" | "$YDB_BIN" \ + -e "grpcs://$host1$host_suffix:2135" -d /Root/database \ + --ca-file "$cert_dir/ca.crt" --user root \ + yql -s "$query" --stats basic 2>&1) && exit_code=0 || exit_code=$? 
+ +if [ "$exit_code" -ne 0 ]; then + printf '%s\n' "$raw" >&2 + exit "$exit_code" +fi + +printf '%s\n' "$raw" + +us=$(printf '%s\n' "$raw" | grep -oP 'total_duration_us:\s*\K[0-9]+' | tail -n1) +if [ -z "$us" ]; then + echo "no total_duration_us in ydb output" >&2 + exit 1 +fi + +awk -v u="$us" 'BEGIN { printf "%.6f\n", u / 1000000 }' >&2 diff --git a/ydb/run.sh b/ydb/run.sh deleted file mode 100755 index 160b5ca90e..0000000000 --- a/ydb/run.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -TRIES=3 -set -e -source benchmark_variables.sh - -YDB_PASSWORD=password - -cert_dir=$(find ydb-ansible-examples/TLS/CA/certs -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" | sort -n | tail -n 1 | cut -d' ' -f2-) - -# YDB uses raw block devices, that means there is not need to drop filesystem caches -# sync -# echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - -cat queries.sql | while read -r query; do - echo -n "[" - - for i in $(seq 1 $TRIES); do - result=$(echo $YDB_PASSWORD | ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb -e grpcs://$host1$host_suffix:2135 -d /Root/database --ca-file $cert_dir/ca.crt --user root yql -s "$query" --stats basic 2>/dev/null) - - # Extracting total_duration_us value - if [[ "$result" =~ total_duration_us:[[:space:]]*([0-9]+) ]]; then - duration_us=${BASH_REMATCH[1]} - # Convert microseconds to seconds - duration_sec=$(awk "BEGIN {printf \"%.6f\", $duration_us/1000000}") - echo -n "$duration_sec" - - if [ $i -ne $(($TRIES)) ]; then - echo -n "," - fi - else - exit -1 - fi - done - echo "]," -done diff --git a/ydb/start b/ydb/start new file mode 100755 index 0000000000..7c639c81e6 --- /dev/null +++ b/ydb/start @@ -0,0 +1,23 @@ +#!/bin/bash +# YDB cluster lifecycle is managed by ansible/systemd on remote nodes. +# After ./install the cluster is already running; we just verify connectivity. +set -e + +source benchmark_variables.sh +START_DIR=$(pwd) +export YDB_PASSWORD=password + +cert_dir=$(find "$START_DIR/ydb-ansible-examples/TLS/CA/certs" -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" 2>/dev/null \ + | sort -n | tail -n 1 | cut -d' ' -f2-) + +# Idempotent: if cluster responds, exit success. +if echo "$YDB_PASSWORD" | "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb" \ + -e "grpcs://$host1$host_suffix:2135" -d /Root/database \ + --ca-file "$cert_dir/ca.crt" --user root yql -s 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +# Cluster is provisioned via ansible during install. Re-running the playbook +# is the most reliable way to bring all nodes up. +cd "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/" +ansible-playbook ydb_platform.ydb.initial_setup --skip-tags checks diff --git a/ydb/stop b/ydb/stop new file mode 100755 index 0000000000..5337507c82 --- /dev/null +++ b/ydb/stop @@ -0,0 +1,10 @@ +#!/bin/bash +# Best-effort stop of the YDB systemd services on each node. 
+source benchmark_variables.sh +hosts=( "$host1$host_suffix" "$host2$host_suffix" "$host3$host_suffix" ) + +for host in "${hosts[@]}"; do + ssh -o StrictHostKeyChecking=no -l "$ydb_host_user_name" -o BatchMode=yes "$host" \ + "sudo systemctl list-units --type=service --all | grep ydb | awk '{print \$1}' | xargs -r sudo systemctl stop" \ + 2>/dev/null || true +done diff --git a/yugabytedb/benchmark.sh b/yugabytedb/benchmark.sh index 56766d5846..531bd65038 100755 --- a/yugabytedb/benchmark.sh +++ b/yugabytedb/benchmark.sh @@ -1,41 +1,5 @@ #!/bin/bash - -YDBVERSION=2.25.2.0 -YDBBUILD=b359 - -sudo apt-get update -y - -# Needed dependencies -sudo apt-get install -y python3 locales -sudo locale-gen en_US.UTF-8 -# Should now include en_US.utf8 -locale -a - -wget --continue --progress=dot:giga https://software.yugabyte.com/releases/$YDBVERSION/yugabyte-$YDBVERSION-$YDBBUILD-linux-x86_64.tar.gz -tar xvfz yugabyte-$YDBVERSION-$YDBBUILD-linux-x86_64.tar.gz -mv ./yugabyte-$YDBVERSION ./yugabyte -# Should print "INSTALL PASSED" -./yugabyte/bin/post_install.sh - -./yugabyte/bin/yugabyted start --advertise_address 127.0.0.1 --ui false --background true - -../download-hits-tsv - -./yugabyte/bin/ysqlsh -U yugabyte -c "CREATE DATABASE test;" -./yugabyte/bin/ysqlsh -U yugabyte -c "ALTER DATABASE test SET temp_file_limit=-1;" -./yugabyte/bin/ysqlsh -U yugabyte -d test -c "DROP DATABASE IF EXISTS yugabyte;" -./yugabyte/bin/ysqlsh -U yugabyte -d test -t < create.sql - -# takes around ~78 minutes on AWS EC2 c6a.4xlarge (500GB gp2) -echo -n "Load time: " -command time -f '%e' ./load.sh - -./run.sh 2>&1 | tee log.txt - -# 76977854454 bytes -echo -n "Data size: " -./yugabyte/bin/ysqlsh -U yugabyte -d test -q -c "SELECT pg_total_relation_size('public.hits') AS TOTAL_TABLE_SIZE_IN_BYTES;" - -grep -oP 'Time: \d+\.\d+ ms|ysqlsh: error' log.txt | - sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*ysqlsh: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/yugabytedb/check b/yugabytedb/check new file mode 100755 index 0000000000..5329cf2306 --- /dev/null +++ b/yugabytedb/check @@ -0,0 +1,3 @@ +#!/bin/bash +set -e +./yugabyte/bin/ysqlsh -U yugabyte -c 'SELECT 1' >/dev/null diff --git a/yugabytedb/data-size b/yugabytedb/data-size new file mode 100755 index 0000000000..fa657e9150 --- /dev/null +++ b/yugabytedb/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +./yugabyte/bin/ysqlsh -U yugabyte -d test -tAq \ + -c "SELECT pg_total_relation_size('public.hits');" diff --git a/yugabytedb/install b/yugabytedb/install new file mode 100755 index 0000000000..7db84d52ba --- /dev/null +++ b/yugabytedb/install @@ -0,0 +1,18 @@ +#!/bin/bash +set -eu + +YDBVERSION=${YDBVERSION:-2.25.2.0} +YDBBUILD=${YDBBUILD:-b359} + +sudo apt-get update -y +# yugabyted's python tooling expects a UTF-8 locale. +sudo apt-get install -y python3 locales +sudo locale-gen en_US.UTF-8 + +if [ ! 
-d ./yugabyte ]; then + wget --continue --progress=dot:giga \ + "https://software.yugabyte.com/releases/$YDBVERSION/yugabyte-$YDBVERSION-$YDBBUILD-linux-x86_64.tar.gz" + tar xfz "yugabyte-$YDBVERSION-$YDBBUILD-linux-x86_64.tar.gz" + mv "./yugabyte-$YDBVERSION" ./yugabyte + ./yugabyte/bin/post_install.sh +fi diff --git a/yugabytedb/load b/yugabytedb/load new file mode 100755 index 0000000000..9324c8d44b --- /dev/null +++ b/yugabytedb/load @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +# Create the test database and apply the schema. Idempotent enough: +# subsequent runs see the database already exists and skip. +./yugabyte/bin/ysqlsh -U yugabyte -c "CREATE DATABASE test;" 2>/dev/null || true +./yugabyte/bin/ysqlsh -U yugabyte -c "ALTER DATABASE test SET temp_file_limit=-1;" +./yugabyte/bin/ysqlsh -U yugabyte -d test -c "DROP TABLE IF EXISTS hits;" +./yugabyte/bin/ysqlsh -U yugabyte -d test -v ON_ERROR_STOP=1 -t < create.sql + +# COPY from the local hits.tsv. yugabytedb's COPY is single-stream; +# expect ~78 minutes on c6a.4xlarge per the original benchmark notes. +./yugabyte/bin/ysqlsh -U yugabyte -d test -v ON_ERROR_STOP=1 <<'EOF' +\copy hits FROM 'hits.tsv'; +EOF + +./yugabyte/bin/ysqlsh -U yugabyte -d test -v ON_ERROR_STOP=1 -t -c 'ANALYZE hits;' + +rm -f hits.tsv +sync diff --git a/yugabytedb/load.sh b/yugabytedb/load.sh deleted file mode 100755 index d688a38f5a..0000000000 --- a/yugabytedb/load.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -set -eu - -./yugabyte/bin/ysqlsh -U yugabyte -d test <<'EOF' -\copy hits FROM 'hits.tsv'; -EOF - -./yugabyte/bin/ysqlsh -U yugabyte -d test -t -c 'ANALYZE hits;' diff --git a/yugabytedb/query b/yugabytedb/query new file mode 100755 index 0000000000..27aef7038e --- /dev/null +++ b/yugabytedb/query @@ -0,0 +1,25 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via ysqlsh against yugabytedb. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(./yugabyte/bin/ysqlsh -U yugabyte -d test -v ON_ERROR_STOP=1 -t \ + -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^(ERROR|FATAL):|ysqlsh: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in ysqlsh output" >&2 + exit 1 +fi +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/yugabytedb/run.sh b/yugabytedb/run.sh deleted file mode 100755 index 2e65c89a79..0000000000 --- a/yugabytedb/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | ./yugabyte/bin/ysqlsh -U yugabyte -d test -t 2>&1 | grep -P 'Time|ysqlsh: error' | tail -n1 -done diff --git a/yugabytedb/start b/yugabytedb/start new file mode 100755 index 0000000000..9d358fdb59 --- /dev/null +++ b/yugabytedb/start @@ -0,0 +1,17 @@ +#!/bin/bash +set -eu + +# Idempotent: if ysqlsh can connect, nothing to do. 
+if ./yugabyte/bin/ysqlsh -U yugabyte -c 'SELECT 1' >/dev/null 2>&1; then
+    exit 0
+fi
+
+./yugabyte/bin/yugabyted start \
+    --advertise_address 127.0.0.1 \
+    --ui false \
+    --background true
+
+for _ in $(seq 1 120); do
+    ./yugabyte/bin/ysqlsh -U yugabyte -c 'SELECT 1' >/dev/null 2>&1 && exit 0
+    sleep 1
+done; echo "error: yugabyte did not become ready within 120s" >&2; exit 1
diff --git a/yugabytedb/stop b/yugabytedb/stop
new file mode 100755
index 0000000000..ff812f6b4d
--- /dev/null
+++ b/yugabytedb/stop
@@ -0,0 +1,2 @@
+#!/bin/bash
+./yugabyte/bin/yugabyted stop 2>/dev/null || true
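
Note on the new per-system query wrappers: both ydb/query and yugabytedb/query follow the same contract (SQL on stdin, result on stdout, runtime in fractional seconds as the last stderr line, non-zero exit on error). The sketch below shows one way a harness could drive such a wrapper to produce the usual ClickBench "[t1,t2,t3]," timing rows. It is only an illustration under assumptions: the real ../lib/benchmark-common.sh is not part of this diff and may work differently; TRIES=3 and queries.sql are carried over from the run.sh scripts removed above, and query_stderr.log is a hypothetical scratch file.

#!/bin/bash
# Driver sketch for the ./query wrapper contract (assumption-labeled, not the
# actual lib/benchmark-common.sh).
TRIES=3

while read -r query; do
    printf '['
    for i in $(seq 1 "$TRIES"); do
        # Wrapper contract: result on stdout (discarded here), timing as the
        # last line on stderr, non-zero exit on error.
        if echo "$query" | ./query >/dev/null 2>query_stderr.log; then
            printf '%s' "$(tail -n 1 query_stderr.log)"
        else
            printf 'null'
        fi
        if [ "$i" -lt "$TRIES" ]; then printf ','; fi
    done
    printf '],\n'
done < queries.sql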