diff --git a/datafusion-partitioned/run.sh b/datafusion-partitioned/run.sh
index 2e1c36109e..2e9b92c2aa 100755
--- a/datafusion-partitioned/run.sh
+++ b/datafusion-partitioned/run.sh
@@ -2,20 +2,45 @@
 
 TRIES=3
 QUERY_NUM=1
+TMP_DIR=$(mktemp -d)
+trap 'rm -rf "${TMP_DIR}"' EXIT
+
 echo $1
-cat queries.sql | while read -r query; do
+while read -r query; do
+    [ -z "$query" ] && continue
+
     sync
     echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
 
-    echo "$query" > /tmp/query.sql
+    QUERY_FILE="${TMP_DIR}/query-${QUERY_NUM}.sql"
+    MARKER="clickbench_query_${QUERY_NUM}_start"
+    printf "SELECT '${MARKER}';\n\n" > "${QUERY_FILE}"
+    for i in $(seq 1 $TRIES); do
+        printf '%s\n\n' "$query" >> "${QUERY_FILE}"
+    done
+
+    # Keep all tries in one process so DataFusion process-local caches stay hot.
+    # Use a marker query to ignore setup timings from create.sql.
+    ELAPSED_FILE="${TMP_DIR}/elapsed-${QUERY_NUM}.txt"
+    OUTPUT_FILE="${TMP_DIR}/output-${QUERY_NUM}.txt"
+    datafusion-cli -f create.sql "${QUERY_FILE}" > "${OUTPUT_FILE}" 2>&1
+    awk -v marker="$MARKER" '
+        index($0, marker) { seen = 1 }
+        seen && /Elapsed/ {
+            if (!skipped_marker_elapsed) {
+                skipped_marker_elapsed = 1
+                next
+            }
+            print $2
+        }
+    ' "${OUTPUT_FILE}" > "${ELAPSED_FILE}"
+    if [ "$(wc -l < "${ELAPSED_FILE}")" -lt "$TRIES" ]; then
+        grep -v "Elapsed" "${OUTPUT_FILE}" >&2
+    fi
 
     echo -n "["
     for i in $(seq 1 $TRIES); do
-        # 1. there will be two query result, one for creating table another for executing the select statement
-        # 2. each query contains a "Query took xxx seconds", we just grep these 2 lines
-        # 3. use sed to take the second line
-        # 4. use awk to take the number we want
-        RES=$(datafusion-cli -f create.sql /tmp/query.sql 2>&1 | grep "Elapsed" |tail -1| awk '{ print $2 }')
+        RES=$(awk -v line="$i" 'NR == line { print; exit }' "${ELAPSED_FILE}")
         [[ $RES != "" ]] && \
             echo -n "$RES" || \
             echo -n "null"
@@ -25,4 +50,4 @@ cat queries.sql | while read -r query; do
     echo "],"
 
     QUERY_NUM=$((QUERY_NUM + 1))
-done
+done < queries.sql
diff --git a/datafusion/run.sh b/datafusion/run.sh
index cd1059ac31..2e9b92c2aa 100755
--- a/datafusion/run.sh
+++ b/datafusion/run.sh
@@ -2,20 +2,45 @@
 
 TRIES=3
 QUERY_NUM=1
+TMP_DIR=$(mktemp -d)
+trap 'rm -rf "${TMP_DIR}"' EXIT
+
 echo $1
-cat queries.sql | while read -r query; do
+while read -r query; do
+    [ -z "$query" ] && continue
+
     sync
     echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
 
-    echo "$query" > /tmp/query.sql
+    QUERY_FILE="${TMP_DIR}/query-${QUERY_NUM}.sql"
+    MARKER="clickbench_query_${QUERY_NUM}_start"
+    printf "SELECT '${MARKER}';\n\n" > "${QUERY_FILE}"
+    for i in $(seq 1 $TRIES); do
+        printf '%s\n\n' "$query" >> "${QUERY_FILE}"
+    done
+
+    # Keep all tries in one process so DataFusion process-local caches stay hot.
+    # Use a marker query to ignore setup timings from create.sql.
+    ELAPSED_FILE="${TMP_DIR}/elapsed-${QUERY_NUM}.txt"
+    OUTPUT_FILE="${TMP_DIR}/output-${QUERY_NUM}.txt"
+    datafusion-cli -f create.sql "${QUERY_FILE}" > "${OUTPUT_FILE}" 2>&1
+    awk -v marker="$MARKER" '
+        index($0, marker) { seen = 1 }
+        seen && /Elapsed/ {
+            if (!skipped_marker_elapsed) {
+                skipped_marker_elapsed = 1
+                next
+            }
+            print $2
+        }
+    ' "${OUTPUT_FILE}" > "${ELAPSED_FILE}"
+    if [ "$(wc -l < "${ELAPSED_FILE}")" -lt "$TRIES" ]; then
+        grep -v "Elapsed" "${OUTPUT_FILE}" >&2
+    fi
 
     echo -n "["
     for i in $(seq 1 $TRIES); do
-        # 1. there will be two query result, one for creating table another for executing the select statement
-        # 2. each query contains a "Query took xxx seconds", we just grep these 2 lines
-        # 3. use sed to take the second line
-        # 4. use awk to take the number we want
-        RES=$(datafusion-cli -f create.sql /tmp/query.sql 2>&1 | grep "Elapsed" |tail -1 | awk '{ print $2 }')
+        RES=$(awk -v line="$i" 'NR == line { print; exit }' "${ELAPSED_FILE}")
         [[ $RES != "" ]] && \
             echo -n "$RES" || \
             echo -n "null"
@@ -25,4 +50,4 @@ cat queries.sql | while read -r query; do
     echo "],"
 
     QUERY_NUM=$((QUERY_NUM + 1))
-done
+done < queries.sql