diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml index 611121a..65cf658 100644 --- a/.github/workflows/package.yml +++ b/.github/workflows/package.yml @@ -45,6 +45,7 @@ jobs: owner: ModelEngine-Group repository: 'DataMate' access-token: ${{ secrets.ACCESS_TOKEN }} + branch: develop/sealed-secrets - name: DataMate Package run: | @@ -57,6 +58,27 @@ jobs: sed -i "s/latest/${{ inputs.version }}/g" helm/datamate/values.yaml sed -i 's#HOME_PAGE_URL: *""#HOME_PAGE_URL: "/data/management"#g' helm/datamate/values.yaml + - name: Sealed-Secrets Helm Chart + run: | + mkdir -p helm/sealed-secrets + helm repo add sealed-secrets https://bitnami-labs.github.io/sealed-secrets + helm repo update + helm pull sealed-secrets/sealed-secrets --version 2.18.6 -d helm/sealed-secrets + + - name: Download kubeseal + run: | + mkdir -p tools/bin + if [ "${{ inputs.aarch }}" = "arm64" ]; then + KUBESEAL_ARCH="arm64" + else + KUBESEAL_ARCH="amd64" + fi + wget -q "https://github.com/bitnami-labs/sealed-secrets/releases/download/v0.27.2/kubeseal-0.27.2-linux-${KUBESEAL_ARCH}.tar.gz" + tar xzf kubeseal-0.27.2-linux-${KUBESEAL_ARCH}.tar.gz kubeseal + mv kubeseal tools/bin/kubeseal + chmod +x tools/bin/kubeseal + rm kubeseal-0.27.2-linux-${KUBESEAL_ARCH}.tar.gz + - name: DeerFlow Package if: inputs.deer-flow == true run: | @@ -95,6 +117,10 @@ jobs: docker pull quay.io/kuberay/operator:v1.4.2 --platform ${{ inputs.aarch }} docker save -o images/datamate/kuberay-operator.tar quay.io/kuberay/operator:v1.4.2 docker rmi quay.io/kuberay/operator:v1.4.2 + # Sealed-secrets controller (Docker v2 manifest for offline compat) + docker pull bitnami/sealed-secrets-controller:0.27.0 --platform ${{ inputs.aarch }} + docker save -o images/datamate/sealed-secrets-controller.tar bitnami/sealed-secrets-controller:0.27.0 + docker rmi bitnami/sealed-secrets-controller:0.27.0 - name: Download DeerFlow Image if: inputs.deer-flow == true @@ -144,4 +170,4 @@ jobs: path: | helm/ images/ - tools/ \ No 
newline at end of file + tools/ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e58e5a2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.env +.idea diff --git a/tools/README-node-isolation.md b/tools/README-node-isolation.md new file mode 100644 index 0000000..f69d844 --- /dev/null +++ b/tools/README-node-isolation.md @@ -0,0 +1,286 @@ +# Node Isolation for DataMate Deployment + +This document describes the node isolation feature for commercial DataMate deployment. + +## Overview + +Node isolation allows you to dedicate specific Kubernetes nodes for DataMate components, ensuring: +- **Resource isolation**: DataMate pods only run on designated nodes +- **Performance guarantee**: Avoid resource competition with other workloads +- **Hardware specialization**: Use specific nodes for GPU/NPU workloads + +## Features + +- Interactive node selection with keyboard navigation (↑/↓ or j/k) +- Automatic label application: `node-role.kubernetes.io/datamate=true` +- Optional taint application: `node-role.kubernetes.io/datamate=true:NoSchedule` +- Helm argument generation for nodeSelector and tolerations +- Cross-platform support (Linux, macOS) + +## Usage + +### During Installation + +The node isolation setup is automatically triggered during `./install.sh`: + +```bash +cd tools +./install.sh -n model-engine +``` + +You will see an interactive prompt: + +``` +Configure dedicated nodes for DataMate deployment? +This will apply labels and taints to selected nodes. + +1. Yes - Configure nodes interactively +2. 
No - Use default scheduling (recommended for development) + +Enter choice [default: 2]: +``` + +**Options:** +- Choose `1` to enter interactive node selection +- Choose `2` or press Enter to skip (recommended for dev/test environments) + +### Interactive Node Selection + +If you choose to configure nodes, you'll see an interactive menu: + +``` +===================================== + DataMate Node Setup +===================================== + +Select nodes for DataMate deployment + + → [ ] node-1 (Ready) [datamate] + [x] node-2 (Ready) + [ ] node-3 (NotReady) + +Navigation: ↑/k: up ↓/j: down space: toggle enter: confirm q: quit + +Selected: 1/3 nodes +``` + +**Controls:** +- **↑/k**: Move up +- **↓/j**: Move down +- **Space**: Toggle selection +- **Enter**: Confirm selection +- **q**: Quit and skip setup + +### Skip Node Setup + +To skip node isolation during installation: + +```bash +./install.sh --skip-node-setup +``` + +### Manual Node Setup + +You can also run node setup independently: + +```bash +cd tools +./node-setup.sh --namespace model-engine +``` + +**Options:** +- `--namespace NAMESPACE`: Target namespace (default: model-engine) +- `--dry-run`: Show what would be done without applying changes +- `--skip-taint`: Only apply labels, skip taints + +### Node Cleanup + +To remove node labels and taints: + +```bash +cd tools +./node-cleanup.sh --namespace model-engine +``` + +**Options:** +- `--namespace NAMESPACE`: Target namespace +- `--dry-run`: Show what would be removed +- `--nodes NODE1,NODE2`: Clean specific nodes (default: auto-detect labeled nodes) +- `--label-key KEY`: Custom label key (default: node-role.kubernetes.io/datamate) + +During uninstallation, node cleanup is automatically triggered unless skipped: + +```bash +./uninstall.sh --skip-node-cleanup +``` + +## How It Works + +### Labels and Taints + +When you select nodes, the script applies: + +**Label:** +```bash +kubectl label node NODE_NAME node-role.kubernetes.io/datamate=true --overwrite +``` + +**Taint (optional):** +```bash 
+kubectl taint node NODE_NAME node-role.kubernetes.io/datamate=true:NoSchedule --overwrite +``` + +### Helm Configuration + +The script generates Helm arguments and saves them to `/tmp/datamate-helm-args.sh`: + +```bash +export HELM_NODE_SELECTOR_ARGS="--set-string global.nodeSelector.node-role\.kubernetes\.io/datamate=true ..." +export HELM_TOLERATIONS_ARGS="--set-string global.tolerations[0].key=node-role.kubernetes.io/datamate ..." +``` + +These arguments are automatically sourced by `install.sh` and applied to all DataMate components: +- backend +- backend-python +- database +- frontend +- gateway +- runtime +- ray-cluster (head, worker, npuGroup, gpuGroup) +- kuberay-operator + +### Taint Effect + +The `NoSchedule` taint effect ensures: +- **Only pods with matching tolerations** can be scheduled on these nodes +- **Regular pods without tolerations** are scheduled elsewhere +- **Existing pods remain running** (NoSchedule only affects new pods) + +## Best Practices + +### Production Deployment + +For production environments: +1. **Dedicate 3+ nodes** for DataMate (HA requirement) +2. **Apply both labels and taints** for strict isolation +3. **Use nodes with sufficient resources** (CPU, memory, storage) +4. **Consider hardware specialization** (GPU/NPU nodes for ML workloads) + +### Development/Testing + +For dev/test environments: +1. **Skip node isolation** (use default scheduling) +2. **Or apply labels only** (skip taints for flexibility) +3. **Single node is acceptable** (no HA requirement) + +### Mixed Workloads + +If you want DataMate to coexist with other workloads: +1. **Apply labels only** (use `--skip-taint`) +2. **DataMate pods are scheduled only onto labeled nodes** (nodeSelector is a hard requirement, not a preference) +3. **Other pods can still run** on these nodes (no taint blocking) + +## Troubleshooting + +### Nodes Not Selected + +If nodes aren't being selected properly: +1. Check node status: `kubectl get nodes` +2. Verify kubectl connectivity: `kubectl cluster-info` +3. 
Ensure you're in a terminal (not piped input) + +### Pods Not Scheduling + +If DataMate pods fail to schedule after node isolation: +1. Check node labels: `kubectl get nodes --show-labels` +2. Check node taints: `kubectl describe node NODE_NAME` +3. Verify tolerations in Helm values +4. Check pod events: `kubectl describe pod POD_NAME -n NAMESPACE` + +### Cleanup Failed + +If cleanup fails to remove labels/taints: +1. Manual removal: + ```bash + kubectl label node NODE_NAME node-role.kubernetes.io/datamate- + kubectl taint node NODE_NAME node-role.kubernetes.io/datamate=true:NoSchedule- + ``` +2. Check for stuck pods: `kubectl get pods -n NAMESPACE` + +## Implementation Details + +### File Structure + +``` +tools/ +├── node-setup.sh # Interactive node selection and configuration +├── node-cleanup.sh # Remove labels and taints +├── install.sh # Modified to call node-setup.sh +├── uninstall.sh # Modified to call node-cleanup.sh +└── README-node-isolation.md # This documentation +``` + +### Integration Points + +**install.sh:** +- Line 309-313: Node setup call before sealed-secrets installation +- Line 275-285: Helm args sourcing in install_datamate() +- Line 384: `--skip-node-setup` flag handler + +**uninstall.sh:** +- Line 63-66: Node cleanup call after Helm uninstall +- Line 117: `--skip-node-cleanup` flag handler + +### Helm Arguments Structure + +**NodeSelector:** +```yaml +global: + nodeSelector: + node-role.kubernetes.io/datamate: "true" +backend: + nodeSelector: + node-role.kubernetes.io/datamate: "true" +# ... (same for all services) +``` + +**Tolerations:** +```yaml +global: + tolerations: + - key: node-role.kubernetes.io/datamate + operator: Equal + value: "true" + effect: NoSchedule +backend: + tolerations: + - key: node-role.kubernetes.io/datamate + operator: Equal + value: "true" + effect: NoSchedule +# ... 
(same for all services) +``` + +## Comparison with Open Source Version + +The commercial version node isolation is **equivalent** to the open source version in DataMate repository: + +| Feature | Open Source | Commercial | +|---------|-------------|------------| +| Interactive selection | ✓ | ✓ | +| Keyboard navigation | ✓ | ✓ | +| Label application | ✓ | ✓ | +| Taint application | ✓ | ✓ | +| Helm args generation | ✓ | ✓ | +| Integration point | Makefile `node-setup` target | install.sh automatic call | +| Cleanup script | ✓ | ✓ | + +**Key difference:** +- **Open source**: Manual invocation via `make node-setup` +- **Commercial**: Automatic invocation during `./install.sh` + +## Related Documentation + +- [DataMate Open Source Node Setup](https://github.com/modelengine-group/datamate/blob/main/scripts/k8s/node-setup.sh) +- [Kubernetes Node Selection](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/) +- [Kubernetes Taints and Tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) \ No newline at end of file diff --git a/tools/generate-sealed-secrets.sh b/tools/generate-sealed-secrets.sh new file mode 100755 index 0000000..5ae70ef --- /dev/null +++ b/tools/generate-sealed-secrets.sh @@ -0,0 +1,206 @@ +#!/bin/bash +### Generate SealedSecret resources using the cluster's sealed-secrets controller. +### +### Usage: +### ./generate-sealed-secrets.sh [--namespace ] [--controller-name ] +### [--skip-label-studio] [--skip-milvus] [--cleanup] +### +### Password sourcing priority: +### 1. Interactive prompt +### 2. 
Auto-generated random value (for JWT_SECRET, MinIO keys, tokens) +### +### Required env vars (entered interactively): +### DB_PASSWORD, CERT_PASS, DOMAIN, HOME_PAGE_URL +### LABEL_STUDIO_PASSWORD, POSTGRE_PASSWORD +### JWT_SECRET, LABEL_STUDIO_USER_TOKEN, MINIO_ACCESS_KEY, MINIO_SECRET_KEY +### (empty = auto-generate where applicable) + +set -e + +NAMESPACE="datamate" +CONTROLLER_NAME="sealed-secrets" +SKIP_LABEL_STUDIO=false +SKIP_MILVUS=false +CLEANUP=false + +WORK_DIR="$(cd "$(dirname "$0")" && pwd)" +KUBESEAL="${WORK_DIR}/bin/kubeseal" +TMP_DIR=$(mktemp -d) +trap 'rm -rf -- "$TMP_DIR"' EXIT # single quotes: expand (quoted) at exit time, safe for paths with spaces + +# ========== Argument Parsing ========== +while [[ "$#" -gt 0 ]]; do + case $1 in + -n|--namespace) NAMESPACE="$2"; shift 2 ;; + --controller-name) CONTROLLER_NAME="$2"; shift 2 ;; + --skip-label-studio) SKIP_LABEL_STUDIO=true; shift ;; + --skip-milvus) SKIP_MILVUS=true; shift ;; # must flip the default (false), otherwise the flag is a no-op + --cleanup) CLEANUP=true; shift ;; + *) echo "Unknown parameter: $1"; exit 1 ;; + esac +done + +# ========== Utility Functions ========== +log_info() { echo -e "\033[32m[INFO]\033[0m $*"; } +log_warn() { echo -e "\033[33m[WARN]\033[0m $*"; } +log_error() { echo -e "\033[31m[ERROR]\033[0m $*"; } + +random_hex() { head -c 32 /dev/urandom 2>/dev/null | xxd -p -c 64 || openssl rand -hex 32; } + +# ========== Check sealed-secrets controller ========== +log_info "Waiting for sealed-secrets controller..." +kubectl wait pod -l app.kubernetes.io/instance="${CONTROLLER_NAME}" \ + -n "${NAMESPACE}" --for=condition=Ready --timeout=120s + +# ========== Check kubeseal ========== +# Test if a binary can actually run on this machine +ensure_runnable() { + local bin="$1" + chmod +x "$bin" 2>/dev/null || true + "$bin" --version >/dev/null 2>&1 && return 0 + return 1 +} + +if [ -f "$KUBESEAL" ] && ! ensure_runnable "$KUBESEAL"; then + log_warn "Bundled kubeseal is not compatible with this platform, trying system kubeseal..." + KUBESEAL="$(command -v kubeseal 2>/dev/null || echo "")" +elif [ ! 
-f "$KUBESEAL" ]; then + KUBESEAL="$(command -v kubeseal 2>/dev/null || echo "")" +fi +if [ -z "$KUBESEAL" ] || ! ensure_runnable "$KUBESEAL"; then + log_error "kubeseal not found at ${WORK_DIR}/bin/kubeseal or in PATH" + exit 1 +fi +log_info "Using kubeseal: $KUBESEAL" + +# ========== Secret Collection ========== +prompt_or_default() { + local var_name="$1" prompt="$2" gen_random="$3" + if [ "$gen_random" = true ]; then + local generated + generated=$(random_hex) + printf -v "$var_name" '%s' "$generated" # assign by name without eval re-parsing the value + log_info "Auto-generated ${var_name}" + return 0 + fi + # Interactive prompt + local is_sensitive=false value + case "$var_name" in + *_PASSWORD|*_SECRET|*_TOKEN|CERT_PASS|DB_PASSWORD|MINIO_*) is_sensitive=true ;; + esac + if [ "$is_sensitive" = true ]; then + read -rsp "Enter ${prompt}: " value + echo "" + else + read -rp "Enter ${prompt}: " value + fi + printf -v "$var_name" '%s' "$value" # store input verbatim: $, quotes, backslashes and backticks stay literal +} + +log_info "Collecting secrets..." + +# DataMate core secrets +prompt_or_default DB_PASSWORD "database password" false +prompt_or_default CERT_PASS "SSL certificate password (enter to skip)" false +prompt_or_default DOMAIN "domain" false +HOME_PAGE_URL="${HOME_PAGE_URL:-/data/management}" +prompt_or_default JWT_SECRET "JWT secret" true + +# Label Studio secrets +if [ "$SKIP_LABEL_STUDIO" = false ]; then + prompt_or_default LABEL_STUDIO_PASSWORD "Label Studio admin password" false + prompt_or_default POSTGRE_PASSWORD "Label Studio PostgreSQL password (same as DB_PASSWORD)" false + if [ -z "$POSTGRE_PASSWORD" ] && [ -n "$DB_PASSWORD" ]; then + POSTGRE_PASSWORD="$DB_PASSWORD" + log_info "Using DB_PASSWORD as POSTGRE_PASSWORD" + fi + prompt_or_default LABEL_STUDIO_USER_TOKEN "Label Studio API token" true +fi + +# Milvus / MinIO secrets +if [ "$SKIP_MILVUS" = false ]; then + prompt_or_default MINIO_ACCESS_KEY "MinIO access key" true + prompt_or_default MINIO_SECRET_KEY "MinIO secret key" true +fi + +# ========== Generate SealedSecret YAML ========== +SEAL_ARGS="--controller-name=${CONTROLLER_NAME} 
--controller-namespace=${NAMESPACE} -o yaml" + +create_sealed_secret() { + local secret_name="$1" namespace="$2" output_file="$3" + shift 3 + local raw_secret="${TMP_DIR}/${secret_name}-raw.yaml" + + # Build raw Secret YAML + cat > "$raw_secret" <> "$raw_secret" + done + + "$KUBESEAL" ${SEAL_ARGS} -f "$raw_secret" > "$output_file" + log_info "Created SealedSecret: ${output_file}" +} + +# Datamate secret +create_sealed_secret "datamate-conf" "${NAMESPACE}" "${TMP_DIR}/datamate-sealed.yaml" \ + "DB_PASSWORD=${DB_PASSWORD}" \ + "CERT_PASS=${CERT_PASS}" \ + "DOMAIN=${DOMAIN}" \ + "HOME_PAGE_URL=${HOME_PAGE_URL}" \ + "JWT_SECRET=${JWT_SECRET}" \ + "LABEL_STUDIO_PASSWORD=${LABEL_STUDIO_PASSWORD}" \ + "LABEL_STUDIO_USER_TOKEN=${LABEL_STUDIO_USER_TOKEN}" + +# Label Studio secret +if [ "$SKIP_LABEL_STUDIO" = false ]; then + create_sealed_secret "label-studio-env" "${NAMESPACE}" "${TMP_DIR}/label-studio-sealed.yaml" \ + "POSTGRE_PASSWORD=${POSTGRE_PASSWORD}" \ + "LABEL_STUDIO_PASSWORD=${LABEL_STUDIO_PASSWORD}" \ + "LABEL_STUDIO_USER_TOKEN=${LABEL_STUDIO_USER_TOKEN}" +fi + +# Milvus/MinIO secret +if [ "$SKIP_MILVUS" = false ]; then + create_sealed_secret "milvus-minio-secret" "${NAMESPACE}" "${TMP_DIR}/milvus-sealed.yaml" \ + "accesskey=${MINIO_ACCESS_KEY}" \ + "secretkey=${MINIO_SECRET_KEY}" +fi + +# ========== Apply SealedSecrets ========== +log_info "Applying SealedSecret resources..." + +for f in "$TMP_DIR"/*-sealed.yaml; do + [ -f "$f" ] || continue + kubectl apply -f "$f" -n "${NAMESPACE}" +done + +# ========== Verify ========== +log_info "Verifying secret decryption..." 
+for secret_name in datamate-conf label-studio-env milvus-minio-secret; do + case "$secret_name" in + label-studio-env) [ "$SKIP_LABEL_STUDIO" = true ] && continue ;; + milvus-minio-secret) [ "$SKIP_MILVUS" = true ] && continue ;; + esac + if kubectl get secret "$secret_name" -n "${NAMESPACE}" > /dev/null 2>&1; then + log_info "✓ Secret ${secret_name} decrypted successfully" + else + log_warn "Secret ${secret_name} not yet available (may need controller restart)" + fi +done + +if [ "$CLEANUP" = true ]; then + rm -f "$TMP_DIR"/*-sealed.yaml "$TMP_DIR"/*-raw.yaml +fi + +log_info "Sealed-secrets generation complete!" diff --git a/tools/install.sh b/tools/install.sh index 57d3ea7..a382f42 100644 --- a/tools/install.sh +++ b/tools/install.sh @@ -20,6 +20,7 @@ ### --skip-load Skip loading images. ### --skip-milvus Skip Milvus installation. ### --skip-push Skip pushing images. +### --skip-node-setup Skip node isolation configuration. ### -h, --help Show this help message. set -e @@ -43,6 +44,7 @@ INSTALL_LABEL_STUDIO=true EXECUTE_HAPROXY=true DATAMATE_JWT_ENABLE=true REAL_IP_MODE=proxy_protocol +SKIP_NODE_SETUP=false # --- 脚本内部变量 --- @@ -234,6 +236,7 @@ function get_cert_pass() { function helm_install() { local release_name="$1" local chart_path="$2" + shift 2 local helm_args=() @@ -242,6 +245,11 @@ function helm_install() { helm_args+=("--namespace" "$NAMESPACE") helm_args+=("--create-namespace") + # Append extra args (e.g., --set key=value) + for arg in "$@"; do + helm_args+=("$arg") + done + log_info "即将执行: helm ${helm_args[*]}" if ! helm "${helm_args[@]}"; then @@ -250,19 +258,102 @@ function helm_install() { fi } +function install_sealed_secrets() { + local chart_tgz + chart_tgz=$(ls "${HELM_PATH}/sealed-secrets/sealed-secrets-"*.tgz 2>/dev/null | head -1) + if [ -z "$chart_tgz" ]; then + log_error "sealed-secrets Helm chart not found in ${HELM_PATH}/sealed-secrets/" + exit 1 + fi + log_info "Installing sealed-secrets controller..." 
+ local registry="${REPO:-docker.io}" + + # Source node isolation args if available + local tolerations_args="" + if [ -f /tmp/datamate-helm-args.sh ]; then + source /tmp/datamate-helm-args.sh + tolerations_args="$HELM_SEALED_SECRETS_TOLERATIONS" + fi + + # Build helm command with tolerations (string expansion, not array) + helm upgrade --install sealed-secrets "$chart_tgz" \ + -n "$NAMESPACE" --create-namespace \ + --set image.registry="${registry}" \ + --set image.tag=0.27.0 \ + --set image.pullPolicy=IfNotPresent \ + --wait --timeout 120s $tolerations_args + log_info "sealed-secrets controller installed." +} + function install_datamate() { - helm_install "datamate" "${HELM_PATH}/datamate" + local jwt_args="" + local node_selector_args="" + local tolerations_args="" + + if [ "$DATAMATE_JWT_ENABLE" == "true" ]; then + jwt_args="--set datamate.jwt.enable=true" + fi + + # Source node isolation args if available + if [ -f /tmp/datamate-helm-args.sh ]; then + source /tmp/datamate-helm-args.sh + node_selector_args="$HELM_NODE_SELECTOR_ARGS" + tolerations_args="$HELM_TOLERATIONS_ARGS" + fi + + # Build helm command with all args (string expansion, not array) + helm_install "datamate" "${HELM_PATH}/datamate" \ + --set public.secrets.create=false \ + --set public.persistentVolumeClaim.accessModes=ReadWriteOnce \ + $jwt_args $node_selector_args $tolerations_args } function install_milvus() { - helm_install "milvus" "${HELM_PATH}/milvus" + local tolerations_args="" + + # Source node isolation args if available + if [ -f /tmp/datamate-helm-args.sh ]; then + source /tmp/datamate-helm-args.sh + tolerations_args="$HELM_MILVUS_TOLERATIONS" + fi + + # Build helm command with tolerations (string expansion, not array) + helm_install "milvus" "${HELM_PATH}/milvus" \ + --set log.persistence.persistentVolumeClaim.accessModes=ReadWriteOnce \ + $tolerations_args } function install_label_studio() { - helm_install "label-studio" "${HELM_PATH}/label-studio" + local tolerations_args="" + + # 
Source node isolation args if available + if [ -f /tmp/datamate-helm-args.sh ]; then + source /tmp/datamate-helm-args.sh + tolerations_args="$HELM_LABEL_STUDIO_TOLERATIONS" + fi + + # Build helm command with tolerations (string expansion, not array) + helm_install "label-studio" "${HELM_PATH}/label-studio" $tolerations_args } function install() { + # 1. Node isolation setup (interactive, optional) + if [ "$SKIP_NODE_SETUP" == "false" ]; then + log_info "Configuring node isolation (optional)..." + bash "${WORK_DIR}/node-setup.sh" --namespace "$NAMESPACE" + fi + + # 2. Install sealed-secrets controller + install_sealed_secrets + + # 3. Generate sealed secrets (from .env or interactive input) + log_info "Generating SealedSecret resources..." + bash "${WORK_DIR}/generate-sealed-secrets.sh" \ + -n "$NAMESPACE" \ + $([ "$INSTALL_MILVUS" = false ] && echo "--skip-milvus") \ + $([ "$INSTALL_LABEL_STUDIO" = false ] && echo "--skip-label-studio") + + # 4. Install DataMate components install_datamate if [ "$INSTALL_MILVUS" == "true" ]; then install_milvus @@ -270,6 +361,9 @@ function install() { if [ "$INSTALL_LABEL_STUDIO" == "true" ]; then install_label_studio fi + + # Cleanup node isolation temp file (all components have sourced it) + rm -f /tmp/datamate-helm-args.sh } function add_nginx_route_to_haproxy() { @@ -322,6 +416,7 @@ function main() { --skip-load) SKIP_LOAD=true; shift ;; --skip-milvus) INSTALL_MILVUS=false; shift ;; --skip-label-studio|--skip-ls) INSTALL_LABEL_STUDIO=false; shift ;; + --skip-node-setup) SKIP_NODE_SETUP=true; shift ;; --package) PACKAGE_PATH="$2"; shift 2 ;; --skip-haproxy) EXECUTE_HAPROXY=false; shift ;; --node-port) NODE_PORT="$2"; shift 2 ;; @@ -334,7 +429,6 @@ function main() { read_value read_storage_value - get_cert_pass load_images "datamate" if [ "$INSTALL_MILVUS" == "true" ]; then load_images "milvus" diff --git a/tools/node-cleanup.sh b/tools/node-cleanup.sh new file mode 100755 index 0000000..c1ed1e5 --- /dev/null +++ 
b/tools/node-cleanup.sh @@ -0,0 +1,166 @@ +#!/bin/bash +# +# DataMate Node Cleanup Script +# Remove labels and taints from nodes that were configured for DataMate deployment +# +# Usage: ./node-cleanup.sh [--dry-run] [--nodes NODE1,NODE2] +# + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Default values +DRY_RUN=false +NAMESPACE="model-engine" +LABEL_KEY="node-role.kubernetes.io/datamate" +LABEL_VALUE="true" +TAINT_EFFECT="NoSchedule" + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --namespace) + NAMESPACE="$2" + shift 2 + ;; + --dry-run) + DRY_RUN=true + shift + ;; + --nodes) + PROVIDED_NODES="$2" + shift 2 + ;; + --label-key) + LABEL_KEY="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +echo -e "${BLUE}=====================================${NC}" +echo -e "${BLUE} DataMate Node Cleanup${NC}" +echo -e "${BLUE}=====================================${NC}" +echo "" + +# Check if kubectl is available +if ! command -v kubectl &> /dev/null; then + echo -e "${RED}Error: kubectl is not installed${NC}" + exit 1 +fi + +# Check if connected to cluster +if ! 
kubectl cluster-info &> /dev/null; then + echo -e "${RED}Error: Cannot connect to Kubernetes cluster${NC}" + exit 1 +fi + +# Determine nodes to clean up +if [ "$PROVIDED_NODES" != "" ]; then + # Use provided nodes + IFS=',' read -ra SELECTED_NODES <<< "$PROVIDED_NODES" +else + # Find nodes with the datamate label directly from Kubernetes + echo -e "${YELLOW}Finding nodes with $LABEL_KEY=$LABEL_VALUE label...${NC}" + NODES=$(kubectl get nodes -l "$LABEL_KEY=$LABEL_VALUE" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}') + + if [ -z "$NODES" ]; then + echo -e "${GREEN}No nodes found with $LABEL_KEY=$LABEL_VALUE label.${NC}" + echo -e "${YELLOW}Cleanup not needed - no nodes were labeled.${NC}" + exit 0 + fi + + SELECTED_NODES=() + while IFS= read -r NODE; do + if [ -n "$NODE" ]; then + SELECTED_NODES+=("$NODE") + fi + done <<< "$NODES" +fi + +if [ ${#SELECTED_NODES[@]} -eq 0 ]; then + echo -e "${YELLOW}No nodes to clean up.${NC}" + exit 0 +fi + +echo -e "${GREEN}Nodes to clean up:${NC}" +for NODE in "${SELECTED_NODES[@]}"; do + echo " - $NODE" +done + +echo "" +echo -e "${BLUE}Summary:${NC}" +echo " Label to remove: $LABEL_KEY" + +# Check if any node has the taint (grep -c already prints 0 on no match; "|| true" only absorbs its exit status 1) +HAS_TAINTS=false +for NODE in "${SELECTED_NODES[@]}"; do + TAINT_COUNT=$(kubectl get node "$NODE" -o jsonpath='{range .spec.taints[*]}{.key}{"\n"}{end}' | grep -cxF -- "${LABEL_KEY}" || true) + if [ "$TAINT_COUNT" -gt 0 ]; then + HAS_TAINTS=true + break + fi +done + +if [ "$HAS_TAINTS" = true ]; then + echo " Taint to remove: $LABEL_KEY=$LABEL_VALUE:$TAINT_EFFECT" +fi +echo "" + +read -p "Remove labels and taints? 
(y/n) [y]: " CONFIRM +CONFIRM=${CONFIRM:-y} + +if [ "$CONFIRM" != "y" ] && [ "$CONFIRM" != "Y" ]; then + echo -e "${YELLOW}Cancelled.${NC}" + exit 0 +fi + +echo "" +echo -e "${GREEN}Removing configuration...${NC}" + +# Remove labels from selected nodes +for NODE in "${SELECTED_NODES[@]}"; do + if [ "$DRY_RUN" = true ]; then + echo "[DRY-RUN] kubectl label node $NODE $LABEL_KEY-" + else + kubectl label node "$NODE" "$LABEL_KEY-" --overwrite + echo -e " ${GREEN}✓${NC} Removed label from $NODE" + fi +done + +# Remove taints (check if node has the taint) +for NODE in "${SELECTED_NODES[@]}"; do + # Check if node has the taint (exact, literal key match; "|| true" absorbs grep's exit 1 on zero matches) + HAS_TAINT=$(kubectl get node "$NODE" -o jsonpath='{range .spec.taints[*]}{.key}{"\n"}{end}' | grep -cxF -- "${LABEL_KEY}" || true) + + if [ "$HAS_TAINT" -gt 0 ]; then + if [ "$DRY_RUN" = true ]; then + echo "[DRY-RUN] kubectl taint node $NODE $LABEL_KEY=$LABEL_VALUE:$TAINT_EFFECT-" + else + kubectl taint node "$NODE" "$LABEL_KEY=$LABEL_VALUE:$TAINT_EFFECT-" --overwrite || true + echo -e " ${GREEN}✓${NC} Removed taint from $NODE" + fi + fi +done + +echo "" +echo -e "${GREEN}Cleanup complete!${NC}" +echo "" + +# Summary +echo -e "${BLUE}Summary:${NC}" +echo " Nodes cleaned: ${#SELECTED_NODES[@]}" +echo " Label removed: $LABEL_KEY" +if [ "$HAS_TAINTS" = true ]; then + echo " Taint removed: $LABEL_KEY=$LABEL_VALUE:$TAINT_EFFECT" +fi \ No newline at end of file diff --git a/tools/node-setup.sh b/tools/node-setup.sh new file mode 100755 index 0000000..ce13ed2 --- /dev/null +++ b/tools/node-setup.sh @@ -0,0 +1,520 @@ +#!/bin/bash +# +# DataMate Node Setup Script +# Interactive script to select nodes with keyboard navigation (↑/↓ or j/k) +# Automatically applies labels and taints for DataMate deployment +# +# Usage: ./node-setup.sh [--dry-run] [--namespace NAMESPACE] [--skip-taint] +# + +set -e + +# ============================================================================ +# Configuration +# 
============================================================================ + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +NC='\033[0m' # No Color + +# Fixed label and taint values (no prompts needed) +NAMESPACE="model-engine" +LABEL_KEY="node-role.kubernetes.io/datamate" +LABEL_VALUE="true" +TAINT_EFFECT="NoSchedule" +DRY_RUN=false +SKIP_TAINT=false + +# ============================================================================ +# Argument Parsing +# ============================================================================ + +parse_args() { + while [[ $# -gt 0 ]]; do + case $1 in + --dry-run) + DRY_RUN=true + shift + ;; + --namespace) + NAMESPACE="$2" + shift 2 + ;; + --skip-taint) + SKIP_TAINT=true + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac + done +} + +# ============================================================================ +# Prerequisite Checks +# ============================================================================ + +check_prerequisites() { + if ! command -v kubectl &> /dev/null; then + echo -e "${RED}Error: kubectl is not installed${NC}" + exit 1 + fi + + if ! 
kubectl cluster-info &> /dev/null; then + echo -e "${RED}Error: Cannot connect to Kubernetes cluster${NC}" + exit 1 + fi +} + +# ============================================================================ +# Node Data Collection +# ============================================================================ + +fetch_nodes() { + echo -e "${YELLOW}Fetching available nodes...${NC}" + + NODES=$(kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}') + NODE_COUNT=$(printf '%s\n' "$NODES" | grep -c . || true) # count non-empty lines; "echo | wc -l" reported 1 for an empty list + + if [ "$NODE_COUNT" -eq 0 ]; then + echo -e "${RED}Error: No nodes found in the cluster${NC}" + exit 1 + fi + + # Build node array with status info + NODE_ARRAY=() + NODE_STATUS=() + NODE_HAS_LABEL=() + + for NODE in $NODES; do + NODE_ARRAY+=("$NODE") + + # Get status + STATUS=$(kubectl get node "$NODE" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}') + NODE_STATUS+=("$STATUS") + + # Check if already labeled + CURRENT_LABEL=$(kubectl get node "$NODE" -o jsonpath='{.metadata.labels.node-role\.kubernetes\.io/datamate}') + if [ "$CURRENT_LABEL" = "true" ]; then + NODE_HAS_LABEL+=("true") + else + NODE_HAS_LABEL+=("false") + fi + done +} + +# ============================================================================ +# Interactive Menu Functions +# ============================================================================ + +# Initialize selection state array +init_selection() { + SELECTED=() + for i in $(seq 1 $NODE_COUNT); do + SELECTED+=("false") + done + CURRENT_INDEX=0 +} + +# Print the interactive menu +print_menu() { + # Clear screen and print header + echo -e "\033[2J\033[H" + echo -e "${BLUE}=====================================${NC}" + echo -e "${BLUE} DataMate Node Setup${NC}" + echo -e "${BLUE}=====================================${NC}" + echo "" + echo -e "${CYAN}Select nodes for DataMate deployment${NC}" + echo "" + + # Print each node with selection marker + for i in $(seq 0 $(($NODE_COUNT - 1))); do + 
NODE="${NODE_ARRAY[$i]}" + STATUS="${NODE_STATUS[$i]}" + HAS_LABEL="${NODE_HAS_LABEL[$i]}" + IS_SELECTED="${SELECTED[$i]}" + + # Status display + if [ "$STATUS" = "True" ]; then + STATUS_DISPLAY="${GREEN}Ready${NC}" + else + STATUS_DISPLAY="${RED}NotReady${NC}" + fi + + # Label marker + LABEL_MARKER="" + if [ "$HAS_LABEL" = "true" ]; then + LABEL_MARKER=" ${GREEN}[datamate]${NC}" + fi + + # Selection marker + if [ "$IS_SELECTED" = "true" ]; then + MARKER="${GREEN}[x]${NC}" + else + MARKER="${NC}[ ]${NC}" + fi + + # Highlight current row + if [ "$i" -eq "$CURRENT_INDEX" ]; then + echo -e " ${YELLOW}→${NC} $MARKER ${CYAN}$NODE${NC} ($STATUS_DISPLAY)$LABEL_MARKER" + else + echo -e " $MARKER $NODE ($STATUS_DISPLAY)$LABEL_MARKER" + fi + done + + echo "" + echo -e "${YELLOW}Navigation:${NC} ↑/k: up ↓/j: down ${GREEN}space${NC}: toggle ${GREEN}enter${NC}: confirm ${RED}q${NC}: quit" + echo "" + + # Show current selection count + SELECTED_COUNT=0 + for s in "${SELECTED[@]}"; do + if [ "$s" = "true" ]; then + SELECTED_COUNT=$((SELECTED_COUNT + 1)) + fi + done + echo -e "${BLUE}Selected: ${SELECTED_COUNT}/${NODE_COUNT} nodes${NC}" +} + +# Toggle selection at current index +toggle_selection() { + if [ "${SELECTED[$CURRENT_INDEX]}" = "true" ]; then + SELECTED[$CURRENT_INDEX]="false" + else + SELECTED[$CURRENT_INDEX]="true" + fi +} + +# Move cursor up +move_up() { + if [ "$CURRENT_INDEX" -gt 0 ]; then + CURRENT_INDEX=$((CURRENT_INDEX - 1)) + fi +} + +# Move cursor down +move_down() { + if [ "$CURRENT_INDEX" -lt $(($NODE_COUNT - 1)) ]; then + CURRENT_INDEX=$((CURRENT_INDEX + 1)) + fi +} + +# Get selected nodes list +get_selected_nodes() { + SELECTED_NODES=() + for i in $(seq 0 $(($NODE_COUNT - 1))); do + if [ "${SELECTED[$i]}" = "true" ]; then + SELECTED_NODES+=("${NODE_ARRAY[$i]}") + fi + done +} + +# ============================================================================ +# Keyboard Input Handling +# 
============================================================================ + +# Read single keypress (with fallback for non-terminal environments) +read_key() { + # Check if we're in a proper terminal + if [ ! -t 0 ]; then + # Not in terminal - use simple read mode + read -r key + echo "$key" + return + fi + + # Save current terminal settings + old_stty=$(stty -g 2>/dev/null) || return 1 + + # Set terminal to raw mode for single char read + stty raw -echo 2>/dev/null + + # Read key + key=$(dd bs=1 count=1 2>/dev/null) + + # Check for arrow keys (escape sequence) + if [ "$key" = $'\x1b' ]; then + # Read the next two chars for arrow key sequence + read -rs -t0.1 -n2 key2 2>/dev/null || true + key="${key}${key2}" + fi + + # Restore terminal settings BEFORE processing + stty "$old_stty" 2>/dev/null || true + + # In raw mode, Enter produces \r (carriage return), convert to \n for easier matching + if [ "$key" = $'\x0d' ]; then + key=$'\x0a' + fi + + echo "$key" +} + +# Main interactive loop +interactive_selection() { + init_selection + + while true; do + print_menu + + key=$(read_key) + + case "$key" in + # Arrow up or 'k' + $'\x1b[A'|k) + move_up + ;; + # Arrow down or 'j' + $'\x1b[B'|j) + move_down + ;; + # Space - toggle selection + ' ') + toggle_selection + ;; + # Enter - confirm + ''|$'\x0a') + get_selected_nodes + if [ ${#SELECTED_NODES[@]} -eq 0 ]; then + echo -e "${YELLOW}No nodes selected. Please select at least one node.${NC}" + sleep 1 + else + return 0 + fi + ;; + # Q - quit/skip + q|Q) + echo -e "\n${YELLOW}Skipping node setup.${NC}" + echo "" + # Create empty args file + HELM_ARGS_FILE="/tmp/datamate-helm-args.sh" + cat > "$HELM_ARGS_FILE" < "$HELM_ARGS_FILE" < "$HELM_ARGS_FILE" <