Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion utils/_context/_scenarios/auto_injection.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,12 @@ def pytest_sessionfinish(self, session: pytest.Session, exitstatus: int) -> None

def close_targets(self):
if self.is_main_worker:
# Extract logs from the VM before destroy
# Extract logs from the VM before destroy (including after provision failures)
if self.virtual_machine.provision_install_error is not None:
logger.stdout(
"Provision failed — downloading VM logs via SSH/SFTP "
f"(see {self.host_log_folder}/var/log/datadog_weblog/dd-agent-diagnostics.log)"
)
download_vm_logs(
vm=self.virtual_machine,
remote_folder_paths=["/var/log/datadog", "/var/log/datadog_weblog", "/tmp/datadog/java"], # noqa: S108
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,24 @@
local_path: binaries/

remote-command: |
# Pin to 7.78.4 agent release. APMSP-3059
export DD_AGENT_MAJOR_VERSION=7
export DD_AGENT_MINOR_VERSION=78.4
# APMSP-3059
#export DD_INSTALLER_AGENT_VERSION="pipeline-114830198"
#export DD_INSTALLER_INSTALLER_VERSION="pipeline-114830198"

#export TESTING_APT_URL="apttesting.datad0g.com"
#export TESTING_APT_REPO_VERSION="pipeline-114830198-a7-x86_64 7"
#export TESTING_YUM_URL="yumtesting.datad0g.com"
#export TESTING_YUM_VERSION_PATH="testing/pipeline-114830198-a7/7"
#export TESTING_KEYS_URL="apttesting.datad0g.com/test-keys"

export DD_INSTALLER_AGENT_VERSION="pipeline-114830198"
export DD_INSTALLER_INSTALLER_VERSION="pipeline-114830198"
export TESTING_APT_URL="apttesting.datad0g.com/datadog-agent/pipeline-114830198-a7"
export TESTING_APT_REPO_VERSION="stable-x86_64 7"
export TESTING_YUM_URL="yumtesting.datad0g.com"
export TESTING_YUM_VERSION_PATH="testing/pipeline-114830198-a7/7"
export TESTING_KEYS_URL="apttesting.datad0g.com/test-keys"

# Check if Docker is installed and ensure it's running
if command -v docker >/dev/null 2>&1; then
echo "Docker is installed, ensuring service is enabled and running..."
Expand Down Expand Up @@ -47,6 +62,7 @@

if [ "${DD_env}" == "dev" ]; then
# To force the installer to pull from dev repositories -- agent config is set manually to datadoghq.com
echo "RMM TEST DD_env: ${DD_env}"
export DD_SITE="datad0g.com"
export DD_INSTALLER_REGISTRY_URL='install.datad0g.com'
# The latest_snapshot of the Python tracer is 2.x, but we want 4.x, so get it from the repo.
Expand Down Expand Up @@ -80,11 +96,13 @@
fi

if [ -n "${DD_INSTALLER_INJECTOR_VERSION}" ]; then
echo "RMM TEST DD_INSTALLER_INJECTOR_VERSION: ${DD_INSTALLER_INJECTOR_VERSION}"
export DD_INSTALLER_REGISTRY_URL_APM_INJECT_PACKAGE='installtesting.datad0g.com'
export DD_INSTALLER_DEFAULT_PKG_VERSION_DATADOG_APM_INJECT="${DD_INSTALLER_INJECTOR_VERSION}"
fi

if [ -n "${DD_INSTALLER_AGENT_VERSION}" ]; then
echo "RMM TEST DD_INSTALLER_AGENT_VERSION: ${DD_INSTALLER_AGENT_VERSION}"
export DD_INSTALLER_REGISTRY_URL_AGENT_PACKAGE='installtesting.datad0g.com'
export DD_INSTALLER_DEFAULT_PKG_VERSION_DATADOG_AGENT="${DD_INSTALLER_AGENT_VERSION}"
fi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@ services:
datadog:
container_name: dd-agent
# Pin to 7.78.4 agent release. APMSP-3059
image: gcr.io/datadoghq/agent:7.78.4
#image: 669783387624.dkr.ecr.us-east-1.amazonaws.com/dockerhub/datadog/agent-dev:275edf32-py3-jmx
image: 669783387624.dkr.ecr.us-east-1.amazonaws.com/dockerhub/datadog/agent-dev:623e4f63-py3-jmx
environment:
- DD_API_KEY=${DD_API_KEY}
# EC2 Docker: explicit hostname required (Agent >=7.40, APMSP-3059)
#- DD_HOSTNAME=system-tests-ssi-agent
- DD_SITE=datadoghq.com
- DD_APM_ENABLED=true
- DD_LOG_LEVEL=TRACE
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,61 @@
#!/bin/bash
# shellcheck disable=SC2015

# Provision runs "sh create_and_run_app_container.sh" — re-exec with bash for traps/functions.
# BASH_VERSION is only set by bash, so an empty value means another shell is interpreting this file;
# exec replaces that shell with /bin/bash, forwarding the script path and all original arguments.
if [ -z "${BASH_VERSION:-}" ]; then
exec /bin/bash "$0" "$@"
fi

# Abort on the first unguarded failing command; the EXIT trap installed below then dumps diagnostics.
set -e

# File in the invoking user's home that dump_dd_agent_diagnostics appends its output to.
readonly DIAGNOSTICS_LOG="${HOME}/dd-agent-diagnostics.log"
# Marker echoed at startup and in the diagnostics header to identify this script revision in logs.
readonly SCRIPT_MARKER="create_and_run_app_container.sh diagnostics-v3"

# Set to 1 by dump_dd_agent_diagnostics once it has run, so later calls become no-ops.
_dd_agent_diagnostics_dumped=0

# Dump a one-time snapshot of the dd-agent container state for post-mortem debugging.
#
# Globals read:    DIAGNOSTICS_LOG, SCRIPT_MARKER
# Globals written: _dd_agent_diagnostics_dumped (set to 1 on the first real dump)
# Output:          diagnostics are echoed to stdout and appended to ${DIAGNOSTICS_LOG};
#                  a copy is placed in /var/log/datadog_weblog/dd-agent-diagnostics.log.
# Returns:         always 0. Every step is best-effort so that calling this helper
#                  (including from the EXIT trap under `set -e`) never replaces the
#                  script's real exit status with a diagnostics-related failure.
dump_dd_agent_diagnostics() {
    # Deduplicate: the explicit failure path and the EXIT trap may both call us.
    if [ "$_dd_agent_diagnostics_dumped" -eq 1 ]; then
        return 0
    fi
    # Nothing to report when the agent compose file is absent from the working directory.
    if [ ! -f docker-compose-agent-prod.yml ]; then
        return 0
    fi
    _dd_agent_diagnostics_dumped=1

    {
        echo "..:: DD-AGENT DIAGNOSTICS (${SCRIPT_MARKER}) ::.."
        date -u '+%Y-%m-%dT%H:%M:%SZ'
        sudo docker-compose -f docker-compose-agent-prod.yml ps 2>&1 || true
        if sudo docker inspect dd-agent >/dev/null 2>&1; then
            echo "..:: DD-AGENT HEALTH ::.."
            sudo docker inspect dd-agent --format '{{json .State.Health}}' 2>&1 || true
            echo "..:: DD-AGENT LOGS (docker logs) ::.."
            # Only the last 300 lines, to keep the dump bounded.
            sudo docker logs dd-agent 2>&1 | tail -300 || true
        else
            echo "..:: dd-agent container not found ::.."
            sudo docker ps -a 2>&1 || true
        fi
        echo "..:: DD-AGENT LOGS (docker-compose logs) ::.."
        sudo docker-compose -f docker-compose-agent-prod.yml logs --no-color datadog 2>&1 || true
    # The pipeline's status is tee's; guard it so an unwritable log file cannot
    # trip `set -e` and abort the function (and the EXIT trap) midway.
    } 2>&1 | tee -a "${DIAGNOSTICS_LOG}" || true

    # Mirror the log into /var/log/datadog_weblog (best-effort).
    sudo mkdir -p /var/log/datadog_weblog 2>/dev/null || true
    if [ -d /var/log/datadog_weblog ]; then
        sudo cp "${DIAGNOSTICS_LOG}" /var/log/datadog_weblog/dd-agent-diagnostics.log 2>/dev/null || true
        sudo chmod 644 /var/log/datadog_weblog/dd-agent-diagnostics.log 2>/dev/null || true
    fi
    # Flush buffers to disk (best-effort).
    sync 2>/dev/null || true
}

# Dump agent diagnostics on any failure (deduplicated if already printed)
# The trap saves $? first so the original exit status is re-raised via `exit "$status"` after the dump.
trap 'status=$?; if [ "$status" -ne 0 ]; then dump_dd_agent_diagnostics; fi; exit "$status"' EXIT

# Announce which script revision is running.
echo "..:: ${SCRIPT_MARKER} ::.."

# Writable by log download even when provision fails before vm_logs step
# Both commands are best-effort: errors are silenced and never abort the script.
sudo mkdir -p /var/log/datadog_weblog 2>/dev/null || true
sudo chmod 777 /var/log/datadog_weblog 2>/dev/null || true

# Recursively set mode 755 on everything in the current directory (unguarded: a failure aborts under set -e).
# shellcheck disable=SC2035
sudo chmod -R 755 *

Expand Down Expand Up @@ -38,9 +91,14 @@ done
if [ -f docker-compose-agent-prod.yml ]; then
# Agent may be installed in a different way
echo "DD_API_KEY=${DD_API_KEY}" > .env
sudo -E docker-compose -f docker-compose-agent-prod.yml up -d --remove-orphans datadog --wait --wait-timeout 120
if ! sudo -E docker-compose -f docker-compose-agent-prod.yml up -d --remove-orphans datadog --wait --wait-timeout 120; then
echo "..:: COMPOSE_WAIT_FAILED (dd-agent unhealthy or timeout) ::.." | tee -a "${DIAGNOSTICS_LOG}" >&2
dump_dd_agent_diagnostics
echo "..:: Diagnostics written to ${DIAGNOSTICS_LOG} ::.." >&2
exit 1
fi
fi
#Env variables set on the scenario definition. Write to file and load
#Env variables set on the scenario definition. Write to file and load
if [ ! -f scenario_app.env ]
then
SCENARIO_APP_ENV="${DD_APP_ENV:-''}"
Expand All @@ -50,7 +108,7 @@ then
fi
sudo -E docker-compose -f docker-compose.yml up -d test-app

echo "..:: RUNNING DOCKER SERVICES ::.."
echo "..:: RUNNING DOCKER SERVICES ::.."
sudo docker-compose ps
if [ -f docker-compose-agent-prod.yml ]; then
echo "..:: DATADOG AGENT OUTPUT ::.."
Expand Down
130 changes: 84 additions & 46 deletions utils/onboarding/debug_vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,98 +4,136 @@
from utils._logger import logger
from utils.virtual_machine.virtual_machines import _VirtualMachine


def download_vm_logs(vm: _VirtualMachine, remote_folder_paths: list[str], local_base_logs_folder: str):
"""Using SSH/SFTP connects to VM and downloads one or more folders from the remote machine

Args:
vm: Virtual machine object
remote_folder_paths: Single path (str) or list of paths (list[str]) to download
local_base_logs_folder: Base folder where logs will be stored locally

# Collect dd-agent docker diagnostics into /var/log/datadog_weblog (runs from VM user home).
#
# The command runs in two steps inside a single `bash -lc '...'` invocation:
#   1. If "$HOME/dd-agent-diagnostics.log" exists, copy it into
#      /var/log/datadog_weblog/dd-agent-diagnostics.log.
#   2. If docker-compose-agent-prod.yml exists in the home directory, capture a
#      fresh snapshot (compose ps, container health, last 300 log lines, compose logs).
# The fresh snapshot is APPENDED (`tee -a`) so it does not truncate the file copied
# in step 1; a plain `tee` would discard the copied diagnostics.
# NOTE: the payload must not contain single quotes, since it is wrapped in '...'.
_COLLECT_DD_AGENT_DIAGNOSTICS_CMD = r"""bash -lc '
sudo mkdir -p /var/log/datadog_weblog && sudo chmod 777 /var/log/datadog_weblog;
cd ~;
if [ -f "$HOME/dd-agent-diagnostics.log" ]; then
sudo cp "$HOME/dd-agent-diagnostics.log" /var/log/datadog_weblog/dd-agent-diagnostics.log 2>/dev/null || true;
fi;
if [ -f docker-compose-agent-prod.yml ]; then
{
echo "..:: DD-AGENT DIAGNOSTICS (download_vm_logs) ::..";
date -u "+%Y-%m-%dT%H:%M:%SZ";
sudo docker-compose -f docker-compose-agent-prod.yml ps 2>&1 || true;
if sudo docker inspect dd-agent >/dev/null 2>&1; then
echo "..:: DD-AGENT HEALTH ::..";
sudo docker inspect dd-agent --format "{{json .State.Health}}" 2>&1 || true;
echo "..:: DD-AGENT LOGS (docker logs) ::..";
sudo docker logs dd-agent 2>&1 | tail -300 || true;
else echo "..:: dd-agent container not found ::.."; sudo docker ps -a 2>&1 || true;
fi;
echo "..:: DD-AGENT LOGS (docker-compose logs) ::..";
sudo docker-compose -f docker-compose-agent-prod.yml logs --no-color datadog 2>&1 || true;
} | sudo tee -a /var/log/datadog_weblog/dd-agent-diagnostics.log >/dev/null;
fi'"""


def download_vm_logs(vm: _VirtualMachine, remote_folder_paths: list[str], local_base_logs_folder: str) -> bool:
"""Connect over SSH/SFTP and download folders from the remote machine.

Works even when provisioning failed (uses get_ssh_connection_for_log_download).

Returns True if at least one folder was downloaded successfully.
"""
# Handle both single path and list of paths for backward compatibility
if isinstance(remote_folder_paths, str):
remote_folder_paths = [remote_folder_paths]

if not vm.ssh_config.hostname:
logger.warning(
"Skipping VM log download for %s: no IP/hostname (VM may not have been created)",
vm.name,
)
return False

downloaded_any = False
try:
logger.info(f"Downloading folders from machine {vm.get_ip()}")
logger.info(f"Remote folders: {remote_folder_paths}")
logger.info(
"Downloading folders from machine %s (%s) provision_error=%s",
vm.name,
vm.ssh_config.hostname,
vm.provision_install_error is not None,
)
logger.info("Remote folders: %s", remote_folder_paths)

# Use the VM's get_ssh_connection method instead of creating our own
c = vm.get_ssh_connection()
logger.info(f"Connected [{vm.get_ip()}]")
connection = vm.get_ssh_connection_for_log_download()
logger.info("Connected [%s]", vm.ssh_config.hostname)

# Execute remote commands to collect logs prior to download
commands_to_run = [
"sudo mkdir -p /var/log/datadog_weblog || true",
# Docker and systemd related logs (mirrors utils/build/virtual_machine/provisions/auto-inject/auto-inject-vm_logs.yml)
"sudo docker-compose ps > /var/log/datadog_weblog/docker_proccess.log 2>&1 || true",
"sudo docker-compose logs > /var/log/datadog_weblog/docker_logs.log 2>&1 || true",
"sudo chmod 777 /var/log/datadog_weblog || true",
_COLLECT_DD_AGENT_DIAGNOSTICS_CMD,
# Docker and systemd related logs (mirrors auto-inject-vm_logs.yml)
"bash -lc 'cd ~ && sudo docker-compose ps > /var/log/datadog_weblog/docker_proccess.log 2>&1 || true'",
"bash -lc 'cd ~ && sudo docker-compose logs > /var/log/datadog_weblog/docker_logs.log 2>&1 || true'",
"sudo journalctl -xeu docker > /var/log/datadog_weblog/journalctl_docker.log 2>&1 || true",
# Copy Datadog Agent configuration files
"sudo cp /etc/datadog-agent/application_monitoring.yaml /var/log/datadog_weblog/application_monitoring.yaml 2>&1 || true",
# Additional logs requested
"sudo cat /var/log/cloud-init.log > /var/log/datadog_weblog/cloud-init.log 2>&1 || true",
"sudo cat /var/log/syslog > /var/log/datadog_weblog/syslog.log 2>&1 || true",
"sudo dmesg > /var/log/datadog_weblog/dmesg.log 2>&1 || true",
"sudo systemctl list-dependencies docker.service > /var/log/datadog_weblog/docker_list_dependencies.log 2>&1 || true",
"sudo systemctl list-timers --all > /var/log/datadog_weblog/system.timers.log 2>&1 || true",
"sudo crontab -l > /var/log/datadog_weblog/crontab.log 2>&1 || true",
"sudo cat /var/log/apt/history.log > /var/log/datadog_weblog/apt.log 2>&1 || true",
"sudo cat /var/log/yum.log > /var/log/datadog_weblog/yum.log 2>&1 || true",
"sudo cat /var/log/apt/history.log > /var/log/datadog_weblog/apt.log 2>&1 || true",
"sudo cat /var/log/yum.log > /var/log/datadog_weblog/yum.log 2>&1 || true",
]

for cmd in commands_to_run:
try:
logger.info(f"Executing remote command: {cmd}")
_stdin, stdout, _stderr = c.exec_command(cmd)
logger.info("Executing remote command: %s", cmd)
_stdin, stdout, _stderr = connection.exec_command(cmd)
exit_status = stdout.channel.recv_exit_status()
logger.info(f"Remote command exit status: {exit_status}")
logger.info("Remote command exit status: %s", exit_status)
except Exception as exec_err:
logger.warning(f"Failed executing command on {vm.get_ip()}: {cmd}")
logger.warning("Failed executing command on %s: %s", vm.ssh_config.hostname, cmd)
logger.exception(exec_err)

# Create SFTP client
sftp = c.open_sftp()
sftp = connection.open_sftp()

# Download each folder
for remote_folder_path in remote_folder_paths:
local_folder_path = f"{local_base_logs_folder}/{remote_folder_path}"
logger.info(f"Downloading: {remote_folder_path} -> {local_folder_path}")
logger.info("Downloading: %s -> %s", remote_folder_path, local_folder_path)

# Create local directory if it doesn't exist
local_path = Path(local_folder_path)
local_path.mkdir(parents=True, exist_ok=True)

# Download the folder recursively
_download_folder_recursive(sftp, remote_folder_path, local_folder_path)
if _download_folder_recursive(sftp, remote_folder_path, local_folder_path):
downloaded_any = True

sftp.close()
c.close()
logger.info(f"Successfully downloaded all folders from {vm.get_ip()}")
connection.close()
if downloaded_any:
logger.info(
"Successfully downloaded VM logs from %s into %s", vm.ssh_config.hostname, local_base_logs_folder
)
else:
logger.warning("No files downloaded from %s", vm.ssh_config.hostname)

except Exception:
logger.error("Cannot download folders from remote machine")
logger.exception("Cannot download folders from remote machine %s", vm.name)

return downloaded_any


def _download_folder_recursive(sftp: SFTPClient, remote_dir: str, local_dir: str):
"""Recursively download a folder using SFTP"""
def _download_folder_recursive(sftp: SFTPClient, remote_dir: str, local_dir: str) -> bool:
"""Recursively download a folder using SFTP. Returns True if at least one file was downloaded."""
downloaded_any = False
try:
# List contents of remote directory
for item in sftp.listdir_attr(remote_dir):
remote_path = f"{remote_dir}/{item.filename}"
local_path = Path(local_dir) / item.filename

if stat.S_ISDIR(item.st_mode):
# Create local directory and recursively download
local_path.mkdir(exist_ok=True)
logger.info(f"Created directory: {local_path}")
_download_folder_recursive(sftp, remote_path, str(local_path))
logger.info("Created directory: %s", local_path)
if _download_folder_recursive(sftp, remote_path, str(local_path)):
downloaded_any = True
else:
# Download file
logger.info(f"Downloading file: {remote_path} -> {local_path}")
logger.info("Downloading file: %s -> %s", remote_path, local_path)
sftp.get(remote_path, str(local_path))
downloaded_any = True

except Exception:
logger.error(f"Error downloading from {remote_dir}")
logger.exception("Error downloading from %s", remote_dir)

return downloaded_any
Loading
Loading