Skip to content
Open
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions executable.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,12 @@ RUN ./phpspy_build.sh
# async-profiler glibc
FROM centos${AP_BUILDER_CENTOS} AS async-profiler-builder-glibc
WORKDIR /tmp
COPY scripts/async_profiler_env_glibc.sh scripts/fix_centos7.sh ./
COPY scripts/async_profiler_env_glibc.sh scripts/fix_centos7.sh scripts/pdeathsigger.c ./
RUN if grep -q "CentOS Linux 7" /etc/os-release ; then \
./fix_centos7.sh; \
fi
RUN ./async_profiler_env_glibc.sh
RUN ./async_profiler_env_glibc.sh && \
gcc -static -o pdeathsigger pdeathsigger.c

COPY scripts/async_profiler_build_shared.sh .
RUN ./async_profiler_build_shared.sh
Expand Down Expand Up @@ -256,6 +257,7 @@ COPY --from=async-profiler-builder-glibc /usr/bin/xargs gprofiler/resources/php/

COPY --from=async-profiler-builder-glibc /tmp/async-profiler/build/bin/asprof gprofiler/resources/java/asprof
COPY --from=async-profiler-builder-glibc /tmp/async-profiler/build/async-profiler-version gprofiler/resources/java/async-profiler-version
COPY --from=async-profiler-builder-glibc /tmp/pdeathsigger gprofiler/resources/pdeathsigger
COPY --from=async-profiler-centos-min-test-glibc /libasyncProfiler.so gprofiler/resources/java/glibc/libasyncProfiler.so
COPY --from=async-profiler-builder-musl /tmp/async-profiler/build/lib/libasyncProfiler.so gprofiler/resources/java/musl/libasyncProfiler.so
COPY --from=node-package-builder-musl /tmp/module_build gprofiler/resources/node/module/musl
Expand Down
16 changes: 10 additions & 6 deletions gprofiler/containers_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,23 @@

logger = get_logger_adapter(__name__)

_containers_client: Optional[ContainersClient] = None


class ContainerNamesClient:
def __init__(self) -> None:
global _containers_client
try:
self._containers_client: Optional[ContainersClient] = ContainersClient()
logger.info(f"Discovered container runtimes: {self._containers_client.get_runtimes()}")
if _containers_client is None:
_containers_client = ContainersClient()

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's extract this to a method get_containers_client which will be locked (to avoid any concurrency issues) and maintain the cache.

Then, ContainerNamesClient runs self._containers_client = get_containers_client().

logger.info(f"Discovered container runtimes: {_containers_client.get_runtimes()}")
except NoContainerRuntimesError:
logger.warning(
"Could not find a Docker daemon or CRI-compatible daemon, profiling data will not"
" include the container names. If you do have a containers runtime and it's not supported,"
" please open a new issue here:"
" https://github.com/Granulate/gprofiler/issues/new"
)
self._containers_client = None

self._pid_to_container_name_cache: Dict[int, str] = {}
self._current_container_names: Set[str] = set()
Expand All @@ -53,7 +56,7 @@ def container_names(self) -> List[str]:
return list(self._current_container_names)

def get_container_name(self, pid: int) -> str:
if self._containers_client is None:
if _containers_client is None:
return ""

if not valid_perf_pid(pid):
Expand All @@ -73,7 +76,8 @@ def get_container_name(self, pid: int) -> str:
def _safely_get_process_container_name(self, pid: int) -> Optional[str]:
try:
try:
container_id = get_process_container_id(Process(pid))
process = Process(pid)
container_id = get_process_container_id(process)
if container_id is None:
return None
except NoSuchProcess:
Expand Down Expand Up @@ -103,5 +107,5 @@ def _get_container_name(self, container_id: str) -> Optional[str]:
def _refresh_container_names_cache(self) -> None:
# We re-fetch all of the currently running containers, so in order to keep the cache small we clear it
self._container_id_to_name_cache.clear()
for container in self._containers_client.list_containers() if self._containers_client is not None else []:
for container in _containers_client.list_containers() if _containers_client is not None else []:
self._container_id_to_name_cache[container.id] = container.name
30 changes: 2 additions & 28 deletions gprofiler/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,12 @@
import logging.handlers
import os
import shutil
import signal
import sys
import time
import traceback
from pathlib import Path
from threading import Event
from types import FrameType, TracebackType
from types import TracebackType
from typing import Iterable, List, Optional, Type, cast

import configargparse
Expand Down Expand Up @@ -74,6 +73,7 @@
reset_umask,
resource_path,
run_process,
setup_signals,
)
from gprofiler.utils.fs import escape_filename, mkdir_owned_root
from gprofiler.utils.proxy import get_https_proxy
Expand All @@ -98,21 +98,6 @@

UPLOAD_FILE_SUBCOMMAND = "upload-file"

# Minimum spacing, in seconds, between two KeyboardInterrupts raised by the handler below,
# no matter how many signals arrive within that window.
SIGINT_RATELIMIT = 0.5

# Monotonic timestamp of the last signal that was converted into a KeyboardInterrupt
# (None until the first one).
last_signal_ts: Optional[float] = None


def sigint_handler(sig: int, frame: Optional[FrameType]) -> None:
    """Signal handler that raises KeyboardInterrupt at most once per SIGINT_RATELIMIT seconds.

    :param sig: number of the received signal (unused).
    :param frame: stack frame that was interrupted (unused).
    :raises KeyboardInterrupt: when no interrupt was raised during the last
        SIGINT_RATELIMIT seconds; otherwise the signal is silently dropped.
    """
    global last_signal_ts
    now = time.monotonic()
    # no need for atomicity here: we can't get another SIGINT before this one returns.
    # https://www.gnu.org/software/libc/manual/html_node/Signals-in-Handler.html#Signals-in-Handler
    rate_limited = last_signal_ts is not None and now <= last_signal_ts + SIGINT_RATELIMIT
    if rate_limited:
        return
    last_signal_ts = now
    raise KeyboardInterrupt


class GProfiler:
def __init__(
Expand Down Expand Up @@ -939,17 +924,6 @@ def verify_preconditions(args: configargparse.Namespace, processes_to_profile: O
sys.exit(1)


def setup_signals() -> None:
    """Install the rate-limited ``sigint_handler`` for both SIGINT and SIGTERM."""
    # When we run under staticx & PyInstaller, both of them forward (some of the) signals to gProfiler.
    # We catch SIGINTs and ratelimit them, to avoid being interrupted again during the handling of the
    # first INT.
    # See my commit message for more information.
    #
    # SIGTERM is handled in the same manner - gracefully stop gProfiler. It is also forwarded by
    # staticx & PyInstaller, so it needs the same ratelimit.
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, sigint_handler)


def log_system_info() -> None:
system_info = get_static_system_info()
logger.info(f"gProfiler Python version: {system_info.python_version}")
Expand Down
7 changes: 6 additions & 1 deletion gprofiler/metadata/system_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,12 @@ def decode_libc_version(version: bytes) -> str:

try:
ldd_version = run_process(
["ldd", "--version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, suppress_log=True, check=False
["ldd", "--version"],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
suppress_log=True,
check=False,
pdeathsigger=False,
).stdout
except FileNotFoundError:
ldd_version = b"ldd not found"
Expand Down
4 changes: 3 additions & 1 deletion gprofiler/metadata/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def get_exe_version(
exe_path = f"/proc/{get_process_nspid(process.pid)}/exe"

def _run_get_version() -> "CompletedProcess[bytes]":
return run_process([exe_path, version_arg], stop_event=stop_event, timeout=get_version_timeout)
return run_process(
[exe_path, version_arg], stop_event=stop_event, timeout=get_version_timeout, pdeathsigger=False
)

try:
cp = run_in_ns(["pid", "mnt"], _run_get_version, process.pid)
Expand Down
3 changes: 2 additions & 1 deletion gprofiler/profilers/java.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ def _run_java_version() -> "CompletedProcess[bytes]":
],
stop_event=stop_event,
timeout=_JAVA_VERSION_TIMEOUT,
pdeathsigger=False,
)

# doesn't work without changing PID NS as well (I'm getting ENOENT for libjli.so)
Expand Down Expand Up @@ -1229,6 +1230,7 @@ def _check_async_profiler_loaded(self, process: Process) -> bool:
def _profile_process(self, process: Process, duration: int, spawned: bool) -> ProfileData:
comm = process_comm(process)
exe = process_exe(process)
container_name = self._profiler_state.get_container_name(process.pid)
java_version_output: Optional[str] = get_java_version_logged(process, self._profiler_state.stop_event)

if self._enabled_proc_events_java:
Expand Down Expand Up @@ -1258,7 +1260,6 @@ def _profile_process(self, process: Process, duration: int, spawned: bool) -> Pr
self._profiled_pids.add(process.pid)

logger.info(f"Profiling{' spawned' if spawned else ''} process {process.pid} with async-profiler")
container_name = self._profiler_state.get_container_name(process.pid)
app_metadata = self._metadata.get_metadata(process)
appid = application_identifiers.get_java_app_id(process, self._collect_spark_app_name)

Expand Down
3 changes: 2 additions & 1 deletion gprofiler/profilers/php.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,10 +210,11 @@ def extract_metadata_section(re_expr: Pattern, metadata_line: str) -> str:
if profiler_state.processes_to_profile is not None:
if pid not in [process.pid for process in profiler_state.processes_to_profile]:
continue
container_name = profiler_state.get_container_name(pid)
# TODO: appid & app metadata for php!
appid = None
app_metadata = None
profiles[pid] = ProfileData(results[pid], appid, app_metadata, profiler_state.get_container_name(pid))
profiles[pid] = ProfileData(results[pid], appid, app_metadata, container_name)

return profiles

Expand Down
1 change: 1 addition & 0 deletions gprofiler/profilers/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ def _run_python_process_in_ns() -> "CompletedProcess[bytes]":
[python_path, "-S", "-c", "import sys; print(sys.maxunicode)"],
stop_event=self._stop_event,
timeout=self._PYTHON_TIMEOUT,
pdeathsigger=False,
)

return run_in_ns(["pid", "mnt"], _run_python_process_in_ns, process.pid).stdout.decode().strip()
Expand Down
2 changes: 1 addition & 1 deletion gprofiler/profilers/python_ebpf.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,9 +307,9 @@ def snapshot(self) -> ProcessToProfileData:
if self._profiler_state.processes_to_profile is not None:
if process not in self._profiler_state.processes_to_profile:
continue
container_name = self._profiler_state.get_container_name(pid)
appid = application_identifiers.get_python_app_id(process)
app_metadata = self._metadata.get_metadata(process)
container_name = self._profiler_state.get_container_name(pid)
except NoSuchProcess:
appid = None
app_metadata = None
Expand Down
94 changes: 45 additions & 49 deletions gprofiler/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import atexit
import ctypes
import datetime
import glob
Expand All @@ -33,6 +34,7 @@
from subprocess import CompletedProcess, Popen, TimeoutExpired
from tempfile import TemporaryDirectory
from threading import Event
from types import FrameType
from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union, cast

import importlib_resources
Expand Down Expand Up @@ -70,6 +72,12 @@

gprofiler_mutex: Optional[socket.socket] = None

# 1 KeyboardInterrupt raised per this many seconds, no matter how many SIGINTs we get.
SIGINT_RATELIMIT = 0.5

_last_signal_ts: Optional[float] = None
_processes: List[Popen] = []


@lru_cache(maxsize=None)
def resource_path(relative_path: str = "") -> str:
Expand All @@ -90,52 +98,20 @@ def is_root() -> bool:
return os.geteuid() == 0


# Handle to the C library; loaded lazily by the first prctl() call and reused afterwards.
libc: Optional[ctypes.CDLL] = None


def prctl(*argv: Any) -> int:
    """Call libc's ``prctl`` with the given arguments and return its integer result.

    The library is opened with ``use_errno=True``, so callers can read the error code
    through ``ctypes.get_errno()`` after a failing call.
    """
    global libc
    handle = libc
    if handle is None:
        handle = libc = ctypes.CDLL("libc.so.6", use_errno=True)
    result = handle.prctl(*argv)
    return cast(int, result)


# prctl option used below to set the "parent-death signal" of the calling process.
PR_SET_PDEATHSIG = 1


def set_child_termination_on_parent_death() -> int:
    """Set SIGTERM as the parent-death signal of the calling process.

    :return: the return value of ``prctl``; 0 means success. On any other value a
        warning with the errno and its description is logged.
    """
    status = prctl(PR_SET_PDEATHSIG, signal.SIGTERM)
    if status == 0:
        return status

    errno = ctypes.get_errno()
    logger.warning(
        f"Failed to set parent-death signal on child process. errno: {errno}, strerror: {os.strerror(errno)}"
    )
    return status


def wrap_callbacks(callbacks: List[Callable]) -> Callable:
    """Bundle several zero-argument callbacks into a single callable.

    :param callbacks: callbacks to run, in order.
    :return: a callable that invokes every callback in order and returns the value
        produced by the last one (``None`` when ``callbacks`` is empty).
    """

    def wrapper() -> Any:
        # Every callback is invoked, in order; only the final result is kept.
        results = [callback() for callback in callbacks]
        return results[-1] if results else None

    return wrapper


def start_process(
cmd: Union[str, List[str]],
via_staticx: bool = False,
term_on_parent_death: bool = True,
tmpdir: Optional[Path] = None,
**kwargs: Any,
) -> Popen:
global _processes

if isinstance(cmd, str):
cmd = [cmd]

if kwargs.pop("pdeathsigger", True) and is_linux():
cmd = [resource_path("pdeathsigger")] + cmd if is_linux() else cmd

logger.debug("Running command", command=cmd)

env = kwargs.pop("env", None)
Expand All @@ -161,23 +137,17 @@ def start_process(
# explicitly remove our directory from LD_LIBRARY_PATH
env["LD_LIBRARY_PATH"] = ""

if is_windows():
cur_preexec_fn = None # preexec_fn is not supported on Windows platforms. subprocess.py reports this.
else:
cur_preexec_fn = kwargs.pop("preexec_fn", os.setpgrp)
if term_on_parent_death:
cur_preexec_fn = wrap_callbacks([set_child_termination_on_parent_death, cur_preexec_fn])

popen = Popen(
process = Popen(
cmd,
stdout=kwargs.pop("stdout", subprocess.PIPE),
stderr=kwargs.pop("stderr", subprocess.PIPE),
stdin=subprocess.PIPE,
preexec_fn=cur_preexec_fn,
start_new_session=is_linux(), # TODO: change to "process_group" after upgrade to Python 3.11+
env=env,
**kwargs,
)
return popen
_processes.append(process)
return process


def wait_event(timeout: float, stop_event: Event, condition: Callable[[], bool], interval: float = 0.1) -> None:
Expand Down Expand Up @@ -355,10 +325,9 @@ def pgrep_maps(match: str) -> List[Process]:
# this is much faster than iterating over processes' maps with psutil.
# We use flag -E in grep to support systems where grep is not PCRE
result = run_process(
f"grep -lE '{match}' /proc/*/maps",
["sh", "-c", f"grep -lE '{match}' /proc/*/maps"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True,
suppress_log=True,
check=False,
)
Expand Down Expand Up @@ -563,3 +532,30 @@ def merge_dicts(source: Dict[str, Any], dest: Dict[str, Any]) -> Dict[str, Any]:

def is_profiler_disabled(profile_mode: str) -> bool:
    """Return True when ``profile_mode`` is one of the values that switch a profiler off."""
    return profile_mode == "none" or profile_mode == "disabled"


def _exit_handler() -> None:
    """Kill every child process recorded in ``_processes``.

    Runs as an ``atexit`` hook, so it is strictly best-effort: a failure to kill one
    child must not prevent the attempt on the remaining ones.
    """
    for process in _processes:
        try:
            process.kill()
        except OSError:
            # e.g. the child is already gone, or we are no longer permitted to signal it.
            # Nothing useful can be done at interpreter exit - move on to the next child.
            continue


def _sigint_handler(sig: int, frame: Optional[FrameType]) -> None:
    """Signal handler that raises KeyboardInterrupt at most once per SIGINT_RATELIMIT seconds.

    :param sig: number of the received signal (unused).
    :param frame: stack frame that was interrupted (unused).
    :raises KeyboardInterrupt: when no interrupt was raised during the last
        SIGINT_RATELIMIT seconds; otherwise the signal is silently dropped.
    """
    global _last_signal_ts
    now = time.monotonic()
    # no need for atomicity here: we can't get another SIGINT before this one returns.
    # https://www.gnu.org/software/libc/manual/html_node/Signals-in-Handler.html#Signals-in-Handler
    rate_limited = _last_signal_ts is not None and now <= _last_signal_ts + SIGINT_RATELIMIT
    if rate_limited:
        return
    _last_signal_ts = now
    raise KeyboardInterrupt


def setup_signals() -> None:
    """Register the exit hook and the rate-limited handler for SIGINT and SIGTERM."""
    # Make sure the child-process cleanup runs when the interpreter exits.
    atexit.register(_exit_handler)

    # When we run under staticx & PyInstaller, both of them forward (some of the) signals to gProfiler.
    # We catch SIGINTs and ratelimit them, to avoid being interrupted again during the handling of the
    # first INT.
    # See my commit message for more information.
    #
    # SIGTERM is handled in the same manner - gracefully stop gProfiler. It is also forwarded by
    # staticx & PyInstaller, so it needs the same ratelimit.
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, _sigint_handler)
2 changes: 1 addition & 1 deletion gprofiler/utils/fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def is_rw_exec_dir(path: Path) -> bool:

# try executing
try:
run_process([str(test_script)], suppress_log=True)
run_process([str(test_script)], suppress_log=True, pdeathsigger=False)
except PermissionError:
# noexec
return False
Expand Down
2 changes: 1 addition & 1 deletion granulate-utils
Loading