Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions src/client/dfuse/pil4dfs/int_dfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
#define FREE(ptr) do {free(ptr); (ptr) = NULL; } while (0)

/* The max number of mount points for DAOS mounted simultaneously */
#define MAX_DAOS_MT (8)
#define MAX_DAOS_MT (32)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we not make this 128 or 256? The cost of this is what, a few KB? 32 is still in the realm of possibility of mounts on a common login.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest increasing the size to something that will likely not happen. Otherwise you'll just hit this in the future and be annoyed again.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed with Kevin offline. The change to simply disable interception once more than 32 mounts are present is sufficient; it no longer aborts applications and causes problems as it did before.


#define READ_DIR_BATCH_SIZE (96)
#define MAX_FD_DUP2ED (16)
Expand Down Expand Up @@ -687,12 +687,15 @@ discover_dfuse_mounts(void)
}

while ((fs_entry = getmntent(fp)) != NULL) {
if (num_dfs >= MAX_DAOS_MT) {
D_FATAL("dfs_list[] is full. Need to increase MAX_DAOS_MT.\n");
abort();
}
pt_dfs_mt = &dfs_list[num_dfs];
if (memcmp(fs_entry->mnt_type, STR_AND_SIZE(MNT_TYPE_FUSE)) == 0) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit

Suggested change
if (memcmp(fs_entry->mnt_type, STR_AND_SIZE(MNT_TYPE_FUSE)) == 0) {
if (memcmp(fs_entry->mnt_type, STR_AND_SIZE(MNT_TYPE_FUSE)) != 0)
continue;

if (num_dfs >= MAX_DAOS_MT) {
D_WARN("Found more than MAX_DAOS_MT (%d) dfuse mount points. "
"Disabling interception. Increase MAX_DAOS_MT to support "
"more simultaneous mounts.\n",
MAX_DAOS_MT);
D_GOTO(out, rc = EOVERFLOW);
}
pt_dfs_mt = &dfs_list[num_dfs];
pt_dfs_mt->dcache = NULL;
pt_dfs_mt->len_fs_root = strnlen(fs_entry->mnt_dir, DFS_MAX_PATH);
if (pt_dfs_mt->len_fs_root >= DFS_MAX_PATH) {
Expand Down
137 changes: 137 additions & 0 deletions src/tests/ftest/dfuse/pil4dfs_many_mounts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""
(C) Copyright 2026 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
import os

from apricot import TestWithServers
from dfuse_utils import get_dfuse, start_dfuse
from host_utils import get_local_host
from run_utils import run_remote

# Marker printed to stderr by libpil4dfs at process exit when D_IL_REPORT is set
# and interception is enabled. Its presence/absence tells us whether interception
# was active for the process.
# NOTE(review): the test looks for this marker in result.joined_stdout, so it
# presumably relies on run_remote() folding stderr into stdout by default - confirm.
INTERCEPT_MARKER = "libpil4dfs intercepting summary"


class Pil4dfsManyMounts(TestWithServers):
"""Verify libpil4dfs handling of many dfuse mount points (MAX_DAOS_MT).

libpil4dfs discovers every fuse.daos mount point listed in /proc/self/mounts
when it initializes and stores them in a fixed-size table (MAX_DAOS_MT). When
the number of mount points is at or below the limit, interception is enabled
and used for all of them. When the number exceeds the limit, libpil4dfs must
gracefully disable interception (falling back to dfuse) rather than aborting
the application, so that no core file is produced.

:avocado: recursive
"""

def _run_case(self, pool, dfuse_hosts, env_str, mount_count, expect_intercept):
    """Mount mount_count dfuse instances and run a single libpil4dfs process across them.

    The test fails if the process does not complete cleanly or if the observed
    interception state differs from expect_intercept. Every dfuse instance started
    by this case is asked to stop before returning, even when stopping one of them
    fails, so a later case is not run against leftover mounts.

    Args:
        pool (TestPool): pool to create the containers in.
        dfuse_hosts (NodeSet): hosts on which to mount dfuse and run the command.
        env_str (str): shell prefix that loads libpil4dfs and enables D_IL_REPORT.
        mount_count (int): number of dfuse mount points to mount simultaneously.
        expect_intercept (bool): whether interception is expected to be enabled.
    """
    expected_state = "enabled" if expect_intercept else "disabled"
    self.log_step(
        f"Case: {mount_count} mount points, "
        f"expecting interception to be {expected_state}")

    dfuses = []
    mount_dirs = []
    try:
        for _ in range(mount_count):
            container = self.get_container(pool)
            dfuse = get_dfuse(self, dfuse_hosts)
            start_dfuse(self, dfuse, pool, container)
            dfuses.append(dfuse)
            mount_dirs.append(dfuse.mount_dir.value)

        # A single libpil4dfs-intercepted process that touches every mount point. At
        # initialization libpil4dfs discovers all fuse.daos mounts in /proc/self/mounts,
        # so this exercises the MAX_DAOS_MT table regardless of which mount is accessed.
        stat_cmd = env_str + "stat " + " ".join(mount_dirs)
        result = run_remote(self.log, dfuse_hosts, stat_cmd)

        # The process must always complete cleanly, regardless of how many mounts are
        # present. Over the limit, libpil4dfs must disable interception gracefully and
        # never abort (which would create a core file and fail the CI stage).
        if not result.passed:
            self.fail(
                f"libpil4dfs process failed with {mount_count} mount points on "
                f"{result.failed_hosts}; it must never abort")

        # NOTE(review): the marker is documented as going to stderr while stdout is
        # searched here; presumably run_remote() merges the two streams - confirm.
        intercepted = INTERCEPT_MARKER in result.joined_stdout
        observed_state = "enabled" if intercepted else "disabled"

        # Log the observed interception status so the test log shows each case behaving
        # as expected (interception enabled at/below MAX_DAOS_MT, disabled above it).
        self.log.info(
            "Case result: %d mount points -> process succeeded, interception %s "
            "(expected %s)", mount_count, observed_state, expected_state)
        if intercepted:
            self.log.info(
                "libpil4dfs interception summary for %d mount points:\n%s",
                mount_count, result.joined_stdout)

        if expect_intercept and not intercepted:
            self.fail(
                f"Expected interception to be enabled with {mount_count} mount points, "
                "but the libpil4dfs summary was not found")
        if not expect_intercept and intercepted:
            self.fail(
                f"Expected interception to be disabled with {mount_count} mount points "
                "(more than MAX_DAOS_MT), but the libpil4dfs summary was found")
    finally:
        # Unmount this case's dfuse instances so the next case only sees its own mounts.
        # Keep going when one stop fails: bailing out early would leave the remaining
        # instances mounted and skew the mount count seen by the following case.
        stop_errors = []
        for dfuse in dfuses:
            try:
                dfuse.stop()
            except Exception as error:  # pylint: disable=broad-except
                self.log.error("Failed to stop dfuse: %s", error)
                stop_errors.append(error)
        if stop_errors:
            # Surface the first failure once every instance has been attempted.
            raise stop_errors[0]

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we know this runs with 10, 32, then 33 instances, do we want to optimize this to reuse the containers and mounts? I.e.

  1. Create and mount 10 containers
  2. Verify behavior
  3. Create and mount additional containers up to 32
  4. Verify behavior
  5. Create and mount additional containers up to 33
  6. Verify behavior

I could make that change if you agree

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call. I will make the change.
I will wait for all testing to complete first, then post a follow-up PR with test-only changes to address your feedback.


def test_pil4dfs_many_mounts(self):
    """JIRA ID: DAOS-18890.

    Test Description:
        Exercise libpil4dfs with dfuse mount point counts at/below and above
        MAX_DAOS_MT in one test run. None of the cases may produce a core file.

    Steps:
        1.) Create a single pool.
        2.) For every count listed in intercept_mount_counts, mount that many
            dfuse instances and check that one libpil4dfs process uses all of
            them (interception enabled).
        3.) Mount no_intercept_mount_count dfuse instances (more than
            MAX_DAOS_MT) and check that the libpil4dfs process finishes without
            aborting and with interception disabled.

    :avocado: tags=all,daily_regression
    :avocado: tags=vm
    :avocado: tags=dfuse,pil4dfs
    :avocado: tags=Pil4dfsManyMounts,test_pil4dfs_many_mounts
    """
    intercept_mount_counts = self.params.get(
        "intercept_mount_counts", "/run/test/*", [10, 32])
    no_intercept_mount_count = self.params.get(
        "no_intercept_mount_count", "/run/test/*", 33)

    # Shell prefix: preload libpil4dfs, force interception on and request the
    # exit-time report that _run_case() looks for.
    lib_path = os.path.join(self.prefix, "lib64", "libpil4dfs.so")
    exports = (f"LD_PRELOAD={lib_path}", "D_IL_NO_BYPASS=1", "D_IL_REPORT=1")
    env_str = "".join(f"export {setting}; " for setting in exports)
    dfuse_hosts = get_local_host()

    self.log_step("Creating a single pool")
    pool = self.get_pool(connect=False)

    # Cases in execution order: the counts that must intercept first, then the
    # single over-the-limit count that must not.
    cases = [(count, True) for count in intercept_mount_counts]
    cases.append((no_intercept_mount_count, False))
    for mount_count, should_intercept in cases:
        self._run_case(
            pool, dfuse_hosts, env_str, mount_count, expect_intercept=should_intercept)

    self.log.info("Test passed")
29 changes: 29 additions & 0 deletions src/tests/ftest/dfuse/pil4dfs_many_mounts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
hosts:
test_servers: 1
test_clients: 1
timeout: 900

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only if you need to push again: could reduce this to 600 since the actual execution time was ~6 minutes

Suggested change
timeout: 900
timeout: 600

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sigh... I did re-push but forgot about this.

server_config:
name: daos_server
engines_per_host: 1
engines:
0:
targets: 4
nr_xs_helpers: 0
storage:
0:
class: ram
scm_mount: /mnt/daos
system_ram_reserved: 1
pool:
size: 1GiB
container:
type: POSIX
control_method: daos
test:
# Mount counts at/below MAX_DAOS_MT for which libpil4dfs enables interception.
intercept_mount_counts:
- 10
- 32
# Mount count above MAX_DAOS_MT for which libpil4dfs must gracefully disable
# interception (no abort, no core file).
no_intercept_mount_count: 33
Loading