-
Notifications
You must be signed in to change notification settings - Fork 348
DAOS-18928 dfuse: increase MAX_DAOS_MT to 32 #18526
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -64,7 +64,7 @@ | |||||||
| #define FREE(ptr) do {free(ptr); (ptr) = NULL; } while (0) | ||||||||
|
|
||||||||
| /* The max number of mount points for DAOS mounted simultaneously */ | ||||||||
| #define MAX_DAOS_MT (8) | ||||||||
| #define MAX_DAOS_MT (32) | ||||||||
|
|
||||||||
| #define READ_DIR_BATCH_SIZE (96) | ||||||||
| #define MAX_FD_DUP2ED (16) | ||||||||
|
|
@@ -687,12 +687,15 @@ discover_dfuse_mounts(void) | |||||||
| } | ||||||||
|
|
||||||||
| while ((fs_entry = getmntent(fp)) != NULL) { | ||||||||
| if (num_dfs >= MAX_DAOS_MT) { | ||||||||
| D_FATAL("dfs_list[] is full. Need to increase MAX_DAOS_MT.\n"); | ||||||||
| abort(); | ||||||||
| } | ||||||||
| pt_dfs_mt = &dfs_list[num_dfs]; | ||||||||
| if (memcmp(fs_entry->mnt_type, STR_AND_SIZE(MNT_TYPE_FUSE)) == 0) { | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit
Suggested change
|
||||||||
| if (num_dfs >= MAX_DAOS_MT) { | ||||||||
| D_WARN("Found more than MAX_DAOS_MT (%d) dfuse mount points. " | ||||||||
| "Disabling interception. Increase MAX_DAOS_MT to support " | ||||||||
| "more simultaneous mounts.\n", | ||||||||
| MAX_DAOS_MT); | ||||||||
| D_GOTO(out, rc = EOVERFLOW); | ||||||||
| } | ||||||||
| pt_dfs_mt = &dfs_list[num_dfs]; | ||||||||
| pt_dfs_mt->dcache = NULL; | ||||||||
| pt_dfs_mt->len_fs_root = strnlen(fs_entry->mnt_dir, DFS_MAX_PATH); | ||||||||
| if (pt_dfs_mt->len_fs_root >= DFS_MAX_PATH) { | ||||||||
|
|
||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,137 @@ | ||
| """ | ||
| (C) Copyright 2026 Hewlett Packard Enterprise Development LP | ||
|
|
||
| SPDX-License-Identifier: BSD-2-Clause-Patent | ||
| """ | ||
| import os | ||
|
|
||
| from apricot import TestWithServers | ||
| from dfuse_utils import get_dfuse, start_dfuse | ||
| from host_utils import get_local_host | ||
| from run_utils import run_remote | ||
|
|
||
| # Marker printed to stderr by libpil4dfs at process exit when D_IL_REPORT is set | ||
| # and interception is enabled. Its presence/absence tells us whether interception | ||
| # was active for the process. | ||
| INTERCEPT_MARKER = "libpil4dfs intercepting summary" | ||
|
|
||
|
|
||
| class Pil4dfsManyMounts(TestWithServers): | ||
| """Verify libpil4dfs handling of many dfuse mount points (MAX_DAOS_MT). | ||
|
|
||
| libpil4dfs discovers every fuse.daos mount point listed in /proc/self/mounts | ||
| when it initializes and stores them in a fixed-size table (MAX_DAOS_MT). When | ||
| the number of mount points is at or below the limit, interception is enabled | ||
| and used for all of them. When the number exceeds the limit, libpil4dfs must | ||
| gracefully disable interception (falling back to dfuse) rather than aborting | ||
| the application, so that no core file is produced. | ||
|
|
||
| :avocado: recursive | ||
| """ | ||
|
|
||
| def _run_case(self, pool, dfuse_hosts, env_str, mount_count, expect_intercept): | ||
| """Mount mount_count dfuse instances and run a single libpil4dfs process across them. | ||
|
|
||
| Args: | ||
| pool (TestPool): pool to create the containers in. | ||
| dfuse_hosts (NodeSet): hosts on which to mount dfuse and run the command. | ||
| env_str (str): shell prefix that loads libpil4dfs and enables D_IL_REPORT. | ||
| mount_count (int): number of dfuse mount points to mount simultaneously. | ||
| expect_intercept (bool): whether interception is expected to be enabled. | ||
| """ | ||
| self.log_step( | ||
| f"Case: {mount_count} mount points, " | ||
| f"expecting interception to be {'enabled' if expect_intercept else 'disabled'}") | ||
|
|
||
| dfuses = [] | ||
| mount_dirs = [] | ||
| try: | ||
| for _ in range(mount_count): | ||
| container = self.get_container(pool) | ||
| dfuse = get_dfuse(self, dfuse_hosts) | ||
| start_dfuse(self, dfuse, pool, container) | ||
| dfuses.append(dfuse) | ||
| mount_dirs.append(dfuse.mount_dir.value) | ||
|
|
||
| # A single libpil4dfs-intercepted process that touches every mount point. At | ||
| # initialization libpil4dfs discovers all fuse.daos mounts in /proc/self/mounts, | ||
| # so this exercises the MAX_DAOS_MT table regardless of which mount is accessed. | ||
| stat_cmd = env_str + "stat " + " ".join(mount_dirs) | ||
| result = run_remote(self.log, dfuse_hosts, stat_cmd) | ||
|
|
||
| # The process must always complete cleanly, regardless of how many mounts are | ||
| # present. Over the limit, libpil4dfs must disable interception gracefully and | ||
| # never abort (which would create a core file and fail the CI stage). | ||
| if not result.passed: | ||
| self.fail( | ||
| f"libpil4dfs process failed with {mount_count} mount points on " | ||
| f"{result.failed_hosts}; it must never abort") | ||
|
|
||
| intercepted = INTERCEPT_MARKER in result.joined_stdout | ||
|
|
||
| # Log the observed interception status so the test log shows each case behaving | ||
| # as expected (interception enabled at/below MAX_DAOS_MT, disabled above it). | ||
| self.log.info( | ||
| "Case result: %d mount points -> process succeeded, interception %s " | ||
| "(expected %s)", mount_count, "enabled" if intercepted else "disabled", | ||
| "enabled" if expect_intercept else "disabled") | ||
| if intercepted: | ||
| self.log.info( | ||
| "libpil4dfs interception summary for %d mount points:\n%s", | ||
| mount_count, result.joined_stdout) | ||
|
daltonbohning marked this conversation as resolved.
Outdated
|
||
|
|
||
| if expect_intercept and not intercepted: | ||
| self.fail( | ||
| f"Expected interception to be enabled with {mount_count} mount points, " | ||
| "but the libpil4dfs summary was not found") | ||
| if not expect_intercept and intercepted: | ||
| self.fail( | ||
| f"Expected interception to be disabled with {mount_count} mount points " | ||
| "(more than MAX_DAOS_MT), but the libpil4dfs summary was found") | ||
| finally: | ||
| # Unmount this case's dfuse instances so the next case only sees its own mounts. | ||
| for dfuse in dfuses: | ||
| dfuse.stop() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since we know this runs with 10, 32, and then 33 instances, do we want to optimize this to reuse the containers and mounts? I.e.
I could make that change if you agree
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good call. I will make the change. |
||
|
|
||
| def test_pil4dfs_many_mounts(self): | ||
| """JIRA ID: DAOS-18890. | ||
|
|
||
| Test Description: | ||
| Verify libpil4dfs behavior with dfuse mount point counts at/below and | ||
| above MAX_DAOS_MT, all within a single test run. No case may produce a | ||
| core file. | ||
|
|
||
| Steps: | ||
| 1.) Create a single pool. | ||
| 2.) For each count in intercept_mount_counts, mount that many dfuse | ||
| instances and confirm a single libpil4dfs process uses them all | ||
| (interception enabled). | ||
| 3.) Mount no_intercept_mount_count dfuse instances (more than | ||
| MAX_DAOS_MT) and confirm the libpil4dfs process completes without | ||
| aborting and with interception disabled. | ||
|
|
||
| :avocado: tags=all,daily_regression | ||
| :avocado: tags=vm | ||
| :avocado: tags=dfuse,pil4dfs | ||
| :avocado: tags=Pil4dfsManyMounts,test_pil4dfs_many_mounts | ||
| """ | ||
| intercept_mount_counts = self.params.get( | ||
| "intercept_mount_counts", "/run/test/*", [10, 32]) | ||
| no_intercept_mount_count = self.params.get( | ||
| "no_intercept_mount_count", "/run/test/*", 33) | ||
|
|
||
| lib_path = os.path.join(self.prefix, "lib64", "libpil4dfs.so") | ||
| env_str = ( | ||
| f"export LD_PRELOAD={lib_path}; export D_IL_NO_BYPASS=1; export D_IL_REPORT=1; ") | ||
|
daltonbohning marked this conversation as resolved.
Outdated
|
||
| dfuse_hosts = get_local_host() | ||
|
daltonbohning marked this conversation as resolved.
Outdated
|
||
|
|
||
| self.log_step("Creating a single pool") | ||
| pool = self.get_pool(connect=False) | ||
|
|
||
| for mount_count in intercept_mount_counts: | ||
| self._run_case(pool, dfuse_hosts, env_str, mount_count, expect_intercept=True) | ||
|
|
||
| self._run_case( | ||
| pool, dfuse_hosts, env_str, no_intercept_mount_count, expect_intercept=False) | ||
|
|
||
| self.log.info("Test passed") | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,29 @@ | ||||||
| hosts: | ||||||
| test_servers: 1 | ||||||
| test_clients: 1 | ||||||
| timeout: 900 | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Only if you need to push again: you could reduce this to 600, since the actual execution time was ~6 minutes.
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sigh... I did re-push but forgot about this. |
||||||
| server_config: | ||||||
| name: daos_server | ||||||
| engines_per_host: 1 | ||||||
| engines: | ||||||
| 0: | ||||||
| targets: 4 | ||||||
| nr_xs_helpers: 0 | ||||||
| storage: | ||||||
| 0: | ||||||
| class: ram | ||||||
| scm_mount: /mnt/daos | ||||||
| system_ram_reserved: 1 | ||||||
| pool: | ||||||
| size: 1GiB | ||||||
| container: | ||||||
| type: POSIX | ||||||
| control_method: daos | ||||||
| test: | ||||||
| # Mount counts at/below MAX_DAOS_MT for which libpil4dfs enables interception. | ||||||
| intercept_mount_counts: | ||||||
| - 10 | ||||||
| - 32 | ||||||
| # Mount count above MAX_DAOS_MT for which libpil4dfs must gracefully disable | ||||||
| # interception (no abort, no core file). | ||||||
| no_intercept_mount_count: 33 | ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we not make this 128 or 256? The cost of this is what, a few KB? Having 32 mounts is still within the realm of possibility on a common login.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suggest increasing the size to a value that is unlikely to ever be reached. Otherwise you'll just hit this limit again in the future and be annoyed again.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Discussed with Kevin offline. The change to simply disable interception once more than 32 mounts are present is sufficient: it no longer aborts applications as before, so it won't cause problems.