From 02ec636228443894aee7bf1307444ebf3d074315 Mon Sep 17 00:00:00 2001 From: Kushal Deb Date: Mon, 16 Feb 2026 20:12:20 +0530 Subject: [PATCH] cephadm: reapply hugepages for nvmeof at service start NVMeoF gateways (SPDK) require host hugepages (vm.nr_hugepages + /dev/hugepages). After a power cycle, some nodes boot with hugepages=0 (and/or the cephadm sysctl drop-in under /etc/sysctl.d is missing/not applied), causing SPDK to fail and the nvmeof container to crash-loop until the service is redeployed. Cephadm previously applied hugepages only during deploy/reconfig via install_sysctl(). Normal boot/start uses the generated systemd unit, which runs unit.run and does not reapply sysctl settings. Add a pre-start step for nvmeof to set vm.nr_hugepages to the configured value (from spdk_huge_pages, defaulting to 4096) before launching the container, so the service self-heals on reboot/service restart. Signed-off-by: Kushal Deb --- src/cephadm/cephadm.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 4e255e55165..18061df654b 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -1057,6 +1057,17 @@ def deploy_daemon_units( post_stop_commands.append( CephIscsi.configfs_mount_umount(data_dir, mount=False) ) + daemon = daemon_form_create(ctx, ident) + if ident.daemon_type == 'nvmeof': + hp = '4096' + files = getattr(daemon, 'files', None) + if isinstance(files, dict): + val = files.get('spdk_huge_pages') + if isinstance(val, int): + hp = str(val) + elif isinstance(val, str) and val.isdigit(): + hp = val + pre_start_commands.append(f'/usr/sbin/sysctl -w vm.nr_hugepages={hp} || true\n') runscripts.write_service_scripts( ctx, @@ -1071,7 +1082,7 @@ def deploy_daemon_units( ) # sysctl - install_sysctl(ctx, ident.fsid, daemon_form_create(ctx, ident)) + install_sysctl(ctx, ident.fsid, daemon) # systemd ic_ids = [ -- 2.47.3