From e872693c151842ea8d6142effe65e604acecf8b8 Mon Sep 17 00:00:00 2001 From: Gil Bregman Date: Mon, 19 Jan 2026 14:18:03 +0200 Subject: [PATCH] mgr/cephadm: Add some new fields to the cephadm NVMEoF spec file. Fixes: https://tracker.ceph.com/issues/74446 Signed-off-by: Gil Bregman --- .../services/nvmeof/ceph-nvmeof.conf.j2 | 6 ++++++ src/pybind/mgr/cephadm/tests/test_services.py | 6 ++++++ .../ceph/deployment/service_spec.py | 19 ++++++++++++++++++- src/python-common/ceph/deployment/utils.py | 14 +++++++++++--- 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 index 2a0293998f9..2f9d6a84f86 100644 --- a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 @@ -27,6 +27,8 @@ verify_listener_ip = {{ spec.verify_listener_ip }} # This is a development flag, do not change it abort_on_errors = {{ spec.abort_on_errors }} # This is a development flag, do not change it +abort_on_update_error = {{ spec.abort_on_update_error }} +# This is a development flag, do not change it omap_file_ignore_unlock_errors = {{ spec.omap_file_ignore_unlock_errors }} # This is a development flag, do not change it omap_file_lock_on_read = {{ spec.omap_file_lock_on_read }} @@ -48,6 +50,8 @@ max_namespaces_per_subsystem = {{ spec.max_namespaces_per_subsystem }} max_hosts_per_subsystem = {{ spec.max_hosts_per_subsystem }} subsystem_cache_expiration = {{ spec.subsystem_cache_expiration }} force_tls = {{ spec.force_tls }} +# This is a development flag, do not change it +max_message_length_in_mb = {{ spec.max_message_length_in_mb }} [gateway-logs] log_level = {{ spec.log_level }} @@ -64,6 +68,8 @@ addr = {{ discovery_addr }} port = {{ spec.discovery_port }} # This is a development flag, do not change it abort_on_errors = {{ spec.abort_discovery_on_errors }} 
+bind_retries_limit = {{ spec.discovery_bind_retries_limit }} +bind_sleep_interval = {{ spec.discovery_bind_sleep_interval }} [ceph] pool = {{ spec.pool }} diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index 864c4e1a6f5..485f1e63fff 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -477,6 +477,8 @@ verify_listener_ip = True # This is a development flag, do not change it abort_on_errors = True # This is a development flag, do not change it +abort_on_update_error = True +# This is a development flag, do not change it omap_file_ignore_unlock_errors = False # This is a development flag, do not change it omap_file_lock_on_read = True @@ -498,6 +500,8 @@ max_namespaces_per_subsystem = 512 max_hosts_per_subsystem = 128 subsystem_cache_expiration = 30 force_tls = False +# This is a development flag, do not change it +max_message_length_in_mb = 4 [gateway-logs] log_level = INFO @@ -514,6 +518,8 @@ addr = 192.168.100.100 port = 8009 # This is a development flag, do not change it abort_on_errors = True +bind_retries_limit = 10 +bind_sleep_interval = 0.5 [ceph] pool = {pool} diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py index 722b5848d2e..932044741ca 100644 --- a/src/python-common/ceph/deployment/service_spec.py +++ b/src/python-common/ceph/deployment/service_spec.py @@ -36,7 +36,7 @@ import yaml from ceph.deployment.hostspec import HostSpec, SpecValidationError, assert_valid_host from ceph.deployment.utils import unwrap_ipv6, valid_addr, verify_non_negative_int from ceph.deployment.utils import verify_positive_int, verify_non_negative_number -from ceph.deployment.utils import verify_boolean, verify_enum +from ceph.deployment.utils import verify_boolean, verify_enum, verify_int from ceph.deployment.utils import parse_combined_pem_file from ceph.utils import is_hex from ceph.smb import 
constants as smbconst @@ -1660,6 +1660,7 @@ class NvmeofServiceSpec(ServiceSpec): max_gws_in_grp: Optional[int] = 16, max_ns_to_change_lb_grp: Optional[int] = 8, abort_on_errors: Optional[bool] = True, + abort_on_update_error: Optional[bool] = True, omap_file_ignore_unlock_errors: Optional[bool] = False, omap_file_lock_on_read: Optional[bool] = True, omap_file_lock_duration: Optional[int] = 20, @@ -1692,6 +1693,7 @@ class NvmeofServiceSpec(ServiceSpec): max_hosts_per_subsystem: Optional[int] = 128, subsystem_cache_expiration: Optional[int] = 30, force_tls: Optional[bool] = False, + max_message_length_in_mb: Optional[int] = 4, server_key: Optional[str] = None, server_cert: Optional[str] = None, client_key: Optional[str] = None, @@ -1721,6 +1723,8 @@ class NvmeofServiceSpec(ServiceSpec): discovery_addr: Optional[str] = None, discovery_addr_map: Optional[Dict[str, str]] = None, discovery_port: Optional[int] = None, + discovery_bind_retries_limit: Optional[int] = 10, + discovery_bind_sleep_interval: Optional[float] = 0.5, abort_discovery_on_errors: Optional[bool] = True, log_level: Optional[str] = 'INFO', log_files_enabled: Optional[bool] = True, @@ -1810,6 +1814,8 @@ class NvmeofServiceSpec(ServiceSpec): self.verify_listener_ip = verify_listener_ip #: ``abort_on_errors`` abort gateway in case of errors self.abort_on_errors = abort_on_errors + #: ``abort_on_update_error`` abort gateway in case of an error during update + self.abort_on_update_error = abort_on_update_error #: ``omap_file_ignore_unlock_errors`` ignore errors when unlocking the OMAP file self.omap_file_ignore_unlock_errors = omap_file_ignore_unlock_errors #: ``omap_file_lock_on_read`` lock omap when reading its content @@ -1842,6 +1848,8 @@ class NvmeofServiceSpec(ServiceSpec): self.subsystem_cache_expiration = subsystem_cache_expiration #: ``force_tls`` force using TLS when adding hosts and listeners self.force_tls = force_tls + #: ``max_message_length_in_mb`` max protobuf message length, in mb + 
self.max_message_length_in_mb = max_message_length_in_mb #: ``allowed_consecutive_spdk_ping_failures`` # of ping failures before aborting gateway self.allowed_consecutive_spdk_ping_failures = allowed_consecutive_spdk_ping_failures #: ``spdk_ping_interval_in_seconds`` sleep interval in seconds between SPDK pings @@ -1922,6 +1930,10 @@ class NvmeofServiceSpec(ServiceSpec): self.discovery_addr_map = discovery_addr_map #: ``discovery_port`` port of the discovery service self.discovery_port = discovery_port or 8009 + #: ``discovery_bind_retries_limit`` how many times to keep trying to bind the discovery port + self.discovery_bind_retries_limit = discovery_bind_retries_limit + #: ``discovery_bind_sleep_interval`` seconds to wait between each bind attempt + self.discovery_bind_sleep_interval = discovery_bind_sleep_interval #: ``abort_discovery_on_errors`` abort discovery service in case of errors self.abort_discovery_on_errors = abort_discovery_on_errors #: ``log_level`` the nvmeof gateway log level @@ -2055,6 +2067,7 @@ class NvmeofServiceSpec(ServiceSpec): verify_non_negative_int(self.max_ns_to_change_lb_grp, "Max namespaces to change load balancing group") verify_boolean(self.abort_on_errors, "Abort gateway on errors") + verify_boolean(self.abort_on_update_error, "Abort gateway on an update error") verify_boolean(self.omap_file_ignore_unlock_errors, "Ignore OMAP file unlock errors") verify_boolean(self.omap_file_lock_on_read, "Lock OMAP on read") verify_non_negative_int(self.omap_file_lock_duration, "OMAP file lock duration") @@ -2081,9 +2094,13 @@ class NvmeofServiceSpec(ServiceSpec): verify_non_negative_number(self.subsystem_cache_expiration, "Subsystem cache expiration period") verify_boolean(self.force_tls, "Force TLS") + verify_positive_int(self.max_message_length_in_mb, "Max protocol message length") verify_non_negative_number(self.monitor_timeout, "Monitor timeout") verify_non_negative_int(self.port, "Port") verify_non_negative_int(self.discovery_port, "Discovery 
port") + verify_int(self.discovery_bind_retries_limit, "Discovery port bind retries limit") + verify_non_negative_number(self.discovery_bind_sleep_interval, + "Sleep between discovery port bind retries") verify_boolean(self.abort_discovery_on_errors, "Abort discovery service on errors") verify_non_negative_int(self.prometheus_port, "Prometheus port") verify_non_negative_int(self.prometheus_stats_interval, "Prometheus stats interval") diff --git a/src/python-common/ceph/deployment/utils.py b/src/python-common/ceph/deployment/utils.py index 0bc92b6df7a..f28573b03fa 100644 --- a/src/python-common/ceph/deployment/utils.py +++ b/src/python-common/ceph/deployment/utils.py @@ -129,19 +129,27 @@ def verify_numeric(field: Any, field_name: str) -> None: raise SpecValidationError(f"{field_name} must be a number") -def verify_non_negative_int(field: Any, field_name: str) -> None: +def verify_int(field: Any, field_name: str) -> None: verify_numeric(field, field_name) if field is not None: if not isinstance(field, int) or isinstance(field, bool): raise SpecValidationError(f"{field_name} must be an integer") + + +def verify_non_negative_int(field: Any, field_name: str) -> None: + verify_numeric(field, field_name) + if field is not None: + verify_int(field, field_name) if field < 0: raise SpecValidationError(f"{field_name} can't be negative") def verify_positive_int(field: Any, field_name: str) -> None: verify_non_negative_int(field, field_name) - if field is not None and field <= 0: - raise SpecValidationError(f"{field_name} must be greater than zero") + if field is not None: + verify_int(field, field_name) + if field <= 0: + raise SpecValidationError(f"{field_name} must be greater than zero") def verify_non_negative_number(field: Any, field_name: str) -> None: -- 2.47.3