From: Adam King Date: Thu, 25 Sep 2025 20:13:18 +0000 (-0400) Subject: mgr/cephadm: split host cache entries if they exceed max mon store entry size X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2140659c5e5e567b461ff12ef2fe4d1abbeef8c3;p=ceph-ci.git mgr/cephadm: split host cache entries if they exceed max mon store entry size If the json blob we attempt to store for a host entry exceeds the max mon store entry size, we become unable to continue to store that host's information in the config key store. This means we only ever have the information from the last time the json blob was under the size limit each time the mgr fails over, resulting in a number of stray host/daemon warnings being generated and very outdated information being reported by `ceph orch ps` and `ceph orch ls` around the time of the failover. Signed-off-by: Adam King (cherry picked from commit ffe61afc2b5e6c2f4db1001e5288dcd8f995e570) Resolves: rhbz#2345474 --- diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index b0dfe2078ae..63ddfb68b28 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -45,6 +45,7 @@ NODE_PROXY_CACHE_PREFIX = 'node_proxy' class HostCacheStatus(enum.Enum): stray = 'stray' host = 'host' + additional_host_entry = 'additional_host_entry' devices = 'devices' @@ -985,13 +986,18 @@ class HostCache(): # type: () -> None for k, v in self.mgr.get_store_prefix(HOST_CACHE_PREFIX).items(): host = k[len(HOST_CACHE_PREFIX):] - if self._get_host_cache_entry_status(host) != HostCacheStatus.host: - if self._get_host_cache_entry_status(host) == HostCacheStatus.devices: + entry_type = self._get_host_cache_entry_status(host) + if entry_type != HostCacheStatus.host: + if ( + entry_type == HostCacheStatus.devices + or entry_type == HostCacheStatus.additional_host_entry + ): continue self.mgr.log.warning('removing stray HostCache host record %s' % ( host)) self.mgr.set_store(k, None) 
try: + host, v = self._combine_potential_split_entry(host, v) j = json.loads(v) if 'last_device_update' in j: self.last_device_update[host] = str_to_datetime(j['last_device_update']) @@ -1057,7 +1063,15 @@ class HostCache(): # .devices. where is # in out inventory. If neither case applies, it is stray if host in self.mgr.inventory: + # legacy entry "mgr/cephadm/host." return HostCacheStatus.host + elif ';' in host and host.split(';', 1)[0] in self.mgr.inventory: + # new entry style that allows multiple entries per host + # "mgr/cephadm/host.;" + if host.split(';', 1)[1] == '0': + return HostCacheStatus.host + else: + return HostCacheStatus.additional_host_entry try: # try stripping off the ".devices." and see if we get # a host name that matches our inventory @@ -1066,6 +1080,76 @@ class HostCache(): except Exception: return HostCacheStatus.stray + def _combine_potential_split_entry(self, hostname: str, entry_content: str) -> Tuple[str, str]: + # HostCache load function should have filtered out "additional" + # host entries "mgr/cephadm/host.;" where entry-id > 0 as + # well as device entries "mgr/cephadm/host./devices." + # if we're here, the entry should be either "mgr/cephadm/host." 
+ # or "mgr/cephadm/host.;0" + if ';' not in hostname: + # old style, just one entry for this host + return hostname, entry_content + else: + # new style of entry that allows splitting the host entry + hostname = hostname.split(';', 1)[0] + entries = [entry_content] + found_content = True + entry_id_to_check = 1 + while found_content: + next_entry_name = HOST_CACHE_PREFIX + hostname + f';{entry_id_to_check}' + next_entry_content = self.mgr.get_store(next_entry_name) + if next_entry_content: + entries.append(next_entry_content) + entry_id_to_check += 1 + else: + found_content = False + return hostname, ''.join(entries) + + def _potentially_split_and_save_host_entry(self, hostname: str, entry_content: str) -> None: + cache_size: int = self.mgr.get_foreign_ceph_option('mon', 'mon_config_key_max_entry_size') + if not cache_size: + self.mgr.set_store(HOST_CACHE_PREFIX + hostname + ';0', entry_content) + return + + def byte_len(s: str) -> int: + return len(s.encode('utf-8')) + + entries: List[str] = [] + required_entry_count = math.ceil(byte_len(entry_content) / cache_size) + chars_per_entry: int = math.floor(len(entry_content) / (required_entry_count)) + for i in range(required_entry_count): + if i == (required_entry_count - 1): + entry = entry_content[i * chars_per_entry:len(entry_content)] + entries.append(entry) + else: + entry = entry_content[i * chars_per_entry:(i + 1) * chars_per_entry] + entries.append(entry) + + for i in range(len(entries)): + self.mgr.set_store(HOST_CACHE_PREFIX + hostname + f';{i}', entries[i]) + + # it is possible that we now have less entries than we did previously if + # the user has moved daemons off of this host or the mon config key store + # entry max size has been raised. 
Make sure to clean up any extra entries + # if they are present + self._cleanup_potential_split_host_entries(hostname, starting_entry_id=len(entries)) + + def _cleanup_potential_split_host_entries(self, hostname: str, starting_entry_id: int = 0) -> None: + # cover case of old format that didn't allow splitting entries + self.mgr.set_store(HOST_CACHE_PREFIX + hostname, None) + + # cover case of new format that could have multiple entries per host + found_content = True + entry_id_to_check = starting_entry_id + while found_content: + next_entry_name = HOST_CACHE_PREFIX + hostname + f';{entry_id_to_check}' + next_entry_content = self.mgr.get_store(next_entry_name) + if next_entry_content: + self.mgr.set_store(next_entry_name, None) + entry_id_to_check += 1 + else: + found_content = False + def update_host_daemons(self, host, dm): # type: (str, Dict[str, orchestrator.DaemonDescription]) -> None self.daemons[host] = dm @@ -1268,7 +1352,7 @@ class HostCache(): if host in self.devices: self.save_host_devices(host) - self.mgr.set_store(HOST_CACHE_PREFIX + host, json.dumps(j)) + self._potentially_split_and_save_host_entry(host, json.dumps(j)) def save_host_devices(self, host: str) -> None: if host not in self.devices or not self.devices[host]: @@ -1368,7 +1452,7 @@ class HostCache(): del self.scheduled_daemon_actions[host] if host in self.last_client_files: del self.last_client_files[host] - self.mgr.set_store(HOST_CACHE_PREFIX + host, None) + self._cleanup_potential_split_host_entries(host) def get_hosts(self): # type: () -> List[str]