]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mgr/call_home: refactor agent
authorYaarit Hatuka <yhatuka@ibm.com>
Thu, 16 May 2024 02:08:32 +0000 (22:08 -0400)
committerJustin Caratzas <jcaratza@redhat.com>
Tue, 23 Sep 2025 13:07:09 +0000 (09:07 -0400)
This refactor allows for versatility and ease of maintenance,
as well as introduces logic to handle cooldown and stale requests of
log uploads.

Signed-off-by: Yaarit Hatuka <yhatuka@ibm.com>
(cherry picked from commit 0b9327bb5b4e75f26715966890b8239fa684985c)

When cherry-picking to the 9.0 branch a conflict occurred:

Conflicts:
src/pybind/mgr/call_home_agent/module.py

We resolved it by keeping the changes in this patch and deleting the
original version.

22 files changed:
src/pybind/mgr/call_home_agent/__init__.py
src/pybind/mgr/call_home_agent/config.py [deleted file]
src/pybind/mgr/call_home_agent/dataDicts.py [deleted file]
src/pybind/mgr/call_home_agent/design.md [new file with mode: 0644]
src/pybind/mgr/call_home_agent/exceptions.py [new file with mode: 0644]
src/pybind/mgr/call_home_agent/module.py
src/pybind/mgr/call_home_agent/prometheus.py [new file with mode: 0644]
src/pybind/mgr/call_home_agent/report.py [new file with mode: 0644]
src/pybind/mgr/call_home_agent/report_inventory.py [new file with mode: 0644]
src/pybind/mgr/call_home_agent/report_last_contact.py [new file with mode: 0644]
src/pybind/mgr/call_home_agent/report_performance.py [new file with mode: 0644]
src/pybind/mgr/call_home_agent/report_status_alerts.py [new file with mode: 0644]
src/pybind/mgr/call_home_agent/report_status_health.py [new file with mode: 0644]
src/pybind/mgr/call_home_agent/report_ur_error.py [new file with mode: 0644]
src/pybind/mgr/call_home_agent/tests/response_no_pending_ur.json [new file with mode: 0644]
src/pybind/mgr/call_home_agent/tests/response_yes_pending_ur.json [new file with mode: 0644]
src/pybind/mgr/call_home_agent/tests/test_agent.py
src/pybind/mgr/call_home_agent/tests/testfile1 [new file with mode: 0644]
src/pybind/mgr/call_home_agent/tests/testfile2 [new file with mode: 0644]
src/pybind/mgr/call_home_agent/tests/testfile3 [new file with mode: 0644]
src/pybind/mgr/call_home_agent/workflow_upload_snap.py [new file with mode: 0644]
src/pybind/mgr/dashboard/controllers/call_home.py

index c0985397934152c8d8b8f97817c58632855e5dfb..9a80709cdd786b1d2952ca7183c756c95756a180 100644 (file)
@@ -2,4 +2,4 @@ import os
 if 'UNITTEST' in os.environ:
     import tests
 
-from .module import CallHomeAgent
\ No newline at end of file
+from .module import CallHomeAgent
diff --git a/src/pybind/mgr/call_home_agent/config.py b/src/pybind/mgr/call_home_agent/config.py
deleted file mode 100644 (file)
index 85da598..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
-import os
-import sys
-from cryptography.hazmat.primitives.ciphers.aead import AESGCM
-import base64
-import json
-
-call_home_keys = '/usr/share/ceph/mgr/call_home_agent/ceph_call_home'  # default location of the key file
-
-decryption_key = b'yDVH70MMpzBnu5Y1dKfJrw=='
-decyption_nonce = b'1K6HRTiLD80laBi6'
-
-def get_settings() -> dict:
-    if 'UNITTEST' in os.environ:
-        return {'api_key': 'test_api_key', 'private_key': 'test_private_key'}
-
-    try:
-        encrypted_keys = _load_encrypted_keys()
-        aes_key = base64.b64decode(decryption_key)
-        nonce = base64.b64decode(decyption_nonce)
-        aesgcm = AESGCM(aes_key)
-        clear_keys = aesgcm.decrypt(nonce, encrypted_keys, b'')
-        keys = json.loads(clear_keys)
-        return keys
-    except Exception as e:
-        raise Exception(f"Error getting encrypted settings: {e}")
-
-def _load_encrypted_keys() -> bytes:
-    key_file = os.environ.get('CALLHOMEKEYSFILE', call_home_keys)
-    if not os.path.isfile(key_file):
-        raise Exception(f"Can't find key file {key_file}")
-
-    with open(key_file, 'rb') as f:
-        return f.read()
diff --git a/src/pybind/mgr/call_home_agent/dataDicts.py b/src/pybind/mgr/call_home_agent/dataDicts.py
deleted file mode 100644 (file)
index e09f5b9..0000000
+++ /dev/null
@@ -1,286 +0,0 @@
-from datetime import datetime
-from typing import Any, Optional
-import json
-import os
-import jwt
-import re
-import time
-from mgr_module import CommandResult
-
-from .config import get_settings
-
-# Constants for operations types:
-UPLOAD_SNAP = 'upload_snap'
-UPLOAD_FILE = 'upload_file'
-DISABLE_SI_MESSAGES = 'disable_si_messages'
-CONFIRM_RESPONSE = 'confirm_response'
-NOT_SUPPORTED = 'unknown_operation'
-
-# Constants for operation status
-OPERATION_STATUS_NEW = 'READY'
-OPERATION_STATUS_IN_PROGRESS = 'IN_PROGRESS'
-OPERATION_STATUS_COMPLETE = 'COMPLETE'
-OPERATION_STATUS_ERROR = 'ERROR'
-OPERATION_STATUS_REQUEST_REJECTED = 'REQUEST_REJECTED'
-
-#Constants for operations status delivery
-ST_NOT_SENT = 0
-ST_SENT = 1
-
-def confirm_response_event(ceph_cluster_id: str, report_timestamp: float,
-                           tenant_id: str) -> dict:
-    """
-    Return a confirm response event
-    """
-    event_time = datetime.fromtimestamp(report_timestamp).strftime("%Y-%m-%d %H:%M:%S")
-    event_time_ms = int(report_timestamp * 1000)
-
-    return {
-        "header": {
-                "event_type": "confirm_response",
-                "event_id": f"IBM_event_RedHatMarine_ceph_{ceph_cluster_id}_{event_time_ms}_confirm_response_event",
-                "event_time": f"{event_time}",
-                "event_time_ms": event_time_ms,
-                "tenant_id": tenant_id,
-        },
-        "body": {
-                "event_transaction_id": "Unsolicited_Storage_Insights_RedHatMarine_ceph_Request",
-                "event_type": "last_contact",
-                "component": "ceph_operations"
-        }
-    }
-
-def upload_snap_operation_event(ceph_cluster_id: str, report_timestamp: float,
-                                tenant_id: str, operation: dict) -> dict:
-    """
-    Return an event based in the operation passed as parameter
-    """
-    event_time = datetime.fromtimestamp(report_timestamp).strftime("%Y-%m-%d %H:%M:%S")
-    event_time_ms = int(report_timestamp * 1000)
-
-    return {
-            "header": {
-                "event_type": "status",
-                "event_id": f"IBM_event_RedHatMarine_ceph_{ceph_cluster_id}_{event_time_ms}_upload_snap_status__event",
-                "event_time": f"{event_time}",
-                "event_time_ms": event_time_ms,
-                "tenant_id": tenant_id,
-            },
-            "body": {
-                "event_transaction_id": "Unsolicited_Storage_Insights_RedHatMarine_ceph_Request",
-                "product":  "Red Hat Ceph",
-                "component": "ceph_log_upload",
-                "description":  operation['description'],
-                "state" : f"{operation['status']} ({operation['progress']}%)",
-                "complete" : (operation['status'] != OPERATION_STATUS_IN_PROGRESS),
-                "payload": {
-                    "action": "Unsolicited_Storage_Insights_RedHatMarine_ceph_Request",
-                    "description": operation['description'],
-                    "state" : operation['status'],
-                    "progress": operation['progress'],
-                    "complete" : (operation['status'] != OPERATION_STATUS_IN_PROGRESS),
-                    "si_requestid": operation['si_requestid'],
-                }
-            }
-        }
-
-class ReportHeader:
-    def collect(report_type: str, ceph_cluster_id: str, ceph_version: str,
-                report_timestamp: float, mgr_module: Any, target_space: str = 'prod',
-                operation_event_id: str = '') -> dict:
-        try:
-            id_data = get_settings()
-        except Exception as e:
-            mgr_module.log.error(f"Error getting encrypted identification keys for {report_type} report: {e}. "
-                                 "Provide keys and restart IBM Ceph Call Home module")
-            id_data = {'api_key': '', 'private_key': ''}
-
-        report_time = datetime.fromtimestamp(report_timestamp).strftime("%Y-%m-%d %H:%M:%S")
-        report_time_ms = int(report_timestamp * 1000)
-        local_report_time = datetime.fromtimestamp(report_timestamp).strftime("%a %b %d %H:%M:%S %Z")
-
-        if not operation_event_id:
-            event_id = "IBM_chc_event_RedHatMarine_ceph_{}_{}_report_{}".format(ceph_cluster_id, report_type, report_time_ms)
-        else:
-            event_id = operation_event_id
-
-        return {
-                "agent": "RedHat_Marine_firmware_agent",
-                "api_key": "{}".format(id_data['api_key']),
-                "private_key": "{}".format(id_data['private_key']),
-                "target_space": "{}".format(target_space),
-                "asset": "ceph",
-                "asset_id": "{}".format(ceph_cluster_id),
-                "asset_type": "RedHatMarine",
-                "asset_vendor": "IBM",
-                "asset_virtual_id": "{}".format(ceph_cluster_id),
-                "country_code": "",
-                "event_id": event_id,
-                "event_time": "{}".format(report_time),
-                "event_time_ms": report_time_ms,
-                "local_event_time": "{}".format(local_report_time),
-                "software_level": {
-                    "name": "ceph_software",
-                    "vrmf": "{}".format(ceph_version)
-                },
-                "type": "eccnext_apisv1s",
-                "version": "1.0.0.1",
-                "analytics_event_source_type": "asset_event",
-                "analytics_type": "ceph",
-                "analytics_instance":  "{}".format(ceph_cluster_id),
-                "analytics_virtual_id": "{}".format(ceph_cluster_id),
-                "analytics_group": "Storage",
-                "analytics_category": "RedHatMarine",
-                "events": []
-            }
-
-class ReportEvent():
-    def collect(event_type: str, component: str, report_timestamp: float, ceph_cluster_id: str,
-                icn: str, tenant_id: str, description: str, content: dict,
-                mgr_module: Any, operation_key: str = "") -> dict:
-
-        # OPERATION STATUS Reports:
-        # ----------------------------------------------------------------------
-        event_data = {}
-        if operation_key != "":
-            try:
-                operation = content["type"]
-                if operation == UPLOAD_SNAP:
-                    event_data = upload_snap_operation_event(ceph_cluster_id,
-                                                             report_timestamp,
-                                                             tenant_id,
-                                                             content)
-                elif operation == CONFIRM_RESPONSE:
-                    event_data = confirm_response_event(ceph_cluster_id,
-                                                        report_timestamp,
-                                                        tenant_id)
-            except Exception:
-                mgr_module.log.error(f'not able to obtain event data: {ex}')
-            return event_data
-
-        # event time data
-        event_time = datetime.fromtimestamp(report_timestamp).strftime("%Y-%m-%d %H:%M:%S")
-        event_time_ms = int(report_timestamp * 1000)
-        local_event_time = datetime.fromtimestamp(report_timestamp).strftime("%a %b %d %H:%M:%S %Z")
-
-        # INVENTORY; CLUSTER STATUS; LAST_CONTACT reports
-        # ----------------------------------------------------------------------
-        # Extract jti from JWT. This is another way to identify clusters in addition to the ICN.
-        jwt_jti = ""
-        reg_credentials_str = ceph_command(mgr=mgr_module, srv_type='mon',
-                                           prefix='config-key get',
-                                           key='mgr/cephadm/registry_credentials')
-        if reg_credentials_str:
-            jti_token_fail = ""
-            try:
-                reg_credentials = json.loads(reg_credentials_str)
-                user_jwt_password = r"{}".format(reg_credentials['password'])
-                registry_url = reg_credentials['url']
-                if re.match(mgr_module.valid_container_registry, registry_url):
-                    jwt_jti = jwt.decode(user_jwt_password, options={
-                                        "verify_signature": False})["jti"]
-                    mgr_module.log.info("JWT jti field extracted succesfully")
-                else:
-                    jti_token_fail = f"url for registry credentials stored in <mgr/cephadm/registry_url> does not match with the expected ones <{mgr_module.valid_container_registry}>"
-            except Exception as ex:
-                jti_token_fail = str(ex)
-
-            if jti_token_fail:
-                mgr_module.log.warning(
-                    f"not able to extract <jti> from JWT token, a valid not empty jti token is required in <mgr/cephadm/registry_password> field password: {jti_token_fail}")
-
-        event_data = {
-                "header": {
-                    "event_id": "IBM_event_RedHatMarine_ceph_{}_{}_{}_event".format(ceph_cluster_id, event_time_ms, event_type),
-                    "event_time": "{}".format(event_time),
-                    "event_time_ms": event_time_ms,
-                    "event_type": "{}".format(event_type),
-                    "local_event_time": "{}".format(local_event_time)
-                },
-                "body": {
-                    "component": component,
-                    "context": {
-                        "origin": 2,
-                        "timestamp": event_time_ms,
-                        "transid": event_time_ms
-                    },
-                    "description": "".format(description),
-                    "payload": {}
-                }
-            }
-
-        # The perfo report is special because elastic and kafka reqs (IBM)
-        # and the payload needs to have only the perfstats content
-        if event_type == 'performance':
-            event_data["body"]["payload"]["perfstats"] = content["perfstats"]
-        else:
-            event_data["body"]["payload"]["request_time"] = event_time_ms
-            event_data["body"]["payload"]["ibm_customer_number"] = icn
-            event_data["body"]["payload"]["content"] = content
-            event_data["body"]["payload"]["product_id_list"] = [
-                            ['5900-AVA', 'D0CYVZX'],
-                            ['5900-AVA', 'D0CYWZX'],
-                            ['5900-AVA', 'D0CYXZX'],
-                            ['5900-AVA', 'D0DKDZX'],
-                            ['5900-AVA', 'E0CYUZX'],
-                            ['5900-AXK', 'D0DSJZX'],
-                            ['5900-AXK', 'D0DSKZX'],
-                            ['5900-AXK', 'D0DSMZX'],
-                            ['5900-AXK', 'D0DSLZX'],
-                            ['5900-AXK', 'E0DSIZX'],
-                        ]
-            event_data["body"]["payload"]["jti"] = jwt_jti
-
-        if event_type == 'inventory':
-            if tenant_id:
-                event_data["header"]["tenant_id"] = "{}".format(tenant_id)
-
-        if event_type == 'status':
-            event_data["body"]["event_transaction_id"] = "IBM_event_RedHatMarine_ceph_{}_{}_{}_event".format(ceph_cluster_id, event_time_ms, event_type)
-
-            if component != 'ceph_alerts':
-                event_data["body"]["state"] =  "{}".format(content['status']['health']['status'])
-            else:
-                # if the status event contains alerts we add a boolean in the body to help with analytics
-                event_data["body"]["alert"] =  True
-                # Call Home requires the 'state' attribute in the 'body' section
-                event_data["body"]["state"] = "Ok"
-
-            event_data["body"]["complete"] = True
-
-            if tenant_id:
-                event_data["header"]["tenant_id"] = "{}".format(tenant_id)
-
-        if event_type == 'last_contact':
-            # Additional fields to enable response with commands
-            event_data["body"]["context"]["messagetype"] = 1
-            event_data["body"]["enable_response_detail"] = True
-            event_data["body"]["enable_response_detail_filter"] = ["Unsolicited_Storage_Insights_RedHatMarine_ceph_Request"]
-
-        return event_data
-
-def ceph_command(mgr: Any, srv_type, prefix, srv_spec='', inbuf='', **kwargs):
-    # type: (Any, str, str, Optional[str], str, Any) -> Any
-    #
-    # Note: A simplified version of the function used in dashboard ceph services
-    """
-    :type prefix: str
-    :param srv_type: mon |
-    :param kwargs: will be added to argdict
-    :param srv_spec: typically empty. or something like "<fs_id>:0"
-    :param to_json: if true return as json format
-    """
-    argdict = {
-        "prefix": prefix,
-    }
-    argdict.update({k: v for k, v in kwargs.items() if v is not None})
-    result = CommandResult("")
-    mgr.send_command(result, srv_type, srv_spec, json.dumps(argdict), "", inbuf=inbuf)
-    r, outb, outs = result.wait()
-    if r != 0:
-        mgr.log.error(f"Execution of command '{prefix}' failed. (r={r}, outs=\"{outs}\", kwargs={kwargs})")
-    try:
-        return outb or outs
-    except Exception as ex:
-        mgr.log.error(f"Execution of command '{prefix}' failed: {ex}")
-        return outb
diff --git a/src/pybind/mgr/call_home_agent/design.md b/src/pybind/mgr/call_home_agent/design.md
new file mode 100644 (file)
index 0000000..f158dd9
--- /dev/null
@@ -0,0 +1,144 @@
+
+# Abbreviations / Glossary
+
+- **UR** : Unsolicited Request. A request message that we receive from "IBM Call
+  Home". Currently we receive those in the HTTP reply for the "Last Contact"
+report.
+- **agent** : Instance of the CallHomeAgent class, which is the main class of the
+  module. Usually it gets passed to constructors and functions as the first
+argument so that they can use the CallHomeAgent to get configuration, etc. Note
+that CallHomeAgent inherits from MgrModule, therefore all of its API is
+available through the "agent".
+
+---
+
+# Classes
+
+The main classes are those inheriting from or implementing the interface of:
+
+- **WorkFlow** (interface)
+- **Report** (base class)
+- **Event** (base class)
+
+WorkFlow implements the logic of _what_ to do, while Report and Event are only tools / macros that help generate the report JSON format.
+A new report object, and the Event object (or objects) in it, are created every time that a report needs to be sent, and destroyed after the report is sent.
+They do not live past a single report.
+
+# Helper classes and interfaces:
+
+- **reportTimes** - Holds the time of the report, and provides an API to get time, time_ms and local_time fields needed in the report.
+- **URInfo** (interface) : Provides information about a UR, such as its ID for stale, ID for cooldown, timeout for cooldown, etc.
+
+Currently, the implementation does not require sending more than one
+event in a report, and each event+report pair is sent in one, and only one,
+workflow. So even though the design supports having any number of events in a
+report, and a report being sent from any workflow, for readability's sake, the
+derived events are declared in the same file as the report class that uses them.
+E.g., `class EventStatusHealth` and `class ReportStatusHealth` are both declared in `ReportStatusHealth.py`.
+
+---
+
+## Workflow
+
+An interface (not a base class).
+Workflow classes implement a "workflow" that requires sending more than one Report.
+Currently the only one is `WorkFlowUploadSnap`, which implements the following:
+
+- WorkFlowUploadSnap:
+  - collects diagnostics commands
+  - collects SOS report if needed
+  - uploads those reports to ECuRep while sending *ReportStatusLogUpload* with the progress
+  - sends *ReportConfirmResponse* to mark that we processed this UR
+
+### Interface
+
+- `__init__(self, agent, req, req_id)`
+- run()
+
+## Report
+
+Report implements the envelope of a message sent to IBM Call Home. Most of the reports are similar, but there are small differences
+between them; therefore a specific report, such as ReportLastContact, inherits from Report and extends it with specific changes.
+
+### Report base class
+
+You can override any of these methods in derived classes to change the behavior
+
+#### `__init__(self, agent, report_type, event_classes = [])`
+Initialize the Report object
+
+- `report_type`: string
+- `event_classes`: list of `Event*` classes which should be included in this report. E.g. [EventStatusHealth, EventLastContact]
+
+#### `compile(self)`
+Creates the report, instantiate the Event classes given at `__init__` and calls their `generate()` to create the events and add them to the report
+compile can return None if there is nothing to send.
+
+This method is overridden in `ReportStatusAlerts` in order to return None if
+there were no changes in alerts and therefore no need to send the report.
+`run()`, which calls `compile()`, will check the return value for None and won't call
+`send()` if there is nothing to send.
+
+#### `run(self)`
+Calls `self.compile()`, and if there is any data returned then calls `self.send()` to actually send the report.
+
+### `send(self, report: dict, force: bool = False)`
+Sends the report
+
+## Event
+
+Event implements the specific event that is sent in a report. The Event base class fills the boilerplate of the event such as the time fields.
+Specific events inherit from Event and implement the difference.
+
+#### `__init__`
+Each derived Event can implement its own signature.
+
+#### `gather(self)`
+Most classes that derive from `Event` will implement `gather(self)`, which collects and returns a dictionary of the payload data that needs to be sent in this event (i.e. without the Event headers).
+After which, the derived class will override `generate` to push this data into the Event payload. See `EventInventory.generate` for an example.
+
+### Event base class
+
+#### `__init__(self, agent)`
+Initialize the Event object
+
+#### `generate(self, event_type: str, component: str, report_times: ReportTimes)`
+Creates the event dictionary (headers etc.).
+Usually overridden in derived classes. The overriding method must call `super.gather` in its first line.
+
+#### `set_content(self, content)`
+Set the `body.payload.content` to `content`.
+Usually called by an overridden gather method.
+
+#### `data`
+Member variable. Contains the populated event dictionary.
+
+### EventGeneric base class
+Inherits from `Event` and is used as a base class for `EventStatusHealth`, `EventInventory`, `EventLastContact` and `EventStatusAlerts`
+
+#### `generate(self, event_type: str, component: str, description: str, report_times: ReportTimes)`
+overrides Event.generate and adds information that is common to all the reports that are listed above.
+
+
+## URInfo interface
+Helper interface to provide information about a UR, such as its ID for stale handling, ID for cooldown, timeout for cooldown, etc.
+Usually short-lived, e.g.:
+```python
+if URUploadSnap(req).id_for_cooldown() in self.ur_cooldowns:
+    continue  # don't process until the cooldown is over
+```
+
+Each UR type must implement the following:
+
+- `__init__(self, req: dict)`: Initialize the URInfo. `req` is the UR `request` received.
+- `id(self)`: Returns a unique ID representing this UR for `stale` handling - i.e. uniquely identifying this UR.
+- `id_for_cooldown(self)`: Returns an ID representing this TYPE of UR for
+  "cooldown" purposes, and not the specific instance thereof.  i.e. all
+  UploadSnap URs of the same level should receive the same ID: `upload_snap-1`
+  for upload snap level 1 (1 == no SOS report)
+- `cooldown_timeout(self)`: Return the cooldown time in seconds that this type of UR should have. Example:
+  - for "upload snap" level 2 the cooldown timeout is 2 hours because it's a heavy operation on the cluster,
+  - for "upload snap" level 1 the cooldown timeout is 5 minutes because it's a light operation
+
+Currently there is only one implementation of this interface - URUploadSnap()
+
diff --git a/src/pybind/mgr/call_home_agent/exceptions.py b/src/pybind/mgr/call_home_agent/exceptions.py
new file mode 100644 (file)
index 0000000..396623d
--- /dev/null
@@ -0,0 +1,7 @@
+
+class SendError(Exception):
+    """
+    Raised when requests.send() fails
+    """
+    pass
+
index 9769d180f7c01c6b248c3950ee4009b2e478ddcf..5c7eeeebf55c31fbf4b919d3023f41d47dfa0eb1 100644 (file)
-"""
-IBM Ceph Call Home Agent
-Authors:
-    Yaarit Hatuka <yhatuka@ibm.com>
-    Juan Miguel Olmo Martinez <jolmomar@ibm.com>
-"""
-
-from typing import List, Any, Tuple, Dict, Optional, Callable
-import time
+from .report import Report
+from .report_last_contact import ReportLastContact
+from .workflow_upload_snap import WorkFlowUploadSnap
+from .report_ur_error import ReportURError
+from .report_inventory import ReportInventory
+from .report_status_alerts import ReportStatusAlerts
+from .report_status_health import ReportStatusHealth
+from .report_performance import ReportPerformance
+
+from mgr_module import MgrModule, Option, CLIReadCommand, CLIWriteCommand, HandleCommandResult, CommandResult
+
+from typing import Optional, Tuple, Any
+import datetime
 import json
-import requests
-import asyncio
 import os
-from datetime import datetime
-import uuid
-import glob
-import re
+import sys
+from cryptography.hazmat.primitives.ciphers.aead import AESGCM
 import base64
-import zstandard as zstd
-
-from mgr_module import (Option, CLIReadCommand, CLIWriteCommand, MgrModule,
-                        HandleCommandResult)
-# from .dataClasses import ReportHeader, ReportEvent
-from .dataDicts import ReportHeader, ReportEvent, ceph_command
-
-# Dict to store operations requested from Call Home Mesh
-operations = {}
-
-# Constants for operations types:
-from .dataDicts import UPLOAD_SNAP, UPLOAD_FILE, DISABLE_SI_MESSAGES, CONFIRM_RESPONSE, NOT_SUPPORTED
-
-# Constants for operation status
-from .dataDicts import OPERATION_STATUS_NEW, OPERATION_STATUS_IN_PROGRESS, \
-                       OPERATION_STATUS_COMPLETE, OPERATION_STATUS_ERROR, \
-                       OPERATION_STATUS_REQUEST_REJECTED
-
-# Constants for operation status delivery
-from .dataDicts import ST_NOT_SENT, ST_SENT
-
-from .config import get_settings
-
-# Constant for store default ceph logs folder
-# Diagnostic files are collected in this folder
-DIAGS_FOLDER = '/var/log/ceph'
-
-class SendError(Exception):
-    pass
-
-# Prometheus API returns all alerts. We want to send only deltas in the alerts
-# report - i.e. send a *new* alert that has been fired since the last report
-# was sent, and send a “resolved” notification when an alert is removed from
-# the prometheus API.
-# To do so we keep a list of alerts (“sent_alerts”) we have already sent, and
-# use that to create a delta report in generate_alerts_report(). The alert
-# report is not sent if there are no deltas.
-# `ceph callhome reset alerts` zeros out sent_alerts list and therefore the
-# next report will contain the relevant alerts that are fetched from the
-# Prometheus API.
-sent_alerts = {}
-
-def get_prometheus_url(mgr: Any) -> str:
-    """
-    Provides the prometheus server URL
-    """
-    daemon_list = mgr.remote('cephadm', 'list_daemons', service_name='prometheus')
-    if daemon_list.exception_str:
-        raise Exception(f"Alert report: Error finding the Prometheus instance: {daemon_list.exception_str}")
-    if len(daemon_list.result) < 1:
-        raise Exception(f"Alert report: Can't find the Prometheus instance")
-
-    d = daemon_list.result[0]
-    host = d.ip if d.ip else d.hostname  # ip is of type str
-    port = str(d.ports[0]) if d.ports else ""  # ports is a list of ints
-    if not (host and port):
-        raise Exception(f"Can't get Prometheus IP and/or port from manager")
-
-    return f"http://{host}:{port}/api/v1"
-
-def get_status(mgr: Any) -> dict:
-    r, outb, outs = mgr.mon_command({
-        'prefix': 'status',
-        'format': 'json'
-    })
-    if r:
-        error = f"status command failed: {outs}"
-        mgr.log.error(error)
-        return {'error': error}
-    try:
-        status_dict = json.loads(outb)
-        status_dict["ceph_version"] = mgr.version
-        status_dict["health_detail"] = json.loads(mgr.get('health')['json'])
-        status_dict["support"] = get_support_metrics(mgr)
-        status_dict["support"]["health_status"] = status_dict["health_detail"]["status"]
-        status_dict["support"]["health_summary"] = get_health_summary(status_dict["health_detail"])
-        return status_dict
-    except Exception as ex:
-        mgr.log.exception(str(ex))
-        return {'exception': str(ex)}
-
-def get_health_summary(ceph_health: dict) -> str:
-    health_summary = ""
-    for error_key, error_details in ceph_health["checks"].items():
-        msg = "\n".join([item["message"] for item in error_details.get("detail",[])])
-        health_summary += f'{error_key}({error_details["severity"]}): {error_details["summary"]["message"]}\n{msg}\n'
-    return health_summary
-
-def get_support_metrics(mgr) -> dict:
-    """
-    Collect cluster metrics needed for Ceph support team tools
-    """
-    support_metrics = {}
-    status_interval_minutes = os.environ.get('CHA_INTERVAL_STATUS_REPORT_SECONDS',
-                              mgr.get_module_option('interval_status_report_seconds'))
-    try:
-        prom_url = get_prometheus_url(mgr)
-        query_url = f"{prom_url}/query"
-        queries = {
-            'total_capacity_bytes': 'sum(ceph_osd_stat_bytes)',
-            'total_raw_usage_bytes': 'sum(ceph_osd_stat_bytes_used)',
-            'usage_percentage': '(sum(ceph_osd_stat_bytes_used)/sum(ceph_osd_stat_bytes)) * 100',
-            'slow_ops_total': 'sum(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"})',
-            'osds_total_with_slow_ops': 'count(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"}>0) or on() vector(0)',
-            'pg_total': 'sum(ceph_pg_total)',
-            'pg_active': 'sum(ceph_pg_active)',
-            'pg_clean': 'sum(ceph_pg_clean)',
-            'pg_degraded': 'sum(ceph_pg_degraded)',
-            'pg_unknown': 'sum(ceph_pg_unknown)',
-            'pg_down': 'sum(ceph_pg_down)',
-            'pg_scrubbing': 'sum(ceph_pg_scrubbing)',
-            'pg_deep_scrubbing': 'sum(ceph_pg_deep)',
-            'network_receive_errors': f'avg(increase(node_network_receive_errs_total{{device!="lo"}}[{status_interval_minutes}m]))',
-            'network_send_errors': f'avg(increase(node_network_transmit_errs_total{{device!="lo"}}[{status_interval_minutes}m]))',
-            'network_receive_packet_drops': f'avg(increase(node_network_receive_drop_total{{device!="lo"}}[{status_interval_minutes}m]))',
-            'network_transmit_packet_drops': f'avg(increase(node_network_transmit_drop_total{{device!="lo"}}[{status_interval_minutes}m]))',
-            'inconsistent_mtu': 'sum(node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==  scalar(max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=  quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))  )or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==  scalar(min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=  quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))) or vector(0))',
-            'pool_number': 'count(ceph_pool_bytes_used)',
-            'raw_capacity_bytes': 'sum(ceph_osd_stat_bytes)',
-            'raw_capacity_consumed_bytes': 'sum(ceph_pool_bytes_used)',
-            'logical_stored_bytes': 'sum(ceph_pool_stored)',
-            'pool_growth_bytes': f'sum(delta(ceph_pool_stored[{status_interval_minutes}m]))',
-            'pool_bandwidth_bytes': f'sum(rate(ceph_pool_rd_bytes[{status_interval_minutes}m]) + rate(ceph_pool_wr_bytes[{status_interval_minutes}m]))',
-            'pg_per_osd_ratio':'(avg(ceph_osd_numpg)/sum(ceph_pg_total))*100',
-            'monitors_number': 'count(ceph_mon_metadata)',
-            'monitors_not_in_quorum_number': 'count(ceph_mon_quorum_status!=1) or on() vector(0)',
-            'clock_skews_number': 'ceph_health_detail{name="MON_CLOCK_SKEW"} or on() vector(0)',
-        }
-
-        t1 = time.time()
-        for k,q in queries.items():
-            data = exec_prometheus_query(query_url, q)
-            try:
-                support_metrics[k] = float(data['data']['result'][0]['value'][1])
-            except Exception as ex:
-                 mgr.log.error(f"Error reading status metric for support <{k}>: {ex} - {data}")
-        total_time = round((time.time() - t1) * 1000, 2)
-        support_metrics['time_to_get_support_data_ms'] = total_time
-        mgr.log.info(f"Time to get support data for status report: {total_time} ms")
-    except Exception as ex:
-        mgr.log.error(f"Error collecting support data for status report: {ex}")
-
-    return support_metrics
-
def exec_prometheus_query(query_url: str, prom_query: str) -> dict:
    """
    Execute a Prometheus instant query and return the parsed response.

    :param query_url: full URL of the Prometheus query endpoint
    :param prom_query: PromQL expression to evaluate
    :return: the query response parsed from JSON into a dict
    :raises Exception: on connection errors, HTTP error status or a
        body that is not valid JSON
    """
    body = ""
    try:
        r = requests.get(query_url, params={'query': prom_query})
        body = r.text
        # Check the HTTP status BEFORE parsing the body: an error
        # response often carries a non-JSON body, and parsing it first
        # would mask the real (HTTP) failure with a JSON decode error.
        r.raise_for_status()
        return json.loads(body)
    except Exception as ex:
        raise Exception(f"Error executing Prometheus query: {ex}-{body}")
-
def get_prometheus_status(prometheus_url: str) -> dict:
    """
    Query the Prometheus server for its scrape-target status.

    :param prometheus_url: base URL of the Prometheus API
    :return: the /targets response parsed from JSON into a dict
    :raises Exception: on connection errors, HTTP error status or a
        body that is not valid JSON
    """
    try:
        response = requests.get(f"{prometheus_url}/targets")
        response.raise_for_status()
        return json.loads(response.text)
    except Exception as ex:
        raise Exception(f"Error trying to get Prometheus status: {ex}")
-
def inventory_get_hardware_status(mgr: Any) -> dict:
    """
    Fetch the hardware status summary from the orchestrator module.

    Best-effort: on any failure the exception is logged and an
    ``{'error': ...}`` dict is returned instead of raising, so inventory
    collection can continue.
    """
    try:
        summary = mgr.remote('orchestrator', 'node_proxy_summary')
        if summary.exception_str:
            raise Exception(summary.exception_str)
        return summary.result
    except Exception as e:
        mgr.log.exception(str(e))
        return {'error': str(e)}
-
def inventory(mgr: Any) -> dict:
    """
    Produce the content for the inventory report.

    Returns a dict with a json structure with the ceph cluster inventory
    information, wrapped under the 'inventory' key.
    """
    # Table of (report field, fetcher); evaluated in order so the calls
    # against the manager happen in the same sequence as before.
    sources = (
        ("crush_map", lambda: mgr.get("osd_map_crush")),
        ("devices", lambda: mgr.get("devices")),
        ("df", lambda: mgr.get("df")),
        ("fs_map", lambda: mgr.get("fs_map")),
        ("hosts", lambda: mgr.list_servers()),
        ("manager_map", lambda: mgr.get("mgr_map")),
        ("mon_map", lambda: mgr.get("mon_map")),
        ("osd_map", lambda: mgr.get("osd_map")),
        ("osd_metadata", lambda: mgr.get("osd_metadata")),
        ("osd_tree", lambda: mgr.get("osd_map_tree")),
        ("pg_summary", lambda: mgr.get("pg_summary")),
        ("service_map", lambda: mgr.get("service_map")),
        ("status", lambda: get_status(mgr)),
        ("hardware_status", lambda: inventory_get_hardware_status(mgr)),
    )
    data = {field: fetch() for field, fetch in sources}
    return {'inventory': data}
-
def performance(mgr: Any) -> dict:
    """
    Produce the content for the performance report.

    Runs a fixed set of PromQL queries against the cluster's Prometheus
    server, serializes the results to JSON, compresses them (zstd +
    base64) and wraps them in the envelope expected by IBM Call Home
    ("perfstats").

    :param mgr: the active manager module instance
    :return: dict with the "perfstats" payload; the embedded report's
        "status" field is "OK" or a description of what went wrong
    """

    performance_metrics = {}
    perf_interval_minutes = int(os.environ.get('CHA_INTERVAL_PERFORMANCE_REPORT_SECONDS',
                              mgr.get_module_option('interval_performance_report_seconds'))/60)

    # Initialize before the try block: the except path appends to
    # 'status', and the final envelope reads 't1' and 'human_timestamp'
    # unconditionally, so an early failure must not leave them unbound.
    status = ""
    t1 = time.time()

    try:
        # NOTE(review): a few keys below carry a trailing space and some
        # help strings have typos; kept byte-identical for payload
        # compatibility with existing consumers.
        queries = {
            "ceph_osd_op_r_avg"        : {"query": f"sum(avg_over_time(ceph_osd_op_r[{perf_interval_minutes}m]))/count(ceph_osd_metadata)",
                                          "help" : f"Average of read operations per second and per OSD in the cluster in the last {perf_interval_minutes} minutes"},
            "ceph_osd_op_r_min"        : {"query": f"min(min_over_time(ceph_osd_op_r[{perf_interval_minutes}m]))",
                                          "help" : f"Minimum read operations per second in the cluster in the last {perf_interval_minutes} minutes"},
            "ceph_osd_op_r_max"        : {"query": f"max(max_over_time(ceph_osd_op_r[{perf_interval_minutes}m]))",
                                           "help": f"Maximum of write operations per second in the cluster in the last {perf_interval_minutes} minutes"},
            "ceph_osd_r_out_bytes_avg" : {"query": f"sum(avg_over_time(ceph_osd_op_r_out_bytes[{perf_interval_minutes}m]))/count(ceph_osd_metadata)",
                                          "help" : f"Average of cluster output bytes(reads) and per OSD in the last {perf_interval_minutes} minutes"},
            "ceph_osd_r_out_bytes_min" : {"query": f"min(min_over_time(ceph_osd_op_r_out_bytes[{perf_interval_minutes}m]))",
                                          "help" : f"Minimum of cluster output bytes(reads) in the last {perf_interval_minutes} minutes"},
            "ceph_osd_r_out_bytes_max" : {"query": f"max(max_over_time(ceph_osd_op_r_out_bytes[{perf_interval_minutes}m]))",
                                          "help" : f"Maximum of cluster output bytes(reads) in the last {perf_interval_minutes} minutes"},
            "ceph_osd_op_w_avg"        : {"query": f"sum(avg_over_time(ceph_osd_op_w[{perf_interval_minutes}m]))/count(ceph_osd_metadata)",
                                          "help" : f"Average of cluster input operations per second(writes) in the last {perf_interval_minutes} minutes"},
            "ceph_osd_op_w_min"        : {"query": f"min(min_over_time(ceph_osd_op_w[{perf_interval_minutes}m]))",
                                          "help" : f"Mimimum of cluster input operations per second(writes) in the last {perf_interval_minutes} minutes"},
            "ceph_osd_op_w_max"        : {"query": f"max(max_over_time(ceph_osd_op_w[{perf_interval_minutes}m]))",
                                          "help" : f"Maximum of cluster input operations per second(writes) in the last {perf_interval_minutes} minutes"},
            "ceph_osd_op_w_in_bytes_avg"       : {"query": f"sum(avg_over_time(ceph_osd_op_w_in_bytes[{perf_interval_minutes}m]))/count(ceph_osd_metadata)",
                                                  "help" : f"Average of cluster input bytes(writes) in the last {perf_interval_minutes} minutes"},
            "ceph_osd_op_w_in_bytes_min"       : {"query": f"min(min_over_time(ceph_osd_op_w_in_bytes[{perf_interval_minutes}m]))",
                                                  "help" : f"Minimum of cluster input bytes(writes) in the last {perf_interval_minutes} minutes"},
            "ceph_osd_op_w_in_bytes_max"       : {"query": f"max(max_over_time(ceph_osd_op_w_in_bytes[{perf_interval_minutes}m]))",
                                                  "help" : f"Maximum of cluster input bytes(writes) in the last {perf_interval_minutes} minutes"},
            "ceph_osd_op_read_latency_avg_ms"  : {"query": f"avg(rate(ceph_osd_op_r_latency_sum[{perf_interval_minutes}m]) or vector(0) / on (ceph_daemon) rate(ceph_osd_op_r_latency_count[{perf_interval_minutes}m]) * 1000)",
                                                  "help" : f"Average of cluster output latency(reads) in milliseconds in the last {perf_interval_minutes} minutes"},
            "ceph_osd_op_read_latency_max_ms"  : {"query": f"max(rate(ceph_osd_op_r_latency_sum[{perf_interval_minutes}m]) or vector(0) / on (ceph_daemon) rate(ceph_osd_op_r_latency_count[{perf_interval_minutes}m]) * 1000)",
                                                  "help" : f"Maximum of cluster output latency(reads) in milliseconds in the last {perf_interval_minutes} minutes"},
            "ceph_osd_op_read_latency_min_ms"  : {"query": f"min(rate(ceph_osd_op_r_latency_sum[{perf_interval_minutes}m]) or vector(0) / on (ceph_daemon) rate(ceph_osd_op_r_latency_count[{perf_interval_minutes}m]) * 1000)",
                                                  "help" : f"Minimum of cluster output latency(reads) in milliseconds  in the last {perf_interval_minutes} minutes"},
            "ceph_osd_op_write_latency_avg_ms" : {"query": f"avg(rate(ceph_osd_op_w_latency_sum[{perf_interval_minutes}m]) or vector(0) / on (ceph_daemon) rate(ceph_osd_op_w_latency_count[{perf_interval_minutes}m]) * 1000)",
                                                  "help" : f"Average of cluster input latency(writes) in milliseconds in the last {perf_interval_minutes} minutes"},
            "ceph_osd_op_write_latency_max_ms" : {"query": f"max(rate(ceph_osd_op_w_latency_sum[{perf_interval_minutes}m]) or vector(0) / on (ceph_daemon) rate(ceph_osd_op_w_latency_count[{perf_interval_minutes}m]) * 1000)",
                                                  "help" : f"Maximum of cluster input latency(writes) in milliseconds  in the last {perf_interval_minutes} minutes"},
            "ceph_osd_op_write_latency_min_ms" : {"query": f"min(rate(ceph_osd_op_w_latency_sum[{perf_interval_minutes}m]) or vector(0) / on (ceph_daemon) rate(ceph_osd_op_w_latency_count[{perf_interval_minutes}m]) * 1000)",
                                                  "help" : f"Maximum of cluster input latency(writes) in milliseconds in the last {perf_interval_minutes} minutes"},
            "ceph_physical_device_latency_reads_ms"    : {"query": 'node_disk_read_time_seconds_total / node_disk_reads_completed_total * on (instance, device) group_left(ceph_daemon) label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)") * 1000',
                                                        "help" : "Read latency in milliseconds per physical device used by ceph OSD daemons"},
            "ceph_physical_device_latency_writes_ms"   : {"query": 'node_disk_write_time_seconds_total / node_disk_writes_completed_total * on (instance, device) group_left(ceph_daemon) label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)") * 1000',
                                                        "help" : "Write latency in milliseconds per physical device used by ceph OSD daemons"},
            "ceph_physical_device_read_iops"           : {"query": 'node_disk_reads_completed_total * on (instance, device) group_left(ceph_daemon)  label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)")',
                                                        "help" : "Read operations per second per physical device used by ceph OSD daemons"},
            "ceph_physical_device_write_iops"          : {"query": 'node_disk_writes_completed_total * on (instance, device) group_left(ceph_daemon)  label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)")',
                                                        "help" : "Write operations per second per physical device used by ceph OSD daemons"},
            "ceph_physical_device_read_bytes"          : {"query": 'node_disk_read_bytes_total * on (instance, device) group_left(ceph_daemon)  label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)")',
                                                        "help" : "Read bytes per physical device used by ceph OSD daemons in the last"},
            "ceph_physical_device_written_bytes"       : {"query": 'node_disk_written_bytes_total * on (instance, device) group_left(ceph_daemon)  label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)")',
                                                        "help" : "Write bytes per physical device used by ceph OSD daemons in the last"},
            "ceph_physical_device_utilization_seconds" : {"query": '(node_disk_io_time_seconds_total * on (instance, device) group_left(ceph_daemon)  label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)")) * on (ceph_daemon) group_left(device_class) ceph_osd_metadata',
                                                          "help":"Seconds total of Input/Output operations per physical device used by ceph OSD daemons"},
            "ceph_pool_objects"     : {"query": "ceph_pool_objects * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help": "Number of Ceph pool objects per Ceph pool"},
            "ceph_pool_write_iops"  : {"query": f"rate(ceph_pool_wr[{perf_interval_minutes}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : "Per-second average rate of increase of write operations per Ceph pool during the last {perf_interval_minutes} minutes"},
            "ceph_pool_read_iops"   : {"query": f"rate(ceph_pool_rd[{perf_interval_minutes}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of increase of read operations per Ceph pool during the last {perf_interval_minutes} minutes"},
            "ceph_pool_write_bytes" : {"query": f"rate(ceph_pool_wr_bytes[{perf_interval_minutes}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of increase of written bytes per Ceph pool during the last {perf_interval_minutes} minutes"},
            "ceph_pool_read_bytes"  : {"query": f"rate(ceph_pool_rd_bytes[{perf_interval_minutes}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of increase of read bytes per Ceph pool during the last {perf_interval_minutes} minutes"},
            "ceph_pg_activating"    : {"query": f"rate(ceph_pg_activating[{perf_interval_minutes}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of Placement Groups activated per Ceph pool during the last {perf_interval_minutes} minutes"},
            "ceph_pg_backfilling"   : {"query": f"rate(ceph_pg_backfilling[{perf_interval_minutes}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of Placement Groups backfilled per Ceph pool during the last {perf_interval_minutes} minutes"},
            "ceph_pg_creating"      : {"query": f"rate(ceph_pg_creating[{perf_interval_minutes}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of Placement Groups created per Ceph pool during the last {perf_interval_minutes} minutes"},
            "ceph_pg_recovering"    : {"query": f"rate(ceph_pg_recovering[{perf_interval_minutes}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of Placement Groups recovered per Ceph pool during the last {perf_interval_minutes} minutes"},
            "ceph_pg_deep"          : {"query": f"rate(ceph_pg_deep[{perf_interval_minutes}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help":  f"Per-second average rate of Placement Groups deep scrubbed per Ceph pool during the last {perf_interval_minutes} minutes"},
            "ceph_rgw_avg_get_latency_ms" : {"query": f'(rate(ceph_rgw_get_initial_lat_sum[{perf_interval_minutes}m]) or vector(0)) * 1000 / rate(ceph_rgw_get_initial_lat_count[{perf_interval_minutes}m]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata',
                                             "help" : f"Average latency in milliseconds for GET operations per Ceph RGW daemon during the last {perf_interval_minutes} minutes"},
            "ceph_rgw_avg_put_latency_ms" : {"query": f"(rate(ceph_rgw_put_initial_lat_sum[{perf_interval_minutes}m]) or vector(0)) * 1000 / rate(ceph_rgw_put_initial_lat_count[{perf_interval_minutes}m]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata",
                                             "help" : f"Average latency in milliseconds for PUT operations per Ceph RGW daemon during the last {perf_interval_minutes} minutes"},
            "ceph_rgw_requests_per_second": {"query": f'sum by (rgw_host) (label_replace(rate(ceph_rgw_req[{perf_interval_minutes}m]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))',
                                             "help" : f"Request operations per second per Ceph RGW daemon during the last {perf_interval_minutes} minutes"},
            "ceph_rgw_get_size_bytes" :     {"query": f'label_replace(sum by (instance_id) (rate(ceph_rgw_get_b[{perf_interval_minutes}m])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
                                             "help" : f"Per-second average rate of GET operations size per Ceph RGW daemon during the last {perf_interval_minutes} minutes"},
            "ceph_rgw_put_size_bytes" :     {"query": f'label_replace(sum by (instance_id) (rate(ceph_rgw_put_b[{perf_interval_minutes}m])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
                                             "help" : f"Per-second average rate of PUT operations size per Ceph RGW daemon during the last {perf_interval_minutes} minutes"},
            "ceph_mds_read_requests_per_second"   : {"query": f'rate(ceph_objecter_op_r{{ceph_daemon=~"mds.*"}}[{perf_interval_minutes}m])',
                                                     "help" : f"Per-second average rate of read requests per Ceph MDS daemon during the last {perf_interval_minutes} minutes"},
            "ceph_mds_write_requests_per_second"  : {"query": f'rate(ceph_objecter_op_w{{ceph_daemon=~"mds.*"}}[{perf_interval_minutes}m])',
                                                     "help" : f"Per-second average rate of write requests per Ceph MDS daemon during the last {perf_interval_minutes} minutes"},
            "ceph_mds_client_requests_per_second" : {"query": f'rate(ceph_mds_server_handle_client_request[{perf_interval_minutes}m])',
                                                     "help" : f"Per-second average rate of client requests per Ceph MDS daemon during the last {perf_interval_minutes} minutes"},
            "ceph_mds_reply_latency_avg_ms" : {"query": f'avg(rate(ceph_mds_reply_latency_sum[{perf_interval_minutes}m]) or vector(0) / on (ceph_daemon) rate(ceph_mds_reply_latency_count[{perf_interval_minutes}m]) * 1000)',
                                               "help" : f"Average of the per-second average rate of reply latency(seconds) per Ceph MDS daemon during the last {perf_interval_minutes} minutes"},
            "ceph_mds_reply_latency_max_ms" : {"query": f'max(rate(ceph_mds_reply_latency_sum[{perf_interval_minutes}m]) or vector(0) / on (ceph_daemon) rate(ceph_mds_reply_latency_count[{perf_interval_minutes}m]) * 1000)',
                                               "help" : f"Maximum of the per-second average rate of reply latency(seconds) per Ceph MDS daemon during the last {perf_interval_minutes} minutes"},
            "ceph_mds_reply_latency_min_ms" : {"query": f'min(rate(ceph_mds_reply_latency_sum[{perf_interval_minutes}m]) or vector(0) / on (ceph_daemon) rate(ceph_mds_reply_latency_count[{perf_interval_minutes}m]) * 1000)',
                                               "help" : f"Minimum of the per-second average rate of reply latency(seconds) per Ceph MDS daemon during the last {perf_interval_minutes} minutes"},
            "hw_cpu_busy"                          : {"query": f"1- rate(node_cpu_seconds_total{{mode='idle'}}[{perf_interval_minutes}m])",
                                                      "help" : f"Percentaje of CPU utilization per core during the last {perf_interval_minutes} minutes"},
            "hw_ram_utilization"                   : {"query": f'(node_memory_MemTotal_bytes -(node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_Slab_bytes))/node_memory_MemTotal_bytes',
                                                      "help" : "RAM utilization"},
            "hw_node_physical_disk_read_ops_rate"  : {"query": f"rate(node_disk_reads_completed_total[{perf_interval_minutes}m])",
                                                      "help" : f"Per-second average rate of read operations per physical storage device in the host during the last {perf_interval_minutes} minutes"},
            "hw_node_physical_disk_write_ops_rate" : {"query": f"rate(node_disk_writes_completed_total[{perf_interval_minutes}m])",
                                                      "help" : f"Per-second average rate of write operations per physical storage device in the host during the last {perf_interval_minutes} minutes"},
            "hw_disk_utilization_rate"             : {"query": f"rate(node_disk_io_time_seconds_total[{perf_interval_minutes}m])",
                                                      "help" : f"Per-second average rate of input/output operations time(seconds) per physical storage device in the host during the last {perf_interval_minutes} minutes"},
            "hw_network_bandwidth_receive_load_bytes" : {"query": f"rate(node_network_receive_bytes_total[{perf_interval_minutes}m])",
                                                         "help" : f"Per-second average rate of received bytes per network card in the host during the last {perf_interval_minutes} minutes"},
            "hw_network_bandwidth_transmit_load_bytes": {"query": f"rate(node_network_transmit_bytes_total[{perf_interval_minutes}m])",
                                                         "help" : f"Per-second average rate of transmitted bytes per network card in the host during the last {perf_interval_minutes} minutes"},
            "ceph_nvmeof_gateway_total"                        : {"query": "count by(group) (ceph_nvmeof_gateway_info) or vector(0)",
                                                                  "help" : "Number of Ceph NVMe-oF daemons or gatways running"},
            "ceph_nvmeof_subsystem_total"                      : {"query": "count by(group) (count by(nqn,group) (ceph_nvmeof_subsystem_metadata))",
                                                                  "help" : "Number of Ceph NVMe-oF subsystems running"},
            "ceph_nvmeof_reactor_total"                        : {"query": 'max by(group) (max by(instance) (count by(instance) (ceph_nvmeof_reactor_seconds_total{mode="busy"})) * on(instance) group_right ceph_nvmeof_gateway_info)',
                                                                  "help" : "Number of reactors per gateway"},
            "ceph_nvmeof_gateway_reactor_cpu_seconds_total"    : {"query": f'max by(group) (avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{{mode="busy"}}[{perf_interval_minutes}m])) * on(instance) group_right ceph_nvmeof_gateway_info)',
                                                                   "help" : "Highest gateway CPU load"},
            "ceph_nvmeof_namespaces_total"                     : {"query": "max by(group) (count by(instance) (count by(bdev_name,instance) (ceph_nvmeof_bdev_metadata )) * on(instance) group_right ceph_nvmeof_gateway_info)",
                                                                  "help" : "Total number of namespaces"},
            "ceph_nvmeof_capacity_exported_bytes_total"        : {"query": "topk(1,sum by(instance) (ceph_nvmeof_bdev_capacity_bytes)) * on(instance) group_left(group) ceph_nvmeof_gateway_info",
                                                                  "help" : "Ceph NVMe-oF total capacity exposed"},
            "ceph_nvmeof_clients_connected_total "             : {"query": "count by(instance) (sum by(instance,host_nqn) (ceph_nvmeof_host_connection_state == 1)) * on(instance) group_left(group) ceph_nvmeof_gateway_info",
                                                                  "help" : "Number of clients connected to Ceph NVMe-oF"},
            "ceph_nvmeof_gateway_iops_total "                  : {"query": f"sum by(instance) (rate(ceph_nvmeof_bdev_reads_completed_total[{perf_interval_minutes}m]) + rate(ceph_nvmeof_bdev_writes_completed_total[{perf_interval_minutes}m])) * on(instance) group_left(group) ceph_nvmeof_gateway_info",
                                                                  "help" : "IOPS per Ceph NVMe-oF gateway"},
            "ceph_nvmeof_subsystem_iops_total"                 : {"query": f"sum by(group,nqn) (((rate(ceph_nvmeof_bdev_reads_completed_total[{perf_interval_minutes}m]) + rate(ceph_nvmeof_bdev_writes_completed_total[{perf_interval_minutes}m])) * on(instance,bdev_name) group_right ceph_nvmeof_subsystem_namespace_metadata) * on(instance) group_left(group) ceph_nvmeof_gateway_info)",
                                                                  "help" : "IOPS per Ceph NVMe-oF subsystem"},
            "ceph_nvmeof_gateway_throughput_bytes_total"       : {"query": f"sum by(instance) (rate(ceph_nvmeof_bdev_read_bytes_total[{perf_interval_minutes}m]) + rate(ceph_nvmeof_bdev_written_bytes_total[{perf_interval_minutes}m])) * on(instance) group_left(group) ceph_nvmeof_gateway_info",
                                                                  "help" : "Throughput per Ceph NVMe-oF gateway"},
            "ceph_nvmeof_subsystem_throughput_bytes_total"     : {"query": f"sum by(group,nqn) (((rate(ceph_nvmeof_bdev_read_bytes_total[{perf_interval_minutes}m]) + rate(ceph_nvmeof_bdev_written_bytes_total[{perf_interval_minutes}m])) * on(instance,bdev_name) group_right ceph_nvmeof_subsystem_namespace_metadata) * on(instance) group_left(group) ceph_nvmeof_gateway_info)",
                                                                  "help" : "Throughput per Ceph NVMe-oF subsystem"},
            "ceph_nvmeof_gateway_read_avg_latency_seconds"     : {"query": f"avg by(group,instance) (((rate(ceph_nvmeof_bdev_read_seconds_total[{perf_interval_minutes}m]) / rate(ceph_nvmeof_bdev_reads_completed_total[{perf_interval_minutes}m])) > 0) * on(instance) group_left(group) ceph_nvmeof_gateway_info)",
                                                                  "help" : "Read latency average in seconds per Ceph NVMe-oF gateway"},
            "ceph_nvmeof_gateway_write_avg_latency_seconds "   : {"query": f"avg by(group,instance) (((rate(ceph_nvmeof_bdev_write_seconds_total[{perf_interval_minutes}m]) / rate(ceph_nvmeof_bdev_writes_completed_total[{perf_interval_minutes}m])) > 0) * on(instance) group_left(group) ceph_nvmeof_gateway_info)",
                                                                  "help":  "Write average in seconds per Ceph NVMe-oF gateway"},
            "ceph_nvmeof_gateway_read_p95_latency_seconds"     : {"query": f"quantile by(group,instance) (.95,((rate(ceph_nvmeof_bdev_read_seconds_total[{perf_interval_minutes}m]) / (rate(ceph_nvmeof_bdev_reads_completed_total[{perf_interval_minutes}m]) >0)) * on(instance) group_left(group) ceph_nvmeof_gateway_info))",
                                                                  "help":  "Read latency for 95{%} of the Ceph NVMe-oF gateways"},
            "ceph_nvmeof_gateway_write_p95_latency_seconds"    : {"query": f"quantile by(group,instance) (.95,((rate(ceph_nvmeof_bdev_write_seconds_total[{perf_interval_minutes}m]) / (rate(ceph_nvmeof_bdev_writes_completed_total[{perf_interval_minutes}m]) >0)) * on(instance) group_left(group) ceph_nvmeof_gateway_info))",
                                                                  "help":  "Write latency for 95{%} of the Ceph NVMe-oF gateways"}
        }

        # Re-stamp t1 here so the measured time covers only the actual
        # metrics retrieval, not the queries-dict construction.
        t1 = time.time()

        prometheus_url = get_prometheus_url(mgr)
        query_url = f"{prometheus_url}/query"

        # Metrics retrieval
        metrics_errors = False
        for k, q in queries.items():
            try:
                data = exec_prometheus_query(query_url, q["query"])
                # remove single metric timestamps
                try:
                    for metric in data['data']['result']:
                        metric["value"] = metric["value"][1:]
                except Exception:
                    pass
                performance_metrics[k] = {"help": q["help"],
                                          "result": data['data']['result']}
            except Exception as ex:
                msg = f"Error reading performance metric <{k}>: {ex}"
                mgr.log.error(msg)
                metrics_errors = True
                continue

        if metrics_errors:
            status = "Error getting metrics from Prometheus. Active Ceph Manager log contains details\n"

        # Prometheus server health
        prometheus_status = get_prometheus_status(prometheus_url)
        targets_down = list(filter(lambda x: x['health'] != 'up', prometheus_status['data']['activeTargets']))
        if targets_down:
            status += f"Error(scrape targets not up): Not able to retrieve metrics from {targets_down} target/s. Review Prometheus server status\n"

        # Ceph status
        performance_metrics["ceph_version"] = mgr.version
        performance_metrics["ceph_health_detail"] = json.loads(mgr.get('health')['json'])

        total_time = round((time.time() - t1) * 1000, 2)
        performance_metrics['time_to_get_performance_metrics_ms'] = total_time
        mgr.log.info(f"Time to get performance metrics: {total_time} ms")
        performance_metrics['timestamp'] = t1
        performance_metrics['human_timestamp'] = datetime.fromtimestamp(t1).strftime('%Y-%m-%d %H:%M:%S')
    except Exception as ex:
        msg = f"Error collecting performance metrics: {ex}"
        mgr.log.error(msg)
        status += msg + '\n'

    # Performance report status
    performance_metrics["status"] = "OK" if status == "" else status

    # performance data compressed and serialized to a JSON string
    performance_json = json.dumps(performance_metrics)
    cctx = zstd.ZstdCompressor()
    compressed_perfo = cctx.compress(performance_json.encode('utf-8'))

    compressed_base64_perfo = base64.b64encode(compressed_perfo).decode('utf-8')

    # 'human_timestamp' is only set when collection succeeds; fall back
    # to t1 so a failed collection still yields a well-formed envelope.
    human_stamp = performance_metrics.get(
        'human_timestamp',
        datetime.fromtimestamp(t1).strftime('%Y-%m-%d %H:%M:%S'))

    return {"perfstats": {
                        "file_stamp": human_stamp,
                        "file_stamp_ms": int(t1 * 1000),
                        "local_file_stamp": human_stamp,
                        "nd_stats": compressed_base64_perfo,
                        "ng_stats": "",
                        "nm_stats": "",
                        "nn_stats": "",
                        "nv_stats": "",
                        "node_number": 1,     # because IBM Call Home reqs.
                        "nodes_in_cluster": 1 # because IBM Call Home reqs.
                        }
            }
-
def status(mgr: Any) -> dict:
    """
    Produce the content for the status report.

    Returns a dict with a json structure with the ceph cluster health
    information, wrapped under the 'status' key.
    """
    health = get_status(mgr)
    return {'status': health}
-
def last_contact(mgr: Any) -> dict:
    """
    Produce the content for the last_contact report.

    Returns a dict with just the timestamp of the last contact with the
    cluster (current epoch time in whole seconds, as a string).
    """
    now_seconds = int(time.time())
    return {'last_contact': str(now_seconds)}
-
def get_operation(key) -> dict:
    """
    Return the operation data stored under *key*.

    Thin accessor over the module-level ``operations`` dict, kept for
    compatibility with the Report class API.

    Raises KeyError if the key is not a known operation.
    """
    return operations[key]
-
def collect_diagnostic_commands(mgr: Any, operation_key: str) -> str:
    """
    Collect the output of a fixed set of ceph diagnostic commands and
    store it in a file under DIAGS_FOLDER.

    Commands collected:

        ceph status
        ceph health detail
        ceph osd tree
        ceph report
        ceph osd dump
        ceph df detail

    :param mgr: the active manager module instance
    :param operation_key: key of the operation (used for logging and to
        look up the support case id)
    :return: name of the file the command output was written to
    :raises Exception: if the output file cannot be written
    """
    mgr.log.info(f"Operations ({operation_key}): Collecting diagnostics commands")
    # (label, ceph_command kwargs) pairs; the label heads each section of
    # the output file. A stray quote in the 'ceph health detail' label of
    # the previous version is fixed here.
    commands = (
        ("ceph status", {'prefix': 'status'}),
        ("ceph health detail", {'prefix': 'health', 'detail': 'detail'}),
        ("ceph osd tree", {'prefix': 'osd tree'}),
        ("ceph report", {'prefix': 'report'}),
        ("ceph osd dump", {'prefix': 'osd dump'}),
        ("ceph df detail", {'prefix': 'df', 'detail': 'detail'}),
    )
    output = ""
    for label, kwargs in commands:
        output += f"\n{label}\n" + ceph_command(mgr=mgr, srv_type='mon', **kwargs)

    mgr.log.info(f"Operations ({operation_key}): diagnostics commands collected")

    try:
        cmds_file_prefix = 'ceph_commands_case'
        # Remove previous commands files so only the latest collection is kept
        for file in glob.glob(f'{DIAGS_FOLDER}/{cmds_file_prefix}*'):
            os.remove(file)
        timestamp_sos_file = int(time.time() * 1000)
        try:
            # The support case id ('pmr') may not exist yet for this operation
            case_id = operations[operation_key]['pmr']
        except KeyError:
            case_id = "unknown"
        file_name = f'{cmds_file_prefix}_{case_id}_{timestamp_sos_file}.txt'
        with open(f'{DIAGS_FOLDER}/{file_name}', 'w') as commands_file:
            commands_file.write(output)
        mgr.log.info(f"Operations ({operation_key}): diagnostics commands stored in file {file_name}")
    except Exception as ex:
        raise Exception(f"Operations ({operation_key}): Error trying to save the commands file for diagnostics: {ex}")

    return file_name
-def get_best_collect_node(mgr: Any) -> Tuple[str, str]:
-    """
-    Select the best monitor node where to run a sos report command
-    retuns the best monitor node and the active manager
-    """
-    nodes = {}
-    active_manager = ""
-    best_monitor = ""
-
-    # We add all the monitors
-    monitors = mgr.remote('cephadm', 'list_daemons', service_name='mon')
-    if monitors.exception_str:
-        raise Exception(monitors.exception_str)
-
-    for daemon in monitors.result:
-        nodes[daemon.hostname] = 1
-
-    # lets add one point to a monitor if it is a cephadm admin node
-    cluster_nodes = mgr.remote('cephadm', 'get_hosts')
-    if cluster_nodes.exception_str:
-        raise Exception(cluster_nodes.exception_str)
-
-    for host in cluster_nodes.result:
-        if '_admin' in host.labels:
-            try:
-                nodes[host.hostname] += 1
-                break
-            except KeyError:
-                pass
-
-    # get the active mgr.
-    managers = mgr.remote('cephadm', 'list_daemons', service_name='mgr')
-    if managers.exception_str:
-        raise Exception(monitors.exception_str)
-
-    for daemon in managers.result:
-        if daemon.is_active:
-            active_manager = daemon.hostname
-            try:
-                nodes[daemon.hostname] += 1
-            except KeyError:
-                pass
-
-    # get the winner monitor
-    best_monitor = max(nodes, key=nodes.get)
-
-    return best_monitor, active_manager
-
-def collect_sos_report(mgr: Any, operation_key: str) -> str:
-    """
-    SOS report gathered from a Ceph Monitor node
-    Best node to execute the sos command is
-    1. Monitor + admin node + active mgr
-    2. Monitor + admin node
-    3. monitor
-    """
-
-    # Remove previous sos report files:
-    for file in glob.glob(f'{DIAGS_FOLDER}/sosreport_case_*'):
-        os.remove(file)
-
-    # Get the best monitor node to execute the sos report
-    best_mon, active_mgr = get_best_collect_node(mgr)
-    mgr_target = ""
-    if best_mon != active_mgr and active_mgr:
-        mgr_target = f"--mgr-target {active_mgr}"
-    support_case = operations[operation_key]["pmr"]
-    mgr.log.info(f"Operations ({operation_key}): selected host for sos command is {best_mon}, active manager is {active_mgr}")
-
-    # Execute the sos report command
-    sos_cmd_execution = mgr.remote('cephadm', 'sos',
-                                      hostname = best_mon,
-                                      sos_params = f'{mgr_target} report --batch --quiet --case-id {support_case}')
-    mgr.log.info(f"Operations ({operation_key}): sos command executed succesfully")
-    if sos_cmd_execution.exception_str:
-        raise Exception(f"Error trying to get the sos report files for diagnostics(error_code): {sos_cmd_execution.exception_str}")
-
-    # output is like:
-    # 'New sos report files can be found in /var/log/ceph/<fsid>/sosreport_case_124_1706548742636_*'
-    pattern = r'sosreport_case_\S+'
-    matches = re.findall(pattern, sos_cmd_execution.result)
-    if matches:
-        mgr.log.info(f"Operations ({operation_key}): sos command files pattern is: {matches[0]}")
-        result = matches[0]
-    else:
-        mgr.log.error(f"Operations ({operation_key}): sos report files pattern not found in: {sos_cmd_execution.result}")
-        result = ""
-
-    # If there is any issue executing the command, the output will be like:
-    # ['Issue executing <['sos', 'report', '--batch', '--quiet', '--case-id', 'TS015034298', '-p', 'container']>: 0:[plugin:ceph_mon] Failed to find ceph version, command collection will be limited
-    #
-    # New sos report files can be found in /var/log/ceph/<fsid>/sosreport_case_TS015034298_1709809018376_*']
-    # in this case, we leave a warning in the log about the issue
-    pattern = r'^Issue executing.*'
-    matches = re.findall(pattern, sos_cmd_execution.result)
-    if matches:
-         mgr.log.warn(f"Operations ({operation_key}): review sos command execution in {best_mon}: {matches[0]}")
-
-    return result
-
-
-def notProcessed(item: dict) -> bool:
-    """
-    Determines if a received <inbound request> containing a si_requestid
-    has been already processed
-    """
-    not_processed = True
-
-    si_requestid = item.get('options', {}).get('si_requestid', '')
-    if si_requestid:
-        not_processed = operations.get(si_requestid, "") == ""
-
-    return not_processed
-
-def add_operation(item: dict, l1_cooling_window_seconds: int,
-                  l2_cooling_window_seconds: int, event_id: str = '') -> str:
-    """
-    Add an operation coming from an inbound request to the operation dicts.
-    return the the key to locate the new operation in operations dict
-
-    item = {  'operation': 'upload_snap',
-                'options': {'pmr': 'TS1234567',
-                            'level': '3',
-                            'si_requestid':'2345',
-                            'enable_status':
-                            'true',
-                            'version': 1}
-            }
-
-    Items in the operations dict are like:
-
-    {'2345': {'pmr': 'TS1234567',
-              'level': '3',
-              'si_requestid':'2345',
-              'enable_status': 'true',
-              'version': 1,
-              'type': 'upload_snap',
-              'status': 'new',
-              'description: '',
-              'status_sent': '',
-              'progress': 0,
-              'event_id': 'IBM-RedHatMarine-ceph-368ffc04.....'},
-              'created': 1707818993.8846028
-     '1234' : {....}
-    }
-    """
-
-    key = str(uuid.uuid4())
-
-    # reject requests with no valid structure:
-    if 'operation' not in item or 'options' not in item:
-        operations[key]['type']= NOT_SUPPORTED
-        operations[key]['status'] = OPERATION_STATUS_REQUEST_REJECTED
-        operations[key]['progress'] = 0
-        operations[key]['description'] = f'Operations ({key}): Received unknown operation: {item}'
-        operations[key]['status_sent'] = ST_NOT_SENT
-        operations[key]['event_id'] = event_id
-        return key
-
-    # reject operations not supported
-    if item['operation'] != UPLOAD_SNAP: # Only "upload snap" ops are allowed
-        operations[key] = item['options']
-        operations[key]['type']= item['operation']
-        operations[key]['status'] = OPERATION_STATUS_REQUEST_REJECTED
-        operations[key]['progress'] = 0
-        operations[key]['description'] = f'Operations ({key}): Rejected <{item["operation"]}> operation <{key}>: Operation not supported'
-        operations[key]['status_sent'] = ST_NOT_SENT
-        operations[key]['event_id'] = event_id
-        return key
-
-    # reject UPLOAD SNAP operations without required fields
-    if not ('pmr' in item['options'] and
-            'level' in  item['options'] and
-            'si_requestid' in item['options']):
-        operations[key]['type']= NOT_SUPPORTED
-        operations[key]['status'] = OPERATION_STATUS_REQUEST_REJECTED
-        operations[key]['progress'] = 0
-        operations[key]['description'] = f"Operations ({key}): required fields (pmr, level, si_requestid)\
-              not present in <{item['operation']}> operation: {item}"
-        operations[key]['status_sent'] = ST_NOT_SENT
-        operations[key]['event_id'] = event_id
-        return key
-
-    # reject UPLOAD SNAP operations with same level than other being processed
-    # if they are inside the cooling window time interval for the level
-    for op_key, op in operations.items():
-        if op['type'] == UPLOAD_SNAP and op['level'] == item['options']['level']:
-            # we have another log upload operation for same level
-            # verify if it is inside the "cooling window for the level"
-            if op['level'] == 1:
-                cooling_window_seconds = l1_cooling_window_seconds
-            else:
-                cooling_window_seconds = l2_cooling_window_seconds
-            if int(time.time() - op['created']) <= cooling_window_seconds:
-                operations[key] = item['options']
-                operations[key]['type']= item['operation']
-                operations[key]['status'] = OPERATION_STATUS_REQUEST_REJECTED
-                operations[key]['progress'] = 0
-                operations[key]['description'] = f"Operations ({key}): <{item['operation']}> operation\
-                    <{item['options']['pmr']}>:There is another operation with identifier {op_key}\
-                        which has the same level and is already being processed"
-                operations[key]['status_sent'] = ST_NOT_SENT
-                operations[key]['event_id'] = event_id
-                return key
-
-    #reject UPLOAD SNAP operations with same si_requestid than other being processed
-    if item['options']['si_requestid'] in operations.keys():
-        operations[key] = item['options']
-        operations[key]['type']= item['operation']
-        operations[key]['status'] = OPERATION_STATUS_REQUEST_REJECTED
-        operations[key]['progress'] = 0
-        operations[key]['description'] = f"Operations ({key}): <{item['operation']}> \
-            operation <{item['options']['si_requestid']}>: operation is being processed now"
-        operations[key]['status_sent'] = ST_NOT_SENT
-        operations[key]['event_id'] = event_id
-        return key
-
-    # Accept valid UPLOAD SNAP operation
-    key = item['options']['si_requestid']
-    operations[key] = item['options']
-    operations[key]['type']= item['operation']
-    operations[key]['status'] = OPERATION_STATUS_NEW
-    operations[key]['progress'] = 0
-    operations[key]['description'] = f'Operations ({key}): Accepted new <{item["operation"]}> \
-        operation <{key}>'
-    operations[key]['status_sent'] = ST_NOT_SENT
-    operations[key]['event_id'] = event_id
-    operations[key]['created'] = time.time()
-    return key
-
-class Report:
-    def __init__(self, report_type: str, component: str, description: str, icn: str, owner_tenant_id: str, fn: Callable[[], str], url: str, proxy: str, seconds_interval: int,
-                 mgr_module: Any, key: str = "", event_id: str = ''):
-        self.report_type = report_type                # name of the report
-        self.component = component                    # component
-        self.icn = icn                                # ICN = IBM Customer Number
-        self.owner_tenant_id = owner_tenant_id        # IBM tenant ID
-        self.fn = fn                                  # function used to retrieve the data
-        self.url = url                                # url to send the report
-        self.interval = seconds_interval              # interval to send the report (seconds)
-        self.mgr = mgr_module
-        self.description = description
-        self.last_id = ''
-        self.proxies = {'http': proxy, 'https': proxy} if proxy else {}
-        self.key = key                                # used in operations reports
-        self.event_id = event_id                      # used in operations reports
-
-        # Last upload settings
-        self.last_upload_option_name = 'report_%s_last_upload' % self.report_type
-        last_upload = self.mgr.get_store(self.last_upload_option_name, None)
-        if last_upload is None:
-            self.last_upload = str(int(time.time()) - self.interval + 1)
-        else:
-            self.last_upload = str(int(last_upload))
-
-    def generate_report(self) -> dict:
-        try:
-            if self.key:
-                content = self.fn(self.key)
-            else:
-                content = self.fn(self.mgr)
-            if content is None:
-                return None
-
-            report = {}
-            report_dt = datetime.timestamp(datetime.now())
-
-            report = ReportHeader.collect(self.report_type,
-                                  self.mgr.get('mon_map')['fsid'],
-                                  self.mgr.version,
-                                  report_dt,
-                                  self.mgr,
-                                  self.mgr.target_space,
-                                  self.event_id)
-
-            event_section = ReportEvent.collect(self.report_type,
-                                        self.component,
-                                        report_dt,
-                                        self.mgr.get('mon_map')['fsid'],
-                                        self.icn,
-                                        self.owner_tenant_id,
-                                        self.description,
-                                        content,
-                                        self.mgr,
-                                        self.key)
-
-            report['events'].append(event_section)
-            self.last_id = report["event_time_ms"]
-
-            return report
-        except Exception as ex:
-            raise Exception('<%s> report not available: %s\n%s' % (self.report_type, ex, report))
-
-    def filter_report(self, fields_to_remove: list) -> str:
-        filtered_report = self.generate_report()
-        if filtered_report is None:
-            return None
-
-        for field in fields_to_remove:
-            if field in filtered_report:
-                del filtered_report[field]
-
-        return json.dumps(filtered_report)
-
-    def send(self, force: bool = False) -> str:
-        # Do not send report if the required interval is not reached
-        if not force:
-            if (int(time.time()) - int(self.last_upload)) < self.interval:
-                self.mgr.log.info('%s report not sent because interval not reached', self.report_type)
-                return ""
-
-        # Do not sent report if interval is set to 0
-        if self.interval == 0 and not force:
-            self.mgr.log.info('%s report not sent because interval set to 0', self.report_type)
-            return ""
+import re
+import jwt
+import traceback
+import requests
+import sched
+import time
+#from threading import Event
+import threading
 
-        resp = None
-        try:
-            report = self.generate_report()
-            if report is None:
-                # the report can tell that it doesnt want to be sent by returning None
-                return None
-            if self.proxies:
-                self.mgr.log.info('Sending <%s> report to <%s> (via proxies <%s>)', self.report_type, self.url,
-                                  self.proxies)
-            else:
-                self.mgr.log.info('Sending <%s> report to <%s>', self.report_type, self.url)
-            resp = requests.post(url=self.url,
-                                 headers={'accept': 'application/json', 'content-type': 'application/json'},
-                                 data=json.dumps(report),
-                                 proxies=self.proxies)
-            self.mgr.log.debug(f"Report response: {resp.text}")
-            resp.raise_for_status()
-            self.process_response(self.report_type, resp)
-            self.last_upload = str(int(time.time()))
-            self.mgr.set_store(self.last_upload_option_name, self.last_upload)
-            self.mgr.health_checks.pop('CHA_ERROR_SENDING_REPORT', None)
-            self.mgr.log.info('Successfully sent <%s> report(%s) to <%s>', self.report_type, self.last_id, self.url)
-            return resp.text
-        except Exception as e:
-            explanation = resp.text if resp else ""
-            raise SendError('Failed to send <%s> to <%s>: %s %s' % (self.report_type, self.url, str(e), explanation))
+class URUploadSnap:
+    def __init__(self, agent, req: dict):
+        self._req = req
+        self.agent = agent
+        self._options = self._req.get('options', {})
 
-    def process_response(self, report_type: str, resp: requests.Response) -> None:
+    def id(self) -> str:
         """
-        Process operations after sending a "report" and receiving a succesful response
+        An ID that uniquely represents this UploadSnap request.
+        Currently we use this as the key for ur_stale.
         """
-        try:
-            if report_type == 'last_contact':
-                # retrieve operations from response
-                inbound_requests = resp.json().get('response_state', {}).get('transactions',{}).get('Unsolicited_Storage_Insights_RedHatMarine_ceph_Request', {}).get('response_object', {}).get('product_request', {}).get('asset_event_detail', {}).get('body', {}).get('inbound_requests', {})
-
-                if inbound_requests:
-                    event_id = resp.json().get('transaction', {}).get('event_id', '')
-                    self.mgr.log.info(f"Operations: New inbound_requests = {inbound_requests} for event_id {event_id}")
-                    # Add the operation to the operations queue
-                    for item in inbound_requests:
-                        if notProcessed(item):
-                            # Add Confirm response operation to operations dict
-                            key = str(uuid.uuid4())
-                            operations[key] = {}
-                            operations[key]['type']= CONFIRM_RESPONSE
-                            operations[key]['status'] = OPERATION_STATUS_COMPLETE
-                            operations[key]['progress'] = 0
-                            operations[key]['description'] = CONFIRM_RESPONSE
-                            operations[key]['status_sent'] = ST_NOT_SENT
-                            operations[key]['event_id'] = event_id
-
-                            self.mgr.log.info(f"Operations: Added confirm response operation for {item}")
-
-                            # Add the operation to operations dict
-                            key = add_operation(item,
-                                                self.mgr.level_one_upload_cooling_window_seconds,
-                                                self.mgr.level_two_upload_cooling_window_seconds,
-                                                event_id)
-                            self.mgr.log.info(f"Operations: Added operation {item}")
-                        else:
-                            self.mgr.log.info(f"Operations: Rejected already processed operation with SI request id = {item.get('options', {}).get('si_requestid', '')}")
-        except Exception as ex:
-            self.mgr.log.error(f"Operations: error: {ex} adding {item}")
-
-def alert_uid(alert: dict) -> str:
-    """
-    Retuns a unique string identifying this alert
-    """
-    return json.dumps(alert['labels'], sort_keys=True) + alert['activeAt'] + alert['value']
+        return f"{self._req.get('operation', '')}-{self._options.get('level', '')}-{self._options.get('pmr', '')}-{self._options.get('si_requestid', '')}"
 
-def is_alert_relevant(alert: dict) -> bool:
-    """
-    Returns True if this alert should be sent, False if it should be filtered out of the report
-    """
-    state = alert.get('state', '')
-    severity = alert.get('labels', {}).get('severity', '')
+    def id_for_cooldown(self) -> str:
+        return f"{self._req.get('operation', '')}-{self._options.get('level', '')}"
 
-    return state == 'firing' and severity == 'critical'
+    def cooldown_timeout(self) -> int:
+        """
+        Level 2 is an SOS report, which is much heavier than level 1, so don't allow it more than once every 2 hours.
+        Level 1 is allowed once every 5 minutes.
+        """
+        if 'level' in self._options and int(self._options['level']) > 1:
+            return self.agent.cooldown_timeout_upload_snap_2
+        return self.agent.cooldown_timeout_upload_snap_1
 
-def get_prometheus_alerts(mgr):
+class CallHomeAgent(MgrModule):
     """
-    Returns a list of all the alerts currently active in Prometheus
+    Provides MgrModule interface and central services for "Report" derived classes
     """
-    try:
-        alerts_url = f"{get_prometheus_url(mgr)}/alerts"
-        # Get the alerts
-        resp = {}
-        try:
-            resp = requests.get(alerts_url).json()
-        except Exception as e:
-            raise Exception(f"Error getting alerts from Prometheus at {alerts_url} : {e}")
-
-        if 'data' not in resp or 'alerts' not in resp['data']:
-            raise Exception(f"Prometheus returned a bad reply: {resp}")
-
-        alerts = resp['data']['alerts']
-        return alerts
-    except Exception as e:
-        mgr.log.error(f"Can't fetch alerts from Prometheus: {e}")
-        return [{
-                'labels': {
-                    'alertname': 'callhomeErrorFetchPrometheus',
-                    'severity': 'critical'
-                },
-                'annotations': {
-                    'description': str(e)
-                },
-                'state': 'firing',
-                # 'activeAt' and 'value' are here for alert_uid() to work. they should be '0' so that we won't send this alert again and again
-                'activeAt': '0',
-                'value': '0'
-            }]
-
-def generate_alerts_report(mgr : Any):
-    global sent_alerts
-    # Filter the alert list
-    current_alerts_list = list(filter(is_alert_relevant, get_prometheus_alerts(mgr)))
-
-    current_alerts = {alert_uid(a):a for a in current_alerts_list}
-    # Find all new alerts - alerts that are currently active but were not sent until now (not in sent_alerts)
-    new_alerts = [a for uid, a in current_alerts.items() if uid not in sent_alerts]
-    resolved_alerts = [a for uid, a in sent_alerts.items() if uid not in current_alerts]
-
-    sent_alerts = current_alerts
-    if len(new_alerts) == 0 and len(resolved_alerts) == 0:
-        return None  # This will prevent the report from being sent
-    alerts_to_send = {'new_alerts': new_alerts, 'resolved_alerts': resolved_alerts}
-    return alerts_to_send
-
-class CallHomeAgent(MgrModule):
-    MODULE_OPTIONS: List[Option] = [
+    # Environment variables (if set) take precedence over module options
+    MODULE_OPTIONS: list[Option] = [
         Option(
-            name='target',
+            name='target',  # call home URL
             type='str',
-            default='https://esupport.ibm.com/connect/api/v1',
+            default = os.environ.get('CHA_TARGET', 'https://esupport.ibm.com/connect/api/v1'),
             desc='Call Home endpoint'
         ),
         Option(
             name='interval_inventory_report_seconds',
             type='int',
             min=0,
-            default=60 * 60 * 24,  # one day
+            default = int(os.environ.get('CHA_INTERVAL_INVENTORY_REPORT_SECONDS', 60 * 60 * 24)),  # one day
             desc='Time frequency for the inventory report'
         ),
         Option(
             name='interval_performance_report_seconds',
             type='int',
             min=0,
-            default=60 * 5,  # 5 minutes
+            default = int(os.environ.get('CHA_INTERVAL_PERFORMANCE_REPORT_SECONDS', 60 * 5)),  # 5 minutes
             desc='Time frequency for the performance report'
         ),
         Option(
             name='interval_status_report_seconds',
             type='int',
             min=0,
-            default=60 * 30,  # 30 minutes
+            default = int(os.environ.get('CHA_INTERVAL_STATUS_REPORT_SECONDS', 60 * 30)),  # 30 minutes
             desc='Time frequency for the status report'
         ),
         Option(
             name='interval_last_contact_report_seconds',
             type='int',
             min=0,
-            default=60 * 30,  # 30 minutes
+            default = int(os.environ.get('CHA_INTERVAL_LAST_CONTACT_REPORT_SECONDS', 60 * 30)),  # 30 minutes
             desc='Time frequency for the last contact report'
         ),
         Option(
             name='interval_alerts_report_seconds',
             type='int',
             min=0,
-            default=60 * 5,  # 5 minutes
+            default = int(os.environ.get('CHA_INTERVAL_ALERTS_REPORT_SECONDS', 60 * 5)),  # 5 minutes
             desc='Time frequency for the alerts report'
         ),
+        Option(
+            name='interval_performance_report_seconds',
+            type='int',
+            min=0,
+            default = int(os.environ.get('CHA_INTERVAL_PERFORMANCE_REPORT_SECONDS', 60 * 5)),  # 5 minutes
+            desc='Time frequency for the performance report'
+        ),
         Option(
             name='customer_email',
             type='str',
@@ -1088,19 +191,19 @@ class CallHomeAgent(MgrModule):
         Option(
             name='proxy',
             type='str',
-            default='',
+            default = os.environ.get('CHA_PROXY', ''),
             desc='Proxy to reach Call Home endpoint'
         ),
         Option(
             name='target_space',
             type='str',
-            default='prod',
+            default = os.environ.get('CHA_TARGET_SPACE', 'prod'),  # Set to 'dev'/'test' for development/testing
             desc='Target space for reports (dev, staging or production)'
         ),
         Option(
             name='si_web_service_url',
             type='str',
-            default='https://join.insights.ibm.com/api/v1/em-integration',
+            default = os.environ.get('CHA_SI_WEB_SERVICE_URL', 'https://join.insights.ibm.com/api/v1/em-integration'),
             desc='URL used to register Ceph cluster in SI (staging or production)'
         ),
         Option(
@@ -1128,541 +231,521 @@ class CallHomeAgent(MgrModule):
             desc='Password obtained from the IBM Transfer ID service'
         ),
         Option(
-            name='upload_ops_persistence_seconds',
+            name='stale_timeout',
             type='int',
-            default=864000,
-            desc='Time interval during which requests with same SI request ID will not be processed'
+            default=86400 * 10,
+            desc='Time interval in seconds during which requests with a repeating SI request ID will be ignored'
         ),
         Option(
-            name='level_one_upload_cooling_window_seconds',
+            name='cooldown_timeout_upload_snap_1',
             type='int',
             default=300,
-            desc='Time interval needed to pass before a new diagnostics upload operation level one will be accepted'
+            desc='Time interval in seconds to allow a cooldown between level 1 upload snap requests'
         ),
         Option(
-            name='level_two_upload_cooling_window_seconds',
+            name='cooldown_timeout_upload_snap_2',
             type='int',
-            default=3600,
-            desc='Time interval needed to pass before a new diagnostics upload operation level two(or upper) will be accepted'
+            default=3600 * 2,
+            desc='Time interval in seconds to allow a cooldown between level 2 upload snap requests'
         ),
     ]
 
     def __init__(self, *args: Any, **kwargs: Any) -> None:
-        super(CallHomeAgent, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
+
+        self.reports = [
+                {
+                    'class': ReportInventory,
+                    'name': 'inventory',
+                    'interval_option_name': 'interval_inventory_report_seconds',
+                },
+                {
+                    'class': ReportLastContact,
+                    'name': 'last_contact',
+                    'interval_option_name': 'interval_last_contact_report_seconds',
+                },
+                {
+                    'class': ReportStatusAlerts,
+                    'name': 'alerts',
+                    'interval_option_name': 'interval_alerts_report_seconds',
+                },
+                {
+                    'class': ReportStatusHealth,
+                    'name': 'status',
+                    'interval_option_name': 'interval_status_report_seconds',
+                },
+                {
+                    'class': ReportPerformance,
+                    'name': 'performance',
+                    'interval_option_name': 'interval_performance_report_seconds',
+                },
+        ]
+
+
+        self.connectivity_status = {
+                'connectivity': False,  # Connectivity status.
+                'last_checked': 0,  # Unix timestamp of when the last connectivity attempt was.
+                'connectivity_error': 'No connectivity attempted'  # Error is only relevant when 'connectivity'==False
+        }
 
         # set up some members to enable the serve() method and shutdown()
         self.run = True
 
-        # Load operations from db, this makes them persistent across mgr restarts
-        self.init_operations()
-
         # Module options
         self.refresh_options()
 
         # Health checks
         self.health_checks: Dict[str, Dict[str, Any]] = dict()
 
-        # Coroutines management
-        self.loop = asyncio.new_event_loop()  # type: ignore
-        # Array to hold coroutines launched
-        self.tasks = []
+        # Unsolicited Request support
+
+        # identify messages that we received in the past self.stale_timeout seconds (10 days). such messages will be ignored and removed from the queue.
+        # maps unique ID to time when this entry is not relevant anymore and should be deleted
+        store_ur_stale = self.get_store('ur_stale')
+        if store_ur_stale is not None:
+            self.ur_stale = json.loads(store_ur_stale)
+            self.log.debug(f"ur_stale loaded from db after restart: {self.ur_stale}")
+        else:
+            self.ur_stale = {}
+
+        # mechanism to prevent the mgr being bombarded by new requests. provides a cooldown time between processing the same type of message.
+        # Requests that arrive during the time the cooldown window for that type of request is active, will wait in the queue until the cooldown
+        # time is over and then will be processed.
+        # key: a string representing the operation, such as "upload_snap-2" (2 is for level 2 - include SOS report)
+        # value: the time when the last message of this type was processed.
+        store_ur_cooldown = self.get_store('ur_cooldown')
+        if store_ur_cooldown is not None:
+            self.ur_cooldown = json.loads(store_ur_cooldown)
+            self.log.debug(f"ur_cooldown loaded from db after restart: {self.ur_cooldown}")
+        else:
+            self.ur_cooldown: dict[str, datetime] = {}
+
+        self.ur_queue = []
+        self.ceph_cluster_id = self.get('mon_map')['fsid']
+
+        self.event = threading.Event()  # Used to wake up serve if need to refresh options or to exit the module
+
+        # clean up 7.* and 8.0 configuration options
+        if self.get_store('db_operations') is not None:
+            self.log.info("Cleaning old module's db_operations")
+            self.set_store('db_operations', None)
+
+        
+    def get_jwt_jti(self) -> str:
+        # Extract jti from JWT. This is another way to identify clusters in addition to the ICN.
+        jwt_jti = ""
+        reg_credentials_str = self.ceph_command(srv_type='mon',
+                                                prefix='config-key get',
+                                                key='mgr/cephadm/registry_credentials')
+        if not reg_credentials_str:
+            return ""
 
-        # Prepare reports
-        self.prepare_reports()
+        jti_token_fail = ""
+        try:
+            reg_credentials = json.loads(reg_credentials_str)
+            user_jwt_password = r"{}".format(reg_credentials['password'])
+            registry_url = reg_credentials['url']
+            if re.match(self.valid_container_registry, registry_url):
+                jwt_jti = jwt.decode(user_jwt_password, options={
+                                    "verify_signature": False})["jti"]
+                self.log.info("JWT jti field extracted succesfully")
+            else:
+                jti_token_fail = f"url for registry credentials stored in <mgr/cephadm/registry_url> does not match with the expected ones <{self.valid_container_registry}>"
+        except Exception as ex:
+            jti_token_fail = str(ex)
 
-    def init_operations(self) -> None:
-        # We fetch from db the operations we already processed,
-        # and assign it to the global operations dictionary
-        db_operations = self.get_store('db_operations')
+        if jti_token_fail:
+            self.log.warning(
+                f"not able to extract <jti> from JWT token, a valid not empty jti token is required in <mgr/cephadm/registry_password> field password: {jti_token_fail}")
 
-        global operations
-        if db_operations is not None:
-            # We already set_store('db_operations') in the past
-            operations = json.loads(db_operations)
-            self.log.debug(f"operations loaded from db after restart: {operations}")
+        return jwt_jti
+        
 
     def refresh_options(self):
-        # Env vars (if they exist) have preference over module options
-        self.cha_target_url = str(os.environ.get('CHA_TARGET', self.get_module_option('target')))
-
-        self.interval_inventory_seconds = int(
-            os.environ.get('CHA_INTERVAL_INVENTORY_REPORT_SECONDS',
-                           self.get_module_option('interval_inventory_report_seconds')))  # type: ignore
-        self.interval_performance_seconds = int(
-            os.environ.get('CHA_INTERVAL_PERFORMANCE_REPORT_SECONDS',
-                           self.get_module_option('interval_performance_report_seconds')))  # type: ignore
-        self.interval_status_seconds = int(
-            os.environ.get('CHA_INTERVAL_STATUS_REPORT_SECONDS',
-                           self.get_module_option('interval_status_report_seconds')))  # type: ignore
-        self.interval_last_contact_seconds = int(
-            os.environ.get('CHA_INTERVAL_LAST_CONTACT_REPORT_SECONDS',
-                           self.get_module_option('interval_last_contact_report_seconds')))  # type: ignore
-        self.interval_alerts_seconds = int(
-            os.environ.get('CHA_INTERVAL_ALERTS_REPORT_SECONDS',
-                           self.get_module_option('interval_alerts_report_seconds')))  # type: ignore
-        self.proxy = str(os.environ.get('CHA_PROXY', self.get_module_option('proxy')))
-        self.target_space = os.environ.get('CHA_TARGET_SPACE', self.get_module_option('target_space'))
-        self.si_web_service_url = os.environ.get('CHA_SI_WEB_SERVICE_URL', self.get_module_option('si_web_service_url'))
-
-        # Customer identifiers do not use environment vars to be set
-        self.icn = self.get_module_option('icn')
-        self.customer_email = self.get_module_option('customer_email')
-        self.customer_first_name = self.get_module_option('customer_first_name')
-        self.customer_last_name = self.get_module_option('customer_last_name')
-        self.customer_phone = self.get_module_option('customer_phone')
-        self.customer_company_name = self.get_module_option('customer_company_name')
-        self.customer_address = self.get_module_option('customer_address')
-        self.customer_country_code = self.get_module_option('customer_country_code')
-
-        # Owner identifiers used in IBM storage insights do not use environment vars to be set
-        self.owner_tenant_id = self.get_module_option('owner_tenant_id')
-        self.owner_ibm_id = self.get_module_option('owner_ibm_id')
-        self.owner_company_name = self.get_module_option('owner_company_name')
-        self.owner_first_name = self.get_module_option('owner_first_name')
-        self.owner_last_name = self.get_module_option('owner_last_name')
-        self.owner_email = self.get_module_option('owner_email')
-
-        # Other options not using env vars
-        self.valid_container_registry = self.get_module_option('valid_container_registry')
-        self.upload_ops_persistence_seconds = self.get_module_option('upload_ops_persistence_seconds')
-        self.level_one_upload_cooling_window_seconds = self.get_module_option('level_one_upload_cooling_window_seconds')
-        self.level_two_upload_cooling_window_seconds = self.get_module_option('level_two_upload_cooling_window_seconds')
-
-        # ecurep options:
-        self.ecurep_url = self.get_module_option('ecurep_url')
-        self.ecurep_userid = self.get_module_option('ecurep_userid')
-        self.ecurep_password = self.get_module_option('ecurep_password')
-
-    def upload_file(self, op_key: str, file_name: str, chunk_pattern: str = '') -> None:
+        # Note - self.get_module_option() returns the correct type, as long as a type is defined for the option
+        for opt in self.MODULE_OPTIONS:
+            setattr(self, opt['name'], self.get_module_option(opt['name']))
+            self.log.debug(f" {opt['name']} = {getattr(self, opt['name'])}")
+
+        self.proxies = {'http': proxy, 'https': proxy} if self.proxy else {}
+
+        self.jwt_jti = self.get_jwt_jti()
+
+    def ceph_command(self, srv_type: str, prefix: str, srv_spec: Optional[str] = '', inbuf: str = '', **kwargs):
+        # Note: A simplified version of the function used in dashboard ceph services
+        """
+        :type prefix: str
+        :param srv_type: mon |
+        :param kwargs: will be added to argdict
+        :param srv_spec: typically empty. or something like "<fs_id>:0"
+        :param to_json: if true return as json format
+        """
+        argdict = {
+            "prefix": prefix,
+        }
+        argdict.update({k: v for k, v in kwargs.items() if v is not None})
+        result = CommandResult("")
+        self.send_command(result, srv_type, srv_spec, json.dumps(argdict), "", inbuf=inbuf)
+        r, outb, outs = result.wait()
+        if r != 0:
+            self.log.error(f"Execution of command '{prefix}' failed. (r={r}, outs=\"{outs}\", kwargs={kwargs})")
+        try:
+            return outb or outs
+        except Exception as ex:
+            self.log.error(f"Execution of command '{prefix}' failed: {ex}")
+            return outb
+
+    def connectivity_update(self, response: dict) -> None:
+        """
+        Validate that the response is from IBM call home and update the connectivity check struct
+        """
+
+        self.connectivity_status['last_checked'] = time.time()
+
+        # When sending a message to CH, the reply contains a
+        # "service"="ibm_callhome_connect". but if the message sent is of bad
+        # format, CH returns an error in a different format, But this still
+        # means that we connected successfully to CH.
+        if (
+            response.get('service', "") == 'ibm_callhome_connect'
+            or response.get('body',{}).get('env' ,{}).get('namespace', '') != ''
+        ):
+            self.connectivity_status['connectivity'] = True
+            self.connectivity_status['connectivity_error'] = "Success"
+        else:
+            self.connectivity_status['connectivity'] = False
+            self.connectivity_status['connectivity_error'] = f"Bad response from Call Home: {json.dumps(self._filter_report(response), indent=4)}"
+
+    def connectivity_update_error(self, message) -> None:
+        self.connectivity_status['last_checked'] = time.time()
+        self.connectivity_status['connectivity'] = False
+        self.connectivity_status['connectivity_error'] = f"Can't connect to Call Home: {message}"
+
+    def process_response(self, resp: dict) -> None:
         """
-        Upload a file to ecurep.
-        If a chunk_pattern is provided the file is divided in chunks
+        Process HTTP responses we receive from call home after sending a report.
         """
 
-        # We first consider the module options to allow for flexible
-        # workarounds should we need them, otherwise we load the default keys
+        req = "unknown"  # define it to something so that if the below code throws from before the "for", for example from json.loads() then req will be defined
+        try:
+            # retrieve unsolicited requests from response
+            try:
+                inbound_requests = resp['response_state']['transactions']['Unsolicited_Storage_Insights_RedHatMarine_ceph_Request']['response_object']['product_request']['asset_event_detail']['body']['inbound_requests']
+            except:
+                # Most of the fields above do not appear at all when there is no UR in the LastContact response. This is OK and not an error, therefore we don't log this.
+                return
+
+            if not inbound_requests:
+                # No UR to process
+                return
+
+            report_event_id = resp.get('transaction', {}).get('event_id', '')
+            self.log.info(f"New inbound_requests = {inbound_requests} for report_event_id {report_event_id}")
+
+            # Note: if we should just ignore stale messages then do it here. Currently we do send an error message for each stale message.
+            # Add the operation to the UR queue
+            for req in inbound_requests:
+                # create the unique ID that identifies this message to check to compare to the stale list
+                # Note: if we decide to add time_ms to stale check then it should be added here
+                unique_id = URUploadSnap(self, req).id()
+                if unique_id in self.ur_stale:
+                    self.log.info(f"Unsolicited request {unique_id} is stale. dropping request.")
+                    continue
+                # protect from denial of service
+                if len(self.ur_queue) > 20:
+                    self.log.warning(f"Unsolicited queue too long. dropping request.")
+                    continue
+                self.ur_queue.append({'request': req, 'report_event_id': report_event_id})
+                self.log.info(f"Queued unsolicited request for processing: {req}")
+        except Exception as ex:
+            self.log.error(f"process_response: error processing the following requests: {req}\nException: {ex}")
+
+        self.ur_queue_run()  # Call immediately to deal with any UR that can be served now.
+
+    def ur_queue_run(self) -> None:
+        try:
+            for ur_elem in list(self.ur_queue):  # Iterate over a copy of the list as we're deleting items from it when we execute them
+                try:
+                    req = ur_elem['request']
+                    report_event_id = ur_elem['report_event_id']
+                    req_type = req.get('operation', '')
+                    if req_type == 'upload_snap':
+                        ur_req = URUploadSnap(self, req)
+                        # check that the request is not in the cooldown window. wait till its gone from there to continue processing the request
+                        ur_cooldown_id = ur_req.id_for_cooldown()
+                        if ur_cooldown_id in self.ur_cooldown:
+                            continue
+                        WorkFlowUploadSnap(self, req, ur_req.id(), report_event_id).run()
+                        self.ur_queue.remove(ur_elem)
+                    else:
+                        self.log.warning(f"Unknown unsolicited request of type '{req_type}'. Deleting it from queue")
+                        ReportURError(self, report_event_id).run()  # May not have "operation" nor "options" in "req"
+                        self.ur_queue.remove(ur_elem)
+                        continue
+
+                    # the cooldown timeout depend on the message type, so we need to add it here.
+                    # The stale timeout is constant, so we'll check it when cleaning the list, therefore we'll be able to change it in runtime
+                    self.ur_cooldown[ur_cooldown_id] = int(time.time()) + ur_req.cooldown_timeout()
+                    self.ur_stale[ur_req.id()] = int(time.time())
+                except Exception as e:
+                    self.log.error(f"Error processing ur_queue: {e}\n{traceback.format_exc()}")
+                    raise
+
+            now = time.time()
+            # Clean cooldown list
+            for k, v in list(self.ur_cooldown.items()):
+                if v < now:
+                    del self.ur_cooldown[k]
+
+            # Clean stale lists
+            for k, v in list(self.ur_stale.items()):
+                if v + self.stale_timeout < now:
+                    del self.ur_stale[k]
+        finally:
+            self.set_store('ur_stale', json.dumps(self.ur_stale))
+            self.set_store('ur_cooldown', json.dumps(self.ur_cooldown))
+
+    def get_ecurep_user_pass(self) -> Tuple[str, str]:
         if self.ecurep_userid and self.ecurep_password:
-            ecurep_userid = self.ecurep_userid
-            ecurep_password = self.ecurep_password
+            return self.ecurep_userid, self.ecurep_password
         else:
             try:
-                id_data = get_settings()
+                id_data = self.get_secrets()
                 # bail out early when the keys are missing
-                ecurep_userid = id_data['ecurep_transfer_id']
-                ecurep_password = id_data['ecurep_password']
+                return id_data['ecurep_transfer_id'], id_data['ecurep_password']
             except Exception as e:
                 self.log.error(f"Error loading ECuRep keys: {e}")
                 raise
 
-        auth = (ecurep_userid, ecurep_password)
-        if self.owner_company_name == "":
-            owner = "MyCompanyUploadClient"
-        else:
-            owner = self.owner_company_name
-        case_id = operations[op_key]['pmr']
-        si_requestid = operations[op_key]['si_requestid']
-        resp = None
+    def get_secrets(self) -> dict:
+        decryption_key = b'yDVH70MMpzBnu5Y1dKfJrw=='
+        decyption_nonce = b'1K6HRTiLD80laBi6'
+        if 'UNITTEST' in os.environ:
+            return {'api_key': 'test_api_key', 'private_key': 'test_private_key'}
 
-        # Get the unique Upload ID for the file
         try:
-            # 1. Obtain the file id to upload the file
-            ecurep_file_id_url = f'{self.ecurep_url}/app/upload_tid?name={file_name}&client={owner}'
-            self.log.info(f"Operations ({si_requestid}): getting unique upload id from <{ecurep_file_id_url}>")
-            resp = requests.post(url=ecurep_file_id_url, auth=auth)
-            resp.raise_for_status()
-            file_id_for_upload = resp.json().get('id')
-            self.log.info(f"Operations ({si_requestid}): unique id for upload is <{file_id_for_upload}>")
-        except Exception as ex:
-            explanation = resp.text if resp else ""
-            raise SendError(f'Operations ({si_requestid}): Failed to send <{file_name}> to <{ecurep_file_id_url}>: {ex}: {explanation}')
+            encrypted_keys = self._load_encrypted_keys()
+            aes_key = base64.b64decode(decryption_key)
+            nonce = base64.b64decode(decyption_nonce)
+            aesgcm = AESGCM(aes_key)
+            clear_keys = aesgcm.decrypt(nonce, encrypted_keys, b'')
+            keys = json.loads(clear_keys)
+            return keys
+        except Exception as e:
+            raise Exception(f"Error getting encrypted settings: {e}")
+
+    def _load_encrypted_keys(self) -> bytes:
+        call_home_keys = '/usr/share/ceph/mgr/call_home_agent/ceph_call_home'  # default location of the key file
+        key_file = os.environ.get('CALLHOMEKEYSFILE', call_home_keys)
+        if not os.path.isfile(key_file):
+            raise Exception(f"Can't find key file {key_file}")
 
+        with open(key_file, 'rb') as f:
+            return f.read()
+
+    # Scheduling API: functions called by Report class to schedule the next run of the report.
+    #    We can implement the underlying scheduling engine using coroutines, threads or event loop.
+
+    def run_scheduled_ur_queue_run(self) -> None:
         try:
-            # 2. Upload the file
-            ecurep_file_upload_url = f'{self.ecurep_url}/app/upload_sf/files/{file_id_for_upload}?case_id={case_id}&client={owner}'
-            file_size = 0
-            if chunk_pattern:
-                files_to_upload = (glob.glob(f'{DIAGS_FOLDER}/{chunk_pattern}'))
-                for part in files_to_upload:
-                    file_size += os.path.getsize(part)
-            else:
-                files_to_upload = [f'{DIAGS_FOLDER}/{file_name}']
-                file_size = os.path.getsize(f'{DIAGS_FOLDER}/{file_name}')
-
-            start_byte = 0
-            part_sent = 0
-            self.log.info(f"Operations ({si_requestid}): uploading file {file_name} to <{ecurep_file_upload_url}>")
-            for file_path in sorted(files_to_upload):
-                chunk_size = os.path.getsize(file_path)
-                with open(file_path, 'rb') as file:
-                    if chunk_pattern:
-                        self.log.info(f"Operations ({si_requestid}): uploading part {file_path} to <{ecurep_file_upload_url}>")
-                    resp = requests.post(url = ecurep_file_upload_url,
-                                        data = file.read(),
-                                        headers = {'Content-Type': 'application/octet-stream',
-                                                   'X-File-Name': file_name,
-                                                   'X-File-Size': f'{file_size}',
-                                                   'Content-Range': f'bytes {start_byte}-{chunk_size + start_byte}/{file_size}'
-                                        },
-                    )
-                    self.log.info(f'Operations ({si_requestid}): uploaded {file_name} -> bytes {start_byte}-{chunk_size + start_byte}/{file_size}')
-                    resp.raise_for_status()
-                start_byte += chunk_size
-                part_sent += 1
-                if chunk_pattern:
-                    operations[op_key]["progress"] = int(part_sent/len(files_to_upload) * 100)
-                operations[op_key]["description"] = f"file <{file_name}> is being sent"
-                self.send_operation_report(op_key)
+            self.ur_queue_run();
+            self.health_checks.pop('CHA_ERROR_SERVING_UR', None)
+            self.scheduler.enter(30, 1, self.run_scheduled_ur_queue_run)
         except Exception as ex:
-            explanation = resp.text if resp else ""
-            raise SendError(f'Operations ({si_requestid}): Failed to send <{file_path}> to <{ecurep_file_upload_url}>: {ex}: {explanation}')
-
-    def prepare_reports(self):
-        self.reports = {'inventory': Report('inventory',
-                                            'ceph_inventory',
-                                            'Ceph cluster composition',
-                                            self.icn,
-                                            self.owner_tenant_id,
-                                            inventory,
-                                            self.cha_target_url,
-                                            self.proxy,
-                                            self.interval_inventory_seconds,
-                                            self),
-                        'status': Report('status',
-                                         'ceph_health',
-                                         'Ceph cluster status and health',
-                                         self.icn,
-                                         self.owner_tenant_id,
-                                         status,
-                                         self.cha_target_url,
-                                         self.proxy,
-                                         self.interval_status_seconds,
-                                         self),
-                        'last_contact': Report('last_contact',
-                                               'ceph_last_contact',
-                                               'Last contact timestamps with Ceph cluster',
-                                               self.icn,
-                                               self.owner_tenant_id,
-                                               last_contact,
-                                               self.cha_target_url,
-                                               self.proxy,
-                                               self.interval_last_contact_seconds,
-                                               self),
-                        'alerts': Report('status',
-                                         'ceph_alerts',
-                                         'Ceph cluster alerts',
-                                         self.icn,
-                                         self.owner_tenant_id,
-                                         generate_alerts_report,
-                                         self.cha_target_url,
-                                         self.proxy,
-                                         self.interval_alerts_seconds,
-                                         self),
-                        'performance': Report('performance',
-                                              'ceph_performance',
-                                              'Cluster performance metrics',
-                                              self.icn,
-                                              self.owner_tenant_id,
-                                              performance,
-                                              self.cha_target_url,
-                                              self.proxy,
-                                              self.interval_performance_seconds,
-                                              self)
-        }
+            send_error = str(ex)
+            self.log.error(f"Error running uncolicited request handler: {ex}\n{traceback.format_exc()}")
+            self.health_checks.update({
+                'CHA_ERROR_SERVING_UR': {
+                    'severity': 'error',
+                    'summary': f"IBM Ceph Call Home Agent manager module: Error running uncolicited request handler",
+                    'detail': [send_error]
+                }
+            })
 
-    def config_notify(self) -> None:
-        """
-        This only affects changes in ceph config options.
-        To change configuration using env. vars a restart of the module
-        will be neeed or the change in one config option will refresh
-        configuration coming from env vars
-        """
-        self.refresh_options()
-        self.prepare_reports()
-        self.clean_coroutines()
-        self.launch_coroutines()
+        self.set_health_checks(self.health_checks)
 
-    async def control_task(self, seconds: int) -> None:
+    def run_scheduled_report(self, report_class, interval_option_name, last_upload_option_name) -> None:
         """
-            Coroutine to allow cancel and reconfigure coroutines in only 10s
+        Called from the scheduler to run a report
+        report_class: One of the report or workflow classes - not a report object
         """
+        # Save to store the time in which we last tried to send this report. even if we fail to send.
+        # this will help not send in a loop if the sending itself crashes the manager.
+        self.set_store(last_upload_option_name, str(int(time.time())))  # argument 2 must be str or None
+
         try:
-            while self.run:
-                await asyncio.sleep(seconds)
-        except asyncio.CancelledError:
-            return
+            report_class(self).run()
+        except Exception as ex:
+            send_error = str(ex)
+            self.log.error(f"Error running report/workflow {report_class.__name__}: {ex}\n{traceback.format_exc()}")
+            self.health_checks.update({
+                'CHA_ERROR_SENDING_REPORT': {
+                    'severity': 'error',
+                    'summary': f"IBM Ceph Call Home Agent manager module: error sending <{report_class.__name__}> report to endpoint {self.target}",
+                    'detail': [send_error]
+                }
+            })
 
-    async def process_operations(self, seconds: int) -> None:
-        """
-            Coroutine to process operations:
+        self.set_health_checks(self.health_checks)
+        wait_time = getattr(self, interval_option_name)
+        self.scheduler.enter(wait_time, 1, self.run_scheduled_report, argument=(report_class, interval_option_name, last_upload_option_name))
 
-            Remove "completed" operations
-            Takes "new" operations moving them to "in progress"
-            Process the operation moving it to "complete" or "error"
+    def schedule_tasks(self) -> None:
+        for report in self.reports:
+            # interval==0 means it's disabled
+            interval = getattr(self, report['interval_option_name'])
+            if interval == 0:
+                continue
 
-            {{'1234': {'pmr': 'TS1234567',
-                       'level': '3',
-                       'enable_status': 'true',
-                       'version': 1,
-                       'status': 0,
-                       'type': 'upload_snap',
-                       'si_requestid': '1234',
-                       'created': 1707818993.8846028}}
+            # Get the last time it ran
+            last_upload_option_name = f"report_{report['class'].__name__}_last_upload"
+            last_upload = int(self.get_store(last_upload_option_name, 0))
 
-        """
-        try:
-            while self.run:
-                try:
-                    self.log.info("Operations: started")
-                    # Clean any operation in final state
-                    for operation_key in list(operations):
-                        self.log.info("Operations: cleaning finished operations")
-                        if operations[operation_key]['status'] in [OPERATION_STATUS_COMPLETE,
-                                                                  OPERATION_STATUS_ERROR,
-                                                                  OPERATION_STATUS_REQUEST_REJECTED] and operations[operation_key]['status_sent'] == ST_SENT:
-
-                            # Do not delete operations inside the upload snap cooling window
-                            if operations[operation_key]['type'] == UPLOAD_SNAP and 'created' in operations[operation_key].keys():
-                               if int(time.time() - operations[operation_key]['created']) <= self.upload_ops_persistence_seconds:
-                                   continue
-
-                            self.log.info(f'Operations ({operation_key}): Removed finished  <{operations[operation_key]["type"]}> operation with status <{operations[operation_key]["status"]}>')
-                            del operations[operation_key]
-
-                    # Process rest of operations
-                    self.log.info("Operations: Processing ....")
-                    for operation_key, operation in operations.items():
-
-                        # Pending finished operations
-                        if  operation['status'] in [OPERATION_STATUS_COMPLETE,
-                                                    OPERATION_STATUS_ERROR,
-                                                    OPERATION_STATUS_REQUEST_REJECTED] and operation['status_sent'] == ST_NOT_SENT:
-                            self.log.info("Operations: Processing finished operations ....")
-                            self.send_operation_report(operation_key)
-
-
-                        # Process new operations
-                        if operation["status"] == OPERATION_STATUS_NEW:
-                            self.log.info("Operations: Processing new operations ....")
-                            try:
-                                operation["status"] = OPERATION_STATUS_IN_PROGRESS
-                                self.log.info(f'Operations ({operation_key}):  <{operation["type"]}> operation status is <{operation["status"]}> now>')
-                                commands_file = collect_diagnostic_commands(self, operation_key)
-                                sos_files_pattern = ""
-                                if int(operation["level"]) > 1:
-                                    sos_files_pattern = collect_sos_report(self, operation_key)
-                                self.send_diagnostics(operation_key, commands_file, sos_files_pattern)
-                                self.log.info(f'Operations ({operation_key}): Completed <{operation["type"]}> operation')
-                                operation["status"] = OPERATION_STATUS_COMPLETE
-                                operation["progress"] = 100
-                                operation["description"] = OPERATION_STATUS_COMPLETE
-                            except Exception as ex:
-                                self.log.error(f'Operations ({operation_key}): Error processing <{operation["type"]}> operation: {ex}')
-                                operation["status"] = OPERATION_STATUS_ERROR
-
-                            # if it was ok or not, we always report the state
-                            self.send_operation_report(operation_key)
-                    self.log.info('Operations: Processing operations finished')
-                except Exception as ex:
-                    self.log.error(f"Operations ({operation_key}): error: {ex}")
-
-                # persist operations
-                self.set_store('db_operations', json.dumps(operations))
-                self.log.debug(f"updating operations db: {json.dumps(operations)}")
-
-                await asyncio.sleep(seconds)
-        except asyncio.CancelledError:
-            return
-
-    async def report_task(self, report: Report) -> None:
-        """
-            Coroutine for sending the report passed as parameter
-        """
-        self.log.info('Launched task for <%s> report each %s seconds)', report.report_type, report.interval)
+            now = int(time.time())
 
-        try:
-            while self.run:
-                try:
-                    report.send()
-                except Exception as ex:
-                    send_error = str(ex)
-                    self.log.error(send_error)
-                    self.health_checks.update({
-                        'CHA_ERROR_SENDING_REPORT': {
-                            'severity': 'error',
-                            'summary': 'IBM Ceph Call Home Agent manager module: error sending <{}> report to '
-                                    'endpoint {}'.format(report.report_type, self.cha_target_url),
-                            'detail': [send_error]
-                        }
-                    })
+            # We want to immediately send all reports the first time that Ceph runs (and therefore there is no report_*_last_upload in the DB).
+            # this is for the inventory report to be sent immediately (and not after 24h)
+            if last_upload == 0:
+                next_send = now
+            else:
+                next_send = max(last_upload + interval, now)
 
-                self.set_health_checks(self.health_checks)
-                await asyncio.sleep(report.interval)
-        except asyncio.CancelledError:
-            return
+            self.scheduler.enter(next_send - now, 1, self.run_scheduled_report, argument=(report['class'], report['interval_option_name'], last_upload_option_name))
 
-    def launch_coroutines(self) -> None:
-        """
-         Launch module coroutines (reports or any other async task)
-        """
-        try:
-            # tasks for periodic reports
-            for report_name, report in self.reports.items():
-                t = self.loop.create_task(self.report_task(report))
-                self.tasks.append(t)
-            # task for process requested operations
-            t = self.loop.create_task(self.process_operations(30))
-            self.tasks.append(t)
-            # create control task to allow to reconfigure reports in 10 seconds
-            t = self.loop.create_task(self.control_task(10))
-            self.tasks.append(t)
-            # run the async loop
-            self.loop.run_forever()
-        except Exception as ex:
-            if str(ex) != 'This event loop is already running':
-                self.log.exception(str(ex))
+        # Schedule the Unsolicited Request handler
+        self.scheduler.enter(30, 1, self.run_scheduled_ur_queue_run)
 
-    def serve(self) -> None:
+    def config_notify(self) -> None:
         """
-            - Launch coroutines for report tasks
+        This only affects changes in ceph config options.
+        To change configuration using env. vars a restart of the module
+        will be needed or the change in one config option will refresh
+        configuration coming from env vars
         """
-        self.log.info('Starting IBM Ceph Call Home Agent')
+        self.refresh_options()
+        # Reset the scheduler - effectively emptying it
+        self.scheduler = sched.scheduler(time.time, time.sleep)
+        self.schedule_tasks()
+        self.event.set()
 
-        # Launch coroutines for the reports
-        self.launch_coroutines()
+    def serve(self):
+        self.log.info('Starting IBM Ceph Call Home Agent')
+        self.scheduler = sched.scheduler(time.time, time.sleep)
+        self.schedule_tasks()
+        while self.run:
+            # Passing False causes the scheduler.run() to return the time until the next event. therefore we're not blocked in
+            # the scheduler, but we block ourselves using self.event.sleep() which can be interrupted by self.event.set()
+            # which we use in shutdown() and refresh_options()
+            next_event_seconds = self.scheduler.run(False)
+            self.event.wait(next_event_seconds)
+            self.event.clear()
 
         self.log.info('Call home agent finished')
 
-    def clean_coroutines(self) -> None:
-        """
-        This method is called by the mgr when the module needs to shut
-        down (i.e., when the serve() function needs to exit).
-        """
-        self.log.info('Cleaning coroutines')
-        for t in self.tasks:
-            t.cancel()
-        self.tasks = []
-
     def shutdown(self) -> None:
         self.log.info('Stopping IBM call home module')
         self.run = False
-        self.clean_coroutines
-        self.loop.stop()
+        self.event.set()
         return super().shutdown()
 
-    def send_diagnostics(self, op_key: str, cmd_file_name: str, sos_files_pattern: str) -> None:
-        """
-        """
-        # Send commands file:
-        self.upload_file(op_key, cmd_file_name)
-
-        # Send sos file splitted when we have files
-        if sos_files_pattern:
-            sos_file_name = f'{sos_files_pattern[:-2]}.xz'
-            self.upload_file(op_key, sos_file_name, sos_files_pattern)
-
-    def send_operation_report(self, key:str) -> None:
-        try:
-            # Use a counter to make event_id unique for each operation
-            counter = 0
-            try:
-                counter = operations[key]["counter"]
-            except KeyError:
-                operations[key]["counter"] = counter
-
-            op_report = Report(report_type= f'status',
-                               component = 'ceph_operations',
-                               description=f'operation {operations[key]["type"]}',
-                               icn= self.icn,
-                               owner_tenant_id= self.owner_tenant_id,
-                               fn= get_operation,
-                               url= self.cha_target_url,
-                               proxy= self.proxy,
-                               seconds_interval= 0,
-                               mgr_module = self,
-                               key= key,
-                               event_id= f'{operations[key]["event_id"]}-{counter}')
-            op_report.send(force=True)
-            operations[key]['status_sent'] = ST_SENT
-            operations[key]['counter'] += 1
-            self.log.info(f'Operations ({key}): call home report sent. description: {operations[key]["description"]}, status: {operations[key]["status"]}, progress: {operations[key]["progress"]}')
-            return
-        except Exception as ex:
-            self.log.error(f'Operations ({key}): Error sending <{operations[key]["type"]}> \
-                             operation report <{key}>: {ex}')
-            raise(ex)
-
     @CLIReadCommand('callhome stop')
-    def stop_cmd(self) -> Tuple[int, str, str]:
+    def cli_stop(self) -> Tuple[int, str, str]:
         self.shutdown()
-        return HandleCommandResult(stdout=f'Remember to disable the '
-                                   'call home module')
+        return HandleCommandResult(stdout=f'Remember to disable the call home module')
 
     @CLIReadCommand('callhome reset alerts')
-    def reset_alerts(self, mock: Optional[bool] = False) -> Tuple[int, str, str]:
+    def cli_reset_alerts(self, mock: Optional[bool] = False) -> Tuple[int, str, str]:
         """
         Resets the local list of alerts that were sent to Call Home to allow
         for existing alerts to be resent.
 
         :param mock: generates a dummy alert
+            If there are no relevant alerts in the cluster, an "alerts" report will not be sent.
+            "--mock" is useful in this case, to allow the user to send a dummy "alerts" report to Call Home.
         """
-        global sent_alerts
-        if mock:
-            # If there are no relevant alerts in the cluster, an "alerts" report will not be sent.
-            # "--mock" is useful in this case, to allow the user to send a dummy "alerts" report to Call Home.
-            mocked_alert = {'labels': {'label': 'test'}, 'activeAt': '42', 'value': '17'}
-            sent_alerts = {alert_uid(mocked_alert): mocked_alert}
-        else:
-            sent_alerts = {}
+        ReportStatusAlerts.resetAlerts(mock)
         return HandleCommandResult(stdout=f"Sent alerts list has been reset. Next alerts report will send all current alerts.")
 
+    def _filter_report(self, report_dict: dict, fields_to_remove = ['api_key', 'private_key']) -> dict:
+        for field in fields_to_remove:
+            report_dict.pop(field, None)
+
+        report_dict.get('transaction', {}).pop('api_key', None)
+        return report_dict
+
+    def _find_report_by_name(self, name: str) -> dict:
+        found_reports = list(filter(lambda r: r['name'] == name, self.reports))
+        if not found_reports:
+            return None
+        return found_reports[0]
+
     @CLIReadCommand('callhome show')
-    def print_report_cmd(self, report_type: str) -> Tuple[int, str, str]:
+    def cli_show(self, report_type: str) -> Tuple[int, str, str]:
         """
             Prints the report requested.
             Available reports: inventory, status, last_contact, alerts, performance
             Example:
                 ceph callhome show inventory
         """
-        global sent_alerts
-        if report_type in self.reports.keys():
-            if report_type == 'alerts':
-                # The "alerts" report only sends alerts that are not in 'sent_alerts', and then updates 'sent_alerts'
-                # with the alerts sent. For 'callhome show' not to affect the regular workflow, we need to restore
-                # 'sent_alerts' to what it was before 'callhome show' generated the alerts report.
-                tmp_sent_alerts = sent_alerts
-            filtered_report = self.reports[report_type].filter_report(['api_key', 'private_key'])
-            if report_type == 'alerts':
-                sent_alerts = tmp_sent_alerts
-            if filtered_report is None:
-                return HandleCommandResult(stdout=f"Report is empty")
-            return HandleCommandResult(stdout=f"{filtered_report}")
-        else:
-            return HandleCommandResult(stderr='Unknown report type')
+        report = self._find_report_by_name(report_type)
+        if report is None:
+            return HandleCommandResult(stderr=f"Unknown report type {report_type}.")
+
+        if report_type == 'alerts':
+            # The "alerts" report only sends alerts that are not in 'sent_alerts', and then updates 'sent_alerts'
+            # with the alerts sent. For 'callhome show' not to affect the regular workflow, we need to restore
+            # 'sent_alerts' to what it was before 'callhome show' generated the alerts report.
+            tmp_sent_alerts = ReportStatusAlerts.sent_alerts
+
+        report_dict = report['class'](self).compile()
+        if report_dict is None:
+            return HandleCommandResult(stdout=f"Report {report_type} is empty. Nothing to send.")
+
+        filtered_report = self._filter_report(report_dict)
+
+        if report_type == 'alerts':
+            ReportStatusAlerts.sent_alerts = tmp_sent_alerts
+
+        return HandleCommandResult(stdout=f"{json.dumps(filtered_report, indent=4)}")
+
+    def test_connectivity(self) -> Tuple[int, str, str]:
+        return self.cli_send("status")
+
+    def get_call_home_status(self) -> dict[any, any]:
+        return self.connectivity_status
+
+    @CLIReadCommand('callhome connectivity status')
+    def cli_connectivity_status(self) -> Tuple[int, str, str]:
+        return HandleCommandResult(stdout=json.dumps(self.connectivity_status, indent=4))
 
     @CLIReadCommand('callhome send')
-    def send_report_cmd(self, report_type: str) -> Tuple[int, str, str]:
+    def cli_send(self, report_type: str) -> Tuple[int, str, str]:
         """
             Command for sending the report requested.
             Available reports: inventory, status, last_contact, alerts, performance
             Example:
                 ceph callhome send inventory
         """
+        report = self._find_report_by_name(report_type)
+        if report is None:
+            return HandleCommandResult(stderr=f"Unknown report type {report_type}.", retval=-1)
+
         try:
-            if report_type in self.reports.keys():
-                resp = self.reports[report_type].send(force=True)
-            else:
-                raise Exception('Unknown report type')
+            resp = report['class'](self).run()
         except Exception as ex:
-            return HandleCommandResult(stderr=str(ex))
-        else:
-            if resp == None:
-                return HandleCommandResult(stdout=f'{report_type} report: Nothing to send\n')
-            else:
-                return HandleCommandResult(stdout=f'{report_type} report sent successfully:\n{resp}')
+            return HandleCommandResult(stderr=str(ex), retval=-1)
 
+        if resp == None:
+            return HandleCommandResult(stdout=f'{report_type} report: Nothing to send\n')
+
+        try:
+            resp = json.dumps(self._filter_report(json.loads(resp)), indent=4)
+        except:
+            pass
+        return HandleCommandResult(stdout=f'{report_type} report sent successfully:\n{resp}')
 
     @CLIReadCommand('callhome list-tenants')
-    def list_tenants(self, owner_ibm_id: str, owner_company_name: str,
+    def cli_list_tenants(self, owner_ibm_id: str, owner_company_name: str,
                        owner_first_name: str, owner_last_name: str,
                        owner_email: str) -> Tuple[int, str, str]:
         """
@@ -1688,18 +771,19 @@ class CallHomeAgent(MgrModule):
                                         'IBM-SRM-SenderApp': 'CEPH-EM',
                                         'IBM-SRM-Request': 'SI-SignUp-Check'},
                                 data=json.dumps(owner_data),
-                                proxies=self.proxy)
+                                proxies=self.proxy,
+                                timeout=30)
 
             resp.raise_for_status()
         except Exception as ex:
             explanation = resp.text if resp else str(ex)
-            self.log.error(explanation)
-            return HandleCommandResult(stderr=explanation)
+            self.log.error(f"Failed to list tenants: {explanation}")
+            return HandleCommandResult(stderr=f"Failed to list tenants: {explanation}")
         else:
-            return HandleCommandResult(stdout=f'{json.dumps(resp.json())}')
+            return HandleCommandResult(stdout=f'{json.dumps(resp.json(), indent=4)}')
 
     @CLIWriteCommand('callhome set tenant')
-    def set_tenant_id(self, owner_tenant_id: str, owner_ibm_id: str,
+    def cli_set_tenant(self, owner_tenant_id: str, owner_ibm_id: str,
                       owner_company_name: str, owner_first_name: str,
                       owner_last_name: str, owner_email: str) -> Tuple[int, str, str]:
         """
@@ -1712,14 +796,15 @@ class CallHomeAgent(MgrModule):
             self.set_module_option('owner_first_name', owner_first_name)
             self.set_module_option('owner_last_name', owner_last_name)
             self.set_module_option('owner_email', owner_email)
-            self.prepare_reports()
         except Exception as ex:
             return HandleCommandResult(stderr=str(ex))
         else:
             return HandleCommandResult(stdout=f'IBM tenant id set to {owner_tenant_id}')
+        finally:
+            self.refresh_options()  # This will always run, no matter what.
 
     @CLIReadCommand('callhome get user info')
-    def customer(self) ->  Tuple[int, str, str]:
+    def cli_get_user_info(self) -> Tuple[int, str, str]:
         """
         Show the information about the customer used to identify the customer
         in IBM call home and IBM storage insights systems
@@ -1746,61 +831,45 @@ class CallHomeAgent(MgrModule):
             }))
 
     @CLIReadCommand('callhome upload diagnostics')
-    def upload_diags(self, support_ticket: str, level: int) ->  Tuple[int, str, str]:
+    def cli_upload_diagnostics(self, support_ticket: str, level: int) -> Tuple[int, str, str]:
         """
-        Upload Ceph cluster diagnostics to Ecurep for an specific customer support ticket
+        Upload Ceph cluster diagnostics to Ecurep for a specific customer support ticket.
         """
-
+        # The upload happens immediately without the constraints of self.ur_stale or self.ur_cooldown
+        # and does not populate those after sending. No need to clear any queue for this command to
+        # be executed
         try:
-            request = {'operation': 'upload_snap',
-                    'options': {'pmr': f'{support_ticket}',
-                                'level': f'{level}',
-                                'si_requestid':f'si_request_{uuid.uuid4()}',
-                                'enable_status': 'true',
-                                'version': 1}
-            }
-            key = add_operation(request,
-                                self.level_one_upload_cooling_window_seconds,
-                                self.level_two_upload_cooling_window_seconds)
-
+            request = {
+                    'options': {
+                        'pmr': support_ticket,
+                        'level': level
+                        # No need to simulate si_requestid
+                        }
+                    }
+            WorkFlowUploadSnap(self, request, 'cli_upload_diagnostics', None).run()
         except Exception as ex:
-            return HandleCommandResult(stderr=str(ex))
+            return HandleCommandResult(stderr=f"Error sending diagnostics: {ex}")
         else:
-            return HandleCommandResult(stdout=f'{operations[key]}')
+            return HandleCommandResult(stdout='Success')
 
-    @CLIReadCommand('callhome operations')
-    def list_operations(self) ->  Tuple[int, str, str]:
+    @CLIReadCommand('callhome list queues')
+    def cli_list_queues(self) -> Tuple[int, str, str]:
         """
-        Show the operations list
+        Show the state of the unsolicited requests queues
         """
-        try:
-            output = '\n'.join(f'{key}:{value}' for key, value in operations.items())
-        except Exception as ex:
-            return HandleCommandResult(stderr=str(ex))
-        else:
-            return HandleCommandResult(stdout=output)
+        ret = {'ur_queue': self.ur_queue, 'ur_stale': self.ur_stale, 'ur_cooldown': self.ur_cooldown}
+        return HandleCommandResult(stdout=json.dumps(ret, indent=4))
 
-    @CLIWriteCommand('callhome operations clean')
-    def clean_operations(self, operation_id: str = "") ->  Tuple[int, str, str]:
+    @CLIWriteCommand('callhome clear queues')
+    def cli_clear_queues(self) -> Tuple[int, str, str]:
         """
-        Remove an operation (if provided operation_id) from the operations list
-        If no operation_id provided clean completelly the operations list
+        Clear the unsolicited requests queues
         """
-        try:
-            if operation_id:
-                if operation_id in operations.keys():
-                    del operations[operation_id]
-            else:
-                operations.clear()
-
-            output = json.dumps(operations)
-
-            # persist operations
-            self.set_store('db_operations', json.dumps(operations))
-            self.log.debug(f"updating operations db after cleaning: {json.dumps(operations)}")
 
-        except Exception as ex:
-            return HandleCommandResult(stderr=str(ex))
-        else:
-            return HandleCommandResult(stdout=output)
+        self.ur_queue = []
+        self.ur_stale = {}
+        self.ur_cooldown = {}
+        self.set_store('ur_stale', json.dumps(self.ur_stale))
+        self.set_store('ur_cooldown', json.dumps(self.ur_cooldown))
 
+        return HandleCommandResult(stdout="Success")
diff --git a/src/pybind/mgr/call_home_agent/prometheus.py b/src/pybind/mgr/call_home_agent/prometheus.py
new file mode 100644 (file)
index 0000000..8b993d4
--- /dev/null
@@ -0,0 +1,44 @@
+import requests
+from typing import Optional
+
+class Prometheus():
+
+    def __init__(self, mgr) -> None:
+        self.mgr = mgr
+        self.url = self.prometheus_url()
+
+    def prometheus_url(self) -> str:
+        daemon_list = self.mgr.remote('cephadm', 'list_daemons', service_name='prometheus')
+        if daemon_list.exception_str:
+            raise Exception(f"Error finding the Prometheus instance: {daemon_list.exception_str}")
+        if len(daemon_list.result) < 1:
+            raise Exception(f"Can't find the Prometheus instance")
+
+        d = daemon_list.result[0]
+        host = d.ip if d.ip else d.hostname  # ip is of type str
+        port = str(d.ports[0]) if d.ports else ""  # ports is a list of ints
+        if not (host and port):
+            raise Exception(f"Can't get Prometheus IP and/or port from manager")
+
+        return f"http://{host}:{port}/api/v1"
+
+    def get(self, endpoint: str, params: Optional[dict] = None) -> dict:
+        """
+        Execute a Prometheus query and return the result as dict
+        """
+        result = {}
+        try:
+            r = requests.get(f"{self.url}/{endpoint}", params=params)
+            r.raise_for_status()
+            result = r.json()
+        except Exception as e:
+            raise Exception(f"Error executing Prometheus query: {e} - {result}")
+        return result
+
+    def query(self, query: str) -> dict:
+        return self.get("query", {'query': query})
+
+    def status(self) -> dict:
+        """Get information about prometheus server status"""
+        return self.get("targets")
+
diff --git a/src/pybind/mgr/call_home_agent/report.py b/src/pybind/mgr/call_home_agent/report.py
new file mode 100644 (file)
index 0000000..9c741ef
--- /dev/null
@@ -0,0 +1,191 @@
+
+from datetime import datetime
+from typing import Optional
+from .exceptions import SendError
+import time
+import requests
+import json
+
+class ReportTimes:
+    def __init__(self, now = datetime.now()):
+        self.time = now.strftime("%Y-%m-%d %H:%M:%S")
+        self.time_ms = int(datetime.timestamp(now) * 1000)
+        self.local_time = now.strftime("%a %b %d %H:%M:%S %Z")
+
+class Report:
+    """
+    Base class for all reports
+    """
+
+    def __init__(self, agent, report_type, event_classes = []):
+        """
+        Args:
+          agent: a reference to a CallHomeAgent object, which inherits from MgrModule
+        """ 
+        self.agent = agent
+        self.agent.log.debug(f"Instantiating {self.__class__.__name__}, report_type={report_type}")
+        self.report_type = report_type
+        self.event_classes = event_classes
+        self.report_event_id = None
+
+    def compile(self) -> Optional[dict]:
+        report_times = ReportTimes()
+        report = self.get_report_headers(report_times, self.report_event_id)
+        for event_class in self.event_classes:
+            event = event_class(self.agent).generate(report_times)
+            report['events'].append(event.data)
+
+        return report
+
+    def run(self) -> Optional[str]:
+        compiled = self.compile()
+        if compiled is None:
+            return None
+        return self.send(compiled)
+
+    def get_report_headers(self, report_times: ReportTimes, report_event_id = None) -> dict:
+        try:
+            secrets = self.agent.get_secrets()
+        except Exception as e:
+            self.agent.log.error(f"Error getting encrypted identification keys for {self.report_type} report: {e}. "
+                                 "Provide keys and restart IBM Ceph Call Home module")
+            secrets = {'api_key': '', 'private_key': ''}
+
+        target_space = self.agent.target_space  # One of 'prod', 'test', 'dev'
+
+        if not report_event_id:
+            report_event_id = f"IBM_chc_event_RedHatMarine_ceph_{self.agent.ceph_cluster_id}_{self.report_type}_report_{report_times.time_ms}"
+
+        header = {
+                "agent": "RedHat_Marine_firmware_agent",
+                "api_key": secrets['api_key'],
+                "private_key": secrets['private_key'],
+                "target_space": target_space,
+                "asset": "ceph",
+                "asset_id": self.agent.ceph_cluster_id,
+                "asset_type": "RedHatMarine",
+                "asset_vendor": "IBM",
+                "asset_virtual_id": self.agent.ceph_cluster_id,
+                "country_code": "",
+                "event_id": report_event_id,
+                "event_time": report_times.time,
+                "event_time_ms": report_times.time_ms,
+                "local_event_time": report_times.local_time,
+                "software_level": {
+                    "name": "ceph_software",
+                    "vrmf": self.agent.version
+                },
+                "type": "eccnext_apisv1s",
+                "version": "1.0.0.1",
+                "analytics_event_source_type": "asset_event",
+                "analytics_type": "ceph",
+                "analytics_instance":  self.agent.ceph_cluster_id,
+                "analytics_virtual_id": self.agent.ceph_cluster_id,
+                "analytics_group": "Storage",
+                "analytics_category": "RedHatMarine",
+                "events": []
+            }
+
+        #header.update(self._header_times(report_timestamp))
+
+        return header
+
+    def send(self, report: dict, force: bool = False) -> str:
+        resp = None
+        url = self.agent.target
+
+        if self.agent.proxies:
+            self.agent.log.info(f"Sending <{self.report_type}> report to <{url}> (via proxies <{self.agent.proxies}>)")
+        else:
+            self.agent.log.info(f"Sending <{self.report_type}> report to <{url}>")
+
+        try:
+            resp = requests.post(url=url,
+                                 headers={'accept': 'application/json', 'content-type': 'application/json'},
+                                 data=json.dumps(report),
+                                 proxies=self.agent.proxies,
+                                 timeout=60)
+            self.agent.log.debug(f"Report response: {resp.text}")
+            resp.raise_for_status()
+
+            ch_response = resp.json()
+            self.agent.connectivity_update(ch_response)
+        except Exception as e:
+            self.agent.connectivity_update_error(e)
+            raise
+
+        try:
+            self.agent.health_checks.pop('CHA_ERROR_SENDING_REPORT', None)
+            last_id = report.get('event_time_ms', 'Unknown')
+            self.agent.log.info(f"Successfully sent <{self.report_type}> report({last_id}) to <{url}>")
+            # Process unsolicited requests, i.e. requests sent to us by Call Home embedded in the HTTP response to last_contact messages.
+            # In the future we may get those in the response of other message types
+            self.agent.process_response(ch_response)
+            return resp.text
+        except Exception as e:
+            explanation = resp.text if resp else ""
+            raise SendError(f"Failed to send <{self.report_type}> to <{url}>: {e} {explanation}")
+
+# Event methods
+class Event:
+    def __init__(self, agent):
+        self.agent = agent
+
+    def generate(self, event_type: str, component: str, report_times: ReportTimes):
+        # The below line is what was the event_id in the old code (7.1).
+        # There is a problem where both [event_type="status",component="ceph_alerts"] and [event_type="status",component="ceph_health"]
+        #   can generate the same event_id if both are generated at the same millisecond. Therefore we added "{component}" to the event_id
+        #self.event_event_id = f"IBM_event_RedHatMarine_ceph_{self.agent.ceph_cluster_id}_{report_times.time_ms}_{event_type}_event"
+        self.event_event_id = f"IBM_event_RedHatMarine_ceph_{self.agent.ceph_cluster_id}_{report_times.time_ms}_{event_type}_{component}_event"
+        self.data = {
+                "header": {
+                    "event_id": self.event_event_id, # "IBM_event_RedHatMarine_ceph_{}_{}_{}_event".format(ceph_cluster_id, event_time_ms, event_type),
+                    "event_time": report_times.time,
+                    "event_time_ms": report_times.time_ms,
+                    "event_type": event_type,
+                    "local_event_time": report_times.local_time # TODO check if including local_event_time also works in confirm_response and log_upload/status
+                    },
+                "body": {
+                    "component": component,
+                }
+        }
+
+        if self.agent.owner_tenant_id:  # send 'tenant_id' only if the cluster opted-in to Storage Insights.
+            self.data["header"]["tenant_id"] = self.agent.owner_tenant_id
+        return self
+
+    def set_content(self, content):
+        # payload may or may not exist. create it if it doesn't, append to it if it does
+        self.data['body'].setdefault('payload', {})['content'] = content
+
+class EventGeneric(Event):
+    def generate(self, event_type: str, component: str, description: str, report_times: ReportTimes):
+        super().generate(event_type, component, report_times)
+        self.data['body'].update( {
+            "context": {
+                "origin": 2,
+                "timestamp": report_times.time_ms,
+                "transid": report_times.time_ms
+            },
+            "description": description,
+            "payload": {
+                "request_time": report_times.time_ms,
+                "content": {},  # will be filled later
+                "ibm_customer_number": self.agent.icn,
+                "product_id_list" : [
+                    ['5900-AVA', 'D0CYVZX'],
+                    ['5900-AVA', 'D0CYWZX'],
+                    ['5900-AVA', 'D0CYXZX'],
+                    ['5900-AVA', 'D0DKDZX'],
+                    ['5900-AVA', 'E0CYUZX'],
+                    ['5900-AXK', 'D0DSJZX'],
+                    ['5900-AXK', 'D0DSKZX'],
+                    ['5900-AXK', 'D0DSMZX'],
+                    ['5900-AXK', 'D0DSLZX'],
+                    ['5900-AXK', 'E0DSIZX'],
+                ],
+                "jti": self.agent.jwt_jti
+            }
+        } )
+        return self
+
diff --git a/src/pybind/mgr/call_home_agent/report_inventory.py b/src/pybind/mgr/call_home_agent/report_inventory.py
new file mode 100644 (file)
index 0000000..b3dec67
--- /dev/null
@@ -0,0 +1,44 @@
+from .report import Report, ReportTimes, EventGeneric
+from .report_status_health import EventStatusHealth
+import time
+
class ReportInventory(Report):
    """Report wrapping the single cluster-inventory event."""

    def __init__(self, agent) -> None:
        # Registered under the 'inventory' report name with one event type.
        event_types = [EventInventory]
        super().__init__(agent, 'inventory', event_types)
+
class EventInventory(EventGeneric):
    """Event carrying the cluster composition (maps, devices, services, hardware)."""

    def gather(self):
        """Collect cluster composition data plus the current status/health report."""
        mgr_get = self.agent.get
        inventory = {
            "crush_map": mgr_get("osd_map_crush"),
            "devices": mgr_get("devices"),
            "df": mgr_get("df"),
            "fs_map": mgr_get("fs_map"),
            "hosts": self.agent.list_servers(),
            "manager_map": mgr_get("mgr_map"),
            "mon_map": mgr_get("mon_map"),
            "osd_map": mgr_get("osd_map"),
            "osd_metadata": mgr_get("osd_metadata"),
            "osd_tree": mgr_get("osd_map_tree"),
            "pg_summary": mgr_get("pg_summary"),
            "service_map": mgr_get("service_map"),
            "hardware_status": self._get_hardware_status(),
        }
        # Merge in the status/health report as well.
        inventory.update(EventStatusHealth(self.agent).gather())
        return {'inventory': inventory}

    def generate(self, report_times: ReportTimes):
        """Build the inventory event and fill its content; returns self for chaining."""
        super().generate('inventory', 'ceph_inventory', 'Ceph cluster composition', report_times)
        self.set_content(self.gather())
        return self

    def _get_hardware_status(self) -> dict:
        """Fetch hardware status from the orchestrator.

        Best-effort: on any failure the exception is logged and an error dict
        is returned instead, so the inventory report still goes out.
        """
        try:
            hw_status = self.agent.remote('orchestrator', 'node_proxy_summary')
            if hw_status.exception_str:
                raise Exception(hw_status.exception_str)
            return hw_status.result
        except Exception as e:
            self.agent.log.exception(str(e))
            return {'error': str(e)}
+
diff --git a/src/pybind/mgr/call_home_agent/report_last_contact.py b/src/pybind/mgr/call_home_agent/report_last_contact.py
new file mode 100644 (file)
index 0000000..303402d
--- /dev/null
@@ -0,0 +1,23 @@
+from .report import Report, ReportTimes, EventGeneric
+import time
+
class ReportLastContact(Report):
    """Report wrapping the single last-contact (heartbeat) event."""

    def __init__(self, agent) -> None:
        # Registered under the 'last_contact' report name with one event type.
        event_types = [EventLastContact]
        super().__init__(agent, 'last_contact', event_types)
+
class EventLastContact(EventGeneric):
    """Heartbeat event: reports the current time as the last contact with the cluster."""

    def gather(self) -> dict:
        # Current epoch time in seconds, as a string (str() instead of the
        # less idiomatic format() - identical output).
        return {'last_contact': str(int(time.time()))}

    def generate(self, report_times: ReportTimes):
        """Build the last-contact event; returns self for chaining.

        Enables response details filtered to Unsolicited Request notifications,
        so Call Home can piggyback pending UR work on the heartbeat response.
        """
        super().generate('last_contact', 'ceph_last_contact', 'Last contact timestamps with Ceph cluster', report_times)

        # TODO check whether an explicit body "event_transaction_id"
        # (IBM_event_RedHatMarine_ceph_<cluster>_<ts>_last_contact_event) is needed.
        self.data["body"]["context"]["messagetype"] = 1
        self.data["body"]["enable_response_detail"] = True
        self.data["body"]["enable_response_detail_filter"] = ["Unsolicited_Storage_Insights_RedHatMarine_ceph_Request"]

        self.set_content(self.gather())
        return self
+
+
diff --git a/src/pybind/mgr/call_home_agent/report_performance.py b/src/pybind/mgr/call_home_agent/report_performance.py
new file mode 100644 (file)
index 0000000..a0097f1
--- /dev/null
@@ -0,0 +1,253 @@
+from .report import Report, ReportTimes, Event
+from .prometheus import Prometheus
+import time
+import json
+import requests
+import zstandard
+import math
+import base64
+from datetime import datetime
+from typing import Optional
+
class ReportPerformance(Report):
    """Report wrapping the single cluster-performance event."""

    def __init__(self, agent) -> None:
        # Registered under the 'performance' report name with one event type.
        event_types = [EventPerformance]
        super().__init__(agent, 'performance', event_types)
+
+
class EventPerformance(Event):
    """Event carrying cluster performance metrics collected from Prometheus."""

    def generate(self, report_times: ReportTimes) -> 'EventPerformance':
        """Build the performance event body; returns self for chaining.

        (Return annotation fixed: it previously said ``-> None`` although
        ``self`` is returned, as in the other event classes.)
        """
        super().generate('performance', 'ceph_performance', report_times)

        self.data['body'].update({
            "context": {
                "origin": 2,
                "timestamp": report_times.time_ms,
                "transid": report_times.time_ms
            },
            "description": 'Cluster performance metrics',
            "payload": {
                # NOTE(review): gather() already nests its result under a
                # "perfstats" key, so the wire format ends up as
                # payload["perfstats"]["perfstats"]. Presumably what IBM Call
                # Home expects - confirm before flattening.
                "perfstats": self.gather()
            }
        })

        return self

    def gather(self) -> dict:
        """Query Prometheus and package the performance metrics.

        Returns a dict with a single "perfstats" entry: metrics are serialized
        to JSON, zstd-compressed and base64-encoded into "nd_stats". Failures
        are accumulated into the report's "status" field rather than raised.
        """
        p_i_m = math.ceil(self.agent.interval_performance_report_seconds / 60)  # Performance Interval in Minutes

        # PromQL queries keyed by the metric name used in the report. Each
        # range selector spans the report interval so consecutive reports tile
        # the timeline.
        queries = {
            "ceph_osd_op_r_avg"        : {"query": f"sum(avg_over_time(ceph_osd_op_r[{p_i_m}m]))/count(ceph_osd_metadata)",
                                          "help" : f"Average of read operations per second and per OSD in the cluster in the last {p_i_m} minutes"},
            "ceph_osd_op_r_min"        : {"query": f"min(min_over_time(ceph_osd_op_r[{p_i_m}m]))",
                                          "help" : f"Minimum read operations per second in the cluster in the last {p_i_m} minutes"},
            # help text fixed: this is the read-ops maximum, not writes
            "ceph_osd_op_r_max"        : {"query": f"max(max_over_time(ceph_osd_op_r[{p_i_m}m]))",
                                           "help": f"Maximum of read operations per second in the cluster in the last {p_i_m} minutes"},
            "ceph_osd_r_out_bytes_avg" : {"query": f"sum(avg_over_time(ceph_osd_op_r_out_bytes[{p_i_m}m]))/count(ceph_osd_metadata)",
                                          "help" : f"Average of cluster output bytes(reads) and per OSD in the last {p_i_m} minutes"},
            "ceph_osd_r_out_bytes_min" : {"query": f"min(min_over_time(ceph_osd_op_r_out_bytes[{p_i_m}m]))",
                                          "help" : f"Minimum of cluster output bytes(reads) in the last {p_i_m} minutes"},
            "ceph_osd_r_out_bytes_max" : {"query": f"max(max_over_time(ceph_osd_op_r_out_bytes[{p_i_m}m]))",
                                          "help" : f"Maximum of cluster output bytes(reads) in the last {p_i_m} minutes"},
            "ceph_osd_op_w_avg"        : {"query": f"sum(avg_over_time(ceph_osd_op_w[{p_i_m}m]))/count(ceph_osd_metadata)",
                                          "help" : f"Average of cluster input operations per second(writes) in the last {p_i_m} minutes"},
            "ceph_osd_op_w_min"        : {"query": f"min(min_over_time(ceph_osd_op_w[{p_i_m}m]))",
                                          "help" : f"Minimum of cluster input operations per second(writes) in the last {p_i_m} minutes"},
            "ceph_osd_op_w_max"        : {"query": f"max(max_over_time(ceph_osd_op_w[{p_i_m}m]))",
                                          "help" : f"Maximum of cluster input operations per second(writes) in the last {p_i_m} minutes"},
            "ceph_osd_op_w_in_bytes_avg"       : {"query": f"sum(avg_over_time(ceph_osd_op_w_in_bytes[{p_i_m}m]))/count(ceph_osd_metadata)",
                                                  "help" : f"Average of cluster input bytes(writes) in the last {p_i_m} minutes"},
            "ceph_osd_op_w_in_bytes_min"       : {"query": f"min(min_over_time(ceph_osd_op_w_in_bytes[{p_i_m}m]))",
                                                  "help" : f"Minimum of cluster input bytes(writes) in the last {p_i_m} minutes"},
            "ceph_osd_op_w_in_bytes_max"       : {"query": f"max(max_over_time(ceph_osd_op_w_in_bytes[{p_i_m}m]))",
                                                  "help" : f"Maximum of cluster input bytes(writes) in the last {p_i_m} minutes"},
            "ceph_osd_op_read_latency_avg_ms"  : {"query": f"avg(rate(ceph_osd_op_r_latency_sum[{p_i_m}m]) or vector(0) / on (ceph_daemon) rate(ceph_osd_op_r_latency_count[{p_i_m}m]) * 1000)",
                                                  "help" : f"Average of cluster output latency(reads) in milliseconds in the last {p_i_m} minutes"},
            "ceph_osd_op_read_latency_max_ms"  : {"query": f"max(rate(ceph_osd_op_r_latency_sum[{p_i_m}m]) or vector(0) / on (ceph_daemon) rate(ceph_osd_op_r_latency_count[{p_i_m}m]) * 1000)",
                                                  "help" : f"Maximum of cluster output latency(reads) in milliseconds in the last {p_i_m} minutes"},
            "ceph_osd_op_read_latency_min_ms"  : {"query": f"min(rate(ceph_osd_op_r_latency_sum[{p_i_m}m]) or vector(0) / on (ceph_daemon) rate(ceph_osd_op_r_latency_count[{p_i_m}m]) * 1000)",
                                                  "help" : f"Minimum of cluster output latency(reads) in milliseconds  in the last {p_i_m} minutes"},
            "ceph_osd_op_write_latency_avg_ms" : {"query": f"avg(rate(ceph_osd_op_w_latency_sum[{p_i_m}m]) or vector(0) / on (ceph_daemon) rate(ceph_osd_op_w_latency_count[{p_i_m}m]) * 1000)",
                                                  "help" : f"Average of cluster input latency(writes) in milliseconds in the last {p_i_m} minutes"},
            "ceph_osd_op_write_latency_max_ms" : {"query": f"max(rate(ceph_osd_op_w_latency_sum[{p_i_m}m]) or vector(0) / on (ceph_daemon) rate(ceph_osd_op_w_latency_count[{p_i_m}m]) * 1000)",
                                                  "help" : f"Maximum of cluster input latency(writes) in milliseconds  in the last {p_i_m} minutes"},
            # help text fixed: this is the minimum, not the maximum
            "ceph_osd_op_write_latency_min_ms" : {"query": f"min(rate(ceph_osd_op_w_latency_sum[{p_i_m}m]) or vector(0) / on (ceph_daemon) rate(ceph_osd_op_w_latency_count[{p_i_m}m]) * 1000)",
                                                  "help" : f"Minimum of cluster input latency(writes) in milliseconds in the last {p_i_m} minutes"},
            "ceph_physical_device_latency_reads_ms"    : {"query": 'node_disk_read_time_seconds_total / node_disk_reads_completed_total * on (instance, device) group_left(ceph_daemon) label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)") * 1000',
                                                        "help" : "Read latency in milliseconds per physical device used by ceph OSD daemons"},
            "ceph_physical_device_latency_writes_ms"   : {"query": 'node_disk_write_time_seconds_total / node_disk_writes_completed_total * on (instance, device) group_left(ceph_daemon) label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)") * 1000',
                                                        "help" : "Write latency in milliseconds per physical device used by ceph OSD daemons"},
            "ceph_physical_device_read_iops"           : {"query": 'node_disk_reads_completed_total * on (instance, device) group_left(ceph_daemon)  label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)")',
                                                        "help" : "Read operations per second per physical device used by ceph OSD daemons"},
            "ceph_physical_device_write_iops"          : {"query": 'node_disk_writes_completed_total * on (instance, device) group_left(ceph_daemon)  label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)")',
                                                        "help" : "Write operations per second per physical device used by ceph OSD daemons"},
            "ceph_physical_device_read_bytes"          : {"query": 'node_disk_read_bytes_total * on (instance, device) group_left(ceph_daemon)  label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)")',
                                                        "help" : "Read bytes per physical device used by ceph OSD daemons in the last"},
            "ceph_physical_device_written_bytes"       : {"query": 'node_disk_written_bytes_total * on (instance, device) group_left(ceph_daemon)  label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)")',
                                                        "help" : "Write bytes per physical device used by ceph OSD daemons in the last"},
            "ceph_physical_device_utilization_seconds" : {"query": '(node_disk_io_time_seconds_total * on (instance, device) group_left(ceph_daemon)  label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)")) * on (ceph_daemon) group_left(device_class) ceph_osd_metadata',
                                                          "help":"Seconds total of Input/Output operations per physical device used by ceph OSD daemons"},
            "ceph_pool_objects"     : {"query": "ceph_pool_objects * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help": "Number of Ceph pool objects per Ceph pool"},
            # help fixed to be an f-string: {p_i_m} was previously emitted literally
            "ceph_pool_write_iops"  : {"query": f"rate(ceph_pool_wr[{p_i_m}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of increase of write operations per Ceph pool during the last {p_i_m} minutes"},
            "ceph_pool_read_iops"   : {"query": f"rate(ceph_pool_rd[{p_i_m}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of increase of read operations per Ceph pool during the last {p_i_m} minutes"},
            "ceph_pool_write_bytes" : {"query": f"rate(ceph_pool_wr_bytes[{p_i_m}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of increase of written bytes per Ceph pool during the last {p_i_m} minutes"},
            "ceph_pool_read_bytes"  : {"query": f"rate(ceph_pool_rd_bytes[{p_i_m}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of increase of read bytes per Ceph pool during the last {p_i_m} minutes"},
            "ceph_pg_activating"    : {"query": f"rate(ceph_pg_activating[{p_i_m}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of Placement Groups activated per Ceph pool during the last {p_i_m} minutes"},
            "ceph_pg_backfilling"   : {"query": f"rate(ceph_pg_backfilling[{p_i_m}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of Placement Groups backfilled per Ceph pool during the last {p_i_m} minutes"},
            "ceph_pg_creating"      : {"query": f"rate(ceph_pg_creating[{p_i_m}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of Placement Groups created per Ceph pool during the last {p_i_m} minutes"},
            "ceph_pg_recovering"    : {"query": f"rate(ceph_pg_recovering[{p_i_m}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help" : f"Per-second average rate of Placement Groups recovered per Ceph pool during the last {p_i_m} minutes"},
            "ceph_pg_deep"          : {"query": f"rate(ceph_pg_deep[{p_i_m}m]) * on(pool_id) group_left(instance, name) ceph_pool_metadata",
                                       "help":  f"Per-second average rate of Placement Groups deep scrubbed per Ceph pool during the last {p_i_m} minutes"},
            "ceph_rgw_avg_get_latency_ms" : {"query": f'(rate(ceph_rgw_get_initial_lat_sum[{p_i_m}m]) or vector(0)) * 1000 / rate(ceph_rgw_get_initial_lat_count[{p_i_m}m]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata',
                                             "help" : f"Average latency in milliseconds for GET operations per Ceph RGW daemon during the last {p_i_m} minutes"},
            "ceph_rgw_avg_put_latency_ms" : {"query": f"(rate(ceph_rgw_put_initial_lat_sum[{p_i_m}m]) or vector(0)) * 1000 / rate(ceph_rgw_put_initial_lat_count[{p_i_m}m]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata",
                                             "help" : f"Average latency in milliseconds for PUT operations per Ceph RGW daemon during the last {p_i_m} minutes"},
            "ceph_rgw_requests_per_second": {"query": f'sum by (rgw_host) (label_replace(rate(ceph_rgw_req[{p_i_m}m]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))',
                                             "help" : f"Request operations per second per Ceph RGW daemon during the last {p_i_m} minutes"},
            "ceph_rgw_get_size_bytes" :     {"query": f'label_replace(sum by (instance_id) (rate(ceph_rgw_get_b[{p_i_m}m])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
                                             "help" : f"Per-second average rate of GET operations size per Ceph RGW daemon during the last {p_i_m} minutes"},
            "ceph_rgw_put_size_bytes" :     {"query": f'label_replace(sum by (instance_id) (rate(ceph_rgw_put_b[{p_i_m}m])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
                                             "help" : f"Per-second average rate of PUT operations size per Ceph RGW daemon during the last {p_i_m} minutes"},
            "ceph_mds_read_requests_per_second"   : {"query": f'rate(ceph_objecter_op_r{{ceph_daemon=~"mds.*"}}[{p_i_m}m])',
                                                     "help" : f"Per-second average rate of read requests per Ceph MDS daemon during the last {p_i_m} minutes"},
            "ceph_mds_write_requests_per_second"  : {"query": f'rate(ceph_objecter_op_w{{ceph_daemon=~"mds.*"}}[{p_i_m}m])',
                                                     "help" : f"Per-second average rate of write requests per Ceph MDS daemon during the last {p_i_m} minutes"},
            "ceph_mds_client_requests_per_second" : {"query": f'rate(ceph_mds_server_handle_client_request[{p_i_m}m])',
                                                     "help" : f"Per-second average rate of client requests per Ceph MDS daemon during the last {p_i_m} minutes"},
            "ceph_mds_reply_latency_avg_ms" : {"query": f'avg(rate(ceph_mds_reply_latency_sum[{p_i_m}m]) or vector(0) / on (ceph_daemon) rate(ceph_mds_reply_latency_count[{p_i_m}m]) * 1000)',
                                               "help" : f"Average of the per-second average rate of reply latency(seconds) per Ceph MDS daemon during the last {p_i_m} minutes"},
            "ceph_mds_reply_latency_max_ms" : {"query": f'max(rate(ceph_mds_reply_latency_sum[{p_i_m}m]) or vector(0) / on (ceph_daemon) rate(ceph_mds_reply_latency_count[{p_i_m}m]) * 1000)',
                                               "help" : f"Maximum of the per-second average rate of reply latency(seconds) per Ceph MDS daemon during the last {p_i_m} minutes"},
            "ceph_mds_reply_latency_min_ms" : {"query": f'min(rate(ceph_mds_reply_latency_sum[{p_i_m}m]) or vector(0) / on (ceph_daemon) rate(ceph_mds_reply_latency_count[{p_i_m}m]) * 1000)',
                                               "help" : f"Minimum of the per-second average rate of reply latency(seconds) per Ceph MDS daemon during the last {p_i_m} minutes"},
            "hw_cpu_busy"                          : {"query": f"1- rate(node_cpu_seconds_total{{mode='idle'}}[{p_i_m}m])",
                                                      "help" : f"Percentage of CPU utilization per core during the last {p_i_m} minutes"},
            "hw_ram_utilization"                   : {"query": f'(node_memory_MemTotal_bytes -(node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_Slab_bytes))/node_memory_MemTotal_bytes',
                                                      "help" : "RAM utilization"},
            "hw_node_physical_disk_read_ops_rate"  : {"query": f"rate(node_disk_reads_completed_total[{p_i_m}m])",
                                                      "help" : f"Per-second average rate of read operations per physical storage device in the host during the last {p_i_m} minutes"},
            "hw_node_physical_disk_write_ops_rate" : {"query": f"rate(node_disk_writes_completed_total[{p_i_m}m])",
                                                      "help" : f"Per-second average rate of write operations per physical storage device in the host during the last {p_i_m} minutes"},
            "hw_disk_utilization_rate"             : {"query": f"rate(node_disk_io_time_seconds_total[{p_i_m}m])",
                                                      "help" : f"Per-second average rate of input/output operations time(seconds) per physical storage device in the host during the last {p_i_m} minutes"},
            "hw_network_bandwidth_receive_load_bytes" : {"query": f"rate(node_network_receive_bytes_total[{p_i_m}m])",
                                                         "help" : f"Per-second average rate of received bytes per network card in the host during the last {p_i_m} minutes"},
            "hw_network_bandwidth_transmit_load_bytes": {"query": f"rate(node_network_transmit_bytes_total[{p_i_m}m])",
                                                         "help" : f"Per-second average rate of transmitted bytes per network card in the host during the last {p_i_m} minutes"},
            "ceph_nvmeof_gateway_total"                        : {"query": "count by(group) (ceph_nvmeof_gateway_info) or vector(0)",
                                                                  "help" : "Number of Ceph NVMe-oF daemons or gateways running"},
            "ceph_nvmeof_subsystem_total"                      : {"query": "count by(group) (count by(nqn,group) (ceph_nvmeof_subsystem_metadata))",
                                                                  "help" : "Number of Ceph NVMe-oF subsystems running"},
            "ceph_nvmeof_reactor_total"                        : {"query": 'max by(group) (max by(instance) (count by(instance) (ceph_nvmeof_reactor_seconds_total{mode="busy"})) * on(instance) group_right ceph_nvmeof_gateway_info)',
                                                                  "help" : "Number of reactors per gateway"},
            "ceph_nvmeof_gateway_reactor_cpu_seconds_total"    : {"query": f'max by(group) (avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{{mode="busy"}}[{p_i_m}m])) * on(instance) group_right ceph_nvmeof_gateway_info)',
                                                                   "help" : "Highest gateway CPU load"},
            "ceph_nvmeof_namespaces_total"                     : {"query": "max by(group) (count by(instance) (count by(bdev_name,instance) (ceph_nvmeof_bdev_metadata )) * on(instance) group_right ceph_nvmeof_gateway_info)",
                                                                  "help" : "Total number of namespaces"},
            "ceph_nvmeof_capacity_exported_bytes_total"        : {"query": "topk(1,sum by(instance) (ceph_nvmeof_bdev_capacity_bytes)) * on(instance) group_left(group) ceph_nvmeof_gateway_info",
                                                                  "help" : "Ceph NVMe-oF total capacity exposed"},
            # key fixed: had an accidental trailing space
            "ceph_nvmeof_clients_connected_total"              : {"query": "count by(instance) (sum by(instance,host_nqn) (ceph_nvmeof_host_connection_state == 1)) * on(instance) group_left(group) ceph_nvmeof_gateway_info",
                                                                  "help" : "Number of clients connected to Ceph NVMe-oF"},
            # key fixed: had an accidental trailing space
            "ceph_nvmeof_gateway_iops_total"                   : {"query": f"sum by(instance) (rate(ceph_nvmeof_bdev_reads_completed_total[{p_i_m}m]) + rate(ceph_nvmeof_bdev_writes_completed_total[{p_i_m}m])) * on(instance) group_left(group) ceph_nvmeof_gateway_info",
                                                                  "help" : "IOPS per Ceph NVMe-oF gateway"},
            "ceph_nvmeof_subsystem_iops_total"                 : {"query": f"sum by(group,nqn) (((rate(ceph_nvmeof_bdev_reads_completed_total[{p_i_m}m]) + rate(ceph_nvmeof_bdev_writes_completed_total[{p_i_m}m])) * on(instance,bdev_name) group_right ceph_nvmeof_subsystem_namespace_metadata) * on(instance) group_left(group) ceph_nvmeof_gateway_info)",
                                                                  "help" : "IOPS per Ceph NVMe-oF subsystem"},
            "ceph_nvmeof_gateway_throughput_bytes_total"       : {"query": f"sum by(instance) (rate(ceph_nvmeof_bdev_read_bytes_total[{p_i_m}m]) + rate(ceph_nvmeof_bdev_written_bytes_total[{p_i_m}m])) * on(instance) group_left(group) ceph_nvmeof_gateway_info",
                                                                  "help" : "Throughput per Ceph NVMe-oF gateway"},
            "ceph_nvmeof_subsystem_throughput_bytes_total"     : {"query": f"sum by(group,nqn) (((rate(ceph_nvmeof_bdev_read_bytes_total[{p_i_m}m]) + rate(ceph_nvmeof_bdev_written_bytes_total[{p_i_m}m])) * on(instance,bdev_name) group_right ceph_nvmeof_subsystem_namespace_metadata) * on(instance) group_left(group) ceph_nvmeof_gateway_info)",
                                                                  "help" : "Throughput per Ceph NVMe-oF subsystem"},
            "ceph_nvmeof_gateway_read_avg_latency_seconds"     : {"query": f"avg by(group,instance) (((rate(ceph_nvmeof_bdev_read_seconds_total[{p_i_m}m]) / rate(ceph_nvmeof_bdev_reads_completed_total[{p_i_m}m])) > 0) * on(instance) group_left(group) ceph_nvmeof_gateway_info)",
                                                                  "help" : "Read latency average in seconds per Ceph NVMe-oF gateway"},
            # key fixed: had an accidental trailing space
            "ceph_nvmeof_gateway_write_avg_latency_seconds"    : {"query": f"avg by(group,instance) (((rate(ceph_nvmeof_bdev_write_seconds_total[{p_i_m}m]) / rate(ceph_nvmeof_bdev_writes_completed_total[{p_i_m}m])) > 0) * on(instance) group_left(group) ceph_nvmeof_gateway_info)",
                                                                  "help":  "Write average in seconds per Ceph NVMe-oF gateway"},
            "ceph_nvmeof_gateway_read_p95_latency_seconds"     : {"query": f"quantile by(group,instance) (.95,((rate(ceph_nvmeof_bdev_read_seconds_total[{p_i_m}m]) / (rate(ceph_nvmeof_bdev_reads_completed_total[{p_i_m}m]) >0)) * on(instance) group_left(group) ceph_nvmeof_gateway_info))",
                                                                  "help":  "Read latency for 95{%} of the Ceph NVMe-oF gateways"},
            "ceph_nvmeof_gateway_write_p95_latency_seconds"    : {"query": f"quantile by(group,instance) (.95,((rate(ceph_nvmeof_bdev_write_seconds_total[{p_i_m}m]) / (rate(ceph_nvmeof_bdev_writes_completed_total[{p_i_m}m]) >0)) * on(instance) group_left(group) ceph_nvmeof_gateway_info))",
                                                                  "help":  "Write latency for 95{%} of the Ceph NVMe-oF gateways"}
        }

        errors = []
        performance_metrics = {}
        t1 = time.time()
        try:
            prometheus = Prometheus(self.agent)

            # Metrics retrieval: failures on individual queries are counted
            # and reported in bulk, not fatal.
            query_errors = 0
            for k, v in queries.items():
                try:
                    data = prometheus.query(v["query"])
                    # remove single metric timestamps ([ts, value] -> [value])
                    try:
                        for metric in data['data']['result']:
                            metric["value"] = metric["value"][1:]
                    except Exception:
                        pass
                    performance_metrics[k] = {"result": data['data']['result']}
                except Exception as e:
                    self.agent.log.error(f"Error reading performance metric \"{k}\": {e}")
                    query_errors += 1
                    continue

            if query_errors:
                errors.append(f"Error getting metrics from Prometheus. Got {query_errors} errors. Active Ceph Manager log contains details")

            # Prometheus server health: flag any scrape target that is not up.
            prometheus_status = prometheus.status()
            targets_down = list(filter(lambda x: x['health'] != 'up', prometheus_status['data']['activeTargets']))
            if targets_down:
                errors.append(f"Error(scrape targets not up): Not able to retrieve metrics from {targets_down} targets. Review Prometheus server status")

            # Ceph status
            performance_metrics["ceph_health_detail"] = json.loads(self.agent.get('health')['json'])
        except Exception as e:
            msg = f"Error collecting performance metrics: {e}"
            self.agent.log.error(msg)
            errors.append(msg)

        performance_metrics["ceph_version"] = self.agent.version
        total_time = round((time.time() - t1) * 1000, 2)
        performance_metrics['time_to_get_performance_metrics_ms'] = total_time
        self.agent.log.debug(f"Time to get performance metrics: {total_time} ms")
        performance_metrics['timestamp'] = t1
        performance_metrics['human_timestamp'] = datetime.fromtimestamp(t1).strftime('%Y-%m-%d %H:%M:%S')

        # Performance report status: "OK" or the accumulated error lines.
        if errors:
            performance_metrics["status"] = "\n".join(errors)
        else:
            performance_metrics["status"] = "OK"

        # performance data serialized to a JSON string, zstd-compressed and
        # base64-encoded for transport
        performance_json = json.dumps(performance_metrics)
        cctx = zstandard.ZstdCompressor()
        compressed = cctx.compress(performance_json.encode('utf-8'))
        compressed_base64 = base64.b64encode(compressed).decode('utf-8')

        return {"perfstats": {
                            "file_stamp": performance_metrics['human_timestamp'],
                            "file_stamp_ms": int(t1 * 1000),
                            "local_file_stamp": performance_metrics['human_timestamp'],
                            "nd_stats": compressed_base64,
                            "ng_stats": "",
                            "nm_stats": "",
                            "nn_stats": "",
                            "nv_stats": "",
                            "node_number": 1,     # because IBM Call Home reqs.
                            "nodes_in_cluster": 1 # because IBM Call Home reqs.
                            }
                }
+
+
diff --git a/src/pybind/mgr/call_home_agent/report_status_alerts.py b/src/pybind/mgr/call_home_agent/report_status_alerts.py
new file mode 100644 (file)
index 0000000..7396ca4
--- /dev/null
@@ -0,0 +1,120 @@
+from .report import Report, ReportTimes, EventGeneric
+from .prometheus import Prometheus
+import time
+import json
+import requests
+from typing import Optional
+
class ReportStatusAlerts(Report):
    """'status' report that sends only the delta of firing/resolved alerts."""

    # Prometheus API returns all alerts. We want to send only deltas in the alerts
    # report - i.e. send a *new* alert that has been fired since the last report
    # was sent, and send a "resolved" notification when an alert is removed from
    # the prometheus API.
    # To do so we keep a list of alerts ("sent_alerts") we have already sent, and
    # use that to create a delta report in generate_alerts_report(). The alert
    # report is not sent if there are no deltas.
    # `ceph callhome reset alerts` zeros out sent_alerts list and therefore the
    # next report will contain the relevant alerts that are fetched from the
    # Prometheus API.
    sent_alerts: dict = {}

    def __init__(self, agent) -> None:
        super().__init__(agent, 'status')

    def compile(self) -> Optional[dict]:
        """Build the report dict, or return None when there is no alert delta."""
        report_times = ReportTimes()
        report = self.get_report_headers(report_times)
        event = EventStatusAlerts(self.agent).generate(report_times)
        # If there are no alerts to send then return and dont send the report
        if not event.has_content:
            return None

        report['events'].append(event.data)
        return report

    @classmethod
    def resetAlerts(cls, mock: bool = False) -> None:
        """Forget which alerts were already sent to Call Home.

        Bug fix: the previous version assigned a *local* variable named
        ``sent_alerts`` (leaving the class-level state untouched) and called
        an undefined bare ``alert_uid`` function, so the reset never worked
        and the mock path raised NameError.
        """
        if mock:
            # If there are no relevant alerts in the cluster, an "alerts" report will not be sent.
            # "mock" is useful in this case, to allow the user to send a dummy "alerts" report to Call Home.
            mocked_alert = {'labels': {'label': 'test'}, 'activeAt': '42', 'value': '17'}
            # Same uid scheme as EventStatusAlerts.alert_uid()
            uid = json.dumps(mocked_alert['labels'], sort_keys=True) + mocked_alert['activeAt'] + mocked_alert['value']
            cls.sent_alerts = {uid: mocked_alert}
        else:
            cls.sent_alerts = {}
+
class EventStatusAlerts(EventGeneric):
    """'status' event carrying the delta of Prometheus alerts since the last report."""

    def generate(self, report_times: ReportTimes) -> "EventStatusAlerts":
        """Fill in the event headers/body and attach the alert delta as content.

        Sets ``self.has_content`` so the caller can skip sending when there is
        no delta.  Returns ``self`` for chaining (the previous ``-> None``
        annotation was wrong).
        """
        super().generate('status', 'ceph_alerts', 'Ceph cluster alerts', report_times)

        self.data["body"]["event_transaction_id"] = f"IBM_event_RedHatMarine_ceph_{self.agent.ceph_cluster_id}_{report_times.time_ms}_status_event"
        self.data["body"]["complete"] = True

        # if the status event contains alerts we add a boolean in the body to help with analytics
        self.data["body"]["alert"] = True
        # Call Home requires the 'state' attribute in the 'body' section
        self.data["body"]["state"] = "Ok"
        content = self.gather()
        self.has_content = bool(content)
        self.set_content(content)
        return self

    def gather(self) -> Optional[dict]:
        """Return {'new_alerts': [...], 'resolved_alerts': [...]}, or None.

        Returning None (when nothing changed since the last report) prevents
        the report from being sent; hence ``Optional[dict]``, not ``dict``.
        """
        # Filter the alert list
        current_alerts_list = list(filter(self.is_alert_relevant, self.get_prometheus_alerts()))

        current_alerts = {self.alert_uid(a): a for a in current_alerts_list}
        # Find all new alerts - alerts that are currently active but were not sent until now (not in sent_alerts)
        new_alerts = [a for uid, a in current_alerts.items() if uid not in ReportStatusAlerts.sent_alerts]
        resolved_alerts = [a for uid, a in ReportStatusAlerts.sent_alerts.items() if uid not in current_alerts]

        ReportStatusAlerts.sent_alerts = current_alerts
        if len(new_alerts) == 0 and len(resolved_alerts) == 0:
            return None  # This will prevent the report from being sent
        alerts_to_send = {'new_alerts': new_alerts, 'resolved_alerts': resolved_alerts}
        return alerts_to_send

    def alert_uid(self, alert: dict) -> str:
        """
        Returns a unique string identifying this alert
        """
        return json.dumps(alert['labels'], sort_keys=True) + alert['activeAt'] + alert['value']

    def is_alert_relevant(self, alert: dict) -> bool:
        """
        Returns True if this alert should be sent, False if it should be filtered out of the report
        """
        state = alert.get('state', '')
        severity = alert.get('labels', {}).get('severity', '')

        return state == 'firing' and severity == 'critical'

    def get_prometheus_alerts(self) -> list:
        """
        Returns a list of all the alerts currently active in Prometheus.

        On any failure a synthetic critical alert describing the fetch error
        is returned instead, so the problem surfaces in Call Home.
        """
        try:
            prometheus = Prometheus(self.agent)
            resp = prometheus.get("alerts")
            if 'data' not in resp or 'alerts' not in resp['data']:
                raise Exception(f"Prometheus returned a bad reply: {resp}")

            alerts = resp['data']['alerts']
            return alerts
        except Exception as e:
            self.agent.log.error(f"Can't fetch alerts from Prometheus: {e}")
            return [{
                    'labels': {
                        'alertname': 'callhomeErrorFetchPrometheus',
                        'severity': 'critical'
                    },
                    'annotations': {
                        'description': str(e)
                    },
                    'state': 'firing',
                    # 'activeAt' and 'value' are here for alert_uid() to work. They should be '0' so that we won't send this alert again and again
                    'activeAt': '0',
                    'value': '0'
                }]
+
diff --git a/src/pybind/mgr/call_home_agent/report_status_health.py b/src/pybind/mgr/call_home_agent/report_status_health.py
new file mode 100644 (file)
index 0000000..ce6a698
--- /dev/null
@@ -0,0 +1,113 @@
+from .report import Report, ReportTimes, EventGeneric
+import time
+import json
+import math
+from .prometheus import Prometheus
+
class ReportStatusHealth(Report):
    """Periodic 'status' report whose single event is EventStatusHealth."""

    def __init__(self, agent) -> None:
        # Register the one event class this report compiles on every cycle.
        event_classes = [EventStatusHealth]
        super().__init__(agent, 'status', event_classes)
+
class EventStatusHealth(EventGeneric):
    """'status' event carrying `ceph status`, health detail and support metrics."""

    def generate(self, report_times: ReportTimes) -> "EventStatusHealth":
        """Fill in headers/body, gather the cluster status and return self."""
        super().generate('status', 'ceph_health', 'Ceph cluster status and health', report_times)

        self.data["body"]["event_transaction_id"] = f"IBM_event_RedHatMarine_ceph_{self.agent.ceph_cluster_id}_{report_times.time_ms}_status_event"
        self.data["body"]["complete"] = True
        content = self.gather()
        self.set_content(content)
        try:
            status = content['status']['health']['status']
        except (KeyError, TypeError):
            # gather() returned an error/exception payload, or the health
            # section is missing the expected keys. (Was a bare `except:`,
            # which would also swallow KeyboardInterrupt/SystemExit.)
            status = "Unknown status"
        self.data["body"]["state"] = status
        return self

    def gather(self) -> dict:
        """Return {'status': <ceph status + health detail + support metrics>}.

        On failure the 'status' value is an {'error': ...} or
        {'exception': ...} dict instead, so the event is still well-formed.
        """
        r, outb, outs = self.agent.mon_command({
            'prefix': 'status',
            'format': 'json'
        })
        if r:
            error = f"status command failed: {outs}"
            self.agent.log.error(error)
            return {'status': {'error': error}}
        try:
            status_dict = json.loads(outb)
            status_dict["ceph_version"] = self.agent.version
            status_dict["health_detail"] = json.loads(self.agent.get('health')['json'])
            status_dict["support"] = self.get_support_metrics()
            status_dict["support"]["health_status"] = status_dict["health_detail"].get("status", "")
            status_dict["support"]["health_summary"] = self.get_health_summary(status_dict["health_detail"])
            return {'status': status_dict}
        except Exception as e:
            self.agent.log.exception(str(e))
            return {'status': {'exception': str(e)}}

    def get_health_summary(self, ceph_health: dict) -> str:
        """
        Stringify Ceph's health status
        """
        try:
            health_items = []
            for error_key, error_details in ceph_health["checks"].items():
                details = "\n".join([item["message"] for item in error_details.get("detail", [])])
                health_items.append(f'{error_key}({error_details["severity"]}): {error_details["summary"]["message"]}\n{details}')
            return "\n\n".join(health_items)
        except Exception as e:
            return f"Error getting health status: {e}"

    def get_support_metrics(self) -> dict:
        """
        Collect cluster metrics needed for Ceph support team tools
        """
        support_metrics = {}
        s_i_m = math.ceil(self.agent.interval_status_report_seconds / 60)  # Status Interval in Minutes
        queries = {
            'total_capacity_bytes': 'sum(ceph_osd_stat_bytes)',
            'total_raw_usage_bytes': 'sum(ceph_osd_stat_bytes_used)',
            'usage_percentage': '(sum(ceph_osd_stat_bytes_used)/sum(ceph_osd_stat_bytes)) * 100',
            'slow_ops_total': 'sum(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"})',
            'osds_total_with_slow_ops': 'count(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"}>0) or on() vector(0)',
            'pg_total': 'sum(ceph_pg_total)',
            'pg_active': 'sum(ceph_pg_active)',
            'pg_clean': 'sum(ceph_pg_clean)',
            'pg_degraded': 'sum(ceph_pg_degraded)',
            'pg_unknown': 'sum(ceph_pg_unknown)',
            'pg_down': 'sum(ceph_pg_down)',
            'pg_scrubbing': 'sum(ceph_pg_scrubbing)',
            'pg_deep_scrubbing': 'sum(ceph_pg_deep)',
            'network_receive_errors': f'avg(increase(node_network_receive_errs_total{{device!="lo"}}[{s_i_m}m]))',
            'network_send_errors': f'avg(increase(node_network_transmit_errs_total{{device!="lo"}}[{s_i_m}m]))',
            'network_receive_packet_drops': f'avg(increase(node_network_receive_drop_total{{device!="lo"}}[{s_i_m}m]))',
            'network_transmit_packet_drops': f'avg(increase(node_network_transmit_drop_total{{device!="lo"}}[{s_i_m}m]))',
            'inconsistent_mtu': 'sum(node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==  scalar(max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=  quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))  )or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==  scalar(min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=  quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))) or vector(0))',
            'pool_number': 'count(ceph_pool_bytes_used)',
            'raw_capacity_bytes': 'sum(ceph_osd_stat_bytes)',
            'raw_capacity_consumed_bytes': 'sum(ceph_pool_bytes_used)',
            'logical_stored_bytes': 'sum(ceph_pool_stored)',
            'pool_growth_bytes': f'sum(delta(ceph_pool_stored[{s_i_m}m]))',
            'pool_bandwidth_bytes': f'sum(rate(ceph_pool_rd_bytes[{s_i_m}m]) + rate(ceph_pool_wr_bytes[{s_i_m}m]))',
            'pg_per_osd_ratio': '(avg(ceph_osd_numpg)/sum(ceph_pg_total))*100',
            'monitors_number': 'count(ceph_mon_metadata)',
            'monitors_not_in_quorum_number': 'count(ceph_mon_quorum_status!=1) or on() vector(0)',
            'clock_skews_number': 'ceph_health_detail{name="MON_CLOCK_SKEW"} or on() vector(0)',
        }

        try:
            prometheus = Prometheus(self.agent)
            t1 = time.time()
            for k, v in queries.items():
                data = prometheus.query(v)
                try:
                    support_metrics[k] = float(data['data']['result'][0]['value'][1])
                except Exception as e:
                    # A single failed query must not abort the rest of the metrics.
                    self.agent.log.error(f"Error reading status metric for support \"{k}\": {e} - {data}")
            total_time = round((time.time() - t1) * 1000, 2)
            support_metrics['time_to_get_support_data_ms'] = total_time
            self.agent.log.debug(f"Time to get support data for status report: {total_time} ms")
        except Exception as e:
            self.agent.log.error(f"Error collecting support data for status report: {e}")

        return support_metrics
+
diff --git a/src/pybind/mgr/call_home_agent/report_ur_error.py b/src/pybind/mgr/call_home_agent/report_ur_error.py
new file mode 100644 (file)
index 0000000..66d79eb
--- /dev/null
@@ -0,0 +1,6 @@
+from .report import Report, ReportTimes
+
class ReportURError(Report):
    """Report used to surface an error for an unsolicited request (UR)."""

    def __init__(self, agent, report_event_id):
        # We assume that we dont need to send any event when reporting an error.
        super().__init__(agent, 'status', [])
        # Id of the UR event whose processing failed.
        self.report_event_id = report_event_id
diff --git a/src/pybind/mgr/call_home_agent/tests/response_no_pending_ur.json b/src/pybind/mgr/call_home_agent/tests/response_no_pending_ur.json
new file mode 100644 (file)
index 0000000..85f048b
--- /dev/null
@@ -0,0 +1,90 @@
+{
+  "service": "ibm_callhome_connect",
+  "version": "1.0.0.4",
+  "transaction": {
+    "event_id": "IBM-RedHatMarine-ceph-368ffc04-5319-11ee-9c69-123456789aaa-368ffc04-5319-11ee-9c69-123456789aaa_asset_event_819a108f-bde2-40bd-9516-2c6e9a623140_PJD",
+    "asset": "ceph",
+    "asset_id": "368ffc04-5319-11ee-9c69-123456789aaa",
+    "asset_type": "RedHatMarine",
+    "asset_virtual_id": "368ffc04-5319-11ee-9c69-123456789aaa",
+    "agent": "chc_ras_agent",
+    "software_level": {
+      "name": "ceph_software",
+      "vrmf": "ceph version 17.2.6 (d7ff0d10654d2280e08f1ab989c7cdf3064446a5) quincy (stable)"
+    },
+    "api_key": "mocked_api_key",
+    "event_time": "2024-06-23 19:36:48",
+    "event_time_ms": 1719171408779,
+    "message_key": "Storage_RedHat_ceph_368ffc04-5319-11ee-9c69-123456789aaa_368ffc04-5319-11ee-9c69-123456789aaa",
+    "message_key_json": {
+      "group": "Storage",
+      "category": "RedHat",
+      "type": "ceph",
+      "instance": "368ffc04-5319-11ee-9c69-123456789aaa",
+      "virtual_id": "368ffc04-5319-11ee-9c69-123456789aaa"
+    },
+    "rc": 200,
+    "events": 1,
+    "analytics_activation_id": "activation_1719171408778_rzhq8f4pxau_us-east.codeengine.appdomain.cloud_post_us-east_dev",
+    "target_space": "dev",
+    "env": {
+      "activation_id": "activation_1719171408778_rzhq8f4pxau_us-east.codeengine.appdomain.cloud_post_us-east_dev",
+      "action_name": "call_home_connect_post",
+      "namespace": "call-home-connect-service-dev",
+      "http_method": "post",
+      "geo": "us-east",
+      "url": "/api/v1",
+      "stage": "dev",
+      "message": "Successful Request"
+    },
+    "ien": 200,
+    "idb": 596
+  },
+  "events": [
+    {
+      "header": {
+        "_id": "last_contact_asset_event_IBM-RedHatMarine-ceph-368ffc04-5319-11ee-9c69-123456789aaa-0000020420805080_f376db1b-1c92-46c1-8bf1-138a061a94f5",
+        "rc": 200,
+        "component": "RedHatMarine_call_home_agent",
+        "payload": false,
+        "context": {
+          "messagetype": 1,
+          "origin": 2,
+          "timestamp": 1645737993,
+          "transid": 1645737993210
+        }
+      }
+    }
+  ],
+  "response_available": true,
+  "response_state": {
+    "transactions": {},
+    "persistence_result": {
+      "response_cos_data_success": false,
+      "response_cos_data_exception": "The specified key does not exist.",
+      "query_parameters": {
+        "Key": "dev/response/RedHatMarine/ceph/368ffc04-5319-11ee-9c69-123456789aaa/Unsolicited_Storage_Insights_RedHatMarine_ceph_Request_transaction.json",
+        "Bucket": "call-home-connect-response-data"
+      },
+      "found_detail": "Found Records [] available the time of query",
+      "missing_detail": "Missing Records [Unsolicited_Storage_Insights_RedHatMarine_ceph_Request] not available the time of query",
+      "found": 0,
+      "missing": 1
+    },
+    "filter_redaction": false,
+    "filter_records_reviewed": 0,
+    "filter_records_removed": 0,
+    "results": true,
+    "found_detail": "Found Records [] available the time of query",
+    "found": 0,
+    "missing": 1
+  },
+  "query_results_error": true,
+  "query_results_available": false,
+  "query_state": {
+    "success": false,
+    "error": true,
+    "error_text": "",
+    "results": {}
+  }
+}
diff --git a/src/pybind/mgr/call_home_agent/tests/response_yes_pending_ur.json b/src/pybind/mgr/call_home_agent/tests/response_yes_pending_ur.json
new file mode 100644 (file)
index 0000000..ede9ea2
--- /dev/null
@@ -0,0 +1,165 @@
+{
+  "service": "ibm_callhome_connect",
+  "version": "1.0.0.4",
+  "transaction": {
+    "event_id": "IBM-RedHatMarine-ceph-368ffc04-5319-11ee-9c69-123456789aaa-368ffc04-5319-11ee-9c69-123456789aaa_asset_event_819a108f-bde2-40bd-9516-2c6e9a623140_PJD",
+    "asset": "ceph",
+    "asset_id": "368ffc04-5319-11ee-9c69-123456789aaa",
+    "asset_type": "RedHatMarine",
+    "asset_virtual_id": "368ffc04-5319-11ee-9c69-123456789aaa",
+    "agent": "chc_ras_agent",
+    "software_level": {
+      "name": "ceph_software",
+      "vrmf": "ceph version 17.2.6 (d7ff0d10654d2280e08f1ab989c7cdf3064446a5) quincy (stable)"
+    },
+    "api_key": "mocked_api_key",
+    "event_time": "2024-06-23 19:39:07",
+    "event_time_ms": 1719171547044,
+    "message_key": "Storage_RedHat_ceph_368ffc04-5319-11ee-9c69-123456789aaa_368ffc04-5319-11ee-9c69-123456789aaa",
+    "message_key_json": {
+      "group": "Storage",
+      "category": "RedHat",
+      "type": "ceph",
+      "instance": "368ffc04-5319-11ee-9c69-123456789aaa",
+      "virtual_id": "368ffc04-5319-11ee-9c69-123456789aaa"
+    },
+    "rc": 200,
+    "events": 1,
+    "analytics_activation_id": "activation_1719171546991_s16zmwohf30_us-south.codeengine.appdomain.cloud_post_us-south_dev",
+    "target_space": "dev",
+    "env": {
+      "activation_id": "activation_1719171546991_s16zmwohf30_us-south.codeengine.appdomain.cloud_post_us-south_dev",
+      "action_name": "call_home_connect_post",
+      "namespace": "call-home-connect-service-dev",
+      "http_method": "post",
+      "geo": "us-south",
+      "url": "/api/v1",
+      "stage": "dev",
+      "message": "Successful Request"
+    },
+    "ien": 200,
+    "idb": 596
+  },
+  "events": [
+    {
+      "header": {
+        "_id": "last_contact_asset_event_IBM-RedHatMarine-ceph-368ffc04-5319-11ee-9c69-123456789aaa-0000020420805080_f376db1b-1c92-46c1-8bf1-138a061a94f5",
+        "rc": 200,
+        "component": "RedHatMarine_call_home_agent",
+        "payload": false,
+        "context": {
+          "messagetype": 1,
+          "origin": 2,
+          "timestamp": 1645737993,
+          "transid": 1645737993210
+        }
+      }
+    }
+  ],
+  "response_available": true,
+  "response_state": {
+    "transactions": {
+      "Unsolicited_Storage_Insights_RedHatMarine_ceph_Request": {
+        "confirmed": false,
+        "complete": false,
+        "event_transaction_id": "Unsolicited_Storage_Insights_RedHatMarine_ceph_Request",
+        "time": "2024-06-23T19:38:38.080Z",
+        "time_ms": 1719171518081,
+        "expiry": 1719430718081,
+        "expiration_mark": false,
+        "event_type": "product_request",
+        "unsolicted_request_type": "product_request",
+        "response_object": {
+          "confirmed": true,
+          "complete": false,
+          "event_transaction_id": "Unsolicited_Storage_Insights_RedHatMarine_ceph_Request",
+          "time": "2022-07-10 08:37:23",
+          "time_ms": 1657503443000,
+          "expiry": 1657504223000,
+          "event_type": "product_request",
+          "unsolicited_request_type": "product_request",
+          "asset": "ceph",
+          "type": "ceph",
+          "asset_id": "368ffc04-5319-11ee-9c69-123456789aaa",
+          "instance": "368ffc04-5319-11ee-9c69-123456789aaa",
+          "asset_type": "RedHatMarine",
+          "asset_vendor": "IBM",
+          "group": "Storage",
+          "category": "RedHatMarine",
+          "asset_virtual_id": "368ffc04-5319-11ee-9c69-123456789aaa",
+          "product_request": {
+            "event_time": "2022-07-10 08:37:23",
+            "event_time_ms": 1657503443000,
+            "connect_transaction_id": "Unsolicited_Storage_Insights_RedHatMarine_ceph_Request_PJD_TestTranGroup001_001_1657503443000",
+            "event_type": "product_request",
+            "event_source_definition": "product_request.eventnotifier.esupport.ibm.com",
+            "asset_event_detail": {
+              "body": {
+                "event_type": "product_request",
+                "inbound_requests": [
+                  {
+                    "operation": "upload_snap",
+                    "options": {
+                      "pmr": "TS1234567",
+                      "level": "3",
+                      "enable_status": "true",
+                      "version": 1
+                    }
+                  },
+                  {
+                    "operation": "upload_file",
+                    "options": {
+                      "pmr": "TS1234567",
+                      "file": "/opt/important/file",
+                      "node": "2",
+                      "enable_status": "false",
+                      "version": 1
+                    }
+                  },
+                  {
+                    "operation": "disable_si_messages",
+                    "options": {
+                      "disable": "true",
+                      "version": 1
+                    }
+                  }
+                ]
+              },
+              "header": {
+                "event_id": "Unsolicited_Storage_Insights_RedHatMarine_ceph_Request",
+                "event_time": "2022-07-11T01:01:31Z",
+                "event_time_ms": 1657501291674,
+                "event_type": "product_request",
+                "local_event_time": "2022-07-11T01:01:31Z"
+              }
+            }
+          }
+        },
+        "response_cos_data_success": true,
+        "response_cos_data_entry_count": 1,
+        "response_cos_bucket": "call-home-connect-response-data",
+        "response_cos_key": "dev/response/RedHatMarine/ceph/368afc04-5319-11ee-9c69-123456789aaa/Unsolicited_Storage_Insights_RedHatMarine_ceph_Request_transaction.json"
+      }
+    },
+    "version": "0.0.0.1",
+    "create_time": "2024-06-23T19:38:38.080Z",
+    "create_time_ms": 1719171518081,
+    "update_time": "2024-06-23T19:38:38.080Z",
+    "update_time_ms": 1719171518081,
+    "filter_redaction": false,
+    "filter_records_reviewed": 1,
+    "filter_records_removed": 0,
+    "results": true,
+    "found_detail": "Found Records [Unsolicited_Storage_Insights_RedHatMarine_ceph_Request] available the time of query",
+    "found": 1,
+    "missing": 0
+  },
+  "query_results_error": true,
+  "query_results_available": false,
+  "query_state": {
+    "success": false,
+    "error": true,
+    "error_text": "",
+    "results": {}
+  }
+}
index 640561ae3a223f5c04a6fcf401f8034b4e9af353..e3bb83429d474c1cb9ec834acbfad5fb2a55f726 100644 (file)
@@ -1,10 +1,21 @@
 import unittest
 import time
 import json
+import os
+from collections import defaultdict
 
 from unittest.mock import MagicMock, Mock, patch
 
-from call_home_agent.module import Report, exec_prometheus_query
+#from call_home_agent.module import Report
+from call_home_agent.module import CallHomeAgent
+from call_home_agent.ReportLastContact import ReportLastContact, EventLastContact
+from call_home_agent.ReportInventory import ReportInventory, EventInventory
+from call_home_agent.ReportStatusAlerts import ReportStatusAlerts
+from call_home_agent.ReportStatusHealth import ReportStatusHealth
+from call_home_agent.WorkFlowUploadSnap import WorkFlowUploadSnap
+from call_home_agent.Report import Report, ReportTimes
+import mgr_module
+import traceback
 
 TEST_JWT_TOKEN = r"eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJ0ZXN0IiwiaWF0IjoxNjkxNzUzNDM5LCJqdGkiOiIwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAwMTIzNDU2Nzg5MCJ9.0F66k81_PmKoSd9erQoxnq73760SXs8WQTd3s8pqEFY\\"
 EXPECTED_JTI = '01234567890123456789001234567890'
@@ -14,207 +25,371 @@ JWT_REG_CREDS =json.dumps(JWT_REG_CREDS_DICT)
 PLAIN_PASSWORD_REG_CREDS_DICT = {"url": "test.icr.io", "username": "test_username", "password": "plain_password"}
 
 
-def fake_content(mgr):
-    return {'inventory': {}}
class MockedMgr():
    """Stand-in base class for mgr_module.MgrModule used by the agent tests.

    TestAgent rebinds CallHomeAgent.__bases__ to this class, so every mgr
    facility the agent touches must be mocked here.
    """

    class Log:
        """Logger double: everything except debug() echoes to stdout."""

        def error(self, msg):
            print(msg)

        def warning(self, msg):
            print(msg)

        def info(self, msg):
            print(msg)

        def debug(self, msg):
            # Debug output is suppressed to keep test logs readable.
            pass

        def exception(self, msg):
            print(msg)

    class HealthChecks:
        """Minimal dict-like double: the agent only ever pops entries."""

        def pop(self, what, something):
            pass

    def __init__(self, *args, **kwargs):
        self.version = '99.9'
        self.log = self.Log()
        self.health_checks = self.HealthChecks()

    def get(self, what):
        """Canned answers for MgrModule.get(); raise for anything unmocked."""
        if what == 'mon_map':
            return {'fsid': 'mocked_fsid'}
        if what == 'health':
            return {'json': json.dumps({'health': 'mocked health text'})}
        if what in ("osd_map_crush", "devices", "df", "fs_map", "mgr_map",
                    "osd_map_tree", "osd_metadata", "osd_map", "pg_summary",
                    "service_map"):
            return {what: f"mocked {what}"}
        raise Exception(f"Unknown get what [{what}], please mock it")

    def list_servers(self):
        return ['mock_serverA', 'mock_serverB']

    def mon_command(self, command):
        health = {'health': {'status': 'mocked health status  mon_cmd'}}
        return 0, json.dumps(health), ""

    def remote(self, component, command, service_name=None, hostname=None, sos_params=None):
        """Fake orchestrator remote(); result shape depends on the command."""
        completion = MagicMock()
        completion.exception_str = ''
        if command in ['list_daemons', 'get_hosts']:
            completion.result = [Mock(hostname='daemon_hostname', labels=['_admin', 'meow'], ip='4.3.2.1', ports=[42])]
        elif command == 'sos':
            completion.result = ['sosreport_case_part1 sosreport_case_part2 sosreport_case_part3']
        else:
            completion.result = 'mocked hw status'
        return completion

    def get_module_option(self, opt_name, default=None):
        """Resolve a module option from MODULE_OPTIONS (provided by the agent class)."""
        for option in self.MODULE_OPTIONS:
            if option['name'] == opt_name:
                return option['default']
        raise Exception(f"EEEEEEEEEEEEEEEEEEEEEEEEEE Can't find Option name {opt_name}")

    def get_store(self, opt_name, default=None):
        # The mocked store is always empty: callers get their own default back.
        return default

    def set_store(self, opt_name, val):
        pass

    def set_health_checks(self, val):
        pass

    def shutdown(self):
        pass
+
def mocked_ceph_command(self, srv_type, prefix, key=None, mgr=None, detail=None):
    """Replacement for CallHomeAgent.ceph_command with canned answers.

    Only the commands the agent actually issues are mocked; anything else
    raises so a new code path fails loudly in tests.
    """
    if prefix == 'config-key get':
        if key != 'mgr/cephadm/registry_credentials':
            raise Exception(f"Unknown ceph command [{prefix}], key=[{key}], please mock it")
        return JWT_REG_CREDS
    if prefix in ('status', 'health', 'osd tree', 'report', 'osd dump', 'df'):
        return f"mocked_ceph_command {prefix}"
    raise Exception(f"Unknown ceph command [{prefix}], please mock it")
+
def mocked_requests_get(url, auth=None, data=None, headers=None, proxies=None, params=None):
    """
    Used by ReportStatusAlerts to query prometheus
    """
    # Pick the canned payload by URL fragment; default is an empty alert list.
    canned = (
        ("api/v1/query", {'data': {'result': [{'value': "1234"}]}}),
        ("api/v1/targets", {'data': {'activeTargets': [{'health': "up"}]}}),
    )
    body = {'data': {'alerts': []}}
    for fragment, payload in canned:
        if fragment in url:
            body = payload
            break
    response = Mock()
    response.json.return_value = body
    return response
+
+
original_time_time = time.time  # real clock, saved before tests patch time.time
test_object = None  # NOTE(review): presumably bound to the running TestAgent so module-level mocks can reach it — confirm where it is set
debug = False  # set True for extra tracing from the time/sleep mocks
verbose = False  # set True to print full request payloads in mocked_requests_post
+
def mock_glob(pattern: str):
    """glob.glob replacement: always returns three fixture paths next to this file."""
    print(f"mock_glob: globbing {pattern}")
    here = os.path.dirname(os.path.abspath(__file__))
    return [f"{here}/testfile{i}" for i in (1, 2, 3)]
+
+#@patch('mgr_module.MgrModule.version', '99.9')
+class TestAgent(unittest.TestCase):
+
    ########################### Time handling ############################
    def mocked_time_time(self):
        # time.time() replacement: returns the simulated clock (advanced only
        # by mocked_sleep). Once the clock passes self.test_end (when set),
        # ask the agent to shut down so its main loop — and the test — ends.
        if self.test_end and self.mocked_now > self.test_end:
            self.agent.shutdown()

        debug and print(f"#### mocked_now is {self.mocked_now}")
        return self.mocked_now
+
    def mocked_sleep(self, seconds):
        # time.sleep() replacement: instead of waiting, jump the simulated
        # clock forward so the agent's long report intervals run instantly.
        debug and print(f"#### mocked_sleep for {seconds} seconds")
        self.mocked_now += seconds
        #print("".join(traceback.format_stack()))
+
    ########################### HTTP requests ############################
    def mocked_requests_post(self, url, auth=None, data=None, headers=None, proxies=None, timeout=None):
        """requests.post replacement for the Call Home and ECuRep endpoints.

        Logs the posted payload, counts each sent event in
        ``self.sent_events["<event_type>-<component>"]`` so tests can assert
        on it, and fabricates a response per URL:

        - 'upload_tid' (ECuRep): returns ``{'id': 'upload_tid123'}``.
        - 'upload_sf' (ECuRep): returns ``self.requests_post_response``.
        - 'esupport' (Call Home): for 'last_contact' events, returns the
          pending-UR fixture while ``self.mock_requests_send_has_ur > 0``
          (optionally rewriting the PMR to simulate stale vs cooldown URs),
          otherwise the no-pending-UR fixture.
        - Anything else raises, so unmocked endpoints fail loudly.
        """
        print("vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv request.post vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv")
        print(f"  URL: {url}")
        print(f"  now: {test_object.mocked_now if test_object.mocked_now is not None else 'None'}")
        event_type = None
        if data:
            try:
                pretty = json.dumps(json.loads(data), indent=4)
                try:
                    event_type = json.loads(data)['events'][0]['header']['event_type']
                    if event_type == 'confirm_response':
                        component = 'NA'
                    else:
                        component = json.loads(data)['events'][0]['body']['component']
                    print(f"  event_type={event_type}  component={component}")
                    self.sent_events[f"{event_type}-{component}"] += 1  # so we can assertEqual on it later
                except:
                    print('Data does not contain an event type')
                verbose and print(f"Data: {pretty}")
            except:
                print(f"Data: {data}")
        print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")

        m = Mock()
        m.raise_for_status.return_value = None
        m.text = json.dumps(self.requests_post_response)
        m.json.return_value = json.loads(m.text)
        if 'upload_tid' in url:  # ecurep
            m.json.return_value = {'id': 'upload_tid123'}
            return m
        elif 'upload_sf' in url:  # ecurep
            return m
        elif 'esupport' in url:  # call home
            if event_type == 'last_contact':
                if self.mock_requests_send_has_ur:
                    self.mock_requests_send_has_ur -= 1
                    print("mocked_requests_post(): Returning yes UR")
                    m.text = self.mocked_last_contact_response_yes_ur
                    if self.mock_requests_cooldown_pmr:
                        # replace the pmr so that it will be different UR for stale, but same for cooldown
                        new_pmr = self.mock_requests_cooldown_pmr.pop()
                        m.text = self.mocked_last_contact_response_yes_ur.replace('TS1234567', new_pmr)
                        print(f"####### replacing PMR to {new_pmr}")
                else:
                    m.text = self.mocked_last_contact_response_no_ur
                m.json.return_value = json.loads(m.text)
            return m
        else:
            raise Exception(f"Unknown mocked_requests_post URL [{url}], please mock it")
    ######################################################################
+
    def mock_mgr(self):
        """Wire the agent to the test doubles.

        Rebinds CallHomeAgent's base class to MockedMgr (so no real
        mgr_module machinery runs) and patches every external entry point the
        agent uses: ceph_command, the snap diags folder, secrets, HTTP
        requests, glob and file-system calls. Patches are started and never
        explicitly stopped; they live for the duration of the test process.
        """

        CallHomeAgent.__bases__ = (MockedMgr,)
        #patch('mgr_module.MgrModule.version', '99.9').start()
        patch('call_home_agent.module.CallHomeAgent.ceph_command', mocked_ceph_command).start()
        patch('call_home_agent.WorkFlowUploadSnap.DIAGS_FOLDER', '/tmp').start()
        patch('call_home_agent.module.CallHomeAgent.get_secrets',
              return_value={'api_key': 'mocked_api_key',
                            'private_key': 'mocked_private_key',
                            'ecurep_transfer_id': 'mocked_ecurep_transfer_id',
                            'ecurep_password': 'mocked_ecurep_password'}
              ).start()


        patch('requests.post', self.mocked_requests_post).start()
        patch('requests.get', mocked_requests_get).start()
        patch('glob.glob', mock_glob).start()
        patch('os.remove', Mock()).start()
        patch('os.path.getsize', Mock(return_value=42)).start()
 
-class TestReport(unittest.TestCase):
     def setUp(self):
-        """ A test report is created to be used in each test
-        """
-        testMgr = get_test_manager()
-        self.patcher = patch('call_home_agent.dataDicts.get_settings')
-        self.mock_settings = self.patcher.start()
-
-        self.mock_settings.return_value = {'api_key': b'api_key',
-                                      'private_key': b'private_key'}
-
-        self.report = Report('inventory',
-                            'ceph_inventory',
-                            'Ceph cluster composition',
-                            'AB54321',
-                            'ibm_tenant_id',
-                            fake_content,
-                            "http://chesurl.com",
-                            "",
-                            15,
-                            testMgr)
-
-    @patch('call_home_agent.dataDicts.ceph_command')
-    def test_content(self, mock_ceph_command):
-
-        """ Verify if some strategic fields contains the right info
-        """
-        mock_ceph_command.return_value = JWT_REG_CREDS
-
-        report = self.report.generate_report()
-
-        # header fields
-        self.assertEqual(report['agent'], "RedHat_Marine_firmware_agent")
-        self.assertNotEqual(report['api_key'], "")
-        self.assertNotEqual(report['private_key'], "")
-        self.assertEqual(report['asset'], "ceph")
-        self.assertEqual(report['analytics_event_source_type'], "asset_event")
-        self.assertEqual(report['analytics_group'], "Storage")
-        self.assertEqual(report['analytics_category'], "RedHatMarine")
-
-        # events list
-        events = report['events']
-        self.assertEqual(len(events), 1)
-
-        # event details
-        event = events[0]
-        self.assertTrue('header' in event.keys())
-        self.assertTrue('body' in event.keys())
-        self.assertEqual(event['header']['event_type'], self.report.report_type)
-        self.assertEqual(event['header']['tenant_id'], 'ibm_tenant_id')
-        self.assertEqual(event['body']['component'], 'ceph_inventory')
-
-        # event payload not empty
-        self.assertEqual(event['body']['payload']['content'], fake_content(MagicMock()))
-
-    @patch('call_home_agent.dataDicts.ceph_command')
-    def test_jti_from_jwt(self, mock_ceph_command):
-        """ Extract jwt unique identifier from container registry
-        JWT user password
-        """
-        mock_ceph_command.return_value = JWT_REG_CREDS
-        report = self.report.generate_report()
-        event = report['events'][0]
-        self.assertEqual(event['body']['payload']['jti'], EXPECTED_JTI)
-
-    @patch('call_home_agent.dataDicts.ceph_command')
-    def test_jti_from_jwt_not_available(self, mock_ceph_command):
-        """ Not able to extract jwt unique identifier from container registry
-            JWT user password.
-            Or the registry url is not the expected one
-        """
-        # password is not a JWT token
-        testMgr = get_test_manager()
-        mock_ceph_command.return_value = json.dumps(PLAIN_PASSWORD_REG_CREDS_DICT)
-        report = Report('inventory',
-                        'ceph_inventory',
-                        'Ceph cluster composition',
-                        'AB54321',
-                        'ibm_tenant_id',
-                        fake_content,
-                        "http://chesurl.com",
-                        "",
-                        15,
-                        testMgr)
-        report_dict = report.generate_report()
-        event = report_dict['events'][0]
-        self.assertEqual(event['body']['payload']['jti'], "")
-
-        # Url does not match the accepted registry url pattern
-        JWT_REG_CREDS_DICT['url'] = "quay.io/user"
-        mock_ceph_command.return_value = json.dumps(JWT_REG_CREDS_DICT)
-        report = Report('inventory',
-                        'ceph_inventory',
-                        'Ceph cluster composition',
-                        'AB54321',
-                        'ibm_tenant_id',
-                        fake_content,
-                        "http://chesurl.com",
-                        "",
-                        15,
-                        testMgr)
-        report_dict = report.generate_report()
-        event = report_dict['events'][0]
-        self.assertEqual(event['body']['payload']['jti'], "")
-
-    @patch('call_home_agent.dataDicts.ceph_command')
-    def test_valid_registry_urls_for_jti(self, mock_ceph_command):
-        testMgr = get_test_manager()
-        test_credentials = JWT_REG_CREDS_DICT
-        for test_url in ["cp.icr.io", "cp.icr.io/cp", "cp.stg.icr.io", "cp.stg.icr.io/cp"]:
-            test_credentials["url"] = test_url
-            mock_ceph_command.return_value = json.dumps(test_credentials)
-            report = Report('inventory',
-                            'ceph_inventory',
-                            'Ceph cluster composition',
-                            'AB54321',
-                            'ibm_tenant_id',
-                            fake_content,
-                            "http://chesurl.com",
-                            "",
-                            15,
-                            testMgr)
-            report_dict = report.generate_report()
-            event = report_dict['events'][0]
-            self.assertEqual(event['body']['payload']['jti'], EXPECTED_JTI)
-
-    @patch('requests.post')
-    def test_send(self, mock_post):
-        """ Send the report properly implies to update the last_upload attribute
-        """
-        t = int(self.report.last_upload)
-        self.report.send()
-        self.assertGreaterEqual(int(self.report.last_upload), t)
-
-    @patch('requests.post')
-    @patch('call_home_agent.dataDicts.ceph_command')
-    def test_communication_error(self, mock_ceph_cmd, mock_post):
-        """Any kind of error executing the "POST" will be raised
-        """
-        mock_ceph_cmd.return_value = {}
-        mock_post.side_effect=Exception('COM Error')
-        self.report.interval = 60
-        self.report.last_upload = str(int(time.time()) - 90)
-
-        with self.assertRaises(Exception) as context:
-            self.report.send()
-        self.assertTrue('COM Error' in str(context.exception))
-
-    @patch('requests.post')
-    def test_not_time_to_send(self, mock_post):
-        """A report only can be sent when the time to send the report arrives
-        """
-        self.report.interval = 60
-        self.report.last_upload = str(int(time.time()))
-        self.report.send()
-        mock_post.assert_not_called()
-
-    @patch('requests.post')
-    @patch('call_home_agent.dataDicts.ceph_command')
-    def test_not_time_to_send_but_forced(self, mock_ceph_cmd, mock_post):
-        """A report only can be sent when the time to send the report arrives,
-           except if you force the operation
-        """
-        mock_ceph_cmd.return_value = {}
-        self.report.interval = 60
-        self.report.last_upload = str(int(time.time()))
-        self.report.send(force=True)
-        mock_post.assert_called()
-
-    @patch('requests.get')
-    def test_exec_prometheus_query(self, mock_get):
-        request_get_response = MagicMock(status_code=200, reason='pepe', text='{"status":"success","data":{"resultType":"vector","result":[{"metric":{"ceph_health":"HEALTH_OK"},"value":[1616414100,"1"]}]}}')
-        mock_get.return_value = request_get_response
-        result = exec_prometheus_query("http://prom/query/v1", "ceph_health")
-        assert result['status'] ==  "success"
-
-        # Test metric error (server is ok, but something wrong executing the query):
-        request_get_response.raise_for_status = MagicMock(side_effect=Exception("Error in metrics"))
-        with self.assertRaises(Exception) as exception_context:
-            result = exec_prometheus_query("http://prom/query/v1", "ceph_health")
-        self.assertRegex(str(exception_context.exception), "Error in metrics")
-
-        # Result metrics not returned because a Prometheus server problem
-        mock_get.side_effect=Exception("Server error")
-        with self.assertRaises(Exception) as exception_context:
-            result = exec_prometheus_query("http://prom/query/v1", "ceph_health")
-        self.assertRegex(str(exception_context.exception), "Server error")
-
-    def tearDown(self):
-        self.patcher.stop()
+        self.mock_mgr()
+        global test_object
+        test_object = self
+
+        ####### time handling #######
+        #self.mocked_now = original_time_time()
+        self.mocked_now = 0
+        self.test_end = None
+
+        ####### HTTP requests handling #######
+        self.mocked_last_contact_response_no_ur = None
+        self.mocked_last_contact_response_yes_ur = None
+        self.mock_requests_send_has_ur = 0
+        self.mock_requests_cooldown_pmr = []
+        self.sent_events = defaultdict(int)
+
+        # Load the json answers that requests.post() should return
+        with open(os.path.dirname(__file__) + '/response_no_pending_ur.json', 'r') as resp:
+            self.mocked_last_contact_response_no_ur = resp.read()
+
+        with open(os.path.dirname(__file__) + '/response_yes_pending_ur.json', 'r') as resp:
+            self.mocked_last_contact_response_yes_ur = resp.read()
+
+        self.requests_post_response = {'some': 'answer'}
+
    def test_reports(self):
        """Smoke test: each report type runs to completion without raising."""
        agent = CallHomeAgent()
        self.agent = agent
        ReportInventory(agent).run()
        ReportLastContact(agent).run()
        # ReportStatusAlerts: We don't fully mock the returned json to get_prometheus_alerts(), therefore
        #   it raises an exception, catches it, and generates a "Can't read from prometheus" health alert.
        ReportStatusAlerts(agent).run()
        ReportStatusHealth(agent).run()
+
    def test_last_contact_no_ur(self):
        """A last_contact response with no pending UR queues nothing and sends no upload events."""
        agent = CallHomeAgent()
        self.agent = agent
        ReportLastContact(agent).run()
        self.assertEqual(len(agent.ur_queue), 0)
        self.assertEqual(self.sent_events['status-ceph_log_upload'], 0)
        self.assertEqual(self.sent_events['confirm_response-NA'], 0)
+
    def test_last_contact_yes_ur(self):
        """A pending UR triggers the upload workflow: 4 progress events + 1 confirm,
        and the UR moves out of the queue into the stale and cooldown sets."""
        agent = CallHomeAgent()
        self.agent = agent
        self.mock_requests_send_has_ur = True
        ReportLastContact(agent).run()
        self.assertEqual(self.sent_events['status-ceph_log_upload'], 4)
        self.assertEqual(self.sent_events['confirm_response-NA'], 1)
        self.assertEqual(len(agent.ur_queue), 0)
        self.assertEqual(len(agent.ur_stale), 1)
        self.assertEqual(len(agent.ur_cooldown), 1)
+
    def test_serve_and_stale(self):
        """Run serve() under mocked time to verify a repeated UR is ignored while
        stale, then processed again once the stale entry expires."""
        with patch('time.time', side_effect=self.mocked_time_time), patch('time.sleep', side_effect=self.mocked_sleep), patch('threading.Event.wait', side_effect=self.mocked_sleep):
            agent = CallHomeAgent()

            self.test_end = self.mocked_time_time() + 86000  # a bit less than a day
            self.mock_requests_send_has_ur = 2
            self.agent = agent
            # This test checks the UR stale mechanism by running the agent for enough time for a stale message to be old enough to be deleted
            # from the stale memory. By default, stale_timeout is 10 days but running this test, emulating 10 days takes too much time
            # so we change the stale_timeout to 1 day so it will be quicker.
            agent.stale_timeout = 86400
            agent.serve()

            self.assertEqual(len(agent.ur_queue), 0)
            self.assertEqual(len(agent.ur_stale), 1)  # 1 if less than 24 hours. 0 if more
            self.assertEqual(len(agent.ur_cooldown), 0)
            self.assertEqual(self.sent_events['confirm_response-NA'], 1)

            self.mocked_sleep(1000)  # now we're past 24 hours since start

            self.test_end = self.mocked_time_time() + 43200  # half a day
            self.mock_requests_send_has_ur = 2
            agent.run = True
            agent.serve()

            # The second UR (same PMR) is only processed after the stale entry expired.
            self.assertEqual(self.sent_events['confirm_response-NA'], 2)
+
    def test_serve_and_cooldown(self):
        """Run serve() under mocked time with two different PMRs: both are processed
        (2 confirms, 2 stale entries) and the cooldown set drains by the end."""
        with patch('time.time', side_effect=self.mocked_time_time), patch('time.sleep', side_effect=self.mocked_sleep), patch('threading.Event.wait', side_effect=self.mocked_sleep):
            agent = CallHomeAgent()

            self.test_end = self.mocked_time_time() + 86000  # a bit less than a day
            self.mock_requests_send_has_ur = 2
            self.mock_requests_cooldown_pmr = ['TS1234567', 'TS1234568']
            self.agent = agent
            agent.serve()

            self.assertEqual(len(agent.ur_queue), 0)
            self.assertEqual(len(agent.ur_stale), 2)  # as we got 2 different PMRs, we have 2 for stale.
            self.assertEqual(len(agent.ur_cooldown), 0)
            self.assertEqual(self.sent_events['confirm_response-NA'], 2)
+
    def test_report_multiple_events(self):
        """A Report built from several Event classes runs without raising."""
        class ReportMultiple(Report):
            # Ad-hoc report bundling two existing event types.
            def __init__(self, agent, event_classes) -> None:
                super().__init__(agent, 'test_multiple', event_classes)

        agent = CallHomeAgent()
        self.agent = agent
        ReportMultiple(agent, [EventInventory, EventLastContact]).run()
+
    def test_cli_print_report_cmd(self):
        """'cli show' output must redact secrets but keep regular settings."""
        agent = CallHomeAgent()
        ret = agent.cli_show('status')

        print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
        print(ret.stdout)
        print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
        self.assertFalse('private_key' in ret.stdout)
        self.assertFalse('api_key' in ret.stdout)
        self.assertTrue('target_space' in ret.stdout)
+
    def test_cli_send_report_cmd(self):
        """'cli send' echoes the mocked Call Home response on success."""
        agent = CallHomeAgent()
        ret = agent.cli_send('status')

        print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
        print(ret.stdout)
        print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
        self.assertEqual(ret.stdout, 'status report sent successfully:\n{\n    "some": "answer"\n}')
+
    def test_cli_upload_diagnostics(self):
        """CLI-triggered diagnostics upload (level 1) completes successfully."""
        agent = CallHomeAgent()
        ret = agent.cli_upload_diagnostics('ticket123', 1)

        print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
        print(ret.stdout)
        print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
        self.assertEqual(ret.stdout, 'Success')
+
    def test_cli_list_queues(self):
        """After two last_contact runs with pending URs, the queues listing shows
        the stale/cooldown entries keyed by workflow-level-PMR."""
        agent = CallHomeAgent()
        self.mock_requests_send_has_ur = 2
        self.mock_requests_cooldown_pmr = ['TS1234567', 'TS1234568']
        ReportLastContact(agent).run()
        ReportLastContact(agent).run()
        ret = agent.cli_list_queues()

        print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
        print(ret.stdout)
        print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
        self.assertTrue('upload_snap-3-TS1234568-' in ret.stdout)
+
    def test_connectivity_status(self):
        """Connectivity check: False before any test, False on a malformed
        response, True once the response contains the expected service field."""
        agent = CallHomeAgent()
        status = agent.get_call_home_status()
        self.assertEqual(status['connectivity'], False)

        # Send message, expect an error because it's not returning the correct json fields
        agent.test_connectivity()
        status = agent.get_call_home_status()
        self.assertEqual(status['connectivity'], False)
        self.assertEqual(status['connectivity_error'], 'Bad response from Call Home: {\n    "some": "answer"\n}')

        self.requests_post_response = {"service": "ibm_callhome_connect", "more": "info"}
        agent.test_connectivity()
        status = agent.get_call_home_status()
        self.assertEqual(status['connectivity'], True)
        self.assertEqual(status['connectivity_error'], 'Success')
diff --git a/src/pybind/mgr/call_home_agent/tests/testfile1 b/src/pybind/mgr/call_home_agent/tests/testfile1
new file mode 100644 (file)
index 0000000..296363f
--- /dev/null
@@ -0,0 +1 @@
+file1/3
diff --git a/src/pybind/mgr/call_home_agent/tests/testfile2 b/src/pybind/mgr/call_home_agent/tests/testfile2
new file mode 100644 (file)
index 0000000..68020fe
--- /dev/null
@@ -0,0 +1 @@
+file2/3 abcd
diff --git a/src/pybind/mgr/call_home_agent/tests/testfile3 b/src/pybind/mgr/call_home_agent/tests/testfile3
new file mode 100644 (file)
index 0000000..72dca9b
--- /dev/null
@@ -0,0 +1 @@
+file3/3 efghigjlmnopw
diff --git a/src/pybind/mgr/call_home_agent/workflow_upload_snap.py b/src/pybind/mgr/call_home_agent/workflow_upload_snap.py
new file mode 100644 (file)
index 0000000..90e740c
--- /dev/null
@@ -0,0 +1,342 @@
+from .report import Report, ReportTimes, Event
+from .report_ur_error import ReportURError
+from .exceptions import *
+import time
+import urllib.parse
+from typing import Tuple, Optional
+import glob
+import os
+import traceback
+import re
+import requests
+
# Status values reported back to Call Home for an upload operation.
OPERATION_STATUS_NEW = 'READY'
OPERATION_STATUS_IN_PROGRESS = 'IN_PROGRESS'
OPERATION_STATUS_COMPLETE = 'COMPLETE'
OPERATION_STATUS_ERROR = 'ERROR'
OPERATION_STATUS_REQUEST_REJECTED = 'REQUEST_REJECTED'

# Default ceph logs folder; diagnostic files are collected here.
# (Tests patch this to /tmp.)
DIAGS_FOLDER = '/var/log/ceph'
+
class WorkFlowUploadSnap:
    """Workflow serving an 'upload_snap' unsolicited request (UR).

    Collects cluster diagnostics (command outputs and, for level > 1, an sos
    report), uploads the resulting files to ECuRep for the given support case
    (PMR), and reports progress / confirmation back to Call Home.
    """

    def __init__(self, agent, req, req_id, report_event_id):
        self.agent = agent
        self.req = req
        self.req_id = req_id  # unique ID for this request
        self.pmr = self.req.get('options', {}).get('pmr', None)
        self.report_event_id = report_event_id
        self._event_id_counter = 0
        self.si_requestid = self.req.get('options', {}).get('si_requestid', '')

    def next_event_id(self):
        """Return the event id to use for the next report sent by this workflow."""
        # self.report_event_id may be None if the report was triggered by CLI cli_upload_diagnostics
        # NOTE(review): due to operator precedence, the "-{counter}" suffix is only
        # appended in the CLI fallback ("cli-0", "cli-1", ...); a real report_event_id
        # is returned unchanged on every call while the counter still advances.
        # Confirm this is intended (UR-triggered events must echo the UR's event id).
        ret = self.report_event_id if self.report_event_id else "cli" + f"-{self._event_id_counter}"
        self._event_id_counter += 1
        return ret

    def run(self):
        """Execute the workflow. Always finishes by sending a confirm_response
        report, whether the upload succeeded or failed."""
        self.agent.log.info(f"WorkFlowUploadSnap <{self.req_id}> : Processing new request {self.req}")
        if not self.pmr:
            self.agent.log.warning(f"WorkFlowUploadSnap <{self.req_id}> : Error - No PMR in request.")
            # NOTE(review): unlike the other reports below, ReportURError is not
            # .run() here — verify the constructor sends the report by itself.
            ReportURError(self.agent, self.next_event_id())
            return

        try:
            commands_file = self.collect_diagnostic_commands()
            sos_files_pattern = ""
            snap_level = int(self.req.get('options', {}).get('level', 1))
            if snap_level > 1:
                sos_files_pattern = self.collect_sos_report()
            # Send the commands file. If an sos report follows, report only 1%
            # progress; otherwise this file completes the upload (100%).
            self.upload_file(commands_file, percent_complete=1 if sos_files_pattern else 100)

            # Send the sos report, split into chunk files, when we have them
            if sos_files_pattern:
                # The pattern ends in '_*'; the logical file name is '<prefix>.xz'.
                sos_file_name = f'{sos_files_pattern[:-2]}.xz'
                self.upload_file(sos_file_name, sos_files_pattern)

            self.agent.log.info(f"WorkFlowUploadSnap <{self.req_id}> :  Completed operation")
        except Exception as ex:
            self.agent.log.error(f'Operations ({self.req_id}): Error processing operation {self.req}. Exception={ex} trace={traceback.format_exc()}')
            ReportStatusLogUpload(self.agent, self.next_event_id(), self.si_requestid, 0, f"ERROR: {ex}", OPERATION_STATUS_ERROR).run()

        # if it was ok or not, we always report the state
        ReportConfirmResponse(self.agent, self.next_event_id()).run()
        self.agent.log.info(f"WorkFlowUploadSnap <{self.req_id}> : Finished processing {self.req}")

    def collect_diagnostic_commands(self) -> str:
        """
        Collect information from the cluster and store it in a file under
        DIAGS_FOLDER; returns the file name (not the full path).

            ceph status
            ceph health detail
            ceph osd tree
            ceph report
            ceph osd dump
            ceph df

        Raises on any error while writing the file.
        """
        output = ""
        output += "\nceph status\n" + self.agent.ceph_command(srv_type='mon', prefix='status')
        output += "\nceph health detail\n" + self.agent.ceph_command(srv_type='mon', prefix='health', detail='detail')
        output += "\nceph osd tree\n" + self.agent.ceph_command(srv_type='mon', prefix='osd tree')
        output += "\nceph report\n" + self.agent.ceph_command(srv_type='mon', prefix='report')
        output += "\nceph osd dump\n" + self.agent.ceph_command(srv_type='mon', prefix='osd dump')
        output += "\nceph df detail\n" + self.agent.ceph_command(srv_type='mon', prefix='df', detail='detail')

        self.agent.log.info(f"WorkFlowUploadSnap <{self.req_id}> : diagnostics commands collected")

        try:
            cmds_file_prefix = 'ceph_commands_case'
            # Remove previous commands files
            for file in glob.glob(f'{DIAGS_FOLDER}/{cmds_file_prefix}*'):
                os.remove(file)
            timestamp_sos_file = int(time.time() * 1000)
            file_name = f'{cmds_file_prefix}_{self.pmr}_{timestamp_sos_file}.txt'
            with open(f'{DIAGS_FOLDER}/{file_name}', 'w') as commands_file:
                commands_file.write(output)
            self.agent.log.info(f"WorkFlowUploadSnap <{self.req_id}> : diagnostics commands stored in file {file_name}")
            return file_name
        except Exception as ex:
            raise Exception(f"WorkFlowUploadSnap <{self.req_id}> : Error trying to save the commands file for diagnostics: {ex} trace={traceback.format_exc()}")
        # (removed unreachable `return ""` that followed the try/except:
        #  every path above either returns or raises)

    def collect_sos_report(self) -> str:
        """
        SOS report gathered from a Ceph Monitor node.
        Returns the glob pattern of the generated chunk files, or "" if the
        pattern could not be found in the command output.

        Best node to execute the sos command is
        1. Monitor + admin node + active mgr
        2. Monitor + admin node
        3. monitor
        """

        # Remove previous sos report files:
        for file in glob.glob(f'{DIAGS_FOLDER}/sosreport_case_*'):
            os.remove(file)

        # Get the best monitor node to execute the sos report
        best_mon, active_mgr = self.get_best_collect_node()
        mgr_target = ""
        if best_mon != active_mgr and active_mgr:
            mgr_target = f"--mgr-target {active_mgr}"
        self.agent.log.info(f"WorkFlowUploadSnap <{self.req_id}> : selected host for sos command is {best_mon}, active manager is {active_mgr}")

        # Execute the sos report command
        sos_cmd_execution = self.agent.remote('cephadm', 'sos',
                                          hostname = best_mon,
                                          sos_params = f'{mgr_target} report --batch --quiet --case-id {self.pmr}')
        self.agent.log.info(f"WorkFlowUploadSnap <{self.req_id}> : sos command executed successfully: {sos_cmd_execution.result}")
        if sos_cmd_execution.exception_str:
            raise Exception(f"Error trying to get the sos report files for diagnostics(error_code): {sos_cmd_execution.exception_str}")

        # output is like:
        # ['New sos report files can be found in /var/log/ceph/<fsid>/sosreport_case_124_1706548742636_*']
        pattern = r'sosreport_case_\S+'
        matches = re.findall(pattern, sos_cmd_execution.result[0])
        if matches:
            self.agent.log.info(f"WorkFlowUploadSnap <{self.req_id}> : sos command files pattern is: {matches[0]}")
            result = matches[0]
        else:
            self.agent.log.info(f"WorkFlowUploadSnap <{self.req_id}> : sos report files pattern not found in: {sos_cmd_execution.result}")
            result = ""

        # If there is any issue executing the command, the output will be like:
        # ['Issue executing <['sos', 'report', '--batch', '--quiet', '--case-id', 'TS015034298', '-p', 'container']>: 0:[plugin:ceph_mon] Failed to find ceph version, command collection will be limited
        #
        # New sos report files can be found in /var/log/ceph/<fsid>/sosreport_case_TS015034298_1709809018376_*']
        # in this case, we leave a warning in the log about the issue
        pattern = r'^Issue executing.*'
        matches = re.findall(pattern, sos_cmd_execution.result[0])
        if matches:
            self.agent.log.info(f"WorkFlowUploadSnap <{self.req_id}> : review sos command execution in {best_mon}: {matches[0]}")

        return result

    def get_best_collect_node(self) -> Tuple[str, str]:
        """
        Select the best monitor node where to run a sos report command.
        Returns the best monitor node and the active manager.
        """
        nodes = {}
        active_manager = ""
        best_monitor = ""

        # We add all the monitors
        monitors = self.agent.remote('cephadm', 'list_daemons', service_name='mon')
        if monitors.exception_str:
            raise Exception(monitors.exception_str)

        for daemon in monitors.result:
            nodes[daemon.hostname] = 1

        # lets add one point to a monitor if it is a cephadm admin node
        cluster_nodes = self.agent.remote('cephadm', 'get_hosts')
        if cluster_nodes.exception_str:
            raise Exception(cluster_nodes.exception_str)

        for host in cluster_nodes.result:
            if '_admin' in host.labels:
                try:
                    nodes[host.hostname] += 1
                    break
                except KeyError:
                    pass

        # get the active mgr.
        managers = self.agent.remote('cephadm', 'list_daemons', service_name='mgr')
        if managers.exception_str:
            # BUG FIX: previously raised monitors.exception_str (copy-paste
            # error), masking the real failure of the mgr listing.
            raise Exception(managers.exception_str)

        for daemon in managers.result:
            if daemon.is_active:
                active_manager = daemon.hostname
                try:
                    nodes[daemon.hostname] += 1
                except KeyError:
                    pass

        # get the winner monitor
        best_monitor = max(nodes, key=nodes.get)

        return best_monitor, active_manager

    def upload_file(self, file_name: str, chunk_pattern: str = '', percent_complete: int = 100) -> None:
        """
        Upload a file to ecurep.
        chunk_pattern: If provided, the file is divided in chunks
        percent_complete: will send `percent_complete` percent in the ReportStatusLogUpload message.
            If level == 2, we send 1% after the diagnostics file and before the sos report files,
            if level == 1, we send 100% after the diagnostics file, as there are no more files to send
        Raises SendError on any HTTP failure.
        """

        # We first consider the module options to allow for flexible
        # workarounds should we need them, otherwise we load the default keys
        auth = self.agent.get_ecurep_user_pass()
        if self.agent.owner_company_name:
            owner = self.agent.owner_company_name
        else:
            owner = "MyCompanyUploadClient"

        resp = None
        file_path = 'None'

        # Get the unique Upload ID for the file
        try:
            # 1. Obtain the file id to upload the file
            ecurep_file_id_url = f'{self.agent.ecurep_url}/app/upload_tid?name={urllib.parse.quote(file_name)}&client={urllib.parse.quote(owner)}'
            self.agent.log.info(f"WorkFlowUploadSnap <{self.req_id}> : getting unique upload id from <{ecurep_file_id_url}>")
            resp = requests.post(url=ecurep_file_id_url, auth=auth, timeout=30)
            resp.raise_for_status()
            file_id_for_upload = resp.json()['id']  # throw on purpose if there is no file_id_for_upload
            self.agent.log.info(f"WorkFlowUploadSnap <{self.req_id}> : unique id for upload is <{file_id_for_upload}>")
        except Exception as ex:
            explanation = resp.text if resp else ""
            raise SendError(f'WorkFlowUploadSnap <{self.req_id}> : Failed to send <{file_name}> to <{ecurep_file_id_url}>: {ex}: {explanation} trace={traceback.format_exc()}')

        try:
            # 2. Upload the file
            ecurep_file_upload_url = f'{self.agent.ecurep_url}/app/upload_sf/files/{file_id_for_upload}?case_id={urllib.parse.quote(self.pmr)}&client={urllib.parse.quote(owner)}'
            file_size = 0
            if chunk_pattern:
                files_to_upload = glob.glob(f'{DIAGS_FOLDER}/{chunk_pattern}')
                for part in files_to_upload:
                    file_size += os.path.getsize(part)
            else:
                files_to_upload = [f'{DIAGS_FOLDER}/{file_name}']
                file_size = os.path.getsize(f'{DIAGS_FOLDER}/{file_name}')

            start_byte = 0
            part_sent = 0
            self.agent.log.info(f"WorkFlowUploadSnap <{self.req_id}> : uploading file {file_name} to <{ecurep_file_upload_url}>")
            for file_path in sorted(files_to_upload):
                chunk_size = os.path.getsize(file_path)
                with open(file_path, 'rb') as file:
                    if chunk_pattern:
                        self.agent.log.info(f"WorkFlowUploadSnap <{self.req_id}> : uploading part {file_path} to <{ecurep_file_upload_url}>")
                    resp = requests.post(url = ecurep_file_upload_url,
                                        data = file.read(),
                                        headers = {'Content-Type': 'application/octet-stream',
                                                   'X-File-Name': file_name,
                                                   'X-File-Size': f'{file_size}',
                                                   'Content-Range': f'bytes {start_byte}-{chunk_size + start_byte}/{file_size}'
                                        },
                    )
                    self.agent.log.info(f'WorkFlowUploadSnap <{self.req_id}> : uploaded {file_name} -> bytes {start_byte}-{chunk_size + start_byte}/{file_size}')
                    resp.raise_for_status()
                start_byte += chunk_size
                part_sent += 1
                if chunk_pattern:
                    percent_progress = int(part_sent/len(files_to_upload) * 100)
                    status = OPERATION_STATUS_COMPLETE if percent_progress == 100 else OPERATION_STATUS_IN_PROGRESS
                    ReportStatusLogUpload(self.agent, self.next_event_id(), self.si_requestid, percent_progress, f"file <{file_name}> is being sent", status).run()
                else:
                    status = OPERATION_STATUS_COMPLETE if percent_complete == 100 else OPERATION_STATUS_IN_PROGRESS
                    # NOTE(review): `status` is passed as the description here, unlike
                    # the chunked branch above which sends a human-readable message —
                    # confirm this is intended.
                    ReportStatusLogUpload(self.agent, self.next_event_id(), self.si_requestid, percent_complete, status, status).run()
        except Exception as ex:
            explanation = resp.text if resp else ""
            raise SendError(f'WorkFlowUploadSnap <{self.req_id}> : Failed to send <{file_path}> to <{ecurep_file_upload_url}>: {ex}: {explanation} trace={traceback.format_exc()}')
+
class ReportStatusLogUpload(Report):
    """Progress report for an ongoing diagnostics (snap) upload: one
    'ceph_log_upload' status event carrying percent progress and state."""

    def __init__(self, agent, report_event_id, si_requestid, percent_progress: int, description: str, status: str):
        super().__init__(agent, 'upload_snap_progress')
        self.percent_progress = percent_progress
        self.description = description
        self.status = status
        self.report_event_id = report_event_id  # We use the same report envelope event_id that we received in the upload_snap UR
        self.si_requestid = si_requestid

    def compile(self) -> Optional[dict]:
        """Build the full report dict: envelope headers plus one status event."""
        # We override compile() because this event's generate() takes non-standard arguments
        report_times = ReportTimes()
        report = self.get_report_headers(report_times, self.report_event_id)
        event = EventStatusLogUpload(self.agent).generate(report_times, self.si_requestid, self.percent_progress, self.description, self.status)
        report['events'].append(event.data)
        return report
+
class EventStatusLogUpload(Event):
    """Event carrying the state/progress of a log (snap) upload."""

    def gather(self) -> dict:
        # Nothing gathered automatically; all fields are supplied to generate().
        return {}

    def generate(self, report_times: ReportTimes, si_requestid: str, percent_progress: int, description: str, status: str) -> 'EventStatusLogUpload':
        """Populate the event body with upload state; returns self for chaining."""
        super().generate('status', 'ceph_log_upload', report_times)

        complete = percent_progress == 100
        self.data["body"] = {
            "component": "ceph_log_upload",
            "event_transaction_id": "Unsolicited_Storage_Insights_RedHatMarine_ceph_Request",
            "product":  "Red Hat Ceph",
            "description": description,
            "state" : f"{status} ({percent_progress}%)",
            "complete" : complete,
            "payload": {
                "action": "Unsolicited_Storage_Insights_RedHatMarine_ceph_Request",
                "description": description,
                "state" : status,
                "progress": percent_progress,
                "complete" : complete,
                "si_requestid": si_requestid,
            }
        }
        return self
+
class ReportConfirmResponse(Report):
    """Confirmation report sent after an upload_snap UR finishes, whether it
    succeeded or failed."""

    def __init__(self, agent, report_event_id) -> None:
        super().__init__(agent, 'confirm_response', [EventConfirmResponse])
        self.report_event_id = report_event_id  # We use the same report envelope event_id that we received in the upload_snap UR
+
class EventConfirmResponse(Event):
    """Event acknowledging that an unsolicited request was processed."""

    def gather(self) -> dict:
        # Body is fully static; nothing to gather.
        return {}

    def generate(self, report_times: ReportTimes) -> 'EventConfirmResponse':
        """Populate the static confirm_response body; returns self for chaining."""
        super().generate('confirm_response', 'ceph_operations', report_times)

        self.data["body"] = {
                "event_transaction_id": "Unsolicited_Storage_Insights_RedHatMarine_ceph_Request",
                "event_type": "last_contact",
        }
        return self
index 45035ea6c384555bb1fae86bba021cf87e038736..474fa6cb8ad720992fd3ce4b0a08f9bf23b061a3 100644 (file)
@@ -15,7 +15,7 @@ class CallHome(RESTController):
                        first_name: str, last_name: str,
                        email: str) -> Tuple[int, str, str]:
         try:
-            error_code, out, err = mgr.remote('call_home_agent', 'list_tenants', ibm_id, company_name,
+            error_code, out, err = mgr.remote('call_home_agent', 'cli_list_tenants', ibm_id, company_name,
                                               first_name, last_name, email)
             if error_code != 0:
                 raise DashboardException(f'Listing tenants error: {err}')
@@ -29,7 +29,7 @@ class CallHome(RESTController):
     def set(self, tenant_id: str, ibm_id: str, company_name: str,
                 first_name: str, last_name: str, email: str) -> Tuple[int, str, str]:
         try:
-            error_code, _, err = mgr.remote('call_home_agent', 'set_tenant_id', tenant_id, ibm_id,
+            error_code, _, err = mgr.remote('call_home_agent', 'cli_set_tenant', tenant_id, ibm_id,
                                                 company_name, first_name, last_name, email)
             if error_code != 0:
                 raise DashboardException(f'Error setting tenant id: {err}')
@@ -41,7 +41,7 @@ class CallHome(RESTController):
     @Endpoint('GET')
     def download(self, report_type: str):
         try:
-            error_code, out, err = mgr.remote('call_home_agent', 'print_report_cmd', report_type)
+            error_code, out, err = mgr.remote('call_home_agent', 'cli_show', report_type)
             if error_code != 0:
                 raise DashboardException(f'Error downloading report: {err}')
         except RuntimeError as e:
@@ -52,7 +52,7 @@ class CallHome(RESTController):
     @Endpoint('GET')
     def info(self):
         try:
-            error_code, out, err = mgr.remote('call_home_agent', 'customer')
+            error_code, out, err = mgr.remote('call_home_agent', 'cli_get_user_info')
             if error_code != 0:
                 raise DashboardException(f'Error getting customer info: {err}')
         except RuntimeError as e: