git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/diskprediction_local: Reverted dependencies, added HGST models 29437/head
author     Karanraj Chauhan <chauhank@bu.edu>
Wed, 6 Nov 2019 18:46:45 +0000 (13:46 -0500)
committer  Karanraj Chauhan <chauhank@bu.edu>
Thu, 7 Nov 2019 13:38:48 +0000 (08:38 -0500)
Removed pandas from requirements.txt, ceph.spec.in, and debian/control
because of installation issues in RHEL/CentOS.

Replaced pandas usages in RHDiskFailurePredictor with similar numpy
counterparts (e.g. a structured array instead of a DataFrame).
Replaced joblib usages with pickle because the older version of
scikit-learn did not list joblib as a dependency, so it wasn't getting
installed. Using joblib would have required specifying it as a separate
dependency in the spec file and requirements.
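
A rough sketch of both replacements (the attribute names, values, and
file path below are made up for illustration, not the module's exact
ones):

    import pickle
    import numpy as np

    # per-day SMART readings as a list of dicts, like the predictor receives
    disk_days = [
        {"user_capacity": 4e12, "smart_1_raw": 120.0, "smart_5_raw": 0.0},
        {"user_capacity": 4e12, "smart_1_raw": 118.0, "smart_5_raw": 0.0},
    ]
    attrs = ["user_capacity", "smart_1_raw", "smart_5_raw"]

    # numpy structured array standing in for the old pandas DataFrame
    dtype = [(a, np.float64) for a in attrs]
    disk_days_sa = np.array(
        [tuple(d[a] for a in attrs) for d in disk_days], dtype=dtype)

    # pickle replaces joblib for loading the persisted scaler/model files
    with open("seagate_scaler.pkl", "rb") as f:  # hypothetical local path
        scaler = pickle.load(f)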

Added HGST models for RHDiskFailurePredictor.

Signed-off-by: Karanraj Chauhan <kachauha@redhat.com>
ceph.spec.in
debian/control
src/pybind/mgr/diskprediction_local/models/redhat/config.json
src/pybind/mgr/diskprediction_local/models/redhat/hgst_predictor.pkl [new file with mode: 0644]
src/pybind/mgr/diskprediction_local/models/redhat/hgst_scaler.pkl [new file with mode: 0644]
src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.joblib [deleted file]
src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.pkl [new file with mode: 0644]
src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.joblib [deleted file]
src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.pkl [new file with mode: 0644]
src/pybind/mgr/diskprediction_local/predictor.py
src/pybind/mgr/diskprediction_local/requirements.txt

diff --git a/ceph.spec.in b/ceph.spec.in
index 50d1ff0027272aba89f154fda91b1ad122030b7e..519f0d257b57d54520ac45536798380f06df28f1 100644
@@ -537,16 +537,13 @@ Requires:       ceph-mgr = %{_epoch_prefix}%{version}-%{release}
 Requires:       python%{_python_buildid}-numpy
 %if 0%{without python2}
 Requires:       python3-scipy
-Requires:       python3-pandas
 %else
 Requires:       python2-scipy
-Requires:       python2-pandas
 %endif
 %endif
 %if 0%{?rhel} == 7
 Requires:       numpy
 Requires:       scipy
-Requires:       pandas
 %endif
 %description mgr-diskprediction-local
 ceph-mgr-diskprediction-local is a ceph-mgr plugin that tries to predict
diff --git a/debian/control b/debian/control
index 0c8d51ac2c9fd058d77a78b6fe50c054fcb06954..3f43f25e776e38ccf23e8f83da5c617e40a73744 100644
@@ -269,7 +269,6 @@ Depends: ceph-mgr (= ${binary:Version}),
          python-numpy,
          python-scipy,
          python-sklearn,
-         python-pandas,
          ${misc:Depends},
          ${python:Depends},
          ${shlibs:Depends},
diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/config.json b/src/pybind/mgr/diskprediction_local/models/redhat/config.json
index 127d6ea4d8b2292e59ee826af3bd647a3e23296c..62a0d828264bbe30188e6fdee075a21c5b0b6d1f 100644
@@ -1,3 +1,4 @@
 {
-"seagate": ["user_capacity", "smart_1_raw", "smart_5_raw", "smart_7_raw", "smart_10_raw", "smart_187_raw", "smart_188_raw", "smart_190_raw", "smart_193_raw", "smart_197_raw", "smart_198_raw", "smart_241_raw", "smart_1_normalized", "smart_5_normalized", "smart_7_normalized", "smart_10_normalized", "smart_187_normalized", "smart_188_normalized", "smart_190_normalized", "smart_193_normalized", "smart_197_normalized", "smart_198_normalized", "smart_241_normalized"]
+"seagate": ["user_capacity", "smart_1_raw", "smart_5_raw", "smart_7_raw", "smart_10_raw", "smart_187_raw", "smart_188_raw", "smart_190_raw", "smart_193_raw", "smart_197_raw", "smart_198_raw", "smart_241_raw", "smart_1_normalized", "smart_5_normalized", "smart_7_normalized", "smart_10_normalized", "smart_187_normalized", "smart_188_normalized", "smart_190_normalized", "smart_193_normalized", "smart_197_normalized", "smart_198_normalized", "smart_241_normalized"],
+"hgst": ["user_capacity", "smart_1_normalized", "smart_1_raw", "smart_2_normalized", "smart_2_raw", "smart_3_normalized", "smart_3_raw", "smart_4_raw", "smart_5_normalized", "smart_5_raw", "smart_7_normalized", "smart_7_raw", "smart_8_normalized", "smart_8_raw", "smart_9_normalized", "smart_9_raw", "smart_10_normalized", "smart_10_raw", "smart_12_raw", "smart_192_normalized", "smart_192_raw", "smart_193_normalized", "smart_193_raw", "smart_194_normalized", "smart_194_raw", "smart_196_normalized", "smart_196_raw", "smart_197_normalized", "smart_197_raw", "smart_198_raw", "smart_199_raw"]
 }
diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/hgst_predictor.pkl b/src/pybind/mgr/diskprediction_local/models/redhat/hgst_predictor.pkl
new file mode 100644
index 0000000..9894d9f
Binary files /dev/null and b/src/pybind/mgr/diskprediction_local/models/redhat/hgst_predictor.pkl differ
diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/hgst_scaler.pkl b/src/pybind/mgr/diskprediction_local/models/redhat/hgst_scaler.pkl
new file mode 100644
index 0000000..6f77b85
Binary files /dev/null and b/src/pybind/mgr/diskprediction_local/models/redhat/hgst_scaler.pkl differ
diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.joblib b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.joblib
deleted file mode 100644
index ee7d420..0000000
Binary files a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.joblib and /dev/null differ
diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.pkl b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.pkl
new file mode 100644
index 0000000..280d59a
Binary files /dev/null and b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.pkl differ
diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.joblib b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.joblib
deleted file mode 100644
index 0b769b9..0000000
Binary files a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.joblib and /dev/null differ
diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.pkl b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.pkl
new file mode 100644
index 0000000..691bb03
Binary files /dev/null and b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.pkl differ
diff --git a/src/pybind/mgr/diskprediction_local/predictor.py b/src/pybind/mgr/diskprediction_local/predictor.py
index 1a19bf61a520df15c171d507528128dc6161adbc..548454145cea94c5b890db09c3c41279cecdd90d 100644
@@ -23,12 +23,10 @@ An example code is as follows:
 """
 import os
 import json
-import joblib
 import pickle
 import logging
 
 import numpy as np
-import pandas as pd
 from scipy import stats
 
 
@@ -90,10 +88,10 @@ class RHDiskFailurePredictor(object):
         # ensure all manufacturers whose context is defined in config file
         # have models and scalers saved inside model_dirpath
         for manufacturer in self.model_context:
-            scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.joblib")
+            scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.pkl")
             if not os.path.isfile(scaler_path):
                 return "Missing scaler file: {}".format(scaler_path)
-            model_path = os.path.join(model_dirpath, manufacturer + "_predictor.joblib")
+            model_path = os.path.join(model_dirpath, manufacturer + "_predictor.pkl")
             if not os.path.isfile(model_path):
                 return "Missing model file: {}".format(model_path)
 
@@ -123,45 +121,56 @@ class RHDiskFailurePredictor(object):
             )
             return None
 
-        # convert to dataframe, keeping only the required features
+        # convert to structured array, keeping only the required features
+        # assumes all data is in float64 dtype
         try:
-            disk_days_df = pd.DataFrame(disk_days, columns=model_smart_attr)
+            struc_dtypes = [(attr, np.float64) for attr in model_smart_attr]
+            values = [tuple(day[attr] for attr in model_smart_attr) for day in disk_days]
+            disk_days_sa = np.array(values, dtype=struc_dtypes)
         except KeyError as e:
             RHDiskFailurePredictor.LOGGER.debug(
                 "Mismatch in SMART attributes used to train model and SMART attributes available"
             )
             return None
 
+        # view structured array as 2d array for applying rolling window transforms
+        # do not include capacity_bytes in this. only use smart_attrs
+        disk_days_attrs = disk_days_sa[[attr for attr in model_smart_attr if 'smart_' in attr]]\
+                            .view(np.float64).reshape(disk_days_sa.shape + (-1,))
+
         # featurize n (6 to 12) days data - mean,std,coefficient of variation
         # current model is trained on 6 days of data because that is what will be
         # available at runtime
-        # NOTE: ensure unique indices so that features can be merged w/ pandas errors
-        disk_days_df = disk_days_df.reset_index(drop=True)
-        means = disk_days_df.drop("user_capacity", axis=1).rolling(6).mean()
-        stds = disk_days_df.drop("user_capacity", axis=1).rolling(6).std()
-        cvs = stds.divide(means, fill_value=0)
-
-        # rename and combine features into one df
-        means = means.rename(columns={col: "mean_" + col for col in means.columns})
-        stds = stds.rename(columns={col: "std_" + col for col in stds.columns})
-        cvs = cvs.rename(columns={col: "cv_" + col for col in cvs.columns})
-        featurized_df = means.merge(stds, left_index=True, right_index=True)
-        featurized_df = featurized_df.merge(cvs, left_index=True, right_index=True)
-
-        # drop rows where all features (mean,std,cv) are nans
-        featurized_df = featurized_df.dropna(how="all")
-
-        # fill nans created by cv calculation
-        featurized_df = featurized_df.fillna(0)
 
-        # capacity is not a feature that varies over time
-        featurized_df["user_capacity"] = disk_days_df["user_capacity"].iloc[0]
+        # rolling time window interval size in days
+        roll_window_size = 6
+
+        # rolling means generator
+        gen = (disk_days_attrs[i: i + roll_window_size, ...].mean(axis=0) \
+                for i in range(0, disk_days_attrs.shape[0] - roll_window_size + 1))
+        means = np.vstack(gen)
+
+        # rolling stds generator
+        gen = (disk_days_attrs[i: i + roll_window_size, ...].std(axis=0, ddof=1) \
+                for i in range(0, disk_days_attrs.shape[0] - roll_window_size + 1))
+        stds = np.vstack(gen)
+
+        # coefficient of variation
+        cvs = stds / means
+        cvs[np.isnan(cvs)] = 0
+        featurized = np.hstack((
+                                means,
+                                stds,
+                                cvs,
+                                disk_days_sa['user_capacity'][: disk_days_attrs.shape[0] - roll_window_size + 1].reshape(-1, 1)
+                                ))
 
         # scale features
-        scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.joblib")
-        scaler = joblib.load(scaler_path)
-        featurized_df = scaler.transform(featurized_df)
-        return featurized_df
+        scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.pkl")
+        with open(scaler_path, 'rb') as f:
+            scaler = pickle.load(f)
+        featurized = scaler.transform(featurized)
+        return featurized
 
     @staticmethod
     def __get_manufacturer(model_name):
@@ -210,9 +219,10 @@ class RHDiskFailurePredictor(object):
 
         # get model for current manufacturer
         model_path = os.path.join(
-            self.model_dirpath, manufacturer + "_predictor.joblib"
+            self.model_dirpath, manufacturer + "_predictor.pkl"
         )
-        model = joblib.load(model_path)
+        with open(model_path, 'rb') as f:
+            model = pickle.load(f)
 
         # use prediction for most recent day
         # TODO: ensure that most recent day is last element and most previous day
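
For context on the featurization change in predictor.py above: each
rolling 6-day window is reduced to a per-attribute mean, sample standard
deviation and coefficient of variation, with drive capacity appended as
a static column. A minimal standalone sketch of that transform, using
made-up numbers rather than real SMART data:

    import numpy as np

    roll_window_size = 6
    # 8 days x 3 SMART attributes of made-up readings
    disk_days_attrs = np.arange(24, dtype=np.float64).reshape(8, 3)
    user_capacity = np.full(8, 4e12)

    n_windows = disk_days_attrs.shape[0] - roll_window_size + 1
    means = np.vstack([disk_days_attrs[i:i + roll_window_size].mean(axis=0)
                       for i in range(n_windows)])
    stds = np.vstack([disk_days_attrs[i:i + roll_window_size].std(axis=0, ddof=1)
                      for i in range(n_windows)])
    cvs = stds / means
    cvs[np.isnan(cvs)] = 0  # guard against 0/0 windows
    featurized = np.hstack((means, stds, cvs,
                            user_capacity[:n_windows].reshape(-1, 1)))
    print(featurized.shape)  # (3, 10): 3 windows x (3 means + 3 stds + 3 cvs + capacity)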
diff --git a/src/pybind/mgr/diskprediction_local/requirements.txt b/src/pybind/mgr/diskprediction_local/requirements.txt
index f524b07a1e74549bba890739e0d17ce28f6a07f3..d9c3157fdf9326dffd41b6df7654f666352e6e97 100644
@@ -1,4 +1,3 @@
 numpy==1.15.1
 scipy==1.1.0
-pandas==0.23.4
 scikit-learn==0.19.2