From: Karanraj Chauhan Date: Wed, 6 Nov 2019 18:46:45 +0000 (-0500) Subject: mgr/diskprediction_local: Reverted dependencies, added HGST models X-Git-Tag: v15.1.0~981^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F29437%2Fhead;p=ceph.git mgr/diskprediction_local: Reverted dependencies, added HGST models Removed pandas from requirements.txt, ceph.spec.in, and debian/control because of installation issues in RHEL/CentOS. Replaced pandas usages in RHDiskFailurePredictor with similar numpy counterparts (e.g. structured array instead of dataframe) Replaced joblib usages with pickle because older version of scikit-learn did not list joblib as a dependency and so it wasn't getting installed. Using joblib would have required specifying it as a separate dependency in spec file and requirements. Added HGST models for RHDiskFailurePredictor. Signed-off-by: Karanraj Chauhan --- diff --git a/ceph.spec.in b/ceph.spec.in index 50d1ff002727..519f0d257b57 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -537,16 +537,13 @@ Requires: ceph-mgr = %{_epoch_prefix}%{version}-%{release} Requires: python%{_python_buildid}-numpy %if 0%{without python2} Requires: python3-scipy -Requires: python3-pandas %else Requires: python2-scipy -Requires: python2-pandas %endif %endif %if 0%{?rhel} == 7 Requires: numpy Requires: scipy -Requires: pandas %endif %description mgr-diskprediction-local ceph-mgr-diskprediction-local is a ceph-mgr plugin that tries to predict diff --git a/debian/control b/debian/control index 0c8d51ac2c9f..3f43f25e776e 100644 --- a/debian/control +++ b/debian/control @@ -269,7 +269,6 @@ Depends: ceph-mgr (= ${binary:Version}), python-numpy, python-scipy, python-sklearn, - python-pandas, ${misc:Depends}, ${python:Depends}, ${shlibs:Depends}, diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/config.json b/src/pybind/mgr/diskprediction_local/models/redhat/config.json index 127d6ea4d8b2..62a0d828264b 100644 --- 
a/src/pybind/mgr/diskprediction_local/models/redhat/config.json +++ b/src/pybind/mgr/diskprediction_local/models/redhat/config.json @@ -1,3 +1,4 @@ { -"seagate": ["user_capacity", "smart_1_raw", "smart_5_raw", "smart_7_raw", "smart_10_raw", "smart_187_raw", "smart_188_raw", "smart_190_raw", "smart_193_raw", "smart_197_raw", "smart_198_raw", "smart_241_raw", "smart_1_normalized", "smart_5_normalized", "smart_7_normalized", "smart_10_normalized", "smart_187_normalized", "smart_188_normalized", "smart_190_normalized", "smart_193_normalized", "smart_197_normalized", "smart_198_normalized", "smart_241_normalized"] +"seagate": ["user_capacity", "smart_1_raw", "smart_5_raw", "smart_7_raw", "smart_10_raw", "smart_187_raw", "smart_188_raw", "smart_190_raw", "smart_193_raw", "smart_197_raw", "smart_198_raw", "smart_241_raw", "smart_1_normalized", "smart_5_normalized", "smart_7_normalized", "smart_10_normalized", "smart_187_normalized", "smart_188_normalized", "smart_190_normalized", "smart_193_normalized", "smart_197_normalized", "smart_198_normalized", "smart_241_normalized"], +"hgst": ["user_capacity", "smart_1_normalized", "smart_1_raw", "smart_2_normalized", "smart_2_raw", "smart_3_normalized", "smart_3_raw", "smart_4_raw", "smart_5_normalized", "smart_5_raw", "smart_7_normalized", "smart_7_raw", "smart_8_normalized", "smart_8_raw", "smart_9_normalized", "smart_9_raw", "smart_10_normalized", "smart_10_raw", "smart_12_raw", "smart_192_normalized", "smart_192_raw", "smart_193_normalized", "smart_193_raw", "smart_194_normalized", "smart_194_raw", "smart_196_normalized", "smart_196_raw", "smart_197_normalized", "smart_197_raw", "smart_198_raw", "smart_199_raw"] } diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/hgst_predictor.pkl b/src/pybind/mgr/diskprediction_local/models/redhat/hgst_predictor.pkl new file mode 100644 index 000000000000..9894d9f553bf Binary files /dev/null and b/src/pybind/mgr/diskprediction_local/models/redhat/hgst_predictor.pkl differ diff 
--git a/src/pybind/mgr/diskprediction_local/models/redhat/hgst_scaler.pkl b/src/pybind/mgr/diskprediction_local/models/redhat/hgst_scaler.pkl new file mode 100644 index 000000000000..6f77b85cc38c Binary files /dev/null and b/src/pybind/mgr/diskprediction_local/models/redhat/hgst_scaler.pkl differ diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.joblib b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.joblib deleted file mode 100644 index ee7d420a2fb4..000000000000 Binary files a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.joblib and /dev/null differ diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.pkl b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.pkl new file mode 100644 index 000000000000..280d59a088c3 Binary files /dev/null and b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.pkl differ diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.joblib b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.joblib deleted file mode 100644 index 0b769b983f4e..000000000000 Binary files a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.joblib and /dev/null differ diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.pkl b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.pkl new file mode 100644 index 000000000000..691bb03c5b49 Binary files /dev/null and b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.pkl differ diff --git a/src/pybind/mgr/diskprediction_local/predictor.py b/src/pybind/mgr/diskprediction_local/predictor.py index 1a19bf61a520..548454145cea 100644 --- a/src/pybind/mgr/diskprediction_local/predictor.py +++ b/src/pybind/mgr/diskprediction_local/predictor.py @@ -23,12 +23,10 @@ An example code is as follows: """ import os import json -import joblib import pickle import logging import numpy as np 
-import pandas as pd from scipy import stats @@ -90,10 +88,10 @@ class RHDiskFailurePredictor(object): # ensure all manufacturers whose context is defined in config file # have models and scalers saved inside model_dirpath for manufacturer in self.model_context: - scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.joblib") + scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.pkl") if not os.path.isfile(scaler_path): return "Missing scaler file: {}".format(scaler_path) - model_path = os.path.join(model_dirpath, manufacturer + "_predictor.joblib") + model_path = os.path.join(model_dirpath, manufacturer + "_predictor.pkl") if not os.path.isfile(model_path): return "Missing model file: {}".format(model_path) @@ -123,45 +121,56 @@ class RHDiskFailurePredictor(object): ) return None - # convert to dataframe, keeping only the required features + # convert to structured array, keeping only the required features + # assumes all data is in float64 dtype try: - disk_days_df = pd.DataFrame(disk_days, columns=model_smart_attr) + struc_dtypes = [(attr, np.float64) for attr in model_smart_attr] + values = [tuple(day[attr] for attr in model_smart_attr) for day in disk_days] + disk_days_sa = np.array(values, dtype=struc_dtypes) except KeyError as e: RHDiskFailurePredictor.LOGGER.debug( "Mismatch in SMART attributes used to train model and SMART attributes available" ) return None + # view structured array as 2d array for applying rolling window transforms + # do not include capacity_bytes in this. 
only use smart_attrs + disk_days_attrs = disk_days_sa[[attr for attr in model_smart_attr if 'smart_' in attr]]\ + .view(np.float64).reshape(disk_days_sa.shape + (-1,)) + # featurize n (6 to 12) days data - mean,std,coefficient of variation # current model is trained on 6 days of data because that is what will be # available at runtime - # NOTE: ensure unique indices so that features can be merged w/ pandas errors - disk_days_df = disk_days_df.reset_index(drop=True) - means = disk_days_df.drop("user_capacity", axis=1).rolling(6).mean() - stds = disk_days_df.drop("user_capacity", axis=1).rolling(6).std() - cvs = stds.divide(means, fill_value=0) - - # rename and combine features into one df - means = means.rename(columns={col: "mean_" + col for col in means.columns}) - stds = stds.rename(columns={col: "std_" + col for col in stds.columns}) - cvs = cvs.rename(columns={col: "cv_" + col for col in cvs.columns}) - featurized_df = means.merge(stds, left_index=True, right_index=True) - featurized_df = featurized_df.merge(cvs, left_index=True, right_index=True) - - # drop rows where all features (mean,std,cv) are nans - featurized_df = featurized_df.dropna(how="all") - - # fill nans created by cv calculation - featurized_df = featurized_df.fillna(0) - # capacity is not a feature that varies over time - featurized_df["user_capacity"] = disk_days_df["user_capacity"].iloc[0] + # rolling time window interval size in days + roll_window_size = 6 + + # rolling means generator + gen = (disk_days_attrs[i: i + roll_window_size, ...].mean(axis=0) \ + for i in range(0, disk_days_attrs.shape[0] - roll_window_size + 1)) + means = np.vstack(gen) + + # rolling stds generator + gen = (disk_days_attrs[i: i + roll_window_size, ...].std(axis=0, ddof=1) \ + for i in range(0, disk_days_attrs.shape[0] - roll_window_size + 1)) + stds = np.vstack(gen) + + # coefficient of variation + cvs = stds / means + cvs[np.isnan(cvs)] = 0 + featurized = np.hstack(( + means, + stds, + cvs, + 
disk_days_sa['user_capacity'][: disk_days_attrs.shape[0] - roll_window_size + 1].reshape(-1, 1) + )) # scale features - scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.joblib") - scaler = joblib.load(scaler_path) - featurized_df = scaler.transform(featurized_df) - return featurized_df + scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.pkl") + with open(scaler_path, 'rb') as f: + scaler = pickle.load(f) + featurized = scaler.transform(featurized) + return featurized @staticmethod def __get_manufacturer(model_name): @@ -210,9 +219,10 @@ class RHDiskFailurePredictor(object): # get model for current manufacturer model_path = os.path.join( - self.model_dirpath, manufacturer + "_predictor.joblib" + self.model_dirpath, manufacturer + "_predictor.pkl" ) - model = joblib.load(model_path) + with open(model_path, 'rb') as f: + model = pickle.load(f) # use prediction for most recent day # TODO: ensure that most recent day is last element and most previous day diff --git a/src/pybind/mgr/diskprediction_local/requirements.txt b/src/pybind/mgr/diskprediction_local/requirements.txt index f524b07a1e74..d9c3157fdf93 100644 --- a/src/pybind/mgr/diskprediction_local/requirements.txt +++ b/src/pybind/mgr/diskprediction_local/requirements.txt @@ -1,4 +1,3 @@ numpy==1.15.1 scipy==1.1.0 -pandas==0.23.4 scikit-learn==0.19.2