From: Karanraj Chauhan Date: Wed, 6 Nov 2019 18:46:45 +0000 (-0500) Subject: mgr/diskprediction_local: Reverted dependencies, added HGST models X-Git-Tag: v15.1.0~981^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F29437%2Fhead;p=ceph.git mgr/diskprediction_local: Reverted dependencies, added HGST models Removed pandas from requirements.txt, ceph.spec.in, and debian/control because of installation issues in RHEL/CentOS. Replaced pandas usages in RHDiskFailurePredictor with similar numpy counterparts (e.g. structured array instead of dataframe) Replaced joblib usages with pickle because older version of scikit-learn did not list joblib as a dependency and so it wasn't getting installed. Using joblib would have required specifying it as a separate dependency in spec file and requirements. Added HGST models for RHDiskFailurePredictor. Signed-off-by: Karanraj Chauhan --- diff --git a/ceph.spec.in b/ceph.spec.in index 50d1ff002727..519f0d257b57 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -537,16 +537,13 @@ Requires: ceph-mgr = %{_epoch_prefix}%{version}-%{release} Requires: python%{_python_buildid}-numpy %if 0%{without python2} Requires: python3-scipy -Requires: python3-pandas %else Requires: python2-scipy -Requires: python2-pandas %endif %endif %if 0%{?rhel} == 7 Requires: numpy Requires: scipy -Requires: pandas %endif %description mgr-diskprediction-local ceph-mgr-diskprediction-local is a ceph-mgr plugin that tries to predict diff --git a/debian/control b/debian/control index 0c8d51ac2c9f..3f43f25e776e 100644 --- a/debian/control +++ b/debian/control @@ -269,7 +269,6 @@ Depends: ceph-mgr (= ${binary:Version}), python-numpy, python-scipy, python-sklearn, - python-pandas, ${misc:Depends}, ${python:Depends}, ${shlibs:Depends}, diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/config.json b/src/pybind/mgr/diskprediction_local/models/redhat/config.json index 127d6ea4d8b2..62a0d828264b 100644 --- 
a/src/pybind/mgr/diskprediction_local/models/redhat/config.json +++ b/src/pybind/mgr/diskprediction_local/models/redhat/config.json @@ -1,3 +1,4 @@ { -"seagate": ["user_capacity", "smart_1_raw", "smart_5_raw", "smart_7_raw", "smart_10_raw", "smart_187_raw", "smart_188_raw", "smart_190_raw", "smart_193_raw", "smart_197_raw", "smart_198_raw", "smart_241_raw", "smart_1_normalized", "smart_5_normalized", "smart_7_normalized", "smart_10_normalized", "smart_187_normalized", "smart_188_normalized", "smart_190_normalized", "smart_193_normalized", "smart_197_normalized", "smart_198_normalized", "smart_241_normalized"] +"seagate": ["user_capacity", "smart_1_raw", "smart_5_raw", "smart_7_raw", "smart_10_raw", "smart_187_raw", "smart_188_raw", "smart_190_raw", "smart_193_raw", "smart_197_raw", "smart_198_raw", "smart_241_raw", "smart_1_normalized", "smart_5_normalized", "smart_7_normalized", "smart_10_normalized", "smart_187_normalized", "smart_188_normalized", "smart_190_normalized", "smart_193_normalized", "smart_197_normalized", "smart_198_normalized", "smart_241_normalized"], +"hgst": ["user_capacity", "smart_1_normalized", "smart_1_raw", "smart_2_normalized", "smart_2_raw", "smart_3_normalized", "smart_3_raw", "smart_4_raw", "smart_5_normalized", "smart_5_raw", "smart_7_normalized", "smart_7_raw", "smart_8_normalized", "smart_8_raw", "smart_9_normalized", "smart_9_raw", "smart_10_normalized", "smart_10_raw", "smart_12_raw", "smart_192_normalized", "smart_192_raw", "smart_193_normalized", "smart_193_raw", "smart_194_normalized", "smart_194_raw", "smart_196_normalized", "smart_196_raw", "smart_197_normalized", "smart_197_raw", "smart_198_raw", "smart_199_raw"] } diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/hgst_predictor.pkl b/src/pybind/mgr/diskprediction_local/models/redhat/hgst_predictor.pkl new file mode 100644 index 000000000000..9894d9f553bf Binary files /dev/null and b/src/pybind/mgr/diskprediction_local/models/redhat/hgst_predictor.pkl differ diff 
--git a/src/pybind/mgr/diskprediction_local/models/redhat/hgst_scaler.pkl b/src/pybind/mgr/diskprediction_local/models/redhat/hgst_scaler.pkl new file mode 100644 index 000000000000..6f77b85cc38c Binary files /dev/null and b/src/pybind/mgr/diskprediction_local/models/redhat/hgst_scaler.pkl differ diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.joblib b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.joblib deleted file mode 100644 index ee7d420a2fb4..000000000000 Binary files a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.joblib and /dev/null differ diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.pkl b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.pkl new file mode 100644 index 000000000000..280d59a088c3 Binary files /dev/null and b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_predictor.pkl differ diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.joblib b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.joblib deleted file mode 100644 index 0b769b983f4e..000000000000 Binary files a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.joblib and /dev/null differ diff --git a/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.pkl b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.pkl new file mode 100644 index 000000000000..691bb03c5b49 Binary files /dev/null and b/src/pybind/mgr/diskprediction_local/models/redhat/seagate_scaler.pkl differ diff --git a/src/pybind/mgr/diskprediction_local/predictor.py b/src/pybind/mgr/diskprediction_local/predictor.py index 1a19bf61a520..548454145cea 100644 --- a/src/pybind/mgr/diskprediction_local/predictor.py +++ b/src/pybind/mgr/diskprediction_local/predictor.py @@ -23,12 +23,10 @@ An example code is as follows: """ import os import json -import joblib import pickle import logging import numpy as np 
-import pandas as pd from scipy import stats @@ -90,10 +88,10 @@ class RHDiskFailurePredictor(object): # ensure all manufacturers whose context is defined in config file # have models and scalers saved inside model_dirpath for manufacturer in self.model_context: - scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.joblib") + scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.pkl") if not os.path.isfile(scaler_path): return "Missing scaler file: {}".format(scaler_path) - model_path = os.path.join(model_dirpath, manufacturer + "_predictor.joblib") + model_path = os.path.join(model_dirpath, manufacturer + "_predictor.pkl") if not os.path.isfile(model_path): return "Missing model file: {}".format(model_path) @@ -123,45 +121,56 @@ class RHDiskFailurePredictor(object): ) return None - # convert to dataframe, keeping only the required features + # convert to structured array, keeping only the required features + # assumes all data is in float64 dtype try: - disk_days_df = pd.DataFrame(disk_days, columns=model_smart_attr) + struc_dtypes = [(attr, np.float64) for attr in model_smart_attr] + values = [tuple(day[attr] for attr in model_smart_attr) for day in disk_days] + disk_days_sa = np.array(values, dtype=struc_dtypes) except KeyError as e: RHDiskFailurePredictor.LOGGER.debug( "Mismatch in SMART attributes used to train model and SMART attributes available" ) return None + # view structured array as 2d array for applying rolling window transforms + # do not include capacity_bytes in this. 
only use smart_attrs + disk_days_attrs = disk_days_sa[[attr for attr in model_smart_attr if 'smart_' in attr]]\ + .view(np.float64).reshape(disk_days_sa.shape + (-1,)) + # featurize n (6 to 12) days data - mean,std,coefficient of variation # current model is trained on 6 days of data because that is what will be # available at runtime - # NOTE: ensure unique indices so that features can be merged w/ pandas errors - disk_days_df = disk_days_df.reset_index(drop=True) - means = disk_days_df.drop("user_capacity", axis=1).rolling(6).mean() - stds = disk_days_df.drop("user_capacity", axis=1).rolling(6).std() - cvs = stds.divide(means, fill_value=0) - - # rename and combine features into one df - means = means.rename(columns={col: "mean_" + col for col in means.columns}) - stds = stds.rename(columns={col: "std_" + col for col in stds.columns}) - cvs = cvs.rename(columns={col: "cv_" + col for col in cvs.columns}) - featurized_df = means.merge(stds, left_index=True, right_index=True) - featurized_df = featurized_df.merge(cvs, left_index=True, right_index=True) - - # drop rows where all features (mean,std,cv) are nans - featurized_df = featurized_df.dropna(how="all") - - # fill nans created by cv calculation - featurized_df = featurized_df.fillna(0) - # capacity is not a feature that varies over time - featurized_df["user_capacity"] = disk_days_df["user_capacity"].iloc[0] + # rolling time window interval size in days + roll_window_size = 6 + + # rolling means generator + gen = (disk_days_attrs[i: i + roll_window_size, ...].mean(axis=0) \ + for i in range(0, disk_days_attrs.shape[0] - roll_window_size + 1)) + means = np.vstack(gen) + + # rolling stds generator + gen = (disk_days_attrs[i: i + roll_window_size, ...].std(axis=0, ddof=1) \ + for i in range(0, disk_days_attrs.shape[0] - roll_window_size + 1)) + stds = np.vstack(gen) + + # coefficient of variation + cvs = stds / means + cvs[np.isnan(cvs)] = 0 + featurized = np.hstack(( + means, + stds, + cvs, + 
disk_days_sa['user_capacity'][: disk_days_attrs.shape[0] - roll_window_size + 1].reshape(-1, 1) + )) # scale features - scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.joblib") - scaler = joblib.load(scaler_path) - featurized_df = scaler.transform(featurized_df) - return featurized_df + scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.pkl") + with open(scaler_path, 'rb') as f: + scaler = pickle.load(f) + featurized = scaler.transform(featurized) + return featurized @staticmethod def __get_manufacturer(model_name): @@ -210,9 +219,10 @@ class RHDiskFailurePredictor(object): # get model for current manufacturer model_path = os.path.join( - self.model_dirpath, manufacturer + "_predictor.joblib" + self.model_dirpath, manufacturer + "_predictor.pkl" ) - model = joblib.load(model_path) + with open(model_path, 'rb') as f: + model = pickle.load(f) # use prediction for most recent day # TODO: ensure that most recent day is last element and most previous day diff --git a/src/pybind/mgr/diskprediction_local/requirements.txt b/src/pybind/mgr/diskprediction_local/requirements.txt index f524b07a1e74..d9c3157fdf93 100644 --- a/src/pybind/mgr/diskprediction_local/requirements.txt +++ b/src/pybind/mgr/diskprediction_local/requirements.txt @@ -1,4 +1,3 @@ numpy==1.15.1 scipy==1.1.0 -pandas==0.23.4 scikit-learn==0.19.2