]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/diskprediction_local: Updated prediction models to use only supported python...
authorKaranraj Chauhan <chauhank@bu.edu>
Tue, 10 Sep 2019 15:20:45 +0000 (11:20 -0400)
committerKaranraj Chauhan <chauhank@bu.edu>
Thu, 7 Nov 2019 13:38:44 +0000 (08:38 -0500)
Removed non-supported python packages from requirements.txt
Added scikit-learn based models, removed rgf-python based models.
Updated config.json and DiskPredictor.__preprocess for the same.
Also added manufacturer as argument to DiskPredictor.__preprocess

Updated manufacturer lookup - first check if available as smartctl field,
if not then try to infer from model name.
Updated predicted class to be the prediction for the most recent day in
time series data given.
Updated naming convention from "preprocessor" to "scaler".

Signed-off-by: Karanraj Chauhan <kachauha@redhat.com>
src/pybind/mgr/diskprediction_local/models/config.json
src/pybind/mgr/diskprediction_local/models/hgst_predictor.joblib [deleted file]
src/pybind/mgr/diskprediction_local/models/hgst_preprocessor.joblib [deleted file]
src/pybind/mgr/diskprediction_local/models/seagate_predictor.joblib
src/pybind/mgr/diskprediction_local/models/seagate_preprocessor.joblib [deleted file]
src/pybind/mgr/diskprediction_local/models/seagate_scaler.joblib [new file with mode: 0644]
src/pybind/mgr/diskprediction_local/predictor.py
src/pybind/mgr/diskprediction_local/requirements.txt

index f77cb11e06dd61a5b18db46cefe10c683f1dae88..127d6ea4d8b2292e59ee826af3bd647a3e23296c 100644 (file)
@@ -1,4 +1,3 @@
 {
-"hgst": ["user_capacity", "smart_1_normalized", "smart_1_raw", "smart_2_normalized", "smart_2_raw", "smart_3_normalized", "smart_3_raw", "smart_4_normalized", "smart_4_raw", "smart_5_normalized", "smart_5_raw", "smart_7_normalized", "smart_7_raw", "smart_8_normalized", "smart_8_raw", "smart_9_normalized", "smart_9_raw", "smart_10_normalized", "smart_10_raw", "smart_12_normalized", "smart_12_raw", "smart_22_normalized", "smart_22_raw", "smart_192_normalized", "smart_192_raw", "smart_193_normalized", "smart_193_raw", "smart_194_normalized", "smart_194_raw", "smart_196_normalized", "smart_196_raw", "smart_197_normalized", "smart_197_raw", "smart_198_normalized", "smart_198_raw", "smart_199_normalized", "smart_199_raw"],
-"seagate": ["user_capacity", "smart_1_normalized", "smart_1_raw", "smart_5_normalized", "smart_5_raw", "smart_7_normalized", "smart_7_raw", "smart_9_normalized", "smart_9_raw", "smart_10_normalized", "smart_10_raw", "smart_184_normalized", "smart_184_raw", "smart_187_normalized", "smart_187_raw", "smart_188_normalized", "smart_188_raw", "smart_189_normalized", "smart_189_raw", "smart_190_normalized", "smart_190_raw", "smart_193_normalized", "smart_193_raw", "smart_194_normalized", "smart_194_raw", "smart_197_normalized", "smart_197_raw", "smart_198_normalized", "smart_198_raw", "smart_240_normalized", "smart_240_raw", "smart_241_normalized", "smart_241_raw", "smart_242_normalized", "smart_242_raw"]
+"seagate": ["user_capacity", "smart_1_raw", "smart_5_raw", "smart_7_raw", "smart_10_raw", "smart_187_raw", "smart_188_raw", "smart_190_raw", "smart_193_raw", "smart_197_raw", "smart_198_raw", "smart_241_raw", "smart_1_normalized", "smart_5_normalized", "smart_7_normalized", "smart_10_normalized", "smart_187_normalized", "smart_188_normalized", "smart_190_normalized", "smart_193_normalized", "smart_197_normalized", "smart_198_normalized", "smart_241_normalized"]
 }
diff --git a/src/pybind/mgr/diskprediction_local/models/hgst_predictor.joblib b/src/pybind/mgr/diskprediction_local/models/hgst_predictor.joblib
deleted file mode 100644 (file)
index 9e1c51f..0000000
Binary files a/src/pybind/mgr/diskprediction_local/models/hgst_predictor.joblib and /dev/null differ
diff --git a/src/pybind/mgr/diskprediction_local/models/hgst_preprocessor.joblib b/src/pybind/mgr/diskprediction_local/models/hgst_preprocessor.joblib
deleted file mode 100644 (file)
index 2d94963..0000000
Binary files a/src/pybind/mgr/diskprediction_local/models/hgst_preprocessor.joblib and /dev/null differ
index 574223e668b9332669d8391901d897e1d350e33f..ee7d420a2fb45afdab3c205cd53e2c89d3fad5a1 100644 (file)
Binary files a/src/pybind/mgr/diskprediction_local/models/seagate_predictor.joblib and b/src/pybind/mgr/diskprediction_local/models/seagate_predictor.joblib differ
diff --git a/src/pybind/mgr/diskprediction_local/models/seagate_preprocessor.joblib b/src/pybind/mgr/diskprediction_local/models/seagate_preprocessor.joblib
deleted file mode 100644 (file)
index 34f96ab..0000000
Binary files a/src/pybind/mgr/diskprediction_local/models/seagate_preprocessor.joblib and /dev/null differ
diff --git a/src/pybind/mgr/diskprediction_local/models/seagate_scaler.joblib b/src/pybind/mgr/diskprediction_local/models/seagate_scaler.joblib
new file mode 100644 (file)
index 0000000..0b769b9
Binary files /dev/null and b/src/pybind/mgr/diskprediction_local/models/seagate_scaler.joblib differ
index 3ddd934666258ffce7b5b60b6e7c68ee191d97e1..5464b2ec4297b52cdf0e1d97ce248a0cc617ac5d 100644 (file)
@@ -38,14 +38,23 @@ class DiskFailurePredictor(object):
 
     This class implements a disk failure prediction module.
     """
+
     # json with manufacturer names as keys
     # and features used for prediction as values
     CONFIG_FILE = "config.json"
-    PREDICTION_CLASSES = {-1: "Unknown",
-                          0: "Good",
-                          1: "Warning",
-                          2: "Bad"}
-
+    PREDICTION_CLASSES = {-1: "Unknown", 0: "Good", 1: "Warning", 2: "Bad"}
+
+    # model name prefixes to identify vendor
+    MANUFACTURER_MODELNAME_PREFIXES = {
+        "WDC": "WDC",
+        "Toshiba": "Toshiba",  # for cases like "Toshiba xxx"
+        "TOSHIBA": "Toshiba",  # for cases like "TOSHIBA xxx"
+        "toshiba": "Toshiba",  # for cases like "toshiba xxx"
+        "S": "Seagate",        # for cases like "STxxxx" and "Seagate BarraCuda ZAxxx"
+        "ZA": "Seagate",       # for cases like "ZAxxxx"
+        "Hitachi": "Hitachi",
+        "HGST": "HGST",
+    }
 
     def __init__(self):
         """
@@ -54,7 +63,6 @@ class DiskFailurePredictor(object):
         self.model_dirpath = ""
         self.model_context = {}
 
-
     def initialize(self, model_dirpath):
         """Initialize all models. Save paths of all trained model files to list
 
@@ -73,18 +81,17 @@ class DiskFailurePredictor(object):
                 self.model_context = json.load(f_conf)
 
         # ensure all manufacturers whose context is defined in config file
-        # have models and preprocessors saved inside model_dirpath
+        # have models and scalers saved inside model_dirpath
         for manufacturer in self.model_context:
-            preprocessor_path = os.path.join(model_dirpath, manufacturer + '_preprocessor.joblib')
-            if not os.path.isfile(preprocessor_path):
-                return "Missing preprocessor file: {}".format(preprocessor_path)
-            model_path = os.path.join(model_dirpath, manufacturer + '_predictor.joblib')
+            scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.joblib")
+            if not os.path.isfile(scaler_path):
+                return "Missing scaler file: {}".format(scaler_path)
+            model_path = os.path.join(model_dirpath, manufacturer + "_predictor.joblib")
             if not os.path.isfile(model_path):
                 return "Missing model file: {}".format(model_path)
 
         self.model_dirpath = model_dirpath
 
-
     def __format_raw_data(self, disk_days):
         """Massages the input raw data into a form that can be used by the
         predictor for preprocessing, feeding to model, etc. Specifically,
@@ -103,53 +110,87 @@ class DiskFailurePredictor(object):
         df = pd.DataFrame(disk_days)
 
         # change from dict type {'bytes': 123} to just float64 type 123
-        df['user_capacity'] = df['user_capacity'].apply(lambda x: x['bytes'])
+        df["user_capacity"] = df["user_capacity"].apply(lambda x: x["bytes"])
 
         # change from dict type {'table': [{}, {}, {}]}  to list type [{}, {}, {}]
-        df['ata_smart_attributes'] = df['ata_smart_attributes'].apply(lambda x: x['table'])
+        df["ata_smart_attributes"] = df["ata_smart_attributes"].apply(
+            lambda x: x["table"]
+        )
 
         # make a separate column for raw and normalized values of each smart id
         for day_idx in range(len(disk_days)):
-            for attr_dict in df.iloc[0]['ata_smart_attributes']:
-                smart_id = attr_dict['id']
-                df.at[day_idx, 'smart_{}_raw'.format(smart_id)] = int(attr_dict['raw']['value'])
-                df.at[day_idx, 'smart_{}_normalized'.format(smart_id)] = int(attr_dict['value'])
+            for attr_dict in df.iloc[0]["ata_smart_attributes"]:
+                smart_id = attr_dict["id"]
+                df.at[day_idx, "smart_{}_raw".format(smart_id)] = int(
+                    attr_dict["raw"]["value"]
+                )
+                df.at[day_idx, "smart_{}_normalized".format(smart_id)] = int(
+                    attr_dict["value"]
+                )
 
         # drop the now-redundant column
-        df = df.drop('ata_smart_attributes', axis=1)
+        df = df.drop("ata_smart_attributes", axis=1)
         return df
 
-
-    def __preprocess(self, disk_days_df):
+    def __preprocess(self, disk_days_df, manufacturer):
         """Scales and transforms input dataframe to feed it to prediction model
 
         Arguments:
             disk_days_df {pandas.DataFrame} -- df where each row holds drive
                                                 features from one day.
+            manufacturer {str} -- manufacturer of the hard drive
 
         Returns:
             numpy.ndarray -- (n, d) shaped array of n days worth of data and d
                                 features, scaled
         """
-        # preprocessing may vary across manufactueres. so get manufacturer
-        manufacturer = DiskFailurePredictor.__get_manufacturer(disk_days_df['model_name'].iloc[0]).lower()
-
-        # keep only the features used for prediction for current manufacturer
+        # get the attributes that were used to train model for current manufacturer
         try:
-            disk_days_df = disk_days_df[self.model_context[manufacturer]]
+            model_smart_attr = self.model_context[manufacturer]
         except KeyError as e:
-            # TODO: change to log.error
-            print("Either SMART attributes mismatch for hard drive and prediction model,\
-                 or 'model_name' not available in input data")
-            print(e)
+            print("No context (SMART attributes on which model has been trained) found for manufacturer: {}"\
+                .format(manufacturer)
+            )
             return None
 
-        # scale raw data
-        preprocessor_path = os.path.join(self.model_dirpath, manufacturer + '_preprocessor.joblib')
-        preprocessor = joblib.load(preprocessor_path)
-        disk_days_df = preprocessor.transform(disk_days_df)
-        return disk_days_df
+        # keep only the required features
+        try:
+            disk_days_df = disk_days_df[model_smart_attr]
+        except KeyError as e:
+            print("Mismatch in SMART attributes used to train model and SMART attributes available")
+            return None
 
+        # featurize n (6 to 12) days data - mean,std,coefficient of variation
+        # current model is trained on 6 days of data because that is what will be
+        # available at runtime
+        # NOTE: ensure unique indices so that features can be merged w/ pandas errors
+        disk_days_df = disk_days_df.reset_index(drop=True)
+        means = disk_days_df.drop("user_capacity", axis=1).rolling(6).mean()
+        stds = disk_days_df.drop("user_capacity", axis=1).rolling(6).std()
+        cvs = stds.divide(means, fill_value=0)
+
+        # rename and combine features into one df
+        means = means.rename(columns={col: "mean_" + col for col in means.columns})
+        stds = stds.rename(columns={col: "std_" + col for col in stds.columns})
+        cvs = cvs.rename(columns={col: "cv_" + col for col in cvs.columns})
+        featurized_df = means.merge(stds, left_index=True, right_index=True)
+        featurized_df = featurized_df.merge(cvs, left_index=True, right_index=True)
+
+        # drop rows where all features (mean,std,cv) are nans
+        featurized_df = featurized_df.dropna(how="all")
+
+        # fill nans created by cv calculation
+        featurized_df = featurized_df.fillna(0)
+
+        # capacity is not a feature that varies over time
+        # FIXME: will this values roll over
+        featurized_df["user_capacity"] = disk_days_df["user_capacity"]
+
+        # scale features
+        scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.joblib")
+        scaler = joblib.load(scaler_path)
+        featurized_df = scaler.transform(featurized_df)
+        return featurized_df
 
     @staticmethod
     def __get_manufacturer(model_name):
@@ -161,35 +202,45 @@ class DiskFailurePredictor(object):
         Returns:
             str -- manufacturer name
         """
-        if model_name.startswith("W"):
-            return "WDC"
-        elif model_name.startswith("T"):
-            return "Toshiba"
-        elif model_name.startswith("S"):
-            return "Seagate"
-        elif model_name.startswith("Hi"):
-            return "Hitachi"
-        else:
-            return "HGST"
-
+        for prefix, manufacturer in DiskFailurePredictor.MANUFACTURER_MODELNAME_PREFIXES.items():
+            if model_name.startswith(prefix):
+                return manufacturer
+        # print error message
+        print("Could not infer manufacturer from model name {}".format(model_name))
 
     def predict(self, disk_days):
         # massage data into a format that can be fed to models
         raw_df = self.__format_raw_data(disk_days)
 
-        # preprocess
-        preprocessed_data = self.__preprocess(raw_df)
+        # get manufacturer preferably as a smartctl attribute
+        # if not available then infer using model name
+        try:
+            manufacturer = raw_df["vendor"].iloc[0]
+        except KeyError as e:
+            print('"vendor" field not found in smartctl output. Will try to infer manufacturer from model name.')
+            manufacturer = DiskFailurePredictor.__get_manufacturer(raw_df["model_name"].iloc[0]).lower()
+
+        # print error message, return Unknown, and continue execution
+        if manufacturer is None:
+            print(
+                "Manufacturer could not be determiend. This may be because \
+                DiskPredictor has never encountered this manufacturer before, \
+                    or the model name is not according to the manufacturer's \
+                        naming conventions known to DiskPredictor"
+            )
+            return DiskFailurePredictor.PREDICTION_CLASSES[-1]
+
+        # preprocess for feeding to model
+        preprocessed_data = self.__preprocess(raw_df, manufacturer)
         if preprocessed_data is None:
             return DiskFailurePredictor.PREDICTION_CLASSES[-1]
 
         # get model for current manufacturer
-        manufacturer = self.__get_manufacturer(raw_df['model_name'].iloc[0]).lower()
-        model_path = os.path.join(self.model_dirpath, manufacturer + '_predictor.joblib')
+        model_path = os.path.join(
+            self.model_dirpath, manufacturer + "_predictor.joblib"
+        )
         model = joblib.load(model_path)
 
-        # predictions for each day
-        preds = model.predict(preprocessed_data)
-
-        # use majority vote to decide class. raise if a nan prediction exists
-        pred_class_id = stats.mode(preds, nan_policy='raise').mode[0]
+        # use prediction for last day
+        pred_class_id = model.predict(preprocessed_data)[-1]
         return DiskFailurePredictor.PREDICTION_CLASSES[pred_class_id]
index 8769b42e6010594ca7b0a8ad6f5ee4b02ca46116..4bfcec0fcc987fddaaec4f7821ecc3f13ad483aa 100644 (file)
@@ -1,6 +1,5 @@
-numpy==1.16.4
-scipy==1.2.1
-pandas==0.25.0
-joblib==0.13.2
-scikit-learn==0.21.2
-rgf-python==3.6.0
+numpy==1.15.1
+scipy==1.1.0
+pandas==0.23.4
+joblib==0.11
+scikit-learn==0.19.2