mgr/diskprediction_local: Updated prediction models to use only supported python...

author Karanraj Chauhan <chauhank@bu.edu>

Tue, 10 Sep 2019 15:20:45 +0000 (11:20 -0400)

committer Karanraj Chauhan <chauhank@bu.edu>

Thu, 7 Nov 2019 13:38:44 +0000 (08:38 -0500)
author Karanraj Chauhan <chauhank@bu.edu>
Tue, 10 Sep 2019 15:20:45 +0000 (11:20 -0400)
committer Karanraj Chauhan <chauhank@bu.edu>
Thu, 7 Nov 2019 13:38:44 +0000 (08:38 -0500)
diff --git a/src/pybind/mgr/diskprediction_local/models/config.json b/src/pybind/mgr/diskprediction_local/models/config.json

index f77cb11e06dd61a5b18db46cefe10c683f1dae88..127d6ea4d8b2292e59ee826af3bd647a3e23296c 100644 (file)
--- a/src/pybind/mgr/diskprediction_local/models/config.json
+++ b/src/pybind/mgr/diskprediction_local/models/config.json
@@ -1,4 +1,3 @@
  {
-"hgst": ["user_capacity", "smart_1_normalized", "smart_1_raw", "smart_2_normalized", "smart_2_raw", "smart_3_normalized", "smart_3_raw", "smart_4_normalized", "smart_4_raw", "smart_5_normalized", "smart_5_raw", "smart_7_normalized", "smart_7_raw", "smart_8_normalized", "smart_8_raw", "smart_9_normalized", "smart_9_raw", "smart_10_normalized", "smart_10_raw", "smart_12_normalized", "smart_12_raw", "smart_22_normalized", "smart_22_raw", "smart_192_normalized", "smart_192_raw", "smart_193_normalized", "smart_193_raw", "smart_194_normalized", "smart_194_raw", "smart_196_normalized", "smart_196_raw", "smart_197_normalized", "smart_197_raw", "smart_198_normalized", "smart_198_raw", "smart_199_normalized", "smart_199_raw"],
-"seagate": ["user_capacity", "smart_1_normalized", "smart_1_raw", "smart_5_normalized", "smart_5_raw", "smart_7_normalized", "smart_7_raw", "smart_9_normalized", "smart_9_raw", "smart_10_normalized", "smart_10_raw", "smart_184_normalized", "smart_184_raw", "smart_187_normalized", "smart_187_raw", "smart_188_normalized", "smart_188_raw", "smart_189_normalized", "smart_189_raw", "smart_190_normalized", "smart_190_raw", "smart_193_normalized", "smart_193_raw", "smart_194_normalized", "smart_194_raw", "smart_197_normalized", "smart_197_raw", "smart_198_normalized", "smart_198_raw", "smart_240_normalized", "smart_240_raw", "smart_241_normalized", "smart_241_raw", "smart_242_normalized", "smart_242_raw"]
+"seagate": ["user_capacity", "smart_1_raw", "smart_5_raw", "smart_7_raw", "smart_10_raw", "smart_187_raw", "smart_188_raw", "smart_190_raw", "smart_193_raw", "smart_197_raw", "smart_198_raw", "smart_241_raw", "smart_1_normalized", "smart_5_normalized", "smart_7_normalized", "smart_10_normalized", "smart_187_normalized", "smart_188_normalized", "smart_190_normalized", "smart_193_normalized", "smart_197_normalized", "smart_198_normalized", "smart_241_normalized"]
  }
diff --git a/src/pybind/mgr/diskprediction_local/models/hgst_predictor.joblib b/src/pybind/mgr/diskprediction_local/models/hgst_predictor.joblib

deleted file mode 100644 (file)

index 9e1c51f..0000000

Binary files a/src/pybind/mgr/diskprediction_local/models/hgst_predictor.joblib and /dev/null differ
diff --git a/src/pybind/mgr/diskprediction_local/models/hgst_preprocessor.joblib b/src/pybind/mgr/diskprediction_local/models/hgst_preprocessor.joblib

deleted file mode 100644 (file)

index 2d94963..0000000

Binary files a/src/pybind/mgr/diskprediction_local/models/hgst_preprocessor.joblib and /dev/null differ
diff --git a/src/pybind/mgr/diskprediction_local/models/seagate_predictor.joblib b/src/pybind/mgr/diskprediction_local/models/seagate_predictor.joblib

index 574223e668b9332669d8391901d897e1d350e33f..ee7d420a2fb45afdab3c205cd53e2c89d3fad5a1 100644 (file)

Binary files a/src/pybind/mgr/diskprediction_local/models/seagate_predictor.joblib and b/src/pybind/mgr/diskprediction_local/models/seagate_predictor.joblib differ
diff --git a/src/pybind/mgr/diskprediction_local/models/seagate_preprocessor.joblib b/src/pybind/mgr/diskprediction_local/models/seagate_preprocessor.joblib

deleted file mode 100644 (file)

index 34f96ab..0000000

Binary files a/src/pybind/mgr/diskprediction_local/models/seagate_preprocessor.joblib and /dev/null differ
diff --git a/src/pybind/mgr/diskprediction_local/models/seagate_scaler.joblib b/src/pybind/mgr/diskprediction_local/models/seagate_scaler.joblib

new file mode 100644 (file)

index 0000000..0b769b9

Binary files /dev/null and b/src/pybind/mgr/diskprediction_local/models/seagate_scaler.joblib differ
diff --git a/src/pybind/mgr/diskprediction_local/predictor.py b/src/pybind/mgr/diskprediction_local/predictor.py

index 3ddd934666258ffce7b5b60b6e7c68ee191d97e1..5464b2ec4297b52cdf0e1d97ce248a0cc617ac5d 100644 (file)
--- a/src/pybind/mgr/diskprediction_local/predictor.py
+++ b/src/pybind/mgr/diskprediction_local/predictor.py
@@ -38,14 +38,23 @@ class DiskFailurePredictor(object):
  
      This class implements a disk failure prediction module.
      """
+
      # json with manufacturer names as keys
      # and features used for prediction as values
      CONFIG_FILE = "config.json"
-    PREDICTION_CLASSES = {-1: "Unknown",
-                          0: "Good",
-                          1: "Warning",
-                          2: "Bad"}
-
+    PREDICTION_CLASSES = {-1: "Unknown", 0: "Good", 1: "Warning", 2: "Bad"}
+
+    # model name prefixes to identify vendor
+    MANUFACTURER_MODELNAME_PREFIXES = {
+        "WDC": "WDC",
+        "Toshiba": "Toshiba",  # for cases like "Toshiba xxx"
+        "TOSHIBA": "Toshiba",  # for cases like "TOSHIBA xxx"
+        "toshiba": "Toshiba",  # for cases like "toshiba xxx"
+        "S": "Seagate",        # for cases like "STxxxx" and "Seagate BarraCuda ZAxxx"
+        "ZA": "Seagate",       # for cases like "ZAxxxx"
+        "Hitachi": "Hitachi",
+        "HGST": "HGST",
+    }
  
      def __init__(self):
          """
@@ -54,7 +63,6 @@ class DiskFailurePredictor(object):
          self.model_dirpath = ""
          self.model_context = {}
  
-
      def initialize(self, model_dirpath):
          """Initialize all models. Save paths of all trained model files to list
  
@@ -73,18 +81,17 @@ class DiskFailurePredictor(object):
                  self.model_context = json.load(f_conf)
  
          # ensure all manufacturers whose context is defined in config file
-        # have models and preprocessors saved inside model_dirpath
+        # have models and scalers saved inside model_dirpath
          for manufacturer in self.model_context:
-            preprocessor_path = os.path.join(model_dirpath, manufacturer + '_preprocessor.joblib')
-            if not os.path.isfile(preprocessor_path):
-                return "Missing preprocessor file: {}".format(preprocessor_path)
-            model_path = os.path.join(model_dirpath, manufacturer + '_predictor.joblib')
+            scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.joblib")
+            if not os.path.isfile(scaler_path):
+                return "Missing scaler file: {}".format(scaler_path)
+            model_path = os.path.join(model_dirpath, manufacturer + "_predictor.joblib")
              if not os.path.isfile(model_path):
                  return "Missing model file: {}".format(model_path)
  
          self.model_dirpath = model_dirpath
  
-
      def __format_raw_data(self, disk_days):
          """Massages the input raw data into a form that can be used by the
          predictor for preprocessing, feeding to model, etc. Specifically,
@@ -103,53 +110,87 @@ class DiskFailurePredictor(object):
          df = pd.DataFrame(disk_days)
  
          # change from dict type {'bytes': 123} to just float64 type 123
-        df['user_capacity'] = df['user_capacity'].apply(lambda x: x['bytes'])
+        df["user_capacity"] = df["user_capacity"].apply(lambda x: x["bytes"])
  
          # change from dict type {'table': [{}, {}, {}]}  to list type [{}, {}, {}]
-        df['ata_smart_attributes'] = df['ata_smart_attributes'].apply(lambda x: x['table'])
+        df["ata_smart_attributes"] = df["ata_smart_attributes"].apply(
+            lambda x: x["table"]
+        )
  
          # make a separate column for raw and normalized values of each smart id
          for day_idx in range(len(disk_days)):
-            for attr_dict in df.iloc[0]['ata_smart_attributes']:
-                smart_id = attr_dict['id']
-                df.at[day_idx, 'smart_{}_raw'.format(smart_id)] = int(attr_dict['raw']['value'])
-                df.at[day_idx, 'smart_{}_normalized'.format(smart_id)] = int(attr_dict['value'])
+            for attr_dict in df.iloc[0]["ata_smart_attributes"]:
+                smart_id = attr_dict["id"]
+                df.at[day_idx, "smart_{}_raw".format(smart_id)] = int(
+                    attr_dict["raw"]["value"]
+                )
+                df.at[day_idx, "smart_{}_normalized".format(smart_id)] = int(
+                    attr_dict["value"]
+                )
  
          # drop the now-redundant column
-        df = df.drop('ata_smart_attributes', axis=1)
+        df = df.drop("ata_smart_attributes", axis=1)
          return df
  
-
-    def __preprocess(self, disk_days_df):
+    def __preprocess(self, disk_days_df, manufacturer):
          """Scales and transforms input dataframe to feed it to prediction model
  
          Arguments:
              disk_days_df {pandas.DataFrame} -- df where each row holds drive
                                                  features from one day.
+            manufacturer {str} -- manufacturer of the hard drive
  
          Returns:
              numpy.ndarray -- (n, d) shaped array of n days worth of data and d
                                  features, scaled
          """
-        # preprocessing may vary across manufactueres. so get manufacturer
-        manufacturer = DiskFailurePredictor.__get_manufacturer(disk_days_df['model_name'].iloc[0]).lower()
-
-        # keep only the features used for prediction for current manufacturer
+        # get the attributes that were used to train model for current manufacturer
          try:
-            disk_days_df = disk_days_df[self.model_context[manufacturer]]
+            model_smart_attr = self.model_context[manufacturer]
          except KeyError as e:
-            # TODO: change to log.error
-            print("Either SMART attributes mismatch for hard drive and prediction model,\
-                 or 'model_name' not available in input data")
-            print(e)
+            print("No context (SMART attributes on which model has been trained) found for manufacturer: {}"\
+                .format(manufacturer)
+            )
              return None
  
-        # scale raw data
-        preprocessor_path = os.path.join(self.model_dirpath, manufacturer + '_preprocessor.joblib')
-        preprocessor = joblib.load(preprocessor_path)
-        disk_days_df = preprocessor.transform(disk_days_df)
-        return disk_days_df
+        # keep only the required features
+        try:
+            disk_days_df = disk_days_df[model_smart_attr]
+        except KeyError as e:
+            print("Mismatch in SMART attributes used to train model and SMART attributes available")
+            return None
  
+        # featurize n (6 to 12) days data - mean,std,coefficient of variation
+        # current model is trained on 6 days of data because that is what will be
+        # available at runtime
+        # NOTE: ensure unique indices so that features can be merged without pandas errors
+        disk_days_df = disk_days_df.reset_index(drop=True)
+        means = disk_days_df.drop("user_capacity", axis=1).rolling(6).mean()
+        stds = disk_days_df.drop("user_capacity", axis=1).rolling(6).std()
+        cvs = stds.divide(means, fill_value=0)
+
+        # rename and combine features into one df
+        means = means.rename(columns={col: "mean_" + col for col in means.columns})
+        stds = stds.rename(columns={col: "std_" + col for col in stds.columns})
+        cvs = cvs.rename(columns={col: "cv_" + col for col in cvs.columns})
+        featurized_df = means.merge(stds, left_index=True, right_index=True)
+        featurized_df = featurized_df.merge(cvs, left_index=True, right_index=True)
+
+        # drop rows where all features (mean,std,cv) are nans
+        featurized_df = featurized_df.dropna(how="all")
+
+        # fill nans created by cv calculation
+        featurized_df = featurized_df.fillna(0)
+
+        # capacity is not a feature that varies over time
+        # FIXME: will these values roll over?
+        featurized_df["user_capacity"] = disk_days_df["user_capacity"]
+
+        # scale features
+        scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.joblib")
+        scaler = joblib.load(scaler_path)
+        featurized_df = scaler.transform(featurized_df)
+        return featurized_df
  
      @staticmethod
      def __get_manufacturer(model_name):
@@ -161,35 +202,45 @@ class DiskFailurePredictor(object):
          Returns:
              str -- manufacturer name
          """
-        if model_name.startswith("W"):
-            return "WDC"
-        elif model_name.startswith("T"):
-            return "Toshiba"
-        elif model_name.startswith("S"):
-            return "Seagate"
-        elif model_name.startswith("Hi"):
-            return "Hitachi"
-        else:
-            return "HGST"
-
+        for prefix, manufacturer in DiskFailurePredictor.MANUFACTURER_MODELNAME_PREFIXES.items():
+            if model_name.startswith(prefix):
+                return manufacturer
+        # print error message
+        print("Could not infer manufacturer from model name {}".format(model_name))
  
      def predict(self, disk_days):
          # massage data into a format that can be fed to models
          raw_df = self.__format_raw_data(disk_days)
  
-        # preprocess
-        preprocessed_data = self.__preprocess(raw_df)
+        # get manufacturer preferably as a smartctl attribute
+        # if not available then infer using model name
+        try:
+            manufacturer = raw_df["vendor"].iloc[0]
+        except KeyError as e:
+            print('"vendor" field not found in smartctl output. Will try to infer manufacturer from model name.')
+            manufacturer = DiskFailurePredictor.__get_manufacturer(raw_df["model_name"].iloc[0]).lower()
+
+        # print error message, return Unknown, and continue execution
+        if manufacturer is None:
+            print(
+                "Manufacturer could not be determiend. This may be because \
+                DiskPredictor has never encountered this manufacturer before, \
+                    or the model name is not according to the manufacturer's \
+                        naming conventions known to DiskPredictor"
+            )
+            return DiskFailurePredictor.PREDICTION_CLASSES[-1]
+
+        # preprocess for feeding to model
+        preprocessed_data = self.__preprocess(raw_df, manufacturer)
          if preprocessed_data is None:
              return DiskFailurePredictor.PREDICTION_CLASSES[-1]
  
          # get model for current manufacturer
-        manufacturer = self.__get_manufacturer(raw_df['model_name'].iloc[0]).lower()
-        model_path = os.path.join(self.model_dirpath, manufacturer + '_predictor.joblib')
+        model_path = os.path.join(
+            self.model_dirpath, manufacturer + "_predictor.joblib"
+        )
          model = joblib.load(model_path)
  
-        # predictions for each day
-        preds = model.predict(preprocessed_data)
-
-        # use majority vote to decide class. raise if a nan prediction exists
-        pred_class_id = stats.mode(preds, nan_policy='raise').mode[0]
+        # use prediction for last day
+        pred_class_id = model.predict(preprocessed_data)[-1]
          return DiskFailurePredictor.PREDICTION_CLASSES[pred_class_id]
diff --git a/src/pybind/mgr/diskprediction_local/requirements.txt b/src/pybind/mgr/diskprediction_local/requirements.txt

index 8769b42e6010594ca7b0a8ad6f5ee4b02ca46116..4bfcec0fcc987fddaaec4f7821ecc3f13ad483aa 100644 (file)
--- a/src/pybind/mgr/diskprediction_local/requirements.txt
+++ b/src/pybind/mgr/diskprediction_local/requirements.txt
@@ -1,6 +1,5 @@
-numpy==1.16.4
-scipy==1.2.1
-pandas==0.25.0
-joblib==0.13.2
-scikit-learn==0.21.2
-rgf-python==3.6.0
+numpy==1.15.1
+scipy==1.1.0
+pandas==0.23.4
+joblib==0.11
+scikit-learn==0.19.2
author	Karanraj Chauhan <chauhank@bu.edu>
	Tue, 10 Sep 2019 15:20:45 +0000 (11:20 -0400)
committer	Karanraj Chauhan <chauhank@bu.edu>
	Thu, 7 Nov 2019 13:38:44 +0000 (08:38 -0500)
src/pybind/mgr/diskprediction_local/models/config.json		patch \| blob \| history
src/pybind/mgr/diskprediction_local/models/hgst_predictor.joblib	[deleted file]	patch \| blob \| history
src/pybind/mgr/diskprediction_local/models/hgst_preprocessor.joblib	[deleted file]	patch \| blob \| history
src/pybind/mgr/diskprediction_local/models/seagate_predictor.joblib		patch \| blob \| history
src/pybind/mgr/diskprediction_local/models/seagate_preprocessor.joblib	[deleted file]	patch \| blob \| history
src/pybind/mgr/diskprediction_local/models/seagate_scaler.joblib	[new file with mode: 0644]	patch \| blob
src/pybind/mgr/diskprediction_local/predictor.py		patch \| blob \| history
src/pybind/mgr/diskprediction_local/requirements.txt		patch \| blob \| history