]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Warn about objects with too many omap entries
author: Brad Hubbard <bhubbard@redhat.com>
Tue, 27 Jun 2017 01:40:47 +0000 (11:40 +1000)
committer: Brad Hubbard <bhubbard@redhat.com>
Tue, 24 Oct 2017 07:27:57 +0000 (17:27 +1000)
Signed-off-by: Brad Hubbard <bhubbard@redhat.com>
qa/suites/rados/singleton-nomsgr/all/large-omap-object-warnings.yaml [new file with mode: 0644]
qa/workunits/rados/test_large_omap_detection.py [new file with mode: 0755]
src/common/options.cc
src/mon/PGMap.cc
src/osd/PG.cc
src/osd/PG.h
src/osd/PGBackend.cc
src/osd/PGBackend.h
src/osd/ReplicatedBackend.cc
src/osd/osd_types.cc
src/osd/osd_types.h

diff --git a/qa/suites/rados/singleton-nomsgr/all/large-omap-object-warnings.yaml b/qa/suites/rados/singleton-nomsgr/all/large-omap-object-warnings.yaml
new file mode 100644 (file)
index 0000000..e00a93d
--- /dev/null
@@ -0,0 +1,22 @@
+roles:
+- [mon.a, mgr.x, osd.0, osd.1, client.0]
+overrides:
+  ceph:
+    log-whitelist:
+      - \(OSDMAP_FLAGS\)
+      - \(OSD_FULL\)
+      - \(MDS_READ_ONLY\)
+      - large omap objects
+      - Large omap object found
+      - application not enabled
+    conf:
+      osd:
+        osd deep scrub large omap object value sum threshold: 8800000
+        osd deep scrub large omap object key threshold: 20000
+tasks:
+- install:
+- ceph:
+- workunit:
+    clients:
+      all:
+        - rados/test_large_omap_detection.py
diff --git a/qa/workunits/rados/test_large_omap_detection.py b/qa/workunits/rados/test_large_omap_detection.py
new file mode 100755 (executable)
index 0000000..b5bf64c
--- /dev/null
@@ -0,0 +1,130 @@
+#!/usr/bin/python
+# -*- mode:python -*-
+# vim: ts=4 sw=4 smarttab expandtab
+#
+# Copyright (C) 2017 Red Hat <contact@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+
+import json
+import rados
+import shlex
+import subprocess
+import time
+
def cleanup(cluster):
    """Tear down test state: drop the test pool, then disconnect.

    ``cluster`` is the connected rados cluster handle returned by init().
    """
    cluster.delete_pool('large-omap-test-pool')
    cluster.shutdown()
+
def init():
    """Create the test pool and write two objects that exceed the
    large-omap thresholds configured for this test.

    Object 1 trips the key-count threshold (20001 keys vs the configured
    20000); object 2 trips the value-sum threshold (20000 large values).
    Returns the connected cluster handle so the caller can clean up.
    """
    # For local testing
    #cluster = rados.Rados(conffile='./ceph.conf')
    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    print("\nCluster ID: " + cluster.get_fsid())
    cluster.create_pool('large-omap-test-pool')
    ioctx = cluster.open_ioctx('large-omap-test-pool')
    ioctx.write_full('large-omap-test-object1', "Lorem ipsum")
    op = ioctx.create_write_op()

    # One key past the configured key threshold of 20000.
    keys = [str(x) for x in range(20001)]
    values = ["X"] * 20001

    ioctx.set_omap(op, tuple(keys), tuple(values))
    ioctx.operate_write_op(op, 'large-omap-test-object1', 0)
    ioctx.release_write_op(op)

    ioctx.write_full('large-omap-test-object2', "Lorem ipsum dolor")
    op = ioctx.create_write_op()

    # Large per-key payload ('payload' instead of 'buffer' to avoid
    # shadowing the builtin) so the summed value size exceeds the
    # configured value-sum threshold.
    payload = ("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do "
               "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut "
               "enim ad minim veniam, quis nostrud exercitation ullamco laboris "
               "nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in "
               "reprehenderit in voluptate velit esse cillum dolore eu fugiat "
               "nulla pariatur. Excepteur sint occaecat cupidatat non proident, "
               "sunt in culpa qui officia deserunt mollit anim id est laborum.")

    # BUG FIX: was xrange(), which is Python-2-only and inconsistent with
    # the range() call used for object 1 above.
    keys = [str(x) for x in range(20000)]
    values = [payload] * 20000

    ioctx.set_omap(op, tuple(keys), tuple(values))
    ioctx.operate_write_op(op, 'large-omap-test-object2', 0)
    ioctx.release_write_op(op)
    ioctx.close()
    return cluster
+
def get_deep_scrub_timestamp(pgid):
    """Return last_deep_scrub_stamp for *pgid* from 'ceph pg dump'.

    Returns None if the PG is not present in the dump.
    """
    dump_cmd = ['ceph', 'pg', 'dump', '--format=json-pretty']
    raw = subprocess.Popen(dump_cmd, stdout=subprocess.PIPE).communicate()[0]
    for stat in json.loads(raw)['pg_stats']:
        if stat['pgid'] == pgid:
            return stat['last_deep_scrub_stamp']
+
def wait_for_scrub():
    """Deep-scrub the OSDs hosting the two test objects and block until
    both PGs report a new last_deep_scrub_stamp (up to ~10 min per PG).
    """
    osds = set()
    pgs = dict()
    # Look up the acting primary and PG for each test object.  The two
    # lookups were previously copy-pasted; each also re-parsed the JSON
    # output three times -- parse once per object instead.
    for obj in ('large-omap-test-object1', 'large-omap-test-object2'):
        cmd = ['ceph', 'osd', 'map', 'large-omap-test-pool', obj,
               '--format=json-pretty']
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        mapping = json.loads(proc.communicate()[0])
        osds.add(mapping['acting_primary'])
        pgs[mapping['pgid']] = get_deep_scrub_timestamp(mapping['pgid'])

    for osd in osds:
        command = "ceph osd deep-scrub osd." + str(osd)
        subprocess.check_call(shlex.split(command))

    # Poll until the deep-scrub stamp changes, at most 60 * 10 s per PG.
    for pg in pgs:
        retries = 0
        while retries < 60 and pgs[pg] == get_deep_scrub_timestamp(pg):
            time.sleep(10)
            retries += 1
+
def check_health_output():
    """Poll 'ceph health detail' until it reports '2 large omap objects'.

    Retries up to 6 times, 10 seconds apart.  Raises Exception if the
    warning never appears.
    """
    result = 0
    output = ''
    for _ in range(6):
        # decode() so the substring test works on both py2 and py3
        # (check_output returns bytes on py3).
        output = subprocess.check_output(
            ["ceph", "health", "detail"]).decode('utf-8')
        result = sum(1 for line in output.splitlines()
                     if '2 large omap objects' in line)
        if result == 2:
            break  # BUG FIX: previously slept another 10 s even on success
        time.sleep(10)

    if result != 2:
        print("Error, got invalid output:")
        print(output)
        raise Exception("expected '2 large omap objects' in health detail")
+
def main():
    """Run the large-omap detection test end to end.

    Populate the pool, force deep scrubs, check the health warning,
    then clean up.
    """
    cluster = init()
    wait_for_scrub()
    check_health_output()
    cleanup(cluster)


if __name__ == '__main__':
    main()
index 5b8a4da681725adf4d85f72113921e2a5b80ff84..fdeb7d51d1c7a82b21f36759ee03ed9363fb0cf0 100644 (file)
@@ -2523,6 +2523,18 @@ std::vector<Option> get_global_options() {
     .set_default(2_hr)
     .set_description(""),
 
+    Option("osd_deep_scrub_large_omap_object_key_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(2000000)
+    .set_description("threshold for number of keys to determine a large omap object")
+    .add_service("osd")
+    .add_see_also("osd_deep_scrub_large_omap_object_value_sum_threshold"),
+
+    Option("osd_deep_scrub_large_omap_object_value_sum_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1_G)
+    .set_description("threshold for summed size (bytes) of all key values to determine a large omap object")
+    .add_service("osd")
+    .add_see_also("osd_deep_scrub_large_omap_object_key_threshold"),
+
     Option("osd_class_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default(CEPH_LIBDIR "/rados-classes")
     .set_description(""),
index 1cb1abd200294a3418b191dc460275c50681ef8b..5ced39c07a32b0cf4ee8db8838e4660cbb103808 100644 (file)
@@ -2341,6 +2341,39 @@ void PGMap::get_health_checks(
     checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str());
   }
 
+  // LARGE_OMAP_OBJECTS
+  if (pg_sum.stats.sum.num_large_omap_objects) {
+    list<string> detail;
+    for (auto &pool : pools) {
+      const string& pool_name = osdmap.get_pool_name(pool.first);
+      auto it2 = pg_pool_sum.find(pool.first);
+      if (it2 == pg_pool_sum.end()) {
+        continue;
+      }
+      const pool_stat_t *pstat = &it2->second;
+      if (pstat == nullptr) {
+        continue;
+      }
+      const object_stat_sum_t& sum = pstat->stats.sum;
+      if (sum.num_large_omap_objects) {
+        stringstream ss;
+        ss << sum.num_large_omap_objects << " large objects found in pool "
+           << "'" << pool_name << "'";
+        detail.push_back(ss.str());
+      }
+    }
+    if (!detail.empty()) {
+      ostringstream ss;
+      ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
+      auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str());
+      stringstream tip;
+      tip << "Search the cluster log for 'Large omap object found' for more "
+          << "details.";
+      detail.push_back(tip.str());
+      d.detail.swap(detail);
+    }
+  }
+
   // CACHE_POOL_NEAR_FULL
   {
     list<string> detail;
index e23316da4fadc939e5c4f422211fca76fc6258ac..7809e7de43ef437b2813526f81df6f187d3f02a6 100644 (file)
@@ -4644,38 +4644,42 @@ void PG::scrub_compare_maps()
   map<pg_shard_t, ScrubMap *> maps;
   maps[pg_whoami] = &scrubber.primary_scrubmap;
 
-  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
-       i != actingbackfill.end();
-       ++i) {
-    if (*i == pg_whoami) continue;
-    dout(2) << __func__ << " replica " << *i << " has "
-            << scrubber.received_maps[*i].objects.size()
+  for (const auto& i : actingbackfill) {
+    if (i == pg_whoami) continue;
+    dout(2) << __func__ << " replica " << i << " has "
+            << scrubber.received_maps[i].objects.size()
             << " items" << dendl;
-    maps[*i] = &scrubber.received_maps[*i];
+    maps[i] = &scrubber.received_maps[i];
   }
 
-  map<hobject_t,ScrubMap::object>::const_iterator i;
-  map<pg_shard_t, ScrubMap *>::const_iterator j;
   set<hobject_t> master_set;
 
   // Construct master set
-  for (j = maps.begin(); j != maps.end(); ++j) {
-    for (i = j->second->objects.begin(); i != j->second->objects.end(); ++i) {
-      master_set.insert(i->first);
+  for (const auto map : maps) {
+    for (const auto i : map.second->objects) {
+      master_set.insert(i.first);
     }
   }
 
+  stringstream ss;
+  get_pgbackend()->be_large_omap_check(maps, master_set,
+                                       scrubber.large_omap_objects, ss);
+  if (!ss.str().empty()) {
+    osd->clog->warn(ss);
+  }
+
   if (acting.size() > 1) {
     dout(10) << __func__ << "  comparing replica scrub maps" << dendl;
 
-    stringstream ss;
-
     // Map from object with errors to good peer
     map<hobject_t, list<pg_shard_t>> authoritative;
 
     dout(2) << __func__ << "   osd." << acting[0] << " has "
            << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
 
+    ss.str("");
+    ss.clear();
+
     get_pgbackend()->be_compare_scrubmaps(
       maps,
       master_set,
@@ -4872,6 +4876,7 @@ void PG::scrub_finish()
       info.history.last_clean_scrub_stamp = now;
     info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
     info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
+    info.stats.stats.sum.num_large_omap_objects = scrubber.large_omap_objects;
   } else {
     info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
     // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
index 3cb5c30d0575eb86d5610bf2a9e69803400fd222..31cd6babea65e3c974b79aebc16c634767460223 100644 (file)
@@ -1461,6 +1461,7 @@ public:
     set<pg_shard_t> waiting_on_whom;
     int shallow_errors;
     int deep_errors;
+    int large_omap_objects = 0;
     int fixed;
     ScrubMap primary_scrubmap;
     map<pg_shard_t, ScrubMap> received_maps;
@@ -1576,6 +1577,7 @@ public:
       subset_last_update = eversion_t();
       shallow_errors = 0;
       deep_errors = 0;
+      large_omap_objects = 0;
       fixed = 0;
       deep = false;
       seed = 0;
index 7866fa9990d3491bc0aa15726bdd38811bb6fbbf..9de930365a6f74e7e6fa28e8b5b91841a6103bae 100644 (file)
@@ -1098,3 +1098,23 @@ out:
     }
   }
 }
+
// Scan every object in the scrub master set across all shards' scrub maps,
// count objects flagged (during deep scrub) as having a large omap, and
// append one warning line per object for the cluster log.
void PGBackend::be_large_omap_check(const map<pg_shard_t,ScrubMap*> &maps,
  const set<hobject_t> &master_set,
  int& large_omap_objects,
  ostream &warnstream) const
{
  // Iterate through objects and check large omap object flag
  for (const auto& k : master_set) {
    for (const auto& map : maps) {
      // NOTE(review): operator[] default-inserts an entry if this shard's
      // map lacks object k, mutating the scrub map -- confirm intended.
      ScrubMap::object& obj = map.second->objects[k];
      if (obj.large_omap_object_found) {
        large_omap_objects++;
        warnstream << "Large omap object found. Object: " << k << " Key count: "
                   << obj.large_omap_object_key_count << " Size (bytes): "
                   << obj.large_omap_object_value_size << '\n';
        // Count each object at most once, even if several shards flag it.
        break;
      }
    }
  }
}
index d832faf2852d38b9e911b243f2534e82e9fa9ac4..85a42ad284d4692275f2fddd1d294965ec9e550c 100644 (file)
@@ -591,6 +591,11 @@ typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
      uint32_t seed,
      ScrubMap::object &o,
      ThreadPool::TPHandle &handle) = 0;
+   void be_large_omap_check(
+     const map<pg_shard_t,ScrubMap*> &maps,
+     const set<hobject_t> &master_set,
+     int& large_omap_objects,
+     ostream &warnstream) const;
 
    static PGBackend *build_pg_backend(
      const pg_pool_t &pool,
index e7317f038a78825180f778f2262a9051ae58074b..7f41687c7d4baee50abe27cfcfdc0b845ffda971 100644 (file)
@@ -769,20 +769,37 @@ void ReplicatedBackend::be_deep_scrub(
     ghobject_t(
       poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
   assert(iter);
+  uint64_t keys_scanned = 0;
+  uint64_t value_sum = 0;
   for (iter->seek_to_first(); iter->status() == 0 && iter->valid();
     iter->next(false)) {
+    ++keys_scanned;
     handle.reset_tp_timeout();
 
     dout(25) << "CRC key " << iter->key() << " value:\n";
     iter->value().hexdump(*_dout);
     *_dout << dendl;
 
+    value_sum += iter->value().length();
+
     ::encode(iter->key(), bl);
     ::encode(iter->value(), bl);
     oh << bl;
     bl.clear();
   }
 
+  if (keys_scanned > cct->_conf->get_val<uint64_t>(
+                         "osd_deep_scrub_large_omap_object_key_threshold") ||
+      value_sum > cct->_conf->get_val<uint64_t>(
+                      "osd_deep_scrub_large_omap_object_value_sum_threshold")) {
+    dout(25) << __func__ << " " << poid
+             << " large omap object detected. Object has " << keys_scanned
+             << " keys and size " << value_sum << " bytes" << dendl;
+    o.large_omap_object_found = true;
+    o.large_omap_object_key_count = keys_scanned;
+    o.large_omap_object_value_size = value_sum;
+  }
+
   if (iter->status() < 0) {
     dout(25) << __func__ << "  " << poid
              << " on omap scan, db status error" << dendl;
index 5f32dc7ea4458b022fa79f7c98e69295e80be6d6..ec4fadb66ecb9f67655d9c17ac53c66ae326a1e2 100644 (file)
@@ -1942,11 +1942,12 @@ void object_stat_sum_t::dump(Formatter *f) const
   f->dump_int("num_evict_mode_full", num_evict_mode_full);
   f->dump_int("num_objects_pinned", num_objects_pinned);
   f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
+  f->dump_int("num_large_omap_objects", num_large_omap_objects);
 }
 
 void object_stat_sum_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(16, 14, bl);
+  ENCODE_START(17, 14, bl);
 #if defined(CEPH_LITTLE_ENDIAN)
   bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
 #else
@@ -1985,6 +1986,7 @@ void object_stat_sum_t::encode(bufferlist& bl) const
   ::encode(num_objects_pinned, bl);
   ::encode(num_objects_missing, bl);
   ::encode(num_legacy_snapsets, bl);
+  ::encode(num_large_omap_objects, bl);
 #endif
   ENCODE_FINISH(bl);
 }
@@ -1992,7 +1994,7 @@ void object_stat_sum_t::encode(bufferlist& bl) const
 void object_stat_sum_t::decode(bufferlist::iterator& bl)
 {
   bool decode_finish = false;
-  DECODE_START(16, bl);
+  DECODE_START(17, bl);
 #if defined(CEPH_LITTLE_ENDIAN)
   if (struct_v >= 16) {
     bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
@@ -2039,6 +2041,9 @@ void object_stat_sum_t::decode(bufferlist::iterator& bl)
     } else {
       num_legacy_snapsets = num_object_clones;  // upper bound
     }
+    if (struct_v >= 17) {
+      ::decode(num_large_omap_objects, bl);
+    }
   }
   DECODE_FINISH(bl);
 }
@@ -2078,6 +2083,7 @@ void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
   a.num_evict_mode_some = 1;
   a.num_evict_mode_full = 0;
   a.num_objects_pinned = 20;
+  a.num_large_omap_objects = 5;
   o.push_back(new object_stat_sum_t(a));
 }
 
@@ -2118,6 +2124,7 @@ void object_stat_sum_t::add(const object_stat_sum_t& o)
   num_evict_mode_full += o.num_evict_mode_full;
   num_objects_pinned += o.num_objects_pinned;
   num_legacy_snapsets += o.num_legacy_snapsets;
+  num_large_omap_objects += o.num_large_omap_objects;
 }
 
 void object_stat_sum_t::sub(const object_stat_sum_t& o)
@@ -2157,6 +2164,7 @@ void object_stat_sum_t::sub(const object_stat_sum_t& o)
   num_evict_mode_full -= o.num_evict_mode_full;
   num_objects_pinned -= o.num_objects_pinned;
   num_legacy_snapsets -= o.num_legacy_snapsets;
+  num_large_omap_objects -= o.num_large_omap_objects;
 }
 
 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
@@ -2196,7 +2204,8 @@ bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
     l.num_evict_mode_some == r.num_evict_mode_some &&
     l.num_evict_mode_full == r.num_evict_mode_full &&
     l.num_objects_pinned == r.num_objects_pinned &&
-    l.num_legacy_snapsets == r.num_legacy_snapsets;
+    l.num_legacy_snapsets == r.num_legacy_snapsets &&
+    l.num_large_omap_objects == r.num_large_omap_objects;
 }
 
 // -- object_stat_collection_t --
@@ -5586,7 +5595,7 @@ void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
 void ScrubMap::object::encode(bufferlist& bl) const
 {
   bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
-  ENCODE_START(8, 7, bl);
+  ENCODE_START(9, 7, bl);
   ::encode(size, bl);
   ::encode(negative, bl);
   ::encode(attrs, bl);
@@ -5601,12 +5610,15 @@ void ScrubMap::object::encode(bufferlist& bl) const
   ::encode(read_error, bl);
   ::encode(ec_hash_mismatch, bl);
   ::encode(ec_size_mismatch, bl);
+  ::encode(large_omap_object_found, bl);
+  ::encode(large_omap_object_key_count, bl);
+  ::encode(large_omap_object_value_size, bl);
   ENCODE_FINISH(bl);
 }
 
 void ScrubMap::object::decode(bufferlist::iterator& bl)
 {
-  DECODE_START(8, bl);
+  DECODE_START(9, bl);
   ::decode(size, bl);
   bool tmp, compat_read_error = false;
   ::decode(tmp, bl);
@@ -5638,6 +5650,12 @@ void ScrubMap::object::decode(bufferlist::iterator& bl)
   // If older encoder found a read_error, set read_error
   if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
     read_error = true;
+  if (struct_v >= 9) {
+    ::decode(tmp, bl);
+    large_omap_object_found = tmp;
+    ::decode(large_omap_object_key_count, bl);
+    ::decode(large_omap_object_value_size, bl);
+  }
   DECODE_FINISH(bl);
 }
 
index 57acf95900ca5801a11c9783d1380073701e7123..7683991ab57dba93899b812057879fa4165119b9 100644 (file)
@@ -1659,6 +1659,7 @@ struct object_stat_sum_t {
   int64_t num_objects_pinned;
   int64_t num_objects_missing;
   int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
+  int64_t num_large_omap_objects = 0;
 
   object_stat_sum_t()
     : num_bytes(0),
@@ -1706,6 +1707,7 @@ struct object_stat_sum_t {
     FLOOR(num_wr);
     FLOOR(num_wr_kb);
     FLOOR(num_scrub_errors);
+    FLOOR(num_large_omap_objects);
     FLOOR(num_shallow_scrub_errors);
     FLOOR(num_deep_scrub_errors);
     FLOOR(num_objects_recovered);
@@ -1760,6 +1762,7 @@ struct object_stat_sum_t {
     SPLIT(num_wr);
     SPLIT(num_wr_kb);
     SPLIT(num_scrub_errors);
+    SPLIT(num_large_omap_objects);
     SPLIT(num_shallow_scrub_errors);
     SPLIT(num_deep_scrub_errors);
     SPLIT(num_objects_recovered);
@@ -1816,6 +1819,7 @@ struct object_stat_sum_t {
         sizeof(num_wr) +
         sizeof(num_wr_kb) +
         sizeof(num_scrub_errors) +
+        sizeof(num_large_omap_objects) +
         sizeof(num_objects_recovered) +
         sizeof(num_bytes_recovered) +
         sizeof(num_keys_recovered) +
@@ -4870,12 +4874,16 @@ struct ScrubMap {
     bool stat_error:1;
     bool ec_hash_mismatch:1;
     bool ec_size_mismatch:1;
+    bool large_omap_object_found:1;
+    uint64_t large_omap_object_key_count = 0;
+    uint64_t large_omap_object_value_size = 0;
 
     object() :
       // Init invalid size so it won't match if we get a stat EIO error
       size(-1), omap_digest(0), digest(0),
-      negative(false), digest_present(false), omap_digest_present(false), 
-      read_error(false), stat_error(false), ec_hash_mismatch(false), ec_size_mismatch(false) {}
+      negative(false), digest_present(false), omap_digest_present(false),
+      read_error(false), stat_error(false), ec_hash_mismatch(false),
+      ec_size_mismatch(false), large_omap_object_found(false) {}
 
     void encode(bufferlist& bl) const;
     void decode(bufferlist::iterator& bl);