--- /dev/null
roles:
- [mon.a, mgr.x, osd.0, osd.1, client.0]
overrides:
  ceph:
    # The workunit deliberately creates large omap objects, so the resulting
    # cluster warnings must not fail the run.
    log-whitelist:
      - \(OSDMAP_FLAGS\)
      - \(OSD_FULL\)
      - \(MDS_READ_ONLY\)
      - large omap objects
      - Large omap object found
      - application not enabled
    conf:
      osd:
        # Lowered thresholds so the workunit's test objects are flagged
        # during deep scrub (object1 exceeds the key count, object2 the
        # summed value size).
        osd deep scrub large omap object value sum threshold: 8800000
        osd deep scrub large omap object key threshold: 20000
tasks:
- install:
- ceph:
- workunit:
    clients:
      all:
        - rados/test_large_omap_detection.py
--- /dev/null
+#!/usr/bin/python
+# -*- mode:python -*-
+# vim: ts=4 sw=4 smarttab expandtab
+#
+# Copyright (C) 2017 Red Hat <contact@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+import json
+import rados
+import shlex
+import subprocess
+import time
+
def cleanup(cluster):
    # Remove the test pool created by init() and release the cluster handle.
    cluster.delete_pool('large-omap-test-pool')
    cluster.shutdown()
+
def _set_large_omap(ioctx, obj_name, count, value):
    # Attach `count` omap key/value pairs (keys "0".."count-1", each mapped
    # to `value`) to obj_name in a single write op.
    op = ioctx.create_write_op()
    keys = tuple(str(x) for x in range(count))
    values = (value,) * count
    ioctx.set_omap(op, keys, values)
    ioctx.operate_write_op(op, obj_name, 0)
    ioctx.release_write_op(op)

def init():
    """Connect to the cluster and create two objects that should trip the
    large-omap thresholds during deep scrub:

    - object1: 20001 tiny values, exceeding the key-count threshold (20000)
    - object2: 20000 long values, exceeding the value-sum threshold

    Returns the connected Rados handle; the caller passes it to cleanup().
    """
    # For local testing
    #cluster = rados.Rados(conffile='./ceph.conf')
    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    print("\nCluster ID: " + cluster.get_fsid())
    cluster.create_pool('large-omap-test-pool')
    ioctx = cluster.open_ioctx('large-omap-test-pool')

    ioctx.write_full('large-omap-test-object1', "Lorem ipsum")
    _set_large_omap(ioctx, 'large-omap-test-object1', 20001, "X")

    # One paragraph of filler per key; 20000 of these sum past the value-sum
    # threshold configured in the suite yaml.
    large_value = ("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do "
                   "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut "
                   "enim ad minim veniam, quis nostrud exercitation ullamco laboris "
                   "nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in "
                   "reprehenderit in voluptate velit esse cillum dolore eu fugiat "
                   "nulla pariatur. Excepteur sint occaecat cupidatat non proident, "
                   "sunt in culpa qui officia deserunt mollit anim id est laborum.")

    ioctx.write_full('large-omap-test-object2', "Lorem ipsum dolor")
    # The original mixed Python-2-only xrange() with range(); range() is
    # used consistently here and also works under Python 3.
    _set_large_omap(ioctx, 'large-omap-test-object2', 20000, large_value)

    ioctx.close()
    return cluster
+
def get_deep_scrub_timestamp(pgid):
    # Return `last_deep_scrub_stamp` for `pgid` from `ceph pg dump`, or
    # None when the PG does not appear in the dump.
    dump_cmd = ['ceph', 'pg', 'dump', '--format=json-pretty']
    out, _ = subprocess.Popen(dump_cmd, stdout=subprocess.PIPE).communicate()
    pg_stats = json.loads(out)['pg_stats']
    return next((entry['last_deep_scrub_stamp']
                 for entry in pg_stats
                 if entry['pgid'] == pgid), None)
+
def wait_for_scrub():
    """Trigger deep scrubs on the OSDs holding the two test objects and
    block until each object's PG reports a new deep-scrub timestamp (or
    roughly 10 minutes pass per PG).
    """
    osds = set()
    pgs = {}
    # Map each test object to its acting primary OSD and PG; parse the
    # `ceph osd map` output once per object.
    for obj in ('large-omap-test-object1', 'large-omap-test-object2'):
        cmd = ['ceph', 'osd', 'map', 'large-omap-test-pool', obj,
               '--format=json-pretty']
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        mapping = json.loads(proc.communicate()[0])
        osds.add(mapping['acting_primary'])
        pgs[mapping['pgid']] = get_deep_scrub_timestamp(mapping['pgid'])

    for osd in osds:
        command = "ceph osd deep-scrub osd." + str(osd)
        subprocess.check_call(shlex.split(command))

    # Poll until the recorded timestamp changes, i.e. the deep scrub ran.
    for pg, old_stamp in pgs.items():
        retries = 0
        while retries < 60 and old_stamp == get_deep_scrub_timestamp(pg):
            time.sleep(10)
            retries += 1
+
def check_health_output():
    """Poll `ceph health detail` (up to 6 tries, 10 s apart) until a line
    containing '2 large omap objects' shows up twice — once for the health
    summary and once for the per-pool detail.

    Raises:
        Exception: if the expected output never appears.
    """
    output = ''
    result = 0
    for _ in range(6):
        output = subprocess.check_output(["ceph", "health", "detail"])
        result = sum(int('2 large omap objects' in line)
                     for line in output.splitlines())
        if result == 2:
            # Found both occurrences; no need to sleep out the remaining
            # retries (the original slept once more even on success).
            break
        time.sleep(10)

    if result != 2:
        print("Error, got invalid output:")
        print(output)
        raise Exception("'2 large omap objects' not reported twice by "
                        "ceph health detail")
+
def main():
    # End-to-end flow: populate two large omap objects, force deep scrubs
    # so they are detected, confirm the health warning, then tear down.
    cluster = init()
    wait_for_scrub()
    check_health_output()

    cleanup(cluster)

if __name__ == '__main__':
    main()
.set_default(2_hr)
.set_description(""),
    // Deep scrub flags an object as a "large omap object" (surfaced via the
    // LARGE_OMAP_OBJECTS health warning) when either of the two thresholds
    // below is exceeded.
    Option("osd_deep_scrub_large_omap_object_key_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
    .set_default(2000000)
    .set_description("threshold for number of keys to determine a large omap object")
    .add_service("osd")
    .add_see_also("osd_deep_scrub_large_omap_object_value_sum_threshold"),

    Option("osd_deep_scrub_large_omap_object_value_sum_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
    .set_default(1_G)
    .set_description("threshold for summed size (bytes) of all key values to determine a large omap object")
    .add_service("osd")
    .add_see_also("osd_deep_scrub_large_omap_object_key_threshold"),
+
Option("osd_class_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default(CEPH_LIBDIR "/rados-classes")
.set_description(""),
checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str());
}
+ // LARGE_OMAP_OBJECTS
+ if (pg_sum.stats.sum.num_large_omap_objects) {
+ list<string> detail;
+ for (auto &pool : pools) {
+ const string& pool_name = osdmap.get_pool_name(pool.first);
+ auto it2 = pg_pool_sum.find(pool.first);
+ if (it2 == pg_pool_sum.end()) {
+ continue;
+ }
+ const pool_stat_t *pstat = &it2->second;
+ if (pstat == nullptr) {
+ continue;
+ }
+ const object_stat_sum_t& sum = pstat->stats.sum;
+ if (sum.num_large_omap_objects) {
+ stringstream ss;
+ ss << sum.num_large_omap_objects << " large objects found in pool "
+ << "'" << pool_name << "'";
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
+ auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str());
+ stringstream tip;
+ tip << "Search the cluster log for 'Large omap object found' for more "
+ << "details.";
+ detail.push_back(tip.str());
+ d.detail.swap(detail);
+ }
+ }
+
// CACHE_POOL_NEAR_FULL
{
list<string> detail;
map<pg_shard_t, ScrubMap *> maps;
maps[pg_whoami] = &scrubber.primary_scrubmap;
- for (set<pg_shard_t>::iterator i = actingbackfill.begin();
- i != actingbackfill.end();
- ++i) {
- if (*i == pg_whoami) continue;
- dout(2) << __func__ << " replica " << *i << " has "
- << scrubber.received_maps[*i].objects.size()
+ for (const auto& i : actingbackfill) {
+ if (i == pg_whoami) continue;
+ dout(2) << __func__ << " replica " << i << " has "
+ << scrubber.received_maps[i].objects.size()
<< " items" << dendl;
- maps[*i] = &scrubber.received_maps[*i];
+ maps[i] = &scrubber.received_maps[i];
}
- map<hobject_t,ScrubMap::object>::const_iterator i;
- map<pg_shard_t, ScrubMap *>::const_iterator j;
set<hobject_t> master_set;
// Construct master set
- for (j = maps.begin(); j != maps.end(); ++j) {
- for (i = j->second->objects.begin(); i != j->second->objects.end(); ++i) {
- master_set.insert(i->first);
+ for (const auto map : maps) {
+ for (const auto i : map.second->objects) {
+ master_set.insert(i.first);
}
}
+ stringstream ss;
+ get_pgbackend()->be_large_omap_check(maps, master_set,
+ scrubber.large_omap_objects, ss);
+ if (!ss.str().empty()) {
+ osd->clog->warn(ss);
+ }
+
if (acting.size() > 1) {
dout(10) << __func__ << " comparing replica scrub maps" << dendl;
- stringstream ss;
-
// Map from object with errors to good peer
map<hobject_t, list<pg_shard_t>> authoritative;
dout(2) << __func__ << " osd." << acting[0] << " has "
<< scrubber.primary_scrubmap.objects.size() << " items" << dendl;
+ ss.str("");
+ ss.clear();
+
get_pgbackend()->be_compare_scrubmaps(
maps,
master_set,
info.history.last_clean_scrub_stamp = now;
info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
+ info.stats.stats.sum.num_large_omap_objects = scrubber.large_omap_objects;
} else {
info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
// XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
set<pg_shard_t> waiting_on_whom;
int shallow_errors;
int deep_errors;
+ int large_omap_objects = 0;
int fixed;
ScrubMap primary_scrubmap;
map<pg_shard_t, ScrubMap> received_maps;
subset_last_update = eversion_t();
shallow_errors = 0;
deep_errors = 0;
+ large_omap_objects = 0;
fixed = 0;
deep = false;
seed = 0;
}
}
}
+
+void PGBackend::be_large_omap_check(const map<pg_shard_t,ScrubMap*> &maps,
+ const set<hobject_t> &master_set,
+ int& large_omap_objects,
+ ostream &warnstream) const
+{
+ // Iterate through objects and check large omap object flag
+ for (const auto& k : master_set) {
+ for (const auto& map : maps) {
+ ScrubMap::object& obj = map.second->objects[k];
+ if (obj.large_omap_object_found) {
+ large_omap_objects++;
+ warnstream << "Large omap object found. Object: " << k << " Key count: "
+ << obj.large_omap_object_key_count << " Size (bytes): "
+ << obj.large_omap_object_value_size << '\n';
+ break;
+ }
+ }
+ }
+}
uint32_t seed,
ScrubMap::object &o,
ThreadPool::TPHandle &handle) = 0;
  /// Scan master_set across all shards' scrub maps, counting objects flagged
  /// as large omap objects and appending a warning line per object to
  /// warnstream.
  void be_large_omap_check(
    const map<pg_shard_t,ScrubMap*> &maps,
    const set<hobject_t> &master_set,
    int& large_omap_objects,
    ostream &warnstream) const;
static PGBackend *build_pg_backend(
const pg_pool_t &pool,
ghobject_t(
poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
assert(iter);
+ uint64_t keys_scanned = 0;
+ uint64_t value_sum = 0;
for (iter->seek_to_first(); iter->status() == 0 && iter->valid();
iter->next(false)) {
+ ++keys_scanned;
handle.reset_tp_timeout();
dout(25) << "CRC key " << iter->key() << " value:\n";
iter->value().hexdump(*_dout);
*_dout << dendl;
+ value_sum += iter->value().length();
+
::encode(iter->key(), bl);
::encode(iter->value(), bl);
oh << bl;
bl.clear();
}
  // Flag the object when its omap exceeds either deep-scrub threshold; the
  // flag and counts are recorded in the ScrubMap::object and later surfaced
  // via the LARGE_OMAP_OBJECTS health warning.
  if (keys_scanned > cct->_conf->get_val<uint64_t>(
        "osd_deep_scrub_large_omap_object_key_threshold") ||
      value_sum > cct->_conf->get_val<uint64_t>(
        "osd_deep_scrub_large_omap_object_value_sum_threshold")) {
    dout(25) << __func__ << " " << poid
             << " large omap object detected. Object has " << keys_scanned
             << " keys and size " << value_sum << " bytes" << dendl;
    o.large_omap_object_found = true;
    o.large_omap_object_key_count = keys_scanned;
    o.large_omap_object_value_size = value_sum;
  }
+
if (iter->status() < 0) {
dout(25) << __func__ << " " << poid
<< " on omap scan, db status error" << dendl;
f->dump_int("num_evict_mode_full", num_evict_mode_full);
f->dump_int("num_objects_pinned", num_objects_pinned);
f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
+ f->dump_int("num_large_omap_objects", num_large_omap_objects);
}
void object_stat_sum_t::encode(bufferlist& bl) const
{
- ENCODE_START(16, 14, bl);
+ ENCODE_START(17, 14, bl);
#if defined(CEPH_LITTLE_ENDIAN)
bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
#else
::encode(num_objects_pinned, bl);
::encode(num_objects_missing, bl);
::encode(num_legacy_snapsets, bl);
+ ::encode(num_large_omap_objects, bl);
#endif
ENCODE_FINISH(bl);
}
void object_stat_sum_t::decode(bufferlist::iterator& bl)
{
bool decode_finish = false;
- DECODE_START(16, bl);
+ DECODE_START(17, bl);
#if defined(CEPH_LITTLE_ENDIAN)
if (struct_v >= 16) {
bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
} else {
num_legacy_snapsets = num_object_clones; // upper bound
}
+ if (struct_v >= 17) {
+ ::decode(num_large_omap_objects, bl);
+ }
}
DECODE_FINISH(bl);
}
a.num_evict_mode_some = 1;
a.num_evict_mode_full = 0;
a.num_objects_pinned = 20;
+ a.num_large_omap_objects = 5;
o.push_back(new object_stat_sum_t(a));
}
num_evict_mode_full += o.num_evict_mode_full;
num_objects_pinned += o.num_objects_pinned;
num_legacy_snapsets += o.num_legacy_snapsets;
+ num_large_omap_objects += o.num_large_omap_objects;
}
void object_stat_sum_t::sub(const object_stat_sum_t& o)
num_evict_mode_full -= o.num_evict_mode_full;
num_objects_pinned -= o.num_objects_pinned;
num_legacy_snapsets -= o.num_legacy_snapsets;
+ num_large_omap_objects -= o.num_large_omap_objects;
}
bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
l.num_evict_mode_some == r.num_evict_mode_some &&
l.num_evict_mode_full == r.num_evict_mode_full &&
l.num_objects_pinned == r.num_objects_pinned &&
- l.num_legacy_snapsets == r.num_legacy_snapsets;
+ l.num_legacy_snapsets == r.num_legacy_snapsets &&
+ l.num_large_omap_objects == r.num_large_omap_objects;
}
// -- object_stat_collection_t --
void ScrubMap::object::encode(bufferlist& bl) const
{
bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
- ENCODE_START(8, 7, bl);
+ ENCODE_START(9, 7, bl);
::encode(size, bl);
::encode(negative, bl);
::encode(attrs, bl);
::encode(read_error, bl);
::encode(ec_hash_mismatch, bl);
::encode(ec_size_mismatch, bl);
+ ::encode(large_omap_object_found, bl);
+ ::encode(large_omap_object_key_count, bl);
+ ::encode(large_omap_object_value_size, bl);
ENCODE_FINISH(bl);
}
void ScrubMap::object::decode(bufferlist::iterator& bl)
{
- DECODE_START(8, bl);
+ DECODE_START(9, bl);
::decode(size, bl);
bool tmp, compat_read_error = false;
::decode(tmp, bl);
// If older encoder found a read_error, set read_error
if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
read_error = true;
+ if (struct_v >= 9) {
+ ::decode(tmp, bl);
+ large_omap_object_found = tmp;
+ ::decode(large_omap_object_key_count, bl);
+ ::decode(large_omap_object_value_size, bl);
+ }
DECODE_FINISH(bl);
}
int64_t num_objects_pinned;
int64_t num_objects_missing;
int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
+ int64_t num_large_omap_objects = 0;
object_stat_sum_t()
: num_bytes(0),
FLOOR(num_wr);
FLOOR(num_wr_kb);
FLOOR(num_scrub_errors);
+ FLOOR(num_large_omap_objects);
FLOOR(num_shallow_scrub_errors);
FLOOR(num_deep_scrub_errors);
FLOOR(num_objects_recovered);
SPLIT(num_wr);
SPLIT(num_wr_kb);
SPLIT(num_scrub_errors);
+ SPLIT(num_large_omap_objects);
SPLIT(num_shallow_scrub_errors);
SPLIT(num_deep_scrub_errors);
SPLIT(num_objects_recovered);
sizeof(num_wr) +
sizeof(num_wr_kb) +
sizeof(num_scrub_errors) +
+ sizeof(num_large_omap_objects) +
sizeof(num_objects_recovered) +
sizeof(num_bytes_recovered) +
sizeof(num_keys_recovered) +
bool stat_error:1;
bool ec_hash_mismatch:1;
bool ec_size_mismatch:1;
+ bool large_omap_object_found:1;
+ uint64_t large_omap_object_key_count = 0;
+ uint64_t large_omap_object_value_size = 0;
object() :
// Init invalid size so it won't match if we get a stat EIO error
size(-1), omap_digest(0), digest(0),
- negative(false), digest_present(false), omap_digest_present(false),
- read_error(false), stat_error(false), ec_hash_mismatch(false), ec_size_mismatch(false) {}
+ negative(false), digest_present(false), omap_digest_present(false),
+ read_error(false), stat_error(false), ec_hash_mismatch(false),
+ ec_size_mismatch(false), large_omap_object_found(false) {}
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bl);