From f04e0075eabfb1b9d19c75761c321098546c4044 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Tue, 15 Dec 2015 18:28:29 +0100 Subject: [PATCH] osd: log inconsistent shard sizes When an OSD asserts because it cannot recover from an unexpected erasure-coded shard size, the object needs to be manually fixed (i.e. the shard must be removed so it can be reconstructed). Unfortunately the assert does not display the name of the object, and the name is only logged when the OSD log level is >= 10, which is uncommon on an actual cluster because it creates log files that are too big. The problem has been fixed in infernalis as part of https://github.com/ceph/ceph/pull/5173 and backporting it is non-trivial. The error condition is modified to make it easy for the system administrator to identify which object is causing the crash and what steps must be taken to fix the issue. http://tracker.ceph.com/issues/14009 Refs: #14009 Signed-off-by: Loic Dachary --- src/osd/ECBackend.cc | 12 +++++++++++- src/osd/ECUtil.cc | 3 ++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 3b517402bcef..00cbd54c6d74 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -361,7 +361,17 @@ void ECBackend::handle_recovery_read_complete( from[i->first.shard].claim(i->second); } dout(10) << __func__ << ": " << from << dendl; - ECUtil::decode(sinfo, ec_impl, from, target); + if (ECUtil::decode(sinfo, ec_impl, from, target) != 0) { + derr << __func__ << ": inconsistent shard sizes " << hoid << " " + << " the offending shard must be manually removed " + << " after verifying there are enough shards to recover " + << "(" << to_read.get<0>() + << ", " << to_read.get<1>() + << ", " << to_read.get<2>() + << ")" + << dendl; + assert(0); + } if (attrs) { op.xattrs.swap(*attrs); diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc index 1f3b45857da8..efc57b5e8a49 100644 --- a/src/osd/ECUtil.cc +++ b/src/osd/ECUtil.cc @@ -56,7 +56,8 @@ int 
ECUtil::decode( for (map::iterator i = to_decode.begin(); i != to_decode.end(); ++i) { - assert(i->second.length() == total_chunk_size); + if (i->second.length() != total_chunk_size) + return -EINVAL; } if (total_chunk_size == 0) -- 2.47.3