From: Loic Dachary Date: Tue, 15 Dec 2015 17:28:29 +0000 (+0100) Subject: osd: log inconsistent shard sizes X-Git-Tag: v0.94.7~44^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f04e0075eabfb1b9d19c75761c321098546c4044;p=ceph.git osd: log inconsistent shard sizes When an OSD asserts because it cannot recover from an unexpected erasure-coded shard size, the object needs to be manually fixed (i.e. the shard must be removed so it can be reconstructed). Unfortunately the assert does not display the name of the object, and the name is only logged when the OSD log level is >= 10, which is uncommon on an actual cluster because it creates log files that are too big. The problem has been fixed in infernalis as part of https://github.com/ceph/ceph/pull/5173 and backporting it is non-trivial. The error condition is modified to make it easy for the system administrator to identify which object is causing the crash and what steps must be taken to fix the issue. http://tracker.ceph.com/issues/14009 Refs: #14009 Signed-off-by: Loic Dachary --- diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 3b517402bce..00cbd54c6d7 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -361,7 +361,17 @@ void ECBackend::handle_recovery_read_complete( from[i->first.shard].claim(i->second); } dout(10) << __func__ << ": " << from << dendl; - ECUtil::decode(sinfo, ec_impl, from, target); + if (ECUtil::decode(sinfo, ec_impl, from, target) != 0) { + derr << __func__ << ": inconsistent shard sizes " << hoid << " " + << " the offending shard must be manually removed " + << " after verifying there are enough shards to recover " + << "(" << to_read.get<0>() + << ", " << to_read.get<1>() + << ", " << to_read.get<2>() + << ")" + << dendl; + assert(0); + } if (attrs) { op.xattrs.swap(*attrs); diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc index 1f3b45857da..efc57b5e8a4 100644 --- a/src/osd/ECUtil.cc +++ b/src/osd/ECUtil.cc @@ -56,7 +56,8 @@ int 
ECUtil::decode( for (map::iterator i = to_decode.begin(); i != to_decode.end(); ++i) { - assert(i->second.length() == total_chunk_size); + if (i->second.length() != total_chunk_size) + return -EINVAL; } if (total_chunk_size == 0)