From 79415957431077b5adb9868e0a36a472d1fbe378 Mon Sep 17 00:00:00 2001 From: Christian Theune Date: Thu, 4 Oct 2018 09:13:22 +0200 Subject: [PATCH] osd: fix OSD crash when trimming corrupted stores This has been properly fixed in luminous but hasn't been backported and the patch needed a manual update. This is a little bit of a mashup between what I found in the luminous code and what the patch for Hammer discussed in http://tracker.ceph.com/issues/6101 did. Fixes: http://tracker.ceph.com/issues/6101 Signed-off-by: Christian Theune --- src/osd/ReplicatedPG.cc | 32 ++++++++++++++++++++++---------- src/osd/ReplicatedPG.h | 2 +- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 5d37b1ed9a5d8..fb8103214532c 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3383,14 +3383,15 @@ void ReplicatedPG::do_backfill(OpRequestRef op) } } -ReplicatedPG::OpContextUPtr ReplicatedPG::trim_object(const hobject_t &coid) +int ReplicatedPG::trim_object(const hobject_t &coid, ReplicatedPG::OpContextUPtr* ctxp) { + *ctxp = NULL; // load clone info bufferlist bl; ObjectContextRef obc = get_object_context(coid, false, NULL); if (!obc) { derr << __func__ << "could not find coid " << coid << dendl; - assert(0); + return -ENOENT; } assert(obc->ssc); @@ -3404,7 +3405,7 @@ ReplicatedPG::OpContextUPtr ReplicatedPG::trim_object(const hobject_t &coid) set old_snaps(coi.snaps.begin(), coi.snaps.end()); if (old_snaps.empty()) { osd->clog->error() << __func__ << " No object info snaps for " << coid << "\n"; - return NULL; + return -ENOENT; } SnapSet& snapset = obc->ssc->snapset; @@ -3413,7 +3414,7 @@ ReplicatedPG::OpContextUPtr ReplicatedPG::trim_object(const hobject_t &coid) << " old snapset " << snapset << dendl; if (snapset.seq == 0) { osd->clog->error() << __func__ << " No snapset.seq for " << coid << "\n"; - return NULL; + return -ENOENT; } set new_snaps; @@ -3430,7 +3431,7 @@ ReplicatedPG::OpContextUPtr ReplicatedPG::trim_object(const hobject_t &coid) p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap); if (p == snapset.clones.end()) { osd->clog->error() << __func__ << " Snap " << coid.snap << " not in clones" << "\n"; - return NULL; + return -ENOENT; } } @@ -3442,7 +3443,7 @@ ReplicatedPG::OpContextUPtr ReplicatedPG::trim_object(const hobject_t &coid) obc)) { close_op_ctx(ctx.release()); dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl; - return NULL; + return -ENOLCK; } if (!ctx->lock_manager.get_snaptrimmer_write( @@ -3450,7 +3451,7 @@ ReplicatedPG::OpContextUPtr ReplicatedPG::trim_object(const hobject_t &coid) snapset_obc)) { close_op_ctx(ctx.release()); dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl; - return NULL; + return -ENOLCK; } ctx->at_version = get_next_version(); @@ -3633,7 +3634,8 @@ ReplicatedPG::OpContextUPtr ReplicatedPG::trim_object(const hobject_t &coid) } } - return ctx; + *ctxp = std::move(ctx); + return 0; } void ReplicatedPG::snap_trimmer(epoch_t queued) @@ -13201,8 +13203,18 @@ boost::statechart::result ReplicatedPG::TrimmingObjects::react(const SnapTrim&) } dout(10) << "TrimmingObjects react trimming " << pos << dendl; - OpContextUPtr ctx = pg->trim_object(pos); - if (!ctx) { + OpContextUPtr ctx; + r = pg->trim_object(pos, &ctx); + if (r == -ENOENT) { + dout(10) << "TrimmingObjects cannot find snap, dropping from snaptrimq: " << pos << dendl; + post_event(SnapTrim()); + if (in_flight.empty()) { + return transit< NotTrimming >(); + } else { + return transit< WaitingOnReplicas >(); + } + } + if (r == -ENOLCK) { dout(10) << __func__ << " could not get write lock on obj " << pos << dendl; pos = old_pos; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 8a800ddb804bc..e82f33e476c19 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -1495,7 +1495,7 @@ public: ThreadPool::TPHandle &handle); void do_backfill(OpRequestRef op); - OpContextUPtr trim_object(const hobject_t &coid); + int trim_object(const hobject_t &coid, OpContextUPtr* ctxp); void snap_trimmer(epoch_t e); int do_osd_ops(OpContext *ctx, vector& ops); -- 2.39.5