From 7f90c723949eebba9b9233ffdf3ea54efaca46aa Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 22 Feb 2017 14:35:20 -0600 Subject: [PATCH] osd: store clone list in SnapSet Store the per-clone snaps list in SnapSet if (1) REQUIRE_LUMINOUS is set in the OSDMap and (2) the SnapSet isn't a 'legacy' SnapSet that hasn't been converted yet by scrub. Signed-off-by: Sage Weil --- src/osd/PG.cc | 60 +++++++-- src/osd/PrimaryLogPG.cc | 244 +++++++++++++++++++++++++---------- src/osd/ReplicatedBackend.cc | 1 + src/osd/osd_types.h | 2 +- 4 files changed, 224 insertions(+), 83 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index cdc52784dbaf..4ecc8fbbc9b6 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -3952,33 +3952,67 @@ void PG::_scan_rollback_obs( void PG::_scan_snaps(ScrubMap &smap) { - for (map::iterator i = smap.objects.begin(); - i != smap.objects.end(); + hobject_t head; + SnapSet snapset; + for (map::reverse_iterator i = smap.objects.rbegin(); + i != smap.objects.rend(); ++i) { const hobject_t &hoid = i->first; ScrubMap::object &o = i->second; - if (hoid.snap < CEPH_MAXSNAP) { - // check and if necessary fix snap_mapper + if (hoid.is_head() || hoid.is_snapdir()) { + // parse the SnapSet bufferlist bl; - if (o.attrs.find(OI_ATTR) == o.attrs.end()) { + if (o.attrs.find(SS_ATTR) == o.attrs.end()) { continue; } - bl.push_back(o.attrs[OI_ATTR]); - object_info_t oi; + bl.push_back(o.attrs[SS_ATTR]); + auto p = bl.begin(); try { - oi.decode(bl); + ::decode(snapset, p); } catch(...) { continue; } - set oi_snaps(oi.legacy_snaps.begin(), oi.legacy_snaps.end()); + head = hoid.get_head(); + continue; + } + if (hoid.snap < CEPH_MAXSNAP) { + // check and if necessary fix snap_mapper + if (hoid.get_head() != head) { + derr << __func__ << " no head for " << hoid << " (have " << head << ")" + << dendl; + continue; + } + set obj_snaps; + if (!snapset.is_legacy()) { + auto p = snapset.clone_snaps.find(hoid.snap); + if (p == snapset.clone_snaps.end()) { + derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset + << dendl; + continue; + } + obj_snaps.insert(p->second.begin(), p->second.end()); + } else { + bufferlist bl; + if (o.attrs.find(OI_ATTR) == o.attrs.end()) { + continue; + } + bl.push_back(o.attrs[OI_ATTR]); + object_info_t oi; + try { + oi.decode(bl); + } catch(...) { + continue; + } + obj_snaps.insert(oi.legacy_snaps.begin(), oi.legacy_snaps.end()); + } set cur_snaps; int r = snap_mapper.get_snaps(hoid, &cur_snaps); if (r != 0 && r != -ENOENT) { derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl; ceph_abort(); } - if (r == -ENOENT || cur_snaps != oi_snaps) { + if (r == -ENOENT || cur_snaps != obj_snaps) { ObjectStore::Transaction t; OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); if (r == 0) { @@ -3993,7 +4027,7 @@ void PG::_scan_snaps(ScrubMap &smap) << info.pgid << " oid " << hoid << " snaps in mapper: " << cur_snaps << ", oi: " - << oi_snaps + << obj_snaps << "...repaired"; } else { osd->clog->error() << "osd." << osd->whoami @@ -4001,10 +4035,10 @@ void PG::_scan_snaps(ScrubMap &smap) << info.pgid << " oid " << hoid << " snaps missing in mapper" << ", should be: " - << oi_snaps + << obj_snaps << "...repaired"; } - snap_mapper.add_oid(hoid, oi_snaps, &_t); + snap_mapper.add_oid(hoid, obj_snaps, &_t); r = osd->store->apply_transaction(osr.get(), std::move(t)); if (r != 0) { derr << __func__ << ": apply_transaction got " << cpp_strerror(r) diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index c4b152b35ea2..9fc6c4f784bb 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -333,18 +333,27 @@ void PrimaryLogPG::on_local_recover( ObjectRecoveryInfo recovery_info(_recovery_info); clear_object_snap_mapping(t, hoid); - if (recovery_info.soid.snap < CEPH_NOSNAP) { - assert(recovery_info.oi.legacy_snaps.size()); + if (recovery_info.soid.is_snap()) { OSDriver::OSTransaction _t(osdriver.get_transaction(t)); - set snaps( - recovery_info.oi.legacy_snaps.begin(), - recovery_info.oi.legacy_snaps.end()); + set snaps; + dout(20) << " snapset " << recovery_info.ss + << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl; + if (recovery_info.ss.is_legacy() || + recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) { + assert(recovery_info.oi.legacy_snaps.size()); + snaps.insert(recovery_info.oi.legacy_snaps.begin(), + recovery_info.oi.legacy_snaps.end()); + } else { + auto p = recovery_info.ss.clone_snaps.find(hoid.snap); + assert(p != recovery_info.ss.clone_snaps.end()); // hmm, should we warn? + snaps.insert(p->second.begin(), p->second.end()); + } + dout(20) << " snaps " << snaps << dendl; snap_mapper.add_oid( recovery_info.soid, snaps, &_t); } - if (pg_log.get_missing().is_missing(recovery_info.soid) && pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) { assert(is_primary()); @@ -428,7 +437,6 @@ void PrimaryLogPG::on_local_recover( // update pg dirty_info = true; write_if_dirty(*t); - } void PrimaryLogPG::on_global_recover( @@ -3530,15 +3538,30 @@ PrimaryLogPG::OpContextUPtr PrimaryLogPG::trim_object(bool first, const hobject_ ObjectContextRef snapset_obc = get_object_context(snapoid, false); assert(snapset_obc); + SnapSet& snapset = obc->ssc->snapset; + + bool legacy = snapset.is_legacy() || + !get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS); + object_info_t &coi = obc->obs.oi; - set old_snaps(coi.legacy_snaps.begin(), coi.legacy_snaps.end()); + set old_snaps; + if (legacy) { + old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end()); + } else { + auto p = snapset.clone_snaps.find(coid.snap); + if (p == snapset.clone_snaps.end()) { + osd->clog->error() << __func__ << " No clone_snaps in snapset " << snapset + << " for " << coid << "\n"; + return NULL; + } + old_snaps.insert(snapset.clone_snaps[coid.snap].begin(), + snapset.clone_snaps[coid.snap].end()); + } if (old_snaps.empty()) { osd->clog->error() << __func__ << " No object info snaps for " << coid; return NULL; } - SnapSet& snapset = obc->ssc->snapset; - dout(10) << coid << " old_snaps " << old_snaps << " old snapset " << snapset << dendl; if (snapset.seq == 0) { @@ -3633,6 +3656,7 @@ PrimaryLogPG::OpContextUPtr PrimaryLogPG::trim_object(bool first, const hobject_ snapset.clones.erase(p); snapset.clone_overlap.erase(last); snapset.clone_size.erase(last); + snapset.clone_snaps.erase(last); ctx->log.push_back( pg_log_entry_t( @@ -3653,9 +3677,15 @@ PrimaryLogPG::OpContextUPtr PrimaryLogPG::trim_object(bool first, const hobject_ ctx->at_version.version++; } else { // save adjusted snaps for this object - dout(10) << coid << " snaps " << old_snaps - << " -> " << new_snaps << dendl; - coi.legacy_snaps = vector(new_snaps.rbegin(), new_snaps.rend()); + dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl; + if (legacy) { + coi.legacy_snaps = vector(new_snaps.rbegin(), new_snaps.rend()); + } else { + snapset.clone_snaps[coid.snap] = vector(new_snaps.rbegin(), + new_snaps.rend()); + // we still do a 'modify' event on this object just to trigger a + // snapmapper.update ... :( + } coi.prior_version = coi.version; coi.version = ctx->at_version; @@ -3674,12 +3704,12 @@ PrimaryLogPG::OpContextUPtr PrimaryLogPG::trim_object(bool first, const hobject_ ctx->mtime, 0) ); + ctx->at_version.version++; t->update_snaps( coid, old_snaps, new_snaps); - ctx->at_version.version++; } // save head snapset @@ -5195,7 +5225,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops) if (ssc->snapset.head_exists) clonecount++; resp.clones.reserve(clonecount); - for (vector::const_iterator clone_iter = ssc->snapset.clones.begin(); + for (auto clone_iter = ssc->snapset.clones.begin(); clone_iter != ssc->snapset.clones.end(); ++clone_iter) { clone_info ci; ci.cloneid = *clone_iter; @@ -5203,43 +5233,58 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops) hobject_t clone_oid = soid; clone_oid.snap = *clone_iter; - /* No need to take a lock here. We are only inspecting state cached on - * in the ObjectContext, so we aren't performing an actual read unless - * the clone obc is not already loaded (in which case, it cannot have - * an in progress write). We also do not risk exposing uncommitted - * state since we do have a read lock on the head object or snapdir, - * which we would have to write lock in order to make user visible - * modifications to the snapshot state (snap trim related mutations - * are not user visible). - */ - if (is_missing_object(clone_oid)) { - dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl; - wait_for_unreadable_object(clone_oid, ctx->op); - result = -EAGAIN; - break; - } - - ObjectContextRef clone_obc = get_object_context(clone_oid, false); - if (!clone_obc) { - if (maybe_handle_cache( - ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) { - // promoting the clone - result = -EAGAIN; - } else { + if (!ssc->snapset.is_legacy()) { + auto p = ssc->snapset.clone_snaps.find(*clone_iter); + if (p == ssc->snapset.clone_snaps.end()) { osd->clog->error() << "osd." << osd->whoami - << ": missing clone " << clone_oid - << " for oid " - << soid; - // should not happen - result = -ENOENT; + << ": inconsistent clone_snaps found for oid " + << soid << " clone " << *clone_iter + << " snapset " << ssc->snapset; + result = -EINVAL; + break; + } + for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) { + ci.snaps.push_back(*q); + } + } else { + /* No need to take a lock here. We are only inspecting state cached on + * in the ObjectContext, so we aren't performing an actual read unless + * the clone obc is not already loaded (in which case, it cannot have + * an in progress write). We also do not risk exposing uncommitted + * state since we do have a read lock on the head object or snapdir, + * which we would have to write lock in order to make user visible + * modifications to the snapshot state (snap trim related mutations + * are not user visible). + */ + if (is_missing_object(clone_oid)) { + dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl; + wait_for_unreadable_object(clone_oid, ctx->op); + result = -EAGAIN; + break; + } + + ObjectContextRef clone_obc = get_object_context(clone_oid, false); + if (!clone_obc) { + if (maybe_handle_cache( + ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) { + // promoting the clone + result = -EAGAIN; + } else { + osd->clog->error() << "osd." << osd->whoami + << ": missing clone " << clone_oid + << " for oid " + << soid; + // should not happen + result = -ENOENT; + } + break; + } + for (vector::reverse_iterator p = + clone_obc->obs.oi.legacy_snaps.rbegin(); + p != clone_obc->obs.oi.legacy_snaps.rend(); + ++p) { + ci.snaps.push_back(*p); } - break; - } - for (vector::reverse_iterator p = - clone_obc->obs.oi.legacy_snaps.rbegin(); - p != clone_obc->obs.oi.legacy_snaps.rend(); - ++p) { - ci.snaps.push_back(*p); } dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl; @@ -5247,7 +5292,8 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops) map >::const_iterator coi; coi = ssc->snapset.clone_overlap.find(ci.cloneid); if (coi == ssc->snapset.clone_overlap.end()) { - osd->clog->error() << "osd." << osd->whoami << ": inconsistent clone_overlap found for oid " + osd->clog->error() << "osd." << osd->whoami + << ": inconsistent clone_overlap found for oid " << soid << " clone " << *clone_iter; result = -EINVAL; break; @@ -5256,14 +5302,16 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops) ci.overlap.reserve(o.num_intervals()); for (interval_set::const_iterator r = o.begin(); r != o.end(); ++r) { - ci.overlap.push_back(pair(r.get_start(), r.get_len())); + ci.overlap.push_back(pair(r.get_start(), + r.get_len())); } map::const_iterator si; si = ssc->snapset.clone_size.find(ci.cloneid); if (si == ssc->snapset.clone_size.end()) { - osd->clog->error() << "osd." << osd->whoami << ": inconsistent clone_size found for oid " - << soid << " clone " << *clone_iter; + osd->clog->error() << "osd." << osd->whoami + << ": inconsistent clone_size found for oid " + << soid << " clone " << *clone_iter; result = -EINVAL; break; } @@ -6720,7 +6768,12 @@ void PrimaryLogPG::make_writeable(OpContext *ctx) snap_oi->version = ctx->at_version; snap_oi->prior_version = ctx->obs->oi.version; snap_oi->copy_user_bits(ctx->obs->oi); - snap_oi->legacy_snaps = snaps; + + bool legacy = ctx->new_snapset.is_legacy() || + !get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS); + if (legacy) { + snap_oi->legacy_snaps = snaps; + } _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi); @@ -6736,6 +6789,9 @@ void PrimaryLogPG::make_writeable(OpContext *ctx) ctx->delta_stats.num_object_clones++; ctx->new_snapset.clones.push_back(coid.snap); ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size; + if (!legacy) { + ctx->new_snapset.clone_snaps[coid.snap] = snaps; + } // clone_overlap should contain an entry for each clone // (an empty interval_set if there is no overlap) @@ -6746,7 +6802,8 @@ void PrimaryLogPG::make_writeable(OpContext *ctx) // log clone dout(10) << " cloning v " << ctx->obs->oi.version << " to " << coid << " v " << ctx->at_version - << " snaps=" << snaps << dendl; + << " snaps=" << snaps + << " snapset=" << ctx->new_snapset << dendl; ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version, ctx->obs->oi.version, ctx->obs->oi.user_version, @@ -7129,6 +7186,9 @@ void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid); } + bool legacy_snapset = ctx->new_snapset.is_legacy() || + !get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS); + // append to log ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version, ctx->obs->oi.version, @@ -7139,9 +7199,16 @@ void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc case pg_log_entry_t::MODIFY: case pg_log_entry_t::PROMOTE: case pg_log_entry_t::CLEAN: - dout(20) << __func__ << " encoding snaps " << ctx->new_obs.oi.legacy_snaps - << dendl; - ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps); + if (legacy_snapset) { + dout(20) << __func__ << " encoding legacy_snaps " + << ctx->new_obs.oi.legacy_snaps + << dendl; + ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps); + } else { + dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset + << dendl; + ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps); + } break; default: break; @@ -7307,10 +7374,16 @@ int PrimaryLogPG::fill_in_copy_get( // size, mtime reply_obj.size = oi.size; reply_obj.mtime = oi.mtime; + assert(obc->ssc); if (soid.snap < CEPH_NOSNAP) { - reply_obj.snaps = oi.legacy_snaps; + if (obc->ssc->snapset.is_legacy()) { + reply_obj.snaps = oi.legacy_snaps; + } else { + auto p = obc->ssc->snapset.clone_snaps.find(soid.snap); + assert(p != obc->ssc->snapset.clone_snaps.end()); // warn? + reply_obj.snaps = p->second; + } } else { - assert(obc->ssc); reply_obj.snap_seq = obc->ssc->snapset.seq; } if (oi.is_data_digest()) { @@ -7953,15 +8026,22 @@ void PrimaryLogPG::finish_promote(int r, CopyResults *results, tctx->at_version = get_next_version(); filter_snapc(tctx->new_snapset.snaps); vector new_clones; + map> new_clone_snaps; for (vector::iterator i = tctx->new_snapset.clones.begin(); i != tctx->new_snapset.clones.end(); ++i) { - if (*i != soid.snap) + if (*i != soid.snap) { new_clones.push_back(*i); + auto p = tctx->new_snapset.clone_snaps.find(*i); + if (p != tctx->new_snapset.clone_snaps.end()) { + new_clone_snaps[*i] = p->second; + } + } } tctx->new_snapset.clones.swap(new_clones); tctx->new_snapset.clone_overlap.erase(soid.snap); tctx->new_snapset.clone_size.erase(soid.snap); + tctx->new_snapset.clone_snaps.swap(new_clone_snaps); // take RWWRITE lock for duration of our local write. ignore starvation. if (!tctx->lock_manager.take_write_lock( @@ -8013,6 +8093,9 @@ void PrimaryLogPG::finish_promote(int r, CopyResults *results, tctx->extra_reqids = results->reqids; + bool legacy_snapset = tctx->new_snapset.is_legacy() || + !get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS); + if (whiteout) { // create a whiteout tctx->op_t->create(soid); @@ -8042,8 +8125,13 @@ void PrimaryLogPG::finish_promote(int r, CopyResults *results, tctx->new_obs.oi.truncate_size = results->truncate_size; if (soid.snap != CEPH_NOSNAP) { - tctx->new_obs.oi.legacy_snaps = results->snaps; - assert(!tctx->new_obs.oi.legacy_snaps.empty()); + if (legacy_snapset) { + tctx->new_obs.oi.legacy_snaps = results->snaps; + assert(!tctx->new_obs.oi.legacy_snaps.empty()); + } else { + // it's already in the snapset + assert(obc->ssc->snapset.clone_snaps.count(soid.snap)); + } assert(obc->ssc->snapset.clone_size.count(soid.snap)); assert(obc->ssc->snapset.clone_size[soid.snap] == results->object_size); @@ -8057,7 +8145,9 @@ void PrimaryLogPG::finish_promote(int r, CopyResults *results, if (results->mirror_snapset) { assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP); - tctx->new_snapset.from_snap_set(results->snapset); + tctx->new_snapset.from_snap_set( + results->snapset, + !get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)); } tctx->new_snapset.head_exists = true; dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl; @@ -8282,7 +8372,14 @@ int PrimaryLogPG::start_flush( snapc.seq = snapset.seq; snapc.snaps = snapset.snaps; } else { - snapid_t min_included_snap = oi.legacy_snaps.back(); + snapid_t min_included_snap; + if (snapset.is_legacy()) { + min_included_snap = oi.legacy_snaps.back(); + } else { + auto p = snapset.clone_snaps.find(soid.snap); + assert(p != snapset.clone_snaps.end()); + min_included_snap = p->second.back(); + } snapc = snapset.get_ssc_as_of(min_included_snap - 1); } @@ -9541,10 +9638,19 @@ int PrimaryLogPG::find_object_context(const hobject_t& oid, // clone dout(20) << "find_object_context " << soid - << " snaps " << obc->obs.oi.legacy_snaps + << " snapset " << obc->ssc->snapset + << " legacy_snaps " << obc->obs.oi.legacy_snaps << dendl; - snapid_t first = obc->obs.oi.legacy_snaps[obc->obs.oi.legacy_snaps.size()-1]; - snapid_t last = obc->obs.oi.legacy_snaps[0]; + snapid_t first, last; + if (obc->ssc->snapset.is_legacy()) { + first = obc->obs.oi.legacy_snaps.back(); + last = obc->obs.oi.legacy_snaps.front(); + } else { + auto p = obc->ssc->snapset.clone_snaps.find(soid.snap); + assert(p != obc->ssc->snapset.clone_snaps.end()); + first = p->second.back(); + last = p->second.front(); + } if (first <= oid.snap) { dout(20) << "find_object_context " << soid << " [" << first << "," << last << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl; diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index 7b47688d4e3c..ba5ab88151be 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -1414,6 +1414,7 @@ void ReplicatedBackend::prepare_pull( SnapSetContext *ssc = headctx->ssc; assert(ssc); dout(10) << " snapset " << ssc->snapset << dendl; + recovery_info.ss = ssc->snapset; calc_clone_subsets( ssc->snapset, soid, get_parent()->get_local_missing(), get_info().last_backfill, diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 1be4a07745e3..93bdc4735d3a 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -4553,7 +4553,7 @@ struct ObjectRecoveryInfo { eversion_t version; uint64_t size; object_info_t oi; - SnapSet ss; + SnapSet ss; // only populated if soid is_snap() interval_set copy_subset; map> clone_subset; -- 2.47.3