From 1d5dc29a137df894fb9affdcffa7b35dc1a84f42 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 26 Sep 2016 11:39:41 -0400 Subject: [PATCH] osd: add _fastinfo PG attr for common pg_info_t updates For most IO operations we only update a handful of fields in the pg_info_t structure. However, the full struct, when encoded, is on the order of 800 bytes. This adds a new attribute, _fastinfo, which contains only the most commonly updated fields. When present, the fastinfo fields should be overlaid on top of the full info struct contained in the existing info attr. If a field outside of the "fast" set is updated, we clear the fastinfo attribute and update the full info attr. Signed-off-by: Sage Weil --- src/common/config_opts.h | 2 + src/osd/OSD.cc | 1 + src/osd/PG.cc | 81 ++++++++++++---- src/osd/PG.h | 10 +- src/osd/osd_types.h | 144 +++++++++++++++++++++++++++++ src/tools/ceph_objectstore_tool.cc | 2 +- 6 files changed, 219 insertions(+), 21 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index f01bdb2a9bb..e5fa1c0f81d 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -826,6 +826,8 @@ OPTION(osd_fast_fail_on_connection_refused, OPT_BOOL, true) // immediately mark OPTION(osd_pg_object_context_cache_count, OPT_INT, 64) OPTION(osd_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled +OPTION(osd_fast_info, OPT_BOOL, true) // use fast info attr, if we can + // determines whether PGLog::check() compares written out log to stored log OPTION(osd_debug_pg_log_writeout, OPT_BOOL, false) OPTION(osd_loop_before_reset_tphandle, OPT_U32, 64) // Max number of loop before we reset thread-pool's handle diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 265c4fff8b6..40b14c112c8 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -194,6 +194,7 @@ CompatSet OSD::get_osd_initial_compat_set() { ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS); 
ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO); return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, ceph_osd_feature_incompat); } diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 3d06d90b7d0..d902aac9b33 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -74,6 +74,7 @@ const string infover_key("_infover"); const string info_key("_info"); const string biginfo_key("_biginfo"); const string epoch_key("_epoch"); +const string fastinfo_key("_fastinfo"); template @@ -2739,23 +2740,30 @@ void PG::init( void PG::upgrade(ObjectStore *store) { - assert(info_struct_v <= 8); + assert(info_struct_v <= 9); ObjectStore::Transaction t; - assert(info_struct_v == 7); + assert(info_struct_v >= 7); + + // 8 -> 9 + if (info_struct_v <= 8) { + // no special action needed. + } // 7 -> 8 - pg_log.mark_log_for_rewrite(); - ghobject_t log_oid(OSD::make_pg_log_oid(pg_id)); - ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id)); - t.remove(coll_t::meta(), log_oid); - t.remove(coll_t::meta(), biginfo_oid); + if (info_struct_v <= 7) { + pg_log.mark_log_for_rewrite(); + ghobject_t log_oid(OSD::make_pg_log_oid(pg_id)); + ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id)); + t.remove(coll_t::meta(), log_oid); + t.remove(coll_t::meta(), biginfo_oid); - t.touch(coll, pgmeta_oid); - map v; - __u8 ver = cur_struct_v; - ::encode(ver, v[infover_key]); - t.omap_setkeys(coll, pgmeta_oid, v); + t.touch(coll, pgmeta_oid); + map v; + __u8 ver = cur_struct_v; + ::encode(ver, v[infover_key]); + t.omap_setkeys(coll, pgmeta_oid, v); + } dirty_info = true; dirty_big_info = true; @@ -2785,12 +2793,41 @@ int PG::_prepare_write_info(map *km, pg_info_t &info, pg_info_t &last_written_info, map &past_intervals, bool dirty_big_info, - bool dirty_epoch) + bool dirty_epoch, + bool try_fast_info) { + if (dirty_epoch) { + ::encode(epoch, 
(*km)[epoch_key]); + } + + // try to do info efficiently? + if (!dirty_big_info && try_fast_info) { + pg_fast_info_t fast; + fast.populate_from(info); + fast.apply_to(&last_written_info); + if (info == last_written_info) { + ::encode(fast, (*km)[fastinfo_key]); + return 0; + } + generic_dout(30) << __func__ << " fastinfo failed, info:\n"; + { + JSONFormatter jf(true); + jf.dump_object("info", info); + jf.flush(*_dout); + } + { + *_dout << "\nlast_written_info:\n"; + JSONFormatter jf(true); + jf.dump_object("last_written_info", last_written_info); + jf.flush(*_dout); + } + *_dout << dendl; + } + (*km)[fastinfo_key]; // erase any previous fastinfo + last_written_info = info; + // info. store purged_snaps separately. interval_set purged_snaps; - if (dirty_epoch) - ::encode(epoch, (*km)[epoch_key]); purged_snaps.swap(info.purged_snaps); ::encode(info, (*km)[info_key]); purged_snaps.swap(info.purged_snaps); @@ -2845,7 +2882,8 @@ void PG::prepare_write_info(map *km) info, last_written_info, past_intervals, - dirty_big_info, need_update_epoch); + dirty_big_info, need_update_epoch, + g_conf->osd_fast_info); assert(ret == 0); if (need_update_epoch) last_epoch = get_osdmap()->get_epoch(); @@ -3065,11 +3103,13 @@ int PG::read_info( keys.insert(infover_key); keys.insert(info_key); keys.insert(biginfo_key); + keys.insert(fastinfo_key); ghobject_t pgmeta_oid(pgid.make_pgmeta_oid()); map values; int r = store->omap_get_values(coll, pgmeta_oid, keys, &values); if (r == 0) { - assert(values.size() == 3); + assert(values.size() == 3 || + values.size() == 4); bufferlist::iterator p = values[infover_key].begin(); ::decode(struct_v, p); @@ -3081,6 +3121,13 @@ int PG::read_info( p = values[biginfo_key].begin(); ::decode(past_intervals, p); ::decode(info.purged_snaps, p); + + p = values[fastinfo_key].begin(); + if (!p.end()) { + pg_fast_info_t fast; + ::decode(fast, p); + fast.apply_to(&info); + } return 0; } diff --git a/src/osd/PG.h b/src/osd/PG.h index 8f454fd8434..0a8fbe04a60 100644 
--- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -295,7 +295,9 @@ public: pg_info_t info; ///< current pg info pg_info_t last_written_info; ///< last written info __u8 info_struct_v; - static const __u8 cur_struct_v = 8; + static const __u8 cur_struct_v = 9; + // v9 was fastinfo_key addition + // v8 was the move to a per-pg pgmeta object // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad // (first appeared in cuttlefish). static const __u8 compat_struct_v = 7; @@ -2233,13 +2235,15 @@ private: void prepare_write_info(map *km); public: - static int _prepare_write_info(map *km, + static int _prepare_write_info( + map *km, epoch_t epoch, pg_info_t &info, pg_info_t &last_written_info, map &past_intervals, bool dirty_big_info, - bool dirty_epoch); + bool dirty_epoch, + bool try_fast_info); void write_if_dirty(ObjectStore::Transaction& t); eversion_t get_next_version() const { diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index a02e39a4752..80367a4e184 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -61,6 +61,7 @@ #define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints") #define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object") #define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set") +#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr") /// max recovery priority for MBackfillReserve @@ -2232,6 +2233,149 @@ inline ostream& operator<<(ostream& out, const pg_info_t& pgi) return out; } +/** + * pg_fast_info_t - common pg_info_t fields + * + * These are the fields of pg_info_t (and children) that are updated for + * most IO operations. + * + * ** WARNING ** + * Because we rely on these fields to be applied to the normal + * info struct, adding a new field here that is not also new in info + * means that we must set an incompat OSD feature bit! 
+ */ +struct pg_fast_info_t { + eversion_t last_update; + eversion_t last_complete; + version_t last_user_version; + struct { // pg_stat_t stats + eversion_t version; + version_t reported_seq; + utime_t last_fresh; + utime_t last_active; + utime_t last_peered; + utime_t last_clean; + utime_t last_unstale; + utime_t last_undegraded; + utime_t last_fullsized; + int64_t log_size; // (also ondisk_log_size, which has the same value) + struct { // object_stat_collection_t stats; + struct { // object_stat_sum_t sum + int64_t num_bytes; // in bytes + int64_t num_objects; + int64_t num_object_copies; + int64_t num_rd; + int64_t num_rd_kb; + int64_t num_wr; + int64_t num_wr_kb; + int64_t num_objects_dirty; + } sum; + } stats; + } stats; + + void populate_from(const pg_info_t& info) { + last_update = info.last_update; + last_complete = info.last_complete; + last_user_version = info.last_user_version; + stats.version = info.stats.version; + stats.reported_seq = info.stats.reported_seq; + stats.last_fresh = info.stats.last_fresh; + stats.last_active = info.stats.last_active; + stats.last_peered = info.stats.last_peered; + stats.last_clean = info.stats.last_clean; + stats.last_unstale = info.stats.last_unstale; + stats.last_undegraded = info.stats.last_undegraded; + stats.last_fullsized = info.stats.last_fullsized; + stats.log_size = info.stats.log_size; + stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes; + stats.stats.sum.num_objects = info.stats.stats.sum.num_objects; + stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies; + stats.stats.sum.num_rd = info.stats.stats.sum.num_rd; + stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb; + stats.stats.sum.num_wr = info.stats.stats.sum.num_wr; + stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb; + stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty; + } + + void apply_to(pg_info_t* info) { + info->last_update = last_update; + info->last_complete = last_complete; 
info->last_user_version = last_user_version; + info->stats.version = stats.version; + info->stats.reported_seq = stats.reported_seq; + info->stats.last_fresh = stats.last_fresh; + info->stats.last_active = stats.last_active; + info->stats.last_peered = stats.last_peered; + info->stats.last_clean = stats.last_clean; + info->stats.last_unstale = stats.last_unstale; + info->stats.last_undegraded = stats.last_undegraded; + info->stats.last_fullsized = stats.last_fullsized; + info->stats.log_size = stats.log_size; + info->stats.ondisk_log_size = stats.log_size; + info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes; + info->stats.stats.sum.num_objects = stats.stats.sum.num_objects; + info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies; + info->stats.stats.sum.num_rd = stats.stats.sum.num_rd; + info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb; + info->stats.stats.sum.num_wr = stats.stats.sum.num_wr; + info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb; + info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(last_update, bl); + ::encode(last_complete, bl); + ::encode(last_user_version, bl); + ::encode(stats.version, bl); + ::encode(stats.reported_seq, bl); + ::encode(stats.last_fresh, bl); + ::encode(stats.last_active, bl); + ::encode(stats.last_peered, bl); + ::encode(stats.last_clean, bl); + ::encode(stats.last_unstale, bl); + ::encode(stats.last_undegraded, bl); + ::encode(stats.last_fullsized, bl); + ::encode(stats.log_size, bl); + ::encode(stats.stats.sum.num_bytes, bl); + ::encode(stats.stats.sum.num_objects, bl); + ::encode(stats.stats.sum.num_object_copies, bl); + ::encode(stats.stats.sum.num_rd, bl); + ::encode(stats.stats.sum.num_rd_kb, bl); + ::encode(stats.stats.sum.num_wr, bl); + ::encode(stats.stats.sum.num_wr_kb, bl); + ::encode(stats.stats.sum.num_objects_dirty, bl); + ENCODE_FINISH(bl); + } + void 
decode(bufferlist::iterator& p) { + DECODE_START(1, p); + ::decode(last_update, p); + ::decode(last_complete, p); + ::decode(last_user_version, p); + ::decode(stats.version, p); + ::decode(stats.reported_seq, p); + ::decode(stats.last_fresh, p); + ::decode(stats.last_active, p); + ::decode(stats.last_peered, p); + ::decode(stats.last_clean, p); + ::decode(stats.last_unstale, p); + ::decode(stats.last_undegraded, p); + ::decode(stats.last_fullsized, p); + ::decode(stats.log_size, p); + ::decode(stats.stats.sum.num_bytes, p); + ::decode(stats.stats.sum.num_objects, p); + ::decode(stats.stats.sum.num_object_copies, p); + ::decode(stats.stats.sum.num_rd, p); + ::decode(stats.stats.sum.num_rd_kb, p); + ::decode(stats.stats.sum.num_wr, p); + ::decode(stats.stats.sum.num_wr_kb, p); + ::decode(stats.stats.sum.num_objects_dirty, p); + DECODE_FINISH(p); + } +}; +WRITE_CLASS_ENCODER(pg_fast_info_t) + + struct pg_notify_t { epoch_t query_epoch; epoch_t epoch_sent; diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc index f3a14d84a21..6a2ae1551d7 100644 --- a/src/tools/ceph_objectstore_tool.cc +++ b/src/tools/ceph_objectstore_tool.cc @@ -483,7 +483,7 @@ int write_info(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info, info, last_written_info, past_intervals, - true, true); + true, true, false); if (ret) cerr << "Failed to write info" << std::endl; t.omap_setkeys(coll, pgmeta_oid, km); return ret; -- 2.39.5