From 11992703195b3b6d89bf86883d0d1865786077c5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 26 Nov 2008 11:23:15 -0800 Subject: [PATCH] osd: move stats into PG::Info (disk format change) We want the pg stats to propagate along with last_update. Do so in merge_log. Also, stop doing delayed stats update on primary; we always update the in-core copy of Info, and only delay applying the transaction to disk. At least currently. --- src/TODO | 1 + src/include/ceph_fs.h | 2 +- src/osd/PG.cc | 30 +++++++++------------------ src/osd/PG.h | 6 +++++- src/osd/ReplicatedPG.cc | 46 ++++++++++++++++++++--------------------- src/osd/ReplicatedPG.h | 3 +-- 6 files changed, 40 insertions(+), 48 deletions(-) diff --git a/src/TODO b/src/TODO index 9a9065e18aa9f..28081323357e5 100644 --- a/src/TODO +++ b/src/TODO @@ -145,6 +145,7 @@ mon - osdmon needs to lower-bound old osdmap versions it keeps around? osd +- pg split needs to fix up pg stats. this is tricky with the clone overlap business... - how does an admin intervene when a pg needs to repeer despite a dead osd? - generalize ack semantics? or just change ack from memory to journal? memory/journal/disk... - rdlocks diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 0e87d3270061d..16d2c7c2ba7d9 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -34,7 +34,7 @@ #define CEPH_MON_PROTOCOL 2 #define CEPH_CLIENT_PROTOCOL 1 -#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v003" +#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v004" #define CEPH_MON_ONDISK_MAGIC "ceph mon volume v003" /* diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 0c9334ccbb32f..52b8741082048 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -365,6 +365,7 @@ void PG::merge_log(ObjectStore::Transaction& t, Log &olog, Missing &omissing, in info.last_update = log.top = olog.top; info.log_bottom = log.bottom = olog.bottom; info.log_backlog = log.backlog = olog.backlog; + info.stats = peer_info[fromosd].stats; changed = true; } @@ -451,6 +452,7 @@ void PG::merge_log(ObjectStore::Transaction& t, Log &olog, Missing &omissing, in olog.log, from, to); info.last_update = log.top = olog.top; + info.stats = peer_info[fromosd].stats; // process divergent items if (!divergent.empty()) { @@ -513,8 +515,8 @@ void PG::generate_backlog() vector olist; osd->store->collection_list(info.pgid.to_coll(), olist); - if (olist.size() != pg_stats.num_objects) - dout(10) << " WARNING: " << olist.size() << " != num_objects " << pg_stats.num_objects << dendl; + if (olist.size() != info.stats.num_objects) + dout(10) << " WARNING: " << olist.size() << " != num_objects " << info.stats.num_objects << dendl; int local = 0; @@ -1393,7 +1395,7 @@ void PG::update_stats() if (is_primary()) { // update our stat summary pg_stats_valid = true; - pg_stats_stable = pg_stats; + pg_stats_stable = info.stats; pg_stats_stable.version = info.last_update; pg_stats_stable.reported = osd->osdmap->get_epoch(); pg_stats_stable.state = state; @@ -1421,11 +1423,12 @@ void PG::clear_stats() void PG::write_info(ObjectStore::Transaction& t) { - // write pg info + // pg state bufferlist infobl; ::encode(info, infobl); t.collection_setattr(info.pgid.to_coll(), "info", infobl); - + + // local state bufferlist snapbl; ::encode(snap_collections, snapbl); t.collection_setattr(info.pgid.to_coll(), "snap_collections", snapbl); @@ -1434,10 +1437,6 @@ void PG::write_info(ObjectStore::Transaction& t) ::encode(past_intervals, ki); t.collection_setattr(info.pgid.to_coll(), "past_intervals", ki); - bufferlist st; - ::encode(pg_stats, st); - t.collection_setattr(info.pgid.to_coll(), "stats", st); - dirty_info = false; } @@ -1642,17 +1641,8 @@ void PG::read_state(ObjectStore *store) // past_intervals bl.clear(); store->collection_getattr(info.pgid.to_coll(), "past_intervals", bl); - if (bl.length()) { - p = bl.begin(); - ::decode(past_intervals, p); - } - - bl.clear(); - store->collection_getattr(info.pgid.to_coll(), "stats", bl); - if (bl.length()) { - p = bl.begin(); - ::decode(pg_stats, p); - } + p = bl.begin(); + ::decode(past_intervals, p); read_log(store); } diff --git a/src/osd/PG.h b/src/osd/PG.h index a44d886a5b640..09086ded34ccb 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -68,6 +68,8 @@ public: set dead_snaps; // snaps we need to trim + pg_stat_t stats; + struct History { epoch_t epoch_created; // epoch in which PG was created epoch_t last_epoch_started; // lower bound on last epoch started (anywhere, not necessarily locally) @@ -116,6 +118,7 @@ public: ::encode(last_complete, bl); ::encode(log_bottom, bl); ::encode(log_backlog, bl); + ::encode(stats, bl); history.encode(bl); ::encode(dead_snaps, bl); } @@ -125,6 +128,7 @@ public: ::decode(last_complete, bl); ::decode(log_bottom, bl); ::decode(log_backlog, bl); + ::decode(stats, bl); history.decode(bl); ::decode(dead_snaps, bl); } @@ -603,7 +607,7 @@ protected: Mutex pg_stats_lock; bool pg_stats_valid; - pg_stat_t pg_stats, pg_stats_stable; + pg_stat_t pg_stats_stable; void update_stats(); void clear_stats(); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 1e9e572b019ac..ad2b0c89c15c2 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -431,8 +431,8 @@ bool ReplicatedPG::snap_trimmer() coll_t c = info.pgid.to_snap_coll(sn); vector ls; osd->store->collection_list(c, ls); - if (ls.size() != pg_stats.num_objects) - dout(10) << " WARNING: " << ls.size() << " != num_objects " << pg_stats.num_objects << dendl; + if (ls.size() != info.stats.num_objects) + dout(10) << " WARNING: " << ls.size() << " != num_objects " << info.stats.num_objects << dendl; dout(10) << "snap_trimmer collection " << c << " has " << ls.size() << " items" << dendl; @@ -487,15 +487,15 @@ bool ReplicatedPG::snap_trimmer() vector::iterator n = p - 1; interval_set<__u64> keep; keep.union_of(snapset.clone_overlap[*n], snapset.clone_overlap[*p]); - add_interval_usage(keep, pg_stats); // not deallocated + add_interval_usage(keep, info.stats); // not deallocated snapset.clone_overlap[*n].intersection_of(snapset.clone_overlap[*p]); } else { - add_interval_usage(snapset.clone_overlap[last], pg_stats); // not deallocated + add_interval_usage(snapset.clone_overlap[last], info.stats); // not deallocated } - pg_stats.num_objects--; - pg_stats.num_object_clones--; - pg_stats.num_bytes -= snapset.clone_size[last]; - pg_stats.num_kb -= SHIFT_ROUND_UP(snapset.clone_size[last], 10); + info.stats.num_objects--; + info.stats.num_object_clones--; + info.stats.num_bytes -= snapset.clone_size[last]; + info.stats.num_kb -= SHIFT_ROUND_UP(snapset.clone_size[last], 10); snapset.clones.erase(p); snapset.clone_overlap.erase(last); snapset.clone_size.erase(last); @@ -523,7 +523,7 @@ bool ReplicatedPG::snap_trimmer() if (snapset.clones.empty() && !snapset.head_exists) { dout(10) << coid << " removing head " << head << dendl; t.remove(info.pgid.to_coll(), head); - pg_stats.num_objects--; + info.stats.num_objects--; } else { bl.clear(); ::encode(snapset, bl); @@ -1111,7 +1111,7 @@ int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t req return 0; } -void ReplicatedPG::prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t reqid, pg_stat_t& stats, +void ReplicatedPG::prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t reqid, pobject_t poid, vector& ops, bufferlist& bl, eversion_t old_version, eversion_t at_version, @@ -1139,11 +1139,11 @@ void ReplicatedPG::prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t // clone? if (!did_snap && poid.oid.snap && !ceph_osd_op_type_lock(ops[i].op)) { // is a (non-lock) modification - prepare_clone(t, log_bl, reqid, stats, poid, old_size, old_version, at_version, + prepare_clone(t, log_bl, reqid, info.stats, poid, old_size, old_version, at_version, snapset, snapc); did_snap = true; } - prepare_simple_op(t, reqid, stats, poid, old_size, exists, + prepare_simple_op(t, reqid, info.stats, poid, old_size, exists, ops[i], bp, snapset, snapc); } @@ -1234,8 +1234,6 @@ void ReplicatedPG::apply_repop(RepGather *repop) repop->applied = true; - pg_stats.add(repop->stats); - // any completion stuff to do here? object_t oid = repop->op->get_oid(); ceph_osd_op& first = repop->op->ops[0]; @@ -1587,7 +1585,7 @@ void ReplicatedPG::op_modify(MOSDOp *op) // we are acker. if (op->ops.size()) { // log and update later. - prepare_transaction(repop->t, op->get_reqid(), repop->stats, poid, op->ops, op->get_data(), + prepare_transaction(repop->t, op->get_reqid(), poid, op->ops, op->get_data(), old_version, av, snapset, snapc, op->get_inc_lock(), peers_complete_thru); @@ -1698,7 +1696,7 @@ void ReplicatedPG::sub_op_modify(MOSDSubOp *op) osd->logger->inc("r_wrb", op->get_data().length()); if (op->ops.size()) { - prepare_transaction(t, op->reqid, pg_stats, + prepare_transaction(t, op->reqid, op->poid, op->ops, op->get_data(), op->old_version, op->version, op->snapset, op->snapc, @@ -2720,8 +2718,8 @@ void ReplicatedPG::clean_up_local(ObjectStore::Transaction& t) // be thorough. vector ls; osd->store->collection_list(info.pgid.to_coll(), ls); - if (ls.size() != pg_stats.num_objects) - dout(10) << " WARNING: " << ls.size() << " != num_objects " << pg_stats.num_objects << dendl; + if (ls.size() != info.stats.num_objects) + dout(10) << " WARNING: " << ls.size() << " != num_objects " << info.stats.num_objects << dendl; set s; @@ -2790,8 +2788,8 @@ void ReplicatedPG::scrub() coll_t c = info.pgid.to_coll(); vector ls; osd->store->collection_list(c, ls); - if (ls.size() != pg_stats.num_objects) - dout(10) << "scrub WARNING: " << ls.size() << " != num_objects " << pg_stats.num_objects << dendl; + if (ls.size() != info.stats.num_objects) + dout(10) << "scrub WARNING: " << ls.size() << " != num_objects " << info.stats.num_objects << dendl; dout(10) << "scrub " << ls.size() << " objects" << dendl; sort(ls.begin(), ls.end()); @@ -2897,10 +2895,10 @@ void ReplicatedPG::scrub() } dout(10) << "scrub got " - << stat.num_objects << "/" << pg_stats.num_objects << " objects, " - << stat.num_object_clones << "/" << pg_stats.num_object_clones << " clones, " - << stat.num_bytes << "/" << pg_stats.num_bytes << " bytes, " - << stat.num_kb << "/" << pg_stats.num_kb << " kb." + << stat.num_objects << "/" << info.stats.num_objects << " objects, " + << stat.num_object_clones << "/" << info.stats.num_object_clones << " clones, " + << stat.num_bytes << "/" << info.stats.num_bytes << " bytes, " + << stat.num_kb << "/" << info.stats.num_kb << " kb." << dendl; dout(10) << "scrub finish" << dendl; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 9faf37f97056c..738fe91eda0f6 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -36,7 +36,6 @@ public: ObjectStore::Transaction t; bool applied, aborted; - pg_stat_t stats; set waitfor_ack; set waitfor_nvram; @@ -150,7 +149,7 @@ protected: pobject_t poid, __u64& old_size, bool& exists, ceph_osd_op& op, bufferlist::iterator& bp, SnapSet& snapset, SnapContext& snapc); - void prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t reqid, pg_stat_t& st, + void prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t reqid, pobject_t poid, vector& ops, bufferlist& bl, eversion_t old_version, eversion_t at_version, -- 2.39.5