From 5d241696701f8f4d5a3b366054af4d220dfe39e9 Mon Sep 17 00:00:00 2001 From: sageweil Date: Wed, 6 Jun 2007 18:47:53 +0000 Subject: [PATCH] * tweak to still allow primary-directed read balancing * pg_bits -> pg_num, smarter 'mod' function git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1400 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/pgs/TODO | 5 +- branches/sage/pgs/config.cc | 4 +- branches/sage/pgs/messages/MOSDOp.h | 3 + branches/sage/pgs/mon/OSDMonitor.cc | 18 +---- branches/sage/pgs/osd/OSD.cc | 26 ++++--- branches/sage/pgs/osd/OSDMap.h | 106 +++++++++++++++----------- branches/sage/pgs/osd/ReplicatedPG.cc | 16 +++- 7 files changed, 99 insertions(+), 79 deletions(-) diff --git a/branches/sage/pgs/TODO b/branches/sage/pgs/TODO index 9bbf0a8b5e24a..2925ba907ebd2 100644 --- a/branches/sage/pgs/TODO +++ b/branches/sage/pgs/TODO @@ -28,7 +28,7 @@ code cleanup - clean up all encoded structures general kernel planning -- soft consistency on lookup? +- soft consistency on (kernel) lookup? @@ -219,10 +219,7 @@ objecter osd/rados - read+floor_lockout for clean STOGITH-like/fencing semantics after failover. -- separate out replication code into a PG class, to pave way for RAID - - efficiently replicate clone() objects -- pg_num instead of pg_bits - flag missing log entries on crash recovery --> WRNOOP? or WRLOST? - consider implications of nvram writeahead logs - fix heartbeat wrt new replication diff --git a/branches/sage/pgs/config.cc b/branches/sage/pgs/config.cc index 66e3033918ec1..fa06c4d66ccd6 100644 --- a/branches/sage/pgs/config.cc +++ b/branches/sage/pgs/config.cc @@ -206,8 +206,8 @@ md_config_t g_conf = { osd_rep: OSD_REP_PRIMARY, osd_balance_reads: false, - osd_immediate_read_from_cache: true, // osds to read from the cache immediately? - osd_exclusive_caching: false, + osd_immediate_read_from_cache: true, // osds to read from the cache immediately? + osd_exclusive_caching: true, // replicas evict replicated writes osd_load_diff_percent: 20, // load diff for read forwarding osd_flash_crowd_iat_threshold: 100, osd_flash_crowd_iat_alpha: 0.125, diff --git a/branches/sage/pgs/messages/MOSDOp.h b/branches/sage/pgs/messages/MOSDOp.h index 220da427aeb02..96b389b119a7d 100644 --- a/branches/sage/pgs/messages/MOSDOp.h +++ b/branches/sage/pgs/messages/MOSDOp.h @@ -46,6 +46,7 @@ #define OSD_OP_RDUNLOCK 23 #define OSD_OP_UPLOCK 24 #define OSD_OP_DNLOCK 25 +#define OSD_OP_MININCLOCK 26 // minimum incarnation lock #define OSD_OP_PULL 30 #define OSD_OP_PUSH 31 @@ -74,6 +75,8 @@ public: case OSD_OP_UPLOCK: return "uplock"; case OSD_OP_DNLOCK: return "dnlock"; + case OSD_OP_MININCLOCK: return "mininclock"; + case OSD_OP_BALANCEREADS: return "balance-reads"; case OSD_OP_UNBALANCEREADS: return "unbalance-reads"; diff --git a/branches/sage/pgs/mon/OSDMonitor.cc b/branches/sage/pgs/mon/OSDMonitor.cc index 9a9b677a4ac3f..d81b2de404428 100644 --- a/branches/sage/pgs/mon/OSDMonitor.cc +++ b/branches/sage/pgs/mon/OSDMonitor.cc @@ -125,22 +125,10 @@ void OSDMonitor::create_initial() osdmap.ctime = g_clock.now(); if (g_conf.osd_pg_bits) { - osdmap.set_pg_bits(g_conf.osd_pg_bits); + osdmap.set_pg_num(1 << g_conf.osd_pg_bits); } else { - // figure out how many bits worth of osds we have. - // 1 osd -> 0 bits - // <= 2 osds -> 1 bit - // <= 4 osds -> 2 bits - int osdbits = -1; - int n = g_conf.num_osd; - assert(n > 0); - while (n) { - n = n >> 1; - osdbits++; - } - - // 7 bits per osd. - osdmap.set_pg_bits(osdbits + 4); // FIXME + // 4 bits of pgs per osd. + osdmap.set_pg_num(g_conf.num_osd << 4); } // start at epoch 0 until all osds boot diff --git a/branches/sage/pgs/osd/OSD.cc b/branches/sage/pgs/osd/OSD.cc index 96e80e8337c01..4f9d001a3c459 100644 --- a/branches/sage/pgs/osd/OSD.cc +++ b/branches/sage/pgs/osd/OSD.cc @@ -1157,9 +1157,15 @@ void OSD::advance_map(ObjectStore::Transaction& t) << dendl; if (osdmap->is_mkfs()) { - ps_t maxps = 1ULL << osdmap->get_pg_bits(); - ps_t maxlps = 1ULL << osdmap->get_localized_pg_bits(); - dout(1) << "mkfs on " << osdmap->get_pg_bits() << " bits, " << maxps << " pgs" << dendl; + ps_t numps = osdmap->get_pg_num(); + ps_t numlps = osdmap->get_localized_pg_num(); + dout(1) << "mkfs on " << numps << " normal, " << numlps << " localized pg sets" << dendl; + int minrep = 1; + int maxrep = MIN(g_conf.num_osd, g_conf.osd_max_rep); + int minraid = g_conf.osd_min_raid_width; + int maxraid = g_conf.osd_max_raid_width; + dout(1) << "mkfs " << minrep << ".." << maxrep << " replicas, " + << minraid << ".." << maxraid << " osd raid groups" << dendl; assert(osdmap->get_epoch() == 1); //cerr << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << dendl; @@ -1170,9 +1176,9 @@ void OSD::advance_map(ObjectStore::Transaction& t) // create PGs // replicated for (int nrep = 1; - nrep <= MIN(g_conf.num_osd, g_conf.osd_max_rep); // for low osd counts.. hackish bleh + nrep <= maxrep; // for low osd counts.. hackish bleh nrep++) { - for (ps_t ps = 0; ps < maxps; ++ps) { + for (ps_t ps = 0; ps < numps; ++ps) { vector acting; pg_t pgid = pg_t(pg_t::TYPE_REP, nrep, ps, -1); int nrep = osdmap->pg_to_acting_osds(pgid, acting); @@ -1193,7 +1199,7 @@ void OSD::advance_map(ObjectStore::Transaction& t) _unlock_pg(pgid); } - for (ps_t ps = 0; ps < maxlps; ++ps) { + for (ps_t ps = 0; ps < numlps; ++ps) { // local PG too vector acting; pg_t pgid = pg_t(pg_t::TYPE_REP, nrep, ps, whoami); @@ -1216,10 +1222,10 @@ void OSD::advance_map(ObjectStore::Transaction& t) } // raided - for (int size = g_conf.osd_min_raid_width; - size <= g_conf.osd_max_raid_width; + for (int size = minraid; + size <= maxraid; size++) { - for (ps_t ps = 0; ps < maxps; ++ps) { + for (ps_t ps = 0; ps < numps; ++ps) { vector acting; pg_t pgid = pg_t(pg_t::TYPE_RAID4, size, ps, -1); int nrep = osdmap->pg_to_acting_osds(pgid, acting); @@ -1240,7 +1246,7 @@ void OSD::advance_map(ObjectStore::Transaction& t) _unlock_pg(pgid); } - for (ps_t ps = 0; ps < maxlps; ++ps) { + for (ps_t ps = 0; ps < numlps; ++ps) { // local PG too vector acting; pg_t pgid = pg_t(pg_t::TYPE_RAID4, size, ps, whoami); diff --git a/branches/sage/pgs/osd/OSDMap.h b/branches/sage/pgs/osd/OSDMap.h index f03b0f235ee3d..f8c5b3ebdd036 100644 --- a/branches/sage/pgs/osd/OSDMap.h +++ b/branches/sage/pgs/osd/OSDMap.h @@ -49,7 +49,7 @@ using namespace std; #define PG_PS_MASK ((1LL<>1)); +} + +inline int calc_bits_of(int t) { + int b = 0; + while (t) { + t = t >> 1; + b++; + } + return b; +} + + /** OSDMap */ @@ -108,8 +125,10 @@ private: epoch_t epoch; // what epoch of the osd cluster descriptor is this epoch_t mon_epoch; // monitor epoch (election iteration) utime_t ctime; // epoch start time - int pg_bits; // placement group bits - int localized_pg_bits; // bits for localized pgs + int pg_num; // placement group count + int pg_num_mask; // bitmask for above + int localized_pg_num; // localized place group count + int localized_pg_num_mask; // ditto set osds; // all osds set down_osds; // list of down disks @@ -124,15 +143,24 @@ private: friend class MDS; public: - OSDMap() : epoch(0), mon_epoch(0), pg_bits(5), localized_pg_bits(3) {} + OSDMap() : epoch(0), mon_epoch(0), + pg_num(1<<5), + localized_pg_num(1<<3) { + calc_pg_masks(); + } // map info epoch_t get_epoch() const { return epoch; } void inc_epoch() { epoch++; } - int get_pg_bits() const { return pg_bits; } - void set_pg_bits(int b) { pg_bits = b; } - int get_localized_pg_bits() const { return localized_pg_bits; } + void calc_pg_masks() { + pg_num_mask = (1 << calc_bits_of(pg_num-1)) - 1; + localized_pg_num_mask = (1 << calc_bits_of(localized_pg_num-1)) - 1; + } + + int get_pg_num() const { return pg_num; } + void set_pg_num(int m) { pg_num = m; calc_pg_masks(); } + int get_localized_pg_num() const { return localized_pg_num; } const utime_t& get_ctime() const { return ctime; } @@ -225,36 +253,35 @@ private: // serialize, unserialize void encode(bufferlist& blist) { - blist.append((char*)&epoch, sizeof(epoch)); - blist.append((char*)&mon_epoch, sizeof(mon_epoch)); - blist.append((char*)&ctime, sizeof(ctime)); - blist.append((char*)&pg_bits, sizeof(pg_bits)); + ::_encode(epoch, blist); + ::_encode(mon_epoch, blist); + ::_encode(ctime, blist); + ::_encode(pg_num, blist); + ::_encode(localized_pg_num, blist); - _encode(osds, blist); - _encode(down_osds, blist); - _encode(out_osds, blist); - _encode(overload_osds, blist); - _encode(osd_inst, blist); + ::_encode(osds, blist); + ::_encode(down_osds, blist); + ::_encode(out_osds, blist); + ::_encode(overload_osds, blist); + ::_encode(osd_inst, blist); crush._encode(blist); } void decode(bufferlist& blist) { int off = 0; - blist.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - blist.copy(off, sizeof(mon_epoch), (char*)&mon_epoch); - off += sizeof(mon_epoch); - blist.copy(off, sizeof(ctime), (char*)&ctime); - off += sizeof(ctime); - blist.copy(off, sizeof(pg_bits), (char*)&pg_bits); - off += sizeof(pg_bits); - - _decode(osds, blist, off); - _decode(down_osds, blist, off); - _decode(out_osds, blist, off); - _decode(overload_osds, blist, off); - _decode(osd_inst, blist, off); + ::_decode(epoch, blist, off); + ::_decode(mon_epoch, blist, off); + ::_decode(ctime, blist, off); + ::_decode(pg_num, blist, off); + ::_decode(localized_pg_num, blist, off); + calc_pg_masks(); + + ::_decode(osds, blist, off); + ::_decode(down_osds, blist, off); + ::_decode(out_osds, blist, off); + ::_decode(overload_osds, blist, off); + ::_decode(osd_inst, blist, off); crush._decode(blist, off); } @@ -276,28 +303,15 @@ private: ps_t ps; switch (g_conf.osd_object_layout) { case OBJECT_LAYOUT_LINEAR: - { - //const object_t ono = oid.bno; - //const inodeno_t ino = oid >> OID_ONO_BITS; - ps = (oid.bno + oid.ino) & PG_PS_MASK; - ps &= ((1ULL<> OID_ONO_BITS; - ps = (oid.bno + H(oid.ino)) & PG_PS_MASK; - ps &= ((1ULL<> 32) ) & PG_PS_MASK; - ps &= ((1ULL<> 32) ), pg_num, pg_num_mask); break; default: diff --git a/branches/sage/pgs/osd/ReplicatedPG.cc b/branches/sage/pgs/osd/ReplicatedPG.cc index ebadf9ded163f..06b533928cdfe 100644 --- a/branches/sage/pgs/osd/ReplicatedPG.cc +++ b/branches/sage/pgs/osd/ReplicatedPG.cc @@ -374,7 +374,10 @@ void ReplicatedPG::op_read(MOSDOp *op) return; // !primary and unbalanced? - if (!is_primary()) { + // (ignore ops forwarded from the primary) + if (!is_primary() && + !(op->get_source().is_osd() && + op->get_source().num() == get_primary())) { // make sure i exist and am balanced, otherwise fw back to acker. bool b; if (!osd->store->exists(oid) || @@ -550,15 +553,24 @@ void ReplicatedPG::prepare_op_transaction(ObjectStore::Transaction& t, } break; + case OSD_OP_MININCLOCK: + { + uint32_t mininc = op->get_length(); + t.setattr(oid, "mininclock", &mininc, sizeof(mininc)); + } + break; + case OSD_OP_BALANCEREADS: { bool bal = true; t.setattr(oid, "balance-reads", &bal, sizeof(bal)); } + break; case OSD_OP_UNBALANCEREADS: { t.rmattr(oid, "balance-reads"); } + break; // -- modify -- @@ -1168,7 +1180,7 @@ void ReplicatedPG::op_modify(MOSDOp *op) // lets evict the data from our cache to maintain a total large cache size if (g_conf.osd_exclusive_caching) - osd->store->trim_from_cache(op->get_oid() , op->get_offset(), op->get_length()); + osd->store->trim_from_cache(op->get_oid(), op->get_offset(), op->get_length()); oncommit->ack(); } -- 2.39.5