Share one DecayRate among DecayCounters so each counter no longer stores its own double (k).
- osd pg split breaks if not all osds are up...
- mislinked directory? (cpusr.sh, mv /c/* /c/t, more cpusr, ls /c/t)
-?- kclient: after reconnect,
-cp: writing `/c/ceph2.2/bin/gs-gpl': Bad file descriptor
- - need to somehow wake up unreconnected caps? hrm!!
-
-?- kclient: socket creation
-
- snaprealm thing
ceph3:~# find /c
/c
*
*/
+class DecayRate { // decay constant kept outside DecayCounter so many counters can use one rate
+ double k; // k = ln(.5)/half_life; negative for a positive half-life
+
+ friend class DecayCounter; // DecayCounter::decay() reads k directly
+
+public:
+ DecayRate() : k(0) {} // k == 0 makes exp(el*k) == 1, i.e. no decay until set_halflife() is called
+ DecayRate(double hl) { set_halflife(hl); } // NOTE(review): not explicit, so a bare double silently converts to a DecayRate
+ void set_halflife(double hl) { // hl: half-life, presumably in seconds (decay() multiplies k by elapsed utime_t) -- TODO confirm
+ k = log(.5) / hl; // hl == 0 divides by zero; no guard here
+ }
+};
+
class DecayCounter {
protected:
public:
- double k; // k = ln(.5)/half_life
double val; // value
double delta; // delta since last decay
double vel; // recent velocity
public:
void encode(bufferlist& bl) const {
- __u8 struct_v = 2;
+ __u8 struct_v = 3;
::encode(struct_v, bl);
- ::encode(k, bl);
::encode(val, bl);
::encode(delta, bl);
::encode(vel, bl);
double half_life;
::decode(half_life, p);
}
- ::decode(k, p);
+ if (struct_v < 3) {
+ double k;
+ ::decode(k, p);
+ }
::decode(val, p);
::decode(delta, p);
::decode(vel, p);
}
DecayCounter() : val(0), delta(0), vel(0) {
- set_halflife( g_conf.mds_decay_halflife );
- reset();
- }
- DecayCounter(double hl) : val(0), delta(0), vel(0) {
- set_halflife( hl );
reset();
}
* reading
*/
- double get() {
- return get(g_clock.now());
- }
-
- double get(utime_t now) {
- decay(now);
+ double get(utime_t now, const DecayRate& rate) { // returns val after decaying to 'now'; non-const because decay() updates the counter
+ decay(now, rate); // the rate is passed in; the counter no longer stores its own k
return val;
}
* adjusting
*/
- double hit(utime_t now, double v = 1.0) {
- decay(now);
+ double hit(utime_t now, const DecayRate& rate, double v = 1.0) { // record a hit of weight v; returns val+delta, i.e. including hits decay() has not folded in yet
+ decay(now, rate); // bring val up to date first; decay() only acts once at least 1s has elapsed
delta += v;
return val+delta;
}
void adjust(double a) {
val += a;
}
- void adjust(utime_t now, double a) {
- decay(now);
+ void adjust(utime_t now, const DecayRate& rate, double a) { // decay to 'now' at 'rate', then add a directly to val (bypassing delta)
+ decay(now, rate); // unlike adjust(double), this overload decays before adjusting
val += a;
}
void scale(double f) {
* decay etc.
*/
- void set_halflife(double hl) {
- k = log(.5) / hl;
- }
-
void reset() {
reset(g_clock.now());
}
val = delta = 0;
}
- void decay(utime_t now) {
+ void decay(utime_t now, const DecayRate &rate) {
utime_t el = now;
el -= last_decay;
if (el.sec() >= 1) {
// calculate new value
- double newval = (val+delta) * exp((double)el * k);
+ double newval = (val+delta) * exp((double)el * rate.k);
if (newval < .01) newval = 0.0;
// calculate velocity approx
vel += (newval - val) * (double)el;
- vel *= exp((double)el * k);
+ vel *= exp((double)el * rate.k);
val = newval;
delta = 0;
f->set_version(get_version());
f->pop_me = pop_me;
- f->pop_me *= fac;
+ f->pop_me.scale(fac);
// FIXME; this is an approximation
f->pop_nested = pop_nested;
- f->pop_nested *= fac;
+ f->pop_nested.scale(fac);
f->pop_auth_subtree = pop_auth_subtree;
- f->pop_auth_subtree *= fac;
+ f->pop_auth_subtree.scale(fac);
f->pop_auth_subtree_nested = pop_auth_subtree_nested;
- f->pop_auth_subtree_nested *= fac;
+ f->pop_auth_subtree_nested.scale(fac);
dout(10) << " subfrag " << *p << " " << *f << dendl;
subfrags[n++] = f;
void CDir::finish_export(utime_t now)
{
- pop_auth_subtree_nested -= pop_auth_subtree;
+ pop_auth_subtree_nested.sub(now, cache->decayrate, pop_auth_subtree); // take this subtree's load (decayed to now) out of the nested total before it is zeroed below
pop_me.zero(now);
pop_auth_subtree.zero(now);
put(PIN_TEMPEXPORTING);
dirty_old_rstat.clear();
}
-void CDir::decode_import(bufferlist::iterator& blp)
+void CDir::decode_import(bufferlist::iterator& blp, utime_t now)
{
::decode(first, blp);
::decode(fnode, blp);
::decode(pop_me, blp);
::decode(pop_auth_subtree, blp);
- pop_auth_subtree_nested += pop_auth_subtree;
+ pop_auth_subtree_nested.add(now, cache->decayrate, pop_auth_subtree);
::decode(dir_rep_by, blp);
::decode(replica_map, blp);
void abort_export() {
put(PIN_TEMPEXPORTING);
}
- void decode_import(bufferlist::iterator& blp);
+ void decode_import(bufferlist::iterator& blp, utime_t now);
// -- auth pins --
return 0;
}
-mds_load_t MDBalancer::get_load()
+mds_load_t MDBalancer::get_load(utime_t now)
{
mds_load_t load;
for (list<CDir*>::iterator p = ls.begin();
p != ls.end();
p++) {
- load.auth += (*p)->pop_auth_subtree_nested;
- load.all += (*p)->pop_nested;
+ load.auth.add(now, mds->mdcache->decayrate, (*p)->pop_auth_subtree_nested);
+ load.all.add(now, mds->mdcache->decayrate, (*p)->pop_nested);
}
} else {
dout(20) << "get_load no root, no load" << dendl;
beat_epoch++;
// my load
- mds_load_t load = get_load();
+ mds_load_t load = get_load(now);
mds_load[ mds->get_nodeid() ] = load;
// import_map -- how much do i import from whom
int from = im->inode->authority().first;
if (from == mds->get_nodeid()) continue;
if (im->get_inode()->is_stray()) continue;
- import_map[from] += im->pop_auth_subtree.meta_load(now);
+ import_map[from] += im->pop_auth_subtree.meta_load(now, mds->mdcache->decayrate);
}
mds_import_map[ mds->get_nodeid() ] = import_map;
// rescale! turn my mds_load back into meta_load units
double load_fac = 1.0;
if (mds_load[whoami].mds_load() > 0) {
- double metald = mds_load[whoami].auth.meta_load(rebalance_time);
+ double metald = mds_load[whoami].auth.meta_load(rebalance_time, mds->mdcache->decayrate);
double mdsld = mds_load[whoami].mds_load();
load_fac = metald / mdsld;
dout(7) << " load_fac is " << load_fac
CDir *im = *it;
if (im->get_inode()->is_stray()) continue;
- double pop = im->pop_auth_subtree.meta_load(rebalance_time);
+ double pop = im->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
if (g_conf.mds_bal_idle_threshold > 0 &&
pop < g_conf.mds_bal_idle_threshold &&
im->inode != mds->mdcache->get_root() &&
if (dir->inode->is_root()) continue;
if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress
- double pop = dir->pop_auth_subtree.meta_load(rebalance_time);
+ double pop = dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy
if (pop <= amount-have) {
dout(-5) << " - exporting "
<< (*it)->pop_auth_subtree
<< " "
- << (*it)->pop_auth_subtree.meta_load(rebalance_time)
+ << (*it)->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate)
<< " to mds" << target
<< " " << **it
<< dendl;
list<CDir*> bigger_rep, bigger_unrep;
multimap<double, CDir*> smaller;
- double dir_pop = dir->pop_auth_subtree.meta_load(rebalance_time);
+ double dir_pop = dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;
double subdir_sum = 0;
if (subdir->is_frozen()) continue; // can't export this right now!
// how popular?
- double pop = subdir->pop_auth_subtree.meta_load(rebalance_time);
+ double pop = subdir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
subdir_sum += pop;
dout(15) << " subdir pop " << pop << " " << *subdir << dendl;
void MDBalancer::hit_inode(utime_t now, CInode *in, int type, int who)
{
// hit inode
- in->pop.get(type).hit(now);
+ in->pop.get(type).hit(now, mds->mdcache->decayrate);
if (in->get_parent_dn())
hit_dir(now, in->get_parent_dn()->get_dir(), type, who);
// replicate?
if (type == META_POP_IRD && who >= 0) {
- dir->pop_spread.hit(now, who);
+ dir->pop_spread.hit(now, mds->mdcache->decayrate, who);
}
double rd_adj = 0;
if (type == META_POP_IRD &&
dir->last_popularity_sample < last_sample) {
- float dir_pop = dir->pop_auth_subtree.get(type).get(now); // hmm??
+ float dir_pop = dir->pop_auth_subtree.get(type).get(now, mds->mdcache->decayrate); // hmm??
dir->last_popularity_sample = last_sample;
- float pop_sp = dir->pop_spread.get(now);
+ float pop_sp = dir->pop_spread.get(now, mds->mdcache->decayrate);
dir_pop += pop_sp * 10;
//if (dir->ino() == inodeno_t(0x10000000002))
if (!dir->is_rep() &&
dir_pop >= g_conf.mds_bal_replicate_threshold) {
// replicate
- float rdp = dir->pop_me.get(META_POP_IRD).get(now);
+ float rdp = dir->pop_me.get(META_POP_IRD).get(now, mds->mdcache->decayrate);
rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp;
rd_adj /= 2.0; // temper somewhat
while (1) {
- dir->pop_nested.get(type).hit(now, amount);
+ // Pass the shared rate explicitly. hit(now, amount) still compiles against the new
+ // signature, but 'amount' would convert to a temporary DecayRate (used as a
+ // half-life) and the hit weight would fall back to its default of 1.0.
+ dir->pop_nested.get(type).hit(now, mds->mdcache->decayrate, amount);
if (rd_adj != 0.0)
- dir->pop_nested.get(META_POP_IRD).adjust(now, rd_adj);
+ dir->pop_nested.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
if (hit_subtree) {
- dir->pop_auth_subtree.get(type).hit(now, amount);
+ dir->pop_auth_subtree.get(type).hit(now, mds->mdcache->decayrate, amount);
if (rd_adj != 0.0)
- dir->pop_auth_subtree.get(META_POP_IRD).adjust(now, rd_adj);
+ dir->pop_auth_subtree.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
}
if (hit_subtree_nested) {
- dir->pop_auth_subtree_nested.get(type).hit(now, amount);
+ dir->pop_auth_subtree_nested.get(type).hit(now, mds->mdcache->decayrate, amount);
if (rd_adj != 0.0)
- dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(now, rd_adj);
+ dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
}
if (dir->is_subtree_root())
* NOTE: call me _after_ forcing *dir into a subtree root,
* but _before_ doing the encode_export_dirs.
*/
-void MDBalancer::subtract_export(CDir *dir)
+void MDBalancer::subtract_export(CDir *dir, utime_t now)
{
dirfrag_load_vec_t subload = dir->pop_auth_subtree;
dir = dir->inode->get_parent_dir();
if (!dir) break;
- dir->pop_nested -= subload;
- dir->pop_auth_subtree_nested -= subload;
+ dir->pop_nested.sub(now, mds->mdcache->decayrate, subload);
+ dir->pop_auth_subtree_nested.sub(now, mds->mdcache->decayrate, subload);
}
}
-void MDBalancer::add_import(CDir *dir)
+void MDBalancer::add_import(CDir *dir, utime_t now)
{
dirfrag_load_vec_t subload = dir->pop_auth_subtree;
dir = dir->inode->get_parent_dir();
if (!dir) break;
- dir->pop_nested += subload;
- dir->pop_auth_subtree_nested += subload;
+ dir->pop_nested.add(now, mds->mdcache->decayrate, subload);
+ dir->pop_auth_subtree_nested.add(now, mds->mdcache->decayrate, subload);
}
}
++p) {
CDir *dir = *p;
- myfile << (int)dir->pop_me.meta_load(now) << "\t";
- myfile << (int)dir->pop_nested.meta_load(now) << "\t";
- myfile << (int)dir->pop_auth_subtree.meta_load(now) << "\t";
- myfile << (int)dir->pop_auth_subtree_nested.meta_load(now) << "\t";
+ myfile << (int)dir->pop_me.meta_load(now, mds->mdcache->decayrate) << "\t";
+ myfile << (int)dir->pop_nested.meta_load(now, mds->mdcache->decayrate) << "\t";
+ myfile << (int)dir->pop_auth_subtree.meta_load(now, mds->mdcache->decayrate) << "\t";
+ myfile << (int)dir->pop_auth_subtree_nested.meta_load(now, mds->mdcache->decayrate) << "\t";
// filename last
string p;
beat_epoch(0),
last_epoch_under(0), last_epoch_over(0) { }
- mds_load_t get_load();
+ mds_load_t get_load(utime_t);
int proc_message(Message *m);
set<CDir*>& already_exporting);
- void subtract_export(class CDir *ex);
- void add_import(class CDir *im);
+ void subtract_export(class CDir *ex, utime_t now);
+ void add_import(class CDir *im, utime_t now);
void hit_inode(utime_t now, class CInode *in, int type, int who=-1);
void hit_dir(utime_t now, class CDir *dir, int type, int who, double amount=1.0);
lru.lru_set_max(g_conf.mds_cache_size);
lru.lru_set_midpoint(g_conf.mds_cache_mid);
+ decayrate.set_halflife(g_conf.mds_decay_halflife);
+
did_shutdown_log_cap = false;
}
// adjust recursive pop counters
if (dir->is_auth()) {
+ utime_t now = g_clock.now();
CDir *p = dir->get_parent_dir();
while (p) {
- p->pop_auth_subtree -= dir->pop_auth_subtree;
+ p->pop_auth_subtree.sub(now, decayrate, dir->pop_auth_subtree);
if (p->is_subtree_root()) break;
p = p->inode->get_parent_dir();
}
// adjust popularity?
if (dir->is_auth()) {
+ utime_t now = g_clock.now();
CDir *p = dir->get_parent_dir();
while (p) {
- p->pop_auth_subtree += dir->pop_auth_subtree;
+ p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
if (p->is_subtree_root()) break;
p = p->inode->get_parent_dir();
}
CInode *stray; // my stray dir
public:
+ DecayRate decayrate;
+
int num_inodes_with_caps;
int num_caps;
}
// log
- mds_load_t load = balancer->get_load();
+ utime_t now = g_clock.now();
+ mds_load_t load = balancer->get_load(now);
if (logger) {
req_rate = logger->get(l_mds_req);
{
assert(export_peer.count(dir));
int dest = export_peer[dir];
+ utime_t now = g_clock.now();
dout(7) << "export_go_synced " << *dir << " to " << dest << dendl;
cache->show_subtrees();
cache->adjust_subtree_auth(dir, dest, mds->get_nodeid());
// take away the popularity we're sending.
- mds->balancer->subtract_export(dir);
+ mds->balancer->subtract_export(dir, now);
// fill export message with cache data
MExportDir *req = new MExportDir(dir->dirfrag());
- utime_t now = g_clock.now();
map<client_t,entity_inst_t> exported_client_map;
int num_exported_inodes = encode_export_dir(req->export_data,
dir, // recur start point
assert (g_conf.mds_kill_import_at != 5);
CDir *dir = cache->get_dirfrag(m->dirfrag);
assert(dir);
-
+
+ utime_t now = g_clock.now();
int oldauth = m->get_source().num();
dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << dendl;
assert(dir->is_auth() == false);
le,
mds->mdlog->get_current_segment(),
import_caps[dir],
- import_updated_scatterlocks[dir]);
+ import_updated_scatterlocks[dir],
+ now);
}
dout(10) << " " << m->bounds.size() << " imported bounds" << dendl;
le->metablob.add_dir(*it, false); // note that parent metadata is already in the event
// adjust popularity
- mds->balancer->add_import(dir);
+ mds->balancer->add_import(dir, now);
dout(7) << "handle_export_dir did " << *dir << dendl;
EImportStart *le,
LogSegment *ls,
map<CInode*, map<client_t,Capability::Export> >& cap_imports,
- list<ScatterLock*>& updated_scatterlocks)
+ list<ScatterLock*>& updated_scatterlocks, utime_t now)
{
// set up dir
dirfrag_t df;
dout(7) << "decode_import_dir " << *dir << dendl;
// assimilate state
- dir->decode_import(blp);
+ dir->decode_import(blp, now);
// mark (may already be marked from get_or_open_dir() above)
if (!dir->is_auth())
EImportStart *le,
LogSegment *ls,
map<CInode*, map<client_t,Capability::Export> >& cap_imports,
- list<ScatterLock*>& updated_scatterlocks);
+ list<ScatterLock*>& updated_scatterlocks, utime_t now);
public:
void import_reverse(CDir *dir);
assert(t < NUM);
return vec[t];
}
- void adjust(utime_t now, double d) {
+ void adjust(utime_t now, const DecayRate& rate, double d) { // apply the same adjustment d to every counter in the vector
for (int i=0; i<NUM; i++)
- vec[i].adjust(now, d);
+ vec[i].adjust(now, rate, d); // each counter is decayed to 'now' before d is added
}
void zero(utime_t now) {
for (int i=0; i<NUM; i++)
vec[i].reset(now);
}
- double meta_load(utime_t now) {
+ double meta_load(utime_t now, const DecayRate& rate) { // weighted sum of the five metrics, each decayed to 'now'; non-const because get() updates the counters
return
- 1*vec[META_POP_IRD].get(now) +
- 2*vec[META_POP_IWR].get(now) +
- 1*vec[META_POP_READDIR].get(now) +
- 2*vec[META_POP_FETCH].get(now) +
- 4*vec[META_POP_STORE].get(now);
+ 1*vec[META_POP_IRD].get(now, rate) +
+ 2*vec[META_POP_IWR].get(now, rate) +
+ 1*vec[META_POP_READDIR].get(now, rate) +
+ 2*vec[META_POP_FETCH].get(now, rate) +
+ 4*vec[META_POP_STORE].get(now, rate); // weights are unchanged from the removed lines; only the rate argument is new
}
double meta_load() {
return
2*vec[META_POP_FETCH].get_last() +
4*vec[META_POP_STORE].get_last();
}
+
+ // Add each of r's counters, decayed to 'now' at 'rate', to the matching counter here.
+ // r stays non-const because get() decays r's counters in place. This vector's own
+ // counters are adjusted via adjust(double), i.e. without being decayed first.
+ // rate is taken by const reference, matching the other rate-taking methods.
+ void add(utime_t now, const DecayRate& rate, dirfrag_load_vec_t& r) {
+ for (int i=0; i<dirfrag_load_vec_t::NUM; i++)
+ vec[i].adjust(r.vec[i].get(now, rate));
+ }
+ // Subtract each of r's counters, decayed to 'now' at 'rate', from the matching counter here.
+ // r stays non-const because get() decays r's counters in place. This vector's own
+ // counters are adjusted via adjust(double), i.e. without being decayed first.
+ // rate is taken by const reference, matching the other rate-taking methods.
+ void sub(utime_t now, const DecayRate& rate, dirfrag_load_vec_t& r) {
+ for (int i=0; i<dirfrag_load_vec_t::NUM; i++)
+ vec[i].adjust(-r.vec[i].get(now, rate));
+ }
+ void scale(double f) { // replaces the removed operator*=; takes no time or rate argument
+ for (int i=0; i<dirfrag_load_vec_t::NUM; i++)
+ vec[i].scale(f); // forwards f to scale() on each of the NUM counters
+ }
};
WRITE_CLASS_ENCODER(dirfrag_load_vec_t)
-inline dirfrag_load_vec_t& operator+=(dirfrag_load_vec_t& l, dirfrag_load_vec_t& r)
-{
- utime_t now = g_clock.now();
- for (int i=0; i<dirfrag_load_vec_t::NUM; i++)
- l.vec[i].adjust(r.vec[i].get(now));
- return l;
-}
-
-inline dirfrag_load_vec_t& operator-=(dirfrag_load_vec_t& l, dirfrag_load_vec_t& r)
-{
- utime_t now = g_clock.now();
- for (int i=0; i<dirfrag_load_vec_t::NUM; i++)
- l.vec[i].adjust(-r.vec[i].get(now));
- return l;
-}
-
-inline dirfrag_load_vec_t& operator*=(dirfrag_load_vec_t& l, double f)
-{
- for (int i=0; i<dirfrag_load_vec_t::NUM; i++)
- l.vec[i].scale(f);
- return l;
-}
-
-
inline ostream& operator<<(ostream& out, dirfrag_load_vec_t& dl)
{
+ // ugliness! a DecayRate is rebuilt from g_conf.mds_decay_halflife on every call instead of reusing a shared one
utime_t now = g_clock.now();
- return out << "[" << dl.vec[0].get(now) << "," << dl.vec[1].get(now)
- << " " << dl.meta_load(now)
+ DecayRate rate(g_conf.mds_decay_halflife); // runs set_halflife(), i.e. one log() per print
+ return out << "[" << dl.vec[0].get(now, rate) << "," << dl.vec[1].get(now, rate) // get() decays dl in place, which is why dl is a non-const reference
+ << " " << dl.meta_load(now, rate)
<< "]";
}
public:
load_spread_t() : p(0), n(0) {
- for (int i=0; i<MAX; i++) last[i] = -1;
+ for (int i=0; i<MAX; i++)
+ last[i] = -1;
}
- double hit(utime_t now, int who) {
+ double hit(utime_t now, const DecayRate& rate, int who) {
for (int i=0; i<n; i++)
if (last[i] == who)
return count.get_last();
if (p == MAX) p = 0;
- return count.hit(now);
+ return count.hit(now, rate);
}
- double get(utime_t now) {
- return count.get(now);
+ double get(utime_t now, const DecayRate& rate) { // current value of the spread counter, decayed to 'now'
+ return count.get(now, rate); // the caller supplies the rate; count carries none of its own
}
};
heartbeat_messenger(hbm),
heartbeat_thread(this),
heartbeat_dispatcher(this),
- stat_oprate(5.0),
+ decayrate(5.0),
+ stat_oprate(),
peer_stat_lock("OSD::peer_stat_lock"),
read_latency_calc(g_conf.osd_max_opq<1 ? 1:g_conf.osd_max_opq),
qlen_calc(3),
pending_ops > 2*my_stat.qlen) {
now.encode_timeval(&my_stat.stamp);
- my_stat.oprate = stat_oprate.get(now);
+ my_stat.oprate = stat_oprate.get(now, decayrate);
//read_latency_calc.set_size( 20 ); // hrm.
PG *pg = _have_pg(pgid) ? _lookup_lock_pg(pgid):0;
// update qlen stats
- stat_oprate.hit(now);
+ stat_oprate.hit(now, decayrate);
stat_ops++;
stat_qlen += pending_ops;
private:
// -- stats --
+ DecayRate decayrate;
DecayCounter stat_oprate;
int stat_ops; // ops since last heartbeat
int stat_rd_ops;
if (is_primary() &&
g_conf.osd_balance_reads)
- stat_object_temp_rd[soid].hit(now); // hit temp.
+ stat_object_temp_rd[soid].hit(now, osd->decayrate); // hit temp.
} else {
osd->logger->inc(l_osd_c_wr);
osd->logger->inc(l_osd_c_wrb, ctx->indata.length());