::
- ./vstart.sh -n -l
+ cd build
+ ../src/vstart.sh -n -l
for i in a b c; do
- ./ceph --admin-daemon out/mds.$i.asok config set debug_ms 0
- ./ceph --admin-daemon out/mds.$i.asok config set debug_mds 0
- ./ceph --admin-daemon out/mds.$i.asok config set debug_mds_balancer 2
- ./ceph --admin-daemon out/mds.$i.asok config set mds_beacon_grace 1500
+ bin/ceph --admin-daemon out/mds.$i.asok config set debug_ms 0
+ bin/ceph --admin-daemon out/mds.$i.asok config set debug_mds 2
+ bin/ceph --admin-daemon out/mds.$i.asok config set mds_beacon_grace 1500
done
::
- ./rados put --pool=cephfs_metadata_a greedyspill.lua mds/balancers/greedyspill.lua
+ bin/rados put --pool=cephfs_metadata_a greedyspill.lua ../src/mds/balancers/greedyspill.lua
3. Activate Mantle:
::
- ./ceph mds set allow_multimds true --yes-i-really-mean-it
- ./ceph mds set max_mds 5
- ./ceph mds set balancer greedyspill.lua
+ bin/ceph mds set allow_multimds true --yes-i-really-mean-it
+ bin/ceph mds set max_mds 5
+ bin/ceph fs set cephfs_a balancer greedyspill.lua
4. Mount CephFS in another window:
::
- ./ceph-fuse /cephfs -o allow_other &
+ bin/ceph-fuse /cephfs -o allow_other &
tail -f out/mds.a.log
last MDS tries to check the load of its neighbor, which does not exist.
5. Run a simple benchmark. In our case, we use the Docker mdtest image to
- create load. Assuming that CephFS is mounted in the first container, we can
- share the mount and run 3 clients using:
+ create load:
::
the top of the stack with a (k, v) pair. After reading each value, pop that
value but keep the key for the next call to `lua_next`.
+Reading from RADOS
+~~~~~~~~~~~~~~~~~~
+
+All MDSs will read balancing code from RADOS when the balancer version changes
+in the MDS Map. The balancer pulls the Lua code from RADOS synchronously. We do
+this with a timeout: if the asynchronous read does not come back within half
+the balancing tick interval, the operation is cancelled and a Connection Timeout
+error is returned. By default, the balancing tick interval is 10 seconds, so
+Mantle will use a 5 second timeout. This design allows Mantle to
+immediately return an error if anything RADOS-related goes wrong.
+
+We use this implementation because we do not want to do a blocking OSD read
+from inside the global MDS lock. Doing so would bring down the MDS cluster if
+any of the OSDs are not responsive -- this is tested in the ceph-qa-suite by
+setting all OSDs to down/out and making sure the MDS cluster stays active.
+
+One approach would be to asynchronously fire the read when handling the MDS Map
+and fill in the Lua code in the background. We cannot do this because the MDS
+does not support daemon-local fallbacks and the balancer assumes that all MDSs
+come to the same decision at the same time (e.g., importers, exporters, etc.).
+
Debugging
~~~~~~~~
(`dout_wrapper`) to Lua with the `lua_register()` primitive. The Lua code is
actually calling the `dout` function in C++.
+Warning and Info messages are centralized using the clog/Beacon. Successful
+messages are only sent on version changes by the first MDS to avoid spamming
+the `ceph -w` utility. These messages are used for the integration tests.
+
Testing
~~~~~~
return load;
}
-int MDBalancer::localize_balancer(string const balancer)
+/*
+ * Read synchronously from RADOS using a timeout. We cannot do daemon-local
+ * fallbacks (i.e. kick off async read when we are processing the map and
+ * check status when we get here) with the way the mds is structured.
+ */
+int MDBalancer::localize_balancer()
{
- int64_t pool_id = mds->mdsmap->get_metadata_pool();
- string fname = "/tmp/" + balancer;
-
- dout(15) << "looking for balancer=" << balancer << " in RADOS pool_id=" << pool_id << dendl;
- object_t oid = object_t(balancer);
- object_locator_t oloc(pool_id);
- bufferlist data;
- C_SaferCond waiter;
- mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &data, 0, &waiter);
- int r = waiter.wait();
- if (r == 0) {
- dout(15) << "write data from RADOS into fname=" << fname << " data=" << data.c_str() << dendl;
- data.write_file(fname.c_str());
- } else {
- dout(0) << "tick could not find balancer " << balancer
- << " in RADOS: " << cpp_strerror(r) << dendl;
+ /* reset everything */
+ bool ack = false;
+ int r = 0;
+ bufferlist lua_src;
+ Mutex lock("lock");
+ Cond cond;
+
+ /* we assume that balancer is in the metadata pool */
+ object_t oid = object_t(mds->mdsmap->get_balancer());
+ object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+ ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0,
+ new C_SafeCond(&lock, &cond, &ack, &r));
+ dout(15) << "launched non-blocking read tid=" << tid
+ << " oid=" << oid << " oloc=" << oloc << dendl;
+
+ /* timeout: if we waste half our time waiting for RADOS, then abort! */
+ double t = ceph_clock_now(g_ceph_context) + g_conf->mds_bal_interval/2;
+ utime_t timeout;
+ timeout.set_from_double(t);
+ lock.Lock();
+ int ret_t = cond.WaitUntil(lock, timeout);
+ lock.Unlock();
+
+ /* success: store the balancer in memory and set the version. */
+ if (!r) {
+ if (ret_t == ETIMEDOUT) {
+ mds->objecter->op_cancel(tid, -ECANCELED);
+ return -ETIMEDOUT;
+ }
+ bal_code.assign(lua_src.to_str());
+ bal_version.assign(oid.name);
+ dout(0) << "localized balancer, bal_code=" << bal_code << dendl;
}
return r;
}
// let's go!
//export_empties(); // no!
- int r = mantle_prep_rebalance();
- if (!r) {
- mds->clog->info() << "mantle succeeded; "
- << "balancer=" << mds->mdsmap->get_balancer();
- return;
+ /* avoid spamming ceph -w if user does not turn mantle on */
+ if (mds->mdsmap->get_balancer() != "") {
+ int r = mantle_prep_rebalance();
+ if (!r) return;
+ mds->clog->warn() << "using old balancer; mantle failed for "
+ << "balancer=" << mds->mdsmap->get_balancer()
+ << " : " << cpp_strerror(r);
}
-
- mds->clog->warn() << "mantle failed (falling back to original balancer); "
- << "balancer=" << mds->mdsmap->get_balancer()
- << " : " << cpp_strerror(r);
prep_rebalance(m->get_beat());
}
}
int MDBalancer::mantle_prep_rebalance()
{
- /* pull metadata balancer from RADOS */
- string const balancer = mds->mdsmap->get_balancer();
- if (balancer == "" || localize_balancer(balancer))
- return -ENOENT;
- ifstream f("/tmp/" + balancer);
- string script((istreambuf_iterator<char>(f)),
- istreambuf_iterator<char>());
+ /* refresh balancer if it has changed */
+ if (bal_version != mds->mdsmap->get_balancer()) {
+ bal_version.assign("");
+ int r = localize_balancer();
+ if (r) return r;
+
+ /* only spam the cluster log from 1 mds on version changes */
+ if (mds->get_nodeid() == 0)
+ mds->clog->info() << "mantle balancer version changed: " << bal_version;
+ }
/* prepare for balancing */
int cluster_size = mds->get_mds_map()->get_num_in_mds();
std::pair < map<mds_rank_t, mds_load_t>::iterator, bool > r(mds_load.insert(val));
mds_load_t &load(r.first->second);
- metrics[i].insert(make_pair("auth.meta_load", load.auth.meta_load()));
- metrics[i].insert(make_pair("all.meta_load", load.all.meta_load()));
- metrics[i].insert(make_pair("req_rate", load.req_rate));
- metrics[i].insert(make_pair("queue_len", load.queue_len));
- metrics[i].insert(make_pair("cpu_load_avg", load.cpu_load_avg));
+ metrics[i] = {{"auth.meta_load", load.auth.meta_load()},
+ {"all.meta_load", load.all.meta_load()},
+ {"req_rate", load.req_rate},
+ {"queue_len", load.queue_len},
+ {"cpu_load_avg", load.cpu_load_avg}};
}
/* execute the balancer */
- Mantle *mantle = new Mantle();
- int ret = mantle->balance(script, mds->get_nodeid(), metrics, my_targets);
- delete mantle;
+ Mantle mantle;
+ int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, my_targets);
dout(2) << " mantle decided that new targets=" << my_targets << dendl;
/* mantle doesn't know about cluster size, so check target len here */
#include "include/types.h"
#include "common/Clock.h"
+#include "common/Cond.h"
#include "CInode.h"
int last_epoch_under;
int last_epoch_over;
+ string bal_code;
+ string bal_version;
utime_t last_heartbeat;
utime_t last_fragment;
int proc_message(Message *m);
- int localize_balancer(string const balancer);
+ int localize_balancer();
void send_heartbeat();
void handle_heartbeat(MHeartbeat *m);
f->dump_int("metadata_pool", metadata_pool);
f->dump_bool("enabled", enabled);
f->dump_string("fs_name", fs_name);
+ f->dump_string("balancer", balancer);
}
void MDSMap::generate_test_instances(list<MDSMap*>& ls)
out << "data_pools\t" << data_pools << "\n";
out << "metadata_pool\t" << metadata_pool << "\n";
out << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n";
+ out << "balancer\t" << balancer << "\n";
multimap< pair<mds_rank_t, unsigned>, mds_gid_t > foo;
for (const auto &p : mds_info) {
::encode(session_autoclose, bl);
::encode(max_file_size, bl);
::encode(max_mds, bl);
- ::encode(balancer, bl);
__u32 n = mds_info.size();
::encode(n, bl);
for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
::encode(session_autoclose, bl);
::encode(max_file_size, bl);
::encode(max_mds, bl);
- ::encode(balancer, bl);
__u32 n = mds_info.size();
::encode(n, bl);
for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
return;
}
- ENCODE_START(6, 4, bl);
+ ENCODE_START(5, 4, bl);
::encode(epoch, bl);
::encode(flags, bl);
::encode(last_failure, bl);
::encode(session_autoclose, bl);
::encode(max_file_size, bl);
::encode(max_mds, bl);
- ::encode(balancer, bl);
::encode(mds_info, bl, features);
::encode(data_pools, bl);
::encode(cas_pool, bl);
// kclient ignores everything from here
- __u16 ev = 10;
+ __u16 ev = 11;
::encode(ev, bl);
::encode(compat, bl);
::encode(metadata_pool, bl);
::encode(enabled, bl);
::encode(fs_name, bl);
::encode(damaged, bl);
+ ::encode(balancer, bl);
ENCODE_FINISH(bl);
}
std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
cached_up_features = 0;
- DECODE_START_LEGACY_COMPAT_LEN_16(6, 4, 4, p);
+ DECODE_START_LEGACY_COMPAT_LEN_16(5, 4, 4, p);
::decode(epoch, p);
::decode(flags, p);
::decode(last_failure, p);
::decode(session_autoclose, p);
::decode(max_file_size, p);
::decode(max_mds, p);
- ::decode(balancer, p);
::decode(mds_info, p);
if (struct_v < 3) {
__u32 n;
if (ev >= 9) {
::decode(damaged, p);
}
+
+ if (ev >= 11) {
+ ::decode(balancer, p);
+ }
DECODE_FINISH(p);
}
*/
mds_rank_t max_mds; /* The maximum number of active MDSes. Also, the maximum rank. */
- string balancer; /* The name and version of the metadata load balancer. */
+ string balancer; /* The name/version of the mantle balancer (i.e. the rados obj name) */
std::set<mds_rank_t> in; // currently defined cluster
mds_rank_t get_max_mds() const { return max_mds; }
void set_max_mds(mds_rank_t m) { max_mds = m; }
- std::string get_balancer() const { return balancer; }
+ const std::string get_balancer() const { return balancer; }
void set_balancer(std::string val) { balancer.assign(val); }
mds_rank_t get_tableserver() const { return tableserver; }
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Michael Sevilla <mikesevilla3@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
#include "mdstypes.h"
#include "MDSRank.h"
#include "Mantle.h"
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Michael Sevilla <mikesevilla3@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MANTLE_H
+#define CEPH_MANTLE_H
+
#include <lua.hpp>
#include <list>
#include <map>
-using std::list;
-using std::map;
#include "include/types.h"
#include "common/Clock.h"
#include "CInode.h"
-
-
-class MDSRank;
-class Message;
-class MHeartbeat;
-class CInode;
-class CDir;
-class Messenger;
-class MonClient;
-
class Mantle {
protected:
lua_State *L;
vector < map<string, double> > metrics,
map<mds_rank_t,double> &my_targets);
};
+
+#endif
"set max MDS index", "mds", "rw", "cli,rest", FLAG(DEPRECATED))
COMMAND_WITH_FLAG("mds set " \
"name=var,type=CephChoices,strings=max_mds|max_file_size"
- "|allow_new_snaps|inline_data|allow_multimds|allow_dirfrags|balancer " \
+ "|allow_new_snaps|inline_data|allow_multimds|allow_dirfrags " \
"name=val,type=CephString " \
"name=confirm,type=CephString,req=false", \
"set mds parameter <var> to <val>", "mds", "rw", "cli,rest", FLAG(DEPRECATED))
COMMAND("fs set " \
"name=fs_name,type=CephString " \
"name=var,type=CephChoices,strings=max_mds|max_file_size"
- "|allow_new_snaps|inline_data|cluster_down|allow_multimds|allow_dirfrags " \
+ "|allow_new_snaps|inline_data|cluster_down|allow_multimds|allow_dirfrags|balancer " \
"name=val,type=CephString " \
"name=confirm,type=CephString,req=false", \
"set mds parameter <var> to <val>", "mds", "rw", "cli,rest")