}
// take note of latest distribution on mds's
- dn->inode->mds_contacts = in_info->dist;
- dn->inode->mds_dir_auth = in_info->dir_auth;
-
+ if (in_info->spec_defined) {
+ if (in_info->dist.empty() && !dn->inode->mds_contacts.empty()) {
+ dout(9) << "lost dist spec for " << dn->inode->inode.ino << " " << in_info->dist << endl;
+ }
+ if (!in_info->dist.empty() && dn->inode->mds_contacts.empty()) {
+ dout(9) << "got dist spec for " << dn->inode->inode.ino << " " << in_info->dist << endl;
+ }
+ dn->inode->mds_contacts = in_info->dist;
+ dn->inode->mds_dir_auth = in_info->dir_auth;
+ }
return dn->inode;
}
root->inode = trace[i]->inode;
inode_map[root->inode.ino] = root;
}
- root->mds_contacts = trace[i]->dist;
+ if (trace[i]->spec_defined) {
+ root->mds_contacts = trace[i]->dist;
+ root->mds_dir_auth = trace[i]->dir_auth;
+ }
root->last_updated = now;
dout(12) << "insert_trace trace " << i << " root" << endl;
} else {
int mds = 0;
if (cur) {
- if (!auth_best && cur->mds_contacts.size()) {
+ if (!auth_best && cur->get_replicas().size()) {
// try replica(s)
- dout(9) << "contacting mds from deepest inode " << cur->inode.ino << " " << req->get_filepath() << ": " << cur->mds_contacts << endl;
- set<int>::iterator it = cur->mds_contacts.begin();
- if (cur->mds_contacts.size() == 1)
+ dout(9) << "contacting replica from deepest inode " << cur->inode.ino << " " << req->get_filepath() << ": " << cur->get_replicas() << endl;
+ set<int>::iterator it = cur->get_replicas().begin();
+ if (cur->get_replicas().size() == 1)
mds = *it;
else {
- int r = rand() % cur->mds_contacts.size();
+ int r = rand() % cur->get_replicas().size();
while (r--) it++;
mds = *it;
}
} else {
// try auth
mds = cur->authority();
- dout(9) << "contacting auth mds " << mds << endl;
+ //if (!auth_best && req->get_filepath().get_path()[0] == '/')
+ dout(9) << "contacting auth mds " << mds << " auth_best " << auth_best << " for " << req->get_filepath() << endl;
}
} else {
dout(9) << "i have no idea where " << req->get_filepath() << " is" << endl;
};
+
class Inode {
public:
inode_t inode; // the actual inode
if (symlink) { delete symlink; symlink = 0; }
}
+ // True when the type bits of inode.mode (selected by INODE_TYPE_MASK)
+ // equal INODE_MODE_DIR.
+ bool is_dir() {
+ const bool dir_type = (inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR;
+ return dir_type;
+ }
+
int authority() {
// my info valid?
if (mds_dir_auth >= 0)
return 0; // who knows!
}
+ // Set of mds numbers that can be tried as replicas for this inode.
+ // An inode's own mds_contacts is returned when it is non-empty or when the
+ // inode is a directory.  A non-directory with no contacts of its own borrows
+ // the answer from the inode of its containing directory
+ // (dn->dir->parent_inode), recursively.  When there is no such parent the
+ // inode's own (empty) set is returned.
+ set<int>& get_replicas() {
+ const bool inherit_from_parent =
+ mds_contacts.empty() &&
+ !is_dir() &&
+ dn && dn->dir && dn->dir->parent_inode;
+ if (inherit_from_parent)
+ return dn->dir->parent_inode->get_replicas();
+ return mds_contacts;
+ }
+
// open Dir for an inode. if it's not open, allocated it (and pin dentry in memory).
Dir *open_dir() {
+ // for giving to clients
+ // Overwrites ls with open_by when popularity[MDS_POP_CURDOM], sampled at
+ // `now`, exceeds g_conf.mds_bal_replicate_threshold; otherwise ls is left
+ // exactly as the caller passed it.  `auth` is not consulted.
+ void get_dist_spec(set<int>& ls, int auth, timepair_t& now) {
+ const bool over_threshold =
+ popularity[MDS_POP_CURDOM].get(now) > g_conf.mds_bal_replicate_threshold;
+ if (over_threshold)
+ ls = open_by;
+ }
+
// -- state --
unsigned get_state() { return state; }
}
+ /*
// for giving to clients
void get_dist_spec(set<int>& ls, int auth, timepair_t& now) {
- if (popularity[MDS_POP_JUSTME].get(now) > g_conf.mds_bal_replicate_threshold) {
+ if (( is_dir() && popularity[MDS_POP_CURDOM].get(now) > g_conf.mds_bal_replicate_threshold) ||
+ (!is_dir() && popularity[MDS_POP_JUSTME].get(now) > g_conf.mds_bal_replicate_threshold)) {
+ //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl;
ls = cached_by;
}
}
-
+ */
// dbg
void dump(int d = 0);
dir->popularity[MDS_POP_JUSTME].hit(now);
hit_recursive(dir, now);
+
}
bool anydom = dir->is_auth();
bool curdom = dir->is_auth();
+
+ // replicate?
+ float dir_pop = dir->popularity[MDS_POP_CURDOM].get(now); // hmm??
+
+ if (dir->is_auth()) {
+ if (!dir->is_rep() &&
+ dir_pop >= g_conf.mds_bal_replicate_threshold) {
+ // replicate
+ dout(5) << "replicating dir " << *dir << " pop " << dir_pop << endl;
+
+ dir->dir_rep = CDIR_REP_ALL;
+ mds->mdcache->send_dir_updates(dir, true);
+ }
+
+ if (dir->is_rep() &&
+ dir_pop < g_conf.mds_bal_unreplicate_threshold) {
+ // unreplicate
+ dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << endl;
+
+ dir->dir_rep = CDIR_REP_NONE;
+ mds->mdcache->send_dir_updates(dir);
+ }
+ }
+
+
while (dir) {
CInode *in = dir->inode;
} else {
// not mine.
- if (onfail == MDS_TRAVERSE_DISCOVER ||
- onfail == MDS_TRAVERSE_DISCOVERXLOCK) {
+ if (onfail == MDS_TRAVERSE_DISCOVER &&
+ cur->dir->is_rep()) {
+ dout(5) << "trying to discover in popular dir " << *cur->dir << endl;
+ onfail = MDS_TRAVERSE_DISCOVER;
+ }
+
+ if ((onfail == MDS_TRAVERSE_DISCOVER ||
+ onfail == MDS_TRAVERSE_DISCOVERXLOCK)) {
// discover
filepath want = path.postfixpath(depth);
*/
-int MDCache::send_dir_updates(CDir *dir, int except)
+int MDCache::send_dir_updates(CDir *dir, bool bcast)
{
// this is an FYI, re: replication
+ set<int> who = dir->open_by;
+ if (bcast)
+ who = mds->get_cluster()->get_mds_set();
+
+ dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << endl;
+
+ string path;
+ dir->inode->make_path(path);
+
int whoami = mds->get_nodeid();
- for (set<int>::iterator it = dir->open_by_begin();
- it != dir->open_by_end();
+ for (set<int>::iterator it = who.begin();
+ it != who.end();
it++) {
if (*it == whoami) continue;
- if (*it == except) continue;
+ //if (*it == except) continue;
dout(7) << "sending dir_update on " << *dir << " to " << *it << endl;
+
mds->messenger->send_message(new MDirUpdate(dir->ino(),
dir->dir_rep,
- dir->dir_rep_by),
+ dir->dir_rep_by,
+ path,
+ bcast),
MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
MDS_PORT_CACHE);
}
{
CInode *in = get_inode(m->get_ino());
if (!in || !in->dir) {
- dout(7) << "dir_update on " << m->get_ino() << ", don't have it" << endl;
+ dout(5) << "dir_update on " << m->get_ino() << ", don't have it" << endl;
+
+ // discover it?
+ if (m->should_discover()) {
+ m->tried_discover(); // only once!
+ vector<CDentry*> trace;
+ filepath path = m->get_path();
+
+ dout(5) << "trying discover on dir_update for " << path << endl;
+
+ int r = path_traverse(path, trace, true,
+ m, new C_MDS_RetryMessage(mds, m),
+ MDS_TRAVERSE_DISCOVER);
+ if (r > 0)
+ return;
+ if (r == 0) {
+ assert(in);
+ open_remote_dir(in, new C_MDS_RetryMessage(mds, m));
+ return;
+ }
+ assert(0);
+ }
+
goto out;
}
// update
- dout(7) << "dir_update on " << m->get_ino() << endl;
+ dout(5) << "dir_update on " << m->get_ino() << endl;
in->dir->dir_rep = m->get_dir_rep();
in->dir->dir_rep_by = m->get_dir_rep_by();
//int send_inode_updates(CInode *in);
//void handle_inode_update(MInodeUpdate *m);
- int send_dir_updates(CDir *in, int except=-1);
+ int send_dir_updates(CDir *in, bool bcast=false);
void handle_dir_update(MDirUpdate *m);
void handle_cache_expire(MCacheExpire *m);
int osd_log_begin;
int osd_log_end;
+ set<int> mds_set;
+
void map_osds();
public:
int get_log_osd(int mds);
object_t get_log_oid(int mds);
-
- set<int> mds_set;
-
set<int>& get_mds_set() {
if (mds_set.empty())
for (int i=0; i<num_mds; i++)
bool inode_soft_valid; // true if inode info is valid (ie was readable on mds at the time)
bool inode_hard_valid; // true if inode info is valid (ie was readable on mds at the time)
+ bool spec_defined;
int dir_auth;
set<int> dist; // where am i replicated?
this->ref_dn = ref_dn;
// replicated where?
- if (in->is_dir() && in->dir)
+ spec_defined = in->dir && in->dir->is_auth();
+ if (spec_defined) {
dir_auth = in->dir->get_dir_auth();
- else
- dir_auth = -1;
- in->get_dist_spec(this->dist, whoami, now);
+ in->dir->get_dist_spec(this->dist, whoami, now);
+ }
}
void _encode(bufferlist &bl) {
bl.append((char*)&inode, sizeof(inode));
bl.append((char*)&inode_soft_valid, sizeof(inode_soft_valid));
bl.append((char*)&inode_hard_valid, sizeof(inode_hard_valid));
+ bl.append((char*)&spec_defined, sizeof(spec_defined));
bl.append((char*)&dir_auth, sizeof(dir_auth));
::_encode(ref_dn, bl);
off += sizeof(inode_soft_valid);
bl.copy(off, sizeof(inode_hard_valid), (char*)&inode_hard_valid);
off += sizeof(inode_hard_valid);
+ bl.copy(off, sizeof(spec_defined), (char*)&spec_defined);
+ off += sizeof(spec_defined);
bl.copy(off, sizeof(dir_auth), (char*)&dir_auth);
off += sizeof(dir_auth);
typedef struct {
inodeno_t ino;
int dir_rep;
+ int discover; // remaining discover attempts: should_discover() is true while > 0, tried_discover() decrements it
} MDirUpdate_st;
class MDirUpdate : public Message {
MDirUpdate_st st;
set<int> dir_rep_by;
+ string path;
public:
inodeno_t get_ino() { return st.ino; }
int get_dir_rep() { return st.dir_rep; }
set<int>& get_dir_rep_by() { return dir_rep_by; }
+ bool should_discover() { return st.discover > 0; }
+ string& get_path() { return path; }
+
+ // Use up one discover attempt; a counter that is already zero stays zero.
+ void tried_discover() {
+ if (st.discover != 0)
+ st.discover -= 1;
+ }
MDirUpdate() {}
MDirUpdate(inodeno_t ino,
int dir_rep,
- set<int>& dir_rep_by) :
+ set<int>& dir_rep_by,
+ string& path,
+ bool discover = false) :
Message(MSG_MDS_DIRUPDATE) {
this->st.ino = ino;
this->st.dir_rep = dir_rep;
this->dir_rep_by = dir_rep_by;
+ if (discover) this->st.discover = 3;
+ this->path = path;
}
virtual char *get_type_name() { return "dup"; }
s.copy(off, sizeof(st), (char*)&st);
off += sizeof(st);
_unrope(dir_rep_by, s, off);
+ _unrope(path, s, off);
}
virtual void encode_payload(crope& r) {
r.append((char*)&st, sizeof(st));
_rope(dir_rep_by, r);
+ _rope(path, r);
}
};