client/SyntheticClient.o\
client/Trace.o
-TARGETS = cmon cosd cmds cfuse csyn newsyn fakesyn
+TARGETS = cmon cosd cmds cfuse csyn cmonctl newsyn fakesyn
SRCS=*.cc */*.cc *.h */*.h */*/*.h
cmon: cmon.cc mon.o ebofs.o msg/SimpleMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
+cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o
+ ${CC} ${CFLAGS} ${LIBS} $^ -o $@
+
cosd: cosd.cc osd.o ebofs.o msg/SimpleMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
+FIRST
-- bystander dir_auth(_pending) recovery from exporter failure
-- how to reliably deliver cache expire messages?
- - how should proxy behave?
- - exporter failure
- - all cacheexpire info has been passed on up until point where export is permanent. no impact.
- - importer failure
- - exporter collects expire info, so that it can reverse.
- - ???
- - maybe hosts should double-up expires until after export is known to have committed?
---> just send expires to both nodes. dir_auth+dir_auth2. clean up export ack/notify process. :)
-
-*** dar... no, separate bystander dir_auth updates from the prepare/ack/commit cycle!
-- expire should go to both old and new auth
-- set_dir_auth should take optional second auth, and authority() should optionally set/return a second possible auth
-- does inode need it's own replica list? no!
+- openingdir pins should be handled by open_remote_dir, not explicitly by handle_export_dir_prep
+
+
+- bystander dir_auth(_pending) recovery from exporter failure
+
+- does inode need its own replica list? no?
- dirslices.
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mon/MonMap.h"
+#include "msg/SimpleMessenger.h"
+#include "messages/MMonCommand.h"
+#include "messages/MMonCommandAck.h"
+
+#include "common/Timer.h"
+
+#ifndef DARWIN
+#include <envz.h>
+#endif // DARWIN
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+
+Messenger *messenger = 0;
+
+// Dispatcher for the admin client: logs the monitor's reply to our command,
+// then asks the messenger to shut down (main() is waiting in rank.wait()).
+class Admin : public Dispatcher {
+  void dispatch(Message *m) {
+    switch (m->get_type()) {
+    case MSG_MON_COMMAND_ACK:
+      {
+        MMonCommandAck *ack = (MMonCommandAck*)m;
+        dout(0) << m->get_source() << " -> '"
+                << ack->rs << "' (" << ack->r << ")"
+                << endl;
+      }
+      messenger->shutdown();
+      break;
+    }
+
+    // free the message once handled, as the other message handlers in this
+    // tree do; previously every dispatched message was leaked.
+    delete m;
+  }
+} dispatcher;
+
+/**
+ * cmonctl entry point: join the remaining command-line arguments into one
+ * MMonCommand, send it to a monitor picked from the monmap, and wait for
+ * the messenger to finish.
+ *
+ * returns 0 after the messenger has finished, 1 if the monmap cannot be read.
+ */
+int main(int argc, char **argv, char *envp[]) {
+
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+  parse_config_options(args);
+
+  // rebuild argc/argv from the args vector
+  vec_to_argv(args, argc, argv);
+
+  // load monmap
+  MonMap monmap;
+  int r = monmap.read(".ceph_monmap");
+  if (r < 0) {
+    // an unreadable map is an operational error, not a programming bug:
+    // report it rather than rely on assert(), which disappears under NDEBUG.
+    cerr << "cmonctl: unable to read .ceph_monmap (error " << r << ")" << endl;
+    return 1;
+  }
+
+  // build command: one vector entry per argument, plus a space-joined
+  // copy that is only used for the log line below.
+  MMonCommand *m = new MMonCommand;
+  string cmd;
+  for (unsigned i=0; i<args.size(); i++) {
+    if (i) cmd += " ";
+    cmd += args[i];
+    m->cmd.push_back(string(args[i]));
+  }
+  int mon = monmap.pick_mon();
+
+  dout(0) << "mon" << mon << " <- '" << cmd << "'" << endl;
+
+  // start up network
+  rank.start_rank();
+  messenger = rank.register_entity(entity_name_t(entity_name_t::TYPE_ADMIN));
+  messenger->set_dispatcher(&dispatcher);
+
+  // send it
+  messenger->send_message(m, monmap.get_inst(mon));
+
+  // wait for messenger to finish (the dispatcher shuts it down on the ack)
+  rank.wait();
+
+  return 0;
+}
+
mds_local_osd: false,
+ mds_thrash_exports: 0,
// --- osd ---
osd_rep: OSD_REP_PRIMARY,
else if (strcmp(args[i], "--mds_local_osd") == 0)
g_conf.mds_local_osd = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_thrash_exports") == 0)
+ g_conf.mds_thrash_exports = atoi(args[++i]);
else if (strcmp(args[i], "--client_use_random_mds") == 0)
g_conf.client_use_random_mds = true;
bool mds_local_osd;
+ int mds_thrash_exports;
// osd
int osd_rep;
+dir is a subtree root iff dir_auth.first != parent.
+if dir_auth.first == parent then inode auth == dir auth, but the converse may not be true.
+that is, you may have inode auth == dir auth, but dir_auth.first == whoami. e.g.,
+ /usr is import.
+ /usr/bin is export.
+ /usr frozen for export, with bound /usr/bin.
+ /usr/bin imports completely. /usr/bin.dir_auth = whoami, not parent, because inode is not authpinnable.
+this is safe because the /usr bound is known, and an abort can adjust the bound's dir_auth.
+
+
+- if i am auth, any subtree bound will be a subtree root, and an export, frozen, or both.
+- if i am auth and unfrozen/freezing, any subtree bound will be an export. and subtree root.
+- if i am auth and frozen, any subtree bound will be an export, or subtree root noted in export_bounds/import_bounds.
+
+- if a dir is a subtree root, it is
+ - auth, import
+ - auth, export
+ - nonauth, frozen, importing
+ - auth, frozen, imported
+ - auth, parent is auth+frozen for import|export, i am known bound.
+ - auth, parent is auth+frozen for import
+ - frozen and exporting
+
+
+- a frozen tree root dir will auth_pin its inode IFF it is auth AND not a subtree root.
+
+
+--------------------------
+
+dir is a subtree root iff dir_auth.first != parent.
+
+if subtree root and not root, will appear in subtree_bounds[parent subtree root].
+
+
#include "include/Context.h"
#include "include/buffer.h"
-template<typename U,typename V>
-inline ostream& operator<<(ostream& out, const pair<U,V>& p) {
- return out << p.first << "," << p.second;
-}
-
#include "types.h"
#include "Onode.h"
#include "Cnode.h"
}
assert(s.size() == (unsigned)n);
}
+// vector<string>: written as an int element count followed by each string in order
+inline void _encode(std::vector<std::string>& s, bufferlist& bl)
+{
+ int n = s.size();
+ bl.append((char*)&n, sizeof(n));  // count prefix: the raw bytes of the int
+ for (std::vector<std::string>::const_iterator it = s.begin();
+ it != s.end();
+ it++) {
+ ::_encode(*it, bl);  // per-string encoder, declared elsewhere
+ n--;  // count down so the assert below can cross-check the loop
+ }
+ assert(n==0);  // every counted element was written
+}
+// Decode a vector<string> laid out as an int element count followed by that
+// many encoded strings.  Reads starting at 'off' and advances 'off' past
+// everything consumed; any previous contents of 's' are discarded.
+inline void _decode(std::vector<std::string>& s, bufferlist& bl, int& off)
+{
+  int n;
+  bl.copy(off, sizeof(n), (char*)&n);
+  off += sizeof(n);
+
+  // the count comes straight out of the buffer; a negative value would be
+  // converted to an enormous size below, so reject it up front.
+  assert(n >= 0);
+
+  s.clear();
+  s.resize(n);   // n empty strings, filled in place below
+  for (int i=0; i<n; i++) {
+    ::_decode(s[i], bl, off);
+  }
+  assert(s.size() == (unsigned)n);
+}
// list<bufferlist>
inline void _encode(const std::list<bufferlist>& s, bufferlist& bl)
// -- io helpers --
+// print a pair as "first,second".  takes the pair by const reference so
+// that printing does not copy both members.
+template<class A, class B>
+inline ostream& operator<<(ostream& out, const pair<A,B>& v) {
+  return out << v.first << "," << v.second;
+}
+
template<class A>
inline ostream& operator<<(ostream& out, vector<A>& v) {
out << "[";
}
-int CDentry::authority(int *a2)
+pair<int,int> CDentry::authority()  // authority pair for this dentry, as answered by its containing dir
{
- return dir->dentry_authority(name, a2);
+ return dir->dentry_authority(name);  // delegate to the dir, keyed by this dentry's name
}
version_t get_projected_version() { return projected_version; }
void set_projected_version(version_t v) { projected_version = v; }
- int authority(int *a2=0);
+ pair<int,int> authority();
bool is_auth() { return state & STATE_AUTH; }
bool is_dirty() { return state & STATE_DIRTY; }
out << "." << dir.get_replica_nonce();
}
- if (dir.get_dir_auth() != CDIR_AUTH_PARENT)
+ if (dir.get_dir_auth() != CDIR_AUTH_DEFAULT)
out << " dir_auth=" << dir.get_dir_auth();
- if (dir.get_dir_auth_pending() != CDIR_AUTH_UNKNOWN)
- out << " dir_auth_pending=" << dir.get_dir_auth_pending();
out << " state=" << dir.get_state();
if (dir.state_test(CDIR_STATE_PROXY)) out << "|proxy";
if (dir.state_test(CDIR_STATE_COMPLETE)) out << "|complete";
if (dir.state_test(CDIR_STATE_FREEZINGTREE)) out << "|freezingtree";
if (dir.state_test(CDIR_STATE_FROZENTREE)) out << "|frozentree";
- if (dir.state_test(CDIR_STATE_FROZENTREELEAF)) out << "|frozentreeleaf";
+ //if (dir.state_test(CDIR_STATE_FROZENTREELEAF)) out << "|frozentreeleaf";
if (dir.state_test(CDIR_STATE_FROZENDIR)) out << "|frozendir";
if (dir.state_test(CDIR_STATE_FREEZINGDIR)) out << "|freezingdir";
ref = 0;
// dir_auth
- dir_auth = CDIR_AUTH_PARENT;
- dir_auth_pending = CDIR_AUTH_UNKNOWN;
+ dir_auth = CDIR_AUTH_DEFAULT;
// auth
assert(in->is_dir());
assert(dn->is_primary());
// explicitly define auth
- in->dangling_auth = in->authority(&in->dangling_auth2);
+ in->dangling_auth = in->authority();
//dout(10) << "unlink_inode " << *in << " dangling_auth now " << in->dangling_auth << endl;
// unpin dentry?
*/
/*
- * simple rule: if dir_auth isn't explicit, auth is the same as the inode.
+ * if dir_auth.first == parent, auth is same as inode.
+ * unless .second != unknown, in which case that sticks.
*/
-int CDir::authority(int *a2)
+pair<int,int> CDir::authority()  // returns the authority pair for this dir
{
- // does dir_auth_pending terminate here?
- if (a2 && dir_auth_pending != CDIR_AUTH_UNKNOWN) {
- *a2 = dir_auth_pending;
- a2 = 0;
- }
-
- // pass to parent?
- if (dir_auth == CDIR_AUTH_PARENT)
- return inode->authority(a2);
+ pair<int,int> a = dir_auth;  // start from our own explicit setting
+
+ // look at parent? (first == CDIR_AUTH_PARENT means we inherit from our inode)
+ if (dir_auth.first == CDIR_AUTH_PARENT)
+ a = inode->authority();
+
+ if (dir_auth.second == CDIR_AUTH_UNKNOWN)  // no explicit second auth here: use what we have
+ return a;
+ else
+ return pair<int,int>(a.first, dir_auth.second);  // our own second value overrides any inherited one
+}
- // at current node.
- if (a2) *a2 = dir_auth_pending;
- return dir_auth;
+/** is_subtree_root()
+ * true if this is an auth delegation point.
+ * that is, dir_auth != default (parent,unknown)
+ *
+ * some key observations:
+ * if i am auth:
+ * - any region bound will be an export, or frozen.
+ *
+ * note that this DOES heed dir_auth.second (the value set by
+ * set_dir_auth_pending): the whole pair is compared, so a dir with
+ * <parent,mds2> counts as a subtree root too.
+ */
+bool CDir::is_subtree_root()
+{
+ if (dir_auth == CDIR_AUTH_DEFAULT)  // compares both halves of the pair
+ return false;
+ else
+ return true;
}
-int CDir::dentry_authority(const string& dn, int *a2 )
+
+
+pair<int,int> CDir::dentry_authority(const string& dn)
{
+ // forget hashing for now.
+ return authority();
+
+ /*
// hashing -- subset of nodes have hashed the contents
if (is_hashing() && !hashed_subset.empty()) {
int hashauth = cache->hash_dentry( inode->ino(), dn ); // hashed
// it's explicit for this whole dir
//dout(15) << "dir_auth explicit " << dir_auth << " at " << *this << endl;
return get_dir_auth(a2);
+ */
}
-void CDir::set_dir_auth(int d, int d2)
+
+/** set_dir_auth
+ *
+ * replace dir_auth with 'a' and, if that changes whether this dir is a
+ * subtree root, adjust the nested auth_pin accounting on our inode.
+ *
+ * accept 'iamauth' param so that i can intelligently adjust freeze auth_pins
+ * even when the auth bit isn't correct.
+ * as when calling MDCache::import_subtree(...).
+ */
+void CDir::set_dir_auth(pair<int,int> a, bool iamauth)
{
- dout(10) << "setting dir_auth=" << d << "," << d2
- << " from " << dir_auth << "," << dir_auth_pending
- << " on " << *this << endl;
- dir_auth = d;
- dir_auth_pending = d2;
-}
-void CDir::set_dir_auth_pending(int d2)
-{
- dout(10) << "setting dir_auth_pending=" << d2
- << " from " << dir_auth << "," << dir_auth_pending
+ dout(10) << "setting dir_auth=" << a
+ << " from " << dir_auth
<< " on " << *this << endl;
- dir_auth_pending = d2;
+
+ bool was_subtree = is_subtree_root();  // remember the state before the change
+
+ // set it.
+ dir_auth = a;
+
+ // new subtree root?
+ if (!was_subtree && is_subtree_root()) {
+ dout(10) << "new subtree root, adjusting auth_pins" << endl;
+
+ // adjust nested auth pins
+ inode->adjust_nested_auth_pins(get_cum_auth_pins());  // NOTE(review): same sign as the "old subtree root" branch below; one of the two presumably should subtract -- confirm
+
+ // pin parent of frozen dir/tree?
+ if (iamauth && (is_frozen_tree_root() || is_frozen_dir()))
+ inode->auth_pin();
+ }
+ if (was_subtree && !is_subtree_root()) {
+ dout(10) << "old subtree root, adjusting auth_pins" << endl;
+
+ // adjust nested auth pins
+ inode->adjust_nested_auth_pins(get_cum_auth_pins());  // NOTE(review): no matching auth_unpin for the auth_pin taken in the branch above -- confirm intended
+ }
}
/*****************************************
- * AUTH PINS
+ * AUTH PINS and FREEZING
+ *
+ * the basic plan is that auth_pins only exist in auth regions, and they
+ * prevent a freeze (and subsequent auth change).
+ *
+ * however, we also need to prevent a parent from freezing if a child is frozen.
+ * for that reason, the parent inode of a frozen directory is auth_pinned.
+ *
+ * the oddity is when the frozen directory is a subtree root. if that's the case,
+ * the parent inode isn't frozen. which means that when subtree authority is adjusted
+ * at the bounds, inodes for any frozen bound directories need to get auth_pins at that
+ * time.
+ *
*/
void CDir::auth_pin()
dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
// nest pins?
- if (!is_subtree_root()) {
- assert(!is_import());
- inode->nested_auth_pins++;
- if (inode->parent)
- inode->parent->dir->adjust_nested_auth_pins( 1 );
- }
+ if (is_subtree_root()) return; // no.
+ assert(!is_import());
+
+ inode->nested_auth_pins++;
+ if (inode->parent)
+ inode->parent->dir->adjust_nested_auth_pins( 1 );
}
void CDir::auth_unpin()
on_freezeable();
// nest?
- if (!is_subtree_root()) {
- inode->nested_auth_pins--;
- if (inode->parent)
- inode->parent->dir->adjust_nested_auth_pins( -1 );
- }
+ if (is_subtree_root()) return; // no.
+ assert(!is_import());
+
+ inode->nested_auth_pins--;
+ if (inode->parent)
+ inode->parent->dir->adjust_nested_auth_pins( -1 );
}
void CDir::adjust_nested_auth_pins(int inc)  // add inc to this dir's nested pin count, then pass it on to our inode unless we are a subtree root
{
CDir *dir = this;  // left over from the old loop; dir never changes now
- while (1) {
- // dir
- dir->nested_auth_pins += inc;
-
- dout(10) << "adjust_nested_auth_pins on " << *dir << " count now " << dir->auth_pins << " + " << dir->nested_auth_pins << endl;
- assert(dir->nested_auth_pins >= 0);
-
- // pending freeze?
- if (dir->auth_pins + dir->nested_auth_pins == 0)
- dir->on_freezeable();
-
- // it's inode
- dir->inode->nested_auth_pins += inc;
-
- if (dir->inode->parent)
- dir = dir->inode->parent->dir;
- else
- break;
- }
+ // dir
+ dir->nested_auth_pins += inc;
+
+ dout(10) << "adjust_nested_auth_pins on " << *dir << " count now " << dir->auth_pins << " + " << dir->nested_auth_pins << endl;
+ assert(dir->nested_auth_pins >= 0);  // the count must never go negative
+
+ // pending freeze?
+ if (is_freezeable())
+ dir->on_freezeable();
+ // on freezeable_dir too? FIXME
+
+ // adjust my inode?
+ if (dir->is_subtree_root())
+ return; // no, stop.
+
+ // yes. (the walk upward now continues via CInode::adjust_nested_auth_pins)
+ dir->inode->adjust_nested_auth_pins(inc);
}
{
// check for anything pending freezeable
+ /* NOTE: this will be called on deeper dirs first, walking up toward
+ the root, meaning that deeper freeze attempts will succeed first.
+ */
/* NOTE: the first of these will likely freeze the dir, and unmark
FREEZING. additional ones will re-flag FREEZING. this isn't
particularly graceful, and might cause problems if the first one
if (is_freezeable()) {
dout(10) << "freeze_tree " << *this << endl;
-
- state_set(CDIR_STATE_FROZENTREE);
- inode->auth_pin(); // auth_pin for duration of freeze
-
- // easy, we're frozen
- c->finish(0);
- delete c;
-
+ _freeze_tree(c);
} else {
state_set(CDIR_STATE_FREEZINGTREE);
dout(10) << "freeze_tree + wait " << *this << endl;
}
dout(10) << "freeze_tree_finish " << *this << endl;
- state_set(CDIR_STATE_FROZENTREE);
+ _freeze_tree(c);
+}
+
+void CDir::_freeze_tree(Context *c)
+{
+ dout(10) << "_freeze_tree " << *this << endl;
+
+ // there shouldn't be any conflicting auth_pins.
+ assert(is_freezeable_dir());
+
+ // twiddle state
state_clear(CDIR_STATE_FREEZINGTREE); // actually, this may get set again by next context?
+ state_set(CDIR_STATE_FROZENTREE);
- inode->auth_pin(); // auth_pin for duration of freeze
+ // auth_pin inode for duration of freeze, if we are not a subtree root.
+ if (is_auth() && !is_subtree_root())
+ inode->auth_pin();
// continue to frozen land
if (c) {
state_clear(CDIR_STATE_FROZENTREE);
// unpin (may => FREEZEABLE) FIXME: is this order good?
- inode->auth_unpin();
+ if (is_auth() && !is_subtree_root())
+ inode->auth_unpin();
// waiters?
finish_waiting(CDIR_WAIT_UNFREEZE);
CDir *dir = this;
while (1) {
if (dir->is_frozen_tree_root()) return true;
- if (dir->is_import()) return false;
- if (dir->is_hashed()) return false;
- if (dir->is_frozen_tree_leaf()) return false;
+ if (dir->is_subtree_root()) return false;
if (dir->inode->parent)
dir = dir->inode->parent->dir;
else
if (is_freezeable_dir()) {
dout(10) << "freeze_dir " << *this << endl;
-
- state_set(CDIR_STATE_FROZENDIR);
- inode->auth_pin(); // auth_pin for duration of freeze
-
- // easy, we're frozen
- c->finish(0);
- delete c;
-
+ _freeze_dir(c);
} else {
state_set(CDIR_STATE_FREEZINGDIR);
dout(10) << "freeze_dir + wait " << *this << endl;
}
}
+/** _freeze_dir
+ * actually mark this dir frozen.  called by freeze_dir() and
+ * freeze_dir_finish() once is_freezeable_dir() holds.
+ * c, if non-null, is finished with 0 and deleted.
+ */
+void CDir::_freeze_dir(Context *c)
+{
+  dout(10) << "_freeze_dir " << *this << endl;
+
+  // twiddle state.  FREEZINGDIR is set while a freeze is waiting; drop it
+  // now that we are frozen, as _freeze_tree() does for FREEZINGTREE.
+  // (it may get set again by the next waiting context.)
+  state_clear(CDIR_STATE_FREEZINGDIR);
+  state_set(CDIR_STATE_FROZENDIR);
+
+  // auth_pin inode for duration of freeze, if we are auth and not a subtree root.
+  if (is_auth() && !is_subtree_root())
+    inode->auth_pin();
+
+  // continue to frozen land
+  if (c) {
+    c->finish(0);
+    delete c;
+  }
+}
+
void CDir::freeze_dir_finish(Context *c)  // retry a pending dir freeze: freeze now if possible, otherwise queue another wait
{
// freezeable now?
- if (!is_freezeable_dir()) {
+ if (is_freezeable_dir()) {
+ // freeze now
+ _freeze_dir(c);
+ } else {
// wait again!
dout(10) << "freeze_dir_finish still waiting " << *this << endl;
state_set(CDIR_STATE_FREEZINGDIR);
add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c));  // c is carried along to the next attempt
- return;
- }
-
- dout(10) << "freeze_dir_finish " << *this << endl;
- state_set(CDIR_STATE_FROZENDIR);
- state_clear(CDIR_STATE_FREEZINGDIR); // actually, this may get set again by next context?
-
- inode->auth_pin(); // auth_pin for duration of freeze
-
- // continue to frozen land
- if (c) {
- c->finish(0);
- delete c;
}
}
state_clear(CDIR_STATE_FROZENDIR);
// unpin (may => FREEZEABLE) FIXME: is this order good?
- inode->auth_unpin();
+ if (is_auth() && !is_subtree_root())
+ inode->auth_unpin();
// waiters?
finish_waiting(CDIR_WAIT_UNFREEZE);
// >= 0 is the auth mds
#define CDIR_AUTH_PARENT -1 // default
#define CDIR_AUTH_UNKNOWN -2
-
+#define CDIR_AUTH_DEFAULT pair<int,int>(CDIR_AUTH_PARENT,CDIR_AUTH_UNKNOWN)
+#define CDIR_AUTH_UNDEF pair<int,int>(CDIR_AUTH_UNKNOWN,CDIR_AUTH_UNKNOWN)
#define CDIR_NONCE_EXPORT 1
#define CDIR_STATE_FROZENTREE (1<<4) // root of tree (bounded by exports)
#define CDIR_STATE_FREEZINGTREE (1<<5) // in process of freezing
-#define CDIR_STATE_FROZENTREELEAF (1<<6) // outer bound of frozen region (on import)
+//#define CDIR_STATE_FROZENTREELEAF (1<<6) // outer bound of frozen region (on import)
#define CDIR_STATE_FROZENDIR (1<<7)
#define CDIR_STATE_FREEZINGDIR (1<<8)
#define CDIR_STATE_DELETED (1<<11)
-#define CDIR_STATE_IMPORT (1<<12) // flag set if this is an import.
-#define CDIR_STATE_EXPORT (1<<13)
-#define CDIR_STATE_IMPORTINGEXPORT (1<<14)
+#define CDIR_STATE_IMPORT (1<<12) // flag set if this is an import.
+#define CDIR_STATE_EXPORT (1<<13)
+#define CDIR_STATE_IMPORTBOUND (1<<14)
+#define CDIR_STATE_EXPORTBOUND (1<<15)
-#define CDIR_STATE_HASHED (1<<15) // if hashed
-#define CDIR_STATE_HASHING (1<<16)
-#define CDIR_STATE_UNHASHING (1<<17)
+#define CDIR_STATE_HASHED (1<<16) // if hashed
+#define CDIR_STATE_HASHING (1<<17)
+#define CDIR_STATE_UNHASHING (1<<18)
|CDIR_STATE_DIRTY)
#define CDIR_MASK_STATE_IMPORT_KEPT (CDIR_STATE_IMPORT\
|CDIR_STATE_EXPORT\
- |CDIR_STATE_IMPORTINGEXPORT\
+ |CDIR_STATE_IMPORTBOUND\
|CDIR_STATE_FROZENTREE\
|CDIR_STATE_PROXY)
static const int PIN_IMPORT = 3;
static const int PIN_EXPORT = 4;
//static const int PIN_FREEZE = 5;
- static const int PIN_FREEZELEAF = 6;
+ // static const int PIN_FREEZELEAF = 6;
static const int PIN_PROXY = 7; // auth just changed.
static const int PIN_AUTHPIN = 8;
static const int PIN_IMPORTING = 9;
- static const int PIN_IMPORTINGEXPORT = 10;
- static const int PIN_HASHED = 11;
- static const int PIN_HASHING = 12;
- static const int PIN_DIRTY = 13;
- static const int PIN_REQUEST = 14;
+ static const int PIN_EXPORTING = 10;
+ static const int PIN_IMPORTBOUND = 11;
+ static const int PIN_EXPORTBOUND = 12;
+ static const int PIN_HASHED = 13;
+ static const int PIN_HASHING = 14;
+ static const int PIN_DIRTY = 15;
+ static const int PIN_REQUEST = 16;
+ static const int PIN_LOGGINGEXPORTFINISH = 17;
static const char *pin_name(int p) {  // map a PIN_* constant to a short name
switch (p) {
case PIN_CHILD: return "child";
case PIN_WAITER: return "waiter";
case PIN_IMPORT: return "import";
case PIN_EXPORT: return "export";
+ case PIN_EXPORTING: return "exporting";
+ case PIN_IMPORTING: return "importing";
+ case PIN_IMPORTBOUND: return "importbound";
+ case PIN_EXPORTBOUND: return "exportbound";
//case PIN_FREEZE: return "freeze";
- case PIN_FREEZELEAF: return "freezeleaf";
+ // case PIN_FREEZELEAF: return "freezeleaf";
case PIN_PROXY: return "proxy";
case PIN_AUTHPIN: return "authpin";
- case PIN_IMPORTING: return "importing";
- case PIN_IMPORTINGEXPORT: return "importingexport";
case PIN_HASHED: return "hashed";
case PIN_HASHING: return "hashing";
case PIN_DIRTY: return "dirty";
case PIN_REQUEST: return "request";
+ case PIN_LOGGINGEXPORTFINISH: return "loggingexportfinish";
default: assert(0);  // NOTE(review): with asserts compiled out this falls off the end without returning a value
}
}
version_t last_committed_version; // slight lie; we bump this on import.
version_t projected_version;
- // authority, replicas
- int dir_auth, dir_auth_pending;
-
// lock nesting, freeze
int auth_pins;
int nested_auth_pins;
void remove_null_dentries(); // on empty, clean dir
// -- authority --
+ /*
+ * normal: <parent,unknown> !subtree_root
+ * delegation: <mds,unknown> subtree_root
+ * ambiguous: <mds1,mds2> subtree_root
+ * <parent,mds2> subtree_root
+ */
+ pair<int,int> dir_auth;
+
public:
- int authority(int *a2=0);
- int dentry_authority(const string& d, int *a2=0);
- int get_dir_auth(int *a2=0) {
- if (a2)
- *a2 = dir_auth_pending;
- return dir_auth;
- }
- int get_dir_auth_pending() {
- return dir_auth_pending;
+ pair<int,int> authority();
+ pair<int,int> dentry_authority(const string& d);
+ pair<int,int> get_dir_auth() { return dir_auth; }
+ //int get_dir_auth_pending() { return dir_auth->second; }
+ void set_dir_auth(pair<int,int> a, bool iamauth=false);
+ void set_dir_auth(int a, bool iamauth=false) {
+ set_dir_auth(pair<int,int>(a, dir_auth.second), iamauth);
}
- void set_dir_auth(int d, int d2=CDIR_AUTH_UNKNOWN);
- void set_dir_auth_pending(int d2);
-
- bool is_subtree_root() {
- if (dir_auth != CDIR_AUTH_PARENT ||
- dir_auth_pending != CDIR_AUTH_PARENT)
- return true;
- else
- return false;
+ void set_dir_auth_pending(int b) {
+ set_dir_auth(pair<int,int>(dir_auth.first, b));
}
+ bool is_subtree_root();
+
+
// for giving to clients
void freeze_tree(Context *c);
void freeze_tree_finish(Context *c);
void unfreeze_tree();
+ void _freeze_tree(Context *c=0);
void freeze_dir(Context *c);
void freeze_dir_finish(Context *c);
+ void _freeze_dir(Context *c=0);
void unfreeze_dir();
bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); }
bool is_frozen() { return is_frozen_dir() || is_frozen_tree(); }
bool is_frozen_tree();
bool is_frozen_tree_root() { return state & CDIR_STATE_FROZENTREE; }
- bool is_frozen_tree_leaf() { return state & CDIR_STATE_FROZENTREELEAF; }
bool is_frozen_dir() { return state & CDIR_STATE_FROZENDIR; }
bool is_freezeable() {  // true when a tree freeze rooted at this dir could proceed right now
- if (auth_pins == 0 && nested_auth_pins == 0) return true;
- return false;
+ // no auth pins, direct or nested.
+ if (auth_pins > 0 || nested_auth_pins > 0)
+ return false;
+
+ // inode must not be frozen.
+ if (inode->is_frozen())
+ return false;
+
+ return true;
}
bool is_freezeable_dir() {  // like is_freezeable(), but ignores nested_auth_pins
- if (auth_pins == 0) return true;
- return false;
+ if (auth_pins > 0)  // only pins directly on this dir block a dir freeze
+ return false;
+
+ // inode must not be frozen.
+ if (inode->is_frozen())
+ return false;
+
+ return true;
}
CDir *get_frozen_tree_root();
class CDirDiscover {
inodeno_t ino;
int nonce;
- int dir_auth;
+ int dir_auth;
int dir_rep;
set<int> rep_by;
CDirDiscover(CDir *dir, int nonce) {
ino = dir->ino();
this->nonce = nonce;
- dir_auth = dir->dir_auth;
+ dir_auth = dir->dir_auth.first;
dir_rep = dir->dir_rep;
rep_by = dir->dir_rep_by;
}
assert(!dir->is_auth());
dir->replica_nonce = nonce;
- dir->dir_auth = dir_auth;
+ dir->set_dir_auth( dir_auth );
dir->dir_rep = dir_rep;
dir->dir_rep_by = rep_by;
}
// only auth can open dir alone.
assert(is_auth());
set_dir( new CDir(this, mdcache, true) );
- dir->dir_auth = -1;
return dir;
}
dout(7) << "make_anchor_trace dangling " << ino() << " on mds " << dangling_auth << endl;
string ref_dn;
trace.push_back( new Anchor(ino(),
- MDS_INO_INODEFILE_OFFSET+dangling_auth,
+ MDS_INO_INODEFILE_OFFSET+dangling_auth.first,
ref_dn) );
}
else
parent->dir->adjust_nested_auth_pins( -1 );
}
+void CInode::adjust_nested_auth_pins(int a)  // add a to this inode's nested pin count and pass it up to the containing dir
+{
+ if (!parent) return;  // unlinked inode: nothing above us, and our own count is left untouched
+ nested_auth_pins += a;
+ parent->get_dir()->adjust_nested_auth_pins(a);  // the dir decides whether to keep propagating
+}
+
// authority
-int CInode::authority(int *a2) {
- if (is_dangling()) {
- if (a2) *a2 = dangling_auth2;
+pair<int,int> CInode::authority()  // returns the authority pair for this inode
+{
+ if (is_dangling())
return dangling_auth; // explicit
- }
if (is_root()) { // i am root
if (dir)
- return dir->get_dir_auth(a2); // bit of a chicken/egg issue here!
+ return dir->get_dir_auth(); // bit of a chicken/egg issue here!
else {
- if (a2) *a2 = CDIR_AUTH_UNKNOWN;
- return CDIR_AUTH_UNKNOWN;
+ return CDIR_AUTH_UNDEF;  // root with no dir open: nothing to consult
}
}
+ // this is useless if we hose the hashing crap.
if (parent)
- return parent->dir->dentry_authority( parent->name, a2 );
+ return parent->dir->dentry_authority( parent->name );  // otherwise ask the dir holding our dentry
- if (a2) *a2 = CDIR_AUTH_UNKNOWN;
- return -1; // undefined (inode must not be linked yet!)
+ return CDIR_AUTH_UNDEF;  // not root and no parent dentry: undefined
}
CInodeDiscover* CInode::replicate_to( int rep )
set<CDentry*> remote_parents; // if hard linked
// -- distributed caching
- int dangling_auth; // explicit auth, when dangling.
- int dangling_auth2; // explicit auth, when dangling.
+ pair<int,int> dangling_auth; // explicit auth, when dangling.
- int num_request_pins;
+ int num_request_pins;
// waiters
multimap<int, Context*> waiting;
// -- authority --
- int authority(int *a2=0);
+ pair<int,int> authority();
// -- auth pins --
int is_auth_pinned() {
return auth_pins;
}
- int adjust_nested_auth_pins(int a);
+ void adjust_nested_auth_pins(int a);
bool can_auth_pin();
void auth_pin();
void auth_unpin();
--- /dev/null
+
+
+// =======================================================================
+// HASHING
+
+
+void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth)  // decode nden dentries from bl into dir; oldauth is passed to the inode decoder and used as dir_auth for subdirs that become exports
+{
+ int off = 0;  // read cursor into bl
+
+ for (; nden>0; nden--) {
+ // dentry
+ string dname;
+ _decode(dname, bl, off);
+ dout(15) << "dname is " << dname << endl;
+
+ char icode;  // one-byte tag: 'N' null, 'L' remote link, 'I' inode
+ bl.copy(off, 1, &icode);
+ off++;
+
+ CDentry *dn = dir->lookup(dname);
+ if (!dn)
+ dn = dir->add_dentry(dname); // null
+
+ // mark dn dirty _after_ we link the inode (scroll down)
+
+ if (icode == 'N') {
+
+ // null dentry
+ assert(dn->is_null());
+
+ // fall thru
+ }
+ else if (icode == 'L') {
+ // remote link
+ inodeno_t ino;
+ bl.copy(off, sizeof(ino), (char*)&ino);
+ off += sizeof(ino);
+ dir->link_inode(dn, ino);
+ }
+ else if (icode == 'I') {
+ // inode
+ decode_import_inode(dn, bl, off, oldauth);
+
+ // fix up subdir export?
+ if (dn->inode->dir) {
+ assert(dn->inode->dir->state_test(CDIR_STATE_IMPORTBOUND));
+ dn->inode->dir->put(CDir::PIN_IMPORTBOUND);
+ dn->inode->dir->state_clear(CDIR_STATE_IMPORTBOUND);
+
+ if (dn->inode->dir->is_auth()) {
+ // mine. must have been an import.
+ assert(dn->inode->dir->is_import());
+ dout(7) << "unimporting subdir now that inode is mine " << *dn->inode->dir << endl;
+ dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT );
+ cache->imports.erase(dn->inode->dir);
+ dn->inode->dir->put(CDir::PIN_IMPORT);
+ dn->inode->dir->state_clear(CDIR_STATE_IMPORT);
+
+ // move nested under hashdir
+ for (set<CDir*>::iterator it = cache->nested_exports[dn->inode->dir].begin();
+ it != cache->nested_exports[dn->inode->dir].end();
+ it++)
+ cache->nested_exports[dir].insert(*it);
+ cache->nested_exports.erase(dn->inode->dir);
+
+ // now it matches the inode
+ dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT );  // NOTE(review): same call as a few lines up; looks redundant -- confirm
+ }
+ else {
+ // not mine. make it an export.
+ dout(7) << "making subdir into export " << *dn->inode->dir << endl;
+ dn->inode->dir->get(CDir::PIN_EXPORT);
+ dn->inode->dir->state_set(CDIR_STATE_EXPORT);
+ cache->exports.insert(dn->inode->dir);
+ cache->nested_exports[dir].insert(dn->inode->dir);
+
+ if (dn->inode->dir->get_dir_auth().first == CDIR_AUTH_PARENT)
+ dn->inode->dir->set_dir_auth( oldauth ); // no longer matches inode
+ assert(dn->inode->dir->get_dir_auth().first >= 0);
+ }
+ }
+ }
+
+ // mark dentry dirty? (only _after_ we link the inode!)
+ dn->_mark_dirty(); // fixme
+ }
+}
+
+/*
+
+ notes on interaction of hashing and export/import:
+
+ - dir->is_auth() is completely independent of hashing. for a hashed dir,
+ - all nodes are partially authoritative
+ - all nodes dir->is_hashed() == true
+ - all nodes dir->inode->dir_is_hashed() == true
+ - one node dir->is_auth() == true, the rest == false
+ - dir_auth for all subdirs in a hashed dir will (likely?) be explicit.
+
+ - remember simple rule: dir auth follows inode, unless dir_auth is explicit.
+
+ - export_dir_walk and decode_import_dir take care with dir_auth: (for import/export)
+ - on export, -1 is changed to mds->get_nodeid()
+ - on import, nothing special, actually.
+
+ - hashed dir files aren't included in export; subdirs are converted to imports
+ or exports as necessary.
+ - hashed dir subdirs are discovered on export. this is important
+ because dirs are needed to tie together auth hierarchy, for auth to know about
+ imports/exports, etc.
+
+ - dir state is maintained on auth.
+ - COMPLETE and HASHED are transferred to importers.
+ - DIRTY is set everywhere.
+
+ - hashed dir is like an import: hashed dir used for nested_exports map.
+ - nested_exports is updated appropriately on auth and replicas.
+ - a subtree terminates as a hashed dir, since the hashing explicitly
+ redelegates all inodes. thus export_dir_walk includes hashed dirs, but
+ not their inodes.
+*/
+
+// HASH on auth
+
+// completion context: calls Migrator::hash_dir_frozen(dir) when finished.
+class C_MDC_HashFreeze : public Context {
+public:
+  Migrator *mig;
+  CDir *dir;
+  C_MDC_HashFreeze(Migrator *migrator, CDir *target) {
+    mig = migrator;
+    dir = target;
+  }
+  virtual void finish(int r) {
+    mig->hash_dir_frozen(dir);
+  }
+};
+
+// completion context: calls Migrator::hash_dir_complete(dir) when finished.
+class C_MDC_HashComplete : public Context {
+public:
+  Migrator *mig;
+  CDir *dir;
+  C_MDC_HashComplete(Migrator *mig, CDir *dir) : mig(mig), dir(dir) {}
+  virtual void finish(int r) {
+    mig->hash_dir_complete(dir);
+  }
+};
+
+
+/** hash_dir(dir)
+ * start hashing a directory.
+ *
+ * dir must be auth and not already hashed (asserted).  gives up quietly
+ * if the dir is freezing/frozen or its path cannot be pinned.  otherwise
+ * marks the dir HASHING, sends a discover to every other mds, starts a
+ * dir freeze, and makes sure the dir contents are complete.
+ */
+void Migrator::hash_dir(CDir *dir)
+{
+ dout(-7) << "hash_dir " << *dir << endl;
+
+ assert(!dir->is_hashed());
+ assert(dir->is_auth());
+
+ if (dir->is_frozen() ||
+ dir->is_freezing()) {
+ dout(7) << " can't hash, freezing|frozen." << endl;
+ return;
+ }
+
+ // pin path?
+ vector<CDentry*> trace;
+ cache->make_trace(trace, dir->inode);
+ if (!cache->path_pin(trace, 0, 0)) {
+ dout(7) << "hash_dir couldn't pin path, failing." << endl;
+ return;
+ }
+
+ // ok, go
+ dir->state_set(CDIR_STATE_HASHING);
+ dir->get(CDir::PIN_HASHING);
+ assert(dir->hashed_subset.empty());
+
+ // discover on all mds
+ assert(hash_gather.count(dir) == 0);
+ for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+ if (i == mds->get_nodeid()) continue; // except me
+ hash_gather[dir].insert(i);  // remember who we are waiting on
+ mds->send_message_mds(new MHashDirDiscover(dir->inode), i, MDS_PORT_MIGRATOR);
+ }
+ dir->auth_pin(); // pin until discovers are all acked.  NOTE(review): released in handle_hash_dir_discover_ack; with no other mds no ack ever arrives -- confirm that case
+
+ // start freeze
+ dir->freeze_dir(new C_MDC_HashFreeze(this, dir));
+
+ // make complete
+ if (!dir->is_complete()) {
+ dout(7) << "hash_dir " << *dir << " not complete, fetching" << endl;
+ mds->mdstore->fetch_dir(dir,
+ new C_MDC_HashComplete(this, dir));
+ } else
+ hash_dir_complete(dir);
+}
+
+
+/*
+ * wait for everybody to discover and open the hashing dir
+ * then auth_unpin, to let the freeze happen
+ *
+ * the sender is removed from hash_gather[dir]; the last ack drops the
+ * auth_pin.  the message is deleted here.
+ */
+void Migrator::handle_hash_dir_discover_ack(MHashDirDiscoverAck *m)
+{
+ CInode *in = cache->get_inode(m->get_ino());
+ assert(in);
+ CDir *dir = in->dir;
+ assert(dir);
+
+ int from = m->get_source().num();
+ assert(hash_gather[dir].count(from));  // we must have been expecting this sender
+ hash_gather[dir].erase(from);
+
+ if (hash_gather[dir].empty()) {
+ hash_gather.erase(dir);
+ dout(7) << "hash_dir_discover_ack " << *dir << ", releasing auth_pin" << endl;
+ dir->auth_unpin(); // unpin to allow freeze to complete
+ } else {
+ dout(7) << "hash_dir_discover_ack " << *dir << ", still waiting for " << hash_gather[dir] << endl;
+ }
+
+ delete m; // done
+}
+
+
+
+/*
+ * once the dir is completely in memory,
+ * mark all migrating inodes dirty (to pin in cache)
+ *
+ * dir must be auth and not yet hashed (asserted).  if the dir freeze has
+ * already completed, continue straight on to hash_dir_go().
+ */
+void Migrator::hash_dir_complete(CDir *dir)
+{
+  dout(7) << "hash_dir_complete " << *dir << ", dirtying inodes" << endl;
+
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+
+  // mark dirty to pin in cache
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       it++) {
+    CInode *in = it->second->inode;
+    if (!in) continue;   // dentry with no inode linked here (e.g. a null dentry)
+    in->_mark_dirty(); // fixme
+  }
+
+  if (dir->is_frozen_dir())
+    hash_dir_go(dir);
+}
+
+
+/*
+ * auth: the dir is frozen.
+ * if it is complete too, send each peer an MHashDirPrep listing the open
+ * subdirs whose dentries hash to it, and wait for the acks.
+ * if there is nothing to send, go straight to hash_dir_go().
+ */
+void Migrator::hash_dir_frozen(CDir *dir)
+{
+  dout(7) << "hash_dir_frozen " << *dir << endl;
+
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+
+  if (!dir->is_complete()) {
+    dout(7) << "hash_dir_frozen !complete, waiting still on " << *dir << endl;
+    return;
+  }
+
+  // one (lazily allocated) prep message per mds
+  vector<MHashDirPrep*> preps(mds->get_mds_map()->get_num_mds());
+
+  // look for open subdirs whose dentry hashes to somebody else
+  for (CDir_map_t::iterator p = dir->begin(); p != dir->end(); p++) {
+    CInode *in = p->second->inode;
+
+    if (!in->is_dir() || !in->dir) continue;
+
+    int target = mds->mdcache->hash_dentry( dir->ino(), p->first );
+    if (target == mds->get_nodeid()) continue;
+
+    if (!preps[target])
+      preps[target] = new MHashDirPrep(dir->ino());
+    preps[target]->add_inode(p->first, in->replicate_to(target));
+  }
+
+  // send them, remembering who we are waiting on
+  assert(hash_gather[dir].empty());
+  for (unsigned who = 0; who < preps.size(); who++) {
+    if (!preps[who]) continue;
+    mds->send_message_mds(preps[who], who, MDS_PORT_MIGRATOR);
+    hash_gather[dir].insert(who);
+  }
+
+  // wait for acks?
+  if (!hash_gather[dir].empty())
+    return;
+
+  // no subdirs! continue!
+  hash_gather.erase(dir);
+  hash_dir_go(dir);
+}
+
+/*
+ * auth: a peer has acked our MHashDirPrep.
+ * when the last outstanding ack arrives, do the hashing.
+ */
+void Migrator::handle_hash_dir_prep_ack(MHashDirPrepAck *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  int peer = m->get_source().num();
+
+  assert(hash_gather[dir].count(peer) == 1);
+  hash_gather[dir].erase(peer);
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
+  } else {
+    hash_gather.erase(dir);
+    dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", last one" << endl;
+    hash_dir_go(dir);
+  }
+
+  delete m;
+}
+
+
+/*
+ * auth: the dir is frozen and complete.  do the hashing!
+ *
+ *  - encode every dentry that hashes to another mds into that mds' MHashDir
+ *  - leave the inodes that were sent away behind as pinned proxies
+ *  - fix up import/export state of their open subdirs
+ *  - mark the dir hashed and set up the ack and notify gather sets
+ *  - send the MHashDir messages
+ *
+ * the gather sets are drained by handle_hash_dir_ack() and
+ * handle_hash_dir_notify(); whichever empties last calls hash_dir_finish().
+ */
+void Migrator::hash_dir_go(CDir *dir)
+{
+  dout(7) << "hash_dir_go " << *dir << endl;
+
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+
+  // get messages to other nodes ready
+  vector<MHashDir*> msgs(mds->get_mds_map()->get_num_mds());
+  for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+    if (i == mds->get_nodeid()) continue;
+    msgs[i] = new MHashDir(dir->ino());
+  }
+
+  // pick a hash seed.
+  dir->inode->inode.hash_seed = 1;//dir->ino();
+
+  // suck up all waiters
+  // NOTE(review): fin only ever has waiters moved into it; nothing in this
+  // function finishes, queues or deletes it.  confirm who is meant to own it.
+  C_Contexts *fin = new C_Contexts;
+  list<Context*> waiting;
+  dir->take_waiting(CDIR_WAIT_ANY, waiting);    // all dir waiters
+  fin->take(waiting);
+
+  // get containing import.  might be me.
+  CDir *containing_import = cache->get_auth_container(dir);
+  assert(containing_import != dir || dir->is_import());
+
+  // divy up contents
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       it++) {
+    CDentry *dn = it->second;
+    CInode *in = dn->inode;
+
+    int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
+    if (dentryhashcode == mds->get_nodeid()) {
+      continue;      // still mine!
+    }
+
+    bufferlist *bl = msgs[dentryhashcode]->get_state_ptr();
+    assert(bl);
+
+    // -- dentry
+    dout(7) << "hash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl;
+    _encode(it->first, *bl);
+
+    // null dentry?
+    if (dn->is_null()) {
+      bl->append("N", 1);  // null dentry
+      assert(dn->is_sync());
+      continue;
+    }
+
+    if (dn->is_remote()) {
+      // remote link
+      bl->append("L", 1);  // remote link
+
+      inodeno_t ino = dn->get_remote_ino();
+      bl->append((char*)&ino, sizeof(ino));
+      continue;
+    }
+
+    // primary link
+    // -- inode
+    bl->append("I", 1);    // inode dentry
+
+    encode_export_inode(in, *bl, dentryhashcode);  // encode, and (update state for) export
+    msgs[dentryhashcode]->inc_nden();
+
+    if (dn->is_dirty())
+      dn->mark_clean();
+
+    // add to proxy
+    hash_proxy_inos[dir].push_back(in);
+    in->state_set(CInode::STATE_PROXY);
+    in->get(CInode::PIN_PROXY);
+
+    // fix up subdirs
+    if (in->dir) {
+      if (in->dir->is_auth()) {
+        // mine.  make it into an import.
+        dout(7) << "making subdir into import " << *in->dir << endl;
+        in->dir->set_dir_auth( mds->get_nodeid() );
+        cache->imports.insert(in->dir);
+        in->dir->get(CDir::PIN_IMPORT);
+        in->dir->state_set(CDIR_STATE_IMPORT);
+
+        // fix nested bits
+        // (the iterator is advanced before the erase so that it stays valid)
+        for (set<CDir*>::iterator q = cache->nested_exports[containing_import].begin();
+             q != cache->nested_exports[containing_import].end(); ) {
+          CDir *ex = *q;
+          q++;
+          if (cache->get_auth_container(ex) == in->dir) {
+            dout(10) << "moving nested export " << *ex << endl;
+            cache->nested_exports[containing_import].erase(ex);
+            cache->nested_exports[in->dir].insert(ex);
+          }
+        }
+      }
+      else {
+        // not mine.
+        dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl;
+        assert(in->dir->is_export());
+        in->dir->put(CDir::PIN_EXPORT);
+        in->dir->state_clear(CDIR_STATE_EXPORT);
+        cache->exports.erase(in->dir);
+        cache->nested_exports[containing_import].erase(in->dir);
+        if (in->dir->authority() == dentryhashcode)
+          in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+        else
+          in->dir->set_dir_auth( in->dir->authority() );
+      }
+    }
+
+    // waiters
+    list<Context*> waiters;
+    in->take_waiting(CINODE_WAIT_ANY, waiters);
+    fin->take(waiters);
+  }
+
+  // dir state
+  dir->state_set(CDIR_STATE_HASHED);
+  dir->get(CDir::PIN_HASHED);
+  cache->hashdirs.insert(dir);
+  dir->mark_dirty(dir->pre_dirty()); // fixme
+  mds->mdlog->submit_entry(new EString("dirty dir fixme"));
+
+  // inode state
+  if (dir->inode->is_auth()) {
+    dir->inode->_mark_dirty(); // fixme
+    mds->mdlog->submit_entry(new EString("hash dirty fixme"));
+  }
+
+  // fix up nested_exports?
+  if (containing_import != dir) {
+    dout(7) << "moving nested exports under hashed dir" << endl;
+    for (set<CDir*>::iterator it = cache->nested_exports[containing_import].begin();
+         it != cache->nested_exports[containing_import].end(); ) {
+      CDir *ex = *it;
+      it++;
+      if (cache->get_auth_container(ex) == dir) {
+        dout(7) << " moving nested export under hashed dir: " << *ex << endl;
+        cache->nested_exports[containing_import].erase(ex);
+        cache->nested_exports[dir].insert(ex);
+      } else {
+        dout(7) << " NOT moving nested export under hashed dir: " << *ex << endl;
+      }
+    }
+  }
+
+  // send hash messages
+  assert(hash_gather[dir].empty());
+  assert(hash_notify_gather[dir].empty());
+  assert(dir->hashed_subset.empty());
+  for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+    // all nodes hashed locally..
+    dir->hashed_subset.insert(i);
+
+    if (i == mds->get_nodeid()) continue;
+
+    // init hash_gather and hash_notify_gather sets
+    hash_gather[dir].insert(i);
+
+    // we expect a notify about i via every other peer.  check with count()
+    // rather than operator[] so that no empty per-peer set is created when
+    // there are no other peers: such an entry would never be erased by a
+    // notify, hash_notify_gather[dir] would stay non-empty, and
+    // handle_hash_dir_ack() would wait for notifies forever.
+    assert(hash_notify_gather[dir].count(i) == 0);
+    for (int j=0; j<mds->get_mds_map()->get_num_mds(); j++) {
+      if (j == mds->get_nodeid()) continue;
+      if (j == i) continue;
+      hash_notify_gather[dir][i].insert(j);
+    }
+
+    mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR);
+  }
+
+  // wait for all the acks.
+}
+
+
+/*
+ * auth: a peer has acked our MHashDir.
+ * finish up once all acks and all notifies have been gathered.
+ */
+void Migrator::handle_hash_dir_ack(MHashDirAck *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  assert(dir->is_hashed());
+  assert(dir->is_hashing());
+
+  int peer = m->get_source().num();
+  assert(hash_gather[dir].count(peer) == 1);
+  hash_gather[dir].erase(peer);
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "handle_hash_dir_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
+  } else {
+    dout(7) << "handle_hash_dir_ack on " << *dir << ", last one" << endl;
+
+    if (!hash_notify_gather[dir].empty()) {
+      dout(7) << "waiting on notifies " << endl;
+    } else {
+      dout(7) << "got notifies too, all done" << endl;
+      hash_dir_finish(dir);
+    }
+  }
+
+  delete m;
+}
+
+
+/*
+ * auth: hashing is done.
+ * drop the hashing state and pin, unproxy the inodes that were sent away,
+ * unpin the path and unfreeze the dir.
+ */
+void Migrator::hash_dir_finish(CDir *dir)
+{
+  dout(7) << "hash_dir_finish finishing " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_hashing());
+
+  // dir state
+  hash_gather.erase(dir);
+  dir->state_clear(CDIR_STATE_HASHING);
+  dir->put(CDir::PIN_HASHING);
+  dir->hashed_subset.clear();
+
+  // unproxy inodes
+  //  this _could_ happen sooner, on a per-peer basis, but no harm in waiting a few more seconds.
+  list<CInode*>& proxies = hash_proxy_inos[dir];
+  list<CInode*>::iterator p = proxies.begin();
+  while (p != proxies.end()) {
+    CInode *proxy = *p;
+    assert(proxy->state_test(CInode::STATE_PROXY));
+    proxy->state_clear(CInode::STATE_PROXY);
+    proxy->put(CInode::PIN_PROXY);
+    p++;
+  }
+  hash_proxy_inos.erase(dir);
+
+  // unpin path
+  vector<CDentry*> path;
+  cache->make_trace(path, dir->inode);
+  cache->path_unpin(path, 0);
+
+  // unfreeze
+  dir->unfreeze_dir();
+
+  show_imports();
+  assert(hash_gather.count(dir) == 0);
+
+  // stats
+  //if (mds->logger) mds->logger->inc("nh", 1);
+
+}
+
+
+
+
+// HASH on auth and non-auth
+
+/*
+ * a peer has taken over its share of a hashing dir.
+ *
+ * m->get_from() is the mds the notify is about; m->get_source() is whoever
+ * sent this particular message to us.
+ *
+ * auth:     cross (from, source) off hash_notify_gather, and finish the hash
+ *           once both the notifies and the acks are all in.  the message is
+ *           deleted.
+ * non-auth: record that 'from' is hashed, fix dir_auth on the open subdirs
+ *           that hash to it, cross it off our own gather set, and forward
+ *           the very same message on to the auth.
+ */
+void Migrator::handle_hash_dir_notify(MHashDirNotify *m)
+{
+ CInode *in = cache->get_inode(m->get_ino());
+ assert(in);
+ CDir *dir = in->dir;
+ assert(dir);
+ assert(dir->is_hashing());
+
+ dout(5) << "handle_hash_dir_notify " << *dir << endl;
+ int from = m->get_from();
+
+ int source = m->get_source().num();
+ if (dir->is_auth()) {
+ // gather notifies
+ assert(dir->is_hashed());
+
+ assert( hash_notify_gather[dir][from].count(source) );
+ hash_notify_gather[dir][from].erase(source);
+
+ if (hash_notify_gather[dir][from].empty()) {
+ dout(7) << "last notify from " << from << endl;
+ hash_notify_gather[dir].erase(from);
+
+ if (hash_notify_gather[dir].empty()) {
+ dout(7) << "last notify!" << endl;
+ hash_notify_gather.erase(dir);
+
+ // notifies are done; the acks (hash_gather) may still be outstanding
+ if (hash_gather[dir].empty()) {
+ dout(7) << "got acks too, all done" << endl;
+ hash_dir_finish(dir);
+ } else {
+ dout(7) << "still waiting on acks from " << hash_gather[dir] << endl;
+ }
+ } else {
+ dout(7) << "still waiting for notify gathers from " << hash_notify_gather[dir].size() << " others" << endl;
+ }
+ } else {
+ dout(7) << "still waiting for notifies from " << from << " via " << hash_notify_gather[dir][from] << endl;
+ }
+
+ // delete msg
+ delete m;
+ } else {
+ // update dir hashed_subset
+ assert(dir->hashed_subset.count(from) == 0);
+ dir->hashed_subset.insert(from);
+
+ // update open subdirs
+ for (CDir_map_t::iterator it = dir->begin();
+ it != dir->end();
+ it++) {
+ CDentry *dn = it->second;
+ CInode *in = dn->get_inode(); // note: shadows the dir inode 'in' above
+ if (!in) continue;
+ if (!in->dir) continue;
+
+ int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
+ if (dentryhashcode != from) continue; // we'll import these in a minute
+
+ // subdir shares its auth with the dentry's new home -> defer to the parent;
+ // otherwise pin down its current authority explicitly
+ if (in->dir->authority() != dentryhashcode)
+ in->dir->set_dir_auth( in->dir->authority() );
+ else
+ in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+ }
+
+ // remove from notify gather set
+ // (on a non-auth node hash_gather[dir] is filled in handle_hash_dir_discover_2)
+ assert(hash_gather[dir].count(from));
+ hash_gather[dir].erase(from);
+
+ // last notify?
+ if (hash_gather[dir].empty()) {
+ dout(7) << "gathered all the notifies, finishing hash of " << *dir << endl;
+ hash_gather.erase(dir);
+
+ dir->state_clear(CDIR_STATE_HASHING);
+ dir->put(CDir::PIN_HASHING);
+ dir->hashed_subset.clear();
+ } else {
+ dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
+ }
+
+ // fw notify to auth
+ // (m is handed to send_message_mds here, so it is not deleted in this branch)
+ mds->send_message_mds(m, dir->authority(), MDS_PORT_MIGRATOR);
+ }
+}
+
+
+
+
+// HASH on non-auth
+
+/*
+ * discover step:
+ *  each peer needs to open up the directory and pin it before we start.
+ *
+ * completion context for the path traversal: works out which inode the
+ * traversal ended on and hands it to handle_hash_dir_discover_2().
+ */
+class C_MDC_HashDirDiscover : public Context {
+  Migrator *mig;
+  MHashDirDiscover *m;
+public:
+  vector<CDentry*> trace;   // dentries of the traversed path
+  C_MDC_HashDirDiscover(Migrator *mig, MHashDirDiscover *m) : mig(mig), m(m) { }
+  void finish(int r) {
+    CInode *in = 0;
+    if (r >= 0) {
+      // an empty trace means the path was the root itself
+      in = trace.empty() ? mig->cache->get_root() : trace.back()->get_inode();
+    }
+    mig->handle_hash_dir_discover_2(m, in, r);
+  }
+};
+
+/*
+ * non-auth: the auth wants to hash a dir.
+ * traverse (discovering as needed) to the path named in the message;
+ * handle_hash_dir_discover_2() takes over when the traversal completes.
+ */
+void Migrator::handle_hash_dir_discover(MHashDirDiscover *m)
+{
+  assert(m->get_source().num() != mds->get_nodeid());
+
+  dout(7) << "handle_hash_dir_discover on " << m->get_path() << endl;
+
+  // must discover it!
+  filepath fpath(m->get_path());
+  C_MDC_HashDirDiscover *fin = new C_MDC_HashDirDiscover(this, m);
+  cache->path_traverse(fpath, fin->trace, true,
+                       m, new C_MDS_RetryMessage(mds,m),  // on delay/retry
+                       MDS_TRAVERSE_DISCOVER,
+                       fin);                              // on completion|error
+}
+
+/*
+ * non-auth: the traversal for an MHashDirDiscover has completed.
+ *
+ *  m   the original discover message (deleted once we have replied)
+ *  in  inode the traversal ended on, or 0 if it failed
+ *  r   traversal result; < 0 on error
+ *
+ * make sure the dir is open (retrying the message once it is), mark it
+ * hashing and pin it, set up the gather set for the notifies we expect
+ * from the other peers, and ack back to the sender.
+ */
+void Migrator::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r)
+{
+  // yay!
+  if (in) {
+    dout(7) << "handle_hash_dir_discover_2 has " << *in << endl;
+  }
+
+  // check 'in' before dereferencing it: a successful traversal can still
+  // hand us a null inode.
+  if (r < 0 || !in || !in->is_dir()) {
+    dout(7) << "handle_hash_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl;
+    assert(0);    // this shouldn't happen if the auth pins his path properly!!!!
+    // with asserts compiled out, bail rather than fall through and
+    // dereference a null or non-directory inode below.
+    delete m;
+    return;
+  }
+  assert(in->is_dir());
+
+  // is dir open?
+  if (!in->dir) {
+    dout(7) << "handle_hash_dir_discover_2 opening dir " << *in << endl;
+    cache->open_remote_dir(in,
+                           new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+  CDir *dir = in->dir;
+
+  // pin dir, set hashing flag
+  dir->state_set(CDIR_STATE_HASHING);
+  dir->get(CDir::PIN_HASHING);
+  assert(dir->hashed_subset.empty());
+
+  // inode state
+  dir->inode->inode.hash_seed = 1;// dir->ino();
+  if (dir->inode->is_auth()) {
+    dir->inode->_mark_dirty(); // fixme
+    mds->mdlog->submit_entry(new EString("hash dirty fixme"));
+  }
+
+  // get gather set ready for notifies
+  // (everyone but ourselves and the dir's authority)
+  assert(hash_gather[dir].empty());
+  for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+    if (i == mds->get_nodeid()) continue;
+    if (i == dir->authority()) continue;
+    hash_gather[dir].insert(i);
+  }
+
+  // reply
+  dout(7) << " sending hash_dir_discover_ack on " << *dir << endl;
+  mds->send_message_mds(new MHashDirDiscoverAck(dir->ino()),
+                        m->get_source().num(), MDS_PORT_MIGRATOR);
+  delete m;
+}
+
+/*
+ * prep step:
+ * peers need to open up all subdirs of the hashed dir
+ *
+ * this handler is re-entered with the same message (via C_MDS_RetryMessage)
+ * until every listed subdir is open; the did_assim/mark_assim flag on the
+ * message makes sure the inodes are only assimilated on the first pass.
+ */
+
+void Migrator::handle_hash_dir_prep(MHashDirPrep *m)
+{
+ CInode *in = cache->get_inode(m->get_ino());
+ assert(in);
+ CDir *dir = in->dir;
+ assert(dir);
+
+ dout(7) << "handle_hash_dir_prep " << *dir << endl;
+
+ if (!m->did_assim()) {
+ m->mark_assim(); // only do this the first time!
+
+ // assimilate dentry+inodes for exports
+ for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
+ it != m->get_inodes().end();
+ it++) {
+ CInode *in = cache->get_inode( it->second->get_ino() );
+ if (in) {
+ // already in our cache: just refresh it from the discover
+ it->second->update_inode(in);
+ dout(5) << " updated " << *in << endl;
+ } else {
+ // new to us: create it, add it to the cache and link it into the dir
+ in = new CInode(mds->mdcache, false);
+ it->second->update_inode(in);
+ cache->add_inode(in);
+
+ // link
+ dir->add_dentry( it->first, in );
+ dout(5) << " added " << *in << endl;
+ }
+
+ // open!
+ // NOTE(review): one retry context for the same m is queued per unopened
+ // dir. if an earlier retry gets as far as the 'delete m' below while
+ // others are still queued, those would be handed a freed message --
+ // confirm how open_remote_dir fires its waiters.
+ if (!in->dir) {
+ dout(5) << " opening nested export on " << *in << endl;
+ cache->open_remote_dir(in,
+ new C_MDS_RetryMessage(mds, m));
+ }
+ }
+ }
+
+ // verify!
+ // (runs on every pass: pin whatever is open, count whatever is not)
+ int waiting_for = 0;
+ for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
+ it != m->get_inodes().end();
+ it++) {
+ CInode *in = cache->get_inode( it->second->get_ino() );
+ assert(in);
+
+ if (in->dir) {
+ // the IMPORTBOUND state bit keeps us from taking the pin twice across retries
+ if (!in->dir->state_test(CDIR_STATE_IMPORTBOUND)) {
+ dout(5) << " pinning nested export " << *in->dir << endl;
+ in->dir->get(CDir::PIN_IMPORTBOUND);
+ in->dir->state_set(CDIR_STATE_IMPORTBOUND);
+ } else {
+ dout(5) << " already pinned nested export " << *in << endl;
+ }
+ } else {
+ dout(5) << " waiting for nested export dir on " << *in << endl;
+ waiting_for++;
+ }
+ }
+
+ if (waiting_for) {
+ // m is kept alive here; a queued retry will bring us back
+ dout(5) << "waiting for " << waiting_for << " dirs to open" << endl;
+ return;
+ }
+
+ // ack!
+ mds->send_message_mds(new MHashDirPrepAck(dir->ino()),
+ m->get_source().num(), MDS_PORT_MIGRATOR);
+
+ // done.
+ delete m;
+}
+
+
+/*
+ * hash step:
+ *  non-auth: the auth has sent us our share of the dir.
+ *  import it, mark the dir hashed, commit, tell the other peers
+ *  (MHashDirNotify) and ack back to the auth.
+ */
+
+void Migrator::handle_hash_dir(MHashDir *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+  assert(!dir->is_auth());
+  assert(!dir->is_hashed());
+  assert(dir->is_hashing());
+
+  dout(5) << "handle_hash_dir " << *dir << endl;
+  int oldauth = m->get_source().num();
+
+  // content
+  import_hashed_content(dir, m->get_state(), m->get_nden(), oldauth);
+
+  // dir state
+  dir->state_set(CDIR_STATE_HASHED);
+  dir->get(CDir::PIN_HASHED);
+  cache->hashdirs.insert(dir);
+  dir->hashed_subset.insert(mds->get_nodeid());
+
+  // dir is complete
+  dir->mark_complete();
+  dir->mark_dirty(dir->pre_dirty()); // fixme
+  mds->mdlog->submit_entry(new EString("dirty dir fixme"));
+
+  // commit
+  mds->mdstore->commit_dir(dir, 0);
+
+  // send notifies to everyone but ourselves and the sender
+  dout(7) << "sending notifies" << endl;
+  for (int who = 0; who < mds->get_mds_map()->get_num_mds(); who++) {
+    if (who == mds->get_nodeid() || who == m->get_source().num())
+      continue;
+    mds->send_message_mds(new MHashDirNotify(dir->ino(), mds->get_nodeid()),
+                          who, MDS_PORT_MIGRATOR);
+  }
+
+  // ack
+  dout(7) << "acking" << endl;
+  mds->send_message_mds(new MHashDirAck(dir->ino()),
+                        m->get_source().num(), MDS_PORT_MIGRATOR);
+
+  // done.
+  delete m;
+
+  show_imports();
+}
+
+
+
+
+
+// UNHASH on auth
+
+// context passed to freeze_dir() by unhash_dir(): continues in unhash_dir_frozen().
+class C_MDC_UnhashFreeze : public Context {
+public:
+  Migrator *mig;
+  CDir *dir;
+  C_MDC_UnhashFreeze(Migrator *m, CDir *d)
+    : mig(m),
+      dir(d)
+  { }
+  virtual void finish(int r) { mig->unhash_dir_frozen(dir); }
+};
+
+// context passed to fetch_dir() while unhashing: continues in unhash_dir_complete().
+class C_MDC_UnhashComplete : public Context {
+public:
+  Migrator *mig;
+  CDir *dir;
+  C_MDC_UnhashComplete(Migrator *m, CDir *d)
+    : mig(m),
+      dir(d)
+  { }
+  virtual void finish(int r) { mig->unhash_dir_complete(dir); }
+};
+
+
+/*
+ * auth: start unhashing a hashed dir.
+ * pin the path, flag the dir as unhashing, start freezing it, and make
+ * sure its contents are in memory.  gives up if the path can't be pinned.
+ */
+void Migrator::unhash_dir(CDir *dir)
+{
+  dout(-7) << "unhash_dir " << *dir << endl;
+
+  assert(dir->is_hashed());
+  assert(!dir->is_unhashing());
+  assert(dir->is_auth());
+  assert(hash_gather.count(dir)==0);
+
+  // pin path?
+  vector<CDentry*> path;
+  cache->make_trace(path, dir->inode);
+  if (!cache->path_pin(path, 0, 0)) {
+    dout(7) << "unhash_dir couldn't pin path, failing." << endl;
+    return;
+  }
+
+  // twiddle state
+  dir->state_set(CDIR_STATE_UNHASHING);
+
+  // first, freeze the dir.
+  dir->freeze_dir(new C_MDC_UnhashFreeze(this, dir));
+
+  // make complete
+  if (dir->is_complete()) {
+    unhash_dir_complete(dir);
+  } else {
+    dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl;
+    mds->mdstore->fetch_dir(dir,
+                            new C_MDC_UnhashComplete(this, dir));
+  }
+
+}
+
+/*
+ * auth: the dir being unhashed is frozen.
+ * move on to the prep step if it is complete as well.
+ */
+void Migrator::unhash_dir_frozen(CDir *dir)
+{
+  dout(7) << "unhash_dir_frozen " << *dir << endl;
+
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+
+  if (dir->is_complete()) {
+    unhash_dir_prep(dir);
+    return;
+  }
+
+  dout(7) << "unhash_dir_frozen !complete, waiting still on " << *dir << endl;
+}
+
+
+/*
+ * auth: ask peers to freeze and complete the hashed dir.
+ * sends an MUnhashDirPrep to every other mds and records it in the gather
+ * set.  does nothing if the gather set is already populated.
+ */
+void Migrator::unhash_dir_prep(CDir *dir)
+{
+  dout(7) << "unhash_dir_prep " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+  assert(dir->is_complete());
+
+  // already been here..freeze must have been instantaneous
+  if (!hash_gather[dir].empty())
+    return;
+
+  // send unhash prep to all peers
+  assert(hash_gather[dir].empty());
+  int me = mds->get_nodeid();
+  for (int who = 0; who < mds->get_mds_map()->get_num_mds(); who++) {
+    if (who != me) {
+      hash_gather[dir].insert(who);
+      mds->send_message_mds(new MUnhashDirPrep(dir->ino()),
+                            who, MDS_PORT_MIGRATOR);
+    }
+  }
+}
+
+/*
+ * wait for peers to freeze and complete hashed dirs
+ *
+ * auth: a peer's prep ack carries discovers for the open subdirs that hash
+ * to it. assimilate them, make sure those subdirs are open and pinned here
+ * (retrying this same message until they are), then cross the peer off the
+ * gather set. the last ack triggers unhash_dir_go().
+ */
+void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m)
+{
+ CInode *in = cache->get_inode(m->get_ino());
+ assert(in);
+ CDir *dir = in->dir;
+ assert(dir);
+
+ int from = m->get_source().num();
+ dout(7) << "handle_unhash_dir_prep_ack from " << from << " " << *dir << endl;
+
+ if (!m->did_assim()) {
+ m->mark_assim(); // only do this the first time!
+
+ // assimilate dentry+inodes for exports
+ for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
+ it != m->get_inodes().end();
+ it++) {
+ CInode *in = cache->get_inode( it->second->get_ino() );
+ if (in) {
+ // already cached: refresh from the discover
+ it->second->update_inode(in);
+ dout(5) << " updated " << *in << endl;
+ } else {
+ // not cached: create it, add to the cache, link into the dir
+ in = new CInode(mds->mdcache, false);
+ it->second->update_inode(in);
+ cache->add_inode(in);
+
+ // link
+ dir->add_dentry( it->first, in );
+ dout(5) << " added " << *in << endl;
+ }
+
+ // open!
+ // NOTE(review): one retry context for the same m is queued per unopened
+ // dir; if an earlier retry reaches the 'delete m' below while others are
+ // still queued, those would see a freed message -- confirm.
+ if (!in->dir) {
+ dout(5) << " opening nested export on " << *in << endl;
+ cache->open_remote_dir(in,
+ new C_MDS_RetryMessage(mds, m));
+ }
+ }
+ }
+
+ // verify!
+ // (runs on every pass: pin whatever is open, count whatever is not)
+ int waiting_for = 0;
+ for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
+ it != m->get_inodes().end();
+ it++) {
+ CInode *in = cache->get_inode( it->second->get_ino() );
+ assert(in);
+
+ if (in->dir) {
+ // the IMPORTBOUND state bit keeps us from taking the pin twice across retries
+ if (!in->dir->state_test(CDIR_STATE_IMPORTBOUND)) {
+ dout(5) << " pinning nested export " << *in->dir << endl;
+ in->dir->get(CDir::PIN_IMPORTBOUND);
+ in->dir->state_set(CDIR_STATE_IMPORTBOUND);
+ } else {
+ dout(5) << " already pinned nested export " << *in << endl;
+ }
+ } else {
+ dout(5) << " waiting for nested export dir on " << *in << endl;
+ waiting_for++;
+ }
+ }
+
+ if (waiting_for) {
+ // the sender stays in hash_gather until all of its subdirs are open
+ dout(5) << "waiting for " << waiting_for << " dirs to open" << endl;
+ return;
+ }
+
+ // ok, done with this PrepAck
+ assert(hash_gather[dir].count(from) == 1);
+ hash_gather[dir].erase(from);
+
+ if (hash_gather[dir].empty()) {
+ hash_gather.erase(dir);
+ dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", last one" << endl;
+ unhash_dir_go(dir);
+ } else {
+ dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
+ }
+
+ delete m;
+}
+
+
+/*
+ * auth:
+ *  all peers are prepped.  send each of them an MUnhashDir and gather
+ *  the replies.
+ */
+void Migrator::unhash_dir_go(CDir *dir)
+{
+  dout(7) << "unhash_dir_go " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+  assert(dir->is_complete());
+
+  // send unhash to all peers
+  assert(hash_gather[dir].empty());
+  int me = mds->get_nodeid();
+  for (int who = 0; who < mds->get_mds_map()->get_num_mds(); who++) {
+    if (who != me) {
+      hash_gather[dir].insert(who);
+      mds->send_message_mds(new MUnhashDir(dir->ino()),
+                            who, MDS_PORT_MIGRATOR);
+    }
+  }
+}
+
+/*
+ * auth:
+ * assimilate unhashing content
+ *
+ * each peer answers MUnhashDir with its share of the dir. import it, and
+ * once the last peer has answered: fold nested exports back under the
+ * containing import, clear the HASHED state, commit the dir, reset the hash
+ * seed, and send MUnhashDirNotify to every peer (gathering the notify acks
+ * in hash_gather).
+ */
+void Migrator::handle_unhash_dir_ack(MUnhashDirAck *m)
+{
+ CInode *in = cache->get_inode(m->get_ino());
+ assert(in);
+ CDir *dir = in->dir;
+ assert(dir);
+
+ dout(7) << "handle_unhash_dir_ack " << *dir << endl;
+ assert(dir->is_hashed());
+
+ // assimilate content
+ int from = m->get_source().num();
+ import_hashed_content(dir, m->get_state(), m->get_nden(), from);
+ delete m; // m must not be touched past this point; 'from' was saved above
+
+ // done?
+ assert(hash_gather[dir].count(from));
+ hash_gather[dir].erase(from);
+
+ if (!hash_gather[dir].empty()) {
+ dout(7) << "still waiting for unhash acks from " << hash_gather[dir] << endl;
+ return;
+ }
+
+ // done!
+
+ // fix up nested_exports
+ CDir *containing_import = cache->get_auth_container(dir);
+ if (containing_import != dir) {
+ for (set<CDir*>::iterator it = cache->nested_exports[dir].begin();
+ it != cache->nested_exports[dir].end();
+ it++) {
+ dout(7) << "moving nested export out from under hashed dir : " << **it << endl;
+ cache->nested_exports[containing_import].insert(*it);
+ }
+ cache->nested_exports.erase(dir);
+ }
+
+ // dir state
+ // (UNHASHING stays set for now; unhash_dir_finish() clears it)
+ //dir->state_clear(CDIR_STATE_UNHASHING); //later
+ dir->state_clear(CDIR_STATE_HASHED);
+ dir->put(CDir::PIN_HASHED);
+ cache->hashdirs.erase(dir);
+
+ // commit!
+ assert(dir->is_complete());
+ //dir->mark_complete();
+ dir->mark_dirty(dir->pre_dirty()); // fixme
+ mds->mdstore->commit_dir(dir, 0);
+
+ // inode state
+ dir->inode->inode.hash_seed = 0;
+ if (dir->inode->is_auth()) {
+ dir->inode->_mark_dirty(); // fixme
+ mds->mdlog->submit_entry(new EString("hash inode dirty fixme"));
+ }
+
+ // notify
+ // (hash_gather[dir] is reused here to gather the notify acks)
+ assert(hash_gather[dir].empty());
+ for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+ if (i == mds->get_nodeid()) continue;
+
+ hash_gather[dir].insert(i);
+
+ mds->send_message_mds(new MUnhashDirNotify(dir->ino()),
+ i, MDS_PORT_MIGRATOR);
+ }
+}
+
+
+/*
+ * auth: sent by a peer once its mds links are flushed.
+ * cross the sender off the gather set; unfreeze (via unhash_dir_finish)
+ * when all notify acks have been gathered.
+ */
+void Migrator::handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+
+  // log under this handler's own name (the label used to read
+  // "handle_unhash_dir_ack", which is a different handler)
+  dout(7) << "handle_unhash_dir_notify_ack " << *dir << endl;
+  assert(!dir->is_hashed());
+  assert(dir->is_unhashing());
+  assert(dir->is_frozen_dir());
+
+  // done?
+  int from = m->get_source().num();
+  assert(hash_gather[dir].count(from));
+  hash_gather[dir].erase(from);
+  delete m;
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "still waiting for notifyack from " << hash_gather[dir] << " on " << *dir << endl;
+  } else {
+    unhash_dir_finish(dir);
+  }
+}
+
+
+/*
+ * auth: all mds links are flushed.
+ * drop the gather set, unpin the path, clear the unhashing flag and
+ * unfreeze the dir!
+ */
+void Migrator::unhash_dir_finish(CDir *dir)
+{
+  dout(7) << "unhash_dir_finish " << *dir << endl;
+  hash_gather.erase(dir);
+
+  // unpin path
+  vector<CDentry*> path;
+  cache->make_trace(path, dir->inode);
+  cache->path_unpin(path, 0);
+
+  // state
+  dir->state_clear(CDIR_STATE_UNHASHING);
+
+  // unfreeze
+  dir->unfreeze_dir();
+
+}
+
+
+
+// UNHASH on all
+
+/*
+ * auth or non-auth: the hashed dir is complete.
+ * dirty (and journal) every inode we are auth for, to pin it in the cache.
+ * if the dir is frozen as well, take the next step: unhash_dir_prep() on
+ * the auth, unhash_dir_prep_finish() on everyone else.
+ */
+void Migrator::unhash_dir_complete(CDir *dir)
+{
+  dout(7) << "unhash_dir_complete " << *dir << ", dirtying inodes" << endl;
+
+  assert(dir->is_hashed());
+  assert(dir->is_complete());
+
+  // mark dirty to pin in cache
+  CDir_map_t::iterator p = dir->begin();
+  while (p != dir->end()) {
+    CInode *in = p->second->inode;
+    p++;
+    if (!in->is_auth()) continue;
+    in->_mark_dirty(); // fixme
+    mds->mdlog->submit_entry(new EString("unhash dirty fixme"));
+  }
+
+  if (!dir->is_frozen_dir()) {
+    dout(7) << "dir complete but !frozen, waiting " << *dir << endl;
+    return;
+  }
+
+  if (dir->is_auth())
+    unhash_dir_prep(dir);          // auth
+  else
+    unhash_dir_prep_finish(dir);   // nonauth
+}
+
+
+// UNHASH on non-auth
+
+// context passed to freeze_dir() by handle_unhash_dir_prep(): continues in unhash_dir_prep_frozen().
+class C_MDC_UnhashPrepFreeze : public Context {
+public:
+  Migrator *mig;
+  CDir *dir;
+  C_MDC_UnhashPrepFreeze(Migrator *m, CDir *d)
+    : mig(m),
+      dir(d)
+  { }
+  virtual void finish(int r) { mig->unhash_dir_prep_frozen(dir); }
+};
+
+
+/*
+ * non-auth: the auth asks us to freeze our copy of the hashed dir and
+ * make it complete.  the reply is sent from unhash_dir_prep_finish().
+ */
+void Migrator::handle_unhash_dir_prep(MUnhashDirPrep *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  dout(7) << "handle_unhash_dir_prep " << *dir << endl;
+  assert(dir->is_hashed());
+
+  // freeze
+  dir->freeze_dir(new C_MDC_UnhashPrepFreeze(this, dir));
+
+  // make complete
+  if (dir->is_complete()) {
+    unhash_dir_complete(dir);
+  } else {
+    dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl;
+    mds->mdstore->fetch_dir(dir,
+                            new C_MDC_UnhashComplete(this, dir));
+  }
+
+  delete m;
+}
+
+/*
+ * non-auth: our copy of the hashed dir is frozen.
+ * if it is complete too, finish the prep step.
+ */
+void Migrator::unhash_dir_prep_frozen(CDir *dir)
+{
+  dout(7) << "unhash_dir_prep_frozen " << *dir << endl;
+
+  assert(dir->is_hashed());
+  assert(dir->is_frozen_dir());
+  assert(!dir->is_auth());
+
+  if (dir->is_complete()) {
+    unhash_dir_prep_finish(dir);
+    return;
+  }
+
+  dout(7) << "unhash_dir_prep_frozen !complete, waiting still on " << *dir << endl;
+}
+
+/*
+ * non-auth: our copy of the hashed dir is complete and frozen.  ack.
+ * the ack carries a replica of every open subdir inode whose dentry hashes
+ * to us.  the UNHASHING flag doubles as the "already replied" marker.
+ */
+void Migrator::unhash_dir_prep_finish(CDir *dir)
+{
+  dout(7) << "unhash_dir_prep_finish " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(!dir->is_auth());
+  assert(dir->is_frozen());
+  assert(dir->is_complete());
+
+  // twiddle state
+  if (dir->is_unhashing())
+    return;  // already replied.
+  dir->state_set(CDIR_STATE_UNHASHING);
+
+  // send subdirs back to auth
+  MUnhashDirPrepAck *ack = new MUnhashDirPrepAck(dir->ino());
+  int auth = dir->authority();
+
+  for (CDir_map_t::iterator p = dir->begin(); p != dir->end(); p++) {
+    CInode *in = p->second->inode;
+
+    // only open subdirs...
+    if (!in->is_dir() || !in->dir) continue;
+
+    // ...whose dentry hashes to me
+    int target = mds->mdcache->hash_dentry( dir->ino(), p->first );
+    if (target != mds->get_nodeid()) continue;
+
+    ack->add_inode(p->first, in->replicate_to(auth));
+  }
+
+  // ack
+  mds->send_message_mds(ack, auth, MDS_PORT_MIGRATOR);
+}
+
+
+
+/*
+ * non-auth: peer needs to send hashed dir content back to auth.
+ * unhash dir.
+ *
+ *  - encode every dentry that hashes to us into an MUnhashDirAck
+ *  - leave the inodes we send away behind as pinned proxies
+ *  - fix up dir_auth / import / export state of open subdirs
+ *  - clear the HASHED state and reset the hash seed
+ *  - set up the gather set for the notifies that follow, and reply
+ *
+ * the incoming message is consumed (deleted) here.
+ */
+void Migrator::handle_unhash_dir(MUnhashDir *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+
+  dout(7) << "handle_unhash_dir " << *dir << endl;//" .. hash_seed is " << dir->inode->inode.hash_seed << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_unhashing());
+  assert(!dir->is_auth());
+
+  // get message ready
+  bufferlist bl;
+  int nden = 0;
+
+  // suck up all waiters
+  // NOTE(review): fin only ever has waiters moved into it; nothing in this
+  // function finishes, queues or deletes it.  confirm who is meant to own it.
+  C_Contexts *fin = new C_Contexts;
+  list<Context*> waiting;
+  dir->take_waiting(CDIR_WAIT_ANY, waiting);    // all dir waiters
+  fin->take(waiting);
+
+  // divy up contents
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       it++) {
+    CDentry *dn = it->second;
+    CInode *in = dn->inode;
+
+    int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
+    if (dentryhashcode != mds->get_nodeid()) {
+      // not mine!
+      // twiddle dir_auth?
+      if (in->dir) {
+        if (in->dir->authority() != dir->authority())
+          in->dir->set_dir_auth( in->dir->authority() );
+        else
+          in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+      }
+      continue;
+    }
+
+    // -- dentry
+    dout(7) << "unhash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl;
+    _encode(it->first, bl);
+
+    // null dentry?
+    if (dn->is_null()) {
+      bl.append("N", 1);  // null dentry
+      assert(dn->is_sync());
+      continue;
+    }
+
+    if (dn->is_remote()) {
+      // remote link
+      bl.append("L", 1);  // remote link
+
+      inodeno_t ino = dn->get_remote_ino();
+      bl.append((char*)&ino, sizeof(ino));
+      continue;
+    }
+
+    // primary link
+    // -- inode
+    bl.append("I", 1);    // inode dentry
+
+    encode_export_inode(in, bl, dentryhashcode);  // encode, and (update state for) export
+    nden++;
+
+    if (dn->is_dirty())
+      dn->mark_clean();
+
+    // proxy
+    in->state_set(CInode::STATE_PROXY);
+    in->get(CInode::PIN_PROXY);
+    hash_proxy_inos[dir].push_back(in);
+
+    if (in->dir) {
+      if (in->dir->is_auth()) {
+        // mine.  make it into an import.
+        dout(7) << "making subdir into import " << *in->dir << endl;
+        in->dir->set_dir_auth( mds->get_nodeid() );
+        cache->imports.insert(in->dir);
+        in->dir->get(CDir::PIN_IMPORT);
+        in->dir->state_set(CDIR_STATE_IMPORT);
+      }
+      else {
+        // not mine.
+        dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl;
+        assert(in->dir->is_export());
+        in->dir->put(CDir::PIN_EXPORT);
+        in->dir->state_clear(CDIR_STATE_EXPORT);
+        cache->exports.erase(in->dir);
+        cache->nested_exports[dir].erase(in->dir);
+      }
+    }
+
+    // waiters
+    list<Context*> waiters;
+    in->take_waiting(CINODE_WAIT_ANY, waiters);
+    fin->take(waiters);
+  }
+
+  // we should have no nested exports; we're not auth for the dir!
+  assert(cache->nested_exports[dir].empty());
+  cache->nested_exports.erase(dir);
+
+  // dir state
+  //dir->state_clear(CDIR_STATE_UNHASHING);  // later
+  dir->state_clear(CDIR_STATE_HASHED);
+  dir->put(CDir::PIN_HASHED);
+  cache->hashdirs.erase(dir);
+  dir->mark_clean();
+
+  // inode state
+  dir->inode->inode.hash_seed = 0;
+  if (dir->inode->is_auth()) {
+    dir->inode->_mark_dirty(); // fixme
+    mds->mdlog->submit_entry(new EString("unhash inode dirty fixme"));
+  }
+
+  // init gather set
+  // (everyone active but us; drained by handle_unhash_dir_notify)
+  mds->get_mds_map()->get_active_mds_set( hash_gather[dir] );
+  hash_gather[dir].erase(mds->get_nodeid());
+
+  // send unhash message
+  mds->send_message_mds(new MUnhashDirAck(dir->ino(), bl, nden),
+                        dir->authority(), MDS_PORT_MIGRATOR);
+
+  // done.  the request is not forwarded or stored anywhere, so free it
+  // here like every other handler that consumes its message.
+  delete m;
+}
+
+
+/*
+ * non-auth:
+ * first notify comes from auth.
+ *  send notifies to all other peers, with peer = self
+ * if we get notify from peer=other, remove from our gather list.
+ * when we've gotten notifies from everyone,
+ *  unpin proxies,
+ *  clear the unhashing state and unfreeze,
+ *  send notify_ack to auth.
+ * this ensures that all mds links are flushed of cache_expire type messages.
+ */
+void Migrator::handle_unhash_dir_notify(MUnhashDirNotify *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+
+  // log under this handler's own name (the label used to read
+  // "handle_unhash_dir_finish", which is not a handler that exists here)
+  dout(7) << "handle_unhash_dir_notify " << *dir << endl;
+  assert(!dir->is_hashed());
+  assert(dir->is_unhashing());
+  assert(!dir->is_auth());
+
+  int from = m->get_source().num();
+  assert(hash_gather[dir].count(from) == 1);
+  hash_gather[dir].erase(from);
+  delete m;
+
+  // did we send our shout out?
+  if (from == dir->authority()) {
+    // send notify to everyone else in weird chatter storm
+    for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+      if (i == from) continue;
+      if (i == mds->get_nodeid()) continue;
+      mds->send_message_mds(new MUnhashDirNotify(dir->ino()), i, MDS_PORT_MIGRATOR);
+    }
+  }
+
+  // are we done?
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
+    return;
+  }
+  hash_gather.erase(dir);
+
+  // all done!
+  dout(7) << "all mds links flushed, unpinning unhash proxies" << endl;
+
+  // unpin proxies
+  for (list<CInode*>::iterator it = hash_proxy_inos[dir].begin();
+       it != hash_proxy_inos[dir].end();
+       it++) {
+    CInode *in = *it;
+    assert(in->state_test(CInode::STATE_PROXY));
+    in->state_clear(CInode::STATE_PROXY);
+    in->put(CInode::PIN_PROXY);
+  }
+  // forget the list now that the pins are dropped (as hash_dir_finish does);
+  // otherwise the stale inode pointers would be walked again, and trip the
+  // STATE_PROXY assert, the next time this dir's proxies are unpinned.
+  hash_proxy_inos.erase(dir);
+
+  // state
+  // this is the last unhash step on a non-auth node, so clear the flag that
+  // unhash_dir_prep_finish() set.  left set, a later unhash of this dir
+  // would take the "already replied" early return there and never ack.
+  dir->state_clear(CDIR_STATE_UNHASHING);
+
+  // unfreeze
+  dir->unfreeze_dir();
+
+  // ack
+  dout(7) << "sending notify_ack to auth for unhash of " << *dir << endl;
+  mds->send_message_mds(new MUnhashDirNotifyAck(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR);
+
+}
}
assert(!in->is_auth());
- int auth = in->authority();
+ int auth = in->authority().first;
dout(7) << "request_inode_file_caps " << cap_string(wanted)
<< " was " << cap_string(in->replica_caps_wanted)
<< " on " << *in << " to mds" << auth << endl;
if (in->is_proxy()) {
dout(7) << "proxy, fw" << endl;
- mds->send_message_mds(m, in->authority(), MDS_PORT_LOCKER);
+ mds->send_message_mds(m, in->authority().first, MDS_PORT_LOCKER);
return;
}
} else {
// replica
// fw to auth
- int auth = in->authority();
+ int auth = in->authority().first;
dout(7) << "inode_hard_write_start " << *in << " on replica, fw to auth " << auth << endl;
assert(auth != mds->get_nodeid());
mdcache->request_forward(m, auth);
if (in->is_proxy()) {
// fw
- int newauth = in->authority();
+ int newauth = in->authority().first;
assert(newauth >= 0);
if (from == newauth) {
dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl;
if (in->filelock.is_stable()) {
// fw to auth
- int auth = in->authority();
+ int auth = in->authority().first;
dout(7) << "inode_file_read_start " << *in << " on replica and async, fw to auth " << auth << endl;
assert(auth != mds->get_nodeid());
mdcache->request_forward(m, auth);
} else {
// replica
// fw to auth
- int auth = in->authority();
+ int auth = in->authority().first;
dout(7) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl;
assert(auth != mds->get_nodeid());
mdcache->request_forward(m, auth);
// ack
MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid());
reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER);
+ mds->send_message_mds(reply, in->authority().first, MDS_PORT_LOCKER);
}
break;
// ack
MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER);
+ mds->send_message_mds(reply, in->authority().first, MDS_PORT_LOCKER);
}
break;
if (in->is_proxy()) {
// fw
- int newauth = in->authority();
+ int newauth = in->authority().first;
assert(newauth >= 0);
if (from == newauth) {
dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl;
{
dout(10) << "dentry_xlock_request on dn " << dname << " create=" << create << " in " << *dir << endl;
// send request
- int dauth = dir->dentry_authority(dname);
+ int dauth = dir->dentry_authority(dname).first;
MLock *m = new MLock(create ? LOCK_AC_REQXLOCKC:LOCK_AC_REQXLOCK, mds->get_nodeid());
m->set_dn(dir->ino(), dname);
mds->send_message_mds(m, dauth, MDS_PORT_LOCKER);
// normally we have it always
if (diri && dir) {
- int dauth = dir->dentry_authority(dname);
+ int dauth = dir->dentry_authority(dname).first;
assert(dauth == mds->get_nodeid() || dir->is_proxy() || // mine or proxy,
m->get_action() == LOCK_AC_REQXLOCKACK || // or we did a REQXLOCK and this is our ack/nak
m->get_action() == LOCK_AC_REQXLOCKNAK);
it++) {
CDir *im = *it;
if (im->inode->is_root()) continue;
- int from = im->inode->authority();
+ int from = im->inode->authority().first;
import_map[from] += im->popularity[MDS_POP_CURDOM].meta_load();
}
mds_import_map[ mds->get_nodeid() ] = import_map;
if (!dir->is_auth()) continue;
dout(0) << "do_hashing hashing " << *dir << endl;
- mds->mdcache->migrator->hash_dir(dir);
+ //mds->mdcache->migrator->hash_dir(dir);
}
hash_queue.clear();
}
dout(-5) << " exporting idle import " << **it
<< " back to mds" << (*it)->inode->authority()
<< endl;
- mds->mdcache->migrator->export_dir(*it, (*it)->inode->authority());
+ mds->mdcache->migrator->export_dir(*it, (*it)->inode->authority().first);
continue;
}
import_pop_map[ pop ] = *it;
- int from = (*it)->inode->authority();
+ int from = (*it)->inode->authority().first;
dout(15) << " map: i imported " << **it << " from " << from << endl;
import_from_map.insert(pair<int,CDir*>(from, *it));
}
if (dir->is_hashed()) continue;
if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress
double pop = dir->popularity[MDS_POP_CURDOM].meta_load();
- assert(dir->inode->authority() == target); // cuz that's how i put it in the map, dummy
+ assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy
if (pop <= amount-have) {
dout(-5) << "reexporting " << *dir
<< " back to mds" << imp->inode->authority()
<< endl;
have += pop;
- mds->mdcache->migrator->export_dir(imp, imp->inode->authority());
+ mds->mdcache->migrator->export_dir(imp, imp->inode->authority().first);
}
if (amount-have < MIN_OFFLOAD) break;
}
if (migrator->is_importing(im->ino())) {
// ambiguous (mid-import)
m->add_ambiguous_import(im->ino(),
- migrator->get_import_bounds(im->ino()));
+ migrator->get_import_bound_inos(im->ino()));
} else {
// not ambiguous.
m->add_import(im->ino());
CDir *ex = exi->dir;
if (!ex) continue;
- if (ex->get_dir_auth() == CDIR_AUTH_PARENT)
+ if (ex->get_dir_auth().first == CDIR_AUTH_PARENT)
ex->set_dir_auth(CDIR_AUTH_UNKNOWN);
}
}
CDir *dir = diri->dir;
if (!dir) continue;
- if (dir->authority() >= CDIR_AUTH_UNKNOWN) {
+ if (dir->authority().first >= CDIR_AUTH_UNKNOWN) {
dout(10) << "mds" << who << " did not import " << *dir << endl;
} else {
dout(10) << "mds" << who << " did import " << *dir << endl;
- int was = dir->authority();
+ int was = dir->authority().first;
dir->set_dir_auth(who);
for (set<inodeno_t>::iterator r = q->second.begin();
if (!exi) continue;
CDir *ex = exi->dir;
if (!ex) continue;
- if (ex->get_dir_auth() == CDIR_AUTH_PARENT)
+ if (ex->get_dir_auth().first == CDIR_AUTH_PARENT)
ex->set_dir_auth(was);
dout(10) << " bound " << *ex << endl;
}
CDir *dir = diri->dir;
if (!dir) continue;
- if (dir->authority() != CDIR_AUTH_UNKNOWN) {
+ if (dir->authority().first != CDIR_AUTH_UNKNOWN) {
dout(10) << "ambiguous import auth known, must not be me " << *dir << endl;
cancel_ambiguous_import(q->first);
} else {
void MDCache::finish_ambiguous_import(inodeno_t dirino)
{
assert(my_ambiguous_imports.count(dirino));
- set<inodeno_t> bounds;
- bounds.swap(my_ambiguous_imports[dirino]);
+ set<inodeno_t> bound_inos;
+ bound_inos.swap(my_ambiguous_imports[dirino]);
my_ambiguous_imports.erase(dirino);
dout(10) << "finish_ambiguous_import " << dirino
- << " bounds " << bounds
+ << " bounds " << bound_inos
<< endl;
CInode *diri = get_inode(dirino);
CDir *dir = diri->dir;
assert(dir);
- // adjust dir_auth
- CDir *im = dir;
- if (dir->get_inode()->authority() == mds->get_nodeid()) {
- // parent is already me. adding to existing import.
- im = get_auth_container(dir);
- if (!im) im = dir;
- nested_exports[im].erase(dir);
- exports.erase(dir);
- dir->set_dir_auth( CDIR_AUTH_PARENT );
- dir->state_clear(CDIR_STATE_EXPORT);
- dir->put(CDir::PIN_EXPORT);
- } else {
- // parent isn't me. new import.
- imports.insert(dir);
- dir->set_dir_auth( mds->get_nodeid() );
- dir->state_set(CDIR_STATE_IMPORT);
- dir->get(CDir::PIN_IMPORT);
- }
-
- dout(10) << " base " << *dir << endl;
- if (dir != im)
- dout(10) << " under " << *im << endl;
-
- // bounds (exports, before)
- for (set<inodeno_t>::iterator p = bounds.begin();
- p != bounds.end();
+ // make bounds list
+ set<CDir*> bounds;
+ for (set<inodeno_t>::iterator p = bound_inos.begin();
+ p != bound_inos.end();
++p) {
CInode *bi = get_inode(*p);
assert(bi);
CDir *bd = bi->dir;
assert(bd);
-
- if (bd->get_dir_auth() == mds->get_nodeid()) {
- // still me. was an import.
- imports.erase(bd);
- bd->set_dir_auth( CDIR_AUTH_PARENT );
- bd->state_clear(CDIR_STATE_IMPORT);
- bd->put(CDir::PIN_IMPORT);
- // move nested exports.
- for (set<CDir*>::iterator q = nested_exports[bd].begin();
- q != nested_exports[bd].end();
- ++q)
- nested_exports[im].insert(*q);
- nested_exports.erase(bd);
-
- } else {
- // not me anymore. now an export.
- exports.insert(bd);
- nested_exports[im].insert(bd);
- //hrm. assert(bd->get_dir_auth() != CDIR_AUTH_PARENT);
- bd->set_dir_auth( CDIR_AUTH_UNKNOWN );
- bd->state_set(CDIR_STATE_EXPORT);
- bd->get(CDir::PIN_EXPORT);
- }
-
- dout(10) << " bound " << *bd << endl;
+ bounds.insert(bd);
}
+
+ // adjust dir_auth, import maps
+ import_subtree(dir, bounds);
}
void MDCache::finish_ambiguous_export(inodeno_t dirino, set<inodeno_t>& bounds)
// adjust dir_auth
CDir *im = get_auth_container(dir);
- if (dir->get_inode()->authority() == CDIR_AUTH_UNKNOWN) {
+ if (dir->get_inode()->authority().first == CDIR_AUTH_UNKNOWN) {
// was an import, hose it
assert(im == dir);
assert(imports.count(dir));
dir->state_set(CDIR_STATE_EXPORT);
dir->get(CDir::PIN_EXPORT);
}
- dout(10) << " base " << *dir << endl;
+ dout(10) << " root " << *dir << endl;
if (dir != im)
dout(10) << " under " << *im << endl;
nested_exports[im].erase(bd);
// fix dir_auth
- assert(bd->get_dir_auth() != CDIR_AUTH_PARENT);
+ assert(bd->get_dir_auth().first != CDIR_AUTH_PARENT);
bd->set_dir_auth( CDIR_AUTH_PARENT ); // not me
bd->state_clear(CDIR_STATE_EXPORT);
+/** import_subtree
+ * take over the subtree rooted at 'root' and delimited by 'bounds':
+ *  - adjust dir_auth.first on root and on bounds that were imports
+ *  - adjust import/export/nested_export maps, state bits and pins to match
+ */
+void MDCache::import_subtree(CDir *root, set<CDir*>& bounds)
+{
+  dout(7) << "import_subtree_start " << *root << endl;
+
+  // the import this subtree ends up under; root itself unless we merge upward.
+  CDir *container = root;
+
+  // root
+  if (!root->inode->is_auth()) {
+    // parent isn't me.  new import.
+    imports.insert(root);
+    root->set_dir_auth(mds->get_nodeid(), true);
+    root->state_set(CDIR_STATE_IMPORT);
+    root->get(CDir::PIN_IMPORT);
+  } else {
+    // parent is already me.  root was an export; fold it back into the
+    // import that contains it.
+    container = get_auth_container(root);
+    assert(container);
+    nested_exports[container].erase(root);
+    exports.erase(root);
+    root->set_dir_auth(CDIR_AUTH_PARENT, true);
+    root->state_clear(CDIR_STATE_EXPORT);
+    root->put(CDir::PIN_EXPORT);
+  }
+
+  dout(10) << " root " << *root << endl;
+  if (container != root)
+    dout(10) << " under " << *container << endl;
+
+  // i should have no pins in this region.
+  assert(root->get_cum_auth_pins() == 0);
+
+  // bounds
+  for (set<CDir*>::iterator b = bounds.begin(); b != bounds.end(); ++b) {
+    CDir *bound = *b;
+
+    if (!bound->is_import()) {
+      // not me anymore.  now an export.
+      exports.insert(bound);
+      nested_exports[container].insert(bound);
+      assert(bound->get_dir_auth().first != CDIR_AUTH_PARENT);
+      bound->state_set(CDIR_STATE_EXPORT);
+      bound->get(CDir::PIN_EXPORT);
+    } else {
+      // bound is still me.  was an import.
+      imports.erase(bound);
+      bound->set_dir_auth(CDIR_AUTH_PARENT, true);
+      bound->state_clear(CDIR_STATE_IMPORT);
+      bound->put(CDir::PIN_IMPORT);
+
+      // move nested exports under this bound to my subtree root.
+      set<CDir*>& beneath = nested_exports[bound];
+      if (!beneath.empty())
+        nested_exports[container].insert(beneath.begin(), beneath.end());
+      nested_exports.erase(bound);
+    }
+
+    dout(10) << " bound " << *bound << endl;
+  }
+}
+
+// import_subtree_finish
+//  currently an empty stub: nothing is done with root or bounds.
+//  NOTE(review): presumably the second half of import_subtree -- confirm intent.
+void MDCache::import_subtree_finish(CDir *root, set<CDir*>& bounds)
+{
+
+}
+
+/** export_subtree
+ * hand the subtree rooted at 'root' and delimited by 'bounds' to mds 'dest':
+ *  - root either stops being an import or becomes an export
+ *  - every bound must currently be an export; it is removed from the export maps
+ *  - dir_auth on root and bounds becomes CDIR_AUTH_PARENT or dest
+ */
+void MDCache::export_subtree(CDir *root, set<CDir*>& bounds, int dest)
+{
+ dout(7) << "export_subtree " << *root << endl;
+
+ // NOTE(review): unlike import_subtree, im is not asserted non-null here.
+ CDir *im = get_auth_container(root);
+
+ // root
+ if (root->is_import()) {
+ // was an import, hose it
+ assert(im == root);
+ assert(imports.count(root));
+ imports.erase(root);
+ root->set_dir_auth(CDIR_AUTH_PARENT);
+ root->state_clear(CDIR_STATE_IMPORT);
+ root->put(CDir::PIN_IMPORT);
+ } else {
+ // i'm now an export
+ exports.insert(root);
+ nested_exports[im].insert(root);
+ root->set_dir_auth( dest ); // not me
+ root->state_set(CDIR_STATE_EXPORT);
+ root->get(CDir::PIN_EXPORT);
+ }
+
+ // fix dir_auth
+ // if root's inode already belongs to dest, root just inherits from its parent.
+ // NOTE(review): both branches above already called set_dir_auth(); the value
+ // chosen here replaces it -- confirm the earlier calls are still needed.
+ if (root->inode->authority().first == dest)
+ root->set_dir_auth( CDIR_AUTH_PARENT );
+ else
+ root->set_dir_auth( dest );
+
+ dout(10) << " root " << *root << endl;
+ if (root != im)
+ dout(10) << " under " << *im << endl;
+
+ // bounds (there were exports, before)
+ for (set<CDir*>::iterator p = bounds.begin();
+ p != bounds.end();
+ ++p) {
+ CDir *bd = *p;
+
+ // hose export
+ assert(exports.count(bd));
+ exports.erase(bd);
+ nested_exports[im].erase(bd);
+
+ bd->state_clear(CDIR_STATE_EXPORT);
+ bd->put(CDir::PIN_EXPORT);
+
+ // fix dir_auth
+ // a bound already owned by dest collapses to CDIR_AUTH_PARENT; any other
+ // bound is set to dest as well.
+ // NOTE(review): the else branch overwrites a third party's auth with dest -- confirm.
+ assert(bd->get_dir_auth().first != CDIR_AUTH_PARENT);
+ if (bd->get_dir_auth().first == dest)
+ bd->set_dir_auth(CDIR_AUTH_PARENT);
+ else
+ bd->set_dir_auth(dest);
+
+ dout(10) << " bound " << *bd << endl;
+ }
+}
+
+
+/*
+ * adjust the dir_auth of a subtree.
+ * merge with parent and/or child subtrees, if it is appropriate.
+ *
+ *  dir  - dir whose dir_auth is being changed
+ *  auth - new (first, second) auth pair for dir
+ *
+ * subtree_bounds maps each subtree root to the roots of the subtrees nested
+ * directly beneath it; it is kept in step with the merges done here.
+ */
+void MDCache::adjust_subtree_auth(CDir *dir, pair<int,int> auth)
+{
+  dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
+          << " under " << *dir << endl;
+
+  // note my current bounds.  (a copy: subtree_bounds is modified below.)
+  set<CDir*> bounds = subtree_bounds[dir];
+
+  // join with parent?
+  CDir *root = dir;
+  if (dir->ino() != 1)      // ino 1 is the root; it has no parent to join.
+    root = get_subtree_root(dir->get_parent_dir());
+
+  if (root != dir && root->get_dir_auth() == auth) {
+    // join the subtrees.
+    dir->set_dir_auth(CDIR_AUTH_DEFAULT);
+    dout(10) << " merge with parent " << *root << endl;
+
+    // dir is no longer a subtree: drop its own bound set, and stop
+    // treating it as a bound of the parent subtree.
+    subtree_bounds.erase(dir);
+    subtree_bounds[root].erase(dir);
+
+    // move our bounds under new root
+    subtree_bounds[root].insert(bounds.begin(), bounds.end());
+  } else {
+    // don't merge with parent, just update our auth.
+    dir->set_dir_auth(auth);
+
+    // dir stays its own subtree, so any child merged below joins dir,
+    // not the parent's subtree.
+    // NOTE(review): if dir was not a subtree root before, it is not added
+    // to the parent's bound set here -- confirm who does that.
+    root = dir;
+  }
+
+  // bounds
+  for (set<CDir*>::iterator p = bounds.begin();
+       p != bounds.end();
+       ++p) {
+    CDir *bound = *p;
+
+    if (bound->get_dir_auth() == auth) {
+      // merge with child.
+      dout(10) << " merging bound " << *bound << endl;
+      bound->set_dir_auth(CDIR_AUTH_DEFAULT);
+
+      // move child's children under root, and forget the child's own set.
+      map<CDir*,set<CDir*> >::iterator sub = subtree_bounds.find(bound);
+      if (sub != subtree_bounds.end()) {
+        subtree_bounds[root].insert(sub->second.begin(), sub->second.end());
+        subtree_bounds.erase(sub);
+      }
+
+      // bound is no longer a subtree.
+      subtree_bounds[root].erase(bound);
+    } else {
+      // don't merge.
+      dout(10) << " bound " << *bound << endl;
+    }
+  }
+}
+
+
/*
* rejoin phase!
++p) {
if (!p->second->is_dir()) continue;
if (!p->second->dir) continue;
- if (p->second->dir->get_dir_auth() == CDIR_AUTH_PARENT) continue;
+ if (p->second->dir->get_dir_auth().first == CDIR_AUTH_PARENT) continue;
- int auth = p->second->dir->get_dir_auth();
+ int auth = p->second->dir->get_dir_auth().first;
assert(auth >= 0);
if (auth == mds->get_nodeid()) continue; // skip my own regions!
p != dir_auth_regions.end();
++p) {
CDir *dir = *p;
- int to = dir->authority();
+ int to = dir->authority().first;
cache_rejoin_walk(dir, rejoins[to]);
}
// dir?
if (in->dir &&
- in->dir->get_dir_auth() == CDIR_AUTH_PARENT)
+ in->dir->get_dir_auth().first == CDIR_AUTH_PARENT)
nested.push_back(in->dir);
}
}
p != inode_map.end();
++p) {
CInode *in = p->second;
- if (in->authority() == mds->get_nodeid())
+ if (in->authority().first == mds->get_nodeid())
in->state_set(CInode::STATE_AUTH);
else {
in->state_clear(CInode::STATE_AUTH);
}
if (in->parent) {
- if (in->parent->authority() == mds->get_nodeid())
+ if (in->parent->authority().first == mds->get_nodeid())
in->parent->state_set(CDentry::STATE_AUTH);
else {
in->parent->state_clear(CDentry::STATE_AUTH);
}
if (in->dir) {
- if (in->dir->authority() == mds->get_nodeid())
+ if (in->dir->authority().first == mds->get_nodeid())
in->dir->state_set(CDIR_STATE_AUTH);
else {
in->dir->state_clear(CDIR_STATE_AUTH);
map<int, MCacheExpire*> expiremap;
-
// DENTRIES from the LRU
while (lru.lru_get_size() > (unsigned)max) {
CDir *dir = dn->get_dir();
assert(dir);
- CDir *con = get_realm_root(dir);
+ CDir *con = get_subtree_root(dir);
assert(con);
dout(12) << "trim removing " << *dn << endl;
dout(12) << " in container " << *con << endl;
// notify dentry authority?
- int auth2 = CDIR_AUTH_UNKNOWN;
- int auth = CDIR_AUTH_UNKNOWN;
if (!dn->is_auth()) {
- auth = dn->authority(&auth2);
-
- dout(12) << " sending expire to mds" << auth << " on " << *dn << endl;
- assert(auth != mds->get_nodeid());
- if (expiremap.count(auth) == 0)
- expiremap[auth] = new MCacheExpire(mds->get_nodeid());
- expiremap[auth]->add_dentry(con->ino(), dir->ino(), dn->get_name(), dn->get_replica_nonce());
-
- if (auth2 >= 0 && auth2 != mds->get_nodeid()) {
- dout(12) << " sending expire2 to mds" << auth2 << " on " << *dn << endl;
- if (expiremap.count(auth2) == 0)
- expiremap[auth2] = new MCacheExpire(mds->get_nodeid());
- expiremap[auth2]->add_dentry(con->ino(), dir->ino(), dn->get_name(), dn->get_replica_nonce());
+      pair<int,int> auth = dn->authority();
+
+      // expire goes to auth.first, and also to auth.second when that is a
+      // valid, distinct mds other than ourselves.
+      for (int i = 0; i < 2; i++) {
+        int a = i ? auth.second : auth.first;
+        if (a < 0) continue;                 // no such auth
+        if (i && (a == auth.first || a == mds->get_nodeid()))
+          continue;                          // nothing extra to tell
+
+        dout(12) << " sending expire to mds" << a << " on " << *dn << endl;
+        assert(a != mds->get_nodeid());
+        if (expiremap.count(a) == 0)
+          expiremap[a] = new MCacheExpire(mds->get_nodeid());
+        expiremap[a]->add_dentry(con->ino(), dir->ino(), dn->get_name(), dn->get_replica_nonce());
+      }
}
// expire the inode, too.
CInode *in = dn->get_inode();
assert(in);
- assert(in->get_num_ref() == 0);
-
- // DIR
- if (in->dir) {
- if (!in->dir->is_auth()) {
- int dirauth2;
- int dirauth = in->dir->authority(&dirauth2);
-
- assert(dirauth2 < 100); // hack die bug die
-
- // was this an auth delegation? (if so, slightly modified container)
- CDir *dcon = con;
- if (in->dir->dir_auth >= 0 ||
- in->dir->dir_auth_pending >= 0) {
- dout(12) << " for just this dir, the container is " << *dcon << endl;
- dcon = in->dir;
- }
-
- dout(12) << " sending expire to mds" << dirauth << " on " << *in->dir << endl;
- assert(dirauth != mds->get_nodeid());
- if (expiremap.count(dirauth) == 0)
- expiremap[dirauth] = new MCacheExpire(mds->get_nodeid());
- expiremap[dirauth]->add_dir(dcon->ino(), in->ino(), in->dir->replica_nonce);
-
- if (dirauth2 >= 0 && dirauth2 != mds->get_nodeid()) {
- dout(12) << " sending expire2 to mds" << dirauth2 << " on " << *in->dir << endl;
- if (expiremap.count(dirauth2) == 0)
- expiremap[dirauth2] = new MCacheExpire(mds->get_nodeid());
- expiremap[dirauth2]->add_dir(dcon->ino(), in->ino(), in->dir->replica_nonce);
- }
- }
-
- in->close_dir();
- }
-
- // INODE
- if (!in->is_auth()) {
- assert(auth >= 0);
-
- dout(12) << " sending expire to mds" << auth << " on " << *in << endl;
- assert(auth != mds->get_nodeid());
- if (expiremap.count(auth) == 0)
- expiremap[auth] = new MCacheExpire(mds->get_nodeid());
- expiremap[auth]->add_inode(con->ino(), in->ino(), in->get_replica_nonce());
-
- if (auth2 >= 0 && auth2 != mds->get_nodeid()) {
- dout(12) << " sending expire2 to mds" << auth2 << " on " << *in << endl;
- if (expiremap.count(auth2) == 0)
- expiremap[auth2] = new MCacheExpire(mds->get_nodeid());
- expiremap[auth2]->add_inode(con->ino(), in->ino(), in->get_replica_nonce());
- }
- }
-
- dout(15) << " trim removing " << *in << endl;
- if (in == root) root = 0;
-
- // unlink
- dir->unlink_inode(dn);
- remove_inode(in);
+ trim_inode(dn, in, con->ino(), expiremap);
}
else {
assert(dn->is_null());
if (mds->logger) mds->logger->inc("cex");
}
+ // root inode+dir?
+ while (max == 0 && // only if we're trimming everything!
+ lru.lru_get_size() == 0 &&
+ root &&
+ root->get_num_ref() == 0 &&
+ root->dir &&
+ root->dir->get_num_ref() == 0)
+ trim_inode(0, root, 1, expiremap);
+
// send expires
for (map<int, MCacheExpire*>::iterator it = expiremap.begin();
it != expiremap.end();
mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE);
}
-
return true;
}
+/** trim_inode
+ * expire one unreferenced inode (and its dir, if open) from the cache.
+ *
+ *  dn        - dentry linking the inode, or 0 (trim passes 0 for the root inode)
+ *  in        - inode to remove; must have no references
+ *  conino    - ino of the containing subtree root, recorded in the expires
+ *  expiremap - per-mds MCacheExpire being accumulated; entries are created
+ *              here on demand and sent by the caller
+ *
+ * for a non-auth dir/inode, an expire is queued for authority().first, and
+ * also for authority().second when that is a valid, distinct mds other than
+ * ourselves.
+ */
+void MDCache::trim_inode(CDentry *dn, CInode *in, inodeno_t conino, map<int, MCacheExpire*>& expiremap)
+{
+  assert(in->get_num_ref() == 0);
+
+  // DIR
+  pair<int,int> dirauth = CDIR_AUTH_UNDEF;
+  if (in->dir) {
+    if (!in->dir->is_auth()) {
+      dirauth = in->dir->authority();
+      assert(dirauth.second < 100);  // hack die bug die
+
+      // was this an auth delegation?  (if so, slightly modified container)
+      inodeno_t dconino = conino;
+      if (in->dir->is_subtree_root()) {
+        dout(12) << " for just this dir, the container is " << *in->dir << endl;
+        dconino = in->ino();
+      }
+
+      for (int i = 0; i < 2; i++) {
+        int a = i ? dirauth.second : dirauth.first;
+        if (a < 0) continue;                 // no such auth
+        if (i && (a == dirauth.first || a == mds->get_nodeid()))
+          continue;                          // nothing extra to tell
+
+        dout(12) << " sending expire to mds" << a << " on " << *in->dir << endl;
+        assert(a != mds->get_nodeid());
+        if (expiremap.count(a) == 0)
+          expiremap[a] = new MCacheExpire(mds->get_nodeid());
+        expiremap[a]->add_dir(dconino, in->ino(), in->dir->replica_nonce);
+      }
+    }
+
+    in->close_dir();
+  }
+
+  // INODE
+  if (!in->is_auth()) {
+    pair<int,int> auth = in->authority();
+    if (auth.first < 0) {  // e.g., root
+      assert(in->ino() == 1);
+      auth = dirauth;      // fall back to whoever was auth for its dir
+    }
+
+    for (int i = 0; i < 2; i++) {
+      int a = i ? auth.second : auth.first;
+      if (a < 0) continue;                   // no such auth
+      if (i && (a == auth.first || a == mds->get_nodeid()))
+        continue;                            // nothing extra to tell
+
+      dout(12) << " sending expire to mds" << a << " on " << *in << endl;
+      assert(a != mds->get_nodeid());
+      if (expiremap.count(a) == 0)
+        expiremap[a] = new MCacheExpire(mds->get_nodeid());
+      expiremap[a]->add_inode(conino, in->ino(), in->get_replica_nonce());
+    }
+  }
+
+  dout(15) << " trim removing " << *in << endl;
+
+  // unlink
+  if (dn)
+    dn->get_dir()->unlink_inode(dn);
+  remove_inode(in);
+  if (in == root) root = 0;
+}
+
void MDCache::trim_non_auth()
{
CDir *dir = *it;
if (!dir->is_auth()) continue;
if (dir->is_unhashing()) continue;
- migrator->unhash_dir(dir);
+ //migrator->unhash_dir(dir);
}
dout(7) << "waiting for dirs to unhash" << endl;
// close root?
- if (mds->get_nodeid() == 0 &&
- lru.lru_get_size() == 0 &&
- root &&
- root->dir &&
- root->dir->is_import() &&
- root->dir->get_num_ref() == 1) { // 1 is the import!
- // un-import
- dout(7) << "removing root import" << endl;
- imports.erase(root->dir);
- root->dir->state_clear(CDIR_STATE_IMPORT);
- root->dir->put(CDir::PIN_IMPORT);
-
- if (root->is_pinned_by(CInode::PIN_DIRTY)) {
- dout(7) << "clearing root inode dirty flag" << endl;
- root->put(CInode::PIN_DIRTY);
+ if (lru.lru_get_size() == 0 &&
+ root &&
+ root->dir) {
+
+ if (root->dir->is_import()) {
+ // un-import
+ dout(7) << "removing root import" << endl;
+ imports.erase(root->dir);
+ root->dir->state_clear(CDIR_STATE_IMPORT);
+ root->dir->put(CDir::PIN_IMPORT);
+
+ if (root->is_pinned_by(CInode::PIN_DIRTY)) {
+ dout(7) << "clearing root inode dirty flag" << endl;
+ root->put(CInode::PIN_DIRTY);
+ }
}
- trim(0);
+ // ignore root inode/dir on other nodes, since it's empty anyway.
}
// imports?
cur->ino(),
want,
true), // need this dir too
- cur->authority(), MDS_PORT_CACHE);
+ cur->authority().first, MDS_PORT_CACHE);
}
cur->add_waiter(CINODE_WAIT_DIR, ondelay);
if (onfinish) delete onfinish;
// MISS. don't have it.
- int dauth = cur->dir->dentry_authority( path[depth] );
+ int dauth = cur->dir->dentry_authority( path[depth] ).first;
dout(12) << "traverse: miss on dentry " << path[depth] << " dauth " << dauth << " in " << *cur->dir << endl;
diri->ino(),
want,
true), // need the dir open
- diri->authority(), MDS_PORT_CACHE);
-
+ diri->authority().first, MDS_PORT_CACHE);
+
diri->add_waiter(CINODE_WAIT_DIR, fin);
}
dout(7) << "request_cleanup sending unxlock for foreign xlock on " << *dn << endl;
assert(dn->is_xlocked());
- int dauth = dn->dir->dentry_authority(dn->name);
+ int dauth = dn->dir->dentry_authority(dn->name).first;
MLock *m = new MLock(LOCK_AC_UNXLOCK, mds->get_nodeid());
m->set_dn(dn->dir->ino(), dn->name);
mds->send_message_mds(m, dauth, MDS_PORT_CACHE);
if (!in->is_auth()) {
dout(7) << "handle_inode_link not auth for " << *in << ", fw to auth" << endl;
- mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE);
+ mds->send_message_mds(m, in->authority().first, MDS_PORT_CACHE);
return;
}
// crazyness?
if (!cur->dir && !cur->is_auth()) {
- int iauth = cur->authority();
+ int iauth = cur->authority().first;
dout(7) << "no dir and not inode auth; fwd to auth " << iauth << endl;
mds->send_message_mds( dis, iauth, MDS_PORT_CACHE);
return;
if (dis->get_want().depth() == 0) break;
// lookup dentry
- int dentry_auth = cur->dir->dentry_authority( dis->get_dentry(i) );
+ int dentry_auth = cur->dir->dentry_authority( dis->get_dentry(i) ).first;
if (dentry_auth != mds->get_nodeid()) {
dout(7) << *cur->dir << "dentry " << dis->get_dentry(i) << " auth " << dentry_auth << ", i'm done." << endl;
break; // that's it for us!
if ((cur->is_auth() || cur->is_proxy() || cur->dir->is_proxy()) &&
!cur->dir->is_auth()) {
// fwd to dir auth
- int dirauth = cur->dir->authority();
+ int dirauth = cur->dir->authority().first;
if (dirauth == dis->get_asker()) {
dout(7) << "from (new?) dir auth, dropping (obsolete) discover on floor." << endl; // XXX FIXME is this right?
//assert(dis->get_asker() == dis->get_source()); //might be a weird other loop. either way, asker has it.
mds->anchorclient->destroy(dn->inode->ino(), NULL);
}
} else {
- int auth = dn->inode->authority();
+ int auth = dn->inode->authority().first;
dout(7) << "remote target is remote, sending unlink request to " << auth << endl;
mds->send_message_mds(new MInodeUnlink(dn->inode->ino(), mds->get_nodeid()),
// proxy?
if (in->is_proxy()) {
dout(7) << "handle_inode_unlink proxy on " << *in << endl;
- mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE);
+ mds->send_message_mds(m, in->authority().first, MDS_PORT_CACHE);
return;
}
assert(in->is_auth());
return imp;
}
-CDir *MDCache::get_realm_root(CDir *dir)
+CDir *MDCache::get_subtree_root(CDir *dir)
{
// find the underlying dir that delegates (or is about to delegate) auth
while (true) {
- if (dir->get_dir_auth() >= 0 ||
- dir->get_dir_auth_pending() >= 0)
+ if (dir->is_subtree_root())
return dir;
dir = dir->get_parent_dir();
if (!dir)
list<CInode*> inode_expire_queue; // inodes to delete
+ friend class MDS; // for inode_map, FIXME
// root
list<Context*> waiting_for_root;
set<CDir*> exports;
set<CDir*> hashdirs;
map<CDir*,set<CDir*> > nested_exports; // exports nested under imports _or_ hashdirs
+
+ // subtrees
+ map<CDir*,set<CDir*> > subtree_bounds; // nested bounds on subtrees.
- //void adjust_export(int to, CDir *root, set<CDir*>& bounds);
- //void adjust_import(int from, CDir *root, set<CDir*>& bounds);
+ // adjust subtree auth specification
+ // dir->dir_auth
+ // imports/exports/nested_exports
+ // join/split subtrees as appropriate
+ void import_subtree(CDir *root, set<CDir*>& bounds);
+ void import_subtree_finish(CDir *root, set<CDir*>& bounds);
+ void export_subtree(CDir *root, set<CDir*>& bounds, int dest);
+ void export_subtree_finish(CDir *root, set<CDir*>& bounds, int dest);
+
+ void adjust_subtree_auth(CDir *root, pair<int,int> auth);
// delayed cache expire
map<CDir*, map<int, MCacheExpire*> > delayed_expire; // import|export dir -> expire msg
void set_cache_size(size_t max) { lru.lru_set_max(max); }
size_t get_cache_size() { return lru.lru_get_size(); }
bool trim(int max = -1); // trim cache
+ void trim_inode(CDentry *dn, CInode *in, inodeno_t conino,
+ map<int,class MCacheExpire*>& expiremap);
void trim_non_auth(); // trim out trimmable non-auth items
// shutdown
public:
CDir *get_auth_container(CDir *in);
- CDir *get_realm_root(CDir *dr);
+ CDir *get_subtree_root(CDir *dir);
CDir *get_export_container(CDir *dir);
void find_nested_exports(CDir *dir, set<CDir*>& s);
void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);
// decode and process
mdsmap->decode(m->get_encoded());
-
+
// see who i am
whoami = mdsmap->get_inst_rank(messenger->get_myaddr());
if (oldwhoami != whoami) {
<< ", although i wanted " << mdsmap->get_state_name(want_state)
<< endl;
want_state = state;
+ }
+
+ // contemplate suicide
+ if (mdsmap->get_inst(whoami) != messenger->get_myinst()) {
+ dout(1) << "apparently i've been replaced by " << mdsmap->get_inst(whoami) << ", committing suicide." << endl;
+ exit(-1);
+ }
+ if (mdsmap->is_down(whoami)) {
+ dout(1) << "apparently i'm down. committing suicide." << endl;
+ exit(-1);
}
// now active?
if (m->get_source().is_mds()) {
int from = m->get_source().num();
if (!mdsmap->have_inst(from) ||
- mdsmap->get_inst(from) != m->get_source_inst()) {
+ mdsmap->get_inst(from) != m->get_source_inst() ||
+ mdsmap->is_down(from)) {
// bogus mds?
if (m->get_type() != MSG_MDS_MAP) {
- dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source()
+ dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source()
<< ", dropping" << endl;
delete m;
return;
}
+  // hack: thrash exports
+  //  each pass picks a random inode; if it has a dir we are auth for,
+  //  that dir is exported to a randomly chosen *other* mds from the set
+  //  returned for MDSMap::STATE_ACTIVE.
+  for (int i=0; i<g_conf.mds_thrash_exports; i++) {
+    set<int> s;
+    mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE);
+    if (s.size() < 2)
+      break;  // need peers for this to work.
+    if (mdcache->inode_map.empty())
+      break;  // nothing to pick from (rand() % 0 is undefined).
+
+    dout(7) << "mds thrashing exports pass " << (i+1) << "/" << g_conf.mds_thrash_exports << endl;
+
+    // pick a random dir inode
+    int n = rand() % mdcache->inode_map.size();
+    hash_map<inodeno_t,CInode*>::iterator p = mdcache->inode_map.begin();
+    while (n--) ++p;
+
+    CDir *dir = p->second->dir;
+    if (dir && dir->is_auth()) {
+      // pick a random destination that is not us.  s held at least two
+      // entries, so at least one remains after dropping ourselves.
+      s.erase(whoami);
+      int k = rand() % s.size();
+      set<int>::iterator q = s.begin();
+      while (k--) ++q;
+      int dest = *q;
+      mdcache->migrator->export_dir(dir,dest);
+    }
+  }
+
// hack: force hash root?
if (false &&
// hashing
+ /*
case MSG_MDS_HASHDIRDISCOVER:
handle_hash_dir_discover((MHashDirDiscover*)m);
break;
case MSG_MDS_UNHASHDIRNOTIFYACK:
handle_unhash_dir_notify_ack((MUnhashDirNotifyAck*)m);
break;
+ */
default:
assert(0);
return;
}
- int dest = dir->inode->authority();
+ int dest = dir->inode->authority().first;
// comment this out ot wreak havoc?
//if (mds->is_shutting_down()) dest = 0; // this is more efficient.
case EXPORT_LOGGINGSTART:
case EXPORT_PREPPING:
case EXPORT_WARNING:
- dout(10) << "state loggingstart|prepping|warning : logging EExportFinish(false)" << endl;
+ dout(10) << "state loggingstart|prepping|warning : unfreezing, logging EExportFinish(false)" << endl;
+ dir->unfreeze_tree();
mds->mdlog->submit_entry(new EExportFinish(dir,false));
- // logger will unfreeze.
break;
-
+
case EXPORT_EXPORTING:
dout(10) << "state exporting : logging EExportFinish(false), reversing, and unfreezing" << endl;
mds->mdlog->submit_entry(new EExportFinish(dir,false));
break;
case EXPORT_LOGGINGFINISH:
- dout(10) << "state loggingfinish : doing nothing, we were successful." << endl;
+ dout(10) << "state loggingfinish : just cleaning up, we were successful." << endl;
+ break;
+
+ case EXPORT_NOTIFYING:
+ dout(10) << "state notifying : just cleaning up, we were successful." << endl;
break;
default:
mds->queue_finished(export_finish_waiters[dir]);
export_finish_waiters.erase(dir);
- // send pending import_maps?
+ // send pending import_maps? (these need to go out when all exports have finished.)
mds->mdcache->send_pending_import_maps();
mds->mdcache->show_imports();
<< " on " << *dir << " to mds" << export_peer[dir]
<< endl;
export_warning_ack_waiting[dir].erase(who);
+ export_notify_ack_waiting[dir].erase(who); // they won't get a notify either.
if (export_warning_ack_waiting[dir].empty())
export_dir_go(dir);
}
}
+ if (p->second == EXPORT_NOTIFYING) {
+ // exporter is waiting for notify acks, fake it
+ if (export_notify_ack_waiting[dir].count(who)) {
+ dout(10) << "faking export_dir_notify_ack from mds" << who
+ << " on " << *dir << " to mds" << export_peer[dir]
+ << endl;
+ export_notify_ack_waiting[dir].erase(who);
+ if (export_notify_ack_waiting[dir].empty())
+ export_dir_finish(dir);
+ }
+ }
}
// next!
dir = diri->dir;
if (import_peer[dirino] == who) {
- switch (import_peer[dirino]) {
+ switch (import_state[dirino]) {
case IMPORT_DISCOVERED:
-
+ dout(10) << "state discovered : unpinning " << *diri << endl;
+ assert(diri);
+ // unpin base
+ diri->put(CInode::PIN_IMPORTING);
break;
- case IMPORT_PREPPING:
-
- break;
+ // NOTE: state order reversal + fall-thru, pay attention.
case IMPORT_PREPPED:
+ dout(10) << "state prepping : unpinning base+bounds, unfreezing, " << *dir << endl;
+ assert(dir);
+ dir->set_dir_auth_pending(CDIR_AUTH_UNKNOWN); // not anymore.
+
+ // unfreeze
+ dir->unfreeze_tree();
+
+ // fall-thru to unpin base+bounds
+ case IMPORT_PREPPING:
+ if (import_state[dirino] == IMPORT_PREPPING) {
+ dout(10) << "state prepping : unpinning base+bounds " << *dir << endl;
+ }
+ assert(dir);
+ // unpin base
+ dir->put(CDir::PIN_IMPORTING);
+ // unpin bounds
+ for (set<CDir*>::iterator it = import_bounds[dir].begin();
+ it != import_bounds[dir].end();
+ it++) {
+ CDir *bd = *it;
+ assert(bd->state_test(CDIR_STATE_IMPORTBOUND));
+ bd->state_clear(CDIR_STATE_IMPORTBOUND);
+ bd->put(CDir::PIN_IMPORTBOUND);
+ }
break;
- case IMPORT_LOGGINGSTART:
+ case IMPORT_LOGGINGSTART:
+ dout(10) << "state loggingstart : reversing import on " << *dir << endl;
+ assert(dir);
+ mds->mdlog->submit_entry(new EImportFinish(dir,false)); // log failure
+ reverse_import(dir);
break;
case IMPORT_ACKING:
// do nothing, exporter is no longer involved.
break;
}
+
+ import_state.erase(dirino);
+ import_peer.erase(dirino);
+ import_bound_inos.erase(dirino);
+ import_bounds.erase(dir);
+
+ mds->mdcache->show_imports();
+ mds->mdcache->show_cache();
}
// next!
show_imports();
+ // -- note/mark subtree bounds --
+ // also include traces to all nested exports.
+ cache->find_nested_exports(dir, export_bounds[dir]);
+ set<CDir*> &bounds = export_bounds[dir];
+
+ // note that dest an ambiguous auth for this subtree.
+ dir->set_dir_auth_pending(export_peer[dir]);
+
+ // generate prep message, log entry.
EExportStart *le = new EExportStart(dir, dest);
MExportDirPrep *prep = new MExportDirPrep(dir->inode);
prep->add_dir( new CDirDiscover(dir, dir->add_replica(dest)) );
le->metablob.add_dir( dir, false );
- // also include traces to all nested exports.
- set<CDir*> my_nested;
- cache->find_nested_exports(dir, my_nested);
- export_bounds[dir] = my_nested;
-
- for (set<CDir*>::iterator it = my_nested.begin();
- it != my_nested.end();
+ // check bounds
+ for (set<CDir*>::iterator it = bounds.begin();
+ it != bounds.end();
it++) {
CDir *exp = *it;
+
+ // pin it.
+ exp->get(CDir::PIN_EXPORTBOUND);
dout(7) << " including nested export " << *exp << " in prep" << endl;
if (export_state.count(dir) == 0 ||
export_state[dir] != EXPORT_PREPPING) {
// export must have aborted.
- dout(7) << "export must have aborted, unfreezing" << endl;
- dir->unfreeze_tree();
+ dout(7) << "export must have aborted" << endl;
+ delete m;
return;
}
- // note that dest an ambiguous auth for this subtree.
- dir->set_dir_auth_pending(export_peer[dir]);
-
// send warnings
assert(export_peer.count(dir));
int dest = export_peer[dir];
p != dir->replicas_end();
++p) {
if (p->first == dest) continue;
+ // Skip replicas that are neither active nor stopping.
+ // This must be &&: with || the test is true whenever the mds is not both
+ // active and stopping, so every bystander would be skipped and none warned.
+ if (!mds->mdsmap->is_active(p->first) &&
+     !mds->mdsmap->is_stopping(p->first))
+   continue;  // only if active (or stopping)
export_warning_ack_waiting[dir].insert(p->first);
export_notify_ack_waiting[dir].insert(p->first); // we'll eventually get a notifyack, too!
mds->send_message_mds(new MExportDirWarning(dir->ino(), export_peer[dir]),
if (export_state.count(dir) == 0 ||
export_state[dir] != EXPORT_WARNING) {
// export must have aborted.
- dout(7) << "export must have aborted, unfreezing" << endl;
- dir->unfreeze_tree();
+ dout(7) << "export must have aborted" << endl;
+ delete m;
return;
}
assert(dir->get_cum_auth_pins() == 0);
// update imports/exports
- CDir *containing_import = cache->get_auth_container(dir);
-
- if (containing_import == dir) {
- dout(7) << " i'm rexporting a previous import" << endl;
- assert(dir->is_import());
- cache->imports.erase(dir);
- dir->state_clear(CDIR_STATE_IMPORT);
- dir->put(CDir::PIN_IMPORT); // unpin, no longer an import
-
- // discard nested exports (that we're handing off
- for (set<CDir*>::iterator p = cache->nested_exports[dir].begin();
- p != cache->nested_exports[dir].end(); ) {
- CDir *nested = *p;
- p++;
-
- // nested beneath our new export *in; remove!
- dout(7) << " export " << *nested << " was nested beneath us; removing from export list(s)" << endl;
- assert(cache->exports.count(nested) == 1);
- cache->nested_exports[dir].erase(nested);
- }
-
- } else {
- dout(7) << " i'm a subdir nested under import " << *containing_import << endl;
- cache->exports.insert(dir);
- cache->nested_exports[containing_import].insert(dir);
-
- dir->state_set(CDIR_STATE_EXPORT);
- dir->get(CDir::PIN_EXPORT); // i must keep it pinned
-
- // discard nested exports (that we're handing off)
- for (set<CDir*>::iterator p = cache->nested_exports[containing_import].begin();
- p != cache->nested_exports[containing_import].end(); ) {
- CDir *nested = *p;
- p++;
- if (nested == dir) continue; // ignore myself
-
- // container of parent; otherwise we get ourselves.
- CDir *containing_export = nested->get_parent_dir();
- while (containing_export && !containing_export->is_export())
- containing_export = containing_export->get_parent_dir();
- if (!containing_export) continue;
-
- if (containing_export == dir) {
- // nested beneath our new export *in; remove!
- dout(7) << " export " << *nested << " was nested beneath us; removing from nested_exports" << endl;
- cache->nested_exports[containing_import].erase(nested);
- // exports.erase(nested); _walk does this
- } else {
- dout(12) << " export " << *nested << " is under other export " << *containing_export << ", which is unrelated" << endl;
- assert(cache->get_auth_container(containing_export) != containing_import);
- }
- }
- }
-
- // note new authority (locally)
- if (dir->inode->authority() == dest)
- dir->set_dir_auth( CDIR_AUTH_PARENT );
- else
- dir->set_dir_auth( dest );
-
+ cache->export_subtree(dir, export_bounds[dir], dest);
// fill export message with cache data
C_Contexts *fin = new C_Contexts; // collect all the waiters
if (in->is_dir() && in->dir) {
if (in->dir->is_auth()) {
// nested subdir
- assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT);
+ assert(in->dir->get_dir_auth().first == CDIR_AUTH_PARENT);
subdirs.push_back(in->dir); // it's ours, recurse (later)
-
- } else {
- // nested export
- assert(in->dir->get_dir_auth() >= 0);
- dout(7) << " encountered nested export " << *in->dir << " dir_auth " << in->dir->get_dir_auth() << "; removing from exports" << endl;
- assert(cache->exports.count(in->dir) == 1);
- cache->exports.erase(in->dir); // discard nested export (nested_exports updated above)
-
- in->dir->state_clear(CDIR_STATE_EXPORT);
- in->dir->put(CDir::PIN_EXPORT);
-
- // simplify dir_auth?
- if (in->dir->get_dir_auth() == newauth)
- in->dir->set_dir_auth( CDIR_AUTH_PARENT );
- }
+ }
}
// add to proxy
export_bounds.erase(dir);
// log export completion, then finish (unfreeze, trigger finish context, etc.)
+ dir->get(CDir::PIN_LOGGINGEXPORTFINISH);
mds->mdlog->submit_entry(new EExportFinish(dir, true),
new C_MDS_ExportFinishLogged(this, dir));
assert(export_bounds.count(dir));
assert(export_data.count(dir));
- // re-import it.
- set<CDir*> bounds;
- bounds.swap(export_bounds[dir]);
- export_bounds.erase(dir);
-
- // -- adjust dir_auth --
- // base
- CDir *im = dir;
- if (dir->get_inode()->authority() == mds->get_nodeid()) {
- // parent is already me. was export, adding back to existing import.
- im = mds->mdcache->get_auth_container(dir);
- assert(im);
- mds->mdcache->nested_exports[im].erase(dir);
- mds->mdcache->exports.erase(dir);
- dir->set_dir_auth( CDIR_AUTH_PARENT );
- dir->state_clear(CDIR_STATE_EXPORT);
- dir->put(CDir::PIN_EXPORT);
- } else {
- // parent isn't me. new import.
- mds->mdcache->imports.insert(dir);
- dir->set_dir_auth( mds->get_nodeid() );
- dir->state_set(CDIR_STATE_IMPORT);
- dir->get(CDir::PIN_IMPORT);
- }
-
- dout(10) << " base " << *dir << endl;
- if (dir != im)
- dout(10) << " under " << *im << endl;
-
- assert(dir->get_cum_auth_pins() == 0);
+ // adjust dir_auth, exports
+ cache->import_subtree(dir, export_bounds[dir]);
- // bounds
- for (set<CDir*>::iterator p = bounds.begin();
- p != bounds.end();
+ // unpin bounds
+ for (set<CDir*>::iterator p = export_bounds[dir].begin();
+ p != export_bounds[dir].end();
++p) {
CDir *bd = *p;
-
- if (bd->get_dir_auth() == mds->get_nodeid()) {
- // still me. was an import.
- mds->mdcache->imports.erase(bd);
- bd->set_dir_auth( CDIR_AUTH_PARENT );
- bd->state_clear(CDIR_STATE_IMPORT);
- bd->put(CDir::PIN_IMPORT);
- // move nested exports.
- for (set<CDir*>::iterator q = mds->mdcache->nested_exports[bd].begin();
- q != mds->mdcache->nested_exports[bd].end();
- ++q)
- mds->mdcache->nested_exports[im].insert(*q);
- mds->mdcache->nested_exports.erase(bd);
- } else {
- // not me anymore. now an export.
- mds->mdcache->exports.insert(bd);
- mds->mdcache->nested_exports[im].insert(bd);
- assert(bd->get_dir_auth() != CDIR_AUTH_PARENT);
- bd->state_set(CDIR_STATE_EXPORT);
- bd->get(CDir::PIN_EXPORT);
- }
-
- dout(10) << " bound " << *bd << endl;
+ bd->put(CDir::PIN_EXPORTBOUND);
}
-
- // reimport the dirs
+ // re-import the metadata
list<inodeno_t> imported_subdirs;
int num_imported_inodes = 0;
// process delayed expires
cache->process_delayed_expire(dir);
-
+
+ // send out notify(abort) to bystanders. no ack necessary.
+ for (set<int>::iterator p = export_notify_ack_waiting[dir].begin();
+ p != export_notify_ack_waiting[dir].end();
+ ++p) {
+ MExportDirNotify *notify = new MExportDirNotify(dir->ino(),
+ mds->get_nodeid(), mds->get_nodeid());
+ notify->copy_exports(export_bounds[dir]);
+ mds->send_message_mds(notify, *p, MDS_PORT_MIGRATOR);
+ }
+
// some clean up
export_data.erase(dir);
export_bounds.erase(dir);
void Migrator::export_dir_logged_finish(CDir *dir)
{
dout(7) << "export_dir_commit " << *dir << endl;
-
- if (!export_state.count(dir)) {
+ dir->put(CDir::PIN_LOGGINGEXPORTFINISH);
+
+ if (export_state.count(dir) == 0||
+ export_state[dir] != EXPORT_LOGGINGFINISH) {
dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << endl;
return;
}
void Migrator::handle_export_dir_notify_ack(MExportDirNotifyAck *m)
{
CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
+ CDir *dir = in ? in->dir : 0;
- dout(7) << "handle_export_dir_notify_ack from " << m->get_source()
- << " on " << *dir << endl;
-
- if (export_state.count(dir) == 0 ||
+ if (dir) {
+ dout(7) << "handle_export_dir_notify_ack from " << m->get_source()
+ << " on " << *dir << endl;
+ } else {
+ dout(7) << "handle_export_dir_notify_ack from " << m->get_source()
+ << " on dir " << m->get_ino() << endl;
+ }
+
+ // aborted?
+ if (!dir ||
+ export_state.count(dir) == 0 ||
export_state[dir] != EXPORT_NOTIFYING) {
- // import must have aborted.
- assert(0); // FIXME WRITE ME
+ dout(7) << "target must have failed, not sending finish message. export succeeded anyway." << endl;
+
delete m;
return;
}
+ // process.
int from = m->get_source().num();
assert(export_notify_ack_waiting.count(dir));
export_notify_ack_waiting[dir].erase(from);
if (export_state.count(dir)) {
// send finish/commit to new auth
- mds->send_message_mds(new MExportDirFinish(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR);
+ mds->send_message_mds(new MExportDirFinish(dir->ino()), export_peer[dir], MDS_PORT_MIGRATOR);
// remove from exporting list
export_state.erase(dir);
dout(7) << "export_dir_finish unfreezing" << endl;
dir->unfreeze_tree();
+ // unpin bounds
+ for (set<CDir*>::iterator p = export_bounds[dir].begin();
+ p != export_bounds[dir].end();
+ ++p) {
+ CDir *bd = *p;
+ bd->put(CDir::PIN_EXPORTBOUND);
+ }
+
// unpin path
dout(7) << "export_dir_finish unpinning path" << endl;
vector<CDentry*> trace;
assert(in->is_dir());
+ /*
if (in->is_frozen()) {
dout(7) << "frozen, waiting." << endl;
in->add_waiter(CINODE_WAIT_AUTHPINNABLE,
return;
}
- // pin inode in the cache (for now)
- in->get(CInode::PIN_IMPORTING);
-
// pin auth too, until the import completes.
in->auth_pin();
+ */
+
+ // pin inode in the cache (for now)
+ in->get(CInode::PIN_IMPORTING);
import_state[in->ino()] = IMPORT_DISCOVERED;
import_peer[in->ino()] = m->get_source().num();
diri->put(CInode::PIN_IMPORTING);
dir->get(CDir::PIN_IMPORTING);
- // auth pin too
- dir->auth_pin();
- diri->auth_unpin();
-
// change import state
import_state[diri->ino()] = IMPORT_PREPPING;
}
// open export dirs/bounds?
- assert(import_bounds.count(diri->ino()) == 0);
+ assert(import_bound_inos.count(diri->ino()) == 0);
for (list<inodeno_t>::iterator it = m->get_exports().begin();
it != m->get_exports().end();
it++) {
assert(in);
// note bound.
- import_bounds[dir->ino()].insert(*it);
+ import_bound_inos[dir->ino()].insert(*it);
if (!in->dir) {
dout(7) << " opening nested export on " << *in << endl;
cache->open_remote_dir(in,
new C_MDS_RetryMessage(mds, m));
-
- // pin it!
- in->get(CInode::PIN_OPENINGDIR);
- in->state_set(CInode::STATE_OPENINGDIR);
}
}
} else {
CInode *in = cache->get_inode(ino);
assert(in);
if (in->dir) {
- if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
- dout(7) << " pinning nested export " << *in->dir << endl;
- in->dir->get(CDir::PIN_IMPORTINGEXPORT);
- in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
-
- if (in->state_test(CInode::STATE_OPENINGDIR)) {
- in->put(CInode::PIN_OPENINGDIR);
- in->state_clear(CInode::STATE_OPENINGDIR);
- }
+ if (!in->dir->state_test(CDIR_STATE_IMPORTBOUND)) {
+ dout(7) << " pinning import bound " << *in->dir << endl;
+ in->dir->get(CDir::PIN_IMPORTBOUND);
+ in->dir->state_set(CDIR_STATE_IMPORTBOUND);
+ import_bounds[dir].insert(in->dir);
} else {
- dout(7) << " already pinned nested export " << *in << endl;
+ dout(7) << " already pinned import bound " << *in << endl;
}
} else {
dout(7) << " waiting for nested export dir on " << *in << endl;
// (note: this is a manual freeze.. hack hack hack!)
dout(7) << " all ready, freezing import region" << endl;
- // note that i am an ambiguous auth for this subtree.
+ // then, note that i am an ambiguous auth for this subtree.
dir->set_dir_auth_pending(mds->get_nodeid());
// mark import point frozen
- dir->get_inode()->auth_pin();
- dir->state_set(CDIR_STATE_FROZENTREE);
+ dir->_freeze_tree();
- // mark bounds as leaves
- for (list<inodeno_t>::iterator p = m->get_exports().begin();
- p != m->get_exports().end();
- ++p) {
- CInode *bdi = cache->get_inode(*p);
- assert(bdi);
- CDir *bd = bdi->dir;
- assert(bd);
-
- // mark export point frozenleaf
- bd->get(CDir::PIN_FREEZELEAF);
- bd->state_set(CDIR_STATE_FROZENTREELEAF);
- assert(import_bounds[dir->ino()].count(*p)); // we took note during assim, above
- }
-
// ok!
dout(7) << " sending export_dir_prep_ack on " << *dir << endl;
mds->send_message_mds(new MExportDirPrepAck(dir->ino()),
EImportStart *le = new EImportStart(dir->ino(), m->get_exports());
le->metablob.add_dir_context(dir);
- // note new authority (locally)
- CDir *im = dir;
- if (dir->inode->is_auth()) {
- // parent is already me. was export, adding back to existing import.
- im = mds->mdcache->get_auth_container(dir);
- assert(im);
- mds->mdcache->nested_exports[im].erase(dir);
- mds->mdcache->exports.erase(dir);
- dir->set_dir_auth( CDIR_AUTH_PARENT );
- dir->state_clear(CDIR_STATE_EXPORT);
- dir->put(CDir::PIN_EXPORT);
- } else {
- // parent isn't me. new import.
- mds->mdcache->imports.insert(dir);
- dir->set_dir_auth( mds->get_nodeid() );
- dir->state_set(CDIR_STATE_IMPORT);
- dir->get(CDir::PIN_IMPORT);
- }
+ // update dir_auth, import maps
+ cache->import_subtree(dir, import_bounds[dir]);
- // take out my temp pin
+ // take out my importing pin
dir->put(CDir::PIN_IMPORTING);
- dout(10) << " base " << *dir << endl;
- if (dir != im)
- dout(10) << " under " << *im << endl;
-
- assert(dir->get_cum_auth_pins() == 0);
-
- // bounds
- for (list<inodeno_t>::iterator it = m->get_exports().begin();
- it != m->get_exports().end();
- it++) {
- CInode *bdi = cache->get_inode(*it);
- CDir *bd = bdi->dir;
-
- if (bd->get_dir_auth() == mds->get_nodeid()) {
- // still me. was an import.
- assert(bd->is_import());
- mds->mdcache->imports.erase(bd);
- bd->set_dir_auth( CDIR_AUTH_PARENT );
- bd->state_clear(CDIR_STATE_IMPORT);
- bd->put(CDir::PIN_IMPORT);
- // move nested exports.
- for (set<CDir*>::iterator q = mds->mdcache->nested_exports[bd].begin();
- q != mds->mdcache->nested_exports[bd].end();
- ++q)
- mds->mdcache->nested_exports[im].insert(*q);
- mds->mdcache->nested_exports.erase(bd);
-
- // adjust nested auth pins
- bdi->adjust_nested_auth_pins(bd->get_cum_auth_pins());
- } else {
- // not me anymore. now an export.
- mds->mdcache->exports.insert(bd);
- mds->mdcache->nested_exports[im].insert(bd);
- assert(bd->get_dir_auth() != CDIR_AUTH_PARENT);
- bd->state_set(CDIR_STATE_EXPORT);
- bd->get(CDir::PIN_EXPORT);
- }
-
- // remove our pin
- bd->put(CDir::PIN_IMPORTINGEXPORT);
- bd->state_clear(CDIR_STATE_IMPORTINGEXPORT);
-
- dout(10) << " bound " << *bd << endl;
- }
-
// add this crap to my cache
list<inodeno_t> imported_subdirs;
int num_imported_inodes = 0;
dout(10) << " " << imported_subdirs.size() << " imported subdirs" << endl;
dout(10) << " " << m->get_exports().size() << " imported nested exports" << endl;
- // include bounding dirs in EImportStart
- // (now that the interior metadata is already in the event)
- for (list<inodeno_t>::iterator it = m->get_exports().begin();
- it != m->get_exports().end();
+ // remove bound pins
+ // include bounds in EImportStart
+ for (set<CDir*>::iterator it = import_bounds[dir].begin();
+ it != import_bounds[dir].end();
it++) {
- CInode *bdi = cache->get_inode(*it);
- CDir *bd = bdi->dir;
+ CDir *bd = *it;
+
+ // remove bound pin
+ bd->put(CDir::PIN_IMPORTBOUND);
+ bd->state_clear(CDIR_STATE_IMPORTBOUND);
+
+ // include bounding dirs in EImportStart
+ // (now that the interior metadata is already in the event)
le->metablob.add_dir(bd, false);
}
}
+void Migrator::reverse_import(CDir *dir)
+{
+ dout(7) << "reverse_import " << *dir << endl;
+ // NOTE: unfinished stub -- the assert(0) below fires unconditionally, so the steps after it are only an outline.
+ assert(0); // implement me.
+
+ // update dir_auth, import maps: pass the subtree (with its recorded bounds) to export_subtree() for import_peer[dir->ino()]
+ cache->export_subtree(dir, import_bounds[dir], import_peer[dir->ino()]);
+
+ // remove bound pins: drop PIN_IMPORTBOUND and clear CDIR_STATE_IMPORTBOUND on every dir in import_bounds[dir]
+ for (set<CDir*>::iterator it = import_bounds[dir].begin();
+ it != import_bounds[dir].end();
+ it++) {
+ CDir *bd = *it;
+ bd->put(CDir::PIN_IMPORTBOUND);
+ bd->state_clear(CDIR_STATE_IMPORTBOUND);
+ }
+
+ // ... (rest of the cleanup is not written yet)
+}
+
+
void Migrator::import_dir_logged_start(CDir *dir, int from,
list<inodeno_t> &imported_subdirs,
list<inodeno_t> &exports)
{
dout(7) << "import_dir_logged_finish " << *dir << endl;
- // un auth pin (other exports can now proceed)
- dir->auth_unpin();
-
// unfreeze!
- for (set<inodeno_t>::iterator p = import_bounds[dir->ino()].begin();
- p != import_bounds[dir->ino()].end();
- ++p) {
- CInode *diri = mds->mdcache->get_inode(*p);
- CDir *dir = diri->dir;
- assert(dir->state_test(CDIR_STATE_FROZENTREELEAF));
- dir->put(CDir::PIN_FREEZELEAF);
- dir->state_clear(CDIR_STATE_FROZENTREELEAF);
- }
-
dir->unfreeze_tree();
// clear import state (we're done!)
import_state.erase(dir->ino());
import_peer.erase(dir->ino());
- import_bounds.erase(dir->ino());
+ import_bound_inos.erase(dir->ino());
+ import_bounds.erase(dir);
// process delayed expires
cache->process_delayed_expire(dir);
dout(7) << "handle_export_dir_notify mds" << m->get_old_auth()
<< " -> mds" << m->get_new_auth()
<< " on missing dir " << m->get_ino() << endl;
+ } else if (m->get_old_auth() == m->get_new_auth()) {
+ dout(7) << "handle_export_dir_notify mds" << m->get_old_auth()
+ << " aborted export on "
+ << *dir << endl;
+ // clear dir_auth_pending
+ dir->set_dir_auth_pending(CDIR_AUTH_UNKNOWN);
+
+ // no ack necessary.
+ delete m;
+ return;
} else {
dout(7) << "handle_export_dir_notify mds" << m->get_old_auth()
<< " -> mds" << m->get_new_auth()
CDir *ndir = n->dir;
if (!ndir) continue;
- int boundauth = ndir->authority();
+ int boundauth = ndir->authority().first;
dout(7) << "export_dir_notify bound " << *ndir << " was dir_auth " << ndir->get_dir_auth() << " (" << boundauth << ")" << endl;
- if (ndir->get_dir_auth() == CDIR_AUTH_PARENT) {
+ if (ndir->get_dir_auth().first == CDIR_AUTH_PARENT) {
if (boundauth != m->get_new_auth())
ndir->set_dir_auth( boundauth );
else
- assert(dir->authority() == m->get_new_auth()); // apparently we already knew!
+ assert(dir->authority().first == m->get_new_auth()); // apparently we already knew!
} else {
if (boundauth == m->get_new_auth())
ndir->set_dir_auth( CDIR_AUTH_PARENT );
}
// update dir_auth
- if (diri->authority() == m->get_new_auth()) {
+ if (diri->authority().first == m->get_new_auth()) {
dout(7) << "handle_export_dir_notify on " << *diri << ": inode auth is the same, setting dir_auth -1" << endl;
dir->set_dir_auth( CDIR_AUTH_PARENT );
assert(!diri->is_auth());
} else {
dir->set_dir_auth( m->get_new_auth() );
}
- assert(dir->authority() != mds->get_nodeid());
+ assert(dir->authority().first != mds->get_nodeid());
assert(!dir->is_auth());
// DEBUG: verify subdirs
-// =======================================================================
-// HASHING
-
-
-void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth)
-{
- int off = 0;
-
- for (; nden>0; nden--) {
- // dentry
- string dname;
- _decode(dname, bl, off);
- dout(15) << "dname is " << dname << endl;
-
- char icode;
- bl.copy(off, 1, &icode);
- off++;
-
- CDentry *dn = dir->lookup(dname);
- if (!dn)
- dn = dir->add_dentry(dname); // null
-
- // mark dn dirty _after_ we link the inode (scroll down)
-
- if (icode == 'N') {
-
- // null dentry
- assert(dn->is_null());
-
- // fall thru
- }
- else if (icode == 'L') {
- // remote link
- inodeno_t ino;
- bl.copy(off, sizeof(ino), (char*)&ino);
- off += sizeof(ino);
- dir->link_inode(dn, ino);
- }
- else if (icode == 'I') {
- // inode
- decode_import_inode(dn, bl, off, oldauth);
-
- // fix up subdir export?
- if (dn->inode->dir) {
- assert(dn->inode->dir->state_test(CDIR_STATE_IMPORTINGEXPORT));
- dn->inode->dir->put(CDir::PIN_IMPORTINGEXPORT);
- dn->inode->dir->state_clear(CDIR_STATE_IMPORTINGEXPORT);
-
- if (dn->inode->dir->is_auth()) {
- // mine. must have been an import.
- assert(dn->inode->dir->is_import());
- dout(7) << "unimporting subdir now that inode is mine " << *dn->inode->dir << endl;
- dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT );
- cache->imports.erase(dn->inode->dir);
- dn->inode->dir->put(CDir::PIN_IMPORT);
- dn->inode->dir->state_clear(CDIR_STATE_IMPORT);
-
- // move nested under hashdir
- for (set<CDir*>::iterator it = cache->nested_exports[dn->inode->dir].begin();
- it != cache->nested_exports[dn->inode->dir].end();
- it++)
- cache->nested_exports[dir].insert(*it);
- cache->nested_exports.erase(dn->inode->dir);
-
- // now it matches the inode
- dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT );
- }
- else {
- // not mine. make it an export.
- dout(7) << "making subdir into export " << *dn->inode->dir << endl;
- dn->inode->dir->get(CDir::PIN_EXPORT);
- dn->inode->dir->state_set(CDIR_STATE_EXPORT);
- cache->exports.insert(dn->inode->dir);
- cache->nested_exports[dir].insert(dn->inode->dir);
-
- if (dn->inode->dir->get_dir_auth() == CDIR_AUTH_PARENT)
- dn->inode->dir->set_dir_auth( oldauth ); // no longer matches inode
- assert(dn->inode->dir->get_dir_auth() >= 0);
- }
- }
- }
-
- // mark dentry dirty? (only _after_ we link the inode!)
- dn->_mark_dirty(); // fixme
- }
-}
-
-/*
-
- notes on interaction of hashing and export/import:
-
- - dir->is_auth() is completely independent of hashing. for a hashed dir,
- - all nodes are partially authoritative
- - all nodes dir->is_hashed() == true
- - all nodes dir->inode->dir_is_hashed() == true
- - one node dir->is_auth() == true, the rest == false
- - dir_auth for all subdirs in a hashed dir will (likely?) be explicit.
-
- - remember simple rule: dir auth follows inode, unless dir_auth is explicit.
-
- - export_dir_walk and decode_import_dir take care with dir_auth: (for import/export)
- - on export, -1 is changed to mds->get_nodeid()
- - on import, nothing special, actually.
-
- - hashed dir files aren't included in export; subdirs are converted to imports
- or exports as necessary.
- - hashed dir subdirs are discovered on export. this is important
- because dirs are needed to tie together auth hierarchy, for auth to know about
- imports/exports, etc.
-
- - dir state is maintained on auth.
- - COMPLETE and HASHED are transfered to importers.
- - DIRTY is set everywhere.
-
- - hashed dir is like an import: hashed dir used for nested_exports map.
- - nested_exports is updated appropriately on auth and replicas.
- - a subtree terminates as a hashed dir, since the hashing explicitly
- redelegates all inodes. thus export_dir_walk includes hashed dirs, but
- not their inodes.
-*/
-
-// HASH on auth
-
-class C_MDC_HashFreeze : public Context {
-public:
- Migrator *mig;
- CDir *dir;
- C_MDC_HashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {}
- virtual void finish(int r) {
- mig->hash_dir_frozen(dir);
- }
-};
-
-class C_MDC_HashComplete : public Context {
-public:
- Migrator *mig;
- CDir *dir;
- C_MDC_HashComplete(Migrator *mig, CDir *dir) {
- this->mig = mig;
- this->dir = dir;
- }
- virtual void finish(int r) {
- mig->hash_dir_complete(dir);
- }
-};
-
-
-/** hash_dir(dir)
- * start hashing a directory.
- */
-void Migrator::hash_dir(CDir *dir)
-{
- dout(-7) << "hash_dir " << *dir << endl;
-
- assert(!dir->is_hashed());
- assert(dir->is_auth());
-
- if (dir->is_frozen() ||
- dir->is_freezing()) {
- dout(7) << " can't hash, freezing|frozen." << endl;
- return;
- }
-
- // pin path?
- vector<CDentry*> trace;
- cache->make_trace(trace, dir->inode);
- if (!cache->path_pin(trace, 0, 0)) {
- dout(7) << "hash_dir couldn't pin path, failing." << endl;
- return;
- }
-
- // ok, go
- dir->state_set(CDIR_STATE_HASHING);
- dir->get(CDir::PIN_HASHING);
- assert(dir->hashed_subset.empty());
-
- // discover on all mds
- assert(hash_gather.count(dir) == 0);
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue; // except me
- hash_gather[dir].insert(i);
- mds->send_message_mds(new MHashDirDiscover(dir->inode), i, MDS_PORT_MIGRATOR);
- }
- dir->auth_pin(); // pin until discovers are all acked.
-
- // start freeze
- dir->freeze_dir(new C_MDC_HashFreeze(this, dir));
-
- // make complete
- if (!dir->is_complete()) {
- dout(7) << "hash_dir " << *dir << " not complete, fetching" << endl;
- mds->mdstore->fetch_dir(dir,
- new C_MDC_HashComplete(this, dir));
- } else
- hash_dir_complete(dir);
-}
-
-
-/*
- * wait for everybody to discover and open the hashing dir
- * then auth_unpin, to let the freeze happen
- */
-void Migrator::handle_hash_dir_discover_ack(MHashDirDiscoverAck *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- int from = m->get_source().num();
- assert(hash_gather[dir].count(from));
- hash_gather[dir].erase(from);
-
- if (hash_gather[dir].empty()) {
- hash_gather.erase(dir);
- dout(7) << "hash_dir_discover_ack " << *dir << ", releasing auth_pin" << endl;
- dir->auth_unpin(); // unpin to allow freeze to complete
- } else {
- dout(7) << "hash_dir_discover_ack " << *dir << ", still waiting for " << hash_gather[dir] << endl;
- }
-
- delete m; // done
-}
-
-
-
-/*
- * once the dir is completely in memory,
- * mark all migrating inodes dirty (to pin in cache)
- */
-void Migrator::hash_dir_complete(CDir *dir)
-{
- dout(7) << "hash_dir_complete " << *dir << ", dirtying inodes" << endl;
-
- assert(!dir->is_hashed());
- assert(dir->is_auth());
-
- // mark dirty to pin in cache
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CInode *in = it->second->inode;
- in->_mark_dirty(); // fixme
- }
-
- if (dir->is_frozen_dir())
- hash_dir_go(dir);
-}
-
-
-/*
- * once the dir is frozen,
- * make sure it's complete
- * send the prep messages!
- */
-void Migrator::hash_dir_frozen(CDir *dir)
-{
- dout(7) << "hash_dir_frozen " << *dir << endl;
-
- assert(!dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
-
- if (!dir->is_complete()) {
- dout(7) << "hash_dir_frozen !complete, waiting still on " << *dir << endl;
- return;
- }
-
- // send prep messages w/ export directories to open
- vector<MHashDirPrep*> msgs(mds->get_mds_map()->get_num_mds());
-
- // check for subdirs
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->inode;
-
- if (!in->is_dir()) continue;
- if (!in->dir) continue;
-
- int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode == mds->get_nodeid()) continue;
-
- // msg?
- if (msgs[dentryhashcode] == 0) {
- msgs[dentryhashcode] = new MHashDirPrep(dir->ino());
- }
- msgs[dentryhashcode]->add_inode(it->first, in->replicate_to(dentryhashcode));
- }
-
- // send them!
- assert(hash_gather[dir].empty());
- for (unsigned i=0; i<msgs.size(); i++) {
- if (msgs[i]) {
- mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR);
- hash_gather[dir].insert(i);
- }
- }
-
- if (hash_gather[dir].empty()) {
- // no subdirs! continue!
- hash_gather.erase(dir);
- hash_dir_go(dir);
- } else {
- // wait!
- }
-}
-
-/*
- * wait for peers to open all subdirs
- */
-void Migrator::handle_hash_dir_prep_ack(MHashDirPrepAck *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- int from = m->get_source().num();
-
- assert(hash_gather[dir].count(from) == 1);
- hash_gather[dir].erase(from);
-
- if (hash_gather[dir].empty()) {
- hash_gather.erase(dir);
- dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", last one" << endl;
- hash_dir_go(dir);
- } else {
- dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
- }
-
- delete m;
-}
-
-
-/*
- * once the dir is frozen,
- * make sure it's complete
- * do the hashing!
- */
-void Migrator::hash_dir_go(CDir *dir)
-{
- dout(7) << "hash_dir_go " << *dir << endl;
-
- assert(!dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
-
- // get messages to other nodes ready
- vector<MHashDir*> msgs(mds->get_mds_map()->get_num_mds());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- msgs[i] = new MHashDir(dir->ino());
- }
-
- // pick a hash seed.
- dir->inode->inode.hash_seed = 1;//dir->ino();
-
- // suck up all waiters
- C_Contexts *fin = new C_Contexts;
- list<Context*> waiting;
- dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters
- fin->take(waiting);
-
- // get containing import. might be me.
- CDir *containing_import = cache->get_auth_container(dir);
- assert(containing_import != dir || dir->is_import());
-
- // divy up contents
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->inode;
-
- int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode == mds->get_nodeid()) {
- continue; // still mine!
- }
-
- bufferlist *bl = msgs[dentryhashcode]->get_state_ptr();
- assert(bl);
-
- // -- dentry
- dout(7) << "hash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl;
- _encode(it->first, *bl);
-
- // null dentry?
- if (dn->is_null()) {
- bl->append("N", 1); // null dentry
- assert(dn->is_sync());
- continue;
- }
-
- if (dn->is_remote()) {
- // remote link
- bl->append("L", 1); // remote link
-
- inodeno_t ino = dn->get_remote_ino();
- bl->append((char*)&ino, sizeof(ino));
- continue;
- }
-
- // primary link
- // -- inode
- bl->append("I", 1); // inode dentry
-
- encode_export_inode(in, *bl, dentryhashcode); // encode, and (update state for) export
- msgs[dentryhashcode]->inc_nden();
-
- if (dn->is_dirty())
- dn->mark_clean();
-
- // add to proxy
- hash_proxy_inos[dir].push_back(in);
- in->state_set(CInode::STATE_PROXY);
- in->get(CInode::PIN_PROXY);
-
- // fix up subdirs
- if (in->dir) {
- if (in->dir->is_auth()) {
- // mine. make it into an import.
- dout(7) << "making subdir into import " << *in->dir << endl;
- in->dir->set_dir_auth( mds->get_nodeid() );
- cache->imports.insert(in->dir);
- in->dir->get(CDir::PIN_IMPORT);
- in->dir->state_set(CDIR_STATE_IMPORT);
-
- // fix nested bits
- for (set<CDir*>::iterator it = cache->nested_exports[containing_import].begin();
- it != cache->nested_exports[containing_import].end(); ) {
- CDir *ex = *it;
- it++;
- if (cache->get_auth_container(ex) == in->dir) {
- dout(10) << "moving nested export " << *ex << endl;
- cache->nested_exports[containing_import].erase(ex);
- cache->nested_exports[in->dir].insert(ex);
- }
- }
- }
- else {
- // not mine.
- dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl;
- assert(in->dir->is_export());
- in->dir->put(CDir::PIN_EXPORT);
- in->dir->state_clear(CDIR_STATE_EXPORT);
- cache->exports.erase(in->dir);
- cache->nested_exports[containing_import].erase(in->dir);
- if (in->dir->authority() == dentryhashcode)
- in->dir->set_dir_auth( CDIR_AUTH_PARENT );
- else
- in->dir->set_dir_auth( in->dir->authority() );
- }
- }
-
- // waiters
- list<Context*> waiters;
- in->take_waiting(CINODE_WAIT_ANY, waiters);
- fin->take(waiters);
- }
-
- // dir state
- dir->state_set(CDIR_STATE_HASHED);
- dir->get(CDir::PIN_HASHED);
- cache->hashdirs.insert(dir);
- dir->mark_dirty(dir->pre_dirty()); // fixme
- mds->mdlog->submit_entry(new EString("dirty dir fixme"));
-
- // inode state
- if (dir->inode->is_auth()) {
- dir->inode->_mark_dirty(); // fixme
- mds->mdlog->submit_entry(new EString("hash dirty fixme"));
- }
-
- // fix up nested_exports?
- if (containing_import != dir) {
- dout(7) << "moving nested exports under hashed dir" << endl;
- for (set<CDir*>::iterator it = cache->nested_exports[containing_import].begin();
- it != cache->nested_exports[containing_import].end(); ) {
- CDir *ex = *it;
- it++;
- if (cache->get_auth_container(ex) == dir) {
- dout(7) << " moving nested export under hashed dir: " << *ex << endl;
- cache->nested_exports[containing_import].erase(ex);
- cache->nested_exports[dir].insert(ex);
- } else {
- dout(7) << " NOT moving nested export under hashed dir: " << *ex << endl;
- }
- }
- }
-
- // send hash messages
- assert(hash_gather[dir].empty());
- assert(hash_notify_gather[dir].empty());
- assert(dir->hashed_subset.empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- // all nodes hashed locally..
- dir->hashed_subset.insert(i);
-
- if (i == mds->get_nodeid()) continue;
-
- // init hash_gather and hash_notify_gather sets
- hash_gather[dir].insert(i);
-
- assert(hash_notify_gather[dir][i].empty());
- for (int j=0; j<mds->get_mds_map()->get_num_mds(); j++) {
- if (j == mds->get_nodeid()) continue;
- if (j == i) continue;
- hash_notify_gather[dir][i].insert(j);
- }
-
- mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR);
- }
-
- // wait for all the acks.
-}
-
-
-void Migrator::handle_hash_dir_ack(MHashDirAck *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- assert(dir->is_hashed());
- assert(dir->is_hashing());
-
- int from = m->get_source().num();
- assert(hash_gather[dir].count(from) == 1);
- hash_gather[dir].erase(from);
-
- if (hash_gather[dir].empty()) {
- dout(7) << "handle_hash_dir_ack on " << *dir << ", last one" << endl;
-
- if (hash_notify_gather[dir].empty()) {
- dout(7) << "got notifies too, all done" << endl;
- hash_dir_finish(dir);
- } else {
- dout(7) << "waiting on notifies " << endl;
- }
-
- } else {
- dout(7) << "handle_hash_dir_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
- }
-
- delete m;
-}
-
-
-void Migrator::hash_dir_finish(CDir *dir)
-{
- dout(7) << "hash_dir_finish finishing " << *dir << endl;
- assert(dir->is_hashed());
- assert(dir->is_hashing());
-
- // dir state
- hash_gather.erase(dir);
- dir->state_clear(CDIR_STATE_HASHING);
- dir->put(CDir::PIN_HASHING);
- dir->hashed_subset.clear();
-
- // unproxy inodes
- // this _could_ happen sooner, on a per-peer basis, but no harm in waiting a few more seconds.
- for (list<CInode*>::iterator it = hash_proxy_inos[dir].begin();
- it != hash_proxy_inos[dir].end();
- it++) {
- CInode *in = *it;
- assert(in->state_test(CInode::STATE_PROXY));
- in->state_clear(CInode::STATE_PROXY);
- in->put(CInode::PIN_PROXY);
- }
- hash_proxy_inos.erase(dir);
-
- // unpin path
- vector<CDentry*> trace;
- cache->make_trace(trace, dir->inode);
- cache->path_unpin(trace, 0);
-
- // unfreeze
- dir->unfreeze_dir();
-
- show_imports();
- assert(hash_gather.count(dir) == 0);
-
- // stats
- //if (mds->logger) mds->logger->inc("nh", 1);
-
-}
-
-
-
-
-// HASH on auth and non-auth
-
-void Migrator::handle_hash_dir_notify(MHashDirNotify *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
- assert(dir->is_hashing());
-
- dout(5) << "handle_hash_dir_notify " << *dir << endl;
- int from = m->get_from();
-
- int source = m->get_source().num();
- if (dir->is_auth()) {
- // gather notifies
- assert(dir->is_hashed());
-
- assert( hash_notify_gather[dir][from].count(source) );
- hash_notify_gather[dir][from].erase(source);
-
- if (hash_notify_gather[dir][from].empty()) {
- dout(7) << "last notify from " << from << endl;
- hash_notify_gather[dir].erase(from);
-
- if (hash_notify_gather[dir].empty()) {
- dout(7) << "last notify!" << endl;
- hash_notify_gather.erase(dir);
-
- if (hash_gather[dir].empty()) {
- dout(7) << "got acks too, all done" << endl;
- hash_dir_finish(dir);
- } else {
- dout(7) << "still waiting on acks from " << hash_gather[dir] << endl;
- }
- } else {
- dout(7) << "still waiting for notify gathers from " << hash_notify_gather[dir].size() << " others" << endl;
- }
- } else {
- dout(7) << "still waiting for notifies from " << from << " via " << hash_notify_gather[dir][from] << endl;
- }
-
- // delete msg
- delete m;
- } else {
- // update dir hashed_subset
- assert(dir->hashed_subset.count(from) == 0);
- dir->hashed_subset.insert(from);
-
- // update open subdirs
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->get_inode();
- if (!in) continue;
- if (!in->dir) continue;
-
- int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode != from) continue; // we'll import these in a minute
-
- if (in->dir->authority() != dentryhashcode)
- in->dir->set_dir_auth( in->dir->authority() );
- else
- in->dir->set_dir_auth( CDIR_AUTH_PARENT );
- }
-
- // remove from notify gather set
- assert(hash_gather[dir].count(from));
- hash_gather[dir].erase(from);
-
- // last notify?
- if (hash_gather[dir].empty()) {
- dout(7) << "gathered all the notifies, finishing hash of " << *dir << endl;
- hash_gather.erase(dir);
-
- dir->state_clear(CDIR_STATE_HASHING);
- dir->put(CDir::PIN_HASHING);
- dir->hashed_subset.clear();
- } else {
- dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
- }
-
- // fw notify to auth
- mds->send_message_mds(m, dir->authority(), MDS_PORT_MIGRATOR);
- }
-}
-
-
-
-
-// HASH on non-auth
-
-/*
- * discover step:
- * each peer needs to open up the directory and pin it before we start
- */
-class C_MDC_HashDirDiscover : public Context {
- Migrator *mig;
- MHashDirDiscover *m;
-public:
- vector<CDentry*> trace;
- C_MDC_HashDirDiscover(Migrator *mig, MHashDirDiscover *m) {
- this->mig = mig;
- this->m = m;
- }
- void finish(int r) {
- CInode *in = 0;
- if (r >= 0) {
- if (trace.size())
- in = trace[trace.size()-1]->get_inode();
- else
- in = mig->cache->get_root();
- }
- mig->handle_hash_dir_discover_2(m, in, r);
- }
-};
-
-void Migrator::handle_hash_dir_discover(MHashDirDiscover *m)
-{
- assert(m->get_source().num() != mds->get_nodeid());
-
- dout(7) << "handle_hash_dir_discover on " << m->get_path() << endl;
-
- // must discover it!
- C_MDC_HashDirDiscover *onfinish = new C_MDC_HashDirDiscover(this, m);
- filepath fpath(m->get_path());
- cache->path_traverse(fpath, onfinish->trace, true,
- m, new C_MDS_RetryMessage(mds,m), // on delay/retry
- MDS_TRAVERSE_DISCOVER,
- onfinish); // on completion|error
-}
-
-void Migrator::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r)
-{
- // yay!
- if (in) {
- dout(7) << "handle_hash_dir_discover_2 has " << *in << endl;
- }
-
- if (r < 0 || !in->is_dir()) {
- dout(7) << "handle_hash_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl;
- assert(0); // this shouldn't happen if the auth pins his path properly!!!!
- }
- assert(in->is_dir());
-
- // is dir open?
- if (!in->dir) {
- dout(7) << "handle_hash_dir_discover_2 opening dir " << *in << endl;
- cache->open_remote_dir(in,
- new C_MDS_RetryMessage(mds, m));
- return;
- }
- CDir *dir = in->dir;
-
- // pin dir, set hashing flag
- dir->state_set(CDIR_STATE_HASHING);
- dir->get(CDir::PIN_HASHING);
- assert(dir->hashed_subset.empty());
-
- // inode state
- dir->inode->inode.hash_seed = 1;// dir->ino();
- if (dir->inode->is_auth()) {
- dir->inode->_mark_dirty(); // fixme
- mds->mdlog->submit_entry(new EString("hash dirty fixme"));
- }
-
- // get gather set ready for notifies
- assert(hash_gather[dir].empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- if (i == dir->authority()) continue;
- hash_gather[dir].insert(i);
- }
-
- // reply
- dout(7) << " sending hash_dir_discover_ack on " << *dir << endl;
- mds->send_message_mds(new MHashDirDiscoverAck(dir->ino()),
- m->get_source().num(), MDS_PORT_MIGRATOR);
- delete m;
-}
-
-/*
- * prep step:
- * peers need to open up all subdirs of the hashed dir
- */
-
-void Migrator::handle_hash_dir_prep(MHashDirPrep *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_hash_dir_prep " << *dir << endl;
-
- if (!m->did_assim()) {
- m->mark_assim(); // only do this the first time!
-
- // assimilate dentry+inodes for exports
- for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
- it != m->get_inodes().end();
- it++) {
- CInode *in = cache->get_inode( it->second->get_ino() );
- if (in) {
- it->second->update_inode(in);
- dout(5) << " updated " << *in << endl;
- } else {
- in = new CInode(mds->mdcache, false);
- it->second->update_inode(in);
- cache->add_inode(in);
-
- // link
- dir->add_dentry( it->first, in );
- dout(5) << " added " << *in << endl;
- }
-
- // open!
- if (!in->dir) {
- dout(5) << " opening nested export on " << *in << endl;
- cache->open_remote_dir(in,
- new C_MDS_RetryMessage(mds, m));
- }
- }
- }
-
- // verify!
- int waiting_for = 0;
- for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
- it != m->get_inodes().end();
- it++) {
- CInode *in = cache->get_inode( it->second->get_ino() );
- assert(in);
-
- if (in->dir) {
- if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
- dout(5) << " pinning nested export " << *in->dir << endl;
- in->dir->get(CDir::PIN_IMPORTINGEXPORT);
- in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
- } else {
- dout(5) << " already pinned nested export " << *in << endl;
- }
- } else {
- dout(5) << " waiting for nested export dir on " << *in << endl;
- waiting_for++;
- }
- }
-
- if (waiting_for) {
- dout(5) << "waiting for " << waiting_for << " dirs to open" << endl;
- return;
- }
-
- // ack!
- mds->send_message_mds(new MHashDirPrepAck(dir->ino()),
- m->get_source().num(), MDS_PORT_MIGRATOR);
-
- // done.
- delete m;
-}
-
-
-/*
- * hash step:
- */
-
-void Migrator::handle_hash_dir(MHashDir *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
- assert(!dir->is_auth());
- assert(!dir->is_hashed());
- assert(dir->is_hashing());
-
- dout(5) << "handle_hash_dir " << *dir << endl;
- int oldauth = m->get_source().num();
-
- // content
- import_hashed_content(dir, m->get_state(), m->get_nden(), oldauth);
-
- // dir state
- dir->state_set(CDIR_STATE_HASHED);
- dir->get(CDir::PIN_HASHED);
- cache->hashdirs.insert(dir);
- dir->hashed_subset.insert(mds->get_nodeid());
-
- // dir is complete
- dir->mark_complete();
- dir->mark_dirty(dir->pre_dirty()); // fixme
- mds->mdlog->submit_entry(new EString("dirty dir fixme"));
-
- // commit
- mds->mdstore->commit_dir(dir, 0);
-
- // send notifies
- dout(7) << "sending notifies" << endl;
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- if (i == m->get_source().num()) continue;
- mds->send_message_mds(new MHashDirNotify(dir->ino(), mds->get_nodeid()),
- i, MDS_PORT_MIGRATOR);
- }
-
- // ack
- dout(7) << "acking" << endl;
- mds->send_message_mds(new MHashDirAck(dir->ino()),
- m->get_source().num(), MDS_PORT_MIGRATOR);
-
- // done.
- delete m;
-
- show_imports();
-}
-
-
-
-
-
-// UNHASH on auth
-
-class C_MDC_UnhashFreeze : public Context {
-public:
- Migrator *mig;
- CDir *dir;
- C_MDC_UnhashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {}
- virtual void finish(int r) {
- mig->unhash_dir_frozen(dir);
- }
-};
-
-class C_MDC_UnhashComplete : public Context {
-public:
- Migrator *mig;
- CDir *dir;
- C_MDC_UnhashComplete(Migrator *m, CDir *d) : mig(m), dir(d) {}
- virtual void finish(int r) {
- mig->unhash_dir_complete(dir);
- }
-};
-
-
-void Migrator::unhash_dir(CDir *dir)
-{
- dout(-7) << "unhash_dir " << *dir << endl;
-
- assert(dir->is_hashed());
- assert(!dir->is_unhashing());
- assert(dir->is_auth());
- assert(hash_gather.count(dir)==0);
-
- // pin path?
- vector<CDentry*> trace;
- cache->make_trace(trace, dir->inode);
- if (!cache->path_pin(trace, 0, 0)) {
- dout(7) << "unhash_dir couldn't pin path, failing." << endl;
- return;
- }
-
- // twiddle state
- dir->state_set(CDIR_STATE_UNHASHING);
-
- // first, freeze the dir.
- dir->freeze_dir(new C_MDC_UnhashFreeze(this, dir));
-
- // make complete
- if (!dir->is_complete()) {
- dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl;
- mds->mdstore->fetch_dir(dir,
- new C_MDC_UnhashComplete(this, dir));
- } else
- unhash_dir_complete(dir);
-
-}
-
-void Migrator::unhash_dir_frozen(CDir *dir)
-{
- dout(7) << "unhash_dir_frozen " << *dir << endl;
-
- assert(dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
-
- if (!dir->is_complete()) {
- dout(7) << "unhash_dir_frozen !complete, waiting still on " << *dir << endl;
- } else
- unhash_dir_prep(dir);
-}
-
-
-/*
- * ask peers to freeze and complete hashed dir
- */
-void Migrator::unhash_dir_prep(CDir *dir)
-{
- dout(7) << "unhash_dir_prep " << *dir << endl;
- assert(dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
- assert(dir->is_complete());
-
- if (!hash_gather[dir].empty()) return; // already been here..freeze must have been instantaneous
-
- // send unhash prep to all peers
- assert(hash_gather[dir].empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- hash_gather[dir].insert(i);
- mds->send_message_mds(new MUnhashDirPrep(dir->ino()),
- i, MDS_PORT_MIGRATOR);
- }
-}
-
-/*
- * wait for peers to freeze and complete hashed dirs
- */
-void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- int from = m->get_source().num();
- dout(7) << "handle_unhash_dir_prep_ack from " << from << " " << *dir << endl;
-
- if (!m->did_assim()) {
- m->mark_assim(); // only do this the first time!
-
- // assimilate dentry+inodes for exports
- for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
- it != m->get_inodes().end();
- it++) {
- CInode *in = cache->get_inode( it->second->get_ino() );
- if (in) {
- it->second->update_inode(in);
- dout(5) << " updated " << *in << endl;
- } else {
- in = new CInode(mds->mdcache, false);
- it->second->update_inode(in);
- cache->add_inode(in);
-
- // link
- dir->add_dentry( it->first, in );
- dout(5) << " added " << *in << endl;
- }
-
- // open!
- if (!in->dir) {
- dout(5) << " opening nested export on " << *in << endl;
- cache->open_remote_dir(in,
- new C_MDS_RetryMessage(mds, m));
- }
- }
- }
-
- // verify!
- int waiting_for = 0;
- for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
- it != m->get_inodes().end();
- it++) {
- CInode *in = cache->get_inode( it->second->get_ino() );
- assert(in);
-
- if (in->dir) {
- if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
- dout(5) << " pinning nested export " << *in->dir << endl;
- in->dir->get(CDir::PIN_IMPORTINGEXPORT);
- in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
- } else {
- dout(5) << " already pinned nested export " << *in << endl;
- }
- } else {
- dout(5) << " waiting for nested export dir on " << *in << endl;
- waiting_for++;
- }
- }
-
- if (waiting_for) {
- dout(5) << "waiting for " << waiting_for << " dirs to open" << endl;
- return;
- }
-
- // ok, done with this PrepAck
- assert(hash_gather[dir].count(from) == 1);
- hash_gather[dir].erase(from);
-
- if (hash_gather[dir].empty()) {
- hash_gather.erase(dir);
- dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", last one" << endl;
- unhash_dir_go(dir);
- } else {
- dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
- }
-
- delete m;
-}
-
-
-/*
- * auth:
- * send out MHashDir's to peers
- */
-void Migrator::unhash_dir_go(CDir *dir)
-{
- dout(7) << "unhash_dir_go " << *dir << endl;
- assert(dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
- assert(dir->is_complete());
-
- // send unhash prep to all peers
- assert(hash_gather[dir].empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- hash_gather[dir].insert(i);
- mds->send_message_mds(new MUnhashDir(dir->ino()),
- i, MDS_PORT_MIGRATOR);
- }
-}
-
-/*
- * auth:
- * assimilate unhashing content
- */
-void Migrator::handle_unhash_dir_ack(MUnhashDirAck *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir_ack " << *dir << endl;
- assert(dir->is_hashed());
-
- // assimilate content
- int from = m->get_source().num();
- import_hashed_content(dir, m->get_state(), m->get_nden(), from);
- delete m;
-
- // done?
- assert(hash_gather[dir].count(from));
- hash_gather[dir].erase(from);
-
- if (!hash_gather[dir].empty()) {
- dout(7) << "still waiting for unhash acks from " << hash_gather[dir] << endl;
- return;
- }
-
- // done!
-
- // fix up nested_exports
- CDir *containing_import = cache->get_auth_container(dir);
- if (containing_import != dir) {
- for (set<CDir*>::iterator it = cache->nested_exports[dir].begin();
- it != cache->nested_exports[dir].end();
- it++) {
- dout(7) << "moving nested export out from under hashed dir : " << **it << endl;
- cache->nested_exports[containing_import].insert(*it);
- }
- cache->nested_exports.erase(dir);
- }
-
- // dir state
- //dir->state_clear(CDIR_STATE_UNHASHING); //later
- dir->state_clear(CDIR_STATE_HASHED);
- dir->put(CDir::PIN_HASHED);
- cache->hashdirs.erase(dir);
-
- // commit!
- assert(dir->is_complete());
- //dir->mark_complete();
- dir->mark_dirty(dir->pre_dirty()); // fixme
- mds->mdstore->commit_dir(dir, 0);
-
- // inode state
- dir->inode->inode.hash_seed = 0;
- if (dir->inode->is_auth()) {
- dir->inode->_mark_dirty(); // fixme
- mds->mdlog->submit_entry(new EString("hash inode dirty fixme"));
- }
-
- // notify
- assert(hash_gather[dir].empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
-
- hash_gather[dir].insert(i);
-
- mds->send_message_mds(new MUnhashDirNotify(dir->ino()),
- i, MDS_PORT_MIGRATOR);
- }
-}
-
-
-/*
- * sent by peer to flush mds links. unfreeze when all gathered.
- */
-void Migrator::handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir_ack " << *dir << endl;
- assert(!dir->is_hashed());
- assert(dir->is_unhashing());
- assert(dir->is_frozen_dir());
-
- // done?
- int from = m->get_source().num();
- assert(hash_gather[dir].count(from));
- hash_gather[dir].erase(from);
- delete m;
-
- if (!hash_gather[dir].empty()) {
- dout(7) << "still waiting for notifyack from " << hash_gather[dir] << " on " << *dir << endl;
- } else {
- unhash_dir_finish(dir);
- }
-}
-
-
-/*
- * all mds links are flushed. unfreeze dir!
- */
-void Migrator::unhash_dir_finish(CDir *dir)
-{
- dout(7) << "unhash_dir_finish " << *dir << endl;
- hash_gather.erase(dir);
-
- // unpin path
- vector<CDentry*> trace;
- cache->make_trace(trace, dir->inode);
- cache->path_unpin(trace, 0);
-
- // state
- dir->state_clear(CDIR_STATE_UNHASHING);
-
- // unfreeze
- dir->unfreeze_dir();
-
-}
-
-
-
-// UNHASH on all
-
-/*
- * hashed dir is complete.
- * mark all migrating inodes dirty (to pin in cache)
- * if frozen too, then go to next step (depending on auth)
- */
-void Migrator::unhash_dir_complete(CDir *dir)
-{
- dout(7) << "unhash_dir_complete " << *dir << ", dirtying inodes" << endl;
-
- assert(dir->is_hashed());
- assert(dir->is_complete());
-
- // mark dirty to pin in cache
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CInode *in = it->second->inode;
- if (in->is_auth()) {
- in->_mark_dirty(); // fixme
- mds->mdlog->submit_entry(new EString("unhash dirty fixme"));
- }
- }
-
- if (!dir->is_frozen_dir()) {
- dout(7) << "dir complete but !frozen, waiting " << *dir << endl;
- } else {
- if (dir->is_auth())
- unhash_dir_prep(dir); // auth
- else
- unhash_dir_prep_finish(dir); // nonauth
- }
-}
-
-
-// UNHASH on non-auth
-
-class C_MDC_UnhashPrepFreeze : public Context {
-public:
- Migrator *mig;
- CDir *dir;
- C_MDC_UnhashPrepFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {}
- virtual void finish(int r) {
- mig->unhash_dir_prep_frozen(dir);
- }
-};
-
-
-/*
- * peers need to freeze their dir and make them complete
- */
-void Migrator::handle_unhash_dir_prep(MUnhashDirPrep *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir_prep " << *dir << endl;
- assert(dir->is_hashed());
-
- // freeze
- dir->freeze_dir(new C_MDC_UnhashPrepFreeze(this, dir));
-
- // make complete
- if (!dir->is_complete()) {
- dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl;
- mds->mdstore->fetch_dir(dir,
- new C_MDC_UnhashComplete(this, dir));
- } else {
- unhash_dir_complete(dir);
- }
-
- delete m;
-}
-
-/*
- * peer has hashed dir frozen.
- * complete too?
- */
-void Migrator::unhash_dir_prep_frozen(CDir *dir)
-{
- dout(7) << "unhash_dir_prep_frozen " << *dir << endl;
-
- assert(dir->is_hashed());
- assert(dir->is_frozen_dir());
- assert(!dir->is_auth());
-
- if (!dir->is_complete()) {
- dout(7) << "unhash_dir_prep_frozen !complete, waiting still on " << *dir << endl;
- } else
- unhash_dir_prep_finish(dir);
-}
-
-/*
- * peer has hashed dir complete and frozen. ack.
- */
-void Migrator::unhash_dir_prep_finish(CDir *dir)
-{
- dout(7) << "unhash_dir_prep_finish " << *dir << endl;
- assert(dir->is_hashed());
- assert(!dir->is_auth());
- assert(dir->is_frozen());
- assert(dir->is_complete());
-
- // twiddle state
- if (dir->is_unhashing())
- return; // already replied.
- dir->state_set(CDIR_STATE_UNHASHING);
-
- // send subdirs back to auth
- MUnhashDirPrepAck *ack = new MUnhashDirPrepAck(dir->ino());
- int auth = dir->authority();
-
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->inode;
-
- if (!in->is_dir()) continue;
- if (!in->dir) continue;
-
- int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode != mds->get_nodeid()) continue;
-
- // msg?
- ack->add_inode(it->first, in->replicate_to(auth));
- }
-
- // ack
- mds->send_message_mds(ack, auth, MDS_PORT_MIGRATOR);
-}
-
-
-
-/*
- * peer needs to send hashed dir content back to auth.
- * unhash dir.
- */
-void Migrator::handle_unhash_dir(MUnhashDir *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir " << *dir << endl;//" .. hash_seed is " << dir->inode->inode.hash_seed << endl;
- assert(dir->is_hashed());
- assert(dir->is_unhashing());
- assert(!dir->is_auth());
-
- // get message ready
- bufferlist bl;
- int nden = 0;
-
- // suck up all waiters
- C_Contexts *fin = new C_Contexts;
- list<Context*> waiting;
- dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters
- fin->take(waiting);
-
- // divy up contents
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->inode;
-
- int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode != mds->get_nodeid()) {
- // not mine!
- // twiddle dir_auth?
- if (in->dir) {
- if (in->dir->authority() != dir->authority())
- in->dir->set_dir_auth( in->dir->authority() );
- else
- in->dir->set_dir_auth( CDIR_AUTH_PARENT );
- }
- continue;
- }
-
- // -- dentry
- dout(7) << "unhash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl;
- _encode(it->first, bl);
-
- // null dentry?
- if (dn->is_null()) {
- bl.append("N", 1); // null dentry
- assert(dn->is_sync());
- continue;
- }
-
- if (dn->is_remote()) {
- // remote link
- bl.append("L", 1); // remote link
-
- inodeno_t ino = dn->get_remote_ino();
- bl.append((char*)&ino, sizeof(ino));
- continue;
- }
-
- // primary link
- // -- inode
- bl.append("I", 1); // inode dentry
-
- encode_export_inode(in, bl, dentryhashcode); // encode, and (update state for) export
- nden++;
-
- if (dn->is_dirty())
- dn->mark_clean();
-
- // proxy
- in->state_set(CInode::STATE_PROXY);
- in->get(CInode::PIN_PROXY);
- hash_proxy_inos[dir].push_back(in);
-
- if (in->dir) {
- if (in->dir->is_auth()) {
- // mine. make it into an import.
- dout(7) << "making subdir into import " << *in->dir << endl;
- in->dir->set_dir_auth( mds->get_nodeid() );
- cache->imports.insert(in->dir);
- in->dir->get(CDir::PIN_IMPORT);
- in->dir->state_set(CDIR_STATE_IMPORT);
- }
- else {
- // not mine.
- dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl;
- assert(in->dir->is_export());
- in->dir->put(CDir::PIN_EXPORT);
- in->dir->state_clear(CDIR_STATE_EXPORT);
- cache->exports.erase(in->dir);
- cache->nested_exports[dir].erase(in->dir);
- }
- }
-
- // waiters
- list<Context*> waiters;
- in->take_waiting(CINODE_WAIT_ANY, waiters);
- fin->take(waiters);
- }
-
- // we should have no nested exports; we're not auth for the dir!
- assert(cache->nested_exports[dir].empty());
- cache->nested_exports.erase(dir);
-
- // dir state
- //dir->state_clear(CDIR_STATE_UNHASHING); // later
- dir->state_clear(CDIR_STATE_HASHED);
- dir->put(CDir::PIN_HASHED);
- cache->hashdirs.erase(dir);
- dir->mark_clean();
-
- // inode state
- dir->inode->inode.hash_seed = 0;
- if (dir->inode->is_auth()) {
- dir->inode->_mark_dirty(); // fixme
- mds->mdlog->submit_entry(new EString("unhash inode dirty fixme"));
- }
-
- // init gather set
- mds->get_mds_map()->get_active_mds_set( hash_gather[dir] );
- hash_gather[dir].erase(mds->get_nodeid());
-
- // send unhash message
- mds->send_message_mds(new MUnhashDirAck(dir->ino(), bl, nden),
- dir->authority(), MDS_PORT_MIGRATOR);
-}
-
-
-/*
- * first notify comes from auth.
- * send notifies to all other peers, with peer = self
- * if we get notify from peer=other, remove from our gather list.
- * when we've gotten notifies from everyone,
- * unpin proxies,
- * send notify_ack to auth.
- * this ensures that all mds links are flushed of cache_expire type messages.
- */
-void Migrator::handle_unhash_dir_notify(MUnhashDirNotify *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir_finish " << *dir << endl;
- assert(!dir->is_hashed());
- assert(dir->is_unhashing());
- assert(!dir->is_auth());
-
- int from = m->get_source().num();
- assert(hash_gather[dir].count(from) == 1);
- hash_gather[dir].erase(from);
- delete m;
-
- // did we send our shout out?
- if (from == dir->authority()) {
- // send notify to everyone else in weird chatter storm
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == from) continue;
- if (i == mds->get_nodeid()) continue;
- mds->send_message_mds(new MUnhashDirNotify(dir->ino()), i, MDS_PORT_MIGRATOR);
- }
- }
-
- // are we done?
- if (!hash_gather[dir].empty()) {
- dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
- return;
- }
- hash_gather.erase(dir);
-
- // all done!
- dout(7) << "all mds links flushed, unpinning unhash proxies" << endl;
-
- // unpin proxies
- for (list<CInode*>::iterator it = hash_proxy_inos[dir].begin();
- it != hash_proxy_inos[dir].end();
- it++) {
- CInode *in = *it;
- assert(in->state_test(CInode::STATE_PROXY));
- in->state_clear(CInode::STATE_PROXY);
- in->put(CInode::PIN_PROXY);
- }
-
- // unfreeze
- dir->unfreeze_dir();
-
- // ack
- dout(7) << "sending notify_ack to auth for unhash of " << *dir << endl;
- mds->send_message_mds(new MUnhashDirNotifyAck(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR);
-
-}
-
-
void Migrator::show_imports()
const static int EXPORT_NOTIFYING = 8; // waiting for notifyacks
// export fun
- map<CDir*, int> export_state;
- map<CDir*, int> export_peer;
- map<CDir*, set<CDir*> > export_bounds;
- map<CDir*, list<bufferlist> > export_data; // only during EXPORTING state
- map<CDir*, set<int> > export_warning_ack_waiting;
- map<CDir*, set<int> > export_notify_ack_waiting;
-
- //map<CDir*, list<inodeno_t> > export_proxy_inos;
- //map<CDir*, list<inodeno_t> > export_proxy_dirinos;
-
- map<CDir*, list<Context*> > export_finish_waiters;
+ map<CDir*,int> export_state;
+ map<CDir*,int> export_peer;
+ map<CDir*,set<CDir*> > export_bounds;
+ map<CDir*,list<bufferlist> > export_data; // only during EXPORTING state
+ map<CDir*,set<int> > export_warning_ack_waiting;
+ map<CDir*,set<int> > export_notify_ack_waiting;
+
+ map<CDir*,list<Context*> > export_finish_waiters;
// -- imports --
map<inodeno_t,int> import_state;
map<inodeno_t,int> import_peer;
- map<inodeno_t,set<inodeno_t> > import_bounds;
+ map<inodeno_t,set<inodeno_t> > import_bound_inos;
+ map<CDir*,set<CDir*> > import_bounds;
// -- hashing madness --
return 0;
}
bool is_importing() { return !import_state.empty(); }
- const set<inodeno_t>& get_import_bounds(inodeno_t base) {
+ const set<inodeno_t>& get_import_bound_inos(inodeno_t base) {
+ assert(import_bound_inos.count(base));
+ return import_bound_inos[base];
+ }
+ const set<CDir*>& get_import_bounds(CDir *base) {
assert(import_bounds.count(base));
return import_bounds[base];
}
void got_hashed_replica(CDir *import,
inodeno_t dir_ino,
inodeno_t replica_ino);
+ void reverse_import(CDir *dir);
void import_dir_logged_start(CDir *dir, int from,
list<inodeno_t> &imported_subdirs,
list<inodeno_t> &exports);
// inode was ours, still ours.
dout(7) << "inode was ours, still ours." << endl;
assert(!in->dir->is_import());
- assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT);
+ assert(in->dir->get_dir_auth().first == CDIR_AUTH_PARENT);
// move any exports nested beneath me?
CDir *newcon = cache->get_auth_container(in->dir);
assert(in->dir->is_import());
// verify dir_auth
- assert(in->dir->get_dir_auth() == mds->get_nodeid()); // me, because i'm auth for dir.
+ assert(in->dir->get_dir_auth().first == mds->get_nodeid()); // me, because i'm auth for dir.
assert(in->authority() != in->dir->get_dir_auth()); // inode not me.
}
// sanity
assert(in->dir->is_export());
- assert(in->dir->get_dir_auth() >= 0);
+ assert(in->dir->get_dir_auth().first >= 0);
assert(in->dir->get_dir_auth() != in->authority());
// moved under new import?
in->dir->set_dir_auth( CDIR_AUTH_PARENT );
dout(7) << "simplified dir_auth to -1, inode auth is (also) " << in->authority() << endl;
} else {
- assert(in->dir->get_dir_auth() >= 0); // someone else's export,
+ assert(in->dir->get_dir_auth().first >= 0); // someone else's export,
}
} else {
dout(7) << "inode was replica, still replica. do nothing." << endl;
// fix dir_auth?
- if (in->authority() == dir_auth)
+ if (in->authority().first == dir_auth)
in->dir->set_dir_auth( CDIR_AUTH_PARENT );
else
in->dir->set_dir_auth( dir_auth );
// determine the players
- int srcauth = srcdir->dentry_authority(srcdn->name);
- int destauth = destdir->dentry_authority(destname);
+ int srcauth = srcdir->dentry_authority(srcdn->name).first;
+ int destauth = destdir->dentry_authority(destname).first;
// FOREIGN rename?
// note old dir auth
int old_dir_auth = -1;
- if (srcdn->inode->dir) old_dir_auth = srcdn->inode->dir->authority();
+ if (srcdn->inode->dir) old_dir_auth = srcdn->inode->dir->authority().first;
// rename replica into position
if (destdn->inode && destdn->inode->is_dirty())
CInode *in = srcdn->inode;
int old_dir_auth = -1;
- if (in && in->dir) old_dir_auth = in->dir->authority();
+ if (in && in->dir) old_dir_auth = in->dir->authority().first;
if (!destdn) {
destdn = destdir->add_dentry(m->get_destname()); // create null dentry
if (refpath.last_bit() == ".ceph.hash" &&
refpath.depth() > 1) {
dout(1) << "got explicit hash command " << refpath << endl;
+ /*
CDir *dir = trace[trace.size()-1]->get_inode()->dir;
if (!dir->is_hashed() &&
!dir->is_hashing() &&
dir->is_auth())
mdcache->migrator->hash_dir(dir);
+ */
}
else if (refpath.last_bit() == ".ceph.commit") {
dout(1) << "got explicit commit command on " << *dir << endl;
// auth?
if (!cur->dir_is_auth()) {
- int dirauth = cur->authority();
+ int dirauth = cur->authority().first;
if (cur->dir)
- dirauth = cur->dir->authority();
+ dirauth = cur->dir->authority().first;
assert(dirauth >= 0);
assert(dirauth != mds->get_nodeid());
// am i not open, not auth?
if (!diri->dir && !diri->is_auth()) {
- int dirauth = diri->authority();
+ int dirauth = diri->authority().first;
dout(7) << "validate_new_dentry_dir: don't know dir auth, not open, auth is i think mds" << dirauth << endl;
mdcache->request_forward(req, dirauth);
return false;
CDir *dir = diri->dir;
// make sure it's my dentry
- int dnauth = dir->dentry_authority(name);
+ int dnauth = dir->dentry_authority(name).first;
if (dnauth != mds->get_nodeid()) {
// fw
dout(7) << "mknod on " << req->get_path() << ", dentry " << *dir
} else {
// remote: send nlink++ request, wait
dout(7) << "target is remote, sending InodeLink" << endl;
- mds->send_message_mds(new MInodeLink(targeti->ino(), mds->get_nodeid()), targeti->authority(), MDS_PORT_CACHE);
+ mds->send_message_mds(new MInodeLink(targeti->ino(), mds->get_nodeid()), targeti->authority().first, MDS_PORT_CACHE);
// wait
targeti->add_waiter(CINODE_WAIT_LINK,
// am i not open, not auth?
if (!diri->dir && !diri->is_auth()) {
- int dirauth = diri->authority();
+ int dirauth = diri->authority().first;
dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl;
mdcache->request_forward(req, dirauth);
return;
if (!try_open_dir(diri, req)) return;
CDir *dir = diri->dir;
- int dnauth = dir->dentry_authority(name);
+ int dnauth = dir->dentry_authority(name).first;
// does it exist?
CDentry *dn = dir->lookup(name);
return;
}
if (!in->dir->is_auth()) {
- int dirauth = in->dir->authority();
+ int dirauth = in->dir->authority().first;
dout(7) << "handle_client_rmdir not auth for dir " << *in->dir << ", sending to dir auth " << dnauth << endl;
mdcache->request_forward(req, dirauth);
return;
// am i not open, not auth?
if (!srcdiri->dir && !srcdiri->is_auth()) {
- int dirauth = srcdiri->authority();
+ int dirauth = srcdiri->authority().first;
dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl;
mdcache->request_forward(req, dirauth);
return;
dout(7) << "handle_client_rename srcdir is " << *srcdir << endl;
// make sure it's my dentry
- int srcauth = srcdir->dentry_authority(srcname);
+ int srcauth = srcdir->dentry_authority(srcname).first;
if (srcauth != mds->get_nodeid()) {
// fw
dout(7) << "rename on " << req->get_path() << ", dentry " << *srcdir << " dn " << srcname << " not mine, fw to " << srcauth << endl;
// local or remote?
- int srcauth = srcdir->dentry_authority(srcdn->name);
- int destauth = destdir->dentry_authority(destname);
+ int srcauth = srcdir->dentry_authority(srcdn->name).first;
+ int destauth = destdir->dentry_authority(destname).first;
dout(7) << "handle_client_rename_2 destname " << destname << " destdir " << *destdir << " auth " << destauth << endl;
//
//everybody = true;
//}
- bool srclocal = srcdn->dir->dentry_authority(srcdn->name) == mds->get_nodeid();
- bool destlocal = destdir->dentry_authority(destname) == mds->get_nodeid();
+ bool srclocal = srcdn->dir->dentry_authority(srcdn->name).first == mds->get_nodeid();
+ bool destlocal = destdir->dentry_authority(destname).first == mds->get_nodeid();
dout(7) << "handle_client_rename_local: src local=" << srclocal << " " << *srcdn << endl;
if (destdn) {
// auth for write access
if (mode != FILE_MODE_R && mode != FILE_MODE_LAZY &&
!cur->is_auth()) {
- int auth = cur->authority();
+ int auth = cur->authority().first;
assert(auth != mds->get_nodeid());
dout(9) << "open writeable on replica for " << *cur << " fw to auth " << auth << endl;
spec_defined = false;
if (in->dir)
- dir_auth = in->dir->get_dir_auth();
+ dir_auth = in->dir->get_dir_auth().first;
else
dir_auth = -1;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMONCOMMAND_H
+#define __MMONCOMMAND_H
+
+#include "msg/Message.h"
+
+#include <vector>
+using std::vector;
+
+/**
+ * MMonCommand - message carrying a monitor command.
+ *
+ * The command is held as an ordered list of words in `cmd`; that list is
+ * the only thing encoded into, and decoded from, the message payload.
+ */
+class MMonCommand : public Message {
+ public:
+  vector<string> cmd;   // command words, in order
+
+  MMonCommand() : Message(MSG_MON_COMMAND) {}
+
+  virtual char *get_type_name() { return "mon_command"; }
+
+  // Writes "mon_command(<w0> <w1> ...)": words separated by single spaces.
+  void print(ostream& o) {
+    o << "mon_command(";
+    for (vector<string>::iterator p = cmd.begin(); p != cmd.end(); ++p) {
+      if (p != cmd.begin())
+        o << ' ';
+      o << *p;
+    }
+    o << ")";
+  }
+
+  // payload == encoded word list
+  void encode_payload() { ::_encode(cmd, payload); }
+
+  void decode_payload() {
+    int pos = 0;   // read cursor into payload, advanced by _decode
+    ::_decode(cmd, payload, pos);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMONCOMMANDACK_H
+#define __MMONCOMMANDACK_H
+
+#include "msg/Message.h"
+
+/**
+ * MMonCommandAck - message carrying the outcome of a monitor command.
+ *
+ *   r   integer result code
+ *   rs  human-readable result string
+ *
+ * Payload layout: the raw bytes of `r`, followed by the encoded `rs`.
+ */
+class MMonCommandAck : public Message {
+ public:
+  int r;
+  string rs;
+
+  // r is zeroed so a default-constructed ack never holds an indeterminate
+  // value before decode_payload() fills it in.
+  MMonCommandAck() : Message(MSG_MON_COMMAND_ACK), r(0) {}
+  MMonCommandAck(int _r, string s) : Message(MSG_MON_COMMAND_ACK),
+                                     r(_r), rs(s) { }
+
+  // Distinct from MMonCommand's "mon_command", matching what print() emits.
+  virtual char *get_type_name() { return "mon_command_ack"; }
+  void print(ostream& o) {
+    o << "mon_command_ack(" << r << " " << rs << ")";
+  }
+
+  void encode_payload() {
+    payload.append((char*)&r, sizeof(r));
+    ::_encode(rs, payload);
+  }
+  void decode_payload() {
+    int off = 0;
+    // mirror of encode_payload(): raw r first, then the string
+    payload.copy(off, sizeof(r), (char*)&r);
+    off += sizeof(r);
+    ::_decode(rs, payload, off);
+  }
+};
+
+#endif
#include "messages/MMDSGetMap.h"
#include "messages/MMDSBeacon.h"
+#include "messages/MMonCommand.h"
+
#include "common/Timer.h"
+#include <sstream>
+
#include "config.h"
#undef dout
#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " "
}
+/*
+ * Handle an "mds ..." monitor command.
+ *
+ *   m   the command; cmd[1] is the subcommand, cmd[2] its argument
+ *   r   result code: 0 on success, -1 when a recognized command fails;
+ *       untouched when the command is not recognized
+ *   rs  result string; overwritten whenever the command is recognized
+ *
+ * Only "stop <mds id>" is understood.  For an active mds it marks the mds
+ * as stopping, bumps and encodes the map epoch, and broadcasts the new map.
+ * Anything else leaves r and rs exactly as the caller passed them in.
+ */
+void MDSMonitor::handle_command(MMonCommand *m, int& r, string& rs)
+{
+  stringstream ss;
+
+  if (m->cmd.size() > 1) {
+    if (m->cmd[1] == "stop" && m->cmd.size() > 2) {
+      // Parse strictly: atoi() maps any non-numeric text to 0, which would
+      // make "mds stop foo" silently stop mds0.
+      const char *arg = m->cmd[2].c_str();
+      char *end = 0;
+      long id = strtol(arg, &end, 10);
+      if (end == arg || *end != '\0' || id < 0 || (long)(int)id != id) {
+        r = -1;
+        ss << "invalid mds id '" << m->cmd[2] << "'";
+        rs = ss.str();
+        return;
+      }
+      int who = (int)id;
+
+      if (mdsmap.is_active(who)) {
+        r = 0;
+        ss << "telling mds" << who << " to stop";
+        rs = ss.str();
+
+        // hack
+        mdsmap.mds_state[who] = MDSMap::STATE_STOPPING;
+
+        // inc map version
+        mdsmap.inc_epoch();
+        mdsmap.encode(maps[mdsmap.get_epoch()]);
+
+        print_map();
+
+        // bcast map
+        bcast_latest_mds();
+        send_current();
+      } else {
+        r = -1;
+        ss << "mds" << who << " not active (" << mdsmap.get_state_name(mdsmap.get_state(who)) << ")";
+        rs = ss.str();
+      }
+    }
+  }
+}
+
+
+
void MDSMonitor::handle_mds_beacon(MMDSBeacon *m)
{
void send_latest(entity_inst_t dest);
+ void handle_command(class MMonCommand *m, int& r, string& rs);
+
};
#endif
#include "messages/MPing.h"
#include "messages/MPingAck.h"
#include "messages/MGenericMessage.h"
+#include "messages/MMonCommand.h"
+#include "messages/MMonCommandAck.h"
#include "common/Timer.h"
#include "common/Clock.h"
}
+/*
+ * Dispatch a monitor command by its first word and always send an
+ * MMonCommandAck back to the command's source.  Takes ownership of m
+ * (it is deleted here).
+ *
+ * The reply defaults to (-1, "unrecognized command") and is only changed
+ * by "stop" (which just acknowledges) and by the mds monitor's handler.
+ */
+void Monitor::handle_command(MMonCommand *m)
+{
+  dout(0) << "handle_command " << *m << endl;
+
+  int r = -1;
+  string rs = "unrecognized command";
+
+  const bool have_words = !m->cmd.empty();
+  if (have_words && m->cmd[0] == "stop") {
+    r = 0;
+    rs = "stopping";
+  }
+  else if (have_words && m->cmd[0] == "mds") {
+    mdsmon->handle_command(m, r, rs);
+  }
+  else if (have_words && m->cmd[0] == "osd") {
+    // no osd commands are handled; the default reply goes out
+  }
+
+  // reply
+  MMonCommandAck *ack = new MMonCommandAck(r, rs);
+  messenger->send_message(ack, m->get_source_inst());
+  delete m;
+}
+
+
void Monitor::dispatch(Message *m)
{
osdmon->dispatch(m);
break;
+ case MSG_MON_COMMAND:
+ handle_command((MMonCommand*)m);
+ break;
+
// OSDs
case MSG_OSD_GETMAP:
// messages
void handle_shutdown(Message *m);
void handle_ping_ack(class MPingAck *m);
+ void handle_command(class MMonCommand *m);
friend class OSDMonitor;
friend class MDSMonitor;
#include "messages/MNSFailure.h"
*/
+#include "messages/MMonCommand.h"
+#include "messages/MMonCommandAck.h"
#include "messages/MMonElectionAck.h"
#include "messages/MMonElectionPropose.h"
#include "messages/MMonElectionVictory.h"
break;
*/
+ case MSG_MON_COMMAND:
+ m = new MMonCommand;
+ break;
+ case MSG_MON_COMMAND_ACK:
+ m = new MMonCommandAck;
+ break;
case MSG_MON_ELECTION_PROPOSE:
m = new MMonElectionPropose;
break;
#define MSG_PING 10
#define MSG_PING_ACK 11
-#define MSG_FAILURE 12
-#define MSG_FAILURE_ACK 13
-
#define MSG_SHUTDOWN 99999
+#define MSG_MON_COMMAND 13
+#define MSG_MON_COMMAND_ACK 14
#define MSG_MON_ELECTION_ACK 15
#define MSG_MON_ELECTION_PROPOSE 16
k != j->second.end();
++k) {
derr(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << **k << " to " << j->first << " inst " << peer_addr << endl;
- i->first->ms_handle_failure(*k, j->first, peer_addr);
+ if (i->first)
+ i->first->ms_handle_failure(*k, j->first, peer_addr);
}
}
static const int TYPE_MDS = 2;
static const int TYPE_OSD = 3;
static const int TYPE_CLIENT = 4;
+ static const int TYPE_ADMIN = 5;
static const int NEW = -1;
// cons
entity_name_t() : _type(0), _num(0) {}
- entity_name_t(int t, int n) : _type(t), _num(n) {}
+ entity_name_t(int t, int n=NEW) : _type(t), _num(n) {}
int num() const { return _num; }
int type() const { return _type; }
case TYPE_OSD: return "osd";
case TYPE_MON: return "mon";
case TYPE_CLIENT: return "client";
+ case TYPE_ADMIN: return "admin";
default: return "unknown";
}
}
bool is_mds() const { return type() == TYPE_MDS; }
bool is_osd() const { return type() == TYPE_OSD; }
bool is_mon() const { return type() == TYPE_MON; }
+ bool is_admin() const { return type() == TYPE_ADMIN; }
};
inline bool operator== (const entity_name_t& l, const entity_name_t& r) {